[Bf-blender-cvs] [118e31a0a99] master: Cycles: Fix tricubic sampling with NanoVDB

Mon Nov 9 12:38:09 CET 2020

Commit: 118e31a0a995ae4e8845376215d9c35017a8f781
Author: Patrick Mours
Date:   Fri Nov 6 15:19:58 2020 +0100
Branches: master
https://developer.blender.org/rB118e31a0a995ae4e8845376215d9c35017a8f781

Cycles: Fix tricubic sampling with NanoVDB

Volumes using tricubic sampling produced different results with NanoVDB than with dense
textures. This fixes that by using the same tricubic sampling algorithm in both cases. It
also fixes some remaining sampling offset issues and a few minor problems that broke OpenCL
kernel compilation on NVIDIA.

Reviewed By: brecht

Differential Revision: https://developer.blender.org/D9491

===================================================================

M	intern/cycles/kernel/kernel_compat_opencl.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
M	intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
M	intern/cycles/kernel/kernels/opencl/kernel_opencl_image.h
M	intern/cycles/render/image_vdb.cpp
M	intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index ba7ab43a47a..1848f6059b6 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -48,7 +48,7 @@
 #define ccl_align(n) __attribute__((aligned(n)))
 #define ccl_optional_struct_init
 
-#if __OPENCL_VERSION__ >= 200
+#if __OPENCL_VERSION__ >= 200 && !defined(__NV_CL_C_VERSION)
 #  define ccl_loop_no_unroll __attribute__((opencl_unroll_hint(1)))
 #else
 #  define ccl_loop_no_unroll
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
index b466b41f456..b97400a443a 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_image.h
@@ -28,7 +28,6 @@ CCL_NAMESPACE_BEGIN
  * instruction sets. */
 namespace {
 
-template<typename T> struct TextureInterpolator {
 #define SET_CUBIC_SPLINE_WEIGHTS(u, t) \
   { \
     u[0] = (((-1.0f / 6.0f) * t + 0.5f) * t - 0.5f) * t + (1.0f / 6.0f); \
@@ -38,6 +37,15 @@ template<typename T> struct TextureInterpolator {
   } \
   (void)0
 
+ccl_always_inline float frac(float x, int *ix)
+{
+  int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
+  *ix = i;
+  return x - (float)i;
+}
+
+template<typename T> struct TextureInterpolator {
+
   static ccl_always_inline float4 read(float4 r)
   {
     return r;
@@ -106,13 +114,6 @@ template<typename T> struct TextureInterpolator {
     return clamp(x, 0, width - 1);
   }
 
-  static ccl_always_inline float frac(float x, int *ix)
-  {
-    int i = float_to_int(x) - ((x < 0.0f) ? 1 : 0);
-    *ix = i;
-    return x - (float)i;
-  }
-
   /* ********  2D interpolation ******** */
 
   static ccl_always_inline float4 interp_closest(const TextureInfo &info, float x, float y)
@@ -370,7 +371,7 @@ template<typename T> struct TextureInterpolator {
   static ccl_never_inline
 #endif
       float4
-      interp_3d_tricubic(const TextureInfo &info, float x, float y, float z)
+      interp_3d_cubic(const TextureInfo &info, float x, float y, float z)
   {
     int width = info.width;
     int height = info.height;
@@ -469,14 +470,16 @@ template<typename T> struct TextureInterpolator {
       case INTERPOLATION_LINEAR:
         return interp_3d_linear(info, x, y, z);
       default:
-        return interp_3d_tricubic(info, x, y, z);
+        return interp_3d_cubic(info, x, y, z);
     }
   }
-#undef SET_CUBIC_SPLINE_WEIGHTS
 };
 
 #ifdef WITH_NANOVDB
 template<typename T> struct NanoVDBInterpolator {
+
+  typedef nanovdb::ReadAccessor<nanovdb::NanoRoot<T>> ReadAccessorT;
+
   static ccl_always_inline float4 read(float r)
   {
     return make_float4(r, r, r, 1.0f);
@@ -487,26 +490,93 @@ template<typename T> struct NanoVDBInterpolator {
     return make_float4(r[0], r[1], r[2], 1.0f);
   }
 
+  static ccl_always_inline float4 interp_3d_closest(ReadAccessorT acc, float x, float y, float z)
+  {
+    const nanovdb::Vec3f xyz(x, y, z);
+    return read(nanovdb::NearestNeighborSampler<ReadAccessorT, false>(acc)(xyz));
+  }
+
+  static ccl_always_inline float4 interp_3d_linear(ReadAccessorT acc, float x, float y, float z)
+  {
+    const nanovdb::Vec3f xyz(x - 0.5f, y - 0.5f, z - 0.5f);
+    return read(nanovdb::TrilinearSampler<ReadAccessorT, false>(acc)(xyz));
+  }
+
+#  if defined(__GNUC__) || defined(__clang__)
+  static ccl_always_inline
+#  else
+  static ccl_never_inline
+#  endif
+      float4
+      interp_3d_cubic(ReadAccessorT acc, float x, float y, float z)
+  {
+    int ix, iy, iz;
+    int nix, niy, niz;
+    int pix, piy, piz;
+    int nnix, nniy, nniz;
+    /* Tricubic b-spline interpolation. */
+    const float tx = frac(x - 0.5f, &ix);
+    const float ty = frac(y - 0.5f, &iy);
+    const float tz = frac(z - 0.5f, &iz);
+    pix = ix - 1;
+    piy = iy - 1;
+    piz = iz - 1;
+    nix = ix + 1;
+    niy = iy + 1;
+    niz = iz + 1;
+    nnix = ix + 2;
+    nniy = iy + 2;
+    nniz = iz + 2;
+
+    const int xc[4] = {pix, ix, nix, nnix};
+    const int yc[4] = {piy, iy, niy, nniy};
+    const int zc[4] = {piz, iz, niz, nniz};
+    float u[4], v[4], w[4];
+
+    /* Some helper macro to keep code reasonable size,
+     * let compiler to inline all the matrix multiplications.
+     */
+#  define DATA(x, y, z) (read(acc.getValue(nanovdb::Coord(xc[x], yc[y], zc[z]))))
+#  define COL_TERM(col, row) \
+    (v[col] * (u[0] * DATA(0, col, row) + u[1] * DATA(1, col, row) + u[2] * DATA(2, col, row) + \
+               u[3] * DATA(3, col, row)))
+#  define ROW_TERM(row) \
+    (w[row] * (COL_TERM(0, row) + COL_TERM(1, row) + COL_TERM(2, row) + COL_TERM(3, row)))
+
+    SET_CUBIC_SPLINE_WEIGHTS(u, tx);
+    SET_CUBIC_SPLINE_WEIGHTS(v, ty);
+    SET_CUBIC_SPLINE_WEIGHTS(w, tz);
+
+    /* Actual interpolation. */
+    return ROW_TERM(0) + ROW_TERM(1) + ROW_TERM(2) + ROW_TERM(3);
+
+#  undef COL_TERM
+#  undef ROW_TERM
+#  undef DATA
+  }
+
   static ccl_always_inline float4
   interp_3d(const TextureInfo &info, float x, float y, float z, InterpolationType interp)
   {
-    const nanovdb::Vec3f xyz(x, y, z);
-    nanovdb::NanoGrid<T> *const grid = (nanovdb::NanoGrid<T> *)info.data;
-    const nanovdb::NanoRoot<T> &root = grid->tree().root();
+    using namespace nanovdb;
+
+    NanoGrid<T> *const grid = (NanoGrid<T> *)info.data;
+    const NanoRoot<T> &root = grid->tree().root();
 
-    typedef nanovdb::ReadAccessor<nanovdb::NanoRoot<T>> ReadAccessorT;
     switch ((interp == INTERPOLATION_NONE) ? info.interpolation : interp) {
       case INTERPOLATION_CLOSEST:
-        return read(nanovdb::SampleFromVoxels<ReadAccessorT, 0, false>(root)(xyz));
+        return interp_3d_closest(root, x, y, z);
       case INTERPOLATION_LINEAR:
-        return read(nanovdb::SampleFromVoxels<ReadAccessorT, 1, false>(root)(xyz));
+        return interp_3d_linear(root, x, y, z);
       default:
-        return read(nanovdb::SampleFromVoxels<ReadAccessorT, 3, false>(root)(xyz));
+        return interp_3d_cubic(root, x, y, z);
     }
   }
 };
 #endif
 
+#undef SET_CUBIC_SPLINE_WEIGHTS
+
 ccl_device float4 kernel_tex_image_interp(KernelGlobals *kg, int id, float x, float y)
 {
   const TextureInfo &info = kernel_tex_fetch(__texture_info, id);
diff --git a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
index c2a0ee06dbc..b8aaacba960 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
+++ b/intern/cycles/kernel/kernels/cuda/kernel_cuda_image.h
@@ -24,17 +24,14 @@ ccl_device float cubic_w0(float a)
 {
   return (1.0f / 6.0f) * (a * (a * (-a + 3.0f) - 3.0f) + 1.0f);
 }
-
 ccl_device float cubic_w1(float a)
 {
   return (1.0f / 6.0f) * (a * a * (3.0f * a - 6.0f) + 4.0f);
 }
-
 ccl_device float cubic_w2(float a)
 {
   return (1.0f / 6.0f) * (a * (a * (-3.0f * a + 3.0f) + 3.0f) + 1.0f);
 }
-
 ccl_device float cubic_w3(float a)
 {
   return (1.0f / 6.0f) * (a * a * a);
@@ -45,7 +42,6 @@ ccl_device float cubic_g0(float a)
 {
   return cubic_w0(a) + cubic_w1(a);
 }
-
 ccl_device float cubic_g1(float a)
 {
   return cubic_w2(a) + cubic_w3(a);
@@ -54,13 +50,11 @@ ccl_device float cubic_g1(float a)
 /* h0 and h1 are the two offset functions */
 ccl_device float cubic_h0(float a)
 {
-  /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
-  return -1.0f + cubic_w1(a) / (cubic_w0(a) + cubic_w1(a)) + 0.5f;
+  return (cubic_w1(a) / cubic_g0(a)) - 1.0f;
 }
-
 ccl_device float cubic_h1(float a)
 {
-  return 1.0f + cubic_w3(a) / (cubic_w2(a) + cubic_w3(a)) + 0.5f;
+  return (cubic_w3(a) / cubic_g1(a)) + 1.0f;
 }
 
 /* Fast bicubic texture lookup using 4 bilinear lookups, adapted from CUDA samples. */
@@ -79,10 +73,11 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f
 
   float g0x = cubic_g0(fx);
   float g1x = cubic_g1(fx);
-  float x0 = (px + cubic_h0(fx)) / info.width;
-  float x1 = (px + cubic_h1(fx)) / info.width;
-  float y0 = (py + cubic_h0(fy)) / info.height;
-  float y1 = (py + cubic_h1(fy)) / info.height;
+  /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+  float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
+  float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
+  float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
+  float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
 
   return cubic_g0(fy) * (g0x * tex2D<T>(tex, x0, y0) + g1x * tex2D<T>(tex, x1, y0)) +
          cubic_g1(fy) * (g0x * tex2D<T>(tex, x0, y1) + g1x * tex2D<T>(tex, x1, y1));
@@ -90,7 +85,7 @@ ccl_device T kernel_tex_image_interp_bicubic(const TextureInfo &info, float x, f
 
 /* Fast tricubic texture lookup using 8 trilinear lookups. */
 template<typename T>
-ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo &info, float x, float y, float z)
+ccl_device T kernel_tex_image_interp_tricubic(const TextureInfo &info, float x, float y, float z)
 {
   CUtexObject tex = (CUtexObject)info.data;
 
@@ -112,12 +107,13 @@ ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo &info, float x
   float g0z = cubic_g0(fz);
   float g1z = cubic_g1(fz);
 
-  float x0 = (px + cubic_h0(fx)) / info.width;
-  float x1 = (px + cubic_h1(fx)) / info.width;
-  float y0 = (py + cubic_h0(fy)) / info.height;
-  float y1 = (py + cubic_h1(fy)) / info.height;
-  float z0 = (pz + cubic_h0(fz)) / info.depth;
-  float z1 = (pz + cubic_h1(fz)) / info.depth;
+  /* Note +0.5 offset to compensate for CUDA linear filtering convention. */
+  float x0 = (px + cubic_h0(fx) + 0.5f) / info.width;
+  float x1 = (px + cubic_h1(fx) + 0.5f) / info.width;
+  float y0 = (py + cubic_h0(fy) + 0.5f) / info.height;
+  float y1 = (py + cubic_h1(fy) + 0.5f) / info.height;
+  float z0 = (pz + cubic_h0(fz) + 0.5f) / info.depth;
+  float z1 = (pz + cubic_h1(fz) + 0.5f) / info.depth;
 
   return g0z * (g0y * (g0x * tex3D<T>(tex, x0, y0, z0) + g1x * tex3D<T>(tex, x1, y0, z0)) +
                 g1y * (g0x * tex3D<T>(tex, x0, y1, z0) + g1x * tex3D<T>(tex, x1, y1, z0))) +
@@ -126,22 +122,56 @@ ccl_device T kernel_tex_image_interp_bicubic_3d(const TextureInfo &info, float x
 }
 
 #ifdef WITH_NANOVDB
+template<typename T, typename S>
+ccl_device T kernel_tex_image_inte

@@ Diff output truncated at 10240 characters. @@