[Bf-blender-cvs] [10077229d6f] soc-2019-cycles-procedural: SSE noise implementation

Wed Jul 3 13:52:41 CEST 2019

Commit: 10077229d6f30d7d6f3e42b9a5cca63db1493a91
Author: OmarSquircleArt
Date:   Wed Jul 3 13:53:32 2019 +0200
Branches: soc-2019-cycles-procedural
https://developer.blender.org/rB10077229d6f30d7d6f3e42b9a5cca63db1493a91

SSE noise implementation

===================================================================

M	intern/cycles/kernel/svm/svm_musgrave.h
M	intern/cycles/kernel/svm/svm_noise.h
M	intern/cycles/kernel/svm/svm_noisetex.h
M	intern/cycles/kernel/svm/svm_texture.h
M	intern/cycles/kernel/svm/svm_wave.h
M	intern/cycles/render/nodes.cpp
M	intern/cycles/util/util_hash.h
M	intern/cycles/util/util_math.h
M	intern/cycles/util/util_ssef.h
M	intern/cycles/util/util_ssei.h

===================================================================

diff --git a/intern/cycles/kernel/svm/svm_musgrave.h b/intern/cycles/kernel/svm/svm_musgrave.h
index 67fb5ca6241..efbf4d20f88 100644
--- a/intern/cycles/kernel/svm/svm_musgrave.h
+++ b/intern/cycles/kernel/svm/svm_musgrave.h
@@ -34,14 +34,14 @@ ccl_device_noinline float noise_musgrave_fBm(float3 p, float H, float lacunarity
   int i;
 
   for (i = 0; i < float_to_int(octaves); i++) {
-    value += snoise(p) * pwr;
+    value += snoise_3d(p) * pwr;
     pwr *= pwHL;
     p *= lacunarity;
   }
 
   rmd = octaves - floorf(octaves);
   if (rmd != 0.0f)
-    value += rmd * snoise(p) * pwr;
+    value += rmd * snoise_3d(p) * pwr;
 
   return value;
 }
@@ -65,14 +65,14 @@ ccl_device_noinline float noise_musgrave_multi_fractal(float3 p,
   int i;
 
   for (i = 0; i < float_to_int(octaves); i++) {
-    value *= (pwr * snoise(p) + 1.0f);
+    value *= (pwr * snoise_3d(p) + 1.0f);
     pwr *= pwHL;
     p *= lacunarity;
   }
 
   rmd = octaves - floorf(octaves);
   if (rmd != 0.0f)
-    value *= (rmd * pwr * snoise(p) + 1.0f); /* correct? */
+    value *= (rmd * pwr * snoise_3d(p) + 1.0f); /* correct? */
 
   return value;
 }
@@ -94,11 +94,11 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(
   int i;
 
   /* first unscaled octave of function; later octaves are scaled */
-  value = offset + snoise(p);
+  value = offset + snoise_3d(p);
   p *= lacunarity;
 
   for (i = 1; i < float_to_int(octaves); i++) {
-    increment = (snoise(p) + offset) * pwr * value;
+    increment = (snoise_3d(p) + offset) * pwr * value;
     value += increment;
     pwr *= pwHL;
     p *= lacunarity;
@@ -106,7 +106,7 @@ ccl_device_noinline float noise_musgrave_hetero_terrain(
 
   rmd = octaves - floorf(octaves);
   if (rmd != 0.0f) {
-    increment = (snoise(p) + offset) * pwr * value;
+    increment = (snoise_3d(p) + offset) * pwr * value;
     value += rmd * increment;
   }
 
@@ -129,7 +129,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
   float pwr = pwHL;
   int i;
 
-  result = snoise(p) + offset;
+  result = snoise_3d(p) + offset;
   weight = gain * result;
   p *= lacunarity;
 
@@ -137,7 +137,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
     if (weight > 1.0f)
       weight = 1.0f;
 
-    signal = (snoise(p) + offset) * pwr;
+    signal = (snoise_3d(p) + offset) * pwr;
     pwr *= pwHL;
     result += weight * signal;
     weight *= gain * signal;
@@ -146,7 +146,7 @@ ccl_device_noinline float noise_musgrave_hybrid_multi_fractal(
 
   rmd = octaves - floorf(octaves);
   if (rmd != 0.0f)
-    result += rmd * ((snoise(p) + offset) * pwr);
+    result += rmd * ((snoise_3d(p) + offset) * pwr);
 
   return result;
 }
@@ -167,7 +167,7 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(
   float pwr = pwHL;
   int i;
 
-  signal = offset - fabsf(snoise(p));
+  signal = offset - fabsf(snoise_3d(p));
   signal *= signal;
   result = signal;
   weight = 1.0f;
@@ -175,7 +175,7 @@ ccl_device_noinline float noise_musgrave_ridged_multi_fractal(
   for (i = 1; i < float_to_int(octaves); i++) {
     p *= lacunarity;
     weight = saturate(signal * gain);
-    signal = offset - fabsf(snoise(p));
+    signal = offset - fabsf(snoise_3d(p));
     signal *= signal;
     signal *= weight;
     result += signal * pwr;
diff --git a/intern/cycles/kernel/svm/svm_noise.h b/intern/cycles/kernel/svm/svm_noise.h
index a4824076066..dd7d7178101 100644
--- a/intern/cycles/kernel/svm/svm_noise.h
+++ b/intern/cycles/kernel/svm/svm_noise.h
@@ -32,270 +32,559 @@
 
 CCL_NAMESPACE_BEGIN
 
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssei quick_floor_sse(const ssef &x)
+/* **** Perlin Noise **** */
+
+/* The following functions compute 1D, 2D, 3D, and 4D Perlin noise.
+ * The code is based on the OSL noise code for compatibility.
+ * See oslnoise.h.
+ */
+
+/* An alternative to Hermite interpolation that has zero first and
+ * second derivatives at t = 0 and t = 1.
+ * Described in Ken Perlin's "Improving Noise" [2002].
+ */
+ccl_device float fade(float t)
 {
-  ssei b = truncatei(x);
-  ssei isneg = cast((x < ssef(0.0f)).m128);
-  return b + isneg;  // unsaturated add 0xffffffff is the same as subtract -1
+  return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
 }
-#endif
 
-#ifdef __KERNEL_SSE2__
-ccl_device_inline ssei hash_sse(const ssei &kx, const ssei &ky, const ssei &kz)
-{
-#  define rot(x, k) (((x) << (k)) | (srl(x, 32 - (k))))
-#  define xor_rot(a, b, c) \
-    do { \
-      a = a ^ b; \
-      a = a - rot(b, c); \
-    } while (0)
-
-  uint len = 3;
-  ssei magic = ssei(0xdeadbeef + (len << 2) + 13);
-  ssei a = magic + kx;
-  ssei b = magic + ky;
-  ssei c = magic + kz;
-
-  xor_rot(c, b, 14);
-  xor_rot(a, c, 11);
-  xor_rot(b, a, 25);
-  xor_rot(c, b, 16);
-  xor_rot(a, c, 4);
-  xor_rot(b, a, 14);
-  xor_rot(c, b, 24);
-
-  return c;
-#  undef rot
-#  undef xor_rot
+ccl_device_inline float negate_if(float val, int condition)
+{
+  return (condition) ? -val : val;
 }
-#endif
 
-#if 0  // unused
-ccl_device int imod(int a, int b)
+ccl_device float grad1(int hash, float x)
 {
-  a %= b;
-  return a < 0 ? a + b : a;
+  int h = hash & 15;
+  float g = 1 + (h & 7);
+  return negate_if(g, h & 8) * x;
 }
 
-ccl_device uint phash(int kx, int ky, int kz, int3 p)
+ccl_device_noinline float perlin_1d(float x)
 {
-  return hash_uint3(imod(kx, p.x), imod(ky, p.y), imod(kz, p.z));
+  int X;
+  float fx = floorfrac(x, &X);
+  float u = fade(fx);
+
+  return mix(grad1(hash_uint(X), fx), grad1(hash_uint(X + 1), fx - 1.0f), u);
 }
-#endif
 
+/* 2D, 3D, and 4D noise can be accelerated using SSE, so we do a separate
+ * implementation for the SSE kernels.
+ */
 #ifndef __KERNEL_SSE2__
-ccl_device float floorfrac(float x, int *i)
+/* Bilinear Interpolation:
+ *
+ * v2          v3
+ *  @ + + + + @       y
+ *  +         +       ^
+ *  +         +       |
+ *  +         +       |
+ *  @ + + + + @       @------> x
+ * v0          v1
+ *
+ */
+ccl_device float bi_mix(float v0, float v1, float v2, float v3, float x, float y)
 {
-  *i = quick_floor_to_int(x);
-  return x - *i;
+  float x1 = 1.0f - x;
+  return (1.0f - y) * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x);
 }
-#else
-ccl_device_inline ssef floorfrac_sse(const ssef &x, ssei *i)
+
+/* Trilinear Interpolation:
+ *
+ *   v6               v7
+ *     @ + + + + + + @
+ *     +\            +\
+ *     + \           + \
+ *     +  \          +  \
+ *     +   \ v4      +   \ v5
+ *     +    @ + + + +++ + @          z
+ *     +    +        +    +      y   ^
+ *  v2 @ + +++ + + + @ v3 +       \  |
+ *      \   +         \   +        \ |
+ *       \  +          \  +         \|
+ *        \ +           \ +          +---------> x
+ *         \+            \+
+ *          @ + + + + + + @
+ *        v0               v1
+ */
+ccl_device float tri_mix(float v0,
+                         float v1,
+                         float v2,
+                         float v3,
+                         float v4,
+                         float v5,
+                         float v6,
+                         float v7,
+                         float x,
+                         float y,
+                         float z)
 {
-  *i = quick_floor_sse(x);
-  return x - ssef(*i);
+  float x1 = 1.0f - x;
+  float y1 = 1.0f - y;
+  float z1 = 1.0f - z;
+  return z1 * (y1 * (v0 * x1 + v1 * x) + y * (v2 * x1 + v3 * x)) +
+         z * (y1 * (v4 * x1 + v5 * x) + y * (v6 * x1 + v7 * x));
 }
-#endif
 
-#ifndef __KERNEL_SSE2__
-ccl_device float fade(float t)
+ccl_device float grad2(int hash, float x, float y)
 {
-  return t * t * t * (t * (t * 6.0f - 15.0f) + 10.0f);
+  int h = hash & 7;
+  float u = h < 4 ? x : y;
+  float v = 2.0f * (h < 4 ? y : x);
+  return negate_if(u, h & 1) + negate_if(v, h & 2);
 }
-#else
-ccl_device_inline ssef fade_sse(const ssef *t)
+
+ccl_device float grad3(int hash, float x, float y, float z)
 {
-  ssef a = madd(*t, ssef(6.0f), ssef(-15.0f));
-  ssef b = madd(*t, a, ssef(10.0f));
-  return ((*t) * (*t)) * ((*t) * b);
+  int h = hash & 15;
+  float u = h < 8 ? x : y;
+  float vt = ((h == 12) || (h == 14)) ? x : z;
+  float v = h < 4 ? y : vt;
+  return negate_if(u, h & 1) + negate_if(v, h & 2);
 }
-#endif
 
-#ifndef __KERNEL_SSE2__
-ccl_device float nerp(float t, float a, float b)
+ccl_device float grad4(int hash, float x, float y, float z, float w)
 {
-  return (1.0f - t) * a + t * b;
+  int h = hash & 31;
+  float u = h < 24 ? x : y;
+  float v = h < 16 ? y : z;
+  float s = h < 8 ? z : w;
+  return negate_if(u, h & 1) + negate_if(v, h & 2) + negate_if(s, h & 4);
 }
-#else
-ccl_device_inline ssef nerp_sse(const ssef &t, const ssef &a, const ssef &b)
+
+ccl_device_noinline float perlin_2d(float x, float y)
 {
-  ssef x1 = (ssef(1.0f) - t) * a;
-  return madd(t, b, x1);
+  int X;
+  int Y;
+
+  float fx = floorfrac(x, &X);
+  float fy = floorfrac(y, &Y);
+
+  float u = fade(fx);
+  float v = fade(fy);
+
+  float r = bi_mix(grad2(hash_uint2(X, Y), fx, fy),
+                   grad2(hash_uint2(X + 1, Y), fx - 1.0f, fy),
+                   grad2(hash_uint2(X, Y + 1), fx, fy - 1.0f),
+                   grad2(hash_uint2(X + 1, Y + 1), fx - 1.0f, fy - 1.0f),
+                   u,
+                   v);
+
+  return r;
 }
-#endif
 
-#ifndef __KERNEL_SSE2__
-ccl_device float grad(int hash, float x, float y, float z)
+ccl_device_noinline float perlin_3d(float x, float y, float z)
 {
-  // use vectors pointing to the edges of the cube
-  int h = hash & 15;
-  float u = h < 8 ? x : y;
-  float vt = ((h == 12) | (h == 14)) ? x : z;
-  float v = h < 4 ? y : vt;
-  return ((h & 1) ? -u : u) + ((h & 2) ? -v : v);
+  int X;
+  int Y;
+  int Z;
+
+  float fx = floorfrac(x, &X);
+  float fy = floorfrac(y, &Y);
+  float fz = floorfrac(z, &Z);
+
+  float u = fade(fx);
+  float v = fade(fy);
+  float w = fade(fz);
+
+  float r = tri_mix(grad3(hash_uint3(X, Y, Z), fx, fy, fz),
+                    grad3(hash_uint3(X + 1, Y, Z), fx - 1.0f, fy, fz),
+                    grad3(hash_uint3(X, Y + 1, Z), fx, fy - 1.0f, fz),
+                    grad3(hash_uint3(X + 1, Y + 1, Z), fx - 1.0f, fy - 1.0f, fz),
+                    grad3(hash_uint3(X, Y, Z + 1), fx, fy, fz - 1.0f),
+                    grad3(hash_uint3(X + 1, Y, Z + 1), fx - 1.0

@@ Diff output truncated at 10240 characters. @@