[Bf-blender-cvs] [2214c391062] cycles-x: Cycles X: Switch sample count pass from float to uint
Sergey Sharybin
noreply at git.blender.org
Wed May 12 16:10:46 CEST 2021
Commit: 2214c3910623b6c34652248589e64e75aff0043e
Author: Sergey Sharybin
Date: Mon May 10 15:04:45 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB2214c3910623b6c34652248589e64e75aff0043e
Cycles X: Switch sample count pass from float to uint
This way we will never run into rounding issues when relying on a
per-pixel sample count value.
There is some code duplication between the film conversion and the
pass accessor. Ideally we will de-duplicate the per-pixel processing
logic, but it is better if that happens as a separate refactor.
===================================================================
M intern/cycles/integrator/denoiser_oidn.cpp
M intern/cycles/kernel/device/cuda/kernel.cu
M intern/cycles/kernel/kernel_accumulate.h
M intern/cycles/kernel/kernel_adaptive_sampling.h
M intern/cycles/kernel/kernel_film.h
M intern/cycles/render/pass_accessor.cpp
===================================================================
diff --git a/intern/cycles/integrator/denoiser_oidn.cpp b/intern/cycles/integrator/denoiser_oidn.cpp
index 5097e703dc2..b494edd86b4 100644
--- a/intern/cycles/integrator/denoiser_oidn.cpp
+++ b/intern/cycles/integrator/denoiser_oidn.cpp
@@ -143,7 +143,7 @@ static void oidn_add_pass_if_needed(oidn::FilterRef *oidn_filter,
float pixel_scale = scale;
if (pass_sample_count != PASS_UNUSED) {
- pixel_scale = 1.0f / buffer_pixel[pass_sample_count];
+ pixel_scale = 1.0f / __float_as_uint(buffer_pixel[pass_sample_count]);
}
scaled_row[x * 3 + 0] = pass_pixel[0] * pixel_scale;
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index c38f17b714b..d753932e01b 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -475,7 +475,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
pixel_scale = 1.0f / num_samples;
}
else {
- pixel_scale = 1.0f / buffer[pass_sample_count];
+ pixel_scale = 1.0f / __float_as_uint(buffer[pass_sample_count]);
}
if (num_inputs > 0) {
@@ -535,7 +535,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
pixel_scale = num_samples;
}
else {
- pixel_scale = buffer[pass_sample_count];
+ pixel_scale = __float_as_uint(buffer[pass_sample_count]);
}
buffer[0] = in[0] * pixel_scale;
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 4d893ce57f8..49719f0f681 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -332,7 +332,7 @@ ccl_device_inline int kernel_accum_sample(INTEGRATOR_STATE_CONST_ARGS,
ccl_global float *buffer = kernel_accum_pixel_render_buffer(INTEGRATOR_STATE_PASS,
render_buffer);
- return (int)atomic_add_and_fetch_float(buffer + kernel_data.film.pass_sample_count, 1.0f) - 1;
+ return atomic_fetch_and_add_uint32((uint *)(buffer) + kernel_data.film.pass_sample_count, 1);
}
ccl_device void kernel_accum_adaptive_buffer(INTEGRATOR_STATE_CONST_ARGS,
diff --git a/intern/cycles/kernel/kernel_adaptive_sampling.h b/intern/cycles/kernel/kernel_adaptive_sampling.h
index ae87855c81d..61ae0c60cb0 100644
--- a/intern/cycles/kernel/kernel_adaptive_sampling.h
+++ b/intern/cycles/kernel/kernel_adaptive_sampling.h
@@ -68,7 +68,7 @@ ccl_device bool kernel_adaptive_sampling_convergence_check(const KernelGlobals *
const float4 I = *((ccl_global float4 *)buffer);
- const float sample = buffer[kernel_data.film.pass_sample_count];
+ const float sample = __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
/* The per pixel error as seen in section 2.1 of
* "A hierarchical automatic stopping condition for Monte Carlo global illumination"
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index 09876146031..5d5f55565c7 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -47,7 +47,14 @@ ccl_device float4 film_get_pass_result(const KernelGlobals *kg, ccl_global float
}
else if (display_pass_components == 1) {
ccl_global const float *in = (ccl_global float *)(buffer + display_pass_offset);
- pass_result = make_float4(*in, *in, *in, 0.0f);
+ if (kernel_data.film.pass_sample_count != PASS_UNUSED &&
+ kernel_data.film.pass_sample_count == display_pass_offset) {
+ const float value = __float_as_uint(*in);
+ pass_result = make_float4(value, value, value, 0.0f);
+ }
+ else {
+ pass_result = make_float4(*in, *in, *in, 0.0f);
+ }
}
return pass_result;
@@ -84,7 +91,7 @@ ccl_device void kernel_film_convert_to_half_float(const KernelGlobals *kg,
* meaningful result (rather than becoming uniform buffer filled with 1). */
if (kernel_data.film.pass_sample_count != PASS_UNUSED &&
kernel_data.film.pass_sample_count != kernel_data.film.display_pass_offset) {
- sample_scale = 1.0f / buffer[kernel_data.film.pass_sample_count];
+ sample_scale = 1.0f / __float_as_uint(buffer[kernel_data.film.pass_sample_count]);
}
rgba_in *= sample_scale;
}
diff --git a/intern/cycles/render/pass_accessor.cpp b/intern/cycles/render/pass_accessor.cpp
index 1e93e6e61d6..56818165998 100644
--- a/intern/cycles/render/pass_accessor.cpp
+++ b/intern/cycles/render/pass_accessor.cpp
@@ -41,7 +41,7 @@ class Scaler {
{
/* Special trick to only scale the samples count pass with the sample scale. Otherwise the pass
* becomes a uniform 1.0. */
- if (sample_count_pass_ == pass_buffer) {
+ if (sample_count_pass_ == reinterpret_cast<const uint *>(pass_buffer)) {
sample_count_pass_ = nullptr;
}
@@ -86,15 +86,15 @@ class Scaler {
}
protected:
- const float *get_sample_count_pass(const PassAccessor *pass_accessor,
- const RenderBuffers *render_buffers)
+ const uint *get_sample_count_pass(const PassAccessor *pass_accessor,
+ const RenderBuffers *render_buffers)
{
const int pass_sample_count = pass_accessor->get_pass_offset(PASS_SAMPLE_COUNT);
if (pass_sample_count == PASS_UNUSED) {
return nullptr;
}
- return render_buffers->buffer.data() + pass_sample_count;
+ return reinterpret_cast<const uint *>(render_buffers->buffer.data()) + pass_sample_count;
}
const Pass *pass_;
@@ -103,7 +103,7 @@ class Scaler {
const float num_samples_inv_ = 1.0f;
const float exposure_ = 1.0f;
- const float *sample_count_pass_ = nullptr;
+ const uint *sample_count_pass_ = nullptr;
float scale_ = 0.0f;
float scale_exposure_ = 0.0f;
@@ -248,6 +248,16 @@ bool PassAccessor::get_render_tile_pixels(RenderBuffers *render_buffers, float *
pixels[0] = saturate(1.0f - f * scaler.scale_exposure(i));
}
}
+ else if (type == PASS_SAMPLE_COUNT) {
+ /* TODO(sergey): Consider normalizing into the [0..1] range, so that it is possible to see
+ * meaningful value when adaptive sampler stopped rendering image way before the maximum
+ * number of samples was reached (for examples when number of samples is set to 0 in
+ * viewport). */
+ for (int i = 0; i < size; i++, in += pass_stride, pixels++) {
+ const float f = *in;
+ pixels[0] = __float_as_uint(f) * scaler.scale(i);
+ }
+ }
#ifdef WITH_CYCLES_DEBUG
else if (type == PASS_BVH_TRAVERSED_NODES || type == PASS_BVH_TRAVERSED_INSTANCES ||
type == PASS_BVH_INTERSECTIONS || type == PASS_RAY_BOUNCES) {
More information about the Bf-blender-cvs
mailing list