[Bf-blender-cvs] [829f9c59d1c] cycles-x: Cycles X: Implement float support in PassAccessorGPU

Wed Jun 16 11:00:38 CEST 2021

Commit: 829f9c59d1ce0174e5811d815450ea63ee6358bb
Author: Sergey Sharybin
Date:   Tue Jun 15 14:22:41 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB829f9c59d1ce0174e5811d815450ea63ee6358bb

Cycles X: Implement float support in PassAccessorGPU

Rather straightforward implementation, with some code-generation
macro which takes care of some boiler plate code.

Unfortunately, clang-format does some weird decision in the
`kernel_as_string ` function. Attempt to disable clang-format via
comment makes the comment to be weirdly indented. So this part
is left-as is.

Similar to the PassAccessorCPU there is no overlays applied to
the float result.

Tested with an upcoming refactor of the OptiX denoiser which
uses pass accessor.

No functional changes so far.

===================================================================

M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/integrator/pass_accessor.h
M	intern/cycles/integrator/pass_accessor_gpu.cpp
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/kernel_types.h

===================================================================

diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index 45e73710530..e5e1ff6859d 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -71,31 +71,29 @@ const char *device_kernel_as_string(DeviceKernel kernel)
     case DEVICE_KERNEL_SHADER_EVAL_BACKGROUND:
       return "shader_eval_background";
 
-    /* Film. */
-    case DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA:
-      return "film_convert_depth_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_MIST_HALF_RGBA:
-      return "film_convert_mist_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_SAMPLE_COUNT_HALF_RGBA:
-      return "film_convert_sample_count_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_FLOAT_HALF_RGBA:
-      return "film_convert_float_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_HALF_RGBA:
-      return "film_convert_shadow_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_DIVIDE_EVEN_COLOR_HALF_RGBA:
-      return "film_convert_divide_even_color_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_FLOAT3_HALF_RGBA:
-      return "film_convert_float3_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_MOTION_HALF_RGBA:
-      return "film_convert_motion_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_CRYPTOMATTE_HALF_RGBA:
-      return "film_convert_cryptomatte_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_HALF_RGBA:
-      return "film_convert_shadow_catcher_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_MATTE_WITH_SHADOW_HALF_RGBA:
-      return "film_convert_shadow_catcher_matte_with_shadow_half_rgba";
-    case DEVICE_KERNEL_FILM_CONVERT_FLOAT4_HALF_RGBA:
-      return "film_convert_float4_half_rgba";
+      /* Film. */
+
+#define FILM_CONVERT_KERNEL_AS_STRING(variant, variant_lowercase) \
+  case DEVICE_KERNEL_FILM_CONVERT_##variant: \
+    return "film_convert_" #variant_lowercase; \
+  case DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA: \
+    return "film_convert_" #variant_lowercase "_half_rgba";
+
+      FILM_CONVERT_KERNEL_AS_STRING(DEPTH, depth)
+      FILM_CONVERT_KERNEL_AS_STRING(MIST, mist)
+      FILM_CONVERT_KERNEL_AS_STRING(SAMPLE_COUNT, sample_count)
+      FILM_CONVERT_KERNEL_AS_STRING(FLOAT, float)
+      FILM_CONVERT_KERNEL_AS_STRING(SHADOW, shadow)
+      FILM_CONVERT_KERNEL_AS_STRING(DIVIDE_EVEN_COLOR, divide_even_color)
+      FILM_CONVERT_KERNEL_AS_STRING(FLOAT3, float3)
+      FILM_CONVERT_KERNEL_AS_STRING(MOTION, motion)
+      FILM_CONVERT_KERNEL_AS_STRING(CRYPTOMATTE, cryptomatte)
+      FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER, shadow_catcher)
+      FILM_CONVERT_KERNEL_AS_STRING(SHADOW_CATCHER_MATTE_WITH_SHADOW,
+                                    shadow_catcher_matte_with_shadow)
+      FILM_CONVERT_KERNEL_AS_STRING(FLOAT4, float4)
+
+#undef FILM_CONVERT_KERNEL_AS_STRING
 
     /* Adaptive sampling. */
     case DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK:
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
index 768226e56c9..859f3a65dd0 100644
--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -66,7 +66,8 @@ class PassAccessor {
     half4 *pixels_half_rgba = nullptr;
 
     /* Device-side pointers. */
-    device_ptr d_pixels_half_rgba;
+    device_ptr d_pixels = 0;
+    device_ptr d_pixels_half_rgba = 0;
 
     int num_components = 0;
   };
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
index 0eca8c93883..a62ea250f5a 100644
--- a/intern/cycles/integrator/pass_accessor_gpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -45,7 +45,19 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
 
   const int work_size = buffer_params.width * buffer_params.height;
 
+  if (destination.d_pixels) {
+    void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
+                    const_cast<device_ptr *>(&destination.d_pixels),
+                    const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
+                    const_cast<int *>(&work_size),
+                    const_cast<int *>(&buffer_params.offset),
+                    const_cast<int *>(&buffer_params.stride)};
+
+    queue_->enqueue(kernel, work_size, args);
+  }
   if (destination.d_pixels_half_rgba) {
+    const DeviceKernel kernel_half_float = static_cast<DeviceKernel>(kernel + 1);
+
     void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
                     const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
                     const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
@@ -53,7 +65,7 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
                     const_cast<int *>(&buffer_params.offset),
                     const_cast<int *>(&buffer_params.stride)};
 
-    queue_->enqueue(kernel, work_size, args);
+    queue_->enqueue(kernel_half_float, work_size, args);
   }
 
   queue_->synchronize();
@@ -68,10 +80,8 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
                                         const BufferParams &buffer_params, \
                                         const Destination &destination) const \
   { \
-    run_film_convert_kernels(DEVICE_KERNEL_FILM_CONVERT_##kernel_pass##_HALF_RGBA, \
-                             render_buffers, \
-                             buffer_params, \
-                             destination); \
+    run_film_convert_kernels( \
+        DEVICE_KERNEL_FILM_CONVERT_##kernel_pass, render_buffers, buffer_params, destination); \
   }
 
 /* Float (scalar) passes. */
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index 3cedaa3623a..5327c64893f 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -441,6 +441,28 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
  * Film.
  */
 
+/* Common implementation for float destination. */
+template<typename Processor>
+ccl_device_inline void kernel_cuda_film_convert_common(const KernelFilmConvert *kfilm_convert,
+                                                       float *pixels,
+                                                       float *render_buffer,
+                                                       int num_pixels,
+                                                       int offset,
+                                                       int stride,
+                                                       const Processor &processor)
+{
+  const int render_pixel_index = ccl_global_id(0);
+  if (render_pixel_index >= num_pixels) {
+    return;
+  }
+
+  const uint64_t render_buffer_offset = (uint64_t)render_pixel_index * kfilm_convert->pass_stride;
+  ccl_global const float *buffer = render_buffer + render_buffer_offset;
+  ccl_global float *pixel = pixels + render_pixel_index * kfilm_convert->num_components;
+
+  processor(kfilm_convert, buffer, pixel);
+}
+
 /* Common implementation for half4 destination and 4-channel input pass. */
 template<typename Processor>
 ccl_device_inline void kernel_cuda_film_convert_half_rgba_common_rgba(
@@ -531,6 +553,22 @@ ccl_device_inline void kernel_cuda_film_convert_half_rgba_common_value(
                                                   CUDA_KERNEL_MAX_REGISTERS) name
 
 #  define KERNEL_FILM_CONVERT_DEFINE_HALF_RGBA(variant, channels) \
+    KERNEL_FILM_CONVERT_PROC(kernel_cuda_film_convert_##variant) \
+    (const KernelFilmConvert kfilm_convert, \
+     float *pixels, \
+     float *render_buffer, \
+     int num_pixels, \
+     int offset, \
+     int stride) \
+    { \
+      kernel_cuda_film_convert_common(&kfilm_convert, \
+                                      pixels, \
+                                      render_buffer, \
+                                      num_pixels, \
+                                      offset, \
+                                      stride, \
+                                      film_get_pass_pixel_##variant); \
+    } \
     KERNEL_FILM_CONVERT_PROC(kernel_cuda_film_convert_##variant##_half_rgba) \
     (const KernelFilmConvert kfilm_convert, \
      uchar4 *rgba, \
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 5bf9c76d7e1..db313157a2b 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1431,18 +1431,23 @@ typedef enum DeviceKernel {
   DEVICE_KERNEL_SHADER_EVAL_DISPLACE,
   DEVICE_KERNEL_SHADER_EVAL_BACKGROUND,
 
-  DEVICE_KERNEL_FILM_CONVERT_DEPTH_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_MIST_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_SAMPLE_COUNT_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_FLOAT_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_SHADOW_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_DIVIDE_EVEN_COLOR_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_FLOAT3_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_MOTION_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_CRYPTOMATTE_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_SHADOW_CATCHER_MATTE_WITH_SHADOW_HALF_RGBA,
-  DEVICE_KERNEL_FILM_CONVERT_FLOAT4_HALF_RGBA,
+#define DECLARE_FILM_CONVERT_KERNEL(variant) \
+  DEVICE_KERNEL_FILM_CONVERT_##variant, DEVICE_KERNEL_FILM_CONVERT_##variant##_HALF_RGBA
+
+  DECLARE_FILM_CONVERT_KERNEL(DEPTH),
+  DECLARE_FILM_CONVERT_KERNEL(MIST),
+  DECLARE_FILM_CONVERT_KERNEL(SAMPLE_COUNT),
+  DECLARE_FILM_CONVERT_KERNEL(FLOAT),
+  DECLARE_FILM_CONVERT_KERNEL(SHADOW),
+  DECLARE_FILM_CONVERT_KERNEL(DIVIDE_EVEN_COLOR),
+  DECLARE_FILM_CONVERT_KERNEL(FLOAT3),
+  DECLARE_FILM_CONVERT_KERNEL(MOTION),
+  DECLARE_FILM_CONVERT_KERNEL(CRYPTOMATTE),
+  DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER),
+  DECLARE_FILM_CONVERT_KERNEL(SHADOW_CATCHER_MATTE_WITH_SHADOW),
+  DECLARE_FILM_CONVERT_KERNEL(FLOAT4),
+
+#undef DECLARE_FILM_CONVERT_KERNEL
 
   DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_CHECK,
   DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_X,