[Bf-blender-cvs] [be785ef4f78] cycles-x: Cycles X: refactoring of kernel globals

Fri May 7 15:12:32 CEST 2021

Commit: be785ef4f78e3dd7d51963f249177b1c2a8dc5d6
Author: Brecht Van Lommel
Date:   Thu May 6 20:06:57 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBbe785ef4f78e3dd7d51963f249177b1c2a8dc5d6

Cycles X: refactoring of kernel globals

* Declare kernel globals and associated macros in one place
* Add IntegratorStateGPU for CPU host to access
* Reduce code duplication between CUDA and OptiX
* Make shader sort key part of integrator state template
* Remove unused CPU kernel globals
* Prepare for copying/moving states
* Rename integrator_path_state.h to integrator_state_flow.h, to avoid
  confusion with kernel_path_state.h.

Some of these changes were implemented by Sergey for D11172.

Differential Revision: https://developer.blender.org/D11185

===================================================================

M	intern/cycles/device/cpu/kernel_thread_globals.cpp
M	intern/cycles/device/optix/device_impl.cpp
M	intern/cycles/device/optix/device_impl.h
M	intern/cycles/device/optix/queue.cpp
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/device/cpu/compat.h
M	intern/cycles/kernel/device/cpu/globals.h
M	intern/cycles/kernel/device/cpu/kernel_arch_impl.h
M	intern/cycles/kernel/device/cuda/compat.h
M	intern/cycles/kernel/device/cuda/globals.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/device/cuda/parallel_sorted_index.h
M	intern/cycles/kernel/device/optix/compat.h
M	intern/cycles/kernel/device/optix/globals.h
M	intern/cycles/kernel/device/optix/kernel.cu
D	intern/cycles/kernel/integrator/integrator_path_state.h
M	intern/cycles/kernel/integrator/integrator_state.h
A	intern/cycles/kernel/integrator/integrator_state_flow.h
M	intern/cycles/kernel/integrator/integrator_state_template.h

===================================================================

diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
index eff15af5ff1..f0089e34a7a 100644
--- a/intern/cycles/device/cpu/kernel_thread_globals.cpp
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -23,22 +23,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-/* TODO(sergey): Consider making more available function. Maybe `util_memory.h`? */
-static void safe_free(void *mem)
-{
-  if (mem == nullptr) {
-    return;
-  }
-  free(mem);
-}
-
-/* Get number of elements in a bound array. */
-/* TODO(sergey): Make this function more re-usable. */
-template<class T, int N> constexpr inline int ARRAY_SIZE(T (&/*array*/)[N])
-{
-  return N;
-}
-
 CPUKernelThreadGlobals::CPUKernelThreadGlobals()
 {
   reset_runtime_memory();
@@ -50,7 +34,6 @@ CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globa
 {
   reset_runtime_memory();
 
-  decoupled_volume_steps_index = 0;
   coverage_asset = nullptr;
   coverage_object = nullptr;
   coverage_material = nullptr;
@@ -70,12 +53,6 @@ CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) n
 
 CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
 {
-  safe_free(transparent_shadow_intersections);
-
-  const int decoupled_count = ARRAY_SIZE(decoupled_volume_steps);
-  for (int i = 0; i < decoupled_count; ++i) {
-    safe_free(decoupled_volume_steps[i]);
-  }
 #ifdef WITH_OSL
   OSLShader::thread_free(this);
 #endif
@@ -96,13 +73,9 @@ CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals
 
 void CPUKernelThreadGlobals::reset_runtime_memory()
 {
-  transparent_shadow_intersections = nullptr;
-
 #ifdef WITH_OSL
   osl = nullptr;
 #endif
-
-  memset(decoupled_volume_steps, 0, sizeof(decoupled_volume_steps));
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index c0ee4f143b8..5843d88ce88 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -97,7 +97,7 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
 #  endif
 
   /* Fix weird compiler bug that assigns wrong size. */
-  launch_params.data_elements = sizeof(KernelParams);
+  launch_params.data_elements = sizeof(KernelParamsOptiX);
 
   /* Allocate launch parameter buffer memory on device. */
   launch_params.alloc_to_device(1);
@@ -1387,20 +1387,17 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
     KernelData *const data = (KernelData *)host;
     *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
 
-    update_launch_params(offsetof(KernelParams, data), host, size);
+    update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
     return;
   }
 
   /* Update data storage pointers in launch parameters. */
 #  define KERNEL_TEX(data_type, tex_name) \
     if (strcmp(name, #tex_name) == 0) { \
-      update_launch_params(offsetof(KernelParams, tex_name), host, size); \
+      update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
       return; \
     }
   KERNEL_TEX(IntegratorState, __integrator_state)
-  KERNEL_TEX(IntegratorQueueCounter *, __integrator_queue_counter)
-  KERNEL_TEX(int *, __integrator_sort_key)
-  KERNEL_TEX(int *, __integrator_sort_key_counter)
 #  include "kernel/kernel_textures.h"
 #  undef KERNEL_TEX
 }
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index ba606075c79..a4b75a16354 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -26,7 +26,7 @@
 CCL_NAMESPACE_BEGIN
 
 class BVHOptiX;
-struct KernelParams;
+struct KernelParamsOptiX;
 
 /* List of OptiX program groups. */
 enum {
@@ -64,7 +64,7 @@ class OptiXDevice : public CUDADevice {
 
   bool motion_blur = false;
   device_vector<SbtRecord> sbt_data;
-  device_only_memory<KernelParams> launch_params;
+  device_only_memory<KernelParamsOptiX> launch_params;
   OptixTraversableHandle tlas_handle = 0;
 
   class Denoiser {
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 211df631bcb..59203dedb35 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -69,16 +69,17 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
 
   cuda_device_assert(
       cuda_device_,
-      cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, path_index_array),
+      cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
                         args[0],  // &d_path_index
                         sizeof(device_ptr),
                         cuda_stream_));
   if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-    cuda_device_assert(cuda_device_,
-                       cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, render_buffer),
-                                         args[1],  // &d_render_buffer
-                                         sizeof(device_ptr),
-                                         cuda_stream_));
+    cuda_device_assert(
+        cuda_device_,
+        cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+                          args[1],  // &d_render_buffer
+                          sizeof(device_ptr),
+                          cuda_stream_));
   }
 
   cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 8991bfa0c63..0289c634ae6 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -38,7 +38,6 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       queue_(device->gpu_queue_create()),
       render_buffers_(buffers),
       integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
-      integrator_sort_key_(device, "integrator_sort_key", MEM_READ_WRITE),
       integrator_sort_key_counter_(device, "integrator_sort_key_counter", MEM_READ_WRITE),
       queued_paths_(device, "queued_paths", MEM_READ_WRITE),
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
@@ -47,36 +46,42 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorState))),
       max_active_path_index_(0)
 {
+  memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
   work_tile_scheduler_.set_max_num_path_states(max_num_paths_);
 }
 
-void PathTraceWorkGPU::alloc_integrator_state()
+void PathTraceWorkGPU::alloc_integrator_soa()
 {
   /* IntegrateState allocated as structure of arrays.
    *
    * Allocate a device only memory buffer before for each struct member, and then
    * write the pointers into a struct that resides in constant memory.
    *
-   * This assumes the device side struct memory contains consecutive pointers for
-   * each struct member, with the same 64-bit size as device_ptr.
-   *
-   * TODO: store float3 in separate XYZ arrays. */
+   * TODO: store float3 in separate XYZ arrays.
+   * TODO: skip zeroing most arrays and leave uninitialized. */
+
   if (!integrator_state_soa_.empty()) {
     return;
   }
 
-  vector<device_ptr> device_struct;
-
 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(type, name) \
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) \
+  { \
+    device_only_memory<type> *array = new device_only_memory<type>(device_, \
+                                                                   "integrator_state_" #name); \
+    array->alloc_to_device(max_num_paths_); \
+    array->zero_to_device(); \
+    integrator_state_soa_.emplace_back(array); \
+    integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+  }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name) \
   { \
     device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                    "integrator_state_" #name); \
     array->alloc_to_device(max_num_paths_); \
-    /* TODO: skip for most arrays. */ \
     array->zero_to_device(); \
-    device_struct.push_back(array->device_pointer); \
     integrator_state_soa_.emplace_back(array); \
+    integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
   }
 #define KERNEL_STRUCT_END(name) \
   break; \
@@ -89,12 +94,9 @@ void PathTraceWorkGPU::alloc_integrator_state()
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
-
-  /* Copy to device side struct in constant memory. */
-  device_->const_copy_to(
-      "__integrator_state", device_struct.data(), device_struct.size() * sizeof(device_ptr));
 }
 
 void PathTraceWorkGPU::alloc_integrator_queue()
@@ -103,11 +105,8 @@ void PathTraceWorkGPU::alloc_integrator_queue()
     integrator_queue_counter_.alloc(1);
     integrator_queue_counter_.zero_to_device();
     integrator_queue_counter_.copy_from_device();
-
-    /* Copy to device side pointer in constant memory. */
-    device_->const_copy_to("__integrator_queue_counter",
-                           &integrator_queue_counter_.device_pointer,
-                           sizeof(device_ptr));
+    integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+                                              integrator_queue_counter_.device_pointer;
   }
 
   /* Allocate data for active path index arrays. */
@@ -126,21 +125,11 @@ void PathTraceWorkGPU::alloc_integrator_queue()
 void PathTraceWorkGPU::alloc_integrator_sorting()
 {
   /* Allocate arrays for shader sorting. */
-  if (integrator_sort_key_counter_.size() == 0) {
-    integrator_sort_key_.alloc(max_num_paths_);
-    /* TODO: this could be skip if we had a function to just allocate on device. */
-    integrator_sort_key_.zero_to_device();
-    device_->const_copy_to(
-        "__integrator_sort_key", &integrator_

@@ Diff output truncated at 10240 characters. @@