[Bf-blender-cvs] [87194d0c0a3] cycles-x: Cycles X: reduce GPU state memory usage when some features are not enabled

Brecht Van Lommel noreply at git.blender.org
Wed Jul 14 17:50:46 CEST 2021


Commit: 87194d0c0a30cefe326212cb1cf56f1706c9a915
Author: Brecht Van Lommel
Date:   Wed Jul 14 13:10:24 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB87194d0c0a30cefe326212cb1cf56f1706c9a915

Cycles X: reduce GPU state memory usage when some features are not enabled

This applies in particular to volumes, subsurface, denoising and light passes.

In a scene without these features, state memory usage drops from 538MB to
346MB. This also improves performance, presumably due to reduced memory
traffic.

Differential Revision: https://developer.blender.org/D11915

===================================================================

M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/integrator/integrator_shade_surface.h
M	intern/cycles/kernel/integrator/integrator_shade_volume.h
M	intern/cycles/kernel/integrator/integrator_state.h
M	intern/cycles/kernel/integrator/integrator_state_template.h
M	intern/cycles/kernel/integrator/integrator_state_util.h
M	intern/cycles/kernel/integrator/integrator_subsurface.h
M	intern/cycles/kernel/kernel_path_state.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/render/film.cpp
M	intern/cycles/render/film.h
M	intern/cycles/render/scene.cpp

===================================================================

diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 35c85166252..b0ba7054543 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -35,6 +35,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
                                    bool *cancel_requested_flag)
     : PathTraceWork(device, device_scene, cancel_requested_flag),
       queue_(device->gpu_queue_create()),
+      integrator_state_soa_kernel_features_(0),
       integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
       integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
       integrator_shader_raytrace_sort_counter_(
@@ -57,28 +58,31 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
 
 void PathTraceWorkGPU::alloc_integrator_soa()
 {
-  /* IntegrateState allocated as structure of arrays.
-   *
-   * Allocate a device only memory buffer before for each struct member, and then
-   * write the pointers into a struct that resides in constant memory.
-   *
-   * TODO: store float3 in separate XYZ arrays. */
+  /* IntegrateState allocated as structure of arrays. */
 
-  if (!integrator_state_soa_.empty()) {
+  /* Check if we already allocated memory for the required features. */
+  const uint kernel_features = device_scene_->data.kernel_features;
+  if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
     return;
   }
+  integrator_state_soa_kernel_features_ = kernel_features;
 
+  /* Allocate a device only memory buffer before for each struct member, and then
+   * write the pointers into a struct that resides in constant memory.
+   *
+   * TODO: store float3 in separate XYZ arrays. */
 #define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) \
-  { \
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+  if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
     device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                    "integrator_state_" #name); \
     array->alloc_to_device(max_num_paths_); \
     integrator_state_soa_.emplace_back(array); \
     integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
   }
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name) \
-  { \
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+  if ((kernel_features & feature) && \
+      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
     device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                    "integrator_state_" #name); \
     array->alloc_to_device(max_num_paths_); \
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 69475ed0a1b..76018d8501f 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -117,6 +117,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   IntegratorStateGPU integrator_state_gpu_;
   /* SoA arrays for integrator state. */
   vector<unique_ptr<device_memory>> integrator_state_soa_;
+  uint integrator_state_soa_kernel_features_;
   /* Keep track of number of queued kernels. */
   device_vector<IntegratorQueueCounter> integrator_queue_counter_;
   /* Shader sorting. */
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index b44b85b7b60..eefbcad87de 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -176,14 +176,18 @@ ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS
   uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
   shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
-  const float3 diffuse_glossy_ratio = (bounce == 0) ? bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
-                                                      INTEGRATOR_STATE(path, diffuse_glossy_ratio);
   const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval);
 
+  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    const float3 diffuse_glossy_ratio = (bounce == 0) ?
+                                            bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
+                                            INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+    INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+  }
+
   INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
   INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
   INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
-  INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
   INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
 
   /* Branch off shadow kernel. */
@@ -242,9 +246,12 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE
   float3 throughput = INTEGRATOR_STATE(path, throughput);
   throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
   INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
-  if (INTEGRATOR_STATE(path, bounce) == 0) {
-    INTEGRATOR_STATE_WRITE(path,
-                           diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+
+  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    if (INTEGRATOR_STATE(path, bounce) == 0) {
+      INTEGRATOR_STATE_WRITE(path,
+                             diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+    }
   }
 
   /* Update path state */
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 55db3fe4f02..c95ac43168c 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -584,14 +584,18 @@ ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS,
   uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
   shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
   shadow_flag |= PATH_RAY_VOLUME_PASS;
-  const float3 diffuse_glossy_ratio = (bounce == 0) ? one_float3() :
-                                                      INTEGRATOR_STATE(path, diffuse_glossy_ratio);
   const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&phase_eval);
 
+  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    const float3 diffuse_glossy_ratio = (bounce == 0) ?
+                                            one_float3() :
+                                            INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+    INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+  }
+
   INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
   INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
   INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
-  INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
   INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
 
   integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
@@ -636,7 +640,10 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS
   float3 throughput = INTEGRATOR_STATE(path, throughput);
   throughput *= bsdf_eval_sum(&phase_eval) / phase_pdf;
   INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
-  INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+
+  if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+    INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+  }
 
   /* Update path state */
   INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf;
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 18cb362799f..6eb3153825b 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -69,7 +69,7 @@ CCL_NAMESPACE_BEGIN
  * CPU rendering path state with AoS layout. */
 typedef struct IntegratorState {
 #define KERNEL_STRUCT_BEGIN(name) struct {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) type name;
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
 #define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
 #define KERNEL_STRUCT_END(name) \
   } \
@@ -98,7 +98,7 @@ typedef struct IntegratorQueueCounter {
  * GPU rendering path state with SoA layout. */
 typedef struct IntegratorStateGPU {
 #define KERNEL_STRUCT_BEGIN(name) struct {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) type *name;
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name;
 #define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
 #define KERNEL_STRUCT_END(name) \
   } \
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
index a8fe35bc36b..36a42d30dea 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -23,115 +23,115 @@ KERNEL_STRUCT_BEGIN(path)
  * `kernel_data.film.pass_stride`.
  *
  * The multiplication is delayed for later, so that state can use 32bit integer. */
-KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index)
+KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
 /* Current sample number. */
-KERNEL_STRUCT_MEMBER(path, uint16_t, sample)
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
 /* Current ray bounce depth. */
-KERNEL_STRUCT_MEMBER(path, uint16_t, bounce)
+KERNEL_STRUCT_MEMB

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list