[Bf-blender-cvs] [c6275da852e] master: Fix T91922: Cycles artifacts with high volume nested level

Sergey Sharybin noreply at git.blender.org
Wed Oct 6 15:59:34 CEST 2021


Commit: c6275da852eab77e2cea1ae601a43a2dbaad6c27
Author: Sergey Sharybin
Date:   Tue Oct 5 15:05:12 2021 +0200
Branches: master
https://developer.blender.org/rBc6275da852eab77e2cea1ae601a43a2dbaad6c27

Fix T91922: Cycles artifacts with high volume nested level

Make the volume stack allocated conditionally, potentially based on
the actual nested level of objects in the scene.

Currently the nested level is estimated by the number of volume objects.
This is an inexpensive check which is probably enough in practice
to get almost perfect memory usage and performance.

The conditional allocation is a bit tricky.

For the CPU we declare and define the maximum possible volume stack,
because there are only so many integrator states on the CPU.

On the GPU we declare the outer SoA to have all volume stack elements,
but only allocate the ones that are actually needed. The volume stack
size actually in use is passed as a pre-processor define, which seems
to be the easiest and fastest approach for the GPU state copy.

There seems to be no speed regression in the demo files on RTX6000.

Note that scenes with a high nested level of volumes will now be slower
but correct.

Differential Revision: https://developer.blender.org/D12759

===================================================================

M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
M	intern/cycles/kernel/integrator/integrator_state.h
M	intern/cycles/kernel/integrator/integrator_state_template.h
M	intern/cycles/kernel/integrator/integrator_state_util.h
M	intern/cycles/kernel/integrator/integrator_volume_stack.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/render/graph.cpp
M	intern/cycles/render/object.cpp
M	intern/cycles/render/object.h
M	intern/cycles/render/scene.cpp
M	intern/cycles/render/scene.h

===================================================================

diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index c29b0fb039e..8af8f9a02e2 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -23,6 +23,7 @@
 #include "render/buffers.h"
 #include "render/scene.h"
 #include "util/util_logging.h"
+#include "util/util_string.h"
 #include "util/util_tbb.h"
 #include "util/util_time.h"
 
@@ -30,7 +31,7 @@
 
 CCL_NAMESPACE_BEGIN
 
-static size_t estimate_single_state_size()
+static size_t estimate_single_state_size(DeviceScene *device_scene)
 {
   size_t state_size = 0;
 
@@ -45,12 +46,14 @@ static size_t estimate_single_state_size()
     break; \
   } \
   }
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene->data.volume_stack_size)
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 
   return state_size;
 }
@@ -72,7 +75,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
       num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
       work_tiles_(device, "work_tiles", MEM_READ_WRITE),
       display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
-      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size())),
+      max_num_paths_(queue_->num_concurrent_states(estimate_single_state_size(device_scene))),
       min_num_active_paths_(queue_->num_concurrent_busy_states()),
       max_active_path_index_(0)
 {
@@ -125,12 +128,23 @@ void PathTraceWorkGPU::alloc_integrator_soa()
     break; \
   } \
   }
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE (device_scene_->data.volume_stack_size)
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
+
+  if (VLOG_IS_ON(3)) {
+    size_t total_soa_size = 0;
+    for (auto &&soa_memory : integrator_state_soa_) {
+      total_soa_size += soa_memory->memory_size();
+    }
+
+    VLOG(3) << "GPU SoA state size: " << string_human_readable_size(total_soa_size);
+  }
 }
 
 void PathTraceWorkGPU::alloc_integrator_queue()
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
index 60d8a8e3e54..99f6cf35e9e 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_volume_stack.h
@@ -38,10 +38,13 @@ ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_A
   volume_ray.P = from_P;
   volume_ray.D = normalize_len(to_P - from_P, &volume_ray.t);
 
+  /* Store to avoid global fetches on every intersection step. */
+  const uint volume_stack_size = kernel_data.volume_stack_size;
+
 #ifdef __VOLUME_RECORD_ALL__
-  Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+  Intersection hits[2 * volume_stack_size + 1];
   uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, PATH_RAY_ALL_VISIBILITY);
+      kg, &volume_ray, hits, 2 * volume_stack_size, PATH_RAY_ALL_VISIBILITY);
   if (num_hits > 0) {
     Intersection *isect = hits;
 
@@ -55,7 +58,7 @@ ccl_device void integrator_volume_stack_update_for_subsurface(INTEGRATOR_STATE_A
 #else
   Intersection isect;
   int step = 0;
-  while (step < 2 * VOLUME_STACK_SIZE &&
+  while (step < 2 * volume_stack_size &&
          scene_intersect_volume(kg, &volume_ray, &isect, PATH_RAY_ALL_VISIBILITY)) {
     shader_setup_from_ray(kg, stack_sd, &volume_ray, &isect);
     volume_stack_enter_exit(INTEGRATOR_STATE_PASS, stack_sd);
@@ -91,12 +94,15 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
     stack_index++;
   }
 
+  /* Store to avoid global fetches on every intersection step. */
+  const uint volume_stack_size = kernel_data.volume_stack_size;
+
 #ifdef __VOLUME_RECORD_ALL__
-  Intersection hits[2 * VOLUME_STACK_SIZE + 1];
+  Intersection hits[2 * volume_stack_size + 1];
   uint num_hits = scene_intersect_volume_all(
-      kg, &volume_ray, hits, 2 * VOLUME_STACK_SIZE, visibility);
+      kg, &volume_ray, hits, 2 * volume_stack_size, visibility);
   if (num_hits > 0) {
-    int enclosed_volumes[VOLUME_STACK_SIZE];
+    int enclosed_volumes[volume_stack_size];
     Intersection *isect = hits;
 
     qsort(hits, num_hits, sizeof(Intersection), intersections_compare);
@@ -121,7 +127,7 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
             break;
           }
         }
-        if (need_add && stack_index < VOLUME_STACK_SIZE - 1) {
+        if (need_add && stack_index < volume_stack_size - 1) {
           const VolumeStack new_entry = {stack_sd->object, stack_sd->shader};
           integrator_state_write_volume_stack(INTEGRATOR_STATE_PASS, stack_index, new_entry);
           ++stack_index;
@@ -136,11 +142,12 @@ ccl_device void integrator_intersect_volume_stack(INTEGRATOR_STATE_ARGS)
     }
   }
 #else
-  int enclosed_volumes[VOLUME_STACK_SIZE];
+  /* CUDA does not support definition of variable-size arrays, so use the maximum possible. */
+  int enclosed_volumes[MAX_VOLUME_STACK_SIZE];
   int step = 0;
 
-  while (stack_index < VOLUME_STACK_SIZE - 1 && enclosed_index < VOLUME_STACK_SIZE - 1 &&
-         step < 2 * VOLUME_STACK_SIZE) {
+  while (stack_index < volume_stack_size - 1 && enclosed_index < volume_stack_size - 1 &&
+         step < 2 * volume_stack_size) {
     Intersection isect;
     if (!scene_intersect_volume(kg, &volume_ray, &isect, visibility)) {
       break;
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index f745ad3f4b9..efc7576d95b 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -59,8 +59,6 @@ CCL_NAMESPACE_BEGIN
  *
  * TODO: these could be made dynamic depending on the features used in the scene. */
 
-#define INTEGRATOR_VOLUME_STACK_SIZE VOLUME_STACK_SIZE
-
 #define INTEGRATOR_SHADOW_ISECT_SIZE_CPU 1024
 #define INTEGRATOR_SHADOW_ISECT_SIZE_GPU 4
 
@@ -85,12 +83,14 @@ typedef struct IntegratorStateCPU {
 #define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
   } \
   name[cpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 } IntegratorStateCPU;
 
 /* Path Queue
@@ -114,12 +114,14 @@ typedef struct IntegratorStateGPU {
 #define KERNEL_STRUCT_END_ARRAY(name, cpu_size, gpu_size) \
   } \
   name[gpu_size];
+#define KERNEL_STRUCT_VOLUME_STACK_SIZE MAX_VOLUME_STACK_SIZE
 #include "kernel/integrator/integrator_state_template.h"
 #undef KERNEL_STRUCT_BEGIN
 #undef KERNEL_STRUCT_MEMBER
 #undef KERNEL_STRUCT_ARRAY_MEMBER
 #undef KERNEL_STRUCT_END
 #undef KERNEL_STRUCT_END_ARRAY
+#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
 
   /* Count number of queued kernels. */
   IntegratorQueueCounter *queue_counter;
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
index 0d8126c64aa..15998ee6edf 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -107,7 +107,9 @@ KERNEL_STRUCT_END(subsurface)
 KERNEL_STRUCT_BEGIN(volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
-KERNEL_STRUCT_END_ARRAY(volume_stack, INTEGRATOR_VOLUME_STACK_SIZE, INTEGRATOR_VOLUME_STACK_SIZE)
+KERNEL_STRUCT_END_ARRAY(volume_stack,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE)
 
 /********************************* Shadow Path State **************************/
 
@@ -163,5 +165,5 @@ KERNEL_STRUCT_BEGIN(shadow_volume_stack)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, object, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_ARRAY_MEMBER(shadow_volume_stack, int, shader, KERNEL_FEATURE_VOLUME)
 KERNEL_STRUCT_END_ARRAY(shadow_volume_stack,
-                        INTEGRATOR_VOLUME_STACK_SIZE,
-                        INTEGRATOR_VOLUME_STACK_SIZE)
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE,
+                        KERNEL_STRUCT_VOLUME_STACK_SIZE)
diff --git a/intern/cycles/kernel/integrator/integrator_state_util.h b/intern/cycles/kernel/integrator/integrator_state_util.h
index 08d6cb00114..453ec49c7b0 100644
--- a/intern/cycles/kernel/integrator/integrator_state_util.h
+++ b/intern/cycles/kernel/integrator/integrator_state_util.h
@@ -155,7 +155,7 @@ ccl_device_forceinline void integrator_state_read_shadow_isect(INTEGRATOR_STATE_
 ccl_device_forceinline void integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_ARGS)
 {
   if (kernel_data.kernel_features & KERNEL_FEATURE_VOLUME) {
-    for (int i = 0; i < INTEGRATOR_VOLUME_STACK_SIZE; i++) {
+    for (int i = 0; i < kernel_data.volume_stack_size; i++) {
       INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, object) = INTEGRATOR_STATE_ARRAY(
           volume_stack, i, object);
       INTEGRATOR_STATE_ARRAY_WRITE(shadow_volume_stack, i, shader) = INTEGRATOR_STATE_ARRAY(
@@ -223,6 +223,8 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
     while (index < gpu_array_size) \
       ;
 
+#  define KERNEL_STRUCT_VOLUME_STACK_SIZE kernel_data.volume_stack_size
+
 #  include "kernel/integrator/integrator_state_template.h"
 
 #  undef KERNEL_STRUCT_BEGIN
@@ -230,6 +232,7 @@ ccl_device_inline void integrator_state_copy_only(const IntegratorState to_state
 #  undef KERNEL_STRUCT_ARRAY_MEMBER
 #  undef KERNEL_STRUCT_END
 #  undef KERNEL_STRUCT_END_ARRAY
+#  undef KERNEL_

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list