[Bf-blender-cvs] [b41c72b710d] blender-v3.0-release: Fix performance decrease with Scrambling Distance on

Thu Nov 25 09:37:58 CET 2021

Commit: b41c72b710d4013fd6d67dc49a8ebb2a416b4462
Author: Alaska
Date:   Thu Nov 25 09:20:28 2021 +0100
Branches: blender-v3.0-release
https://developer.blender.org/rBb41c72b710d4013fd6d67dc49a8ebb2a416b4462

Fix performance decrease with Scrambling Distance on

With the current code in master, devices without hardware accelerated ray tracing see a measurable performance decrease when scrambling distance is enabled, compared to when it is off. From testing, this performance decrease comes from the large tile sizes scheduled in `tile.cpp`.

This patch attempts to address the performance decrease by using different algorithms to calculate the tile size for devices with hardware accelerated ray traversal and devices without. Large tile sizes are used for hardware accelerated devices and small tile sizes for all others.

Most of this code is based on proposals from @brecht and @leesonw

Reviewed By: brecht, leesonw

Differential Revision: https://developer.blender.org/D13042

===================================================================

M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/tile.cpp
M	intern/cycles/integrator/tile.h
M	intern/cycles/integrator/work_tile_scheduler.cpp
M	intern/cycles/integrator/work_tile_scheduler.h
M	intern/cycles/kernel/device/gpu/work_stealing.h
M	intern/cycles/test/integrator_tile_test.cpp

===================================================================

diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index b9784f68f56..aff21ef59bb 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -257,7 +257,8 @@ void PathTraceWorkGPU::render_samples(RenderStatistics &statistics,
    * become busy after adding new tiles). This is especially important for the shadow catcher which
    * schedules work in halves of available number of paths. */
   work_tile_scheduler_.set_max_num_path_states(max_num_paths_ / 8);
-
+  work_tile_scheduler_.set_accelerated_rt((device_->get_bvh_layout_mask() & BVH_LAYOUT_OPTIX) !=
+                                          0);
   work_tile_scheduler_.reset(effective_buffer_params_,
                              start_sample,
                              samples_num,
diff --git a/intern/cycles/integrator/tile.cpp b/intern/cycles/integrator/tile.cpp
index 4a1558cce09..e9a3cbd38aa 100644
--- a/intern/cycles/integrator/tile.cpp
+++ b/intern/cycles/integrator/tile.cpp
@@ -46,7 +46,8 @@ ccl_device_inline uint round_up_to_power_of_two(uint x)
   return next_power_of_two(x);
 }
 
-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                   const int num_samples,
                                   const int max_num_path_states,
                                   const float scrambling_distance)
@@ -73,7 +74,7 @@ TileSize tile_calculate_best_size(const int2 &image_size,
 
   TileSize tile_size;
   const int num_path_states_per_sample = max_num_path_states / num_samples;
-  if (scrambling_distance < 0.9f) {
+  if (scrambling_distance < 0.9f && accel_rt) {
     /* Prefer large tiles for scrambling distance, bounded by max num path states. */
     tile_size.width = min(image_size.x, max_num_path_states);
     tile_size.height = min(image_size.y, max(max_num_path_states / tile_size.width, 1));
diff --git a/intern/cycles/integrator/tile.h b/intern/cycles/integrator/tile.h
index 61f7d736115..05b1e0af6b1 100644
--- a/intern/cycles/integrator/tile.h
+++ b/intern/cycles/integrator/tile.h
@@ -49,7 +49,8 @@ std::ostream &operator<<(std::ostream &os, const TileSize &tile_size);
  * of active path states.
  * Will attempt to provide best guess to keep path tracing threads of a device as localized as
  * possible, and have as many threads active for every tile as possible. */
-TileSize tile_calculate_best_size(const int2 &image_size,
+TileSize tile_calculate_best_size(const bool accel_rt,
+                                  const int2 &image_size,
                                   const int num_samples,
                                   const int max_num_path_states,
                                   const float scrambling_distance);
diff --git a/intern/cycles/integrator/work_tile_scheduler.cpp b/intern/cycles/integrator/work_tile_scheduler.cpp
index 2d1ac07db7f..cac573dfeda 100644
--- a/intern/cycles/integrator/work_tile_scheduler.cpp
+++ b/intern/cycles/integrator/work_tile_scheduler.cpp
@@ -28,6 +28,11 @@ WorkTileScheduler::WorkTileScheduler()
 {
 }
 
+void WorkTileScheduler::set_accelerated_rt(bool accelerated_rt)
+{
+  accelerated_rt_ = accelerated_rt;
+}
+
 void WorkTileScheduler::set_max_num_path_states(int max_num_path_states)
 {
   max_num_path_states_ = max_num_path_states;
@@ -59,7 +64,7 @@ void WorkTileScheduler::reset(const BufferParams &buffer_params,
 void WorkTileScheduler::reset_scheduler_state()
 {
   tile_size_ = tile_calculate_best_size(
-      image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
+      accelerated_rt_, image_size_px_, samples_num_, max_num_path_states_, scrambling_distance_);
 
   VLOG(3) << "Will schedule tiles of size " << tile_size_;
 
diff --git a/intern/cycles/integrator/work_tile_scheduler.h b/intern/cycles/integrator/work_tile_scheduler.h
index d9fa7e84431..8aa2f8e90bd 100644
--- a/intern/cycles/integrator/work_tile_scheduler.h
+++ b/intern/cycles/integrator/work_tile_scheduler.h
@@ -31,6 +31,9 @@ class WorkTileScheduler {
  public:
   WorkTileScheduler();
 
+  /* To indicate if there is accelerated RT support. */
+  void set_accelerated_rt(bool state);
+
   /* MAximum path states which are allowed to be used by a single scheduled work tile.
    *
    * Affects the scheduled work size: the work size will be as big as possible, but will not exceed
@@ -54,6 +57,9 @@ class WorkTileScheduler {
  protected:
   void reset_scheduler_state();
 
+  /* Used to indicate if there is accelerated ray tracing. */
+  bool accelerated_rt_ = false;
+
   /* Maximum allowed path states to be used.
    *
    * TODO(sergey): Naming can be improved. The fact that this is a limiting factor based on the
diff --git a/intern/cycles/kernel/device/gpu/work_stealing.h b/intern/cycles/kernel/device/gpu/work_stealing.h
index fab0915c38e..c3083948057 100644
--- a/intern/cycles/kernel/device/gpu/work_stealing.h
+++ b/intern/cycles/kernel/device/gpu/work_stealing.h
@@ -29,17 +29,20 @@ ccl_device_inline void get_work_pixel(ccl_global const KernelWorkTile *tile,
                                       ccl_private uint *y,
                                       ccl_private uint *sample)
 {
-#if 0
-  /* Keep threads for the same sample together. */
-  uint tile_pixels = tile->w * tile->h;
-  uint sample_offset = global_work_index / tile_pixels;
-  uint pixel_offset = global_work_index - sample_offset * tile_pixels;
-#else
-  /* Keeping threads for the same pixel together.
-   * Appears to improve performance by a few % on CUDA and OptiX. */
-  uint sample_offset = global_work_index % tile->num_samples;
-  uint pixel_offset = global_work_index / tile->num_samples;
-#endif
+  uint sample_offset, pixel_offset;
+
+  if (kernel_data.integrator.scrambling_distance < 0.9f) {
+    /* Keep threads for the same sample together. */
+    uint tile_pixels = tile->w * tile->h;
+    sample_offset = global_work_index / tile_pixels;
+    pixel_offset = global_work_index - sample_offset * tile_pixels;
+  }
+  else {
+    /* Keeping threads for the same pixel together.
+     * Appears to improve performance by a few % on CUDA and OptiX. */
+    sample_offset = global_work_index % tile->num_samples;
+    pixel_offset = global_work_index / tile->num_samples;
+  }
 
   uint y_offset = pixel_offset / tile->w;
   uint x_offset = pixel_offset - y_offset * tile->w;
diff --git a/intern/cycles/test/integrator_tile_test.cpp b/intern/cycles/test/integrator_tile_test.cpp
index 8bb0856d6a9..822c34c36bf 100644
--- a/intern/cycles/test/integrator_tile_test.cpp
+++ b/intern/cycles/test/integrator_tile_test.cpp
@@ -24,26 +24,26 @@ CCL_NAMESPACE_BEGIN
 TEST(tile_calculate_best_size, Basic)
 {
   /* Make sure CPU-like case is handled properly. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1, 1.0f), TileSize(1, 1, 1));
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1, 1.0f), TileSize(1, 1, 1));
 
   /* Enough path states to fit an entire image with all samples. */
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 1, 1920 * 1080, 1.0f),
             TileSize(1920, 1080, 1));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(1920, 1080), 100, 1920 * 1080 * 100, 1.0f),
             TileSize(1920, 1080, 100));
 }
 
 TEST(tile_calculate_best_size, Extreme)
 {
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 262144, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 262144, 131072, 1.0f),
             TileSize(1, 1, 512));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 1048576, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 1048576, 131072, 1.0f),
             TileSize(1, 1, 1024));
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 10485760, 131072, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 10485760, 131072, 1.0f),
             TileSize(1, 1, 4096));
 
-  EXPECT_EQ(tile_calculate_best_size(make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
+  EXPECT_EQ(tile_calculate_best_size(false, make_int2(32, 32), 8192 * 8192 * 2, 1024, 1.0f),
             TileSize(1, 1, 1024));
 }