[Bf-blender-cvs] [a6fb710bb4c] cycles-x: Cycles X: Run init_from_camera kernel for all tiles

Sergey Sharybin noreply at git.blender.org
Wed May 19 12:52:27 CEST 2021


Commit: a6fb710bb4c7f9be4eddc5208ec45bea5cf5d58d
Author: Sergey Sharybin
Date:   Wed May 19 11:10:59 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBa6fb710bb4c7f9be4eddc5208ec45bea5cf5d58d

Cycles X: Run init_from_camera kernel for all tiles

Avoids pointer magic which is not necessarily supported by all compute
backends, and ensures there is no extra latency caused by multiple
kernel launches.

Currently this does not bring performance improvements, but it opens
the door to more compute backends and makes it possible to test
different tile slicing and scheduling strategies.

```
                              init_all_tiles                cycles-x
bmw27.blend                   10.3444                       10.326
classroom.blend               16.476                        16.6067
pabellon.blend                9.13914                       9.13556
monster.blend                 11.9673                       11.963
barbershop_interior.blend     12.4566                       12.4414
junkshop.blend                16.4764                       16.491
pvt_flat.blend                17.288                        17.2757
```

Differential Revision: https://developer.blender.org/D11304
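
For illustration only (not part of the commit): a minimal host-side sketch of the new scheduling, using simplified stand-ins for Cycles' KernelWorkTile and device queue. Each tile records its own work_size and path_index_offset, and the per-tile launch size is padded to the largest tile so that a single kernel launch can cover all tiles, as enqueue_work_tiles() now does.

```
#include <algorithm>
#include <cstdio>
#include <vector>

/* Simplified stand-in for Cycles' KernelWorkTile (names are illustrative). */
struct WorkTile {
  int w, h, num_samples;
  int path_index_offset; /* Index of the first path state used by this tile. */
  int work_size;         /* w * h * num_samples, precalculated on the host. */
};

/* Fill in the per-tile parameters and return the padded per-tile launch size,
 * mirroring what is now computed before a single kernel launch of
 * max_tile_work_size * num_tiles threads. */
static int prepare_tiles(std::vector<WorkTile> &tiles)
{
  int path_index_offset = 0;
  int max_tile_work_size = 0;

  for (WorkTile &tile : tiles) {
    tile.work_size = tile.w * tile.h * tile.num_samples;
    tile.path_index_offset = path_index_offset;

    path_index_offset += tile.work_size;
    max_tile_work_size = std::max(max_tile_work_size, tile.work_size);
  }

  return max_tile_work_size;
}

int main()
{
  /* Two tiles of different size: 6 and 4 items of work. */
  std::vector<WorkTile> tiles = {{3, 2, 1, 0, 0}, {2, 2, 1, 0, 0}};
  const int max_tile_work_size = prepare_tiles(tiles);

  printf("padded launch size: %d threads\n", max_tile_work_size * int(tiles.size()));
  for (const WorkTile &tile : tiles) {
    printf("tile: work_size=%d path_index_offset=%d\n", tile.work_size, tile.path_index_offset);
  }
  return 0;
}
```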

===================================================================

M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/kernel_types.h

===================================================================

diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index dafd4ec7314..889079ba98b 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -486,18 +486,26 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
     work_tiles_.alloc(num_work_tiles);
   }
 
+  int path_index_offset = 0;
+  int max_tile_work_size = 0;
   for (int i = 0; i < num_work_tiles; i++) {
     KernelWorkTile &work_tile = work_tiles_.data()[i];
     work_tile = work_tiles[i];
+
+    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
+
+    work_tile.path_index_offset = path_index_offset;
+    work_tile.work_size = tile_work_size;
+
+    path_index_offset += tile_work_size;
+
+    max_tile_work_size = max(max_tile_work_size, tile_work_size);
   }
 
   queue_->copy_to_device(work_tiles_);
 
-  /* TODO: consider launching a single kernel with an array of work tiles.
-   * Mapping global index to the right tile with different sized tiles
-   * is not trivial so not done for now. */
-  void *d_work_tile = (void *)work_tiles_.device_pointer;
-  void *d_path_index = (void *)NULL;
+  void *d_work_tiles = (void *)work_tiles_.device_pointer;
+  void *d_path_index = (void *)nullptr;
   void *d_render_buffer = (void *)render_buffers_->buffer.device_pointer;
 
   if (max_active_path_index_ != 0) {
@@ -506,33 +514,14 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
     d_path_index = (void *)queued_paths_.device_pointer;
   }
 
-  int num_paths = 0;
+  /* Launch kernel. */
+  void *args[] = {&d_path_index,
+                  &d_work_tiles,
+                  const_cast<int *>(&num_work_tiles),
+                  &d_render_buffer,
+                  const_cast<int *>(&max_tile_work_size)};
 
-  for (int i = 0; i < num_work_tiles; i++) {
-    KernelWorkTile &work_tile = work_tiles_.data()[i];
-
-    /* Compute kernel launch parameters. */
-    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
-
-    /* Launch kernel. */
-    void *args[] = {&d_path_index,
-                    &d_work_tile,
-                    &d_render_buffer,
-                    const_cast<int *>(&tile_work_size),
-                    &num_paths};
-
-    queue_->enqueue(kernel, tile_work_size, args);
-
-    /* Offset work tile and path index pointers for next tile. */
-    num_paths += tile_work_size;
-    DCHECK_LE(num_paths, get_max_num_camera_paths());
-
-    /* TODO: this pointer manipulation won't work for OpenCL. */
-    d_work_tile = (void *)(((KernelWorkTile *)d_work_tile) + 1);
-    if (d_path_index) {
-      d_path_index = (void *)(((int *)d_path_index) + tile_work_size);
-    }
-  }
+  queue_->enqueue(kernel, max_tile_work_size * num_work_tiles, args);
 
   /* TODO: this could be computed more accurately using on the last entry
    * in the queued_paths array passed to the kernel? */
@@ -541,8 +530,9 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
    *
    * TODO: What is more accurate approach here? What if the shadow catcher is hit after some
    * transparent bounce? Do we need to calculate this somewhere else as well? */
-  max_active_path_index_ = min(
-      max_active_path_index_ + num_paths + get_shadow_catcher_state_offset(), max_num_paths_);
+  max_active_path_index_ = min(max_active_path_index_ + path_index_offset +
+                                   get_shadow_catcher_state_offset(),
+                               max_num_paths_);
 }
 
 int PathTraceWorkGPU::get_num_active_paths()
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index e3ed5ea35f7..ae2691e1267 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -94,22 +94,34 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
 extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
                                               CUDA_KERNEL_MAX_REGISTERS)
     kernel_cuda_integrator_init_from_camera(const int *path_index_array,
-                                            KernelWorkTile *tile,
+                                            KernelWorkTile *tiles,
+                                            const int num_tiles,
                                             float *render_buffer,
-                                            const int tile_work_size,
-                                            const int path_index_offset)
+                                            const int max_tile_work_size)
 {
-  const int global_index = ccl_global_id(0);
-  const int work_index = global_index;
-  bool thread_is_active = work_index < tile_work_size;
-  if (thread_is_active) {
-    const int path_index = (path_index_array) ? path_index_array[global_index] :
-                                                path_index_offset + global_index;
+  const int work_index = ccl_global_id(0);
 
-    uint x, y, sample;
-    get_work_pixel(tile, work_index, &x, &y, &sample);
-    integrator_init_from_camera(NULL, path_index, tile, render_buffer, x, y, sample);
+  if (work_index >= max_tile_work_size * num_tiles) {
+    return;
+  }
+
+  const int tile_index = work_index / max_tile_work_size;
+  const int tile_work_index = work_index - tile_index * max_tile_work_size;
+
+  const KernelWorkTile *tile = &tiles[tile_index];
+
+  if (tile_work_index >= tile->work_size) {
+    return;
   }
+
+  const int path_index = (path_index_array) ?
+                             path_index_array[tile->path_index_offset + tile_work_index] :
+                             tile->path_index_offset + tile_work_index;
+
+  uint x, y, sample;
+  get_work_pixel(tile, tile_work_index, &x, &y, &sample);
+
+  integrator_init_from_camera(nullptr, path_index, tile, render_buffer, x, y, sample);
 }
 
 extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
diff --git a/intern/cycles/kernel/kernel_types.h b/intern/cycles/kernel/kernel_types.h
index 403a2d0bf2e..f72db3704e3 100644
--- a/intern/cycles/kernel/kernel_types.h
+++ b/intern/cycles/kernel/kernel_types.h
@@ -1577,6 +1577,10 @@ typedef struct KernelWorkTile {
 
   int offset;
   uint stride;
+
+  /* Precalculated parameters used by init_from_camera kernel on GPU. */
+  int path_index_offset;
+  int work_size;
 } KernelWorkTile;
 
 /* Shader Evaluation.
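
To make the kernel-side mapping concrete, here is a small standalone sketch (again illustrative, not the actual device code) of how a flat work index is decomposed into a tile index and a tile-local index using the padded per-tile size. Threads that fall into a tile's padding simply do no work, matching the early return in the new CUDA kernel.

```
#include <cstdio>
#include <vector>

struct WorkTile {
  int path_index_offset;
  int work_size;
};

/* Decompose a flat work index into a tile and a tile-local index, the same
 * way the init_from_camera kernel does with the padded launch size.
 * Returns false for threads that land in a tile's padding. */
static bool decompose_work_index(const std::vector<WorkTile> &tiles,
                                 int max_tile_work_size,
                                 int work_index,
                                 int *tile_index,
                                 int *tile_work_index)
{
  *tile_index = work_index / max_tile_work_size;
  *tile_work_index = work_index - *tile_index * max_tile_work_size;

  /* Threads past the tile's real work size are padding and do no work. */
  return *tile_work_index < tiles[*tile_index].work_size;
}

int main()
{
  /* Two tiles of different size: the padded launch size is 6 (the larger tile). */
  const std::vector<WorkTile> tiles = {{0, 6}, {6, 4}};
  const int max_tile_work_size = 6;
  const int num_tiles = int(tiles.size());

  for (int work_index = 0; work_index < max_tile_work_size * num_tiles; work_index++) {
    int tile_index, tile_work_index;
    if (!decompose_work_index(tiles, max_tile_work_size, work_index, &tile_index, &tile_work_index)) {
      continue; /* Padding thread: skipped, like the early return in the kernel. */
    }
    const int path_index = tiles[tile_index].path_index_offset + tile_work_index;
    printf("work %2d -> tile %d, local %d, path %d\n",
           work_index, tile_index, tile_work_index, path_index);
  }
  return 0;
}
```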
