[Bf-blender-cvs] [b72c1e93247] cycles-x: Cycles X: Support GPUDisplay with tiled rendering

Sergey Sharybin noreply at git.blender.org
Thu Sep 2 12:36:35 CEST 2021


Commit: b72c1e93247fe5ebe4be3653bb2d8028b5467c9e
Author: Sergey Sharybin
Date:   Wed Sep 1 14:39:00 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBb72c1e93247fe5ebe4be3653bb2d8028b5467c9e

Cycles X: Support GPUDisplay with tiled rendering

Allocate GPUDisplay texture for the full frame size, and fill pixels
tile-by-tile as they come.

Will allow using GPUDisplay for final render display from D12039
together with tiled rendering support from D12309.

Tested by applying on top of the D12309 and forcing tiles in the
viewport.

Differential Revision: https://developer.blender.org/D12370

===================================================================

M	intern/cycles/blender/blender_gpu_display.cpp
M	intern/cycles/integrator/pass_accessor.h
M	intern/cycles/integrator/pass_accessor_cpu.cpp
M	intern/cycles/integrator/pass_accessor_gpu.cpp
M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/path_trace.h
M	intern/cycles/integrator/path_trace_work.cpp
M	intern/cycles/integrator/path_trace_work.h
M	intern/cycles/integrator/path_trace_work_cpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/kernel/device/gpu/kernel.h
M	intern/cycles/render/gpu_display.cpp
M	intern/cycles/render/gpu_display.h
M	intern/cycles/render/session.cpp

===================================================================

diff --git a/intern/cycles/blender/blender_gpu_display.cpp b/intern/cycles/blender/blender_gpu_display.cpp
index 552866601c6..903fb0f0db2 100644
--- a/intern/cycles/blender/blender_gpu_display.cpp
+++ b/intern/cycles/blender/blender_gpu_display.cpp
@@ -321,8 +321,8 @@ bool BlenderGPUDisplay::do_update_begin(int texture_width, int texture_height)
    * too much data to GPU when resolution divider is not 1. */
  /* TODO(sergey): Investigate whether keeping the PBO exact size of the texture makes non-interop
   * mode faster. */
-  const int buffer_width = params_.size.x;
-  const int buffer_height = params_.size.y;
+  const int buffer_width = params_.full_size.x;
+  const int buffer_height = params_.full_size.y;
   if (texture_.buffer_width != buffer_width || texture_.buffer_height != buffer_height) {
     const size_t size_in_bytes = sizeof(half4) * buffer_width * buffer_height;
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER, texture_.gl_pbo_id_);
@@ -374,7 +374,7 @@ void BlenderGPUDisplay::do_copy_pixels_to_texture(
   }
   else {
     const half4 *rgba_row = rgba_pixels;
-    half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width;
+    half4 *mapped_rgba_row = mapped_rgba_pixels + texture_y * texture_.width + texture_x;
     for (int y = 0; y < pixels_height;
          ++y, rgba_row += pixels_width, mapped_rgba_row += texture_.width) {
       memcpy(mapped_rgba_row, rgba_row, sizeof(half4) * pixels_width);
diff --git a/intern/cycles/integrator/pass_accessor.h b/intern/cycles/integrator/pass_accessor.h
index bc4768f682e..ce5b2983c09 100644
--- a/intern/cycles/integrator/pass_accessor.h
+++ b/intern/cycles/integrator/pass_accessor.h
@@ -86,8 +86,15 @@ class PassAccessor {
     int offset = 0;
 
     /* Number of floats per pixel. When zero is the same as `num_components`.
-     * NOTE: Is ignored for half4 destination. */
+     *
+     * NOTE: Is ignored for half4 destination, as the half4 pixels are always 4-component
+     * half-floats. */
     int pixel_stride = 0;
+
+    /* Row stride in pixel elements:
+     *  - For the float destination stride is a number of floats per row.
+     *  - For the half4 destination stride is a number of half4 per row. */
+    int stride = 0;
   };
 
   class Source {
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 83e6413b302..3c6691f6d43 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -97,13 +97,15 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_float(
     const Destination &destination,
     const Processor &processor) const
 {
+  DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
   const float *buffer_data = render_buffers->buffer.data();
   const int pixel_stride = destination.pixel_stride ? destination.pixel_stride :
                                                       destination.num_components;
 
-  tbb::parallel_for(0, buffer_params.height, [&](int y) {
-    int64_t pixel_index = int64_t(y) * buffer_params.width;
-    for (int x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+  tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
+    int64_t pixel_index = y * buffer_params.width;
+    for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
       const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
       const float *buffer = buffer_data + input_pixel_offset;
       float *pixel = destination.pixels + (pixel_index + destination.offset) * pixel_stride;
@@ -123,9 +125,14 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
 {
   const float *buffer_data = render_buffers->buffer.data();
 
-  tbb::parallel_for(0, buffer_params.height, [&](int y) {
+  half4 *dst_start = destination.pixels_half_rgba + destination.offset;
+  const int destination_stride = destination.stride != 0 ? destination.stride :
+                                                           buffer_params.width;
+
+  tbb::parallel_for(0, buffer_params.height, [&](int64_t y) {
     int64_t pixel_index = y * buffer_params.width;
-    for (int x = 0; x < buffer_params.width; ++x, ++pixel_index) {
+    half4 *dst_row_start = dst_start + y * destination_stride;
+    for (int64_t x = 0; x < buffer_params.width; ++x, ++pixel_index) {
       const int64_t input_pixel_offset = pixel_index * buffer_params.pass_stride;
       const float *buffer = buffer_data + input_pixel_offset;
 
@@ -134,7 +141,7 @@ inline void PassAccessorCPU::run_get_pass_kernel_processor_half_rgba(
 
       film_apply_pass_pixel_overlays_rgba(kfilm_convert, buffer, pixel);
 
-      half4 *pixel_half_rgba = destination.pixels_half_rgba + pixel_index + destination.offset;
+      half4 *pixel_half_rgba = dst_row_start + x;
       float4_store_half(&pixel_half_rgba->x, make_float4(pixel[0], pixel[1], pixel[2], pixel[3]));
     }
   });
diff --git a/intern/cycles/integrator/pass_accessor_gpu.cpp b/intern/cycles/integrator/pass_accessor_gpu.cpp
index 0100f8743e8..eb80ba99655 100644
--- a/intern/cycles/integrator/pass_accessor_gpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_gpu.cpp
@@ -45,14 +45,21 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
 
   const int work_size = buffer_params.width * buffer_params.height;
 
+  const int destination_stride = destination.stride != 0 ? destination.stride :
+                                                           buffer_params.width;
+
   if (destination.d_pixels) {
+    DCHECK_EQ(destination.stride, 0) << "Custom stride for float destination is not implemented.";
+
     void *args[] = {const_cast<KernelFilmConvert *>(&kfilm_convert),
                     const_cast<device_ptr *>(&destination.d_pixels),
                     const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
                     const_cast<int *>(&work_size),
+                    const_cast<int *>(&buffer_params.width),
                     const_cast<int *>(&buffer_params.offset),
                     const_cast<int *>(&buffer_params.stride),
-                    const_cast<int *>(&destination.offset)};
+                    const_cast<int *>(&destination.offset),
+                    const_cast<int *>(&destination_stride)};
 
     queue_->enqueue(kernel, work_size, args);
   }
@@ -63,9 +70,11 @@ void PassAccessorGPU::run_film_convert_kernels(DeviceKernel kernel,
                     const_cast<device_ptr *>(&destination.d_pixels_half_rgba),
                     const_cast<device_ptr *>(&render_buffers->buffer.device_pointer),
                     const_cast<int *>(&work_size),
+                    const_cast<int *>(&buffer_params.width),
                     const_cast<int *>(&buffer_params.offset),
                     const_cast<int *>(&buffer_params.stride),
-                    const_cast<int *>(&destination.offset)};
+                    const_cast<int *>(&destination.offset),
+                    const_cast<int *>(&destination_stride)};
 
     queue_->enqueue(kernel_half_float, work_size, args);
   }
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 689ecc68dab..9db7f0d917a 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -112,15 +112,20 @@ bool PathTrace::ready_to_reset()
   return false;
 }
 
-void PathTrace::reset(const BufferParams &big_tile_params)
+void PathTrace::reset(const BufferParams &full_params, const BufferParams &big_tile_params)
 {
   if (big_tile_params_.modified(big_tile_params)) {
     big_tile_params_ = big_tile_params;
     render_state_.need_reset_params = true;
   }
 
+  full_params_ = full_params;
+
+  /* NOTE: GPU display checks for buffer modification and avoids unnecessary re-allocation.
+   * It is required to inform about reset whenever it happens, so that the redraw state tracking is
+   * properly updated. */
   if (gpu_display_) {
-    gpu_display_->reset(big_tile_params);
+    gpu_display_->reset(full_params);
   }
 
   render_state_.has_denoised_result = false;
@@ -276,6 +281,7 @@ void PathTrace::update_effective_work_buffer_params(const RenderWork &render_wor
 {
   const int resolution_divider = render_work.resolution_divider;
 
+  const BufferParams scaled_full_params = scale_buffer_params(full_params_, resolution_divider);
   const BufferParams scaled_big_tile_params = scale_buffer_params(big_tile_params_,
                                                                   resolution_divider);
 
@@ -284,7 +290,7 @@ void PathTrace::update_effective_work_buffer_params(const RenderWork &render_wor
                                scaled_big_tile_params,
                                [&](PathTraceWork *path_trace_work, const BufferParams params) {
                                  path_trace_work->set_effective_buffer_params(
-                                     scaled_big_tile_params, params);
+                                     scaled_full_params, scaled_big_tile_params, params);
                                });
 
   render_state_.effective_big_tile_params = scaled_big_tile_params;
@@ -533,19 +539,19 @@ void PathTrace::update_display(const RenderWork &render_work)
     return;
   }
 
-  VLOG(3) << "Perform copy to GPUDisplay work.";
-
-  const double start_time = time_dt();
-
-  const int width = render_state_.effective_big_tile_params.width;
-  const int height = render_state_.effective_big_tile_params.height;
-  if (width == 0 || height == 0) {
+  if (full_params_.width == 0 || full_params_.height == 0) {
+    VLOG(3) << "Skipping GPUDisplay update due to 0 size of the render buffer.";
     return;
   }
 
-  const int num_samples = get_num_samples_in_buffer();
+  VLOG(3) << "Perform copy to GPUDisplay work.";
+
+  const double start_time = time_dt();
 
-  if (!gpu_display_->update_begin(width, height)) {
+  const int resolution_divider = render_work.resolution_divider;
+  const int texture_width = max(1, full_params_.width / resolution_divider);
+  const int texture_height = max(1, full_params_.height / resolution_divider);
+  if (!gpu_display_->update_begin(texture_width, texture_height)) {
     LOG(ERROR) <

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list