[Bf-blender-cvs] [6eeb32706aa] master: Cycles: support OpenImageDenoise in final renders

Fri Jul 10 17:15:05 CEST 2020

Commit: 6eeb32706aa28bd4d0f3c26f6a5965facd6c0d62
Author: Brecht Van Lommel
Date:   Thu Jul 9 12:20:07 2020 +0200
Branches: master
https://developer.blender.org/rB6eeb32706aa28bd4d0f3c26f6a5965facd6c0d62

Cycles: support OpenImageDenoise in final renders

Performance is not great currently due to the API not seeming to support
efficient denoising of multiple tiles at the same time. So in many cases
only one or a few threads will actually be denoising at the same time.

In renders with many samples this is not a big problem, but for faster
renders it's a significant overhead.

We should try to optimize this still, possibly by batching denoising of
a bigger neighborhood of multiple tiles at once.

===================================================================

M	intern/cycles/blender/addon/properties.py
M	intern/cycles/device/device_cpu.cpp

===================================================================

diff --git a/intern/cycles/blender/addon/properties.py b/intern/cycles/blender/addon/properties.py
index e24bf9b7ee5..6d28c14e12a 100644
--- a/intern/cycles/blender/addon/properties.py
+++ b/intern/cycles/blender/addon/properties.py
@@ -209,6 +209,7 @@ def enum_preview_denoiser(self, context):
 def enum_denoiser(self, context):
     items = [('NLM', "NLM", "Cycles native non-local means denoiser, running on any compute device", 1)]
     items += enum_optix_denoiser(self, context)
+    items += enum_openimagedenoise_denoiser(self, context)
     return items
 
 enum_denoising_optix_input_passes = (
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 2e4761562a5..878301e8242 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -181,6 +181,7 @@ class CPUDevice : public Device {
 #ifdef WITH_OPENIMAGEDENOISE
   oidn::DeviceRef oidn_device;
   oidn::FilterRef oidn_filter;
+  thread_spin_lock oidn_task_lock;
 #endif
 
   bool use_split_kernel;
@@ -948,12 +949,24 @@ class CPUDevice : public Device {
     }
   }
 
-  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  void denoise_openimagedenoise_buffer(DeviceTask &task,
+                                       float *buffer,
+                                       size_t offset,
+                                       size_t stride,
+                                       size_t x,
+                                       size_t y,
+                                       size_t w,
+                                       size_t h)
   {
 #ifdef WITH_OPENIMAGEDENOISE
     assert(openimagedenoise_supported());
 
-    /* Only one at a time, since OpenImageDenoise itself is multithreaded. */
+    /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
+     * buffers, and for tiled rendering because creating multiple devices and filters
+     * is slow and memory hungry as well.
+     *
+     * TODO: optimize tiled rendering case, by batching together denoising of many
+     * tiles somehow? */
     static thread_mutex mutex;
     thread_scoped_lock lock(mutex);
 
@@ -964,11 +977,10 @@ class CPUDevice : public Device {
     }
     if (!oidn_filter) {
       oidn_filter = oidn_device.newFilter("RT");
+      oidn_filter.set("hdr", true);
+      oidn_filter.set("srgb", false);
     }
 
-    /* Copy pixels from compute device to CPU (no-op for CPU device). */
-    rtile.buffers->buffer.copy_from_device();
-
     /* Set images with appropriate stride for our interleaved pass storage. */
     const struct {
       const char *name;
@@ -981,37 +993,131 @@ class CPUDevice : public Device {
                     0 }};
 
     for (int i = 0; passes[i].name; i++) {
-      const int64_t offset = rtile.offset + rtile.x + rtile.y * rtile.stride;
-      const int64_t buffer_offset = (offset * task.pass_stride + passes[i].offset) * sizeof(float);
+      const int64_t pixel_offset = offset + x + y * stride;
+      const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset) *
+                                    sizeof(float);
       const int64_t pixel_stride = task.pass_stride * sizeof(float);
-      const int64_t row_stride = rtile.stride * pixel_stride;
+      const int64_t row_stride = stride * pixel_stride;
 
       oidn_filter.setImage(passes[i].name,
-                           (char *)rtile.buffer + buffer_offset,
+                           (char *)buffer + buffer_offset,
                            oidn::Format::Float3,
-                           rtile.w,
-                           rtile.h,
+                           w,
+                           h,
                            0,
                            pixel_stride,
                            row_stride);
     }
 
     /* Execute filter. */
-    oidn_filter.set("hdr", true);
-    oidn_filter.set("srgb", false);
     oidn_filter.commit();
     oidn_filter.execute();
-
-    /* todo: it may be possible to avoid this copy, but we have to ensure that
-     * when other code copies data from the device it doesn't overwrite the
-     * denoiser buffers. */
-    rtile.buffers->buffer.copy_to_device();
 #else
     (void)task;
-    (void)rtile;
+    (void)buffer;
+    (void)offset;
+    (void)stride;
+    (void)x;
+    (void)y;
+    (void)w;
+    (void)h;
 #endif
   }
 
+  void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
+  {
+    if (task.type == DeviceTask::DENOISE_BUFFER) {
+      /* Copy pixels from compute device to CPU (no-op for CPU device). */
+      rtile.buffers->buffer.copy_from_device();
+
+      denoise_openimagedenoise_buffer(task,
+                                      (float *)rtile.buffer,
+                                      rtile.offset,
+                                      rtile.stride,
+                                      rtile.x,
+                                      rtile.y,
+                                      rtile.w,
+                                      rtile.h);
+
+      /* todo: it may be possible to avoid this copy, but we have to ensure that
+       * when other code copies data from the device it doesn't overwrite the
+       * denoiser buffers. */
+      rtile.buffers->buffer.copy_to_device();
+    }
+    else {
+      /* Per-tile denoising. */
+      rtile.sample = rtile.start_sample + rtile.num_samples;
+
+      /* Map neighboring tiles into one buffer for denoising. */
+      RenderTileNeighbors neighbors(rtile);
+      task.map_neighbor_tiles(neighbors, this);
+      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
+      rtile = center_tile;
+
+      /* Calculate size of the tile to denoise (including overlap). The overlap
+       * size was chosen empirically. OpenImageDenoise specifies an overlap size
+       * of 128 but this is significantly bigger than typical tile size. */
+      const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
+      const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
+
+      /* Adjacent tiles are in separate memory regions, copy into single buffer. */
+      array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
+
+      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
+        RenderTile &ntile = neighbors.tiles[i];
+        if (!ntile.buffer) {
+          continue;
+        }
+
+        const int xmin = max(ntile.x, rect.x);
+        const int ymin = max(ntile.y, rect.y);
+        const int xmax = min(ntile.x + ntile.w, rect.z);
+        const int ymax = min(ntile.y + ntile.h, rect.w);
+
+        const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+        const float *tile_buffer = (float *)ntile.buffer + tile_offset * task.pass_stride;
+
+        const size_t merged_stride = rect_size.x;
+        const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+        float *merged_buffer = merged.data() + merged_offset * task.pass_stride;
+
+        for (int y = ymin; y < ymax; y++) {
+          memcpy(merged_buffer, tile_buffer, sizeof(float) * task.pass_stride * (xmax - xmin));
+          tile_buffer += ntile.stride * task.pass_stride;
+          merged_buffer += merged_stride * task.pass_stride;
+        }
+      }
+
+      /* Denoise */
+      denoise_openimagedenoise_buffer(
+          task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y);
+
+      /* Copy back result from merged buffer. */
+      RenderTile &ntile = neighbors.target;
+      if (ntile.buffer) {
+        const int xmin = max(ntile.x, rect.x);
+        const int ymin = max(ntile.y, rect.y);
+        const int xmax = min(ntile.x + ntile.w, rect.z);
+        const int ymax = min(ntile.y + ntile.h, rect.w);
+
+        const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
+        float *tile_buffer = (float *)ntile.buffer + tile_offset * task.pass_stride;
+
+        const size_t merged_stride = rect_size.x;
+        const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
+        const float *merged_buffer = merged.data() + merged_offset * task.pass_stride;
+
+        for (int y = ymin; y < ymax; y++) {
+          memcpy(tile_buffer, merged_buffer, sizeof(float) * task.pass_stride * (xmax - xmin));
+          tile_buffer += ntile.stride * task.pass_stride;
+          merged_buffer += merged_stride * task.pass_stride;
+        }
+      }
+
+      task.unmap_neighbor_tiles(neighbors, this);
+    }
+  }
+
   void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
   {
     ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
@@ -1070,10 +1176,23 @@ class CPUDevice : public Device {
       }
     }
 
+    /* NLM denoiser. */
     DenoisingTask *denoising = NULL;
 
+    /* OpenImageDenoise: we can only denoise with one thread at a time, so only
+     * the thread that wins the task lock acquires denoising tiles. That same
+     * thread is the only one that releases the lock after the tile loop. */
+    uint tile_types = task.tile_types;
+    bool hold_denoise_lock = false;
+    if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
+      hold_denoise_lock = oidn_task_lock.try_lock();
+      if (!hold_denoise_lock) {
+        tile_types &= ~RenderTile::DENOISE;
+      }
+    }
+
     RenderTile tile;
-    while (task.acquire_tile(this, tile, task.tile_types)) {
+    while (task.acquire_tile(this, tile, tile_types)) {
       if (tile.task == RenderTile::PATH_TRACE) {
         if (use_split_kernel) {
           device_only_memory<uchar> void_buffer(this, "void_buffer");
@@ -1108,6 +1227,10 @@ class CPUDevice : public Device {
       }
     }
 
+    if (hold_denoise_lock) {
+      oidn_task_lock.unlock();
+    }
+
     profiler.remove_state(&kg->profiler);
 
     thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);