[Bf-blender-cvs] [c16d21edb5e] cycles-x: Cycles X: Bring back cryptomatte post-processing

Sergey Sharybin noreply at git.blender.org
Thu Jul 15 17:18:31 CEST 2021


Commit: c16d21edb5e3463a790ff866b67c33e12cffc592
Author: Sergey Sharybin
Date:   Thu Jul 15 16:27:25 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBc16d21edb5e3463a790ff866b67c33e12cffc592

Cycles X: Bring back cryptomatte post-processing

This is the non-accurate mode, used for both CPU and GPU, which runs
as a post-processing pass after all samples have finished. It is
scheduled via the render scheduler, since the scheduler knows when
path tracing has finished.
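
For illustration, a minimal sketch of that one-shot scheduling pattern
follows. The flag and setter names loosely follow the diff below; the
classes themselves are simplified stand-ins, not the actual
RenderScheduler code:

  struct RenderWork {
    struct Cryptomatte {
      bool postprocess = false;
    } cryptomatte;
  };

  class RenderScheduler {
   public:
    void set_need_schedule_cryptomatte(bool need)
    {
      need_schedule_cryptomatte_ = need;
    }

    RenderWork get_render_work()
    {
      RenderWork render_work;
      if (path_trace_finished_ && need_schedule_cryptomatte_ &&
          !postprocess_scheduled_) {
        /* Hand out the cryptomatte post-process exactly once, after the
         * last path tracing sample has been scheduled and finished. */
        postprocess_scheduled_ = true;
        render_work.cryptomatte.postprocess = true;
      }
      return render_work;
    }

   private:
    bool path_trace_finished_ = false;
    bool need_schedule_cryptomatte_ = false;
    bool postprocess_scheduled_ = false;
  };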

Compared to regular Cycles, this makes the cryptomatte pass properly
sorted when adaptive sampling is enabled.
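
The post-processing itself boils down to sorting each pixel's
accumulated (id, weight) pairs and normalizing the weights by the pixel's
sample count, which is what keeps the pass correct when adaptive sampling
gives pixels different sample counts. A minimal sketch of that per-pixel
step, with illustrative names rather than the actual kernel from
kernel_id_passes.h:

  #include <algorithm>

  struct IdWeight {
    float id;
    float weight;
  };

  /* Sort a pixel's cryptomatte (id, weight) pairs by descending coverage
   * weight, then scale the accumulated weights down to averages
   * (scale would be 1 / num_samples for the pixel). */
  void cryptomatte_sort_pixel(IdWeight *pairs, int num_pairs, float scale)
  {
    std::sort(pairs, pairs + num_pairs, [](const IdWeight &a, const IdWeight &b) {
      return a.weight > b.weight;
    });
    for (int i = 0; i < num_pairs; i++) {
      pairs[i].weight *= scale;
    }
  }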

The accurate CPU implementation, which used to be done via the Coverage
class, is not yet hooked back up. This needs to happen either via the
kernel or via the PathTraceWork. The current state of the patch should
make it trivial to bring the accurate implementation back.

This change also fixes missing denoising when using constant time
rendering.

Differential Revision: https://developer.blender.org/D11934

===================================================================

M	intern/cycles/device/cpu/kernel.cpp
M	intern/cycles/device/cpu/kernel.h
M	intern/cycles/device/device_kernel.cpp
M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/path_trace.h
M	intern/cycles/integrator/path_trace_work.h
M	intern/cycles/integrator/path_trace_work_cpu.cpp
M	intern/cycles/integrator/path_trace_work_cpu.h
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/integrator/path_trace_work_gpu.h
M	intern/cycles/integrator/render_scheduler.cpp
M	intern/cycles/integrator/render_scheduler.h
M	intern/cycles/kernel/device/cpu/kernel_arch.h
M	intern/cycles/kernel/device/cpu/kernel_arch_impl.h
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/kernel_id_passes.h
M	intern/cycles/kernel/kernel_types.h

===================================================================

diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
index ed91644646e..0ab58ff8600 100644
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -48,6 +48,8 @@ CPUKernels::CPUKernels()
       REGISTER_KERNEL(adaptive_sampling_convergence_check),
       REGISTER_KERNEL(adaptive_sampling_filter_x),
       REGISTER_KERNEL(adaptive_sampling_filter_y),
+      /* Cryptomatte. */
+      REGISTER_KERNEL(cryptomatte_postprocess),
       /* Bake. */
       REGISTER_KERNEL(bake)
 {
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index f05c1e71d66..ee671251eb6 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -94,6 +94,13 @@ class CPUKernels {
   AdaptiveSamplingFilterXFunction adaptive_sampling_filter_x;
   AdaptiveSamplingFilterYFunction adaptive_sampling_filter_y;
 
+  /* Cryptomatte. */
+
+  using CryptomattePostprocessFunction = CPUKernelFunction<void (*)(
+      const KernelGlobals *kg, ccl_global float *render_buffer, int pixel_index)>;
+
+  CryptomattePostprocessFunction cryptomatte_postprocess;
+
   /* Bake. */
 
   CPUKernelFunction<void (*)(const KernelGlobals *, float *, int, int, int, int, int)> bake;
diff --git a/intern/cycles/device/device_kernel.cpp b/intern/cycles/device/device_kernel.cpp
index ec34d976b75..9689e7a30ca 100644
--- a/intern/cycles/device/device_kernel.cpp
+++ b/intern/cycles/device/device_kernel.cpp
@@ -116,6 +116,10 @@ const char *device_kernel_as_string(DeviceKernel kernel)
     case DEVICE_KERNEL_FILTER_COLOR_POSTPROCESS:
       return "filter_color_postprocess";
 
+    /* Cryptomatte. */
+    case DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS:
+      return "cryptomatte_postprocess";
+
     /* Generic */
     case DEVICE_KERNEL_PREFIX_SUM:
       return "prefix_sum";
diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 439d3eff48f..2b5d67f0b85 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -22,6 +22,7 @@
 #include "integrator/render_scheduler.h"
 #include "render/gpu_display.h"
 #include "render/pass.h"
+#include "render/scene.h"
 #include "util/util_algorithm.h"
 #include "util/util_logging.h"
 #include "util/util_progress.h"
@@ -53,7 +54,7 @@ class TempCPURenderBuffers {
 }  // namespace
 
 PathTrace::PathTrace(Device *device, DeviceScene *device_scene, RenderScheduler &render_scheduler)
-    : device_(device), render_scheduler_(render_scheduler)
+    : device_(device), device_scene_(device_scene), render_scheduler_(render_scheduler)
 {
   DCHECK_NE(device_, nullptr);
 
@@ -153,6 +154,8 @@ void PathTrace::render(const RenderWork &render_work)
 
 void PathTrace::render_pipeline(RenderWork render_work)
 {
+  render_scheduler_.set_need_schedule_cryptomatte(device_scene_->data.film.cryptomatte_depth);
+
   render_init_kernel_execution();
 
   init_render_buffers(render_work);
@@ -169,6 +172,11 @@ void PathTrace::render_pipeline(RenderWork render_work)
     return;
   }
 
+  cryptomatte_postprocess(render_work);
+  if (is_cancel_requested()) {
+    return;
+  }
+
   denoise(render_work);
   if (is_cancel_requested()) {
     return;
@@ -411,6 +419,18 @@ void PathTrace::set_adaptive_sampling(const AdaptiveSampling &adaptive_sampling)
   render_scheduler_.set_adaptive_sampling(adaptive_sampling);
 }
 
+void PathTrace::cryptomatte_postprocess(const RenderWork &render_work)
+{
+  if (!render_work.cryptomatte.postprocess) {
+    return;
+  }
+  VLOG(3) << "Perform cryptomatte work.";
+
+  tbb::parallel_for_each(path_trace_works_, [&](unique_ptr<PathTraceWork> &path_trace_work) {
+    path_trace_work->cryptomatte_postproces();
+  });
+}
+
 void PathTrace::denoise(const RenderWork &render_work)
 {
   if (!render_work.denoise) {
diff --git a/intern/cycles/integrator/path_trace.h b/intern/cycles/integrator/path_trace.h
index b1911b82cb7..8911ca0b698 100644
--- a/intern/cycles/integrator/path_trace.h
+++ b/intern/cycles/integrator/path_trace.h
@@ -184,6 +184,7 @@ class PathTrace {
   void path_trace(RenderWork &render_work);
   void adaptive_sample(RenderWork &render_work);
   void denoise(const RenderWork &render_work);
+  void cryptomatte_postprocess(const RenderWork &render_work);
   void update_display(const RenderWork &render_work);
   void rebalance(const RenderWork &render_work);
 
@@ -207,6 +208,8 @@ class PathTrace {
    * are configured this is a `MultiDevice`. */
   Device *device_ = nullptr;
 
+  DeviceScene *device_scene_;
+
   RenderScheduler &render_scheduler_;
 
   unique_ptr<GPUDisplay> gpu_display_;
diff --git a/intern/cycles/integrator/path_trace_work.h b/intern/cycles/integrator/path_trace_work.h
index c3760d29734..84f0dc182bb 100644
--- a/intern/cycles/integrator/path_trace_work.h
+++ b/intern/cycles/integrator/path_trace_work.h
@@ -124,6 +124,9 @@ class PathTraceWork {
    * Returns number of active pixels (the ones which did not converge yet). */
   virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) = 0;
 
+  /* Run cryptomatte pass post-processing kernels. */
+  virtual void cryptomatte_postproces() = 0;
+
   /* Cheap-ish request to see whether rendering is requested and is to be stopped as soon as
    * possible, without waiting for any samples to be finished. */
   inline bool is_cancel_requested() const
diff --git a/intern/cycles/integrator/path_trace_work_cpu.cpp b/intern/cycles/integrator/path_trace_work_cpu.cpp
index dcddcd3a264..e794a214a1b 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_cpu.cpp
@@ -242,4 +242,26 @@ int PathTraceWorkCPU::adaptive_sampling_converge_filter_count_active(float thres
   return num_active_pixels;
 }
 
+void PathTraceWorkCPU::cryptomatte_postproces()
+{
+  const int width = effective_buffer_params_.width;
+  const int height = effective_buffer_params_.height;
+
+  float *render_buffer = buffers_->buffer.data();
+
+  tbb::task_arena local_arena = local_tbb_arena_create(device_);
+
+  /* Post-process each row of pixels in a single `parallel_for` task, to reduce threading
+   * overhead. */
+  local_arena.execute([&]() {
+    tbb::parallel_for(0, height, [&](int y) {
+      CPUKernelThreadGlobals *kernel_globals = &kernel_thread_globals_[0];
+      int pixel_index = y * width;
+
+      for (int x = 0; x < width; ++x, ++pixel_index) {
+        kernels_.cryptomatte_postprocess(kernel_globals, render_buffer, pixel_index);
+      }
+    });
+  });
+}
+
 CCL_NAMESPACE_END
diff --git a/intern/cycles/integrator/path_trace_work_cpu.h b/intern/cycles/integrator/path_trace_work_cpu.h
index 80d43c1c082..34027150661 100644
--- a/intern/cycles/integrator/path_trace_work_cpu.h
+++ b/intern/cycles/integrator/path_trace_work_cpu.h
@@ -54,6 +54,7 @@ class PathTraceWorkCPU : public PathTraceWork {
   virtual bool zero_render_buffers() override;
 
   virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+  virtual void cryptomatte_postproces() override;
 
  protected:
   /* Core path tracing routine. Renders given work time on the given queue. */
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 6e63a7a3aa9..a37b678cd91 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -839,6 +839,18 @@ void PathTraceWorkGPU::enqueue_adaptive_sampling_filter_y()
   queue_->enqueue(DEVICE_KERNEL_ADAPTIVE_SAMPLING_CONVERGENCE_FILTER_Y, work_size, args);
 }
 
+void PathTraceWorkGPU::cryptomatte_postproces()
+{
+  const int work_size = effective_buffer_params_.width * effective_buffer_params_.height;
+
+  void *args[] = {&buffers_->buffer.device_pointer,
+                  const_cast<int *>(&work_size),
+                  &effective_buffer_params_.offset,
+                  &effective_buffer_params_.stride};
+
+  queue_->enqueue(DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS, work_size, args);
+}
+
 bool PathTraceWorkGPU::copy_render_buffers_from_device()
 {
   queue_->copy_from_device(buffers_->buffer);
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index b94b4426a9f..f339be59799 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -52,6 +52,7 @@ class PathTraceWorkGPU : public PathTraceWork {
   virtual bool zero_render_buffers() override;
 
   virtual int adaptive_sampling_converge_filter_count_active(float threshold, bool reset) override;
+  virtual void cryptomatte_postproces() override;
 
  protected:
   void alloc_integrator_soa();
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 1dee0a1d936..aae1c81ff72 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -35,6 +35,11 @@ RenderScheduler::RenderScheduler(bool headless, bool background, int pixel_size)
   use_progressive_noise_floor_ = !background_;
 }
 
+void RenderScheduler::set_need_schedule_cryptomatte(bool need_schedule_cryptomatte)
+{
+  need_schedule_cryptomatte_ = need_schedule_cryptomatte;
+}
+
 void RenderScheduler::set_need_schedule_rebalance(bool need_schedule_rebalance)
 {
   need_schedule_rebalance_works_ = need_schedule_rebalance;
@@ -130,6 +135,9 @@ void RenderScheduler::reset(const BufferParams &buffer_params, int num_samples)
   /* NOTE: The adaptive sampling settings might not be available here yet. */
   state_.adaptive_sampling_threshold = 0.4f;
 
+  state_.last_work_was_denoised = false;
+  state_.postprocess_work_scheduled = false;
+
   state_.path_trace_finished = false;
 
   state_.start_render_time = 0.0;
@@ -216,10 +224,13 @@ RenderWork RenderScheduler::get_render_work()
   const double time_now = time_dt();
 
   if (done()) {
-    if (state_.end_render_time == 0.0) {
-      state_.end_render_time = time_now;
+    RenderWork render_work;
+    if (!set_postprocess_render_work(&render_work)) {
+      if (state_.end_render_time == 0.0) {
+        state_.end_render_time = time_now;
+      }
     }
-    ret

@@ Diff output truncated at 10240 characters. @@


