[Bf-blender-cvs] [5025757b12e] cycles-x: Cycles X: remove unused megakernel for GPU rendering

Wed May 19 19:44:18 CEST 2021

Commit: 5025757b12e398034d280eb7e049b1aaa22e6ca4
Author: Brecht Van Lommel
Date:   Tue May 4 20:06:38 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB5025757b12e398034d280eb7e049b1aaa22e6ca4

Cycles X: remove unused megakernel for GPU rendering

This reduces OptiX runtime compilation time to less than a second here.

Differential Revision: https://developer.blender.org/D11313

===================================================================

M	intern/cycles/device/cuda/kernel.cpp
M	intern/cycles/device/optix/device_impl.cpp
M	intern/cycles/device/optix/device_impl.h
M	intern/cycles/device/optix/queue.cpp
M	intern/cycles/integrator/path_trace_work_gpu.cpp
M	intern/cycles/kernel/device/cuda/kernel.cu
M	intern/cycles/kernel/device/optix/kernel.cu

===================================================================

diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
index 793c9efe72a..d31b903f969 100644
--- a/intern/cycles/device/cuda/kernel.cpp
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -28,6 +28,11 @@ void CUDADeviceKernels::load(CUDADevice *device)
   for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
     CUDADeviceKernel &kernel = kernels_[i];
 
+    /* No megakernel used for GPU. */
+    if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+      continue;
+    }
+
     const std::string function_name = std::string("kernel_cuda_") +
                                       device_kernel_as_string((DeviceKernel)i);
     cuda_device_assert(device,
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 2bb31d8d1d3..5e2c923c573 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -300,10 +300,6 @@ bool OptiXDevice::load_kernels(const DeviceRequestedFeatures &requested_features
   OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
   OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
-  group_descs[PG_RGEN_MEGAKERNEL].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
-  group_descs[PG_RGEN_MEGAKERNEL].raygen.module = optix_module;
-  group_descs[PG_RGEN_MEGAKERNEL].raygen.entryFunctionName =
-      "__raygen__kernel_optix_integrator_megakernel";
   group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
   group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
   group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
@@ -433,54 +429,6 @@ bool OptiXDevice::load_kernels(const DeviceRequestedFeatures &requested_features
   link_options.overrideUsesMotionBlur = motion_blur;
 #  endif
 
-  { /* Create megakernel pipeline. */
-    vector<OptixProgramGroup> pipeline_groups;
-    pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
-    pipeline_groups.push_back(groups[PG_RGEN_MEGAKERNEL]);
-    pipeline_groups.push_back(groups[PG_MISS]);
-    pipeline_groups.push_back(groups[PG_HITD]);
-    pipeline_groups.push_back(groups[PG_HITS]);
-    pipeline_groups.push_back(groups[PG_HITL]);
-#  if OPTIX_ABI_VERSION >= 36
-    if (motion_blur) {
-      pipeline_groups.push_back(groups[PG_HITD_MOTION]);
-      pipeline_groups.push_back(groups[PG_HITS_MOTION]);
-    }
-#  endif
-    if (requested_features.use_shader_raytrace) {
-      pipeline_groups.push_back(groups[PG_CALL + 0]);
-      pipeline_groups.push_back(groups[PG_CALL + 1]);
-      pipeline_groups.push_back(groups[PG_CALL + 2]);
-    }
-
-    optix_assert(optixPipelineCreate(context,
-                                     &pipeline_options,
-                                     &link_options,
-                                     pipeline_groups.data(),
-                                     pipeline_groups.size(),
-                                     nullptr,
-                                     0,
-                                     &pipelines[PIP_MEGAKERNEL]));
-
-    /* Combine ray generation and trace continuation stack size. */
-    const unsigned int css = stack_size[PG_RGEN_MEGAKERNEL].cssRG +
-                             link_options.maxTraceDepth * trace_css;
-    /* Max direct callable depth is one of the following, so combine accordingly
-     * - __raygen__ -> svm_eval_nodes
-     * - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
-     * - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes */
-    const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
-                             std::max(stack_size[PG_CALL + 1].dssDC,
-                                      stack_size[PG_CALL + 2].dssDC);
-
-    /* Set stack size depending on pipeline options. */
-    optix_assert(optixPipelineSetStackSize(pipelines[PIP_MEGAKERNEL],
-                                           0,
-                                           requested_features.use_shader_raytrace ? dss : 0,
-                                           css,
-                                           motion_blur ? 3 : 2));
-  }
-
   { /* Create intersection-only pipeline. */
     vector<OptixProgramGroup> pipeline_groups;
     pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index a4b75a16354..d3a044582f4 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -30,7 +30,6 @@ struct KernelParamsOptiX;
 
 /* List of OptiX program groups. */
 enum {
-  PG_RGEN_MEGAKERNEL,
   PG_RGEN_INTERSECT_CLOSEST,
   PG_RGEN_INTERSECT_SHADOW,
   PG_RGEN_INTERSECT_SUBSURFACE,
@@ -47,7 +46,7 @@ enum {
 };
 
 /* List of OptiX pipelines. */
-enum { PIP_MEGAKERNEL, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_INTERSECT, NUM_PIPELINES };
 
 /* A single shader binding table entry. */
 struct SbtRecord {
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 1741b958ecc..444b97baf17 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -40,8 +40,7 @@ void OptiXDeviceQueue::init_execution()
 
 static bool is_optix_specific_kernel(DeviceKernel kernel)
 {
-  return (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL ||
-          kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+  return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
           kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
           kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
 }
@@ -73,14 +72,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
                         args[0],  // &d_path_index
                         sizeof(device_ptr),
                         cuda_stream_));
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-    cuda_device_assert(
-        cuda_device_,
-        cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
-                          args[1],  // &d_render_buffer
-                          sizeof(device_ptr),
-                          cuda_stream_));
-  }
 
   cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
 
@@ -88,10 +79,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
   OptixShaderBindingTable sbt_params = {};
 
   switch (kernel) {
-    case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
-      pipeline = optix_device->pipelines[PIP_MEGAKERNEL];
-      sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_MEGAKERNEL * sizeof(SbtRecord);
-      break;
     case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
       pipeline = optix_device->pipelines[PIP_INTERSECT];
       sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -111,6 +98,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+    case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
     case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
     case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index f7db88fe126..615832dd443 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -221,23 +221,6 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
     return false;
   }
 
-#if 0
-  /* Megakernel does not support state split, so disable for the shadow catcher.
-   * It is possible to make it work, but currently we are planning to make the megakernel
-   * obsolete for the GPU rendering, so we don't spend time on making shadow catcher to work
-   * there */
-  if (!has_shadow_catcher()) {
-    const float megakernel_threshold = 0.02f;
-    const bool use_megakernel = queue_->kernel_available(DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) &&
-                                (num_paths < megakernel_threshold * max_num_paths_);
-
-    if (use_megakernel) {
-      enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
-      return true;
-    }
-  }
-#endif
-
   /* Find kernel to execute, with max number of queued paths. */
   int max_num_queued = 0;
   DeviceKernel kernel = DEVICE_KERNEL_NUM;
@@ -282,13 +265,6 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
   IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
   int num_queued = queue_counter->num_queued[kernel];
 
-  if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-    num_queued = 0;
-    for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
-      num_queued += queue_counter->num_queued[i];
-    }
-  }
-
   if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
     /* Compute array of active paths, sorted by shader. */
     work_size = num_queued;
@@ -300,15 +276,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     work_size = num_queued;
     d_path_index = (void *)queued_paths_.device_pointer;
 
-    if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
-      /* Compute array of all active paths for megakernel. */
-      compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY, kernel);
-      queue_->copy_from_device(num_queued_paths_);
-      queue_->synchronize();
-      work_size = num_queued_paths_.data()[0];
-    }
-    else if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
-             kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+    if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+        kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
       /* Compute array of active shadow paths for specific kernel. */
       compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
     }
@@ -336,8 +305,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
     case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
     case DEVICE_KERNEL_INTEGRATOR_SHADE_S

@@ Diff output truncated at 10240 characters. @@