[Bf-blender-cvs] [5025757b12e] cycles-x: Cycles X: remove unused megakernel for GPU rendering
Brecht Van Lommel
noreply at git.blender.org
Wed May 19 19:44:18 CEST 2021
Commit: 5025757b12e398034d280eb7e049b1aaa22e6ca4
Author: Brecht Van Lommel
Date: Tue May 4 20:06:38 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB5025757b12e398034d280eb7e049b1aaa22e6ca4
Cycles X: remove unused megakernel for GPU rendering
This reduces OptiX runtime compilation time to less than a second on my system.
Differential Revision: https://developer.blender.org/D11313
===================================================================
M intern/cycles/device/cuda/kernel.cpp
M intern/cycles/device/optix/device_impl.cpp
M intern/cycles/device/optix/device_impl.h
M intern/cycles/device/optix/queue.cpp
M intern/cycles/integrator/path_trace_work_gpu.cpp
M intern/cycles/kernel/device/cuda/kernel.cu
M intern/cycles/kernel/device/optix/kernel.cu
===================================================================
diff --git a/intern/cycles/device/cuda/kernel.cpp b/intern/cycles/device/cuda/kernel.cpp
index 793c9efe72a..d31b903f969 100644
--- a/intern/cycles/device/cuda/kernel.cpp
+++ b/intern/cycles/device/cuda/kernel.cpp
@@ -28,6 +28,11 @@ void CUDADeviceKernels::load(CUDADevice *device)
for (int i = 0; i < (int)DEVICE_KERNEL_NUM; i++) {
CUDADeviceKernel &kernel = kernels_[i];
+ /* No megakernel used for GPU. */
+ if (i == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
+ continue;
+ }
+
const std::string function_name = std::string("kernel_cuda_") +
device_kernel_as_string((DeviceKernel)i);
cuda_device_assert(device,
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index 2bb31d8d1d3..5e2c923c573 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -300,10 +300,6 @@ bool OptiXDevice::load_kernels(const DeviceRequestedFeatures &requested_features
OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
- group_descs[PG_RGEN_MEGAKERNEL].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
- group_descs[PG_RGEN_MEGAKERNEL].raygen.module = optix_module;
- group_descs[PG_RGEN_MEGAKERNEL].raygen.entryFunctionName =
- "__raygen__kernel_optix_integrator_megakernel";
group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
@@ -433,54 +429,6 @@ bool OptiXDevice::load_kernels(const DeviceRequestedFeatures &requested_features
link_options.overrideUsesMotionBlur = motion_blur;
# endif
- { /* Create megakernel pipeline. */
- vector<OptixProgramGroup> pipeline_groups;
- pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
- pipeline_groups.push_back(groups[PG_RGEN_MEGAKERNEL]);
- pipeline_groups.push_back(groups[PG_MISS]);
- pipeline_groups.push_back(groups[PG_HITD]);
- pipeline_groups.push_back(groups[PG_HITS]);
- pipeline_groups.push_back(groups[PG_HITL]);
-# if OPTIX_ABI_VERSION >= 36
- if (motion_blur) {
- pipeline_groups.push_back(groups[PG_HITD_MOTION]);
- pipeline_groups.push_back(groups[PG_HITS_MOTION]);
- }
-# endif
- if (requested_features.use_shader_raytrace) {
- pipeline_groups.push_back(groups[PG_CALL + 0]);
- pipeline_groups.push_back(groups[PG_CALL + 1]);
- pipeline_groups.push_back(groups[PG_CALL + 2]);
- }
-
- optix_assert(optixPipelineCreate(context,
- &pipeline_options,
- &link_options,
- pipeline_groups.data(),
- pipeline_groups.size(),
- nullptr,
- 0,
- &pipelines[PIP_MEGAKERNEL]));
-
- /* Combine ray generation and trace continuation stack size. */
- const unsigned int css = stack_size[PG_RGEN_MEGAKERNEL].cssRG +
- link_options.maxTraceDepth * trace_css;
- /* Max direct callable depth is one of the following, so combine accordingly
- * - __raygen__ -> svm_eval_nodes
- * - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
- * - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes */
- const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
- std::max(stack_size[PG_CALL + 1].dssDC,
- stack_size[PG_CALL + 2].dssDC);
-
- /* Set stack size depending on pipeline options. */
- optix_assert(optixPipelineSetStackSize(pipelines[PIP_MEGAKERNEL],
- 0,
- requested_features.use_shader_raytrace ? dss : 0,
- css,
- motion_blur ? 3 : 2));
- }
-
{ /* Create intersection-only pipeline. */
vector<OptixProgramGroup> pipeline_groups;
pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index a4b75a16354..d3a044582f4 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -30,7 +30,6 @@ struct KernelParamsOptiX;
/* List of OptiX program groups. */
enum {
- PG_RGEN_MEGAKERNEL,
PG_RGEN_INTERSECT_CLOSEST,
PG_RGEN_INTERSECT_SHADOW,
PG_RGEN_INTERSECT_SUBSURFACE,
@@ -47,7 +46,7 @@ enum {
};
/* List of OptiX pipelines. */
-enum { PIP_MEGAKERNEL, PIP_INTERSECT, NUM_PIPELINES };
+enum { PIP_INTERSECT, NUM_PIPELINES };
/* A single shader binding table entry. */
struct SbtRecord {
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 1741b958ecc..444b97baf17 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -40,8 +40,7 @@ void OptiXDeviceQueue::init_execution()
static bool is_optix_specific_kernel(DeviceKernel kernel)
{
- return (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL ||
- kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
+ return (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE);
}
@@ -73,14 +72,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
args[0], // &d_path_index
sizeof(device_ptr),
cuda_stream_));
- if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- cuda_device_assert(
- cuda_device_,
- cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
- args[1], // &d_render_buffer
- sizeof(device_ptr),
- cuda_stream_));
- }
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
@@ -88,10 +79,6 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
OptixShaderBindingTable sbt_params = {};
switch (kernel) {
- case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
- pipeline = optix_device->pipelines[PIP_MEGAKERNEL];
- sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_MEGAKERNEL * sizeof(SbtRecord);
- break;
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
pipeline = optix_device->pipelines[PIP_INTERSECT];
sbt_params.raygenRecord = sbt_data_ptr + PG_RGEN_INTERSECT_CLOSEST * sizeof(SbtRecord);
@@ -111,6 +98,7 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE:
case DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME:
case DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW:
+ case DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL:
case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index f7db88fe126..615832dd443 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -221,23 +221,6 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return false;
}
-#if 0
- /* Megakernel does not support state split, so disable for the shadow catcher.
- * It is possible to make it work, but currently we are planning to make the megakernel
- * obsolete for the GPU rendering, so we don't spend time on making shadow catcher to work
- * there */
- if (!has_shadow_catcher()) {
- const float megakernel_threshold = 0.02f;
- const bool use_megakernel = queue_->kernel_available(DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) &&
- (num_paths < megakernel_threshold * max_num_paths_);
-
- if (use_megakernel) {
- enqueue_path_iteration(DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
- return true;
- }
- }
-#endif
-
/* Find kernel to execute, with max number of queued paths. */
int max_num_queued = 0;
DeviceKernel kernel = DEVICE_KERNEL_NUM;
@@ -282,13 +265,6 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
IntegratorQueueCounter *queue_counter = integrator_queue_counter_.data();
int num_queued = queue_counter->num_queued[kernel];
- if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- num_queued = 0;
- for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
- num_queued += queue_counter->num_queued[i];
- }
- }
-
if (kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
/* Compute array of active paths, sorted by shader. */
work_size = num_queued;
@@ -300,15 +276,8 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
work_size = num_queued;
d_path_index = (void *)queued_paths_.device_pointer;
- if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- /* Compute array of all active paths for megakernel. */
- compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY, kernel);
- queue_->copy_from_device(num_queued_paths_);
- queue_->synchronize();
- work_size = num_queued_paths_.data()[0];
- }
- else if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
- kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
+ if (kernel == DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW ||
+ kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW) {
/* Compute array of active shadow paths for specific kernel. */
compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY, kernel);
}
@@ -336,8 +305,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
case DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT:
case DEVICE_KERNEL_INTEGRATOR_SHADE_S
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs mailing list