[Bf-blender-cvs] [cb8f94022ab] cycles-x: Cleanup: refactor shader sorting to support it for more kernels later
Brecht Van Lommel
noreply at git.blender.org
Thu May 27 19:15:40 CEST 2021
Commit: cb8f94022ab0142c51b70a34f09d6715fb430505
Author: Brecht Van Lommel
Date: Thu May 27 11:28:35 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBcb8f94022ab0142c51b70a34f09d6715fb430505
Cleanup: refactor shader sorting to support it for more kernels later
===================================================================
M intern/cycles/integrator/path_trace_work_gpu.cpp
M intern/cycles/integrator/path_trace_work_gpu.h
M intern/cycles/kernel/device/cuda/kernel.cu
M intern/cycles/kernel/integrator/integrator_intersect_closest.h
M intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
M intern/cycles/kernel/integrator/integrator_state.h
M intern/cycles/kernel/integrator/integrator_state_flow.h
M intern/cycles/kernel/integrator/integrator_subsurface.h
===================================================================
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index e8958ef1147..eee3ce0fd40 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -126,7 +126,9 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
if (integrator_sort_key_counter_.size() < num_shaders) {
integrator_sort_key_counter_.alloc(num_shaders);
integrator_sort_key_counter_.zero_to_device();
- integrator_state_gpu_.sort_key_counter = (int *)integrator_sort_key_counter_.device_pointer;
+
+ integrator_state_gpu_.sort_key_counter[DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE] =
+ (int *)integrator_sort_key_counter_.device_pointer;
}
}
@@ -213,10 +215,10 @@ DeviceKernel PathTraceWorkGPU::get_most_queued_kernel() const
void PathTraceWorkGPU::enqueue_reset()
{
- const int num_keys = integrator_sort_key_counter_.size();
- void *args[] = {&max_num_paths_, const_cast<int *>(&num_keys)};
- queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max(max_num_paths_, num_keys), args);
+ void *args[] = {&max_num_paths_};
+ queue_->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, max_num_paths_, args);
queue_->zero_to_device(integrator_queue_counter_);
+ queue_->zero_to_device(integrator_sort_key_counter_);
/* Tiles enqueue need to know number of active paths, which is based on this counter. Zero the
* counter on the host side because `zero_to_device()` is not doing it. */
@@ -346,15 +348,17 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
}
}
-void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, int queued_kernel)
+void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
{
- void *d_key_counter = (void *)integrator_sort_key_counter_.device_pointer;
+ int d_queued_kernel = queued_kernel;
+ void *d_counter = integrator_state_gpu_.sort_key_counter[d_queued_kernel];
+ assert(d_counter != nullptr);
/* Compute prefix sum of number of active paths with each shader. */
{
const int work_size = 1;
int num_shaders = integrator_sort_key_counter_.size();
- void *args[] = {&d_key_counter, &num_shaders};
+ void *args[] = {&d_counter, &num_shaders};
queue_->enqueue(DEVICE_KERNEL_PREFIX_SUM, work_size, args);
}
@@ -369,18 +373,23 @@ void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel, int queu
void *args[] = {const_cast<int *>(&work_size),
&d_queued_paths,
&d_num_queued_paths,
- &d_key_counter,
- &queued_kernel};
+ &d_counter,
+ &d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
- /* TODO: ensure this happens as part of queue stream. */
queue_->zero_to_device(num_queued_paths_);
- queue_->zero_to_device(integrator_sort_key_counter_);
+ if (queued_kernel == DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE) {
+ queue_->zero_to_device(integrator_sort_key_counter_);
+ }
+ else {
+ /* TODO */
+ assert(0);
+ }
}
-void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, int queued_kernel)
+void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel)
{
/* Launch kernel to fill the active paths arrays. */
/* TODO: this could be smaller for terminated paths based on amount of work we want
@@ -388,11 +397,12 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, int queued_kern
const int work_size = (kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY) ?
min(max_num_paths_, get_max_num_camera_paths()) :
max_active_path_index_;
+ int d_queued_kernel = queued_kernel;
void *d_queued_paths = (void *)queued_paths_.device_pointer;
void *d_num_queued_paths = (void *)num_queued_paths_.device_pointer;
void *args[] = {
- const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &queued_kernel};
+ const_cast<int *>(&work_size), &d_queued_paths, &d_num_queued_paths, &d_queued_kernel};
queue_->enqueue(kernel, work_size, args);
}
@@ -492,7 +502,7 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
void *d_render_buffer = (void *)render_buffers_->buffer.device_pointer;
if (max_active_path_index_ != 0) {
- compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, 0);
+ compute_queued_paths(DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY, (DeviceKernel)0);
queue_->zero_to_device(num_queued_paths_);
d_path_index = (void *)queued_paths_.device_pointer;
}
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index dd83286094c..4b8f84bee5e 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -67,8 +67,8 @@ class PathTraceWorkGPU : public PathTraceWork {
bool enqueue_path_iteration();
void enqueue_path_iteration(DeviceKernel kernel);
- void compute_queued_paths(DeviceKernel kernel, int queued_kernel);
- void compute_sorted_queued_paths(DeviceKernel kernel, int queued_kernel);
+ void compute_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
+ void compute_sorted_queued_paths(DeviceKernel kernel, DeviceKernel queued_kernel);
int get_num_active_paths();
@@ -114,7 +114,7 @@ class PathTraceWorkGPU : public PathTraceWork {
vector<unique_ptr<device_memory>> integrator_state_soa_;
/* Keep track of number of queued kernels. */
device_vector<IntegratorQueueCounter> integrator_queue_counter_;
- /* Key for shader sorting. */
+ /* Shader sorting. */
device_vector<int> integrator_sort_key_counter_;
/* Temporary buffer to get an array of queued path for a particular kernel. */
diff --git a/intern/cycles/kernel/device/cuda/kernel.cu b/intern/cycles/kernel/device/cuda/kernel.cu
index 950c724aa25..0ce3c1abd71 100644
--- a/intern/cycles/kernel/device/cuda/kernel.cu
+++ b/intern/cycles/kernel/device/cuda/kernel.cu
@@ -76,7 +76,7 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS, CUD
extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
CUDA_KERNEL_MAX_REGISTERS)
- kernel_cuda_integrator_reset(int num_states, int num_keys)
+ kernel_cuda_integrator_reset(int num_states)
{
const int path_index = ccl_global_id(0);
@@ -84,10 +84,6 @@ extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
INTEGRATOR_STATE_WRITE(path, queued_kernel) = 0;
INTEGRATOR_STATE_WRITE(shadow_path, queued_kernel) = 0;
}
-
- if (path_index < num_keys) {
- kernel_integrator_state.sort_key_counter[path_index] = 0;
- }
}
extern "C" __global__ void CUDA_LAUNCH_BOUNDS(CUDA_KERNEL_BLOCK_NUM_THREADS,
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_closest.h b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
index b30012a8f45..ed671a6d595 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_closest.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_closest.h
@@ -28,21 +28,20 @@
CCL_NAMESPACE_BEGIN
-ccl_device_forceinline int integrator_intersect_next_shader(INTEGRATOR_STATE_ARGS,
- const Intersection *ccl_restrict isect)
+ccl_device_forceinline bool integrator_intersect_shader_next_kernel(
+ INTEGRATOR_STATE_ARGS, const Intersection *ccl_restrict isect)
{
/* Find shader from intersection. */
const int shader = intersection_get_shader(kg, isect);
+ const int flags = kernel_tex_fetch(__shaders, shader).flags;
/* Optional AO bounce termination. */
if (path_state_ao_bounce(INTEGRATOR_STATE_PASS)) {
- const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
if (flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_AFTER_TRANSPARENT;
}
else {
- return SHADER_NONE;
+ return false;
}
}
@@ -60,19 +59,21 @@ ccl_device_forceinline int integrator_intersect_next_shader(INTEGRATOR_STATE_ARG
const float terminate = path_state_rng_1D(kg, &rng_state, PRNG_TERMINATE);
if (probability == 0.0f || terminate >= probability) {
- const int flags = kernel_tex_fetch(__shaders, shader).flags;
-
if (flags & (SD_HAS_TRANSPARENT_SHADOW | SD_HAS_EMISSION)) {
/* Mark path to be terminated right after shader evaluation. */
INTEGRATOR_STATE_WRITE(path, flag) |= PATH_RAY_TERMINATE_IMMEDIATE;
}
else {
- return SHADER_NONE;
+ return false;
}
}
}
- return shader;
+ /* Setup next kernel to execute. */
+ const int next_kernel = DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
+ INTEGRATOR_PATH_NEXT_SORTED(INTERSECT_CLOSEST, next_kernel, shader);
+
+ return true;
}
ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
@@ -129,11 +130,7 @@ ccl_device void integrator_intersect_closest(INTEGRATOR_STATE_ARGS)
}
else {
/* Hit a surface, continue with surface kernel unless terminated. */
- const int shader = integrator_intersect_next_shader(INTEGRATOR_STATE_PASS, &isect);
- if (shader != SHADER_NONE) {
- INTEGRATOR_PATH_SET_SORT_KEY(shader);
- INTEGRATOR_PATH_NEXT(INTERSECT_CLOSEST, SHADE_SURFACE);
-
+ if (integrator_intersect_shader_next_kernel(INTEGRATOR_STATE_PASS, &isect)) {
const int object_flags = intersection_get_object_flags(kg, &isect);
kernel_shadow_catcher_split(INTEGRATOR_STATE_PASS, object_flags);
return;
diff --git a/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
index a96f5cc5ec9..59e151ad287 100644
--- a/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
+++ b/intern/cycles/kernel/integrator/integrator_intersect_subsurface.h
@@ -24,7 +24,6 @@ ccl_device void integrator_intersect_subsurface(INTEGRATOR_STATE_ARGS)
{
#ifdef __SUBSURF
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list