[Bf-blender-cvs] [be785ef4f78] cycles-x: Cycles X: refactoring of kernel globals
Brecht Van Lommel
noreply at git.blender.org
Fri May 7 15:12:32 CEST 2021
Commit: be785ef4f78e3dd7d51963f249177b1c2a8dc5d6
Author: Brecht Van Lommel
Date: Thu May 6 20:06:57 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBbe785ef4f78e3dd7d51963f249177b1c2a8dc5d6
Cycles X: refactoring of kernel globals
* Declare kernel globals and associated macros in one place
* Add IntegratorStateGPU for CPU host to access
* Reduce code duplication between CUDA and OptiX
* Make shader sort key part of integrator state template
* Remove unused CPU kernel globals
* Prepare for copying/moving states
* Rename integrator_path_state.h to integrator_state_flow.h, to avoid
confusion with kernel_path_state.h.
Some of these changes were implemented by Sergey for D11172.
Differential Revision: https://developer.blender.org/D11185
===================================================================
M intern/cycles/device/cpu/kernel_thread_globals.cpp
M intern/cycles/device/optix/device_impl.cpp
M intern/cycles/device/optix/device_impl.h
M intern/cycles/device/optix/queue.cpp
M intern/cycles/integrator/path_trace_work_gpu.cpp
M intern/cycles/integrator/path_trace_work_gpu.h
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/device/cpu/compat.h
M intern/cycles/kernel/device/cpu/globals.h
M intern/cycles/kernel/device/cpu/kernel_arch_impl.h
M intern/cycles/kernel/device/cuda/compat.h
M intern/cycles/kernel/device/cuda/globals.h
M intern/cycles/kernel/device/cuda/kernel.cu
M intern/cycles/kernel/device/cuda/parallel_sorted_index.h
M intern/cycles/kernel/device/optix/compat.h
M intern/cycles/kernel/device/optix/globals.h
M intern/cycles/kernel/device/optix/kernel.cu
D intern/cycles/kernel/integrator/integrator_path_state.h
M intern/cycles/kernel/integrator/integrator_state.h
A intern/cycles/kernel/integrator/integrator_state_flow.h
M intern/cycles/kernel/integrator/integrator_state_template.h
===================================================================
diff --git a/intern/cycles/device/cpu/kernel_thread_globals.cpp b/intern/cycles/device/cpu/kernel_thread_globals.cpp
index eff15af5ff1..f0089e34a7a 100644
--- a/intern/cycles/device/cpu/kernel_thread_globals.cpp
+++ b/intern/cycles/device/cpu/kernel_thread_globals.cpp
@@ -23,22 +23,6 @@
CCL_NAMESPACE_BEGIN
-/* TODO(sergey): Consider making more available function. Maybe `util_memory.h`? */
-static void safe_free(void *mem)
-{
- if (mem == nullptr) {
- return;
- }
- free(mem);
-}
-
-/* Get number of elements in a bound array. */
-/* TODO(sergey): Make this function more re-usable. */
-template<class T, int N> constexpr inline int ARRAY_SIZE(T (&/*array*/)[N])
-{
- return N;
-}
-
CPUKernelThreadGlobals::CPUKernelThreadGlobals()
{
reset_runtime_memory();
@@ -50,7 +34,6 @@ CPUKernelThreadGlobals::CPUKernelThreadGlobals(const KernelGlobals &kernel_globa
{
reset_runtime_memory();
- decoupled_volume_steps_index = 0;
coverage_asset = nullptr;
coverage_object = nullptr;
coverage_material = nullptr;
@@ -70,12 +53,6 @@ CPUKernelThreadGlobals::CPUKernelThreadGlobals(CPUKernelThreadGlobals &&other) n
CPUKernelThreadGlobals::~CPUKernelThreadGlobals()
{
- safe_free(transparent_shadow_intersections);
-
- const int decoupled_count = ARRAY_SIZE(decoupled_volume_steps);
- for (int i = 0; i < decoupled_count; ++i) {
- safe_free(decoupled_volume_steps[i]);
- }
#ifdef WITH_OSL
OSLShader::thread_free(this);
#endif
@@ -96,13 +73,9 @@ CPUKernelThreadGlobals &CPUKernelThreadGlobals::operator=(CPUKernelThreadGlobals
void CPUKernelThreadGlobals::reset_runtime_memory()
{
- transparent_shadow_intersections = nullptr;
-
#ifdef WITH_OSL
osl = nullptr;
#endif
-
- memset(decoupled_volume_steps, 0, sizeof(decoupled_volume_steps));
}
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/optix/device_impl.cpp b/intern/cycles/device/optix/device_impl.cpp
index c0ee4f143b8..5843d88ce88 100644
--- a/intern/cycles/device/optix/device_impl.cpp
+++ b/intern/cycles/device/optix/device_impl.cpp
@@ -97,7 +97,7 @@ OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
# endif
/* Fix weird compiler bug that assigns wrong size. */
- launch_params.data_elements = sizeof(KernelParams);
+ launch_params.data_elements = sizeof(KernelParamsOptiX);
/* Allocate launch parameter buffer memory on device. */
launch_params.alloc_to_device(1);
@@ -1387,20 +1387,17 @@ void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
KernelData *const data = (KernelData *)host;
*(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;
- update_launch_params(offsetof(KernelParams, data), host, size);
+ update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
return;
}
/* Update data storage pointers in launch parameters. */
# define KERNEL_TEX(data_type, tex_name) \
if (strcmp(name, #tex_name) == 0) { \
- update_launch_params(offsetof(KernelParams, tex_name), host, size); \
+ update_launch_params(offsetof(KernelParamsOptiX, tex_name), host, size); \
return; \
}
KERNEL_TEX(IntegratorState, __integrator_state)
- KERNEL_TEX(IntegratorQueueCounter *, __integrator_queue_counter)
- KERNEL_TEX(int *, __integrator_sort_key)
- KERNEL_TEX(int *, __integrator_sort_key_counter)
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
}
diff --git a/intern/cycles/device/optix/device_impl.h b/intern/cycles/device/optix/device_impl.h
index ba606075c79..a4b75a16354 100644
--- a/intern/cycles/device/optix/device_impl.h
+++ b/intern/cycles/device/optix/device_impl.h
@@ -26,7 +26,7 @@
CCL_NAMESPACE_BEGIN
class BVHOptiX;
-struct KernelParams;
+struct KernelParamsOptiX;
/* List of OptiX program groups. */
enum {
@@ -64,7 +64,7 @@ class OptiXDevice : public CUDADevice {
bool motion_blur = false;
device_vector<SbtRecord> sbt_data;
- device_only_memory<KernelParams> launch_params;
+ device_only_memory<KernelParamsOptiX> launch_params;
OptixTraversableHandle tlas_handle = 0;
class Denoiser {
diff --git a/intern/cycles/device/optix/queue.cpp b/intern/cycles/device/optix/queue.cpp
index 211df631bcb..59203dedb35 100644
--- a/intern/cycles/device/optix/queue.cpp
+++ b/intern/cycles/device/optix/queue.cpp
@@ -69,16 +69,17 @@ bool OptiXDeviceQueue::enqueue(DeviceKernel kernel, const int work_size, void *a
cuda_device_assert(
cuda_device_,
- cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, path_index_array),
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, path_index_array),
args[0], // &d_path_index
sizeof(device_ptr),
cuda_stream_));
if (kernel == DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) {
- cuda_device_assert(cuda_device_,
- cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, render_buffer),
- args[1], // &d_render_buffer
- sizeof(device_ptr),
- cuda_stream_));
+ cuda_device_assert(
+ cuda_device_,
+ cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParamsOptiX, render_buffer),
+ args[1], // &d_render_buffer
+ sizeof(device_ptr),
+ cuda_stream_));
}
cuda_device_assert(cuda_device_, cuStreamSynchronize(cuda_stream_));
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 8991bfa0c63..0289c634ae6 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -38,7 +38,6 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
queue_(device->gpu_queue_create()),
render_buffers_(buffers),
integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
- integrator_sort_key_(device, "integrator_sort_key", MEM_READ_WRITE),
integrator_sort_key_counter_(device, "integrator_sort_key_counter", MEM_READ_WRITE),
queued_paths_(device, "queued_paths", MEM_READ_WRITE),
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
@@ -47,36 +46,42 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorState))),
max_active_path_index_(0)
{
+ memset(&integrator_state_gpu_, 0, sizeof(integrator_state_gpu_));
work_tile_scheduler_.set_max_num_path_states(max_num_paths_);
}
-void PathTraceWorkGPU::alloc_integrator_state()
+void PathTraceWorkGPU::alloc_integrator_soa()
{
/* IntegrateState allocated as structure of arrays.
*
* Allocate a device only memory buffer before for each struct member, and then
* write the pointers into a struct that resides in constant memory.
*
- * This assumes the device side struct memory contains consecutive pointers for
- * each struct member, with the same 64-bit size as device_ptr.
- *
- * TODO: store float3 in separate XYZ arrays. */
+ * TODO: store float3 in separate XYZ arrays.
+ * TODO: skip zeroing most arrays and leave uninitialized. */
+
if (!integrator_state_soa_.empty()) {
return;
}
- vector<device_ptr> device_struct;
-
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(type, name) \
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) \
+ { \
+ device_only_memory<type> *array = new device_only_memory<type>(device_, \
+ "integrator_state_" #name); \
+ array->alloc_to_device(max_num_paths_); \
+ array->zero_to_device(); \
+ integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
+ }
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name) \
{ \
device_only_memory<type> *array = new device_only_memory<type>(device_, \
"integrator_state_" #name); \
array->alloc_to_device(max_num_paths_); \
- /* TODO: skip for most arrays. */ \
array->zero_to_device(); \
- device_struct.push_back(array->device_pointer); \
integrator_state_soa_.emplace_back(array); \
+ integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
}
#define KERNEL_STRUCT_END(name) \
break; \
@@ -89,12 +94,9 @@ void PathTraceWorkGPU::alloc_integrator_state()
#include "kernel/integrator/integrator_state_template.h"
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
+#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
-
- /* Copy to device side struct in constant memory. */
- device_->const_copy_to(
- "__integrator_state", device_struct.data(), device_struct.size() * sizeof(device_ptr));
}
void PathTraceWorkGPU::alloc_integrator_queue()
@@ -103,11 +105,8 @@ void PathTraceWorkGPU::alloc_integrator_queue()
integrator_queue_counter_.alloc(1);
integrator_queue_counter_.zero_to_device();
integrator_queue_counter_.copy_from_device();
-
- /* Copy to device side pointer in constant memory. */
- device_->const_copy_to("__integrator_queue_counter",
- &integrator_queue_counter_.device_pointer,
- sizeof(device_ptr));
+ integrator_state_gpu_.queue_counter = (IntegratorQueueCounter *)
+ integrator_queue_counter_.device_pointer;
}
/* Allocate data for active path index arrays. */
@@ -126,21 +125,11 @@ void PathTraceWorkGPU::alloc_integrator_queue()
void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Allocate arrays for shader sorting. */
- if (integrator_sort_key_counter_.size() == 0) {
- integrator_sort_key_.alloc(max_num_paths_);
- /* TODO: this could be skip if we had a function to just allocate on device. */
- integrator_sort_key_.zero_to_device();
- device_->const_copy_to(
- "__integrator_sort_key", &integrator_
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list