[Bf-blender-cvs] [9181588057d] cycles-x: Cleanup: move number of concurrent GPU paths to device queue
Brecht Van Lommel
noreply at git.blender.org
Wed Apr 28 20:07:50 CEST 2021
Commit: 9181588057db04fd594c0fad9c98b9325dda42e1
Author: Brecht Van Lommel
Date: Wed Apr 28 19:26:46 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB9181588057db04fd594c0fad9c98b9325dda42e1
Cleanup: move number of concurrent GPU paths to device queue
This allows different devices to set it to different values, though it is
still hardcoded for now.
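As a rough illustration of the pattern this commit introduces, here is a
minimal, self-contained C++ sketch: DeviceQueue exposes a pure virtual
num_concurrent_states() and each backend overrides it with its own value.
The FooDeviceQueue class and its fixed 4 GiB memory budget are hypothetical
stand-ins, not Cycles code; the actual commit only overrides this in
CUDADeviceQueue, which still returns a hardcoded 1048576.

#include <cstddef>
#include <cstdio>

class DeviceQueue {
 public:
  virtual ~DeviceQueue() = default;

  /* Number of concurrent states to process for the integrator,
   * based on the number of cores and/or available memory. */
  virtual int num_concurrent_states(const std::size_t state_size) const = 0;
};

class FooDeviceQueue : public DeviceQueue {
 public:
  /* A hypothetical backend could size this from available memory instead
   * of hardcoding it, e.g. budget half of free memory for path states. */
  int num_concurrent_states(const std::size_t state_size) const override
  {
    const std::size_t free_memory = 4ull * 1024 * 1024 * 1024; /* stand-in for a real memory query */
    return static_cast<int>((free_memory / 2) / state_size);
  }
};

int main()
{
  FooDeviceQueue queue;
  /* 512 bytes is an arbitrary example state size. */
  std::printf("concurrent states: %d\n", queue.num_concurrent_states(512));
  return 0;
}

Making the count a queue virtual keeps PathTraceWorkGPU backend-agnostic:
as the diff below shows, it simply asks the queue for the value when
initializing max_num_paths_ in its constructor.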
===================================================================
M intern/cycles/device/cuda/queue.cpp
M intern/cycles/device/cuda/queue.h
M intern/cycles/device/device_queue.h
M intern/cycles/integrator/path_trace_work_gpu.cpp
M intern/cycles/integrator/path_trace_work_gpu.h
===================================================================
diff --git a/intern/cycles/device/cuda/queue.cpp b/intern/cycles/device/cuda/queue.cpp
index 32af06d85df..f8db047a663 100644
--- a/intern/cycles/device/cuda/queue.cpp
+++ b/intern/cycles/device/cuda/queue.cpp
@@ -37,6 +37,13 @@ CUDADeviceQueue::~CUDADeviceQueue()
cuStreamDestroy(cuda_stream_);
}
+int CUDADeviceQueue::num_concurrent_states(const size_t) const
+{
+ /* TODO: compute automatically. */
+ /* TODO: must have at least num_threads_per_block. */
+ return 1048576;
+}
+
void CUDADeviceQueue::init_execution()
{
/* Synchronize all textures and memory copies before executing task. */
diff --git a/intern/cycles/device/cuda/queue.h b/intern/cycles/device/cuda/queue.h
index acdcfd8a69e..54088f0b6a5 100644
--- a/intern/cycles/device/cuda/queue.h
+++ b/intern/cycles/device/cuda/queue.h
@@ -35,6 +35,8 @@ class CUDADeviceQueue : public DeviceQueue {
CUDADeviceQueue(CUDADevice *device);
~CUDADeviceQueue();
+ virtual int num_concurrent_states(const size_t state_size) const override;
+
virtual void init_execution() override;
virtual bool kernel_available(DeviceKernel kernel) const override;
diff --git a/intern/cycles/device/device_queue.h b/intern/cycles/device/device_queue.h
index dffa3ce26b7..d1aee95f963 100644
--- a/intern/cycles/device/device_queue.h
+++ b/intern/cycles/device/device_queue.h
@@ -36,6 +36,10 @@ class DeviceQueue {
public:
virtual ~DeviceQueue();
+ /* Number of concurrent states to process for integrator,
+ * based on number of cores and/or available memory. */
+ virtual int num_concurrent_states(const size_t state_size) const = 0;
+
/* Initialize execution of kernels on this queue.
*
* Will, for example, load all data required by the kernels from Device to global or path state.
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index d477c92da13..8991bfa0c63 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -44,9 +44,10 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
num_queued_paths_(device, "num_queued_paths", MEM_READ_WRITE),
work_tiles_(device, "work_tiles", MEM_READ_WRITE),
gpu_display_rgba_half_(device, "display buffer half", MEM_READ_WRITE),
+ max_num_paths_(queue_->num_concurrent_states(sizeof(IntegratorState))),
max_active_path_index_(0)
{
- work_tile_scheduler_.set_max_num_path_states(get_max_num_paths());
+ work_tile_scheduler_.set_max_num_path_states(max_num_paths_);
}
void PathTraceWorkGPU::alloc_integrator_state()
@@ -65,14 +66,13 @@ void PathTraceWorkGPU::alloc_integrator_state()
}
vector<device_ptr> device_struct;
- const int max_num_paths = get_max_num_paths();
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(type, name) \
{ \
device_only_memory<type> *array = new device_only_memory<type>(device_, \
"integrator_state_" #name); \
- array->alloc_to_device(max_num_paths); \
+ array->alloc_to_device(max_num_paths_); \
/* TODO: skip for most arrays. */ \
array->zero_to_device(); \
device_struct.push_back(array->device_pointer); \
@@ -117,7 +117,7 @@ void PathTraceWorkGPU::alloc_integrator_queue()
}
if (queued_paths_.size() == 0) {
- queued_paths_.alloc(get_max_num_paths());
+ queued_paths_.alloc(max_num_paths_);
/* TODO: this could be skipped if we had a function to just allocate on device. */
queued_paths_.zero_to_device();
}
@@ -127,7 +127,7 @@ void PathTraceWorkGPU::alloc_integrator_sorting()
{
/* Allocate arrays for shader sorting. */
if (integrator_sort_key_counter_.size() == 0) {
- integrator_sort_key_.alloc(get_max_num_paths());
+ integrator_sort_key_.alloc(max_num_paths_);
/* TODO: this could be skipped if we had a function to just allocate on device. */
integrator_sort_key_.zero_to_device();
device_->const_copy_to(
@@ -202,7 +202,6 @@ bool PathTraceWorkGPU::enqueue_path_iteration()
return false;
}
- const int max_num_paths = get_max_num_paths();
const float megakernel_threshold = 0.02f;
const bool use_megakernel = queue_->kernel_available(DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL) &&
(num_paths < megakernel_threshold * max_num_paths_);
@@ -294,7 +293,7 @@ void PathTraceWorkGPU::enqueue_path_iteration(DeviceKernel kernel)
queue_->zero_to_device(num_queued_paths_);
}
- DCHECK_LE(work_size, get_max_num_paths());
+ DCHECK_LE(work_size, max_num_paths_);
switch (kernel) {
case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
@@ -386,7 +385,7 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, int queued_kern
/* TODO: this could be smaller for terminated paths based on amount of work we want
* to schedule. */
const int work_size = (kernel == DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY) ?
- get_max_num_paths() :
+ max_num_paths_ :
max_active_path_index_;
void *d_queued_paths = (void *)queued_paths_.device_pointer;
@@ -400,7 +399,6 @@ void PathTraceWorkGPU::compute_queued_paths(DeviceKernel kernel, int queued_kern
bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
{
const float regenerate_threshold = 0.5f;
- const int max_num_paths = get_max_num_paths();
int num_paths = get_num_active_paths();
if (num_paths == 0) {
@@ -421,11 +419,11 @@ bool PathTraceWorkGPU::enqueue_work_tiles(bool &finished)
/* Schedule when we're out of paths or there are too few paths to keep the
* device occupied. */
- if (num_paths == 0 || num_paths < regenerate_threshold * max_num_paths) {
+ if (num_paths == 0 || num_paths < regenerate_threshold * max_num_paths_) {
/* Get work tiles until the maximum number of paths is reached. */
- while (num_paths < max_num_paths) {
+ while (num_paths < max_num_paths_) {
KernelWorkTile work_tile;
- if (work_tile_scheduler_.get_work(&work_tile, max_num_paths - num_paths)) {
+ if (work_tile_scheduler_.get_work(&work_tile, max_num_paths_ - num_paths)) {
work_tiles.push_back(work_tile);
num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
}
@@ -499,7 +497,7 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
/* Offset work tile and path index pointers for next tile. */
num_paths += tile_work_size;
- DCHECK_LE(num_paths, get_max_num_paths());
+ DCHECK_LE(num_paths, max_num_paths_);
/* TODO: this pointer manipulation won't work for OpenCL. */
d_work_tile = (void *)(((KernelWorkTile *)d_work_tile) + 1);
@@ -510,7 +508,7 @@ void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
/* TODO: this could be computed more accurately based on the last entry
* in the queued_paths array passed to the kernel? */
- max_active_path_index_ = min(max_active_path_index_ + num_paths, get_max_num_paths());
+ max_active_path_index_ = min(max_active_path_index_ + num_paths, max_num_paths_);
}
int PathTraceWorkGPU::get_num_active_paths()
@@ -526,13 +524,6 @@ int PathTraceWorkGPU::get_num_active_paths()
return num_paths;
}
-int PathTraceWorkGPU::get_max_num_paths()
-{
- /* TODO: compute automatically. */
- /* TODO: must have at least num_threads_per_block. */
- return 1048576;
-}
-
void PathTraceWorkGPU::copy_to_gpu_display(GPUDisplay *gpu_display, float sample_scale)
{
if (!interop_use_checked_) {
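For context on how max_num_paths_ is consumed by the scheduling code in the
hunks above, here is a small stand-alone sketch of the two thresholds: paths
are regenerated when fewer than 0.5f of the maximum are active, and the
megakernel is preferred when fewer than 0.02f are active (both constants
taken from the diff). The helper functions and main() are illustrative only;
the real logic lives inline in PathTraceWorkGPU::enqueue_work_tiles() and
PathTraceWorkGPU::enqueue_path_iteration().

#include <cstdio>

/* Schedule new work tiles when there are too few paths to keep the
 * device occupied. */
static bool should_regenerate(int num_paths, int max_num_paths)
{
  const float regenerate_threshold = 0.5f;
  return num_paths == 0 || num_paths < regenerate_threshold * max_num_paths;
}

/* Fall back to the megakernel when only a small tail of paths remains. */
static bool should_use_megakernel(int num_paths, int max_num_paths)
{
  const float megakernel_threshold = 0.02f;
  return num_paths < megakernel_threshold * max_num_paths;
}

int main()
{
  const int max_num_paths = 1048576; /* value hardcoded by the commit */
  const int num_paths = 100000;      /* arbitrary example count */
  std::printf("%d active: regenerate=%d megakernel=%d\n",
              num_paths,
              should_regenerate(num_paths, max_num_paths),
              should_use_megakernel(num_paths, max_num_paths));
  return 0;
}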
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 3e7fa1d5726..4439f6ae6a7 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -86,8 +86,7 @@ class PathTraceWorkGPU : public PathTraceWork {
void enqueue_adaptive_sampling_filter_x();
void enqueue_adaptive_sampling_filter_y();
- /* Integrator queues.
- * There are as many of queues as the concurrent queues the device supports. */
+ /* Integrator queue. */
unique_ptr<DeviceQueue> queue_;
/* Scheduler which gives work to path tracing threads. */
@@ -122,6 +121,9 @@ class PathTraceWorkGPU : public PathTraceWork {
bool interop_use_checked_ = false;
bool interop_use_ = false;
+ /* Maximum number of concurrent integrator states. */
+ int max_num_paths_;
+
/* Maximum path index, effective number of paths used may be smaller than
* the size of the integrator_state_ buffer so can avoid iterating over the
* full buffer. */