[Bf-blender-cvs] [87194d0c0a3] cycles-x: Cycles X: reduce GPU state memory usage when some features are not enabled
Brecht Van Lommel
noreply at git.blender.org
Wed Jul 14 17:50:46 CEST 2021
Commit: 87194d0c0a30cefe326212cb1cf56f1706c9a915
Author: Brecht Van Lommel
Date: Wed Jul 14 13:10:24 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB87194d0c0a30cefe326212cb1cf56f1706c9a915
Cycles X: reduce GPU state memory usage when some features are not enabled
In particular, the state for volumes, subsurface, denoising and light passes is only allocated when those features are enabled.
In a scene without these features, state memory usage drops from 538MB to
346MB. This also improves performance, presumably due to reduced memory
traffic.
Differential Revision: https://developer.blender.org/D11915
===================================================================
M intern/cycles/integrator/path_trace_work_gpu.cpp
M intern/cycles/integrator/path_trace_work_gpu.h
M intern/cycles/kernel/integrator/integrator_shade_surface.h
M intern/cycles/kernel/integrator/integrator_shade_volume.h
M intern/cycles/kernel/integrator/integrator_state.h
M intern/cycles/kernel/integrator/integrator_state_template.h
M intern/cycles/kernel/integrator/integrator_state_util.h
M intern/cycles/kernel/integrator/integrator_subsurface.h
M intern/cycles/kernel/kernel_path_state.h
M intern/cycles/kernel/kernel_types.h
M intern/cycles/render/film.cpp
M intern/cycles/render/film.h
M intern/cycles/render/scene.cpp
===================================================================
diff --git a/intern/cycles/integrator/path_trace_work_gpu.cpp b/intern/cycles/integrator/path_trace_work_gpu.cpp
index 35c85166252..b0ba7054543 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.cpp
+++ b/intern/cycles/integrator/path_trace_work_gpu.cpp
@@ -35,6 +35,7 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
bool *cancel_requested_flag)
: PathTraceWork(device, device_scene, cancel_requested_flag),
queue_(device->gpu_queue_create()),
+ integrator_state_soa_kernel_features_(0),
integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
integrator_shader_raytrace_sort_counter_(
@@ -57,28 +58,31 @@ PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
void PathTraceWorkGPU::alloc_integrator_soa()
{
- /* IntegrateState allocated as structure of arrays.
- *
- * Allocate a device only memory buffer before for each struct member, and then
- * write the pointers into a struct that resides in constant memory.
- *
- * TODO: store float3 in separate XYZ arrays. */
+ /* IntegrateState allocated as structure of arrays. */
- if (!integrator_state_soa_.empty()) {
+ /* Check if we already allocated memory for the required features. */
+ const uint kernel_features = device_scene_->data.kernel_features;
+ if ((integrator_state_soa_kernel_features_ & kernel_features) == kernel_features) {
return;
}
+ integrator_state_soa_kernel_features_ = kernel_features;
+ /* Allocate a device only memory buffer before for each struct member, and then
+ * write the pointers into a struct that resides in constant memory.
+ *
+ * TODO: store float3 in separate XYZ arrays. */
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) \
- { \
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
device_only_memory<type> *array = new device_only_memory<type>(device_, \
"integrator_state_" #name); \
array->alloc_to_device(max_num_paths_); \
integrator_state_soa_.emplace_back(array); \
integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
}
-#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name) \
- { \
+#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
+ if ((kernel_features & feature) && \
+ (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
device_only_memory<type> *array = new device_only_memory<type>(device_, \
"integrator_state_" #name); \
array->alloc_to_device(max_num_paths_); \
diff --git a/intern/cycles/integrator/path_trace_work_gpu.h b/intern/cycles/integrator/path_trace_work_gpu.h
index 69475ed0a1b..76018d8501f 100644
--- a/intern/cycles/integrator/path_trace_work_gpu.h
+++ b/intern/cycles/integrator/path_trace_work_gpu.h
@@ -117,6 +117,7 @@ class PathTraceWorkGPU : public PathTraceWork {
IntegratorStateGPU integrator_state_gpu_;
/* SoA arrays for integrator state. */
vector<unique_ptr<device_memory>> integrator_state_soa_;
+ uint integrator_state_soa_kernel_features_;
/* Keep track of number of queued kernels. */
device_vector<IntegratorQueueCounter> integrator_queue_counter_;
/* Shader sorting. */
diff --git a/intern/cycles/kernel/integrator/integrator_shade_surface.h b/intern/cycles/kernel/integrator/integrator_shade_surface.h
index b44b85b7b60..eefbcad87de 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_surface.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_surface.h
@@ -176,14 +176,18 @@ ccl_device_forceinline void integrate_surface_direct_light(INTEGRATOR_STATE_ARGS
uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
shadow_flag |= (is_transmission) ? PATH_RAY_TRANSMISSION_PASS : PATH_RAY_REFLECT_PASS;
- const float3 diffuse_glossy_ratio = (bounce == 0) ? bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
- INTEGRATOR_STATE(path, diffuse_glossy_ratio);
const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&bsdf_eval);
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ bsdf_eval_diffuse_glossy_ratio(&bsdf_eval) :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
- INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
/* Branch off shadow kernel. */
@@ -242,9 +246,12 @@ ccl_device_forceinline int integrate_surface_bsdf_bssrdf_bounce(INTEGRATOR_STATE
float3 throughput = INTEGRATOR_STATE(path, throughput);
throughput *= bsdf_eval_sum(&bsdf_eval) / bsdf_pdf;
INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
- if (INTEGRATOR_STATE(path, bounce) == 0) {
- INTEGRATOR_STATE_WRITE(path,
- diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ if (INTEGRATOR_STATE(path, bounce) == 0) {
+ INTEGRATOR_STATE_WRITE(path,
+ diffuse_glossy_ratio) = bsdf_eval_diffuse_glossy_ratio(&bsdf_eval);
+ }
}
/* Update path state */
diff --git a/intern/cycles/kernel/integrator/integrator_shade_volume.h b/intern/cycles/kernel/integrator/integrator_shade_volume.h
index 55db3fe4f02..c95ac43168c 100644
--- a/intern/cycles/kernel/integrator/integrator_shade_volume.h
+++ b/intern/cycles/kernel/integrator/integrator_shade_volume.h
@@ -584,14 +584,18 @@ ccl_device_forceinline void integrate_volume_direct_light(INTEGRATOR_STATE_ARGS,
uint32_t shadow_flag = INTEGRATOR_STATE(path, flag);
shadow_flag |= (is_light) ? PATH_RAY_SHADOW_FOR_LIGHT : 0;
shadow_flag |= PATH_RAY_VOLUME_PASS;
- const float3 diffuse_glossy_ratio = (bounce == 0) ? one_float3() :
- INTEGRATOR_STATE(path, diffuse_glossy_ratio);
const float3 throughput = INTEGRATOR_STATE(path, throughput) * bsdf_eval_sum(&phase_eval);
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ const float3 diffuse_glossy_ratio = (bounce == 0) ?
+ one_float3() :
+ INTEGRATOR_STATE(path, diffuse_glossy_ratio);
+ INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
+ }
+
INTEGRATOR_STATE_WRITE(shadow_path, flag) = shadow_flag;
INTEGRATOR_STATE_WRITE(shadow_path, bounce) = bounce;
INTEGRATOR_STATE_WRITE(shadow_path, transparent_bounce) = transparent_bounce;
- INTEGRATOR_STATE_WRITE(shadow_path, diffuse_glossy_ratio) = diffuse_glossy_ratio;
INTEGRATOR_STATE_WRITE(shadow_path, throughput) = throughput;
integrator_state_copy_volume_stack_to_shadow(INTEGRATOR_STATE_PASS);
@@ -636,7 +640,10 @@ ccl_device_forceinline bool integrate_volume_phase_scatter(INTEGRATOR_STATE_ARGS
float3 throughput = INTEGRATOR_STATE(path, throughput);
throughput *= bsdf_eval_sum(&phase_eval) / phase_pdf;
INTEGRATOR_STATE_WRITE(path, throughput) = throughput;
- INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+
+ if (kernel_data.kernel_features & KERNEL_FEATURE_LIGHT_PASSES) {
+ INTEGRATOR_STATE_WRITE(path, diffuse_glossy_ratio) = one_float3();
+ }
/* Update path state */
INTEGRATOR_STATE_WRITE(path, mis_ray_pdf) = phase_pdf;
diff --git a/intern/cycles/kernel/integrator/integrator_state.h b/intern/cycles/kernel/integrator/integrator_state.h
index 18cb362799f..6eb3153825b 100644
--- a/intern/cycles/kernel/integrator/integrator_state.h
+++ b/intern/cycles/kernel/integrator/integrator_state.h
@@ -69,7 +69,7 @@ CCL_NAMESPACE_BEGIN
* CPU rendering path state with AoS layout. */
typedef struct IntegratorState {
#define KERNEL_STRUCT_BEGIN(name) struct {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) type name;
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type name;
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
#define KERNEL_STRUCT_END(name) \
} \
@@ -98,7 +98,7 @@ typedef struct IntegratorQueueCounter {
* GPU rendering path state with SoA layout. */
typedef struct IntegratorStateGPU {
#define KERNEL_STRUCT_BEGIN(name) struct {
-#define KERNEL_STRUCT_MEMBER(parent_struct, type, name) type *name;
+#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) type *name;
#define KERNEL_STRUCT_ARRAY_MEMBER KERNEL_STRUCT_MEMBER
#define KERNEL_STRUCT_END(name) \
} \
diff --git a/intern/cycles/kernel/integrator/integrator_state_template.h b/intern/cycles/kernel/integrator/integrator_state_template.h
index a8fe35bc36b..36a42d30dea 100644
--- a/intern/cycles/kernel/integrator/integrator_state_template.h
+++ b/intern/cycles/kernel/integrator/integrator_state_template.h
@@ -23,115 +23,115 @@ KERNEL_STRUCT_BEGIN(path)
* `kernel_data.film.pass_stride`.
*
* The multiplication is delayed for later, so that state can use 32bit integer. */
-KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index)
+KERNEL_STRUCT_MEMBER(path, uint32_t, render_pixel_index, KERNEL_FEATURE_PATH_TRACING)
/* Current sample number. */
-KERNEL_STRUCT_MEMBER(path, uint16_t, sample)
+KERNEL_STRUCT_MEMBER(path, uint16_t, sample, KERNEL_FEATURE_PATH_TRACING)
/* Current ray bounce depth. */
-KERNEL_STRUCT_MEMBER(path, uint16_t, bounce)
+KERNEL_STRUCT_MEMB
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list