[Bf-blender-cvs] [2d994de77c3] master: Cycles: MetalRT optimisation for subsurface intersection queries
Michael Jones
noreply at git.blender.org
Mon Feb 6 20:12:31 CET 2023
Commit: 2d994de77c35a6e8a8a9c78935a3f8ed7d147f7d
Author: Michael Jones
Date: Mon Feb 6 19:09:51 2023 +0000
Branches: master
https://developer.blender.org/rB2d994de77c35a6e8a8a9c78935a3f8ed7d147f7d
Cycles: MetalRT optimisation for subsurface intersection queries
This patch optimises subsurface intersection queries on MetalRT. Currently intersect_local traverses from the scene root, retrospectively discarding all non-local hits. Using a lookup of bottom level acceleration structures, we can explicitly query only the relevant instance. On M1 Max, with MetalRT selected, this can give a render speedup of 15-20% for scenes like Monster which make heavy use of subsurface scattering.
Patch authored by Marco Giordano.
Reviewed By: brecht
Differential Revision: https://developer.blender.org/D17153
===================================================================
M intern/cycles/device/metal/bvh.h
M intern/cycles/device/metal/bvh.mm
M intern/cycles/device/metal/device_impl.h
M intern/cycles/device/metal/device_impl.mm
M intern/cycles/device/metal/kernel.h
M intern/cycles/device/metal/kernel.mm
M intern/cycles/device/metal/queue.mm
M intern/cycles/kernel/device/metal/bvh.h
M intern/cycles/kernel/device/metal/compat.h
M intern/cycles/kernel/device/metal/kernel.metal
===================================================================
diff --git a/intern/cycles/device/metal/bvh.h b/intern/cycles/device/metal/bvh.h
index 519cbf00294..5448a3ae41d 100644
--- a/intern/cycles/device/metal/bvh.h
+++ b/intern/cycles/device/metal/bvh.h
@@ -21,6 +21,7 @@ class BVHMetal : public BVH {
API_AVAILABLE(macos(11.0))
vector<id<MTLAccelerationStructure>> blas_array;
+ vector<uint32_t> blas_lookup;
bool motion_blur = false;
diff --git a/intern/cycles/device/metal/bvh.mm b/intern/cycles/device/metal/bvh.mm
index a7fd64d3c98..c692b762d86 100644
--- a/intern/cycles/device/metal/bvh.mm
+++ b/intern/cycles/device/metal/bvh.mm
@@ -816,6 +816,11 @@ bool BVHMetal::build_TLAS(Progress &progress,
uint32_t instance_index = 0;
uint32_t motion_transform_index = 0;
+
+ // allocate look up buffer for worst case scenario
+ uint64_t count = objects.size();
+ blas_lookup.resize(count);
+
for (Object *ob : objects) {
/* Skip non-traceable objects */
if (!ob->is_traceable())
@@ -843,12 +848,15 @@ bool BVHMetal::build_TLAS(Progress &progress,
/* Set user instance ID to object index */
int object_index = ob->get_device_index();
uint32_t user_id = uint32_t(object_index);
+ int currIndex = instance_index++;
+ assert(user_id < blas_lookup.size());
+ blas_lookup[user_id] = accel_struct_index;
/* Bake into the appropriate descriptor */
if (motion_blur) {
MTLAccelerationStructureMotionInstanceDescriptor *instances =
(MTLAccelerationStructureMotionInstanceDescriptor *)[instanceBuf contents];
- MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[instance_index++];
+ MTLAccelerationStructureMotionInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
@@ -894,7 +902,7 @@ bool BVHMetal::build_TLAS(Progress &progress,
else {
MTLAccelerationStructureUserIDInstanceDescriptor *instances =
(MTLAccelerationStructureUserIDInstanceDescriptor *)[instanceBuf contents];
- MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[instance_index++];
+ MTLAccelerationStructureUserIDInstanceDescriptor &desc = instances[currIndex];
desc.accelerationStructureIndex = accel_struct_index;
desc.userID = user_id;
diff --git a/intern/cycles/device/metal/device_impl.h b/intern/cycles/device/metal/device_impl.h
index a10962b4e45..2b89ebf19c9 100644
--- a/intern/cycles/device/metal/device_impl.h
+++ b/intern/cycles/device/metal/device_impl.h
@@ -74,6 +74,11 @@ class MetalDevice : public Device {
id<MTLBuffer> texture_bindings_3d = nil;
std::vector<id<MTLTexture>> texture_slot_map;
+ /* BLAS encoding & lookup */
+ id<MTLArgumentEncoder> mtlBlasArgEncoder = nil;
+ id<MTLBuffer> blas_buffer = nil;
+ id<MTLBuffer> blas_lookup_buffer = nil;
+
bool use_metalrt = false;
MetalPipelineType kernel_specialization_level = PSO_GENERIC;
diff --git a/intern/cycles/device/metal/device_impl.mm b/intern/cycles/device/metal/device_impl.mm
index 35298822e41..aadf5e02934 100644
--- a/intern/cycles/device/metal/device_impl.mm
+++ b/intern/cycles/device/metal/device_impl.mm
@@ -192,6 +192,10 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
arg_desc_as.access = MTLArgumentAccessReadOnly;
+ MTLArgumentDescriptor *arg_desc_ptrs = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_ptrs.dataType = MTLDataTypePointer;
+ arg_desc_ptrs.access = MTLArgumentAccessReadOnly;
+
MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
arg_desc_ift.access = MTLArgumentAccessReadOnly;
@@ -204,14 +208,28 @@ MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profile
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
arg_desc_ift.index = index++;
[ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
+ arg_desc_ift.index = index++;
+ [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local_prim */
+ arg_desc_ptrs.index = index++;
+ [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* blas array */
+ arg_desc_ptrs.index = index++;
+ [ancillary_desc addObject:[arg_desc_ptrs copy]]; /* look up table for blas */
[arg_desc_ift release];
[arg_desc_as release];
+ [arg_desc_ptrs release];
}
}
mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
+ // preparing the blas arg encoder
+ MTLArgumentDescriptor *arg_desc_blas = [[MTLArgumentDescriptor alloc] init];
+ arg_desc_blas.dataType = MTLDataTypeInstanceAccelerationStructure;
+ arg_desc_blas.access = MTLArgumentAccessReadOnly;
+ mtlBlasArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_blas ]];
+ [arg_desc_blas release];
+
for (int i = 0; i < ancillary_desc.count; i++) {
[ancillary_desc[i] release];
}
@@ -1240,6 +1258,33 @@ void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
if (@available(macos 11.0, *)) {
if (bvh->params.top_level) {
bvhMetalRT = bvh_metal;
+
+ // allocate required buffers for BLAS array
+ uint64_t count = bvhMetalRT->blas_array.size();
+ uint64_t bufferSize = mtlBlasArgEncoder.encodedLength * count;
+ blas_buffer = [mtlDevice newBufferWithLength:bufferSize options:default_storage_mode];
+ stats.mem_alloc(blas_buffer.allocatedSize);
+
+ for (uint64_t i = 0; i < count; ++i) {
+ [mtlBlasArgEncoder setArgumentBuffer:blas_buffer
+ offset:i * mtlBlasArgEncoder.encodedLength];
+ [mtlBlasArgEncoder setAccelerationStructure:bvhMetalRT->blas_array[i] atIndex:0];
+ }
+
+ count = bvhMetalRT->blas_lookup.size();
+ bufferSize = sizeof(uint32_t) * count;
+ blas_lookup_buffer = [mtlDevice newBufferWithLength:bufferSize
+ options:default_storage_mode];
+ stats.mem_alloc(blas_lookup_buffer.allocatedSize);
+
+ memcpy([blas_lookup_buffer contents],
+ bvhMetalRT -> blas_lookup.data(),
+ blas_lookup_buffer.allocatedSize);
+
+ if (default_storage_mode == MTLResourceStorageModeManaged) {
+ [blas_buffer didModifyRange:NSMakeRange(0, blas_buffer.length)];
+ [blas_lookup_buffer didModifyRange:NSMakeRange(0, blas_lookup_buffer.length)];
+ }
}
}
}
diff --git a/intern/cycles/device/metal/kernel.h b/intern/cycles/device/metal/kernel.h
index 212671f52a0..0225c5c4947 100644
--- a/intern/cycles/device/metal/kernel.h
+++ b/intern/cycles/device/metal/kernel.h
@@ -19,6 +19,8 @@ enum {
METALRT_FUNC_SHADOW_BOX,
METALRT_FUNC_LOCAL_TRI,
METALRT_FUNC_LOCAL_BOX,
+ METALRT_FUNC_LOCAL_TRI_PRIM,
+ METALRT_FUNC_LOCAL_BOX_PRIM,
METALRT_FUNC_CURVE_RIBBON,
METALRT_FUNC_CURVE_RIBBON_SHADOW,
METALRT_FUNC_CURVE_ALL,
@@ -28,7 +30,13 @@ enum {
METALRT_FUNC_NUM
};
-enum { METALRT_TABLE_DEFAULT, METALRT_TABLE_SHADOW, METALRT_TABLE_LOCAL, METALRT_TABLE_NUM };
+enum {
+ METALRT_TABLE_DEFAULT,
+ METALRT_TABLE_SHADOW,
+ METALRT_TABLE_LOCAL,
+ METALRT_TABLE_LOCAL_PRIM,
+ METALRT_TABLE_NUM
+};
/* Pipeline State Object types */
enum MetalPipelineType {
diff --git a/intern/cycles/device/metal/kernel.mm b/intern/cycles/device/metal/kernel.mm
index 2ed230ee657..d9e977f1ed6 100644
--- a/intern/cycles/device/metal/kernel.mm
+++ b/intern/cycles/device/metal/kernel.mm
@@ -524,6 +524,8 @@ void MetalKernelPipeline::compile()
"__anyhit__cycles_metalrt_shadow_all_hit_box",
"__anyhit__cycles_metalrt_local_hit_tri",
"__anyhit__cycles_metalrt_local_hit_box",
+ "__anyhit__cycles_metalrt_local_hit_tri_prim",
+ "__anyhit__cycles_metalrt_local_hit_box_prim",
"__intersection__curve_ribbon",
"__intersection__curve_ribbon_shadow",
"__intersection__curve_all",
@@ -614,11 +616,17 @@ void MetalKernelPipeline::compile()
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
nil];
+ table_functions[METALRT_TABLE_LOCAL_PRIM] = [NSArray
+ arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI_PRIM],
+ rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+ rt_intersection_function[METALRT_FUNC_LOCAL_BOX_PRIM],
+ nil];
NSMutableSet *unique_functions = [NSMutableSet
setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
[unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
+ [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL_PRIM]];
if (kernel_has_intersection(device_kernel)) {
linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
diff --git a/intern/cycles/device/metal/queue.mm b/intern/cycles/device/metal/queue.mm
index 9137e9b1fb0..b824b75ccf4 100644
--- a/intern/cycles/device/metal/queue.mm
+++ b/intern/cycles/device/metal/queue.mm
@@ -482,6 +482,12 @@ bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
if (metal_device_->bvhMetalRT) {
id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
[metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
+ [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
+ offset:0
+ atIndex:7];
+ [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_lookup_buffer
+ offset:0
+
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list