[Bf-blender-cvs] [222b64fcdc8] master: Fix Cycles CUDA crash when building kernels without optimizations (for debug)

Brecht Van Lommel noreply at git.blender.org
Wed Nov 30 22:00:14 CET 2022


Commit: 222b64fcdc8166266eca10bb16c1cebb2bda8923
Author: Brecht Van Lommel
Date:   Wed Nov 30 21:38:57 2022 +0100
Branches: master
https://developer.blender.org/rB222b64fcdc8166266eca10bb16c1cebb2bda8923

Fix Cycles CUDA crash when building kernels without optimizations (for debug)

In this case the blocksize may not be the one we requested, which the code
assumed it to be. Instead, get the effective block size from the compiler, as
was already done for Metal and OneAPI.
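
For context, a minimal standalone CUDA sketch of the failure mode (hypothetical
demo kernels, not Cycles code). Debug builds (nvcc -G) disable optimizations
and raise register pressure, which can lower a kernel's maximum block size
below the requested one. A block-wide reduction that bakes the block size in as
a template parameter, the way gpu_parallel_active_index_array_impl did, then
produces undefined results, while reading the effective size from blockDim.x
(what ccl_gpu_block_dim_x maps to on the CUDA backend) is correct for any
launch size. Power-of-two launches are assumed:

#include <cstdio>
#include <cuda_runtime.h>

template<unsigned BLOCKSIZE>
__global__ void block_sum_assumed(const int *in, int *out, int n)
{
  // Breaks when the launch uses fewer than BLOCKSIZE threads: the reduction
  // reads shared-memory slots that no live thread ever initialized.
  __shared__ int cache[BLOCKSIZE];
  const unsigned t = threadIdx.x;
  const unsigned i = blockIdx.x * BLOCKSIZE + t;
  cache[t] = (i < (unsigned)n) ? in[i] : 0;
  __syncthreads();
  for (unsigned s = BLOCKSIZE / 2; s > 0; s >>= 1) {
    if (t < s)
      cache[t] += cache[t + s];
    __syncthreads();
  }
  if (t == 0)
    atomicAdd(out, cache[0]);
}

__global__ void block_sum_effective(const int *in, int *out, int n)
{
  extern __shared__ int cache[];          // sized by the launcher
  const unsigned blocksize = blockDim.x;  // effective block size at runtime
  const unsigned t = threadIdx.x;
  const unsigned i = blockIdx.x * blocksize + t;
  cache[t] = (i < (unsigned)n) ? in[i] : 0;
  __syncthreads();
  for (unsigned s = blocksize / 2; s > 0; s >>= 1) {
    if (t < s)
      cache[t] += cache[t + s];
    __syncthreads();
  }
  if (t == 0)
    atomicAdd(out, cache[0]);
}

int main()
{
  const int n = 4096;
  int *in, *out;
  cudaMallocManaged(&in, n * sizeof(int));
  cudaMallocManaged(&out, sizeof(int));
  for (int i = 0; i < n; i++)
    in[i] = 1;

  // Launched with 128 threads while the template says 256: undefined result.
  *out = 0;
  block_sum_assumed<256><<<n / 128, 128>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("assumed:   %d (expected %d)\n", *out, n);

  // Same 128-thread launch, but the kernel adapts to blockDim.x: correct.
  *out = 0;
  block_sum_effective<<<n / 128, 128, 128 * sizeof(int)>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("effective: %d (expected %d)\n", *out, n);

  cudaFree(in);
  cudaFree(out);
  return 0;
}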

===================================================================

M	intern/cycles/kernel/device/gpu/kernel.h
M	intern/cycles/kernel/device/gpu/parallel_active_index.h

===================================================================

diff --git a/intern/cycles/kernel/device/gpu/kernel.h b/intern/cycles/kernel/device/gpu/kernel.h
index d7d2000775f..a44bd1dece7 100644
--- a/intern/cycles/kernel/device/gpu/kernel.h
+++ b/intern/cycles/kernel/device/gpu/kernel.h
@@ -314,11 +314,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -333,11 +329,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int kernel_index);
   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -349,11 +341,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -366,11 +354,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -383,11 +368,8 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
 {
   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices + indices_offset,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(
+      num_states, indices + indices_offset, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -431,11 +413,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
@@ -469,11 +447,7 @@ ccl_gpu_kernel_threads(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE)
                         int num_active_paths);
   ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;
 
-  gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
-                                  num_states,
-                                  indices,
-                                  num_indices,
-                                  ccl_gpu_kernel_lambda_pass);
+  gpu_parallel_active_index_array(num_states, indices, num_indices, ccl_gpu_kernel_lambda_pass);
 }
 ccl_gpu_kernel_postfix
 
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 38cdcb572eb..1d47211604b 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -56,7 +56,7 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
   const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
 #else /* !__KERNEL__ONEAPI__ */
 #  ifndef __KERNEL_METAL__
-template<uint blocksize, typename IsActiveOp>
+template<typename IsActiveOp>
 __device__
 #  endif
     void
@@ -79,6 +79,10 @@ __device__
 {
   extern ccl_gpu_shared int warp_offset[];
 
+#    ifndef __KERNEL_METAL__
+  const uint blocksize = ccl_gpu_block_dim_x;
+#    endif
+
   const uint thread_index = ccl_gpu_thread_idx_x;
   const uint thread_warp = thread_index % ccl_gpu_warp_size;
 
@@ -149,7 +153,7 @@ __device__
 
 #ifdef __KERNEL_METAL__
 
-#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
     const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                                is_active_op(ccl_gpu_global_id_x()) : \
                                0; \
@@ -167,15 +171,13 @@ __device__
                                          simdgroup_offset)
 #elif defined(__KERNEL_ONEAPI__)
 
-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
     gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
 
 #else
 
-#  define gpu_parallel_active_index_array( \
-      blocksize, num_states, indices, num_indices, is_active_op) \
-    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)
+#  define gpu_parallel_active_index_array(num_states, indices, num_indices, is_active_op) \
+    gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
 
 #endif
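
On the host side, "get the effective block size from the compiler" can be
illustrated with the CUDA runtime API (a hedged sketch; the kernel and names
are hypothetical, and the toy atomic-counter compaction below does not preserve
order the way the warp-scan in parallel_active_index.h does; Cycles itself
queries through the driver API, e.g. cuOccupancyMaxPotentialBlockSize). The
idea: ask the runtime what block size the compiled kernel actually supports
and launch with that, rather than assuming the requested size:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void active_index_demo(const int *flags,
                                  int *indices,
                                  int *num_indices,
                                  int n)
{
  // Toy "active index" pass: compact the indices of set flags. Addressing
  // uses blockDim.x, never a hard-coded block size.
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n && flags[i])
    indices[atomicAdd(num_indices, 1)] = i;
}

int main()
{
  const int n = 1 << 16;
  int *flags, *indices, *num_indices;
  cudaMallocManaged(&flags, n * sizeof(int));
  cudaMallocManaged(&indices, n * sizeof(int));
  cudaMallocManaged(&num_indices, sizeof(int));
  for (int i = 0; i < n; i++)
    flags[i] = (i % 3 == 0);
  *num_indices = 0;

  // In a debug build the compiler may cap the kernel below the requested
  // block size; honor the effective limit it reports.
  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, active_index_demo);
  const int requested = 1024;
  const int threads = std::min(requested, attr.maxThreadsPerBlock);
  const int blocks = (n + threads - 1) / threads;

  active_index_demo<<<blocks, threads>>>(flags, indices, num_indices, n);
  cudaDeviceSynchronize();
  printf("threads/block %d (requested %d), active %d of %d\n",
         threads, requested, *num_indices, n);

  cudaFree(flags);
  cudaFree(indices);
  cudaFree(num_indices);
  return 0;
}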


