[Bf-blender-cvs] [858fffc2df8] master: Cycles: oneAPI: add support for SYCL host task
Nikita Sirgienko
noreply at git.blender.org
Tue Jan 3 20:49:03 CET 2023
Commit: 858fffc2df8fe124664c75796aa39fab4bc93af1
Author: Nikita Sirgienko
Date: Tue Jan 3 20:45:57 2023 +0100
Branches: master
https://developer.blender.org/rB858fffc2df8fe124664c75796aa39fab4bc93af1
Cycles: oneAPI: add support for SYCL host task
This functionality is related only to debugging of SYCL implementation
via single-threaded CPU execution and is disabled by default.
Host device has been deprecated in SYCL 2020 spec and we removed it
in 305b92e05f748a0fd9cb62b9829791d717ba2d57.
Since this is still very useful for debugging, we're restoring a
similar functionality here through SYCL 2020 Host Task.
===================================================================
M CMakeLists.txt
M intern/cycles/device/CMakeLists.txt
M intern/cycles/device/oneapi/device_impl.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/device/gpu/parallel_active_index.h
M intern/cycles/kernel/device/oneapi/compat.h
M intern/cycles/kernel/device/oneapi/globals.h
M intern/cycles/kernel/device/oneapi/kernel.cpp
===================================================================
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8aa8bffe08f..acfab6ffc60 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -501,12 +501,14 @@ endif()
if(NOT APPLE)
option(WITH_CYCLES_DEVICE_ONEAPI "Enable Cycles oneAPI compute support" OFF)
option(WITH_CYCLES_ONEAPI_BINARIES "Enable Ahead-Of-Time compilation for Cycles oneAPI device" OFF)
+ option(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION "Switch target of oneAPI implementation from SYCL devices to Host Task (single thread on CPU). This option is only for debugging purposes." OFF)
# https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html
# acm-g10 is the target for the first Intel Arc Alchemist GPUs.
set(CYCLES_ONEAPI_SPIR64_GEN_DEVICES "acm-g10" CACHE STRING "oneAPI Intel GPU architectures to build binaries for")
set(CYCLES_ONEAPI_SYCL_TARGETS spir64 spir64_gen CACHE STRING "oneAPI targets to build AOT binaries for")
+ mark_as_advanced(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
mark_as_advanced(CYCLES_ONEAPI_SPIR64_GEN_DEVICES)
mark_as_advanced(CYCLES_ONEAPI_SYCL_TARGETS)
endif()
diff --git a/intern/cycles/device/CMakeLists.txt b/intern/cycles/device/CMakeLists.txt
index c7e95d44d9b..8ec15c6f304 100644
--- a/intern/cycles/device/CMakeLists.txt
+++ b/intern/cycles/device/CMakeLists.txt
@@ -163,6 +163,9 @@ if(WITH_CYCLES_DEVICE_METAL)
endif()
if(WITH_CYCLES_DEVICE_ONEAPI)
+ if(WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
+ add_definitions(-DWITH_ONEAPI_SYCL_HOST_TASK)
+ endif()
if(WITH_CYCLES_ONEAPI_BINARIES)
set(cycles_kernel_oneapi_lib_suffix "_aot")
else()
diff --git a/intern/cycles/device/oneapi/device_impl.cpp b/intern/cycles/device/oneapi/device_impl.cpp
index bf8de8b5a12..edffd9525b1 100644
--- a/intern/cycles/device/oneapi/device_impl.cpp
+++ b/intern/cycles/device/oneapi/device_impl.cpp
@@ -429,7 +429,12 @@ void OneapiDevice::check_usm(SyclQueue *queue_, const void *usm_ptr, bool allow_
queue->get_device().get_info<sycl::info::device::device_type>();
sycl::usm::alloc usm_type = get_pointer_type(usm_ptr, queue->get_context());
(void)usm_type;
- assert(usm_type == sycl::usm::alloc::device ||
+# ifndef WITH_ONEAPI_SYCL_HOST_TASK
+ const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
+# else
+ const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
+# endif
+ assert(usm_type == main_memory_type ||
(usm_type == sycl::usm::alloc::host &&
(allow_host || device_type == sycl::info::device_type::cpu)) ||
usm_type == sycl::usm::alloc::unknown);
@@ -478,7 +483,11 @@ void *OneapiDevice::usm_alloc_device(SyclQueue *queue_, size_t memory_size)
{
assert(queue_);
sycl::queue *queue = reinterpret_cast<sycl::queue *>(queue_);
+# ifndef WITH_ONEAPI_SYCL_HOST_TASK
return sycl::malloc_device(memory_size, *queue);
+# else
+ return sycl::malloc_host(memory_size, *queue);
+# endif
}
void OneapiDevice::usm_free(SyclQueue *queue_, void *usm_ptr)
@@ -736,7 +745,11 @@ char *OneapiDevice::device_capabilities()
const std::vector<sycl::device> &oneapi_devices = available_devices();
for (const sycl::device &device : oneapi_devices) {
+# ifndef WITH_ONEAPI_SYCL_HOST_TASK
const std::string &name = device.get_info<sycl::info::device::name>();
+# else
+ const std::string &name = "SYCL Host Task (Debug)";
+# endif
capabilities << std::string("\t") << name << "\n";
# define WRITE_ATTR(attribute_name, attribute_variable) \
@@ -813,7 +826,11 @@ void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb, void *user_p
for (sycl::device &device : devices) {
const std::string &platform_name =
device.get_platform().get_info<sycl::info::platform::name>();
+# ifndef WITH_ONEAPI_SYCL_HOST_TASK
std::string name = device.get_info<sycl::info::device::name>();
+# else
+ std::string name = "SYCL Host Task (Debug)";
+# endif
std::string id = "ONEAPI_" + platform_name + "_" + name;
if (device.has(sycl::aspect::ext_intel_pci_address)) {
id.append("_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index a7a6c0a6007..5ba1b683d6b 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -752,6 +752,10 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
${SYCL_CPP_FLAGS}
)
+ if (WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
+ list(APPEND sycl_compiler_flags -DWITH_ONEAPI_SYCL_HOST_TASK)
+ endif()
+
# Set defaults for spir64 and spir64_gen options
if(NOT DEFINED CYCLES_ONEAPI_SYCL_OPTIONS_spir64)
set(CYCLES_ONEAPI_SYCL_OPTIONS_spir64 "-options '-ze-opt-large-register-file -ze-opt-regular-grf-kernel integrator_intersect'")
@@ -763,7 +767,8 @@ if(WITH_CYCLES_DEVICE_ONEAPI)
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "--format zebin ")
string(PREPEND CYCLES_ONEAPI_SYCL_OPTIONS_spir64_gen "-device ${CYCLES_ONEAPI_SPIR64_GEN_DEVICES} ")
- if(WITH_CYCLES_ONEAPI_BINARIES)
+ # Host execution won't use GPU binaries, no need to compile them.
+ if(WITH_CYCLES_ONEAPI_BINARIES AND NOT WITH_CYCLES_ONEAPI_HOST_TASK_EXECUTION)
# AoT binaries aren't currently reused when calling sycl::build.
list(APPEND sycl_compiler_flags -DSYCL_SKIP_KERNELS_PRELOAD)
# Iterate over all targest and their options
diff --git a/intern/cycles/kernel/device/gpu/parallel_active_index.h b/intern/cycles/kernel/device/gpu/parallel_active_index.h
index 1d47211604b..c876c35465c 100644
--- a/intern/cycles/kernel/device/gpu/parallel_active_index.h
+++ b/intern/cycles/kernel/device/gpu/parallel_active_index.h
@@ -30,6 +30,16 @@ void gpu_parallel_active_index_array_impl(const uint num_states,
ccl_global int *ccl_restrict num_indices,
IsActiveOp is_active_op)
{
+# ifdef WITH_ONEAPI_SYCL_HOST_TASK
+ int write_index = 0;
+ for (int state_index = 0; state_index < num_states; state_index++) {
+ if (is_active_op(state_index))
+ indices[write_index++] = state_index;
+ }
+ *num_indices = write_index;
+ return;
+# endif /* WITH_ONEAPI_SYCL_HOST_TASK */
+
const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
const uint blocksize = item_id.get_local_range(0);
diff --git a/intern/cycles/kernel/device/oneapi/compat.h b/intern/cycles/kernel/device/oneapi/compat.h
index b83512180d7..0691c01b3b5 100644
--- a/intern/cycles/kernel/device/oneapi/compat.h
+++ b/intern/cycles/kernel/device/oneapi/compat.h
@@ -56,7 +56,8 @@
#define ccl_gpu_kernel(block_num_threads, thread_num_registers)
#define ccl_gpu_kernel_threads(block_num_threads)
-#define ccl_gpu_kernel_signature(name, ...) \
+#ifndef WITH_ONEAPI_SYCL_HOST_TASK
+# define ccl_gpu_kernel_signature(name, ...) \
void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
size_t kernel_global_size, \
size_t kernel_local_size, \
@@ -67,9 +68,37 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
sycl::nd_range<1>(kernel_global_size, kernel_local_size), \
[=](sycl::nd_item<1> item) {
-#define ccl_gpu_kernel_postfix \
+# define ccl_gpu_kernel_postfix \
}); \
}
+#else
+/* Additional anonymous lambda is required to handle all "return" statements in the kernel code */
+# define ccl_gpu_kernel_signature(name, ...) \
+void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
+ size_t kernel_global_size, \
+ size_t kernel_local_size, \
+ sycl::handler &cgh, \
+ __VA_ARGS__) { \
+ (kg); \
+ (kernel_local_size); \
+ cgh.host_task( \
+ [=]() {\
+ for (size_t gid = (size_t)0; gid < kernel_global_size; gid++) { \
+ kg->nd_item_local_id_0 = 0; \
+ kg->nd_item_local_range_0 = 1; \
+ kg->nd_item_group_id_0 = gid; \
+ kg->nd_item_group_range_0 = kernel_global_size; \
+ kg->nd_item_global_id_0 = gid; \
+ kg->nd_item_global_range_0 = kernel_global_size; \
+ auto kernel = [=]() {
+
+# define ccl_gpu_kernel_postfix \
+ }; \
+ kernel(); \
+ } \
+ }); \
+}
+#endif
#define ccl_gpu_kernel_call(x) ((ONEAPIKernelContext*)kg)->x
@@ -83,23 +112,40 @@ void oneapi_kernel_##name(KernelGlobalsGPU *ccl_restrict kg, \
} ccl_gpu_kernel_lambda_pass((ONEAPIKernelContext *)kg)
/* GPU thread, block, grid size and index */
-#define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
-#define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
-#define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
-#define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
-#define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
-#define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
-#define ccl_gpu_global_id_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_id(0))
-#define ccl_gpu_global_size_x() (sycl::ext::oneapi::experimental::this_nd_item<1>().get_global_range(0))
+#ifndef WITH_ONEAPI_SYCL_HOST_TASK
+# define ccl_gpu_thread_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_id(0))
+# define ccl_gpu_block_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_local_range(0))
+# define ccl_gpu_block_idx_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group(0))
+# define ccl_gpu_grid_dim_x (sycl::ext::oneapi::experimental::this_nd_item<1>().get_group_range(0))
+# define ccl_gpu_warp_size (sycl::ext::oneapi::experimental::this_sub_group().get_local_range()[0])
+# define ccl_gpu_thread_mask(thread_warp) uint(0xFFFFFFFF >> (ccl_gpu_warp_size - thread_warp))
+
+# define cc
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list