[Bf-blender-cvs] [97ff37bf544] blender-v3.0-release: Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion
Brecht Van Lommel
noreply at git.blender.org
Fri Nov 5 22:05:36 CET 2021
Commit: 97ff37bf54474efbce39653a1387ad55091d4964
Author: Brecht Van Lommel
Date: Fri Nov 5 21:01:23 2021 +0100
Branches: blender-v3.0-release
https://developer.blender.org/rB97ff37bf54474efbce39653a1387ad55091d4964
Cycles: perform CPU film reading in the kernel, to use AVX2 half conversion
Adds a bunch of CPU kernel functions to process one row of pixels, and uses those
instead of calling unoptimized implementations.
Fixes T92598
===================================================================
M intern/cycles/device/cpu/device_impl.cpp
M intern/cycles/device/cpu/device_impl.h
M intern/cycles/device/cpu/kernel.cpp
M intern/cycles/device/cpu/kernel.h
M intern/cycles/device/device.cpp
M intern/cycles/device/device.h
M intern/cycles/integrator/pass_accessor_cpu.cpp
M intern/cycles/integrator/pass_accessor_cpu.h
M intern/cycles/integrator/path_trace_work_cpu.cpp
M intern/cycles/integrator/shader_eval.cpp
M intern/cycles/kernel/device/cpu/kernel.h
M intern/cycles/kernel/device/cpu/kernel_arch.h
M intern/cycles/kernel/device/cpu/kernel_arch_impl.h
===================================================================
diff --git a/intern/cycles/device/cpu/device_impl.cpp b/intern/cycles/device/cpu/device_impl.cpp
index d494b40f71d..68dec7f0af2 100644
--- a/intern/cycles/device/cpu/device_impl.cpp
+++ b/intern/cycles/device/cpu/device_impl.cpp
@@ -68,7 +68,8 @@ CPUDevice::CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_
{
/* Pick any kernel, all of them are supposed to have same level of microarchitecture
* optimization. */
- VLOG(1) << "Using " << kernels.integrator_init_from_camera.get_uarch_name() << " CPU kernels.";
+ VLOG(1) << "Using " << get_cpu_kernels().integrator_init_from_camera.get_uarch_name()
+ << " CPU kernels.";
if (info.cpu_threads == 0) {
info.cpu_threads = TaskScheduler::num_threads();
@@ -296,11 +297,6 @@ void CPUDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
Device::build_bvh(bvh, progress, refit);
}
-const CPUKernels *CPUDevice::get_cpu_kernels() const
-{
- return &kernels;
-}
-
void CPUDevice::get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals)
{
diff --git a/intern/cycles/device/cpu/device_impl.h b/intern/cycles/device/cpu/device_impl.h
index 553728ccc3b..90d217bb624 100644
--- a/intern/cycles/device/cpu/device_impl.h
+++ b/intern/cycles/device/cpu/device_impl.h
@@ -57,8 +57,6 @@ class CPUDevice : public Device {
RTCDevice embree_device;
#endif
- CPUKernels kernels;
-
CPUDevice(const DeviceInfo &info_, Stats &stats_, Profiler &profiler_);
~CPUDevice();
@@ -90,7 +88,6 @@ class CPUDevice : public Device {
void build_bvh(BVH *bvh, Progress &progress, bool refit) override;
- virtual const CPUKernels *get_cpu_kernels() const override;
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> &kernel_thread_globals) override;
virtual void *get_cpu_osl_memory() override;
diff --git a/intern/cycles/device/cpu/kernel.cpp b/intern/cycles/device/cpu/kernel.cpp
index 3b253c094fd..91c472d41e8 100644
--- a/intern/cycles/device/cpu/kernel.cpp
+++ b/intern/cycles/device/cpu/kernel.cpp
@@ -26,6 +26,9 @@ CCL_NAMESPACE_BEGIN
KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
#define REGISTER_KERNEL(name) name(KERNEL_FUNCTIONS(name))
+#define REGISTER_KERNEL_FILM_CONVERT(name) \
+ film_convert_##name(KERNEL_FUNCTIONS(film_convert_##name)), \
+ film_convert_half_rgba_##name(KERNEL_FUNCTIONS(film_convert_half_rgba_##name))
CPUKernels::CPUKernels()
: /* Integrator. */
@@ -50,11 +53,25 @@ CPUKernels::CPUKernels()
REGISTER_KERNEL(adaptive_sampling_filter_x),
REGISTER_KERNEL(adaptive_sampling_filter_y),
/* Cryptomatte. */
- REGISTER_KERNEL(cryptomatte_postprocess)
+ REGISTER_KERNEL(cryptomatte_postprocess),
+ /* Film Convert. */
+ REGISTER_KERNEL_FILM_CONVERT(depth),
+ REGISTER_KERNEL_FILM_CONVERT(mist),
+ REGISTER_KERNEL_FILM_CONVERT(sample_count),
+ REGISTER_KERNEL_FILM_CONVERT(float),
+ REGISTER_KERNEL_FILM_CONVERT(light_path),
+ REGISTER_KERNEL_FILM_CONVERT(float3),
+ REGISTER_KERNEL_FILM_CONVERT(motion),
+ REGISTER_KERNEL_FILM_CONVERT(cryptomatte),
+ REGISTER_KERNEL_FILM_CONVERT(shadow_catcher),
+ REGISTER_KERNEL_FILM_CONVERT(shadow_catcher_matte_with_shadow),
+ REGISTER_KERNEL_FILM_CONVERT(combined),
+ REGISTER_KERNEL_FILM_CONVERT(float4)
{
}
#undef REGISTER_KERNEL
+#undef REGISTER_KERNEL_FILM_CONVERT
#undef KERNEL_FUNCTIONS
CCL_NAMESPACE_END
diff --git a/intern/cycles/device/cpu/kernel.h b/intern/cycles/device/cpu/kernel.h
index 5beeaf148a1..406bd07ab3d 100644
--- a/intern/cycles/device/cpu/kernel.h
+++ b/intern/cycles/device/cpu/kernel.h
@@ -17,11 +17,13 @@
#pragma once
#include "device/cpu/kernel_function.h"
+#include "util/half.h"
#include "util/types.h"
CCL_NAMESPACE_BEGIN
struct KernelGlobalsCPU;
+struct KernelFilmConvert;
struct IntegratorStateCPU;
struct TileInfo;
@@ -102,6 +104,41 @@ class CPUKernels {
CryptomattePostprocessFunction cryptomatte_postprocess;
+ /* Film Convert. */
+ using FilmConvertFunction = CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+ const float *buffer,
+ float *pixel,
+ const int width,
+ const int buffer_stride,
+ const int pixel_stride)>;
+ using FilmConvertHalfRGBAFunction =
+ CPUKernelFunction<void (*)(const KernelFilmConvert *kfilm_convert,
+ const float *buffer,
+ half4 *pixel,
+ const int width,
+ const int buffer_stride)>;
+
+#define KERNEL_FILM_CONVERT_FUNCTION(name) \
+ FilmConvertFunction film_convert_##name; \
+ FilmConvertHalfRGBAFunction film_convert_half_rgba_##name;
+
+ KERNEL_FILM_CONVERT_FUNCTION(depth)
+ KERNEL_FILM_CONVERT_FUNCTION(mist)
+ KERNEL_FILM_CONVERT_FUNCTION(sample_count)
+ KERNEL_FILM_CONVERT_FUNCTION(float)
+
+ KERNEL_FILM_CONVERT_FUNCTION(light_path)
+ KERNEL_FILM_CONVERT_FUNCTION(float3)
+
+ KERNEL_FILM_CONVERT_FUNCTION(motion)
+ KERNEL_FILM_CONVERT_FUNCTION(cryptomatte)
+ KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher)
+ KERNEL_FILM_CONVERT_FUNCTION(shadow_catcher_matte_with_shadow)
+ KERNEL_FILM_CONVERT_FUNCTION(combined)
+ KERNEL_FILM_CONVERT_FUNCTION(float4)
+
+#undef KERNEL_FILM_CONVERT_FUNCTION
+
CPUKernels();
};
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 69e959b6f7b..63d0a49d3eb 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -23,6 +23,7 @@
#include "device/queue.h"
#include "device/cpu/device.h"
+#include "device/cpu/kernel.h"
#include "device/cuda/device.h"
#include "device/dummy/device.h"
#include "device/hip/device.h"
@@ -363,10 +364,11 @@ unique_ptr<DeviceQueue> Device::gpu_queue_create()
return nullptr;
}
-const CPUKernels *Device::get_cpu_kernels() const
+const CPUKernels &Device::get_cpu_kernels()
{
- LOG(FATAL) << "Device does not support CPU kernels.";
- return nullptr;
+ /* Initialize CPU kernels once and reuse. */
+ static CPUKernels kernels;
+ return kernels;
}
void Device::get_cpu_kernel_thread_globals(
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index 3cb177adde7..65188459c2c 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -180,7 +180,7 @@ class Device {
* These may not be used on GPU or multi-devices. */
/* Get CPU kernel functions for native instruction set. */
- virtual const CPUKernels *get_cpu_kernels() const;
+ static const CPUKernels &get_cpu_kernels();
/* Get kernel globals to pass to kernels. */
virtual void get_cpu_kernel_thread_globals(
vector<CPUKernelThreadGlobals> & /*kernel_thread_globals*/);
diff --git a/intern/cycles/integrator/pass_accessor_cpu.cpp b/intern/cycles/integrator/pass_accessor_cpu.cpp
index 820da757be0..77ca332d142 100644
--- a/intern/cycles/integrator/pass_accessor_cpu.cpp
+++ b/intern/cycles/integrator/pass_accessor_cpu.cpp
@@ -14,9 +14,12 @@
* limitations under the License.
*/
+#include "device/device.h"
+
#include "integrator/pass_accessor_cpu.h"
#include "session/buffers.h"
+
#include "util/log.h"
#include "util/tbb.h"
@@ -33,70 +36,16 @@ CCL_NAMESPACE_BEGIN
* Kernel processing.
*/
-template<typename Processor>
-inline void PassAccessorCPU::run_get_pass_kernel_processor(const RenderBuffers *render_buffers,
- const BufferParams &buffer_params,
- const Destination &destination,
- const Processor &processor) const
-{
- KernelFilmConvert kfilm_convert;
- init_kernel_film_convert(&kfilm_convert, buffer_params, destination);
-
- if (destination.pixels) {
- /* NOTE: No overlays are applied since they are not used for final renders.
- * Can be supported via some sort of specialization to avoid code duplication. */
-
- run_get_pass_kernel_processor_float(
- &kfilm_convert, render_buffers, buffer_params, destination, processor);
- }
-
- if (destination.pixels_half_rgba) {
- /* TODO(sergey): Consider adding specialization to avoid per-pixel overlay check. */
-
- if (destination.num_components == 1) {
- run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
- render_buffers,
- buffer_params,
- destination,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
- float pixel;
- processor(kfilm_convert, buffer, &pixel);
-
- pixel_rgba[0] = pixel;
- pixel_rgba[1] = pixel;
- pixel_rgba[2] = pixel;
- pixel_rgba[3] = 1.0f;
- });
- }
- else if (destination.num_components == 3) {
- run_get_pass_kernel_processor_half_rgba(&kfilm_convert,
- render_buffers,
- buffer_params,
- destination,
- [&processor](const KernelFilmConvert *kfilm_convert,
- ccl_global const float *buffer,
- float *pixel_rgba) {
-
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list