[Bf-blender-cvs] [ff77a1d] soc-2016-cycles_denoising: Cycles: Store CPU kernel function pointers in objects to clean up the code

Lukas Stockner noreply at git.blender.org
Tue Nov 22 04:25:12 CET 2016


Commit: ff77a1de08b5f5e179af30a6952c2dc2755338d9
Author: Lukas Stockner
Date:   Mon Nov 14 11:11:05 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rBff77a1de08b5f5e179af30a6952c2dc2755338d9

Cycles: Store CPU kernel function pointers in objects to clean up the code

With the number of kernel functions that's currently needed, it was just getting too messy.
This commit is based on D1825.

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/kernel.h
M	intern/cycles/kernel/kernels/cpu/kernel_avx.cpp
M	intern/cycles/kernel/kernels/cpu/kernel_avx2.cpp
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cpu/kernel_sse2.cpp
M	intern/cycles/kernel/kernels/cpu/kernel_sse3.cpp
M	intern/cycles/kernel/kernels/cpu/kernel_sse41.cpp

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index ec78cd0..3057d66 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -48,65 +48,134 @@
 
 CCL_NAMESPACE_BEGIN
 
-class CPUDevice : public Device
-{
-public:
-	TaskPool task_pool;
-	KernelGlobals kernel_globals;
+/* Has to be outside of the class to be shared across template instantiations. */
+static bool logged_architecture = false;
 
-#ifdef WITH_OSL
-	OSLGlobals osl_globals;
-#endif
-	
-	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
-	: Device(info, stats, background)
+template<typename F>
+class KernelFunctions {
+public:
+	KernelFunctions(F kernel_default,
+	                F kernel_sse2,
+	                F kernel_sse3,
+	                F kernel_sse41,
+	                F kernel_avx,
+	                F kernel_avx2)
 	{
-#ifdef WITH_OSL
-		kernel_globals.osl = &osl_globals;
-#endif
-
-		/* do now to avoid thread issues */
-		system_cpu_support_sse2();
-		system_cpu_support_sse3();
-		system_cpu_support_sse41();
-		system_cpu_support_avx();
-		system_cpu_support_avx2();
-
+		string architecture_name = "default";
+		kernel = kernel_default;
+
+		/* Silence potential warnings about unused variables
+		 * when compiling without some architectures. */
+		(void)kernel_sse2;
+		(void)kernel_sse3;
+		(void)kernel_sse41;
+		(void)kernel_avx;
+		(void)kernel_avx2;
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2()) {
-			VLOG(1) << "Will be using AVX2 kernels.";
+			architecture_name = "AVX2";
+			kernel = kernel_avx2;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 		if(system_cpu_support_avx()) {
-			VLOG(1) << "Will be using AVX kernels.";
+			architecture_name = "AVX";
+			kernel = kernel_avx;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 		if(system_cpu_support_sse41()) {
-			VLOG(1) << "Will be using SSE4.1 kernels.";
+			architecture_name = "SSE4.1";
+			kernel = kernel_sse41;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
-			VLOG(1) << "Will be using SSE3kernels.";
+			architecture_name = "SSE3";
+			kernel = kernel_sse3;
 		}
 		else
 #endif
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 		if(system_cpu_support_sse2()) {
-			VLOG(1) << "Will be using SSE2 kernels.";
+			architecture_name = "SSE2";
+			kernel = kernel_sse2;
 		}
-		else
 #endif
-		{
-			VLOG(1) << "Will be using regular kernels.";
+
+		if(!logged_architecture) {
+			VLOG(1) << "Will be using " << architecture_name << " kernels.";
+			logged_architecture = true;
 		}
 	}
 
+	inline F operator()() const {
+		return kernel;
+	}
+protected:
+	F kernel;
+};
+
+class CPUDevice : public Device
+{
+public:
+	TaskPool task_pool;
+	KernelGlobals kernel_globals;
+
+#ifdef WITH_OSL
+	OSLGlobals osl_globals;
+#endif
+
+	KernelFunctions<void(*)(KernelGlobals *, float *, unsigned int *, int, int, int, int, int)>   path_trace_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_half_float_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_byte_kernel;
+	KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*)> filter_divide_shadow_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*)>       filter_get_feature_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int, float, float)>                               filter_non_local_means_kernel;
+	KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                                  filter_combine_halves_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_construct_transform_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, void*, int*)>                                      filter_estimate_wlr_params_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*)>              filter_final_pass_wlr_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, int, float*, int, int, int, int, float*, void*, int*, int*)>              filter_final_pass_nlm_kernel;
+	KernelFunctions<void(*)(int, int, float**, float**, float**, float**, int*, int, int, float, float)>              filter_non_local_means_3_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, float*, int, int, int, int, float, float*, int*)>                         filter_old_1_kernel;
+	KernelFunctions<void(*)(KernelGlobals*, float*, float*, int, int, int, int, int, int, float, float*, int*, int*)> filter_old_2_kernel;
+
+#define KERNEL_FUNCTIONS(name) \
+	      KERNEL_NAME_EVAL(cpu, name), \
+	      KERNEL_NAME_EVAL(cpu_sse2, name), \
+	      KERNEL_NAME_EVAL(cpu_sse3, name), \
+	      KERNEL_NAME_EVAL(cpu_sse41, name), \
+	      KERNEL_NAME_EVAL(cpu_avx, name), \
+	      KERNEL_NAME_EVAL(cpu_avx2, name)
+
+	CPUDevice(DeviceInfo& info, Stats &stats, bool background)
+	: Device(info, stats, background),
+	  path_trace_kernel(KERNEL_FUNCTIONS(path_trace)),
+	  convert_to_half_float_kernel(KERNEL_FUNCTIONS(convert_to_half_float)),
+	  convert_to_byte_kernel(KERNEL_FUNCTIONS(convert_to_byte)),
+	  shader_kernel(KERNEL_FUNCTIONS(shader)),
+	  filter_divide_shadow_kernel(KERNEL_FUNCTIONS(filter_divide_shadow)),
+	  filter_get_feature_kernel(KERNEL_FUNCTIONS(filter_get_feature)),
+	  filter_non_local_means_kernel(KERNEL_FUNCTIONS(filter_non_local_means)),
+	  filter_combine_halves_kernel(KERNEL_FUNCTIONS(filter_combine_halves)),
+	  filter_construct_transform_kernel(KERNEL_FUNCTIONS(filter_construct_transform)),
+	  filter_estimate_wlr_params_kernel(KERNEL_FUNCTIONS(filter_estimate_wlr_params)),
+	  filter_final_pass_wlr_kernel(KERNEL_FUNCTIONS(filter_final_pass_wlr)),
+	  filter_final_pass_nlm_kernel(KERNEL_FUNCTIONS(filter_final_pass_nlm)),
+	  filter_non_local_means_3_kernel(KERNEL_FUNCTIONS(filter_non_local_means_3)),
+	  filter_old_1_kernel(KERNEL_FUNCTIONS(filter_old_1)),
+	  filter_old_2_kernel(KERNEL_FUNCTIONS(filter_old_2))
+	{
+#ifdef WITH_OSL
+		kernel_globals.osl = &osl_globals;
+#endif
+	}
+
 	~CPUDevice()
 	{
 		task_pool.stop();
@@ -210,63 +279,6 @@ public:
 
 	float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides)
 	{
-		void(*filter_divide_shadow)(KernelGlobals*, int, float**, int, int, int*, int*, int*, int*, float*, float*, float*, float*, int*);
-		void(*filter_get_feature)(KernelGlobals*, int, float**, int, int, int, int, int*, int*, int*, int*, float*, float*, int*);
-		void(*filter_non_local_means)(int, int, float*, float*, float*, float*, int*, int, int, float, float);
-		void(*filter_combine_halves)(int, int, float*, float*, float*, float*, int*, int);
-
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
-		if(system_cpu_support_avx2()) {
-			filter_divide_shadow = kernel_cpu_avx2_filter_divide_shadow;
-			filter_get_feature = kernel_cpu_avx2_filter_get_feature;
-			filter_non_local_means = kernel_cpu_avx2_filter_non_local_means;
-			filter_combine_halves = kernel_cpu_avx2_filter_combine_halves;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
-		if(system_cpu_support_avx()) {
-			filter_divide_shadow = kernel_cpu_avx_filter_divide_shadow;
-			filter_get_feature = kernel_cpu_avx_filter_get_feature;
-			filter_non_local_means = kernel_cpu_avx_filter_non_local_means;
-			filter_combine_halves = kernel_cpu_avx_filter_combine_halves;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
-		if(system_cpu_support_sse41()) {
-			filter_divide_shadow = kernel_cpu_sse41_filter_divide_shadow;
-			filter_get_feature = kernel_cpu_sse41_filter_get_feature;
-			filter_non_local_means = kernel_cpu_sse41_filter_non_local_means;
-			filter_combine_halves = kernel_cpu_sse41_filter_combine_halves;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
-		if(system_cpu_support_sse3()) {
-			filter_divide_shadow = kernel_cpu_sse3_filter_divide_shadow;
-			filter_get_feature = kernel_cpu_sse3_filter_get_feature;
-			filter_non_local_means = kernel_cpu_sse3_filter_non_local_means;
-			filter_combine_halves = kernel_cpu_sse3_filter_combine_halves;
-		}
-		else
-#endif
-#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
-		if(system_cpu_support_sse2()) {
-			filter_divide_shadow = kernel_cpu_sse2_filter_divide_shadow;
-			filter_get_feature = kernel_cpu_sse2_filter_get_feature;
-			filter_non_local_means = kernel_cpu_sse2_filter_non_local_means;
-			filter_combine_halves = kernel_cpu_sse2_filter_combine_halves;
-		}
-		else
-#endif
-		{
-			filter_divide_shadow = kernel_cpu_filter_divide_shadow;
-			filter_get_feature = kernel_cpu_filter_get_feature;
-			filter_non_local_means = kernel_cpu_filter_non_local_means;
-			filter_combine_halves = kernel_cpu_filter_combine_halves;
-		}
-
 		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
 		int pass_stride = w*h*frames;
 		float *filter_buffers = new float[22*pass_stride];
@@ -298,12 +310,12 @@ public:
 				for(int i = 0; i < 7; i++) {
 					for(int y = rect.y; y < rect.w; y++) {
 						for(int x = rect.x; x < rect.z; x++) {
-							filter_get_feature(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
+							filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
 						}
 					}
 					for(int y = rect.y; y < rect.w; y++) {
 						for(int x = rect.x; x < rect.z; x++) {
-							filter_non_local_means(x, y, unfiltered, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, filter_buffer + offset_to[i]*pass_stride, &rect.x, 2, 2, 1, 0.25f);
+		

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list