[Bf-blender-cvs] [9351ac0] master: Cycles: Skip compiling the dedicated SSE2 kernel on x86-64. SSE2 support can be assumed on that architecture, so the regular kernel is re-used instead. This saves about 500 KB in the Blender binary.

Thomas Dinges noreply at git.blender.org
Tue Jan 14 20:40:09 CET 2014


Commit: 9351ac0d8577a2c76c238bbf2c365d811e986209
Author: Thomas Dinges
Date:   Tue Jan 14 20:39:21 2014 +0100
https://developer.blender.org/rB9351ac0d8577a2c76c238bbf2c365d811e986209

Cycles: Skip compiling the dedicated SSE2 kernel on x86-64. SSE2 support can be assumed on that architecture, so the regular kernel is re-used instead. This saves about 500 KB in the Blender binary.

Reviewed by: brecht
Differential Revision: https://developer.blender.org/D199

===================================================================

M	intern/cycles/CMakeLists.txt
M	intern/cycles/SConscript
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/kernel.cpp
M	intern/cycles/kernel/kernel.h
M	intern/cycles/kernel/kernel_sse2.cpp
M	intern/cycles/kernel/kernel_sse3.cpp
M	intern/cycles/kernel/kernel_sse41.cpp
M	intern/cycles/util/CMakeLists.txt
A	intern/cycles/util/util_optimization.h

===================================================================

diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index fc193d9..6fa6260 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -8,10 +8,6 @@ include(cmake/external_libs.cmake)
 
 # Build Flags
 
-if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
-	set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
-endif()
-
 if(WIN32 AND MSVC)
 	# there is no /arch:SSE3, but intrinsics are available anyway
 	if(CMAKE_CL_64)
@@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
 add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
 add_definitions(-DCCL_NAMESPACE_END=})
 
-if(WITH_CYCLES_OPTIMIZED_KERNEL)
-	add_definitions(-DWITH_OPTIMIZED_KERNEL)
-endif()
-
 if(WITH_CYCLES_NETWORK)
 	add_definitions(-DWITH_NETWORK)
 endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 448375a..e31fb5b 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
     incs.append(env['BF_PTHREADS_INC'])
 
 # optimized kernel
-if env['WITH_BF_RAYOPTIMIZATION']:
-    sse2_cxxflags = Split(env['CXXFLAGS'])
-    sse3_cxxflags = Split(env['CXXFLAGS'])
-    sse41_cxxflags = Split(env['CXXFLAGS'])
-
-    if env['OURPLATFORM'] == 'win32-vc':
-        # there is no /arch:SSE3, but intrinsics are available anyway
-        sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-    elif env['OURPLATFORM'] == 'win64-vc':
-        sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-        sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
-    else:
-        sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
-        sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
-        sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
-    
-    defs.append('WITH_OPTIMIZED_KERNEL')
-    optim_defs = defs[:]
-
-    if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
-        cycles_sse41 = cycles.Clone()
-        sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
-        cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
-        defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
-
-    cycles_sse3 = cycles.Clone()
-    sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
-    cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
-
-    cycles_sse2 = cycles.Clone()
-    sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
-    cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
+sse2_cxxflags = Split(env['CXXFLAGS'])
+sse3_cxxflags = Split(env['CXXFLAGS'])
+sse41_cxxflags = Split(env['CXXFLAGS'])
+
+if env['OURPLATFORM'] == 'win32-vc':
+    # there is no /arch:SSE3, but intrinsics are available anyway
+    sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+elif env['OURPLATFORM'] == 'win64-vc':
+    sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+    sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+else:
+    sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+    sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
+    sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
+
+defs.append('WITH_OPTIMIZED_KERNEL')
+optim_defs = defs[:]
+
+if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
+    cycles_sse41 = cycles.Clone()
+    sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
+    cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
+    defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
+
+cycles_sse3 = cycles.Clone()
+sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+
+cycles_sse2 = cycles.Clone()
+sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
 
 cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
 
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index ea632b7..b29d64e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -166,7 +166,6 @@ public:
 			int start_sample = tile.start_sample;
 			int end_sample = tile.start_sample + tile.num_samples;
 
-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
@@ -189,6 +188,7 @@ public:
 			}
 			else
 #endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 			if(system_cpu_support_sse3()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.canceled()) {
@@ -208,7 +208,10 @@ public:
 					task.update_progress(tile);
 				}
 			}
-			else if(system_cpu_support_sse2()) {
+			else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+			if(system_cpu_support_sse2()) {
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.canceled()) {
 						if(task.need_finish_queue == false)
@@ -267,7 +270,6 @@ public:
 		float sample_scale = 1.0f/(task.sample + 1);
 
 		if(task.rgba_half) {
-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int y = task.y; y < task.y + task.h; y++)
@@ -276,14 +278,18 @@ public:
 							sample_scale, x, y, task.offset, task.stride);
 			}
 			else
-#endif				
+#endif		
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3		
 			if(system_cpu_support_sse3()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
 							sample_scale, x, y, task.offset, task.stride);
 			}
-			else if(system_cpu_support_sse2()) {
+			else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+			if(system_cpu_support_sse2()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
@@ -299,7 +305,6 @@ public:
 			}
 		}
 		else {
-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 			if(system_cpu_support_sse41()) {
 				for(int y = task.y; y < task.y + task.h; y++)
@@ -309,13 +314,17 @@ public:
 			}
 			else
 #endif			
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 			if(system_cpu_support_sse3()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
 							sample_scale, x, y, task.offset, task.stride);
 			}
-			else if(system_cpu_support_sse2()) {
+			else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+			if(system_cpu_support_sse2()) {
 				for(int y = task.y; y < task.y + task.h; y++)
 					for(int x = task.x; x < task.x + task.w; x++)
 						kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
@@ -340,7 +349,6 @@ public:
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
 
-#ifdef WITH_OPTIMIZED_KERNEL
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41			
 		if(system_cpu_support_sse41()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
@@ -352,6 +360,7 @@ public:
 		}
 		else
 #endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 		if(system_cpu_support_sse3()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
@@ -360,7 +369,10 @@ public:
 					break;
 			}
 		}
-		else if(system_cpu_support_sse2()) {
+		else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+		if(system_cpu_support_sse2()) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
 				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 998d1a3..81499bb 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -192,10 +192,8 @@ endif()
 include_directories(${INC})
 include_directories(SYSTEM ${INC_SYS})
 
-if(WITH_CYCLES_OPTIMIZED_KERNEL)
-	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
-	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-endif()
+set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS 

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list