[Bf-blender-cvs] [9351ac0] master: Cycles: Skip compiling the dedicated SSE2 kernel on x86-64. SSE2 can be assumed there, so the regular kernel is re-used instead. This saves about 500 KB in the Blender binary.
Thomas Dinges
noreply at git.blender.org
Tue Jan 14 20:40:09 CET 2014
Commit: 9351ac0d8577a2c76c238bbf2c365d811e986209
Author: Thomas Dinges
Date: Tue Jan 14 20:39:21 2014 +0100
https://developer.blender.org/rB9351ac0d8577a2c76c238bbf2c365d811e986209
Cycles: Skip compiling the dedicated SSE2 kernel on x86-64. SSE2 can be assumed there, so the regular kernel is re-used instead. This saves about 500 KB in the Blender binary.
Reviewed by: brecht
Differential Revision: https://developer.blender.org/D199
===================================================================
M intern/cycles/CMakeLists.txt
M intern/cycles/SConscript
M intern/cycles/device/device_cpu.cpp
M intern/cycles/kernel/CMakeLists.txt
M intern/cycles/kernel/kernel.cpp
M intern/cycles/kernel/kernel.h
M intern/cycles/kernel/kernel_sse2.cpp
M intern/cycles/kernel/kernel_sse3.cpp
M intern/cycles/kernel/kernel_sse41.cpp
M intern/cycles/util/CMakeLists.txt
A intern/cycles/util/util_optimization.h
===================================================================
diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index fc193d9..6fa6260 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -8,10 +8,6 @@ include(cmake/external_libs.cmake)
# Build Flags
-if(WITH_RAYOPTIMIZATION AND SUPPORT_SSE_BUILD)
- set(WITH_CYCLES_OPTIMIZED_KERNEL ON)
-endif()
-
if(WIN32 AND MSVC)
# there is no /arch:SSE3, but intrinsics are available anyway
if(CMAKE_CL_64)
@@ -54,10 +50,6 @@ add_definitions(${BOOST_DEFINITIONS} ${OPENIMAGEIO_DEFINITIONS})
add_definitions(-DCCL_NAMESPACE_BEGIN=namespace\ ccl\ {)
add_definitions(-DCCL_NAMESPACE_END=})
-if(WITH_CYCLES_OPTIMIZED_KERNEL)
- add_definitions(-DWITH_OPTIMIZED_KERNEL)
-endif()
-
if(WITH_CYCLES_NETWORK)
add_definitions(-DWITH_NETWORK)
endif()
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 448375a..e31fb5b 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -75,41 +75,40 @@ if env['OURPLATFORM'] in ('win32-vc', 'win32-mingw', 'linuxcross', 'win64-vc', '
incs.append(env['BF_PTHREADS_INC'])
# optimized kernel
-if env['WITH_BF_RAYOPTIMIZATION']:
- sse2_cxxflags = Split(env['CXXFLAGS'])
- sse3_cxxflags = Split(env['CXXFLAGS'])
- sse41_cxxflags = Split(env['CXXFLAGS'])
-
- if env['OURPLATFORM'] == 'win32-vc':
- # there is no /arch:SSE3, but intrinsics are available anyway
- sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- elif env['OURPLATFORM'] == 'win64-vc':
- sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
- else:
- sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
- sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
- sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
-
- defs.append('WITH_OPTIMIZED_KERNEL')
- optim_defs = defs[:]
-
- if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
- cycles_sse41 = cycles.Clone()
- sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
- cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
- defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
-
- cycles_sse3 = cycles.Clone()
- sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
- cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
-
- cycles_sse2 = cycles.Clone()
- sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
- cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
+sse2_cxxflags = Split(env['CXXFLAGS'])
+sse3_cxxflags = Split(env['CXXFLAGS'])
+sse41_cxxflags = Split(env['CXXFLAGS'])
+
+if env['OURPLATFORM'] == 'win32-vc':
+ # there is no /arch:SSE3, but intrinsics are available anyway
+ sse2_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse3_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse41_cxxflags.append('/arch:SSE /arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+elif env['OURPLATFORM'] == 'win64-vc':
+ sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+ sse41_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /Ox /Gs-'.split())
+else:
+ sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+ sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -mfpmath=sse'.split())
+ sse41_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mssse3 -msse4.1 -mfpmath=sse'.split())
+
+defs.append('WITH_OPTIMIZED_KERNEL')
+optim_defs = defs[:]
+
+if env['WITH_CYCLES_OPTIMIZED_KERNEL_SSE41']:
+ cycles_sse41 = cycles.Clone()
+ sse41_sources = [path.join('kernel', 'kernel_sse41.cpp')]
+ cycles_sse41.BlenderLib('bf_intern_cycles_sse41', sse41_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse41_cxxflags)
+ defs.append('WITH_CYCLES_OPTIMIZED_KERNEL_SSE41')
+
+cycles_sse3 = cycles.Clone()
+sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
+
+cycles_sse2 = cycles.Clone()
+sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index ea632b7..b29d64e 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -166,7 +166,6 @@ public:
int start_sample = tile.start_sample;
int end_sample = tile.start_sample + tile.num_samples;
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int sample = start_sample; sample < end_sample; sample++) {
@@ -189,6 +188,7 @@ public:
}
else
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
@@ -208,7 +208,10 @@ public:
task.update_progress(tile);
}
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int sample = start_sample; sample < end_sample; sample++) {
if (task.get_cancel() || task_pool.canceled()) {
if(task.need_finish_queue == false)
@@ -267,7 +270,6 @@ public:
float sample_scale = 1.0f/(task.sample + 1);
if(task.rgba_half) {
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -276,14 +278,18 @@ public:
sample_scale, x, y, task.offset, task.stride);
}
else
-#endif
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse3_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse2_convert_to_half_float(&kernel_globals, (uchar4*)task.rgba_half, (float*)task.buffer,
@@ -299,7 +305,6 @@ public:
}
}
else {
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int y = task.y; y < task.y + task.h; y++)
@@ -309,13 +314,17 @@ public:
}
else
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse3_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
sample_scale, x, y, task.offset, task.stride);
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int y = task.y; y < task.y + task.h; y++)
for(int x = task.x; x < task.x + task.w; x++)
kernel_cpu_sse2_convert_to_byte(&kernel_globals, (uchar4*)task.rgba_byte, (float*)task.buffer,
@@ -340,7 +349,6 @@ public:
OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif
-#ifdef WITH_OPTIMIZED_KERNEL
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
@@ -352,6 +360,7 @@ public:
}
else
#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
@@ -360,7 +369,10 @@ public:
break;
}
}
- else if(system_cpu_support_sse2()) {
+ else
+#endif
+#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
+ if(system_cpu_support_sse2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 998d1a3..81499bb 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -192,10 +192,8 @@ endif()
include_directories(${INC})
include_directories(SYSTEM ${INC_SYS})
-if(WITH_CYCLES_OPTIMIZED_KERNEL)
- set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
- set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
-endif()
+set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list