[Bf-blender-cvs] SVN commit: /data/svn/bf-blender [54299] trunk/blender/intern/cycles: Fix cycles intersection issue with overlapping faces on windows 32 bit and CPU

Brecht Van Lommel brechtvanlommel at pandora.be
Mon Feb 4 17:12:38 CET 2013


Revision: 54299
          http://projects.blender.org/scm/viewvc.php?view=rev&root=bf-blender&revision=54299
Author:   blendix
Date:     2013-02-04 16:12:37 +0000 (Mon, 04 Feb 2013)
Log Message:
-----------
Fix Cycles intersection issue with overlapping faces on Windows 32-bit and CPU
without SSE3 support, due to an 80-bit precision float register being used for
one bounding box but not for the one next to it.

Modified Paths:
--------------
    trunk/blender/intern/cycles/CMakeLists.txt
    trunk/blender/intern/cycles/SConscript
    trunk/blender/intern/cycles/device/device_cpu.cpp
    trunk/blender/intern/cycles/kernel/CMakeLists.txt
    trunk/blender/intern/cycles/kernel/kernel.h
    trunk/blender/intern/cycles/kernel/kernel_bvh.h
    trunk/blender/intern/cycles/util/util_system.cpp
    trunk/blender/intern/cycles/util/util_system.h

Added Paths:
-----------
    trunk/blender/intern/cycles/kernel/kernel_sse2.cpp
    trunk/blender/intern/cycles/kernel/kernel_sse3.cpp

Removed Paths:
-------------
    trunk/blender/intern/cycles/kernel/kernel_optimized.cpp

Modified: trunk/blender/intern/cycles/CMakeLists.txt
===================================================================
--- trunk/blender/intern/cycles/CMakeLists.txt	2013-02-04 16:12:27 UTC (rev 54298)
+++ trunk/blender/intern/cycles/CMakeLists.txt	2013-02-04 16:12:37 UTC (rev 54299)
@@ -13,10 +13,12 @@
 endif()
 
 if(WIN32 AND MSVC)
-	set(CYCLES_OPTIMIZED_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
+	set(CYCLES_SSE2_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
+	set(CYCLES_SSE3_KERNEL_FLAGS "/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc")
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast -D_CRT_SECURE_NO_WARNINGS /EHsc")
 elseif(CMAKE_COMPILER_IS_GNUCC)
-	set(CYCLES_OPTIMIZED_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse")
+	set(CYCLES_SSE2_KERNEL_FLAGS "-ffast-math -msse -msse2 -mfpmath=sse")
+	set(CYCLES_SSE3_KERNEL_FLAGS "-ffast-math -msse -msse2 -msse3 -mfpmath=sse")
 	set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
 endif()
 

Modified: trunk/blender/intern/cycles/SConscript
===================================================================
--- trunk/blender/intern/cycles/SConscript	2013-02-04 16:12:27 UTC (rev 54298)
+++ trunk/blender/intern/cycles/SConscript	2013-02-04 16:12:37 UTC (rev 54299)
@@ -36,7 +36,8 @@
 
 sources.remove(path.join('util', 'util_view.cpp'))
 sources.remove(path.join('render', 'film_response.cpp'))
-sources.remove(path.join('kernel', 'kernel_optimized.cpp'))
+sources.remove(path.join('kernel', 'kernel_sse2.cpp'))
+sources.remove(path.join('kernel', 'kernel_sse3.cpp'))
 
 incs = [] 
 defs = []
@@ -73,22 +74,30 @@
 
 # optimized kernel
 if env['WITH_BF_RAYOPTIMIZATION']:
-    optim_cxxflags = Split(env['CXXFLAGS'])
+    sse2_cxxflags = Split(env['CXXFLAGS'])
+    sse3_cxxflags = Split(env['CXXFLAGS'])
 
     if env['OURPLATFORM'] == 'win32-vc':
-        optim_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse2_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse3_cxxflags.append('/arch:SSE2 -D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
     elif env['OURPLATFORM'] == 'win64-vc':
-        optim_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse2_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
+        sse3_cxxflags.append('-D_CRT_SECURE_NO_WARNINGS /fp:fast /EHsc'.split())
     else:
-        optim_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split())
+        sse2_cxxflags.append('-ffast-math -msse -msse2 -mfpmath=sse'.split())
+        sse3_cxxflags.append('-ffast-math -msse -msse2 -msse3 -mfpmath=sse'.split())
     
     defs.append('WITH_OPTIMIZED_KERNEL')
     optim_defs = defs[:]
-    optim_sources = [path.join('kernel', 'kernel_optimized.cpp')]
 
-    cycles_optim = cycles.Clone()
-    cycles_optim.BlenderLib('bf_intern_cycles_optimized', optim_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=optim_cxxflags)
+    cycles_sse3 = cycles.Clone()
+    sse3_sources = [path.join('kernel', 'kernel_sse3.cpp')]
+    cycles_sse3.BlenderLib('bf_intern_cycles_sse3', sse3_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse3_cxxflags)
 
+    cycles_sse2 = cycles.Clone()
+    sse2_sources = [path.join('kernel', 'kernel_sse2.cpp')]
+    cycles_sse2.BlenderLib('bf_intern_cycles_sse2', sse2_sources, incs, optim_defs, libtype=['intern'], priority=[10], cxx_compileflags=sse2_cxxflags)
+
 cycles.BlenderLib('bf_intern_cycles', sources, incs, defs, libtype=['intern'], priority=[0], cxx_compileflags=cxxflags)
 
 if env['WITH_BF_CYCLES_OSL']:

Modified: trunk/blender/intern/cycles/device/device_cpu.cpp
===================================================================
--- trunk/blender/intern/cycles/device/device_cpu.cpp	2013-02-04 16:12:27 UTC (rev 54298)
+++ trunk/blender/intern/cycles/device/device_cpu.cpp	2013-02-04 16:12:37 UTC (rev 54299)
@@ -58,7 +58,8 @@
 #endif
 
 		/* do now to avoid thread issues */
-		system_cpu_support_optimized();
+		system_cpu_support_sse2();
+		system_cpu_support_sse3();
 	}
 
 	~CPUDevice()
@@ -170,7 +171,7 @@
 			int end_sample = tile.start_sample + tile.num_samples;
 
 #ifdef WITH_OPTIMIZED_KERNEL
-			if(system_cpu_support_optimized()) {
+			if(system_cpu_support_sse3()) { /* SSE3 implies SSE2: test the wider set first */
 				for(int sample = start_sample; sample < end_sample; sample++) {
 					if (task.get_cancel() || task_pool.cancelled()) {
 						if(task.need_finish_queue == false)
@@ -179,7 +180,7 @@
 
 					for(int y = tile.y; y < tile.y + tile.h; y++) {
 						for(int x = tile.x; x < tile.x + tile.w; x++) {
-							kernel_cpu_optimized_path_trace(&kg, render_buffer, rng_state,
+							kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
 								sample, x, y, tile.offset, tile.stride);
 						}
 					}
@@ -189,6 +190,25 @@
 					task.update_progress(tile);
 				}
 			}
+			else if(system_cpu_support_sse2()) {
+				for(int sample = start_sample; sample < end_sample; sample++) {
+					if (task.get_cancel() || task_pool.cancelled()) {
+						if(task.need_finish_queue == false)
+							break;
+					}
+
+					for(int y = tile.y; y < tile.y + tile.h; y++) {
+						for(int x = tile.x; x < tile.x + tile.w; x++) {
+							kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
+								sample, x, y, tile.offset, tile.stride);
+						}
+					}
+
+					tile.sample = sample + 1;
+
+					task.update_progress(tile);
+				}
+			}
 			else
 #endif
 			{
@@ -227,12 +247,18 @@
 	void thread_tonemap(DeviceTask& task)
 	{
 #ifdef WITH_OPTIMIZED_KERNEL
-		if(system_cpu_support_optimized()) {
+		if(system_cpu_support_sse3()) { /* SSE3 implies SSE2: test the wider set first */
 			for(int y = task.y; y < task.y + task.h; y++)
 				for(int x = task.x; x < task.x + task.w; x++)
-					kernel_cpu_optimized_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
+					kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
 						task.sample, task.resolution, x, y, task.offset, task.stride);
 		}
+		else if(system_cpu_support_sse2()) {
+			for(int y = task.y; y < task.y + task.h; y++)
+				for(int x = task.x; x < task.x + task.w; x++)
+					kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
+						task.sample, task.resolution, x, y, task.offset, task.stride);
+		}
 		else
 #endif
 		{
@@ -252,14 +278,22 @@
 #endif
 
 #ifdef WITH_OPTIMIZED_KERNEL
-		if(system_cpu_support_optimized()) {
+		if(system_cpu_support_sse3()) { /* SSE3 implies SSE2: test the wider set first */
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
-				kernel_cpu_optimized_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
 
 				if(task_pool.cancelled())
 					break;
 			}
 		}
+		else if(system_cpu_support_sse2()) {
+			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
+				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+
+				if(task_pool.cancelled())
+					break;
+			}
+		}
 		else
 #endif
 		{

Modified: trunk/blender/intern/cycles/kernel/CMakeLists.txt
===================================================================
--- trunk/blender/intern/cycles/kernel/CMakeLists.txt	2013-02-04 16:12:27 UTC (rev 54298)
+++ trunk/blender/intern/cycles/kernel/CMakeLists.txt	2013-02-04 16:12:37 UTC (rev 54299)
@@ -12,7 +12,8 @@
 
 set(SRC
 	kernel.cpp
-	kernel_optimized.cpp
+	kernel_sse2.cpp
+	kernel_sse3.cpp
 	kernel.cl
 	kernel.cu
 )
@@ -149,7 +150,8 @@
 add_library(cycles_kernel ${SRC} ${SRC_HEADERS} ${SRC_CLOSURE_HEADERS} ${SRC_SVM_HEADERS})
 
 if(WITH_CYCLES_OPTIMIZED_KERNEL)
-	set_source_files_properties(kernel_optimized.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_OPTIMIZED_KERNEL_FLAGS}")
+	set_source_files_properties(kernel_sse2.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE2_KERNEL_FLAGS}")
+	set_source_files_properties(kernel_sse3.cpp PROPERTIES COMPILE_FLAGS "${CYCLES_SSE3_KERNEL_FLAGS}")
 endif()
 
 if(WITH_CYCLES_CUDA)

Modified: trunk/blender/intern/cycles/kernel/kernel.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel.h	2013-02-04 16:12:27 UTC (rev 54298)
+++ trunk/blender/intern/cycles/kernel/kernel.h	2013-02-04 16:12:37 UTC (rev 54299)
@@ -44,12 +44,19 @@
 	int type, int i);
 
 #ifdef WITH_OPTIMIZED_KERNEL
-void kernel_cpu_optimized_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
 	int sample, int x, int y, int offset, int stride);
-void kernel_cpu_optimized_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+void kernel_cpu_sse2_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	int sample, int resolution, int x, int y, int offset, int stride);
-void kernel_cpu_optimized_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
 	int type, int i);
+
+void kernel_cpu_sse3_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
+	int sample, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_tonemap(KernelGlobals *kg, uchar4 *rgba, float *buffer,
+	int sample, int resolution, int x, int y, int offset, int stride);
+void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
+	int type, int i);
 #endif
 
 CCL_NAMESPACE_END

Modified: trunk/blender/intern/cycles/kernel/kernel_bvh.h
===================================================================
--- trunk/blender/intern/cycles/kernel/kernel_bvh.h	2013-02-04 16:12:27 UTC (rev 54298)
+++ trunk/blender/intern/cycles/kernel/kernel_bvh.h	2013-02-04 16:12:37 UTC (rev 54299)
@@ -126,21 +126,21 @@
 
 	/* intersect ray against child nodes */
 	float3 ood = P * idir;
-	float c0lox = n0xy.x * idir.x - ood.x;
-	float c0hix = n0xy.y * idir.x - ood.x;
-	float c0loy = n0xy.z * idir.y - ood.y;
-	float c0hiy = n0xy.w * idir.y - ood.y;
-	float c0loz = nz.x * idir.z - ood.z;
-	float c0hiz = nz.y * idir.z - ood.z;
+	NO_EXTENDED_PRECISION float c0lox = n0xy.x * idir.x - ood.x;
+	NO_EXTENDED_PRECISION float c0hix = n0xy.y * idir.x - ood.x;
+	NO_EXTENDED_PRECISION float c0loy = n0xy.z * idir.y - ood.y;
+	NO_EXTENDED_PRECISION float c0hiy = n0xy.w * idir.y - ood.y;

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list