[Bf-blender-cvs] [3844b8f] soc-2014-cycles: Cycles: Use some dedicated FMA intrinsics in the AVX2 kernel.

Thomas Dinges noreply at git.blender.org
Wed May 14 21:06:45 CEST 2014


Commit: 3844b8f85c7dd849a10b80c5b6b92fe968a19ecf
Author: Thomas Dinges
Date:   Wed May 14 21:03:41 2014 +0200
https://developer.blender.org/rB3844b8f85c7dd849a10b80c5b6b92fe968a19ecf

Cycles: Use some dedicated FMA intrinsics in the AVX2 kernel.

This gives me a small speedup of 2% in bmw.blend, and 3% in hair.blend.
I could only test on my MacBook with clang though; I have no idea how gcc or msvc perform here.

Thanks to Lockal for some input on this! :)

===================================================================

M	intern/cycles/CMakeLists.txt
M	intern/cycles/SConscript
M	intern/cycles/kernel/geom/geom_bvh_shadow.h
M	intern/cycles/kernel/geom/geom_bvh_subsurface.h
M	intern/cycles/kernel/geom/geom_bvh_traversal.h
M	intern/cycles/kernel/kernel_avx2.cpp
M	intern/cycles/util/util_optimization.h
M	intern/cycles/util/util_simd.h

===================================================================

diff --git a/intern/cycles/CMakeLists.txt b/intern/cycles/CMakeLists.txt
index 7a9739c..7a1fea8 100644
--- a/intern/cycles/CMakeLists.txt
+++ b/intern/cycles/CMakeLists.txt
@@ -20,7 +20,7 @@ if(WIN32 AND MSVC)
 	# /arch:AVX for VC2012 and above
 	if(NOT MSVC_VERSION LESS 1700)
 		set(CYCLES_AVX_ARCH_FLAGS "/arch:AVX")
-		set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX")
+		set(CYCLES_AVX2_ARCH_FLAGS "/arch:AVX /arch:AVX2")
 	elseif(NOT CMAKE_CL_64)
 		set(CYCLES_AVX_ARCH_FLAGS "/arch:SSE2")
 		set(CYCLES_AVX2_ARCH_FLAGS "/arch:SSE2")
diff --git a/intern/cycles/SConscript b/intern/cycles/SConscript
index 2439e0a..f3481df 100644
--- a/intern/cycles/SConscript
+++ b/intern/cycles/SConscript
@@ -99,7 +99,7 @@ elif env['OURPLATFORM'] == 'win64-vc':
     if env['MSVC_VERSION'] in ('11.0', '12.0'):
         kernel_flags['sse41'] = kernel_flags['sse3']
         kernel_flags['avx'] = kernel_flags['sse41'] + ' /arch:AVX'
-        kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX'
+        kernel_flags['avx2'] = kernel_flags['sse41'] + ' /arch:AVX /arch:AVX2'
 else:
     # -mavx only available with relatively new gcc/clang
     kernel_flags['sse2'] = '-ffast-math -msse -msse2 -mfpmath=sse'
diff --git a/intern/cycles/kernel/geom/geom_bvh_shadow.h b/intern/cycles/kernel/geom/geom_bvh_shadow.h
index 324f51f..8685a30 100644
--- a/intern/cycles/kernel/geom/geom_bvh_shadow.h
+++ b/intern/cycles/kernel/geom/geom_bvh_shadow.h
@@ -136,9 +136,15 @@ ccl_device bool BVH_FUNCTION_NAME
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
+#ifdef __KERNEL_AVX2__
+				const __m128 tminmaxx = fms(shuffle_swap(bvh_nodes[0], shufflexyz[0]), idirsplat[0], _mm_mul_ps(Psplat[0], idirsplat[0]));
+				const __m128 tminmaxy = fms(shuffle_swap(bvh_nodes[1], shufflexyz[1]), idirsplat[1], _mm_mul_ps(Psplat[1], idirsplat[1]));
+				const __m128 tminmaxz = fms(shuffle_swap(bvh_nodes[2], shufflexyz[2]), idirsplat[2], _mm_mul_ps(Psplat[2], idirsplat[2]));
+#else
 				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
 				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
 				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+#endif
 
 				/* calculate { c0min, c1min, -c0max, -c1max} */
 				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
diff --git a/intern/cycles/kernel/geom/geom_bvh_subsurface.h b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
index a19f05d..de4ba37 100644
--- a/intern/cycles/kernel/geom/geom_bvh_subsurface.h
+++ b/intern/cycles/kernel/geom/geom_bvh_subsurface.h
@@ -135,9 +135,15 @@ ccl_device uint BVH_FUNCTION_NAME(KernelGlobals *kg, const Ray *ray, Intersectio
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
+#ifdef __KERNEL_AVX2__
+				const __m128 tminmaxx = fms(shuffle_swap(bvh_nodes[0], shufflexyz[0]), idirsplat[0], _mm_mul_ps(Psplat[0], idirsplat[0]));
+				const __m128 tminmaxy = fms(shuffle_swap(bvh_nodes[1], shufflexyz[1]), idirsplat[1], _mm_mul_ps(Psplat[1], idirsplat[1]));
+				const __m128 tminmaxz = fms(shuffle_swap(bvh_nodes[2], shufflexyz[2]), idirsplat[2], _mm_mul_ps(Psplat[2], idirsplat[2]));
+#else
 				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
 				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
 				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+#endif
 
 				const __m128 tminmax = _mm_xor_ps(_mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat)), pn);
 				const __m128 lrhit = _mm_cmple_ps(tminmax, shuffle<2, 3, 0, 1>(tminmax));
diff --git a/intern/cycles/kernel/geom/geom_bvh_traversal.h b/intern/cycles/kernel/geom/geom_bvh_traversal.h
index 9fd40f9..35f827f 100644
--- a/intern/cycles/kernel/geom/geom_bvh_traversal.h
+++ b/intern/cycles/kernel/geom/geom_bvh_traversal.h
@@ -155,9 +155,15 @@ ccl_device bool BVH_FUNCTION_NAME
 				const float4 cnodes = ((float4*)bvh_nodes)[3];
 
 				/* intersect ray against child nodes */
+#ifdef __KERNEL_AVX2__
+				const __m128 tminmaxx = fms(shuffle_swap(bvh_nodes[0], shufflexyz[0]), idirsplat[0], _mm_mul_ps(Psplat[0], idirsplat[0]));
+				const __m128 tminmaxy = fms(shuffle_swap(bvh_nodes[1], shufflexyz[1]), idirsplat[1], _mm_mul_ps(Psplat[1], idirsplat[1]));
+				const __m128 tminmaxz = fms(shuffle_swap(bvh_nodes[2], shufflexyz[2]), idirsplat[2], _mm_mul_ps(Psplat[2], idirsplat[2]));
+#else
 				const __m128 tminmaxx = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[0], shufflexyz[0]), Psplat[0]), idirsplat[0]);
 				const __m128 tminmaxy = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[1], shufflexyz[1]), Psplat[1]), idirsplat[1]);
 				const __m128 tminmaxz = _mm_mul_ps(_mm_sub_ps(shuffle_swap(bvh_nodes[2], shufflexyz[2]), Psplat[2]), idirsplat[2]);
+#endif
 
 				/* calculate { c0min, c1min, -c0max, -c1max} */
 				__m128 minmax = _mm_max_ps(_mm_max_ps(tminmaxx, tminmaxy), _mm_max_ps(tminmaxz, tsplat));
diff --git a/intern/cycles/kernel/kernel_avx2.cpp b/intern/cycles/kernel/kernel_avx2.cpp
index 2cbad99..c6c4ba5 100644
--- a/intern/cycles/kernel/kernel_avx2.cpp
+++ b/intern/cycles/kernel/kernel_avx2.cpp
@@ -24,6 +24,8 @@
 #define __KERNEL_SSE3__
 #define __KERNEL_SSSE3__
 #define __KERNEL_SSE41__
+#define __KERNEL_AVX__
+#define __KERNEL_AVX2__
 #endif
  
 #include "util_optimization.h"
diff --git a/intern/cycles/util/util_optimization.h b/intern/cycles/util/util_optimization.h
index c147405..34a43fb 100644
--- a/intern/cycles/util/util_optimization.h
+++ b/intern/cycles/util/util_optimization.h
@@ -123,6 +123,10 @@
 #include <smmintrin.h> /* SSE 4.1 */
 #endif
 
+#ifdef __KERNEL_AVX__
+#include <immintrin.h> /* AVX(2) */
+#endif
+
 #else
 
 /* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
diff --git a/intern/cycles/util/util_simd.h b/intern/cycles/util/util_simd.h
index f0f37fa..6918156 100644
--- a/intern/cycles/util/util_simd.h
+++ b/intern/cycles/util/util_simd.h
@@ -145,13 +145,21 @@ ccl_device_inline const __m128 blend(const __m128& mask, const __m128& a, const
 /* calculate a*b+c (replacement for fused multiply-add on SSE CPUs) */
 ccl_device_inline const __m128 fma(const __m128& a, const __m128& b, const __m128& c)
 {
+#ifdef __KERNEL_AVX2__
+	return _mm_fmadd_ps(a, b, c);
+#else
 	return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
 }
 
 /* calculate a*b-c (replacement for fused multiply-subtract on SSE CPUs) */
 ccl_device_inline const __m128 fms(const __m128& a, const __m128& b, const __m128& c)
 {
+#ifdef __KERNEL_AVX2__
+	return _mm_fmsub_ps(a, b, c);
+#else
 	return _mm_sub_ps(_mm_mul_ps(a, b), c);
+#endif
 }
 
 /* calculate -a*b+c (replacement for fused negated-multiply-subtract on SSE CPUs) */




More information about the Bf-blender-cvs mailing list