[Bf-blender-cvs] [6353ecb] master: Cycles: Tweaks to support CUDA 8 toolkit

Sergey Sharybin noreply at git.blender.org
Mon Aug 1 15:56:18 CEST 2016


Commit: 6353ecb996898b4ce2fe8065130ed1f5ea3b6989
Author: Sergey Sharybin
Date:   Mon Aug 1 15:40:46 2016 +0200
Branches: master
https://developer.blender.org/rB6353ecb996898b4ce2fe8065130ed1f5ea3b6989

Cycles: Tweaks to support CUDA 8 toolkit

All the changes are mainly giving explicit tips on inlining functions,
so they match how inlining worked with previous toolkit.

This make kernel compiled by CUDA 8 render in average with same speed
as previous kernels. Some scenes are somewhat faster, some of them are
somewhat slower. But slowdown is within 1% so far.

On a positive side it allows us to enable newer generation cards on
buildbots (so GTX 10x0 will be officially supported soon).

===================================================================

M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/bvh/bvh_shadow_all.h
M	intern/cycles/kernel/bvh/bvh_subsurface.h
M	intern/cycles/kernel/bvh/bvh_traversal.h
M	intern/cycles/kernel/bvh/bvh_volume.h
M	intern/cycles/kernel/bvh/bvh_volume_all.h
M	intern/cycles/kernel/closure/bsdf.h
M	intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
M	intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
M	intern/cycles/kernel/closure/bssrdf.h
M	intern/cycles/kernel/geom/geom_primitive.h
M	intern/cycles/kernel/geom/geom_volume.h
M	intern/cycles/kernel/kernel_bake.h
M	intern/cycles/kernel/kernel_camera.h
M	intern/cycles/kernel/kernel_compat_cuda.h
M	intern/cycles/kernel/kernel_light.h
M	intern/cycles/kernel/kernel_path.h
M	intern/cycles/kernel/kernel_path_surface.h
M	intern/cycles/kernel/kernel_path_volume.h
M	intern/cycles/kernel/kernel_projection.h
M	intern/cycles/kernel/kernel_shader.h
M	intern/cycles/kernel/kernel_subsurface.h
M	intern/cycles/kernel/kernel_volume.h
M	intern/cycles/kernel/svm/svm_attribute.h
M	intern/cycles/kernel/svm/svm_geometry.h
M	intern/cycles/kernel/svm/svm_ramp.h
M	intern/cycles/kernel/svm/svm_ramp_util.h
M	intern/cycles/kernel/svm/svm_tex_coord.h
M	intern/cycles/util/util_math.h

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 2d40491..80f2644 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -321,11 +321,11 @@ public:
 			return "";
 		}
 		if(cuda_version < 60) {
-			printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5.\n", cuda_version/10, cuda_version%10);
+			printf("Unsupported CUDA version %d.%d detected, you need CUDA 7.5 or newer.\n", cuda_version/10, cuda_version%10);
 			return "";
 		}
-		else if(cuda_version != 75)
-			printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 is officially supported.\n", cuda_version/10, cuda_version%10);
+		else if(cuda_version != 75 && cuda_version != 80)
+			printf("CUDA version %d.%d detected, build may succeed but only CUDA 7.5 and 8.0 are officially supported.\n", cuda_version/10, cuda_version%10);
 
 		/* Compile. */
 		string kernel = path_join(kernel_path, path_join("kernels", path_join("cuda", "kernel.cu")));
diff --git a/intern/cycles/kernel/bvh/bvh_shadow_all.h b/intern/cycles/kernel/bvh/bvh_shadow_all.h
index 1d6fa30..e9eeff3 100644
--- a/intern/cycles/kernel/bvh/bvh_shadow_all.h
+++ b/intern/cycles/kernel/bvh/bvh_shadow_all.h
@@ -37,11 +37,16 @@
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            uint *num_hits)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect_array,
+                                 const uint max_hits,
+                                 uint *num_hits)
 {
 	/* todo:
 	 * - likely and unlikely for if() statements
diff --git a/intern/cycles/kernel/bvh/bvh_subsurface.h b/intern/cycles/kernel/bvh/bvh_subsurface.h
index 18978ef..d9623c9 100644
--- a/intern/cycles/kernel/bvh/bvh_subsurface.h
+++ b/intern/cycles/kernel/bvh/bvh_subsurface.h
@@ -35,12 +35,17 @@
  *
  */
 
-ccl_device void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            SubsurfaceIntersection *ss_isect,
-                                            int subsurface_object,
-                                            uint *lcg_state,
-                                            int max_hits)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+void BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 SubsurfaceIntersection *ss_isect,
+                                 int subsurface_object,
+                                 uint *lcg_state,
+                                 int max_hits)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_traversal.h b/intern/cycles/kernel/bvh/bvh_traversal.h
index 68a11b6..b1a5296 100644
--- a/intern/cycles/kernel/bvh/bvh_traversal.h
+++ b/intern/cycles/kernel/bvh/bvh_traversal.h
@@ -40,16 +40,21 @@
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect,
+                                 const uint visibility
 #if BVH_FEATURE(BVH_HAIR_MINIMUM_WIDTH)
-                                            , uint *lcg_state,
-                                            float difl,
-                                            float extmax
+                                 , uint *lcg_state,
+                                 float difl,
+                                 float extmax
 #endif
-                                            )
+                                 )
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_volume.h b/intern/cycles/kernel/bvh/bvh_volume.h
index 03499e9..107373c 100644
--- a/intern/cycles/kernel/bvh/bvh_volume.h
+++ b/intern/cycles/kernel/bvh/bvh_volume.h
@@ -36,10 +36,15 @@
  *
  */
 
-ccl_device bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect,
-                                            const uint visibility)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+bool BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect,
+                                 const uint visibility)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/bvh/bvh_volume_all.h b/intern/cycles/kernel/bvh/bvh_volume_all.h
index 7eddc28..1f6515c 100644
--- a/intern/cycles/kernel/bvh/bvh_volume_all.h
+++ b/intern/cycles/kernel/bvh/bvh_volume_all.h
@@ -36,11 +36,16 @@
  *
  */
 
-ccl_device uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
-                                            const Ray *ray,
-                                            Intersection *isect_array,
-                                            const uint max_hits,
-                                            const uint visibility)
+#ifndef __KERNEL_GPU__
+ccl_device
+#else
+ccl_device_inline
+#endif
+uint BVH_FUNCTION_FULL_NAME(BVH)(KernelGlobals *kg,
+                                 const Ray *ray,
+                                 Intersection *isect_array,
+                                 const uint max_hits,
+                                 const uint visibility)
 {
 	/* todo:
 	 * - test if pushing distance on the stack helps (for non shadow rays)
diff --git a/intern/cycles/kernel/closure/bsdf.h b/intern/cycles/kernel/closure/bsdf.h
index a251e3b..55bdf3e 100644
--- a/intern/cycles/kernel/closure/bsdf.h
+++ b/intern/cycles/kernel/closure/bsdf.h
@@ -36,7 +36,15 @@
 
 CCL_NAMESPACE_BEGIN
 
-ccl_device int bsdf_sample(KernelGlobals *kg, ShaderData *sd, const ShaderClosure *sc, float randu, float randv, float3 *eval, float3 *omega_in, differential3 *domega_in, float *pdf)
+ccl_device_inline int bsdf_sample(KernelGlobals *kg,
+                                  ShaderData *sd,
+                                  const ShaderClosure *sc,
+                                  float randu,
+                                  float randv,
+                                  float3 *eval,
+                                  float3 *omega_in,
+                                  differential3 *domega_in,
+                                  float *pdf)
 {
 	int label;
 
diff --git a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
index 8ed76be..9929246 100644
--- a/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
+++ b/intern/cycles/kernel/closure/bsdf_ashikhmin_shirley.h
@@ -62,7 +62,11 @@ ccl_device_inline float bsdf_ashikhmin_shirley_roughness_to_exponent(float rough
 	return 2.0f / (roughness*roughness) - 2.0f;
 }
 
-ccl_device float3 bsdf_ashikhmin_shirley_eval_reflect(const ShaderClosure *sc, const float3 I, const float3 omega_in, float *pdf)
+ccl_device_inline float3 bsdf_ashikhmin_shirley_eval_reflect(
+        const ShaderClosure *sc,
+        const float3 I,
+        const float3 omega_in,
+        float *pdf)
 {
 	const MicrofacetBsdf *bsdf = (const MicrofacetBsdf*)sc;
 	float3 N = bsdf->N;
diff --git a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
index afd4a8d..6ebe2f6 100644
--- a/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
+++ b/intern/cycles/kernel/closure/bsdf_microfacet_multi_impl.h
@@ -25,11 +25,18 @@
  * energy is used. In combination with MIS, that is enough to produce an unbiased result, although
  * the balance heuristic isn't necessarily optimal anymore.
  */
-ccl_device float3 MF_FUNCTION_FULL_NAME(mf_eval)(float3 wi, float3 wo, const bool wo_outside, const float3 color, const float alpha_x, const float alpha_y, ccl_addr_space uint* lcg_state
+ccl_device_inline float3 MF_FUNCTION_FULL_NAME(mf_eval)(
+        float3 wi,
+        float3 wo,
+        const bool wo_outside,
+        const float3 color,
+        const float alpha_x,
+        const float alpha_y,
+         ccl_addr_space uint *lcg_state
 #ifdef MF_MULTI_GLASS
-	, const float eta
+        , const float eta
 #elif defined(MF_MULTI_GLOSSY)
-	, float3 *n, float3 *k
+        , float3 *n, float3 *k
 #endif
 )
 {
diff --git a/intern/cycles/kernel/closure/bssrdf.h b/intern/cycles/kernel/closure/bssrdf.h
index a260ae9..35c9576 100644
--- a/intern/cycles/kernel/closure/bssrdf.h
+++ b/intern/cycles/kernel/closure/bssrdf.h
@@ -141,7 +141,7 @@ ccl_device float bssrdf_cubic_pdf(const ShaderClosure *sc, float r)
 }
 
 /* solve 10x^2 - 20x^3 + 15x^4 - 4x^5 - xi == 0 */
-ccl_device float bssrdf_cubic_quintic_root_find(float xi)
+ccl_device_inline float bssrdf_cubic_quintic_root_find(float xi)
 {
 	/* newton-raphson iteration, usually succeeds in 2-4 iterations, except
 	 * outside 0.02 ... 0.98 where it can go up to 10, so overall performance
@@ -255,7 +255,7 @@ ccl_device float bssrdf_burley_pdf(const ShaderClosure *sc, float r)
  * Returns scaled radius, meaning the result is to be scaled up by d.
  * Since there's no closed form solution we do Newton-Raphson method to find it.
  */
-ccl_device float bssrdf_burley_root_find(float xi)
+ccl_device_inline float bssrdf_burley_root_find(float xi)
 {
 	const float tolerance = 1e-6f;
 	const int max_iter

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list