[Bf-blender-cvs] [c588af7] experimental-build: Revert "Cycles: Use array data storage for Fermi cards as well."

Mon May 16 14:58:27 CEST 2016

Commit: c588af70f428a246e957bb79531cb57f2e487d0b
Author: Thomas Dinges
Date:   Mon May 16 14:58:16 2016 +0200
Branches: experimental-build
https://developer.blender.org/rBc588af70f428a246e957bb79531cb57f2e487d0b

Revert "Cycles: Use array data storage for Fermi cards as well."

This reverts commit 11f3c9470d3b11180aa7ce14ac89eb0829353469.

===================================================================

M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/kernel/kernel_compat_cuda.h
M	intern/cycles/kernel/kernel_globals.h

===================================================================

diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 2ce2c2e..69015e7 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -86,7 +86,9 @@ public:
 	CUmodule cuModule;
 	map<device_ptr, bool> tex_interp_map;
 	int cuDevId;
+	int cuDevArchitecture;
 	bool first_error;
+	bool use_texture_storage;
 
 	struct PixelMem {
 		GLuint cuPBO;
@@ -174,6 +176,7 @@ public:
 	{
 		first_error = true;
 		background = background_;
+		use_texture_storage = true;
 
 		cuDevId = info.num;
 		cuDevice = 0;
@@ -204,6 +207,15 @@ public:
 		if(cuda_error_(result, "cuCtxCreate"))
 			return;
 
+		int major, minor;
+		cuDeviceComputeCapability(&major, &minor, cuDevId);
+		cuDevArchitecture = major*100 + minor*10;
+
+		/* In order to use full 6GB of memory on Titan cards, use arrays instead
+		 * of textures. On earlier cards this seems slower, but on Titan it is
+		 * actually slightly faster in tests. */
+		use_texture_storage = (cuDevArchitecture < 300);
+
 		cuda_pop_context();
 	}
 
@@ -483,7 +495,7 @@ public:
 		CUarray_format_enum format;
 		size_t dsize = datatype_size(mem.data_type);
 		size_t size = mem.memory_size();
-		bool use_texture = (interpolation != INTERPOLATION_NONE);
+		bool use_texture = (interpolation != INTERPOLATION_NONE) || use_texture_storage;
 
 		if(use_texture) {
 
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 167c994..d10d325 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -67,7 +67,18 @@ typedef texture<uchar4, 2, cudaReadModeNormalizedFloat> texture_image_uchar4;
 
 /* Macros to handle different memory storage on different devices */
 
-#define kernel_tex_fetch(t, index) t[(index)]
+/* In order to use full 6GB of memory on Titan cards, use arrays instead
+ * of textures. On earlier cards this seems slower, but on Titan it is
+ * actually slightly faster in tests. */
+#if __CUDA_ARCH__ < 300
+#  define __KERNEL_CUDA_TEX_STORAGE__
+#endif
+
+#ifdef __KERNEL_CUDA_TEX_STORAGE__
+#  define kernel_tex_fetch(t, index) tex1Dfetch(t, index)
+#else
+#  define kernel_tex_fetch(t, index) t[(index)]
+#endif
 #define kernel_tex_image_interp(t, x, y) tex2D(t, x, y)
 #define kernel_tex_image_interp_3d(t, x, y, z) tex3D(t, x, y, z)
 
diff --git a/intern/cycles/kernel/kernel_globals.h b/intern/cycles/kernel/kernel_globals.h
index cb8ce4c..c44ea1b 100644
--- a/intern/cycles/kernel/kernel_globals.h
+++ b/intern/cycles/kernel/kernel_globals.h
@@ -65,9 +65,12 @@ typedef struct KernelGlobals {
 __constant__ KernelData __data;
 typedef struct KernelGlobals {} KernelGlobals;
 
-#  define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
+#  ifdef __KERNEL_CUDA_TEX_STORAGE__
+#    define KERNEL_TEX(type, ttype, name) ttype name;
+#  else
+#    define KERNEL_TEX(type, ttype, name) const __constant__ __device__ type *name;
+#  endif
 #  define KERNEL_IMAGE_TEX(type, ttype, name) ttype name;
-
 #  include "kernel_textures.h"
 
 #endif  /* __KERNEL_CUDA__ */