[Bf-blender-cvs] [cb3b197] master: Cycles: Use utility define for restrict pointers
Sergey Sharybin
noreply at git.blender.org
Mon Jul 11 14:04:23 CEST 2016
Commit: cb3b19730c4fa402c065e288330f4f1f197026ab
Author: Sergey Sharybin
Date: Mon Jul 11 13:53:37 2016 +0200
Branches: master
https://developer.blender.org/rBcb3b19730c4fa402c065e288330f4f1f197026ab
Cycles: Use utility define for restrict pointers
This way restrict can be used for CUDA and OpenCL as well.
From quick tests in the areas I've been testing, this might give some
barely measurable % of speedup, but it increases register pressure.
So use of this qualifier is still really limited.
===================================================================
M intern/cycles/kernel/bvh/qbvh_nodes.h
M intern/cycles/kernel/kernel_compat_cuda.h
M intern/cycles/kernel/kernel_compat_opencl.h
M intern/cycles/util/util_types.h
===================================================================
diff --git a/intern/cycles/kernel/bvh/qbvh_nodes.h b/intern/cycles/kernel/bvh/qbvh_nodes.h
index 6dfb1c0..a833f4b 100644
--- a/intern/cycles/kernel/bvh/qbvh_nodes.h
+++ b/intern/cycles/kernel/bvh/qbvh_nodes.h
@@ -22,27 +22,27 @@ struct QBVHStackItem {
/* TODO(sergey): Investigate if using intrinsics helps for both
* stack item swap and float comparison.
*/
-ccl_device_inline void qbvh_item_swap(QBVHStackItem *__restrict a,
- QBVHStackItem *__restrict b)
+ccl_device_inline void qbvh_item_swap(QBVHStackItem *ccl_restrict a,
+ QBVHStackItem *ccl_restrict b)
{
QBVHStackItem tmp = *a;
*a = *b;
*b = tmp;
}
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
- QBVHStackItem *__restrict s2,
- QBVHStackItem *__restrict s3)
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
+ QBVHStackItem *ccl_restrict s2,
+ QBVHStackItem *ccl_restrict s3)
{
if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
if(s3->dist < s2->dist) { qbvh_item_swap(s3, s2); }
if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
}
-ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
- QBVHStackItem *__restrict s2,
- QBVHStackItem *__restrict s3,
- QBVHStackItem *__restrict s4)
+ccl_device_inline void qbvh_stack_sort(QBVHStackItem *ccl_restrict s1,
+ QBVHStackItem *ccl_restrict s2,
+ QBVHStackItem *ccl_restrict s3,
+ QBVHStackItem *ccl_restrict s4)
{
if(s2->dist < s1->dist) { qbvh_item_swap(s2, s1); }
if(s4->dist < s3->dist) { qbvh_item_swap(s4, s3); }
@@ -53,7 +53,7 @@ ccl_device_inline void qbvh_stack_sort(QBVHStackItem *__restrict s1,
/* Axis-aligned nodes intersection */
-ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
+ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *ccl_restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
@@ -69,7 +69,7 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
const int far_y,
const int far_z,
const int node_addr,
- ssef *__restrict dist)
+ ssef *ccl_restrict dist)
{
const int offset = node_addr + 1;
#ifdef __KERNEL_AVX2__
@@ -104,7 +104,7 @@ ccl_device_inline int qbvh_aligned_node_intersect(KernelGlobals *__restrict kg,
}
ccl_device_inline int qbvh_aligned_node_intersect_robust(
- KernelGlobals *__restrict kg,
+ KernelGlobals *ccl_restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
@@ -121,7 +121,7 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
const int far_z,
const int node_addr,
const float difl,
- ssef *__restrict dist)
+ ssef *ccl_restrict dist)
{
const int offset = node_addr + 1;
#ifdef __KERNEL_AVX2__
@@ -152,7 +152,7 @@ ccl_device_inline int qbvh_aligned_node_intersect_robust(
/* Unaligned nodes intersection */
ccl_device_inline int qbvh_unaligned_node_intersect(
- KernelGlobals *__restrict kg,
+ KernelGlobals *ccl_restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
@@ -168,7 +168,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect(
const int far_y,
const int far_z,
const int node_addr,
- ssef *__restrict dist)
+ ssef *ccl_restrict dist)
{
const int offset = node_addr;
const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
@@ -236,7 +236,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect(
}
ccl_device_inline int qbvh_unaligned_node_intersect_robust(
- KernelGlobals *__restrict kg,
+ KernelGlobals *ccl_restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
@@ -253,7 +253,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect_robust(
const int far_z,
const int node_addr,
const float difl,
- ssef *__restrict dist)
+ ssef *ccl_restrict dist)
{
const int offset = node_addr;
const ssef tfm_x_x = kernel_tex_fetch_ssef(__bvh_nodes, offset+1);
@@ -324,7 +324,7 @@ ccl_device_inline int qbvh_unaligned_node_intersect_robust(
*/
ccl_device_inline int qbvh_node_intersect(
- KernelGlobals *__restrict kg,
+ KernelGlobals *ccl_restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
@@ -340,7 +340,7 @@ ccl_device_inline int qbvh_node_intersect(
const int far_y,
const int far_z,
const int node_addr,
- ssef *__restrict dist)
+ ssef *ccl_restrict dist)
{
const int offset = node_addr;
const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
@@ -377,7 +377,7 @@ ccl_device_inline int qbvh_node_intersect(
}
ccl_device_inline int qbvh_node_intersect_robust(
- KernelGlobals *__restrict kg,
+ KernelGlobals *ccl_restrict kg,
const ssef& tnear,
const ssef& tfar,
#ifdef __KERNEL_AVX2__
@@ -394,7 +394,7 @@ ccl_device_inline int qbvh_node_intersect_robust(
const int far_z,
const int node_addr,
const float difl,
- ssef *__restrict dist)
+ ssef *ccl_restrict dist)
{
const int offset = node_addr;
const float4 node = kernel_tex_fetch(__bvh_nodes, offset);
diff --git a/intern/cycles/kernel/kernel_compat_cuda.h b/intern/cycles/kernel/kernel_compat_cuda.h
index 4231475..08f6f45 100644
--- a/intern/cycles/kernel/kernel_compat_cuda.h
+++ b/intern/cycles/kernel/kernel_compat_cuda.h
@@ -42,6 +42,7 @@
#define ccl_constant
#define ccl_may_alias
#define ccl_addr_space
+#define ccl_restrict __restrict__
/* No assert supported for CUDA */
diff --git a/intern/cycles/kernel/kernel_compat_opencl.h b/intern/cycles/kernel/kernel_compat_opencl.h
index a570844..8505cb8 100644
--- a/intern/cycles/kernel/kernel_compat_opencl.h
+++ b/intern/cycles/kernel/kernel_compat_opencl.h
@@ -39,6 +39,7 @@
#define ccl_global __global
#define ccl_local __local
#define ccl_private __private
+#define ccl_restrict restrict
#ifdef __SPLIT_KERNEL__
# define ccl_addr_space __global
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 972befa..257c6ad 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -37,6 +37,7 @@
#define ccl_device_noinline static
#define ccl_global
#define ccl_constant
+#define ccl_restrict __restrict
#define __KERNEL_WITH_SSE_ALIGN__
#if defined(_WIN32) && !defined(FREE_WINDOWS)
More information about the Bf-blender-cvs
mailing list