[Bf-blender-cvs] [471a896] cycles_split_kernel: Cycles: Replace utility macros with functions from util_types.h

Wed Dec 7 04:27:02 CET 2016

Commit: 471a896731bfdf2db28214681f45d6d33738b5b4
Author: Mai Lavelle
Date:   Mon Dec 5 18:19:58 2016 -0500
Branches: cycles_split_kernel
https://developer.blender.org/rB471a896731bfdf2db28214681f45d6d33738b5b4

Cycles: Replace utility macros with functions from util_types.h

===================================================================

M	intern/cycles/device/device_split_kernel.cpp
M	intern/cycles/kernel/split/kernel_split_data.h
M	intern/cycles/util/util_types.h

===================================================================

diff --git a/intern/cycles/device/device_split_kernel.cpp b/intern/cycles/device/device_split_kernel.cpp
index 4bb3c5d..db1e79d 100644
--- a/intern/cycles/device/device_split_kernel.cpp
+++ b/intern/cycles/device/device_split_kernel.cpp
@@ -21,8 +21,6 @@
 
 CCL_NAMESPACE_BEGIN
 
-#define ROUND_UP(x, multiple) (((((x) - 1 ) / (multiple)) + 1) * (multiple))
-
 DeviceSplitKernel::DeviceSplitKernel(Device *device) : device(device)
 {
 	path_iteration_times = PATH_ITER_INC_FACTOR;
@@ -98,8 +96,8 @@ bool DeviceSplitKernel::path_trace(DeviceTask *task,
 	 */
 	int2 max_render_feasible_tile_size;
 	const int2 tile_size = task->requested_tile_size;
-	max_render_feasible_tile_size.x = ROUND_UP(tile_size.x, local_size[0]);
-	max_render_feasible_tile_size.y = ROUND_UP(tile_size.y, local_size[1]);
+	max_render_feasible_tile_size.x = round_up(tile_size.x, local_size[0]);
+	max_render_feasible_tile_size.y = round_up(tile_size.y, local_size[1]);
 
 	/* Calculate per_thread_output_buffer_size. */
 	size_t per_thread_output_buffer_size;
diff --git a/intern/cycles/kernel/split/kernel_split_data.h b/intern/cycles/kernel/split/kernel_split_data.h
index bab5718..387e395 100644
--- a/intern/cycles/kernel/split/kernel_split_data.h
+++ b/intern/cycles/kernel/split/kernel_split_data.h
@@ -108,23 +108,22 @@ typedef struct SplitData {
 } SplitData;
 
 #define SIZEOF_SD(max_closure) (sizeof(ShaderData) - (sizeof(ShaderClosure) * (MAX_CLOSURE - (max_closure))))
-#define ALIGN_16(num) (((num) + 15) & ~15)
 
 ccl_device_inline size_t split_data_buffer_size(size_t num_elements,
                                                 size_t max_closure,
                                                 size_t per_thread_output_buffer_size)
 {
 	size_t size = 0;
-#define SPLIT_DATA_ENTRY(type, name, num) + ALIGN_16(num_elements * num * sizeof(type))
+#define SPLIT_DATA_ENTRY(type, name, num) + align_up(num_elements * num * sizeof(type), 16)
 	size = size SPLIT_DATA_ENTRIES;
 #undef SPLIT_DATA_ENTRY
 
 	/* TODO(sergey): This will actually over-allocate if
 	 * particular kernel does not support multiclosure.
 	 */
-	size += ALIGN_16(num_elements * SIZEOF_SD(max_closure)); /* sd */
-	size += ALIGN_16(2 * num_elements * SIZEOF_SD(max_closure)); /* sd_DL_shadow */
-	size += ALIGN_16(num_elements * per_thread_output_buffer_size); /* per_sample_output_buffers */
+	size += align_up(num_elements * SIZEOF_SD(max_closure), 16); /* sd */
+	size += align_up(2 * num_elements * SIZEOF_SD(max_closure), 16); /* sd_DL_shadow */
+	size += align_up(num_elements * per_thread_output_buffer_size, 16); /* per_sample_output_buffers */
 
 	return size;
 }
@@ -137,18 +136,18 @@ ccl_device_inline void split_data_init(ccl_global SplitData *split_data,
 	ccl_global char *p = (ccl_global char*)data;
 
 #define SPLIT_DATA_ENTRY(type, name, num) \
-	split_data->name = (type*)p; p += ALIGN_16(num_elements * num * sizeof(type));
+	split_data->name = (type*)p; p += align_up(num_elements * num * sizeof(type), 16);
 	SPLIT_DATA_ENTRIES
 #undef SPLIT_DATA_ENTRY
 
 	split_data->sd = (ShaderData*)p;
-	p += ALIGN_16(num_elements * SIZEOF_SD(MAX_CLOSURE));
+	p += align_up(num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
 
 	split_data->sd_DL_shadow = (ShaderData*)p;
-	p += ALIGN_16(2 * num_elements * SIZEOF_SD(MAX_CLOSURE));
+	p += align_up(2 * num_elements * SIZEOF_SD(MAX_CLOSURE), 16);
 
 	split_data->per_sample_output_buffers = (ccl_global float*)p;
-	//p += ALIGN_16(num_elements * per_thread_output_buffer_size);
+	//p += align_up(num_elements * per_thread_output_buffer_size, 16);
 
 	split_data->ray_state = ray_state;
 }
diff --git a/intern/cycles/util/util_types.h b/intern/cycles/util/util_types.h
index 165f831..96e108b 100644
--- a/intern/cycles/util/util_types.h
+++ b/intern/cycles/util/util_types.h
@@ -399,11 +399,6 @@ ccl_device_inline float4 make_float4(float x, float y, float z, float w)
 	return a;
 }
 
-ccl_device_inline int align_up(int offset, int alignment)
-{
-	return (offset + alignment - 1) & ~(alignment - 1);
-}
-
 ccl_device_inline int3 make_int3(int i)
 {
 #ifdef __KERNEL_SSE__
@@ -478,6 +473,27 @@ ccl_device_inline int4 make_int4(const float3& f)
 
 #endif
 
+/* Round offset up to the next multiple of alignment.
+ *
+ * alignment must be a power of two: the result is computed with a bit mask.
+ * Operands are size_t so that byte counts passed in by buffer-size code are
+ * not narrowed to int before being aligned.
+ */
+ccl_device_inline size_t align_up(size_t offset, size_t alignment)
+{
+	return (offset + alignment - 1) & ~(alignment - 1);
+}
+
+/* Round x up to the next multiple of multiple.
+ *
+ * Unlike align_up(), multiple need not be a power of two, but it must be
+ * non-zero since it is used as a divisor.
+ */
+ccl_device_inline int round_up(int x, int multiple)
+{
+	return ((x + multiple - 1) / multiple) * multiple;
+}
+
 /* Interpolation types for textures
  * cuda also use texture space to store other objects */
 enum InterpolationType {