[Bf-blender-cvs] [4923af4c77] cycles-tiles-rework: Cycles: Deduplicate sample range setting and getting

Mai Lavelle noreply at git.blender.org
Sat Jan 14 08:26:44 CET 2017


Commit: 4923af4c773f64c54f1b46695ecb55525c03a437
Author: Mai Lavelle
Date:   Sat Jan 14 02:01:00 2017 -0500
Branches: cycles-tiles-rework
https://developer.blender.org/rB4923af4c773f64c54f1b46695ecb55525c03a437

Cycles: Deduplicate sample range setting and getting
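
For reference, the per-thread lookup that kernel_pixel_sample_for_thread() now centralizes can be exercised on the host. The sketch below is a minimal plain-C rendition, not Cycles code: the SampleRange struct here is a trimmed stand-in with only the fields the mapping uses, and a simple loop over flat thread ids replaces the device's group/local id arithmetic.

#include <stdio.h>

/* Trimmed-down stand-in for Cycles' SampleRange: only the fields the
 * mapping needs, with the buffer/rng_state pointers left out. */
typedef struct SampleRange {
	int sample;
	int x, y, w, h;
	int work_offset;  /* running sum of w*h over the preceding ranges */
} SampleRange;

/* Host-side mirror of the lookup: find the range that owns thread_id,
 * then turn the range-local offset into pixel coordinates.
 * Returns 0 when the thread has no work. */
static int pixel_sample_for_thread(const SampleRange *ranges, int num_ranges,
                                   int thread_id, int *x, int *y, int *sample)
{
	const SampleRange *range = NULL;

	for(int i = 0; i < num_ranges; i++) {
		int begin = ranges[i].work_offset;
		int end = begin + ranges[i].w * ranges[i].h;

		if(thread_id >= begin && thread_id < end)
			range = &ranges[i];
	}

	if(!range)
		return 0;

	int local = thread_id - range->work_offset;

	*x = (local % range->w) + range->x;
	*y = (local / range->w) + range->y;
	*sample = range->sample;

	return 1;
}

int main(void)
{
	/* Two 4x2 ranges packed back to back; the second starts at sample 8
	 * and at pixel x = 4, so its work_offset is 4*2 = 8. */
	SampleRange ranges[2] = {
		{0, 0, 0, 4, 2, 0},
		{8, 4, 0, 4, 2, 8},
	};

	for(int tid = 0; tid < 16; tid++) {
		int x, y, sample;

		if(pixel_sample_for_thread(ranges, 2, tid, &x, &y, &sample))
			printf("thread %2d -> pixel (%d, %d), sample %d\n", tid, x, y, sample);
	}

	return 0;
}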

===================================================================

M	intern/cycles/kernel/CMakeLists.txt
A	intern/cycles/kernel/kernel_sample_range.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu
M	intern/cycles/kernel/kernels/opencl/kernel.cl

===================================================================

diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 56bcafbce3..e0fbc57edd 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -72,6 +72,7 @@ set(SRC_HEADERS
 	kernel_projection.h
 	kernel_queues.h
 	kernel_random.h
+	kernel_sample_range.h
 	kernel_shader.h
 	kernel_shadow.h
 	kernel_subsurface.h
diff --git a/intern/cycles/kernel/kernel_sample_range.h b/intern/cycles/kernel/kernel_sample_range.h
new file mode 100644
index 0000000000..6694a7e959
--- /dev/null
+++ b/intern/cycles/kernel/kernel_sample_range.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2011-2016 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+CCL_NAMESPACE_BEGIN
+
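+/* Record the parameters of a sample range (a tile of pixels to be rendered
+ * at a given sample) and derive its work_offset from the previous range, so
+ * that one flat thread id space covers all ranges back to back. */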
+ccl_device_inline void kernel_set_sample_range(
+		ccl_global SampleRange *sample_ranges,
+		int range,
+		ccl_global float *buffer,
+		ccl_global uint *rng_state,
+		int sample,
+		int sx,
+		int sy,
+		int sw,
+		int sh,
+		int offset,
+		int stride)
+{
+	ccl_global SampleRange* sample_range = &sample_ranges[range];
+
+	sample_range->buffer = buffer;
+	sample_range->rng_state = rng_state;
+	sample_range->sample = sample;
+	sample_range->x = sx;
+	sample_range->y = sy;
+	sample_range->w = sw;
+	sample_range->h = sh;
+	sample_range->offset = offset;
+	sample_range->stride = stride;
+
+	if(range == 0) {
+		sample_range->work_offset = 0;
+	}
+	else {
+		ccl_global SampleRange* prev_range = &sample_ranges[range-1];
+		sample_range->work_offset = prev_range->work_offset + prev_range->w * prev_range->h;
+	}
+}
+
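+/* Map the calling thread to a pixel and sample by locating the sample range
+ * that owns its flat thread id. Returns false when the thread has no work. */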
+ccl_device_inline bool kernel_pixel_sample_for_thread(
+		KernelGlobals *kg,
+		ccl_global SampleRange *sample_ranges,
+		int num_sample_ranges,
+		int *thread_x,
+		int *thread_y,
+		int *thread_sample,
+		ccl_global SampleRange **thread_sample_range)
+{
+	/* order threads to maintain inner block coherency */
+	const int group_id = ccl_group_id(0) + ccl_num_groups(0) * ccl_group_id(1);
+	const int local_thread_id = ccl_local_id(0) + ccl_local_id(1) * ccl_local_size(0);
+
+	const int thread_id = group_id * (ccl_local_size(0) * ccl_local_size(1)) + local_thread_id;
+
+	/* find which sample range belongs to this thread */
+	ccl_global SampleRange* sample_range = NULL;
+
+	for(int i = 0; i < num_sample_ranges; i++) {
+		if(thread_id >= sample_ranges[i].work_offset &&
+		   thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
+		{
+			sample_range = &sample_ranges[i];
+		}
+	}
+
+	/* check if there's work for this thread */
+	if(!sample_range) {
+		return false;
+	}
+
+	int work_offset = thread_id - sample_range->work_offset;
+
+	if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
+		return false;
+	}
+
+	if(thread_sample_range) *thread_sample_range = sample_range;
+	if(thread_x) *thread_x = (work_offset % sample_range->w) + sample_range->x;
+	if(thread_y) *thread_y = (work_offset / sample_range->w) + sample_range->y;
+	if(thread_sample) *thread_sample = sample_range->sample;
+
+	return true;
+}
+
+CCL_NAMESPACE_END
+
diff --git a/intern/cycles/kernel/kernels/cuda/kernel.cu b/intern/cycles/kernel/kernels/cuda/kernel.cu
index 2c73ba4888..e90c2c902c 100644
--- a/intern/cycles/kernel/kernels/cuda/kernel.cu
+++ b/intern/cycles/kernel/kernels/cuda/kernel.cu
@@ -24,6 +24,7 @@
 #include "../../kernel_path.h"
 #include "../../kernel_path_branched.h"
 #include "../../kernel_bake.h"
+#include "../../kernel_sample_range.h"
 
 /* device data taken from CUDA occupancy calculator */
 
@@ -123,68 +124,37 @@
 
 /* kernels */
 extern "C" __global__ void
-kernel_cuda_set_sample_range(SampleRange *sample_ranges, int range, float *buffer, uint *rng_state, int sample, int sx, int sy, int sw, int sh, int offset, int stride)
+kernel_cuda_set_sample_range(
+	SampleRange *sample_ranges,
+	int range,
+	float *buffer,
+	uint *rng_state,
+	int sample,
+	int sx,
+	int sy,
+	int sw,
+	int sh,
+	int offset,
+	int stride)
 {
-	SampleRange* sample_range = &sample_ranges[range];
-
-	sample_range->buffer = buffer;
-	sample_range->rng_state = rng_state;
-	sample_range->sample = sample;
-	sample_range->x = sx;
-	sample_range->y = sy;
-	sample_range->w = sw;
-	sample_range->h = sh;
-	sample_range->offset = offset;
-	sample_range->stride = stride;
-
-	if(range == 0) {
-		sample_range->work_offset = 0;
-	}
-	else {
-		SampleRange* prev_range = &sample_ranges[range-1];
-		sample_range->work_offset = prev_range->work_offset + prev_range->w * prev_range->h;
-	}
+	kernel_set_sample_range(sample_ranges, range, buffer, rng_state, sample, sx, sy, sw, sh, offset, stride);
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
 kernel_cuda_path_trace(SampleRange *sample_ranges, int num_sample_ranges)
 {
-	/* order threads to maintain inner block coherency */
-	const int group_id = blockIdx.x + gridDim.x * blockIdx.y;
-	const int local_thread_id = threadIdx.x + threadIdx.y * blockDim.x;
-
-	const int thread_id = group_id * (blockDim.x * blockDim.x) + local_thread_id;
-
-	/* find which sample range belongs to this thread */
-	SampleRange* sample_range = NULL;
-
-	for(int i = 0; i < num_sample_ranges; i++) {
-		if(thread_id >= sample_ranges[i].work_offset &&
-		   thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
-		{
-			sample_range = &sample_ranges[i];
-		}
-	}
-
-	/* check if theres work for this thread */
-	if(!sample_range) {
-		return;
-	}
-
-	int work_offset = thread_id - sample_range->work_offset;
+	ccl_global SampleRange* sample_range;
+	int x, y, sample;
 
-	if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
+	if(!kernel_pixel_sample_for_thread(NULL, sample_ranges, num_sample_ranges, &x, &y, &sample, &sample_range)) {
 		return;
 	}
 
-	int x = (work_offset % sample_range->w) + sample_range->x;
-	int y = (work_offset / sample_range->w) + sample_range->y;
-
 	kernel_path_trace(NULL,
 	                  sample_range->buffer,
 	                  sample_range->rng_state,
-	                  sample_range->sample,
+	                  sample,
 	                  x, y,
 	                  sample_range->offset,
 	                  sample_range->stride);
@@ -195,41 +165,17 @@ extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_BRANCHED_MAX_REGISTERS)
 kernel_cuda_branched_path_trace(SampleRange *sample_ranges, int num_sample_ranges)
 {
-	/* order threads to maintain inner block coherency */
-	const int group_id = blockIdx.x + gridDim.x * blockIdx.y;
-	const int local_thread_id = threadIdx.x + threadIdx.y * blockDim.x;
-
-	const int thread_id = group_id * (blockDim.x * blockDim.x) + local_thread_id;
-
-	/* find which sample range belongs to this thread */
-	SampleRange* sample_range = NULL;
-
-	for(int i = 0; i < num_sample_ranges; i++) {
-		if(thread_id >= sample_ranges[i].work_offset &&
-		   thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
-		{
-			sample_range = &sample_ranges[i];
-		}
-	}
+	ccl_global SampleRange* sample_range;
+	int x, y, sample;
 
-	/* check if theres work for this thread */
-	if(!sample_range) {
+	if(!kernel_pixel_sample_for_thread(NULL, sample_ranges, num_sample_ranges, &x, &y, &sample, &sample_range)) {
 		return;
 	}
 
-	int work_offset = thread_id - sample_range->work_offset;
-
-	if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
-		return;
-	}
-
-	int x = (work_offset % sample_range->w) + sample_range->x;
-	int y = (work_offset / sample_range->w) + sample_range->y;
-
 	kernel_branched_path_trace(NULL,
 	                  sample_range->buffer,
 	                  sample_range->rng_state,
-	                  sample_range->sample,
+	                  sample,
 	                  x, y,
 	                  sample_range->offset,
 	                  sample_range->stride);
diff --git a/intern/cycles/kernel/kernels/opencl/kernel.cl b/intern/cycles/kernel/kernels/opencl/kernel.cl
index 1816d01d3b..77bf645ef4 100644
--- a/intern/cycles/kernel/kernels/opencl/kernel.cl
+++ b/intern/cycles/kernel/kernels/opencl/kernel.cl
@@ -45,6 +45,8 @@
 
 #include "../../kernel_bake.h"
 
+#include "../../kernel_sample_range.h"
+
 #ifdef __COMPILE_ONLY_MEGAKERNEL__
 
 __kernel void kernel_ocl_path_trace(
@@ -65,41 +67,17 @@ __kernel void kernel_ocl_path_trace(
 	kg->name = name;
 #include "../../kernel_textures.h"
 
-	/* order threads to maintain inner block coherency */
-	const int group_id = get_group_id(0) + get_num_groups(0) * get_group_id(1);
-	const int local_thread_id = get_local_id(0) + get_local_id(1) * get_local_size(0);
-
-	const int thread_id = group_id * (get_local_size(0) * get_local_size(1)) + local_thread_id;
-
-	/* find which sample range belongs to this thread */
-	ccl_global SampleRange* sample_range = NULL;
-
-	for(int i = 0; i < num_sample_ranges; i++) {
-		if(thread_id >= sample_ranges[i].work_offset &&
-		   thread_id < sample_ranges[i].work_offset + sample_ranges[i].w * sample_ranges[i].h)
-		{
-			sample_range = &sample_ranges[i];
-		}
-	}
-
-	/* check if theres work for this thread */
-	if(!sample_range) {
-		return;
-	}
-
-	int work_offset = thread_id - sample_range->work_offset;
+	ccl_global SampleRange* sample_range;
+	int x, y, sample;
 
-	if(work_offset < 0 || work_offset >= sample_range->w * sample_range->h) {
+	if(!kernel_pixel_sample_for_thread(kg, sample_ranges, num_sample_ranges, &x, &y, &sample, &sample_range)) {
 		return;
 	}
 
-	int x = (work_offset % sample_range->w) + sample_range->x;
-	int y = (work_offset / sample_range->w) + sample_range->y;
-
 	kernel_path_trace(kg,
 	                  sample_range->buffer,
 	                  sample_range->rng_state,
-	                  sampl

@@ Diff output truncated at 10240 characters. @@



