[Bf-blender-cvs] [8d3cc43] master: Fix T41471 Cycles Bake: Setting small tile size results in wrong bake with stripes rather than the expected noise pattern

Tue Aug 19 11:43:13 CEST 2014

Commit: 8d3cc431d7fdcc9f3243cc24dfdcb94124be0993
Author: Dalai Felinto
Date:   Tue Aug 19 11:39:40 2014 +0200
Branches: master
https://developer.blender.org/rB8d3cc431d7fdcc9f3243cc24dfdcb94124be0993

Fix T41471 Cycles Bake: Setting small tile size results in wrong bake with stripes rather than the expected noise pattern

This problem was introduced in 983cbafd1877f8dbaae60b064a14e27b5b640f18
Basically the issue is that we were not getting a unique index in the
baking routine for the RNG (random number generator).

Reviewers: sergey

Differential Revision: https://developer.blender.org/D749

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_opencl.cpp
M	intern/cycles/kernel/kernel.cl
M	intern/cycles/kernel/kernel.cpp
M	intern/cycles/kernel/kernel.cu
M	intern/cycles/kernel/kernel.h
M	intern/cycles/kernel/kernel_avx.cpp
M	intern/cycles/kernel/kernel_avx2.cpp
M	intern/cycles/kernel/kernel_bake.h
M	intern/cycles/kernel/kernel_sse2.cpp
M	intern/cycles/kernel/kernel_sse3.cpp
M	intern/cycles/kernel/kernel_sse41.cpp
M	intern/cycles/render/bake.cpp

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 4fdeef6..fd5ae1d 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -435,7 +435,8 @@ public:
 		if(system_cpu_support_avx2()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_avx2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -449,7 +450,8 @@ public:
 		if(system_cpu_support_avx()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -463,7 +465,8 @@ public:
 		if(system_cpu_support_sse41()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -477,7 +480,8 @@ public:
 		if(system_cpu_support_sse3()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -491,7 +495,8 @@ public:
 		if(system_cpu_support_sse2()) {
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
@@ -504,7 +509,8 @@ public:
 		{
 			for(int sample = 0; sample < task.num_samples; sample++) {
 				for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
+					kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
+					    task.shader_eval_type, x, task.offset, sample);
 
 				if(task.get_cancel() || task_pool.canceled())
 					break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 6629069..9e3d703 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -676,6 +676,7 @@ public:
 		const int shader_chunk_size = 65536;
 		const int start = task.shader_x;
 		const int end = task.shader_x + task.shader_w;
+		int offset = task.offset;
 
 		bool canceled = false;
 		for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
@@ -688,6 +689,7 @@ public:
 								 &task.shader_eval_type,
 								 &shader_x,
 								 &shader_w,
+								 &offset,
 								 &sample};
 
 				/* launch kernel */
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index 077ff9d..82419cd 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1004,6 +1004,7 @@ public:
 		cl_int d_shader_eval_type = task.shader_eval_type;
 		cl_int d_shader_x = task.shader_x;
 		cl_int d_shader_w = task.shader_w;
+		cl_int d_offset = task.offset;
 
 		/* sample arguments */
 		cl_uint narg = 0;
@@ -1033,6 +1034,7 @@ public:
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
 			opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
 
 			enqueue_kernel(kernel, task.shader_w, 1);
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index 2e0a494..4f20ef9 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw, int sample)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_bake(
 	ccl_global type *name,
 #include "kernel_textures.h"
 
-	int type, int sx, int sw, int sample)
+	int type, int sx, int sw, int offset, int sample)
 {
 	KernelGlobals kglobals, *kg = &kglobals;
 
@@ -153,6 +153,6 @@ __kernel void kernel_ocl_bake(
 	int x = sx + get_global_id(0);
 
 	if(x < sx + sw)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
diff --git a/intern/cycles/kernel/kernel.cpp b/intern/cycles/kernel/kernel.cpp
index 42eb9a6..fa2113f 100644
--- a/intern/cycles/kernel/kernel.cpp
+++ b/intern/cycles/kernel/kernel.cpp
@@ -120,10 +120,10 @@ void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *bu
 
 /* Shader Evaluation */
 
-void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int sample)
+void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int i, int offset, int sample)
 {
 	if(type >= SHADER_EVAL_BAKE)
-		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
+		kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, i, offset, sample);
 	else
 		kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, i, sample);
 }
diff --git a/intern/cycles/kernel/kernel.cu b/intern/cycles/kernel/kernel.cu
index 9208acc..23f56ee 100644
--- a/intern/cycles/kernel/kernel.cu
+++ b/intern/cycles/kernel/kernel.cu
@@ -156,12 +156,12 @@ kernel_cuda_shader(uint4 *input, float4 *output, int type, int sx, int sw, int s
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int sample)
+kernel_cuda_bake(uint4 *input, float4 *output, int type, int sx, int sw, int offset, int sample)
 {
 	int x = sx + blockDim.x*blockIdx.x + threadIdx.x;
 
 	if(x < sx + sw)
-		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, sample);
+		kernel_bake_evaluate(NULL, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
 #endif
diff --git a/intern/cycles/kernel/kernel.h b/intern/cycles/kernel/kernel.h
index 264e5e3..19e06b8 100644
--- a/intern/cycles/kernel/kernel.h
+++ b/intern/cycles/kernel/kernel.h
@@ -41,7 +41,7 @@ void kernel_cpu_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 void kernel_cpu_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 void kernel_cpu_sse2_path_trace(KernelGlobals *kg, float *buffer, unsigned int *rng_state,
@@ -51,7 +51,7 @@ void kernel_cpu_sse2_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse2_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse2_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
@@ -62,7 +62,7 @@ void kernel_cpu_sse3_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buf
 void kernel_cpu_sse3_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse3_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
@@ -73,7 +73,7 @@ void kernel_cpu_sse41_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *bu
 void kernel_cpu_sse41_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_sse41_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
@@ -84,7 +84,7 @@ void kernel_cpu_avx_convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buff
 void kernel_cpu_avx_convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer,
 	float sample_scale, int x, int y, int offset, int stride);
 void kernel_cpu_avx_shader(KernelGlobals *kg, uint4 *input, float4 *output,
-	int type, int i, int sample);
+	int type, int i, int offset, int sample);
 #endif
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
@@ -95,7 +95,7 @@ void kernel_cpu_avx2_convert_to_byte(KernelGloba

@@ Diff output truncated at 10240 characters. @@