[Bf-blender-cvs] [3918c8b] master: Cycles: Optionally output luminance from the shader evaluation kernel

Sergey Sharybin noreply at git.blender.org
Wed Dec 30 15:04:21 CET 2015


Commit: 3918c8b9a52ae9dcdb0488df92d7d3ca615be8c7
Author: Sergey Sharybin
Date:   Wed Dec 30 19:04:01 2015 +0500
Branches: master
https://developer.blender.org/rB3918c8b9a52ae9dcdb0488df92d7d3ca615be8c7

Cycles: Optionally output luminance from the shader evaluation kernel

This makes it possible to move some parts of evaluation from host to the device
and hopefully reduce memory usage by avoid having full RGBA buffer on the host.

Reviewers: juicyfruit, lukasstockner97, brecht

Reviewed By: lukasstockner97, brecht

Differential Revision: https://developer.blender.org/D1702

===================================================================

M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_cuda.cpp
M	intern/cycles/device/device_multi.cpp
M	intern/cycles/device/device_network.cpp
M	intern/cycles/device/device_network.h
M	intern/cycles/device/device_opencl.cpp
M	intern/cycles/device/device_task.cpp
M	intern/cycles/device/device_task.h
M	intern/cycles/kernel/kernel_bake.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu.h
M	intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
M	intern/cycles/kernel/kernels/cuda/kernel.cu
M	intern/cycles/kernel/kernels/opencl/kernel.cl

===================================================================

diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index f06963c..832f4d1 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -343,7 +343,7 @@ public:
 #ifdef WITH_OSL
 		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
 #endif
-		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, int, int, int, int);
+		void(*shader_kernel)(KernelGlobals*, uint4*, float4*, float*, int, int, int, int);
 
 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 		if(system_cpu_support_avx2())
@@ -374,8 +374,14 @@ public:
 
 		for(int sample = 0; sample < task.num_samples; sample++) {
 			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
-				shader_kernel(&kg, (uint4*)task.shader_input, (float4*)task.shader_output,
-					task.shader_eval_type, x, task.offset, sample);
+				shader_kernel(&kg,
+				              (uint4*)task.shader_input,
+				              (float4*)task.shader_output,
+				              (float*)task.shader_output_luma,
+				              task.shader_eval_type,
+				              x,
+				              task.offset,
+				              sample);
 
 			if(task.get_cancel() || task_pool.canceled())
 				break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index d9d6fd7..5c9ca34 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -726,6 +726,7 @@ public:
 		CUfunction cuShader;
 		CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
 		CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+		CUdeviceptr d_output_luma = cuda_device_ptr(task.shader_output_luma);
 
 		/* get kernel function */
 		if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
@@ -747,13 +748,18 @@ public:
 				int shader_w = min(shader_chunk_size, end - shader_x);
 
 				/* pass in parameters */
-				void *args[] = {&d_input,
-								 &d_output,
-								 &task.shader_eval_type,
-								 &shader_x,
-								 &shader_w,
-								 &offset,
-								 &sample};
+				void *args[8];
+				int arg = 0;
+				args[arg++] = &d_input;
+				args[arg++] = &d_output;
+				if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+					args[arg++] = &d_output_luma;
+				}
+				args[arg++] = &task.shader_eval_type;
+				args[arg++] = &shader_x;
+				args[arg++] = &shader_w;
+				args[arg++] = &offset;
+				args[arg++] = &sample;
 
 				/* launch kernel */
 				int threads_per_block;
diff --git a/intern/cycles/device/device_multi.cpp b/intern/cycles/device/device_multi.cpp
index 8fb841b..069305e 100644
--- a/intern/cycles/device/device_multi.cpp
+++ b/intern/cycles/device/device_multi.cpp
@@ -316,6 +316,7 @@ public:
 				if(task.rgba_half) subtask.rgba_half = sub.ptr_map[task.rgba_half];
 				if(task.shader_input) subtask.shader_input = sub.ptr_map[task.shader_input];
 				if(task.shader_output) subtask.shader_output = sub.ptr_map[task.shader_output];
+				if(task.shader_output_luma) subtask.shader_output_luma = sub.ptr_map[task.shader_output_luma];
 
 				sub.device->task_add(subtask);
 			}
diff --git a/intern/cycles/device/device_network.cpp b/intern/cycles/device/device_network.cpp
index afa3522..23faa61 100644
--- a/intern/cycles/device/device_network.cpp
+++ b/intern/cycles/device/device_network.cpp
@@ -648,6 +648,9 @@ protected:
 			if(task.shader_output)
 				task.shader_output = device_ptr_from_client_pointer(task.shader_output);
 
+			if(task.shader_output)luma)
+				task.shader_output_luma = device_ptr_from_client_pointer(task.shader_output_luma);
+
 
 			task.acquire_tile = function_bind(&DeviceServer::task_acquire_tile, this, _1, _2);
 			task.release_tile = function_bind(&DeviceServer::task_release_tile, this, _1);
diff --git a/intern/cycles/device/device_network.h b/intern/cycles/device/device_network.h
index 2e751f6..60ecc1d 100644
--- a/intern/cycles/device/device_network.h
+++ b/intern/cycles/device/device_network.h
@@ -132,7 +132,7 @@ public:
 		archive & type & task.x & task.y & task.w & task.h;
 		archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		archive & task.offset & task.stride;
-		archive & task.shader_input & task.shader_output & task.shader_eval_type;
+		archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
 		archive & task.shader_x & task.shader_w;
 		archive & task.need_finish_queue;
 	}
@@ -291,7 +291,7 @@ public:
 		*archive & type & task.x & task.y & task.w & task.h;
 		*archive & task.rgba_byte & task.rgba_half & task.buffer & task.sample & task.num_samples;
 		*archive & task.offset & task.stride;
-		*archive & task.shader_input & task.shader_output & task.shader_eval_type;
+		*archive & task.shader_input & task.shader_output & task.shader_output_luma & task.shader_eval_type;
 		*archive & task.shader_x & task.shader_w;
 		*archive & task.need_finish_queue;
 
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index e0c6024..a1743f5 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1304,6 +1304,7 @@ public:
 		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
 		cl_mem d_input = CL_MEM_PTR(task.shader_input);
 		cl_mem d_output = CL_MEM_PTR(task.shader_output);
+		cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
 		cl_int d_shader_eval_type = task.shader_eval_type;
 		cl_int d_shader_x = task.shader_x;
 		cl_int d_shader_w = task.shader_w;
@@ -1330,6 +1331,12 @@ public:
 				                d_input,
 				                d_output);
 
+		if(task.shader_eval_type < SHADER_EVAL_BAKE) {
+			start_arg_index += kernel_set_args(kernel,
+			                                   start_arg_index,
+			                                   d_output_luma);
+		}
+
 #define KERNEL_TEX(type, ttype, name) \
 		set_kernel_arg_mem(kernel, &start_arg_index, #name);
 #include "kernel_textures.h"
diff --git a/intern/cycles/device/device_task.cpp b/intern/cycles/device/device_task.cpp
index d527540..0cae118 100644
--- a/intern/cycles/device/device_task.cpp
+++ b/intern/cycles/device/device_task.cpp
@@ -29,7 +29,7 @@ CCL_NAMESPACE_BEGIN
 DeviceTask::DeviceTask(Type type_)
 : type(type_), x(0), y(0), w(0), h(0), rgba_byte(0), rgba_half(0), buffer(0),
   sample(0), num_samples(1),
-  shader_input(0), shader_output(0),
+  shader_input(0), shader_output(0), shader_output_luma(0),
   shader_eval_type(0), shader_x(0), shader_w(0)
 {
 	last_update_time = time_dt();
diff --git a/intern/cycles/device/device_task.h b/intern/cycles/device/device_task.h
index 834ea60..7654508 100644
--- a/intern/cycles/device/device_task.h
+++ b/intern/cycles/device/device_task.h
@@ -46,7 +46,7 @@ public:
 	int offset, stride;
 
 	device_ptr shader_input;
-	device_ptr shader_output;
+	device_ptr shader_output, shader_output_luma;
 	int shader_eval_type;
 	int shader_x, shader_w;
 
diff --git a/intern/cycles/kernel/kernel_bake.h b/intern/cycles/kernel/kernel_bake.h
index 715c11c..b54afbd 100644
--- a/intern/cycles/kernel/kernel_bake.h
+++ b/intern/cycles/kernel/kernel_bake.h
@@ -453,7 +453,13 @@ ccl_device void kernel_bake_evaluate(KernelGlobals *kg, ccl_global uint4 *input,
 		output[i] += make_float4(out.x, out.y, out.z, 1.0f) * output_fac;
 }
 
-ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *input, ccl_global float4 *output, ShaderEvalType type, int i, int sample)
+ccl_device void kernel_shader_evaluate(KernelGlobals *kg,
+                                       ccl_global uint4 *input,
+                                       ccl_global float4 *output,
+                                       ccl_global float *output_luma,
+                                       ShaderEvalType type,
+                                       int i,
+                                       int sample)
 {
 	ShaderData sd;
 	uint4 in = input[i];
@@ -500,10 +506,22 @@ ccl_device void kernel_shader_evaluate(KernelGlobals *kg, ccl_global uint4 *inpu
 	}
 	
 	/* write output */
-	if(sample == 0)
-		output[i] = make_float4(out.x, out.y, out.z, 0.0f);
-	else
-		output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+	if(sample == 0) {
+		if(output != NULL) {
+			output[i] = make_float4(out.x, out.y, out.z, 0.0f);
+		}
+		if(output_luma != NULL) {
+			output_luma[i] = average(out);
+		}
+	}
+	else {
+		if(output != NULL) {
+			output[i] += make_float4(out.x, out.y, out.z, 0.0f);
+		}
+		if(output_luma != NULL) {
+			output_luma[i] += average(out);
+		}
+	}
 }
 
 CCL_NAMESPACE_END
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
index 2560c6d..1ce1e41 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu.h
@@ -42,6 +42,7 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
diff --git a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
index 693285e..0249610 100644
--- a/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
+++ b/intern/cycles/kernel/kernels/cpu/kernel_cpu_impl.h
@@ -99,12 +99,14 @@ void KERNEL_FUNCTION_FULL_NAME(convert_to_half_float)(KernelGlobals *kg,
 void KERNEL_FUNCTION_FULL_NAME(shader)(KernelGlobals *kg,
                                        uint4 *input,
                                        float4 *output,
+                                       float *output_luma,
                                        int type,
                                        int i,
                                        int offset,
                                        int sample)
 {
 	if(type >= SHADER_EVAL_BAKE) {
+		kernel_assert(output_luma == NULL);
 		kernel_bake_evaluate(kg,
 		                     input,
 		                     output,
@@ -117,6 +119,7 @@ void KERNEL_FUNCTION

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list