[Bf-blender-cvs] [e4e58d4] master: Fix T40370: cycles CUDA baking timeout with high number of AA samples.
Brecht Van Lommel
noreply at git.blender.org
Fri Jun 6 15:39:21 CEST 2014
Commit: e4e58d46128dc7fe4fb9b881d73b38173f00f5c3
Author: Brecht Van Lommel
Date: Fri Jun 6 14:40:09 2014 +0200
https://developer.blender.org/rBe4e58d46128dc7fe4fb9b881d73b38173f00f5c3
Fix T40370: cycles CUDA baking timeout with high number of AA samples.
Now baking does one AA sample at a time, just like final render. There is
also some code for shader antialiasing that solves T40369 but it is disabled
for now because there may be unpredictable side effects.
===================================================================
M intern/cycles/blender/blender_session.cpp
M intern/cycles/device/device_cpu.cpp
M intern/cycles/device/device_cuda.cpp
M intern/cycles/device/device_opencl.cpp
M intern/cycles/kernel/kernel.cl
M intern/cycles/kernel/kernel.cpp
M intern/cycles/kernel/kernel.cu
M intern/cycles/kernel/kernel.h
M intern/cycles/kernel/kernel_avx.cpp
M intern/cycles/kernel/kernel_bake.h
M intern/cycles/kernel/kernel_sse2.cpp
M intern/cycles/kernel/kernel_sse3.cpp
M intern/cycles/kernel/kernel_sse41.cpp
M intern/cycles/render/bake.cpp
M intern/cycles/render/bake.h
M intern/cycles/render/light.cpp
M intern/cycles/render/mesh_displace.cpp
===================================================================
diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 13d4041..0f31e55 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -492,26 +492,6 @@ static void populate_bake_data(BakeData *data, BL::BakePixel pixel_array, const
}
}
-static bool is_light_pass(ShaderEvalType type)
-{
- switch (type) {
- case SHADER_EVAL_AO:
- case SHADER_EVAL_COMBINED:
- case SHADER_EVAL_SHADOW:
- case SHADER_EVAL_DIFFUSE_DIRECT:
- case SHADER_EVAL_GLOSSY_DIRECT:
- case SHADER_EVAL_TRANSMISSION_DIRECT:
- case SHADER_EVAL_SUBSURFACE_DIRECT:
- case SHADER_EVAL_DIFFUSE_INDIRECT:
- case SHADER_EVAL_GLOSSY_INDIRECT:
- case SHADER_EVAL_TRANSMISSION_INDIRECT:
- case SHADER_EVAL_SUBSURFACE_INDIRECT:
- return true;
- default:
- return false;
- }
-}
-
void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::BakePixel pixel_array, int num_pixels, int depth, float result[])
{
ShaderEvalType shader_type = get_shader_type(pass_type);
@@ -529,7 +509,7 @@ void BlenderSession::bake(BL::Object b_object, const string& pass_type, BL::Bake
Pass::add(PASS_UV, scene->film->passes);
}
- if(is_light_pass(shader_type)) {
+ if(BakeManager::is_light_pass(shader_type)) {
/* force use_light_pass to be true */
Pass::add(PASS_LIGHT, scene->film->passes);
}
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index b0739dd..71bf2d2 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -393,7 +393,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
if(system_cpu_support_avx()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++)
+ kernel_cpu_avx_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -404,7 +405,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
if(system_cpu_support_sse41()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++)
+ kernel_cpu_sse41_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -415,7 +417,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
if(system_cpu_support_sse3()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++)
+ kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -426,7 +429,8 @@ public:
#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
if(system_cpu_support_sse2()) {
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++)
+ kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
@@ -436,7 +440,8 @@ public:
#endif
{
for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
- kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);
+ for(int sample = 0; sample < task.num_samples; sample++)
+ kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x, sample);
if(task.get_cancel() || task_pool.canceled())
break;
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 0429bfc..0aa09ac 100644
--- a/intern/cycles/device/device_cuda.cpp
+++ b/intern/cycles/device/device_cuda.cpp
@@ -764,40 +764,45 @@ public:
int shader_w = min(shader_chunk_size, end - shader_x);
- /* pass in parameters */
- int offset = 0;
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ /* pass in parameters */
+ int offset = 0;
- cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
- offset += sizeof(d_input);
+ cuda_assert(cuParamSetv(cuShader, offset, &d_input, sizeof(d_input)));
+ offset += sizeof(d_input);
- cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
- offset += sizeof(d_output);
+ cuda_assert(cuParamSetv(cuShader, offset, &d_output, sizeof(d_output)));
+ offset += sizeof(d_output);
- int shader_eval_type = task.shader_eval_type;
- offset = align_up(offset, __alignof(shader_eval_type));
+ int shader_eval_type = task.shader_eval_type;
+ offset = align_up(offset, __alignof(shader_eval_type));
- cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
- offset += sizeof(task.shader_eval_type);
+ cuda_assert(cuParamSeti(cuShader, offset, task.shader_eval_type));
+ offset += sizeof(task.shader_eval_type);
- cuda_assert(cuParamSeti(cuShader, offset, shader_x));
- offset += sizeof(shader_x);
+ cuda_assert(cuParamSeti(cuShader, offset, shader_x));
+ offset += sizeof(shader_x);
- cuda_assert(cuParamSeti(cuShader, offset, shader_w));
- offset += sizeof(shader_w);
+ cuda_assert(cuParamSeti(cuShader, offset, shader_w));
+ offset += sizeof(shader_w);
- cuda_assert(cuParamSetSize(cuShader, offset));
+ cuda_assert(cuParamSeti(cuShader, offset, sample));
+ offset += sizeof(sample);
- /* launch kernel */
- int threads_per_block;
- cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+ cuda_assert(cuParamSetSize(cuShader, offset));
- int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
+ /* launch kernel */
+ int threads_per_block;
+ cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
- cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
- cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
- cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+ int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
- cuda_assert(cuCtxSynchronize());
+ cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+ cuda_assert(cuFuncSetBlockShape(cuShader, threads_per_block, 1, 1));
+ cuda_assert(cuLaunchGrid(cuShader, xblocks, 1));
+
+ cuda_assert(cuCtxSynchronize());
+ }
}
cuda_pop_context();
diff --git a/intern/cycles/device/device_opencl.cpp b/intern/cycles/device/device_opencl.cpp
index f841dab..abfe445 100644
--- a/intern/cycles/device/device_opencl.cpp
+++ b/intern/cycles/device/device_opencl.cpp
@@ -1067,19 +1067,24 @@ public:
else
kernel = ckShaderKernel;
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+ for(int sample = 0; sample < task.num_samples; sample++) {
+ cl_int d_sample = task.sample;
+
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
#define KERNEL_TEX(type, ttype, name) \
- set_kernel_arg_mem(kernel, &narg, #name);
+ set_kernel_arg_mem(kernel, &narg, #name);
#include "kernel_textures.h"
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
- opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
+ opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
- enqueue_kernel(kernel, task.shader_w, 1);
+ enqueue_kernel(kernel, task.shader_w, 1);
+ }
}
void thread_run(DeviceTask *task)
diff --git a/intern/cycles/kernel/kernel.cl b/intern/cycles/kernel/kernel.cl
index d7d3438..2e0a494 100644
--- a/intern/cycles/kernel/kernel.cl
+++ b/intern/cycles/kernel/kernel.cl
@@ -115,7 +115,7 @@ __kernel void kernel_ocl_shader(
ccl_global type *name,
#include "kernel_textures.h"
- int type, int sx, int sw)
+ int type, int sx, int sw, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -128,7 +128,7 @@ __kernel void kernel_ocl_shader(
int x = sx + get_global_id(0);
if(x < sx + sw)
- kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x);
+ kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
}
__kernel void kernel_ocl_bake(
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_bake(
ccl_global type *name,
#include "kernel_textures.h"
- int type, int sx, int sw)
+ int type, int sx, int sw, int sample)
{
KernelGlobals kglobals, *kg = &kglobals;
@@ -153,6 +153,6 @@ __kernel void kernel_ocl_bake(
int x = sx + get_global_id(0);
if(x < sx + sw)
- kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x);
+ kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
}
diff --git a/intern/cycles/kernel/kernel.cpp b/
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list