[Bf-blender-cvs] [c4c450045d0] master: Code cleanup: tweak inlining for 2% better CUDA performance with hair.
Brecht Van Lommel
noreply at git.blender.org
Wed Sep 13 15:30:44 CEST 2017
Commit: c4c450045d072c79d02a1857f56ecf94689375c8
Author: Brecht Van Lommel
Date: Wed Sep 13 00:12:51 2017 +0200
Branches: master
https://developer.blender.org/rBc4c450045d072c79d02a1857f56ecf94689375c8
Code cleanup: tweak inlining for 2% better CUDA performance with hair.
===================================================================
M intern/cycles/kernel/kernel_path.h
===================================================================
diff --git a/intern/cycles/kernel/kernel_path.h b/intern/cycles/kernel/kernel_path.h
index bc8f8ec09f8..bfde96ec270 100644
--- a/intern/cycles/kernel/kernel_path.h
+++ b/intern/cycles/kernel/kernel_path.h
@@ -424,26 +424,18 @@ ccl_device void kernel_path_indirect(KernelGlobals *kg,
#endif /* defined(__BRANCHED_PATH__) || defined(__BAKING__) */
-ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
- uint rng_hash,
- int sample,
- Ray ray,
- ccl_global float *buffer,
- PathRadiance *L,
- bool *is_shadow_catcher)
+ccl_device_forceinline void kernel_path_integrate(
+ KernelGlobals *kg,
+ PathState *state,
+ float3 throughput,
+ Ray *ray,
+ PathRadiance *L,
+ float *buffer,
+ ShaderData *emission_sd,
+ bool *is_shadow_catcher)
{
- /* initialize */
- float3 throughput = make_float3(1.0f, 1.0f, 1.0f);
-
- path_radiance_init(L, kernel_data.film.use_light_pass);
-
- /* shader data memory used for both volumes and surfaces, saves stack space */
+ /* Shader data memory used for both volumes and surfaces, saves stack space. */
ShaderData sd;
- /* shader data used by emission, shadows, volume stacks */
- ShaderData emission_sd;
-
- PathState state;
- path_state_init(kg, &emission_sd, &state, rng_hash, sample, &ray);
#ifdef __SUBSURFACE__
SubsurfaceIndirectRays ss_indirect;
@@ -456,40 +448,40 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
for(;;) {
/* intersect scene */
Intersection isect;
- uint visibility = path_state_ray_visibility(kg, &state);
+ uint visibility = path_state_ray_visibility(kg, state);
#ifdef __HAIR__
float difl = 0.0f, extmax = 0.0f;
uint lcg_state = 0;
if(kernel_data.bvh.have_curves) {
- if((kernel_data.cam.resolution == 1) && (state.flag & PATH_RAY_CAMERA)) {
- float3 pixdiff = ray.dD.dx + ray.dD.dy;
- /*pixdiff = pixdiff - dot(pixdiff, ray.D)*ray.D;*/
+ if((kernel_data.cam.resolution == 1) && (state->flag & PATH_RAY_CAMERA)) {
+ float3 pixdiff = ray->dD.dx + ray->dD.dy;
+ /*pixdiff = pixdiff - dot(pixdiff, ray->D)*ray->D;*/
difl = kernel_data.curve.minimum_width * len(pixdiff) * 0.5f;
}
extmax = kernel_data.curve.maximum_width;
- lcg_state = lcg_state_init(&state, 0x51633e2d);
+ lcg_state = lcg_state_init(state, 0x51633e2d);
}
- if(path_state_ao_bounce(kg, &state)) {
+ if(path_state_ao_bounce(kg, state)) {
visibility = PATH_RAY_SHADOW;
- ray.t = kernel_data.background.ao_distance;
+ ray->t = kernel_data.background.ao_distance;
}
- bool hit = scene_intersect(kg, ray, visibility, &isect, &lcg_state, difl, extmax);
+ bool hit = scene_intersect(kg, *ray, visibility, &isect, &lcg_state, difl, extmax);
#else
if(path_state_ao_bounce(kg, state)) {
visibility = PATH_RAY_SHADOW;
- ray.t = kernel_data.background.ao_distance;
+ ray->t = kernel_data.background.ao_distance;
}
bool hit = scene_intersect(kg, ray, visibility, &isect, NULL, 0.0f, 0.0f);
#endif /* __HAIR__ */
#ifdef __KERNEL_DEBUG__
- if(state.flag & PATH_RAY_CAMERA) {
+ if(state->flag & PATH_RAY_CAMERA) {
L->debug_data.num_bvh_traversed_nodes += isect.num_traversed_nodes;
L->debug_data.num_bvh_traversed_instances += isect.num_traversed_instances;
L->debug_data.num_bvh_intersections += isect.num_intersections;
@@ -498,40 +490,40 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
#endif /* __KERNEL_DEBUG__ */
#ifdef __LAMP_MIS__
- if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+ if(kernel_data.integrator.use_lamp_mis && !(state->flag & PATH_RAY_CAMERA)) {
/* ray starting from previous non-transparent bounce */
Ray light_ray;
- light_ray.P = ray.P - state.ray_t*ray.D;
- state.ray_t += isect.t;
- light_ray.D = ray.D;
- light_ray.t = state.ray_t;
- light_ray.time = ray.time;
- light_ray.dD = ray.dD;
- light_ray.dP = ray.dP;
+ light_ray.P = ray->P - state->ray_t*ray->D;
+ state->ray_t += isect.t;
+ light_ray.D = ray->D;
+ light_ray.t = state->ray_t;
+ light_ray.time = ray->time;
+ light_ray.dD = ray->dD;
+ light_ray.dP = ray->dP;
/* intersect with lamp */
float3 emission;
- if(indirect_lamp_emission(kg, &emission_sd, &state, &light_ray, &emission))
- path_radiance_accum_emission(L, throughput, emission, state.bounce);
+ if(indirect_lamp_emission(kg, emission_sd, state, &light_ray, &emission))
+ path_radiance_accum_emission(L, throughput, emission, state->bounce);
}
#endif /* __LAMP_MIS__ */
#ifdef __VOLUME__
/* Sanitize volume stack. */
if(!hit) {
- kernel_volume_clean_stack(kg, state.volume_stack);
+ kernel_volume_clean_stack(kg, state->volume_stack);
}
/* volume attenuation, emission, scatter */
- if(state.volume_stack[0].shader != SHADER_NONE) {
- Ray volume_ray = ray;
+ if(state->volume_stack[0].shader != SHADER_NONE) {
+ Ray volume_ray = *ray;
volume_ray.t = (hit)? isect.t: FLT_MAX;
- bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+ bool heterogeneous = volume_stack_is_heterogeneous(kg, state->volume_stack);
# ifdef __VOLUME_DECOUPLED__
- int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+ int sampling_method = volume_stack_sampling_method(kg, state->volume_stack);
bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
if(decoupled) {
@@ -539,14 +531,14 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
VolumeSegment volume_segment;
shader_setup_from_volume(kg, &sd, &volume_ray);
- kernel_volume_decoupled_record(kg, &state,
+ kernel_volume_decoupled_record(kg, state,
&volume_ray, &sd, &volume_segment, heterogeneous);
volume_segment.sampling_method = sampling_method;
/* emission */
if(volume_segment.closure_flag & SD_EMISSION)
- path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state.bounce);
+ path_radiance_accum_emission(L, throughput, volume_segment.accum_emission, state->bounce);
/* scattering */
VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
@@ -556,17 +548,17 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
/* direct light sampling */
kernel_branched_path_volume_connect_light(kg, &sd,
- &emission_sd, throughput, &state, L, all,
+ emission_sd, throughput, state, L, all,
&volume_ray, &volume_segment);
/* indirect sample. if we use distance sampling and take just
* one sample for direct and indirect light, we could share
* this computation, but makes code a bit complex */
- float rphase = path_state_rng_1D_for_decision(kg, &state, PRNG_PHASE);
- float rscatter = path_state_rng_1D_for_decision(kg, &state, PRNG_SCATTER_DISTANCE);
+ float rphase = path_state_rng_1D_for_decision(kg, state, PRNG_PHASE);
+ float rscatter = path_state_rng_1D_for_decision(kg, state, PRNG_SCATTER_DISTANCE);
result = kernel_volume_decoupled_scatter(kg,
- &state, &volume_ray, &sd, &throughput,
+ state, &volume_ray, &sd, &throughput,
rphase, rscatter, &volume_segment, NULL, true);
}
@@ -574,7 +566,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
kernel_volume_decoupled_free(kg, &volume_segment);
if(result == VOLUME_PATH_SCATTERED) {
- if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray))
+ if(kernel_path_volume_bounce(kg, &sd, &throughput, state, L, ray))
continue;
else
break;
@@ -588,15 +580,15 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
{
/* integrate along volume segment with distance sampling */
VolumeIntegrateResult result = kernel_volume_integrate(
- kg, &state, &sd, &volume_ray, L, &throughput, heterogeneous);
+ kg, state, &sd, &volume_ray, L, &throughput, heterogeneous);
# ifdef __VOLUME_SCATTER__
if(result == VOLUME_PATH_SCATTERED) {
/* direct lighting */
- kernel_path_volume_connect_light(kg, &sd, &emission_sd, throughput, &state, L);
+ kernel_path_volume_connect_light(kg, &sd, emission_sd, throughput, state, L);
/* indirect light bounce */
- if(kernel_path_volume_bounce(kg, &sd, &throughput, &state, L, &ray))
+ if(kernel_path_volume_bounce(kg, &sd, &throughput, state, L, ray))
continue;
else
break;
@@ -608,7 +600,7 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
if(!hit) {
/* eval background shader if nothing hit */
- if(kernel_data.background.transparent && (state.flag & PATH_RAY_CAMERA)) {
+ if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
L->transparent += average(throughput);
#ifdef __PASSES__
@@ -619,35 +611,35 @@ ccl_device_inline void kernel_path_integrate(KernelGlobals *kg,
#ifdef __BACKGROUND__
/* sample background shader */
- float3 L_background = indirect_background(kg, &emission_sd, &state, &ray);
- path_radiance_accum_background(L, &state, throughput, L_background);
+ float3 L_background = indirect_background(kg, emission_sd, state, ray);
+ path_radiance_accum_background(L, state, throughput, L_background);
#endif /* __BACKGROUND__ */
break;
}
- else if(path_state_ao_bounce(kg, &state)) {
+ else if(path_state_ao_bounce(kg, state)) {
break;
}
/* setup shading */
- shader_setup_from_ray(kg, &sd, &isect, &ray);
- float rbsdf = path_state_rng_1D_for_decision(kg, &state, PRNG_BSDF);
- shader_eval_surface(kg, &sd, &state, rbsdf, state.flag);
+ shader_setup_from_ray(kg, &sd, &isect, ray);
+ float rbsdf = path_state_rng_1D_for_decision(kg, state, PRNG_BSDF);
+ shader_eval_surface(kg, &sd, state, rbsdf, state->flag);
#ifdef __SHADOW_TRICKS__
if((sd.object_flag & SD_OBJECT_SHADOW_CATCHER)) {
- if(state.flag & PATH_RAY_CAMERA) {
- state.flag |= (PATH_RAY_SHADOW_CATCHER |
+ if(state->flag & PATH_RAY_CA
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list