[Bf-committers] [Bf-blender-cvs] [ae7d84d] master: Cycles: Use native saturate function for CUDA

Sergey Sharybin sergey.vfx at gmail.com
Mon Apr 27 21:50:35 CEST 2015


Eeeh, the original author was lost when I amended the commit in order to add
more info about the benchmark. Sorry about that; credit goes to lockal
(Sv. Lockal).

On Tue, Apr 28, 2015 at 12:48 AM, Sergey Sharybin <noreply at git.blender.org>
wrote:

> Commit: ae7d84dbc1936ef7ddd00f9c22d074389f97f04f
> Author: Sergey Sharybin
> Date:   Tue Apr 28 00:13:03 2015 +0500
> Branches: master
> https://developer.blender.org/rBae7d84dbc1936ef7ddd00f9c22d074389f97f04f
>
> Cycles: Use native saturate function for CUDA
>
> This is more of a workaround for the CUDA optimizer, which can't optimize
> clamp(x, 0, 1) into a single instruction and uses 4 instructions instead.
>
> Original patch by @lockal with own modification:
>
>   Don't make changes outside of the kernel. They don't make any difference
>   anyway, and the term saturate() has a slightly different meaning outside
>   of the kernel.
>
> This gives around a 2% speedup in the Barcelona file, but in more complex
> shader setups with lots of math nodes that clamp, the speedup could be
> much larger.
>
> Subscribers: dingto
>
> Projects: #cycles
>
> Differential Revision: https://developer.blender.org/D1224
>
> ===================================================================
>
> M       intern/cycles/kernel/closure/bsdf_microfacet.h
> M       intern/cycles/kernel/closure/bsdf_oren_nayar.h
> M       intern/cycles/kernel/closure/bsdf_toon.h
> M       intern/cycles/kernel/closure/bssrdf.h
> M       intern/cycles/kernel/geom/geom_curve.h
> M       intern/cycles/kernel/kernel_film.h
> M       intern/cycles/kernel/kernel_globals.h
> M       intern/cycles/kernel/kernel_passes.h
> M       intern/cycles/kernel/svm/svm_brick.h
> M       intern/cycles/kernel/svm/svm_closure.h
> M       intern/cycles/kernel/svm/svm_gradient.h
> M       intern/cycles/kernel/svm/svm_image.h
> M       intern/cycles/kernel/svm/svm_math_util.h
> M       intern/cycles/kernel/svm/svm_mix.h
> M       intern/cycles/kernel/svm/svm_musgrave.h
> M       intern/cycles/kernel/svm/svm_ramp.h
> M       intern/cycles/render/buffers.cpp
> M       intern/cycles/render/nodes.cpp
> M       intern/cycles/util/util_math.h
>
> ===================================================================
>
> diff --git a/intern/cycles/kernel/closure/bsdf_microfacet.h
> b/intern/cycles/kernel/closure/bsdf_microfacet.h
> index ca68b1e..6a50bbe 100644
> --- a/intern/cycles/kernel/closure/bsdf_microfacet.h
> +++ b/intern/cycles/kernel/closure/bsdf_microfacet.h
> @@ -235,7 +235,7 @@ ccl_device_inline float3 microfacet_sample_stretched(
>
>  ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure *sc)
>  {
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
> +       sc->data0 = saturate(sc->data0); /* alpha_x */
>         sc->data1 = sc->data0; /* alpha_y */
>
>         sc->type = CLOSURE_BSDF_MICROFACET_GGX_ID;
> @@ -245,8 +245,8 @@ ccl_device int bsdf_microfacet_ggx_setup(ShaderClosure
> *sc)
>
>  ccl_device int bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
>  {
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
> -       sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
> +       sc->data0 = saturate(sc->data0); /* alpha_x */
> +       sc->data1 = saturate(sc->data1); /* alpha_y */
>
>         sc->type = CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID;
>
> @@ -255,7 +255,7 @@ ccl_device int
> bsdf_microfacet_ggx_aniso_setup(ShaderClosure *sc)
>
>  ccl_device int bsdf_microfacet_ggx_refraction_setup(ShaderClosure *sc)
>  {
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
> +       sc->data0 = saturate(sc->data0); /* alpha_x */
>         sc->data1 = sc->data0; /* alpha_y */
>
>         sc->type = CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID;
> @@ -588,7 +588,7 @@ ccl_device int
> bsdf_microfacet_ggx_sample(KernelGlobals *kg, const ShaderClosure
>
>  ccl_device int bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
>  {
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
> +       sc->data0 = saturate(sc->data0); /* alpha_x */
>         sc->data1 = sc->data0; /* alpha_y */
>
>         sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ID;
> @@ -597,8 +597,8 @@ ccl_device int
> bsdf_microfacet_beckmann_setup(ShaderClosure *sc)
>
>  ccl_device int bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
>  {
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
> -       sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* alpha_y */
> +       sc->data0 = saturate(sc->data0); /* alpha_x */
> +       sc->data1 = saturate(sc->data1); /* alpha_y */
>
>         sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID;
>         return SD_BSDF|SD_BSDF_HAS_EVAL;
> @@ -606,7 +606,7 @@ ccl_device int
> bsdf_microfacet_beckmann_aniso_setup(ShaderClosure *sc)
>
>  ccl_device int bsdf_microfacet_beckmann_refraction_setup(ShaderClosure
> *sc)
>  {
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f); /* alpha_x */
> +       sc->data0 = saturate(sc->data0); /* alpha_x */
>         sc->data1 = sc->data0; /* alpha_y */
>
>         sc->type = CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID;
> diff --git a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
> b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
> index c476d4c..61b7cb1 100644
> --- a/intern/cycles/kernel/closure/bsdf_oren_nayar.h
> +++ b/intern/cycles/kernel/closure/bsdf_oren_nayar.h
> @@ -37,7 +37,7 @@ ccl_device int bsdf_oren_nayar_setup(ShaderClosure *sc)
>
>         sc->type = CLOSURE_BSDF_OREN_NAYAR_ID;
>
> -       sigma = clamp(sigma, 0.0f, 1.0f);
> +       sigma = saturate(sigma);
>
>         float div = 1.0f / (M_PI_F + ((3.0f * M_PI_F - 4.0f) / 6.0f) *
> sigma);
>
> diff --git a/intern/cycles/kernel/closure/bsdf_toon.h
> b/intern/cycles/kernel/closure/bsdf_toon.h
> index df03942..e5b6ab9 100644
> --- a/intern/cycles/kernel/closure/bsdf_toon.h
> +++ b/intern/cycles/kernel/closure/bsdf_toon.h
> @@ -40,8 +40,8 @@ CCL_NAMESPACE_BEGIN
>  ccl_device int bsdf_diffuse_toon_setup(ShaderClosure *sc)
>  {
>         sc->type = CLOSURE_BSDF_DIFFUSE_TOON_ID;
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
> -       sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
> +       sc->data0 = saturate(sc->data0);
> +       sc->data1 = saturate(sc->data1);
>
>         return SD_BSDF|SD_BSDF_HAS_EVAL;
>  }
> @@ -120,8 +120,8 @@ ccl_device int bsdf_diffuse_toon_sample(const
> ShaderClosure *sc, float3 Ng, floa
>  ccl_device int bsdf_glossy_toon_setup(ShaderClosure *sc)
>  {
>         sc->type = CLOSURE_BSDF_GLOSSY_TOON_ID;
> -       sc->data0 = clamp(sc->data0, 0.0f, 1.0f);
> -       sc->data1 = clamp(sc->data1, 0.0f, 1.0f);
> +       sc->data0 = saturate(sc->data0);
> +       sc->data1 = saturate(sc->data1);
>
>         return SD_BSDF|SD_BSDF_HAS_EVAL;
>  }
> diff --git a/intern/cycles/kernel/closure/bssrdf.h
> b/intern/cycles/kernel/closure/bssrdf.h
> index b284826..f817dcd 100644
> --- a/intern/cycles/kernel/closure/bssrdf.h
> +++ b/intern/cycles/kernel/closure/bssrdf.h
> @@ -30,8 +30,8 @@ ccl_device int bssrdf_setup(ShaderClosure *sc,
> ClosureType type)
>                 return flag;
>         }
>         else {
> -               sc->data1 = clamp(sc->data1, 0.0f, 1.0f); /* texture blur
> */
> -               sc->T.x = clamp(sc->T.x, 0.0f, 1.0f); /* sharpness */
> +               sc->data1 = saturate(sc->data1); /* texture blur */
> +               sc->T.x = saturate(sc->T.x); /* sharpness */
>                 sc->type = type;
>
>                 return SD_BSDF|SD_BSDF_HAS_EVAL|SD_BSSRDF;
> @@ -168,7 +168,7 @@ ccl_device float bssrdf_cubic_quintic_root_find(float
> xi)
>                 if(fabsf(f) < tolerance || f_ == 0.0f)
>                         break;
>
> -               x = clamp(x - f/f_, 0.0f, 1.0f);
> +               x = saturate(x - f/f_);
>         }
>
>         return x;
> diff --git a/intern/cycles/kernel/geom/geom_curve.h
> b/intern/cycles/kernel/geom/geom_curve.h
> index c13858f..ec6c790 100644
> --- a/intern/cycles/kernel/geom/geom_curve.h
> +++ b/intern/cycles/kernel/geom/geom_curve.h
> @@ -465,7 +465,7 @@ ccl_device_inline bool
> bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
>                                         continue;
>                                 }
>                                 w = -(p_st.x * tg.x + p_st.y * tg.y) / w;
> -                               w = clamp((float)w, 0.0f, 1.0f);
> +                               w = saturate(w);
>
>                                 /* compute u on the curve segment */
>                                 u = i_st * (1 - w) + i_en * w;
> @@ -577,7 +577,7 @@ ccl_device_inline bool
> bvh_cardinal_curve_intersect(KernelGlobals *kg, Intersect
>                                 }
>
>                                 float w = (zcentre + (tg.z * correction))
> * invl;
> -                               w = clamp((float)w, 0.0f, 1.0f);
> +                               w = saturate(w);
>                                 /* compute u on the curve segment */
>                                 u = i_st * (1 - w) + i_en * w;
>
> diff --git a/intern/cycles/kernel/kernel_film.h
> b/intern/cycles/kernel/kernel_film.h
> index 4668b40..f9e9b41 100644
> --- a/intern/cycles/kernel/kernel_film.h
> +++ b/intern/cycles/kernel/kernel_film.h
> @@ -27,7 +27,7 @@ ccl_device float4 film_map(KernelGlobals *kg, float4
> irradiance, float scale)
>         result.z = color_scene_linear_to_srgb(result.z*exposure);
>
>         /* clamp since alpha might be > 1.0 due to russian roulette */
> -       result.w = clamp(result.w, 0.0f, 1.0f);
> +       result.w = saturate(result.w);
>
>         return result;
>  }
> @@ -37,10 +37,10 @@ ccl_device uchar4 film_float_to_byte(float4 color)
>         uchar4 result;
>
>         /* simple float to byte conversion */
> -       result.x = (uchar)clamp(color.x*255.0f, 0.0f, 255.0f);
> -       result.y = (uchar)clamp(color.y*255.0f, 0.0f, 255.0f);
> -       result.z = (uchar)clamp(color.z*255.0f, 0.0f, 255.0f);
> -       result.w = (uchar)clamp(color.w*255.0f, 0.0f, 255.0f);
> +       result.x = (uchar)(saturate(color.x)*255.0f);
> +       result.y = (uchar)(saturate(color.y)*255.0f);
> +       result.z = (uchar)(saturate(color.z)*255.0f);
> +       result.w = (uchar)(saturate(color.w)*255.0f);
>
>         return result;
>  }
> diff --git a/intern/cycles/kernel/kernel_globals.h
> b/intern/cycles/kernel/kernel_globals.h
> index 0a9753b..be2c879 100644
> --- a/intern/cycles/kernel/kernel_globals.h
> +++ b/intern/cycles/kernel/kernel_globals.h
> @@ -94,7 +94,7 @@ typedef struct KernelGlobals {
>
>  ccl_device float lookup_table_read(KernelGlobals *kg, float x, int
> offset, int size)
>  {
> -       x = clamp(x, 0.0f, 1.0f)*(size-1);
> +       x = saturate(x)*(size-1);
>
>         int index = min(float_to_int(x), size-1);
>         int nindex = min(index+1, size-1);
> @@ -110,7 +110,7 @@ ccl_device float lookup_table_read(KernelGlobals *kg,
> float x, int offset, int s
>
>  ccl_device float lookup_table_read_2D(KernelGlobals *kg, float x, float
> y, int offset, int xsize, int ysize)
>  {
> -       y = clamp(y, 0.0f, 1.0f)*(ysize-1);
> +       y = saturate(y)*(ysize-1);
>
>         int index = min(float_to_int(y), ysize-1);
>         int nindex = min(index+1, ysize-1);
> diff --git a/intern/cycles/kernel/kernel_passes.h
> b/intern/cycles/kernel/kernel_passes.h
> index 6bb39ee..8910e26 100644
> --- a/intern/cycles/kernel/kernel_passes.h
> +++ b/intern/cycles/kernel/kernel_passes.h
> @@ -102,7 +102,7 @@ ccl_device_inline void
> kernel_write_data_passes(KernelGlobals *kg, ccl_global fl
>                 float mist_inv_depth = kernel_data.film.mist_inv_depth;
>
>                 float depth = camera_distance(kg, sd->P);
> -               float mist = clamp((depth - mist_start)*mist_inv_depth,
> 0.0f, 1.0f);
> +               float mist = saturate((depth - mist_start)*mist_inv_depth);
>
>                 /* falloff */
>                 float mist_falloff = kernel_data.film.mist_falloff;
> diff --git a/intern/cycles/kernel/svm/svm_brick.h
> b/intern/cycles/kernel/svm/svm_brick.h
> index 33a2a5c..fcf8f47 100644
> --- a/intern/cycles/kernel/svm/svm_brick.h
> +++ b/intern/cycles/kernel/svm/svm_brick.h
> @@ -47,7 +47,7 @@ ccl_device_noinline float2 svm_brick(float3 p, float
> mortar_size, float bias,
>         y = p.y - row_height*rownum;
>
>         return make_float2(
> -               clamp((brick_noise((rownum << 16) + (bricknum & 0xFFFF)) +
> bias), 0.0f, 1.0f),
> +               saturate((brick_noise((rownum << 16) + (bricknum &
> 0xFFFF)) + bias)),
>
>                 (x < mortar_size || y < mortar_size ||
>                 x > (brick_width - mortar_size) ||
> diff --git a/intern/cycles/kernel/svm/svm_closure.h
> b/intern/cycles/kernel/svm/svm_closure.h
> index 8afdb73..0d2d155 100644
> --- a/intern/cycles/kernel/svm/svm_closure.h
> +++ b/intern/cycles/kernel/svm/svm_closure.h
> @@ -347,7 +347,7 @@ ccl_device void svm_node_closure_bsdf(KernelGlobals
> *kg, ShaderData *sd, float *
>                                 sc->N = N;
>
>                                 /* sigma */
> -                               sc->data0 = clamp(param1, 0.0f, 1.0f);
> +                               sc->data0 = saturate(param1);
>                                 sc->data1 = 0.0f;
>                                 sc->data2 = 0.0f;
>                                 sd->flag |=
> bsdf_ashikhmin_velvet_setup(sc);
> @@ -655,7 +655,7 @@ ccl_device void svm_node_mix_closure(ShaderData *sd,
> float *stack, uint4 node)
>         decode_node_uchar4(node.y, &weight_offset, &in_weight_offset,
> &weight1_offset, &weight2_offset);
>
>         float weight = stack_load_float(stack, weight_offset);
> -       weight = clamp(weight, 0.0f, 1.0f);
> +       weight = saturate(weight);
>
>         float in_weight = (stack_valid(in_weight_offset))?
> stack_load_float(stack, in_weight_offset): 1.0f;
>
> diff --git a/intern/cycles/kernel/svm/svm_gradient.h
> b/intern/cycles/kernel/svm/svm_gradient.h
> index a5e385f..53d7b4f 100644
> --- a/intern/cycles/kernel/svm/svm_gradient.h
> +++ b/intern/cycles/kernel/svm/svm_gradient.h
> @@ -66,7 +66,7 @@ ccl_device void svm_node_tex_gradient(ShaderData *sd,
> float *stack, uint4 node)
>         float3 co = stack_load_float3(stack, co_offset);
>
>         float f = svm_gradient(co, (NodeGra
>
> @@ Diff output truncated at 10240 characters. @@
>
> _______________________________________________
> Bf-blender-cvs mailing list
> Bf-blender-cvs at blender.org
> http://lists.blender.org/mailman/listinfo/bf-blender-cvs
>



-- 
With best regards, Sergey Sharybin


More information about the Bf-committers mailing list