[Bf-blender-cvs] [f3bd5458a39] master: Metal: Optimise shader texture cache usage and branch reduction via point sampling.

Jason Fielder noreply at git.blender.org
Tue Jan 31 11:04:43 CET 2023


Commit: f3bd5458a39b7ebd397dac10c8555a4c9fbc7409
Author: Jason Fielder
Date:   Tue Jan 31 10:56:13 2023 +0100
Branches: master
https://developer.blender.org/rBf3bd5458a39b7ebd397dac10c8555a4c9fbc7409

Metal: Optimise shader texture cache usage and branch reduction via point sampling.

Implement texelFetch calls using a texture point-sample instead of a textureRead call. This increases texture cache utilisation when sampled calls and reads are mixed. Bounds checking can also be removed from these functions, reducing instruction count and branch divergence, as the sampler routine handles range clamping.

Authored by Apple: Michael Parkin-White
Ref T96261

Depends on D16923

Reviewed By: fclem

Maniphest Tasks: T96261

Differential Revision: https://developer.blender.org/D17021

===================================================================

M	source/blender/gpu/metal/mtl_shader_generator.mm
M	source/blender/gpu/shaders/metal/mtl_shader_defines.msl

===================================================================

diff --git a/source/blender/gpu/metal/mtl_shader_generator.mm b/source/blender/gpu/metal/mtl_shader_generator.mm
index 15d451d0ddb..ad0280efa03 100644
--- a/source/blender/gpu/metal/mtl_shader_generator.mm
+++ b/source/blender/gpu/metal/mtl_shader_generator.mm
@@ -1752,8 +1752,9 @@ void MSLGeneratorInterface::prepare_from_createinfo(const shader::ShaderCreateIn
 bool MSLGeneratorInterface::use_argument_buffer_for_samplers() const
 {
   /* We can only use argument buffers IF sampler count exceeds static limit of 16,
-   * AND we can support more samplers with an argument buffer. */
-  return texture_samplers.size() >= 16 && GPU_max_samplers() > 16;
+   * AND we can support more samplers with an argument buffer.
+   * NOTE: We reserve one constant sampler within the shader for fast read via point-sampling. */
+  return texture_samplers.size() >= 15 && GPU_max_samplers() > 16;
 }
 
 uint32_t MSLGeneratorInterface::num_samplers_for_stage(ShaderStage stage) const
diff --git a/source/blender/gpu/shaders/metal/mtl_shader_defines.msl b/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
index 8dff8982971..5cb9c47f36f 100644
--- a/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
+++ b/source/blender/gpu/shaders/metal/mtl_shader_defines.msl
@@ -291,7 +291,93 @@ union _msl_return_float {
 /* Add custom texture sampling/reading routines for each type to account for special return cases,
  * e.g. returning a float with an r parameter Note: Cannot use template specialization for input
  * type, as return types are specific to the signature of 'tex'. */
-/* Texture Read. */
+
+/* Use point sampler instead of texture read to benefit from texture caching and reduce branching
+ * through removal of bounds tests, as these are handled by the sample operation. */
+constexpr sampler _point_sample_(address::clamp_to_zero, filter::nearest, coord::pixel);
+
+/* Texture Read via point sampling.
+ * NOTE: These templates will evaluate first for texture resources bound with sample. */
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, access::sample> tex,
+                                      T texel,
+                                      uint lod = 0)
+{
+  return tex.texture->sample(_point_sample_, float(texel));
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, access::sample> tex,
+                                      T texel,
+                                      uint lod,
+                                      T offset)
+{
+  return tex.texture->sample(_point_sample_, float(texel + offset));
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(
+    thread _mtl_combined_image_sampler_1d_array<S, access::sample> tex,
+    vec<T, 2> texel,
+    uint lod,
+    vec<T, 2> offset = vec<T, 2>(0, 0))
+{
+  return tex.texture->sample(_point_sample_, float(texel.x + offset.x), uint(texel.y + offset.y));
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_2d<S, access::sample> tex,
+                                      vec<T, 2> texel,
+                                      uint lod,
+                                      vec<T, 2> offset = vec<T, 2>(0))
+{
+  return tex.texture->sample(_point_sample_, float2(texel.xy + offset.xy), level(lod));
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(
+    thread _mtl_combined_image_sampler_2d_array<S, access::sample> tex,
+    vec<T, 3> texel,
+    uint lod,
+    vec<T, 3> offset = vec<T, 3>(0))
+{
+  return tex.texture->sample(
+      _point_sample_, float2(texel.xy + offset.xy), uint(texel.z + offset.z), level(lod));
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_3d<S, access::sample> tex,
+                                      vec<T, 3> texel,
+                                      uint lod,
+                                      vec<T, 3> offset = vec<T, 3>(0))
+{
+  return tex.texture->sample(_point_sample_, float3(texel.xyz + offset.xyz), level(lod));
+}
+
+template<typename T>
+inline _msl_return_float _texelFetch_internal(
+    thread _mtl_combined_image_sampler_depth_2d<float, access::sample> tex,
+    vec<T, 2> texel,
+    uint lod,
+    vec<T, 2> offset = vec<T, 2>(0))
+{
+  _msl_return_float fl = {
+      tex.texture->sample(_point_sample_, float2(texel.xy + offset.xy), level(lod))};
+  return fl;
+}
+
+template<typename S, typename T>
+inline vec<S, 4> _texture_internal_samp(
+    thread _mtl_combined_image_sampler_2d_array<S, access::sample> tex,
+    vec<T, 3> texel,
+    uint lod,
+    vec<T, 3> offset = vec<T, 3>(0))
+{
+  return tex.texture->sample(
+      _point_sample_, float2(texel.xy + offset.xy), uint(texel.z + offset.z), level(lod));
+}
+
+/* Texture Read via read operation. Required by compute/image-bindings. */
 template<typename S, typename T, access A>
 inline vec<S, 4> _texelFetch_internal(thread _mtl_combined_image_sampler_1d<S, A> tex,
                                       T texel,



More information about the Bf-blender-cvs mailing list