[Bf-blender-cvs] [6eca3ca715f] cycles-x: Cycles X: remove all alignment requirements in render passes storage

Fri Jul 23 19:29:01 CEST 2021

Commit: 6eca3ca715f74b6399da04b32b96dad27501f2d1
Author: Brecht Van Lommel
Date:   Thu Jul 15 16:55:09 2021 +0200
Branches: cycles-x
https://developer.blender.org/rB6eca3ca715f74b6399da04b32b96dad27501f2d1

Cycles X: remove all alignment requirements in render passes storage

This saves memory, simplifies the code and has no apparent performance
impact. Whatever historical reasons there were for this, I don't think
they apply anymore.

Differential Revision: https://developer.blender.org/D12015

===================================================================

M	intern/cycles/blender/addon/engine.py
M	intern/cycles/integrator/pass_accessor.cpp
M	intern/cycles/kernel/kernel_accumulate.h
M	intern/cycles/kernel/kernel_film.h
M	intern/cycles/kernel/kernel_passes.h
M	intern/cycles/kernel/kernel_types.h
M	intern/cycles/kernel/kernel_write_passes.h
M	intern/cycles/kernel/svm/svm_aov.h
M	intern/cycles/render/buffers.cpp
M	intern/cycles/render/film.cpp
M	intern/cycles/render/nodes.cpp
M	intern/cycles/render/nodes.h
M	intern/cycles/render/pass.cpp
M	intern/cycles/render/pass.h
M	intern/cycles/render/svm.cpp

===================================================================

diff --git a/intern/cycles/blender/addon/engine.py b/intern/cycles/blender/addon/engine.py
index 19e85bd8438..df6bb2dc982 100644
--- a/intern/cycles/blender/addon/engine.py
+++ b/intern/cycles/blender/addon/engine.py
@@ -242,7 +242,7 @@ def list_render_passes(scene, srl):
         if aov.type == 'VALUE':
             yield (aov.name, "X", 'VALUE')
         else:
-            yield (aov.name, "RGBA", 'COLOR')
+            yield (aov.name, "RGB", 'COLOR')
 
 
 def register_passes(engine, scene, view_layer):
diff --git a/intern/cycles/integrator/pass_accessor.cpp b/intern/cycles/integrator/pass_accessor.cpp
index ab715ad5ad9..9d38f0137f5 100644
--- a/intern/cycles/integrator/pass_accessor.cpp
+++ b/intern/cycles/integrator/pass_accessor.cpp
@@ -63,18 +63,7 @@ PassAccessor::Destination::Destination(const PassType pass_type, half4 *pixels)
 PassAccessor::Destination::Destination(const PassType pass_type)
 {
   const PassInfo pass_info = Pass::get_info(pass_type);
-
-  if (pass_info.divide_type != PASS_NONE) {
-    /* Divide is used for colors, which has 3 destination components.
-     * The passes which use division are stored as aligned float4 internally, and there is no
-     * implementation of divide_even_color for float4. So we force it here.
-     * The rest of the aligned float3 passes should be fine, because they have float4
-     * implementation. */
-    num_components = 3;
-  }
-  else {
-    num_components = pass_info.num_components;
-  }
+  num_components = pass_info.num_components;
 }
 
 /* --------------------------------------------------------------------
@@ -160,10 +149,10 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
   const PassMode mode = pass_access_info_.mode;
   const PassInfo pass_info = Pass::get_info(type);
 
-  if (destination.num_components == 1) {
-    DCHECK_LE(pass_info.num_components, destination.num_components)
-        << "Number of components mismatch for " << pass_type_as_string(type);
+  DCHECK_LE(pass_info.num_components, destination.num_components)
+      << "Number of components mismatch for " << pass_type_as_string(type);
 
+  if (pass_info.num_components == 1) {
     if (mode == PassMode::DENOISED) {
       /* Denoised passes store their final pixels, no need in special calculation. */
       get_pass_float(render_buffers, buffer_params, destination);
@@ -184,16 +173,7 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
       get_pass_float(render_buffers, buffer_params, destination);
     }
   }
-  else if (destination.num_components == 3) {
-    if (pass_info.is_aligned) {
-      DCHECK_LE(pass_info.num_components, 4)
-          << "Number of components mismatch for pass " << pass_type_as_string(type);
-    }
-    else {
-      DCHECK_LE(pass_info.num_components, 3)
-          << "Number of components mismatch for pass " << pass_type_as_string(type);
-    }
-
+  else if (pass_info.num_components == 3) {
     if (mode == PassMode::DENOISED) {
       /* Denoised passes store their final pixels, no need in special calculation. */
       get_pass_float3(render_buffers, buffer_params, destination);
@@ -210,10 +190,7 @@ bool PassAccessor::get_render_tile_pixels(const RenderBuffers *render_buffers,
       get_pass_float3(render_buffers, buffer_params, destination);
     }
   }
-  else if (destination.num_components == 4) {
-    DCHECK_EQ(pass_info.num_components, 4)
-        << "Number of components mismatch for pass " << pass_type_as_string(type);
-
+  else if (pass_info.num_components == 4) {
     if (type == PASS_SHADOW_CATCHER_MATTE && pass_access_info_.use_approximate_shadow_catcher) {
       /* Denoised matte with shadow needs to do calculation (will use denoised shadow catcher pass
        * to approximate shadow with). */
@@ -314,18 +291,23 @@ bool PassAccessor::set_render_tile_pixels(RenderBuffers *render_buffers, const S
     return false;
   }
 
+  const PassType type = pass_access_info_.type;
+  const PassInfo pass_info = Pass::get_info(type);
+
   const BufferParams &buffer_params = render_buffers->params;
 
   float *buffer_data = render_buffers->buffer.data();
-  const int pass_stride = buffer_params.pass_stride;
   const int size = buffer_params.width * buffer_params.height;
-  const int num_components = source.num_components;
+
+  const int out_stride = buffer_params.pass_stride;
+  const int in_stride = source.num_components;
+  const int num_components_to_copy = min(source.num_components, pass_info.num_components);
 
   float *out = buffer_data + pass_access_info_.offset;
-  const float *in = source.pixels + source.offset * num_components;
+  const float *in = source.pixels + source.offset * in_stride;
 
-  for (int i = 0; i < size; i++, out += pass_stride, in += num_components) {
-    memcpy(out, in, sizeof(float) * num_components);
+  for (int i = 0; i < size; i++, out += out_stride, in += in_stride) {
+    memcpy(out, in, sizeof(float) * num_components_to_copy);
   }
 
   return true;
diff --git a/intern/cycles/kernel/kernel_accumulate.h b/intern/cycles/kernel/kernel_accumulate.h
index 98875d4fe43..e04e1378346 100644
--- a/intern/cycles/kernel/kernel_accumulate.h
+++ b/intern/cycles/kernel/kernel_accumulate.h
@@ -331,8 +331,7 @@ ccl_device_inline void kernel_accum_emission_or_background_pass(INTEGRATOR_STATE
       const float3 denoising_feature_throughput = INTEGRATOR_STATE(path,
                                                                    denoising_feature_throughput);
       const float3 denoising_albedo = denoising_feature_throughput * contribution;
-      kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_albedo,
-                                         denoising_albedo);
+      kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_albedo, denoising_albedo);
     }
   }
 #  endif /* __DENOISING_FEATURES__ */
diff --git a/intern/cycles/kernel/kernel_film.h b/intern/cycles/kernel/kernel_film.h
index ec18a141116..65966e0fef6 100644
--- a/intern/cycles/kernel/kernel_film.h
+++ b/intern/cycles/kernel/kernel_film.h
@@ -282,7 +282,7 @@ ccl_device_inline void film_get_pass_pixel_combined(const KernelFilmConvert *ccl
  * Shadow catcher.
  */
 
-ccl_device_inline float4
+ccl_device_inline float3
 film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfilm_convert,
                                        ccl_global const float *ccl_restrict buffer)
 {
@@ -295,10 +295,10 @@ film_calculate_shadow_catcher_denoised(const KernelFilmConvert *ccl_restrict kfi
 
   const float3 pixel = make_float3(in_catcher[0], in_catcher[1], in_catcher[2]) * scale_exposure;
 
-  return make_float4(pixel.x, pixel.y, pixel.z, 1.0f);
+  return pixel;
 }
 
-ccl_device_inline float4
+ccl_device_inline float3
 film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_convert,
                               ccl_global const float *ccl_restrict buffer)
 {
@@ -318,7 +318,7 @@ film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_conver
    * needed, so return one. */
   const float num_samples = in_catcher[3];
   if (num_samples == 0.0f) {
-    return one_float4();
+    return one_float3();
   }
 
   /* NOTE: It is possible that the Shadow Catcher pass is requested as an output without actual
@@ -356,7 +356,7 @@ film_calculate_shadow_catcher(const KernelFilmConvert *ccl_restrict kfilm_conver
    * during the division. */
   const float3 pixel = (1.0f - alpha) * one_float3() + alpha * shadow_catcher;
 
-  return make_float4(pixel.x, pixel.y, pixel.z, 1.0f);
+  return pixel;
 }
 
 ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
@@ -378,14 +378,13 @@ ccl_device_inline float4 film_calculate_shadow_catcher_matte_with_shadow(
 
   ccl_global const float *in_matte = buffer + kfilm_convert->pass_shadow_catcher_matte;
 
-  const float4 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
+  const float3 shadow_catcher = film_calculate_shadow_catcher(kfilm_convert, buffer);
   const float3 color_matte = make_float3(in_matte[0], in_matte[1], in_matte[2]) * scale_exposure;
 
   const float transparency = in_matte[3] * scale;
   const float alpha = saturate(1.0f - transparency);
 
-  const float alpha_matte = (1.0f - alpha) * (1.0f - average(float4_to_float3(shadow_catcher))) +
-                            alpha;
+  const float alpha_matte = (1.0f - alpha) * (1.0f - average(shadow_catcher)) + alpha;
 
   if (kfilm_convert->use_approximate_shadow_catcher_background) {
     kernel_assert(kfilm_convert->pass_background != PASS_UNUSED);
@@ -406,13 +405,13 @@ ccl_device_inline void film_get_pass_pixel_shadow_catcher(
     ccl_global const float *ccl_restrict buffer,
     float *ccl_restrict pixel)
 {
-  const float4 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
+  const float3 pixel_value = film_calculate_shadow_catcher(kfilm_convert, buffer);
 
   pixel[0] = pixel_value.x;
   pixel[1] = pixel_value.y;
   pixel[2] = pixel_value.z;
   if (kfilm_convert->num_components == 4) {
-    pixel[3] = pixel_value.w;
+    pixel[3] = 1.0f;
   }
 }
 
diff --git a/intern/cycles/kernel/kernel_passes.h b/intern/cycles/kernel/kernel_passes.h
index 568886a2104..325ea07218b 100644
--- a/intern/cycles/kernel/kernel_passes.h
+++ b/intern/cycles/kernel/kernel_passes.h
@@ -103,8 +103,7 @@ ccl_device_forceinline void kernel_write_denoising_features(
       normal = transform_direction(&worldtocamera, normal);
 
       const float3 denoising_normal = ensure_finite3(normal);
-      kernel_write_pass_float3_unaligned(buffer + kernel_data.film.pass_denoising_normal,
-                                         denoising_normal);
+      kernel_write_pass_float3(buffer + kernel_data.film.pass_denoising_normal, denoising_normal);
     }
 
     if (kernel_data.film.pass_denoising_albedo != PASS_UNUSED) {
@@ -112,8 +111,7 @@ ccl_device_forceinline void kernel_write_denoising_features(
                                                                    denoising_feature_throughput);
       const float3 denoising_albedo = ensure_finite3(denoising_feature_throughput *
                                                      diffuse_albedo);
-      kernel

@@ Diff output truncated at 10240 characters. @@