[Bf-blender-cvs] [3f1886d0b78] master: Functions: align chunk sizes in multi-function evaluation
Jacques Lucke
noreply at git.blender.org
Sun Jan 22 00:03:36 CET 2023
Commit: 3f1886d0b788d043917ec86702b4b480f8d5dd2e
Author: Jacques Lucke
Date: Sun Jan 22 00:03:25 2023 +0100
Branches: master
https://developer.blender.org/rB3f1886d0b788d043917ec86702b4b480f8d5dd2e
Functions: align chunk sizes in multi-function evaluation
This can improve performance in some circumstances when there are
vectorized and/or unrolled loops. I especially noticed that this helps
a lot while working on D16970 (got a 10-20% speedup there by avoiding
running into the non-vectorized fallback loop too often).
===================================================================
M source/blender/blenlib/BLI_task.hh
M source/blender/functions/intern/multi_function.cc
===================================================================
diff --git a/source/blender/blenlib/BLI_task.hh b/source/blender/blenlib/BLI_task.hh
index c726691ad46..60585e35099 100644
--- a/source/blender/blenlib/BLI_task.hh
+++ b/source/blender/blenlib/BLI_task.hh
@@ -71,6 +71,36 @@ void parallel_for(IndexRange range, int64_t grain_size, const Function &function
function(range);
}
+/**
+ * Same as #parallel_for but tries to make the sub-range sizes multiples of the given alignment.
+ * This can improve performance when the range is processed using vectorized and/or unrolled loops,
+ * because the fallback loop that processes remaining values is used less often. A disadvantage of
+ * using this instead of #parallel_for is that the size differences between sub-ranges can be
+ * larger, which means that work is distributed less evenly.
+ *
+ * The boundaries are rounded down with the bit mask `~(alignment - 1)`, so the alignment has to
+ * be a power of two for the rounding to work as intended. The grain size is forwarded unchanged
+ * to #parallel_for. The callback is only invoked with non-empty sub-ranges.
+ */
+template<typename Function>
+void parallel_for_aligned(const IndexRange range,
+                          const int64_t grain_size,
+                          const int64_t alignment,
+                          const Function &function)
+{
+  const int64_t global_begin = range.start();
+  const int64_t global_end = range.one_after_last();
+  const int64_t alignment_mask = ~(alignment - 1);
+  parallel_for(range, grain_size, [&](const IndexRange unaligned_range) {
+    /* Move the sub-range boundaries down to the next aligned index. The "global" begin and end
+     * remain fixed though. */
+    const int64_t unaligned_begin = unaligned_range.start();
+    const int64_t unaligned_end = unaligned_range.one_after_last();
+    const int64_t aligned_begin = std::max(global_begin, unaligned_begin & alignment_mask);
+    const int64_t aligned_end = unaligned_end == global_end ?
+                                    unaligned_end :
+                                    std::max(global_begin, unaligned_end & alignment_mask);
+    if (aligned_begin == aligned_end) {
+      /* Both boundaries were rounded down to the same index, e.g. [9, 12) becomes [8, 8) with an
+       * alignment of 8. Those elements are processed by the following sub-range, whose begin is
+       * rounded down to the same index. An empty range contains no work, and callbacks commonly
+       * read the first element of their slice, so skip it. */
+      return;
+    }
+    const IndexRange aligned_range{aligned_begin, aligned_end - aligned_begin};
+    function(aligned_range);
+  });
+}
+
template<typename Value, typename Function, typename Reduction>
Value parallel_reduce(IndexRange range,
int64_t grain_size,
diff --git a/source/blender/functions/intern/multi_function.cc b/source/blender/functions/intern/multi_function.cc
index fedf9a00d13..2f83ab08879 100644
--- a/source/blender/functions/intern/multi_function.cc
+++ b/source/blender/functions/intern/multi_function.cc
@@ -52,6 +52,16 @@ static int64_t compute_grain_size(const ExecutionHints &hints, const IndexMask m
return grain_size;
}
+/**
+ * Choose the alignment for sub-range boundaries based on the grain size.
+ */
+static int64_t compute_alignment(const int64_t grain_size)
+{
+  /* Up to this grain size a small alignment is used, because a larger one would distribute the
+   * work quite unevenly between the sub-ranges. */
+  constexpr int64_t small_grain_limit = 512;
+  const bool is_small_grain = grain_size <= small_grain_limit;
+  /* Loops rarely process more than 32 elements at once, so there is no need to go higher. */
+  return is_small_grain ? 8 : 32;
+}
+
void MultiFunction::call_auto(IndexMask mask, Params params, Context context) const
{
if (mask.is_empty()) {
@@ -71,71 +81,75 @@ void MultiFunction::call_auto(IndexMask mask, Params params, Context context) co
return;
}
- threading::parallel_for(mask.index_range(), grain_size, [&](const IndexRange sub_range) {
- const IndexMask sliced_mask = mask.slice(sub_range);
- if (!hints.allocates_array) {
- /* There is no benefit to changing indices in this case. */
- this->call(sliced_mask, params, context);
- return;
- }
- if (sliced_mask[0] < grain_size) {
- /* The indices are low, no need to offset them. */
- this->call(sliced_mask, params, context);
- return;
- }
- const int64_t input_slice_start = sliced_mask[0];
- const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
- const IndexRange input_slice_range{input_slice_start, input_slice_size};
-
- Vector<int64_t> offset_mask_indices;
- const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);
-
- ParamsBuilder offset_params{*this, offset_mask.min_array_size()};
-
- /* Slice all parameters so that for the actual function call. */
- for (const int param_index : this->param_indices()) {
- const ParamType param_type = this->param_type(param_index);
- switch (param_type.category()) {
- case ParamCategory::SingleInput: {
- const GVArray &varray = params.readonly_single_input(param_index);
- offset_params.add_readonly_single_input(varray.slice(input_slice_range));
- break;
+ const int64_t alignment = compute_alignment(grain_size);
+ threading::parallel_for_aligned(
+ mask.index_range(), grain_size, alignment, [&](const IndexRange sub_range) {
+ const IndexMask sliced_mask = mask.slice(sub_range);
+ if (!hints.allocates_array) {
+ /* There is no benefit to changing indices in this case. */
+ this->call(sliced_mask, params, context);
+ return;
}
- case ParamCategory::SingleMutable: {
- const GMutableSpan span = params.single_mutable(param_index);
- const GMutableSpan sliced_span = span.slice(input_slice_range);
- offset_params.add_single_mutable(sliced_span);
- break;
+ if (sliced_mask[0] < grain_size) {
+ /* The indices are low, no need to offset them. */
+ this->call(sliced_mask, params, context);
+ return;
}
- case ParamCategory::SingleOutput: {
- if (bool(signature_ref_->params[param_index].flag & ParamFlag::SupportsUnusedOutput)) {
- const GMutableSpan span = params.uninitialized_single_output_if_required(param_index);
- if (span.is_empty()) {
- offset_params.add_ignored_single_output();
+ const int64_t input_slice_start = sliced_mask[0];
+ const int64_t input_slice_size = sliced_mask.last() - input_slice_start + 1;
+ const IndexRange input_slice_range{input_slice_start, input_slice_size};
+
+ Vector<int64_t> offset_mask_indices;
+ const IndexMask offset_mask = mask.slice_and_offset(sub_range, offset_mask_indices);
+
+ ParamsBuilder offset_params{*this, offset_mask.min_array_size()};
+
+ /* Slice all parameters to match the offset mask of the actual function call. */
+ for (const int param_index : this->param_indices()) {
+ const ParamType param_type = this->param_type(param_index);
+ switch (param_type.category()) {
+ case ParamCategory::SingleInput: {
+ const GVArray &varray = params.readonly_single_input(param_index);
+ offset_params.add_readonly_single_input(varray.slice(input_slice_range));
+ break;
}
- else {
+ case ParamCategory::SingleMutable: {
+ const GMutableSpan span = params.single_mutable(param_index);
const GMutableSpan sliced_span = span.slice(input_slice_range);
- offset_params.add_uninitialized_single_output(sliced_span);
+ offset_params.add_single_mutable(sliced_span);
+ break;
+ }
+ case ParamCategory::SingleOutput: {
+ if (bool(signature_ref_->params[param_index].flag &
+ ParamFlag::SupportsUnusedOutput)) {
+ const GMutableSpan span = params.uninitialized_single_output_if_required(
+ param_index);
+ if (span.is_empty()) {
+ offset_params.add_ignored_single_output();
+ }
+ else {
+ const GMutableSpan sliced_span = span.slice(input_slice_range);
+ offset_params.add_uninitialized_single_output(sliced_span);
+ }
+ }
+ else {
+ const GMutableSpan span = params.uninitialized_single_output(param_index);
+ const GMutableSpan sliced_span = span.slice(input_slice_range);
+ offset_params.add_uninitialized_single_output(sliced_span);
+ }
+ break;
+ }
+ case ParamCategory::VectorInput:
+ case ParamCategory::VectorMutable:
+ case ParamCategory::VectorOutput: {
+ BLI_assert_unreachable();
+ break;
}
}
- else {
- const GMutableSpan span = params.uninitialized_single_output(param_index);
- const GMutableSpan sliced_span = span.slice(input_slice_range);
- offset_params.add_uninitialized_single_output(sliced_span);
- }
- break;
}
- case ParamCategory::VectorInput:
- case ParamCategory::VectorMutable:
- case ParamCategory::VectorOutput: {
- BLI_assert_unreachable();
- break;
- }
- }
- }
- this->call(offset_mask, offset_params, context);
- });
+ this->call(offset_mask, offset_params, context);
+ });
}
std::string MultiFunction::debug_name() const
More information about the Bf-blender-cvs
mailing list