[Bf-blender-cvs] [2920a569b52] temp-parallel-multi-function: progress

Fri Sep 10 11:02:28 CEST 2021

Commit: 2920a569b527c3543dd393a96bca2362ee04feef
Author: Jacques Lucke
Date:   Thu Sep 9 11:19:09 2021 +0200
Branches: temp-parallel-multi-function
https://developer.blender.org/rB2920a569b527c3543dd393a96bca2362ee04feef

progress

===================================================================

M	source/blender/blenlib/BLI_virtual_array.hh
M	source/blender/functions/CMakeLists.txt
M	source/blender/functions/FN_multi_function_parallel.hh
M	source/blender/functions/intern/field.cc
A	source/blender/functions/intern/multi_function_parallel.cc
M	source/blender/functions/tests/FN_multi_function_test.cc

===================================================================

diff --git a/source/blender/blenlib/BLI_virtual_array.hh b/source/blender/blenlib/BLI_virtual_array.hh
index 1c02bce8411..e99036d06a9 100644
--- a/source/blender/blenlib/BLI_virtual_array.hh
+++ b/source/blender/blenlib/BLI_virtual_array.hh
@@ -622,41 +622,50 @@ inline void devirtualize_varray2(const VArray<T1> &varray1,
                                  const Func &func,
                                  bool enable = true)
 {
-  /* Support disabling the devirtualization to simplify benchmarking. */
-  if (enable) {
-    const bool is_span1 = varray1.is_span();
-    const bool is_span2 = varray2.is_span();
-    const bool is_single1 = varray1.is_single();
-    const bool is_single2 = varray2.is_single();
-    if (is_span1 && is_span2) {
-      const VArray_For_Span<T1> varray1_span{varray1.get_internal_span()};
-      const VArray_For_Span<T2> varray2_span{varray2.get_internal_span()};
-      func(varray1_span, varray2_span);
-      return;
-    }
-    if (is_span1 && is_single2) {
-      const VArray_For_Span<T1> varray1_span{varray1.get_internal_span()};
-      const VArray_For_Single<T2> varray2_single{varray2.get_internal_single(), varray2.size()};
-      func(varray1_span, varray2_single);
-      return;
-    }
-    if (is_single1 && is_span2) {
-      const VArray_For_Single<T1> varray1_single{varray1.get_internal_single(), varray1.size()};
-      const VArray_For_Span<T2> varray2_span{varray2.get_internal_span()};
-      func(varray1_single, varray2_span);
-      return;
-    }
-    if (is_single1 && is_single2) {
-      const VArray_For_Single<T1> varray1_single{varray1.get_internal_single(), varray1.size()};
-      const VArray_For_Single<T2> varray2_single{varray2.get_internal_single(), varray2.size()};
-      func(varray1_single, varray2_single);
-      return;
-    }
-  }
-  /* This fallback is used even when one of the inputs could be optimized. It's probably not worth
-   * it to optimize just one of the inputs, because then the compiler still has to call into
-   * unknown code, which inhibits many compiler optimizations. */
-  func(varray1, varray2);
+  devirtualize_varray(
+      varray1,
+      [&](const auto &varray1) {
+        devirtualize_varray(
+            varray2, [&](const auto &varray2) { func(varray1, varray2); }, enable);
+      },
+      enable);
+
+  // /* Support disabling the devirtualization to simplify benchmarking. */
+  // if (enable) {
+  //   const bool is_span1 = varray1.is_span();
+  //   const bool is_span2 = varray2.is_span();
+  //   const bool is_single1 = varray1.is_single();
+  //   const bool is_single2 = varray2.is_single();
+  //   if (is_span1 && is_span2) {
+  //     const VArray_For_Span<T1> varray1_span{varray1.get_internal_span()};
+  //     const VArray_For_Span<T2> varray2_span{varray2.get_internal_span()};
+  //     func(varray1_span, varray2_span);
+  //     return;
+  //   }
+  //   if (is_span1 && is_single2) {
+  //     const VArray_For_Span<T1> varray1_span{varray1.get_internal_span()};
+  //     const VArray_For_Single<T2> varray2_single{varray2.get_internal_single(), varray2.size()};
+  //     func(varray1_span, varray2_single);
+  //     return;
+  //   }
+  //   if (is_single1 && is_span2) {
+  //     const VArray_For_Single<T1> varray1_single{varray1.get_internal_single(), varray1.size()};
+  //     const VArray_For_Span<T2> varray2_span{varray2.get_internal_span()};
+  //     func(varray1_single, varray2_span);
+  //     return;
+  //   }
+  //   if (is_single1 && is_single2) {
+  //     const VArray_For_Single<T1> varray1_single{varray1.get_internal_single(), varray1.size()};
+  //     const VArray_For_Single<T2> varray2_single{varray2.get_internal_single(), varray2.size()};
+  //     func(varray1_single, varray2_single);
+  //     return;
+  //   }
+  // }
+  // /* This fallback is used even when one of the inputs could be optimized. It's probably not
+  // worth
+  //  * it to optimize just one of the inputs, because then the compiler still has to call into
+  //  * unknown code, which inhibits many compiler optimizations. */
+  // func(varray1, varray2);
 }
 
 }  // namespace blender
diff --git a/source/blender/functions/CMakeLists.txt b/source/blender/functions/CMakeLists.txt
index 3c27e9d5e19..856668f01d7 100644
--- a/source/blender/functions/CMakeLists.txt
+++ b/source/blender/functions/CMakeLists.txt
@@ -34,6 +34,7 @@ set(SRC
   intern/generic_virtual_vector_array.cc
   intern/multi_function.cc
   intern/multi_function_builder.cc
+  intern/multi_function_parallel.cc
   intern/multi_function_procedure.cc
   intern/multi_function_procedure_builder.cc
   intern/multi_function_procedure_executor.cc
@@ -54,6 +55,7 @@ set(SRC
   FN_multi_function_data_type.hh
   FN_multi_function_param_type.hh
   FN_multi_function_params.hh
+  FN_multi_function_parallel.hh
   FN_multi_function_procedure.hh
   FN_multi_function_procedure_builder.hh
   FN_multi_function_procedure_executor.hh
@@ -64,6 +66,22 @@ set(LIB
   bf_blenlib
 )
 
+if(WITH_TBB)
+  add_definitions(-DWITH_TBB)
+  if(WIN32)
+    # TBB includes Windows.h which will define min/max macros
+    # that will collide with the stl versions.
+    add_definitions(-DNOMINMAX)
+  endif()
+  list(APPEND INC_SYS
+    ${TBB_INCLUDE_DIRS}
+  )
+
+  list(APPEND LIB
+    ${TBB_LIBRARIES}
+  )
+endif()
+
 blender_add_lib(bf_functions "${SRC}" "${INC}" "${INC_SYS}" "${LIB}")
 
 if(WITH_GTESTS)
diff --git a/source/blender/functions/FN_multi_function_parallel.hh b/source/blender/functions/FN_multi_function_parallel.hh
index b5b3e2f2f94..84c57efd434 100644
--- a/source/blender/functions/FN_multi_function_parallel.hh
+++ b/source/blender/functions/FN_multi_function_parallel.hh
@@ -20,5 +20,20 @@
  * \ingroup fn
  */
 
+#include "FN_multi_function.hh"
+
 namespace blender::fn {
-}
+
+class ParallelMultiFunction : public MultiFunction {
+ private:
+  const MultiFunction &fn_;
+  const int64_t grain_size_;
+  bool threading_supported_;
+
+ public:
+  ParallelMultiFunction(const MultiFunction &fn, const int64_t grain_size);
+
+  void call(IndexMask mask, MFParams params, MFContext context) const override;
+};
+
+}  // namespace blender::fn
diff --git a/source/blender/functions/intern/field.cc b/source/blender/functions/intern/field.cc
index a27c5e4e3dc..7b35593ad75 100644
--- a/source/blender/functions/intern/field.cc
+++ b/source/blender/functions/intern/field.cc
@@ -18,9 +18,11 @@
 #include "BLI_multi_value_map.hh"
 #include "BLI_set.hh"
 #include "BLI_stack.hh"
+#include "BLI_timeit.hh"
 #include "BLI_vector_set.hh"
 
 #include "FN_field.hh"
+#include "FN_multi_function_parallel.hh"
 
 namespace blender::fn {
 
@@ -271,6 +273,8 @@ Vector<const GVArray *> evaluate_fields(ResourceScope &scope,
                                         const FieldContext &context,
                                         Span<GVMutableArray *> dst_hints)
 {
+  SCOPED_TIMER(__func__);
+
   Vector<const GVArray *> r_varrays(fields_to_evaluate.size(), nullptr);
 
   /* Destination hints are optional. Create a small utility method to access them. */
@@ -334,7 +338,10 @@ Vector<const GVArray *> evaluate_fields(ResourceScope &scope,
     build_multi_function_procedure_for_fields(
         procedure, scope, field_tree_info, varying_fields_to_evaluate);
     MFProcedureExecutor procedure_executor{"Procedure", procedure};
-    MFParamsBuilder mf_params{procedure_executor, array_size};
+    fn::ParallelMultiFunction parallel_fn{procedure_executor, 20000};
+    const MultiFunction &fn_to_execute = procedure_executor;
+
+    MFParamsBuilder mf_params{fn_to_execute, array_size};
     MFContextBuilder mf_context;
 
     /* Provide inputs to the procedure executor. */
@@ -376,7 +383,7 @@ Vector<const GVArray *> evaluate_fields(ResourceScope &scope,
       mf_params.add_uninitialized_single_output(span);
     }
 
-    procedure_executor.call(mask, mf_params, mf_context);
+    fn_to_execute.call(mask, mf_params, mf_context);
   }
 
   /* Evaluate constant fields if necessary. */
diff --git a/source/blender/functions/intern/multi_function_parallel.cc b/source/blender/functions/intern/multi_function_parallel.cc
new file mode 100644
index 00000000000..6843c4a233b
--- /dev/null
+++ b/source/blender/functions/intern/multi_function_parallel.cc
@@ -0,0 +1,109 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include "FN_multi_function_parallel.hh"
+
+#include "BLI_task.hh"
+
+#include <mutex>
+
+namespace blender::fn {
+
+ParallelMultiFunction::ParallelMultiFunction(const MultiFunction &fn, const int64_t grain_size)
+    : fn_(fn), grain_size_(grain_size)
+{
+  this->set_signature(&fn.signature());
+
+  threading_supported_ = true;
+  for (const int param_index : fn.param_indices()) {
+    const MFParamType param_type = fn.param_type(param_index);
+    if (param_type.data_type().category() == MFDataType::Vector) {
+      threading_supported_ = false;
+      break;
+    }
+  }
+}
+
+void ParallelMultiFunction::call(IndexMask mask, MFParams params, MFContext context) const
+{
+  if (mask.size() <= grain_size_ || !threading_supported_) {
+    fn_.call(mask, params, context);
+    return;
+  }
+
+  threading::parallel_for(mask.index_range(), grain_size_, [&](const IndexRange range) {
+    const int size = range.size();
+    IndexMask original_sub_mask{mask.indices().slice(range)};
+    const int64_t offset = original_sub_mask.indices().first();
+    const int64_t slice_size = original_sub_mask.indices().last() - offset + 1;
+    const IndexRange slice_range{offset, slice_size};
+    IndexMask sub_mask;
+    Vec

@@ Diff output truncated at 10240 characters. @@