[Bf-blender-cvs] [dba2d828462] master: Geometry Nodes: avoid using enumerable thread specific on single thread

Thu Dec 29 21:06:12 CET 2022

Commit: dba2d828462ae22de53f20f734bda6eb4d65171e
Author: Jacques Lucke
Date:   Thu Dec 29 21:05:41 2022 +0100
Branches: master
https://developer.blender.org/rBdba2d828462ae22de53f20f734bda6eb4d65171e

Geometry Nodes: avoid using enumerable thread specific on single thread

The geometry nodes evaluator supports "lazy threading", i.e. it starts out
single-threaded. But when it determines that multi-threading can be
beneficial, it switches to multi-threaded mode.

Now it only creates an enumerable-thread-specific if it is actually using
multiple threads. This results in a 6% speedup in my test file with many
node groups and math nodes.

===================================================================

M	source/blender/functions/intern/lazy_function_graph_executor.cc

===================================================================

diff --git a/source/blender/functions/intern/lazy_function_graph_executor.cc b/source/blender/functions/intern/lazy_function_graph_executor.cc
index 21040bd4550..83b14952829 100644
--- a/source/blender/functions/intern/lazy_function_graph_executor.cc
+++ b/source/blender/functions/intern/lazy_function_graph_executor.cc
@@ -245,8 +245,11 @@ class Executor {
    * A separate linear allocator for every thread. We could potentially reuse some memory, but that
    * doesn't seem worth it yet.
    */
-  threading::EnumerableThreadSpecific<LinearAllocator<>> local_allocators_;
-  LinearAllocator<> *main_local_allocator_ = nullptr;
+  struct ThreadLocalData {
+    LinearAllocator<> allocator;
+  };
+  std::unique_ptr<threading::EnumerableThreadSpecific<ThreadLocalData>> thread_locals_;
+  LinearAllocator<> main_allocator_;
   /**
    * Set to false when the first execution ends.
    */
@@ -259,7 +262,6 @@ class Executor {
   {
     /* The indices are necessary, because they are used as keys in #node_states_. */
     BLI_assert(self_.graph_.node_indices_are_valid());
-    main_local_allocator_ = &local_allocators_.local();
   }
 
   ~Executor()
@@ -338,16 +340,25 @@ class Executor {
     Span<const Node *> nodes = self_.graph_.nodes();
     node_states_.reinitialize(nodes.size());
 
-    /* Construct all node states in parallel. */
-    threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
-      LinearAllocator<> &allocator = local_allocators_.local();
+    auto construct_node_range = [&](const IndexRange range, LinearAllocator<> &allocator) {
       for (const int i : range) {
         const Node &node = *nodes[i];
         NodeState &node_state = *allocator.construct<NodeState>().release();
         node_states_[i] = &node_state;
         this->construct_initial_node_state(allocator, node, node_state);
       }
-    });
+    };
+    if (nodes.size() <= 256) {
+      construct_node_range(nodes.index_range(), main_allocator_);
+    }
+    else {
+      this->ensure_thread_locals();
+      /* Construct all node states in parallel. */
+      threading::parallel_for(nodes.index_range(), 256, [&](const IndexRange range) {
+        LinearAllocator<> &allocator = this->get_main_or_local_allocator();
+        construct_node_range(range, allocator);
+      });
+    }
   }
 
   void construct_initial_node_state(LinearAllocator<> &allocator,
@@ -1067,10 +1078,23 @@ class Executor {
     if (BLI_system_thread_count() <= 1) {
       return false;
     }
+    this->ensure_thread_locals();
     task_pool_.store(BLI_task_pool_create(this, TASK_PRIORITY_HIGH));
     return true;
   }
 
+  void ensure_thread_locals()
+  {
+#ifdef FN_LAZY_FUNCTION_DEBUG_THREADS
+    if (current_main_thread_ != std::this_thread::get_id()) {
+      BLI_assert_unreachable();
+    }
+#endif
+    if (!thread_locals_) {
+      thread_locals_ = std::make_unique<threading::EnumerableThreadSpecific<ThreadLocalData>>();
+    }
+  }
+
   /**
    * Allow other threads to steal all the nodes that are currently scheduled on this thread.
    */
@@ -1109,9 +1133,9 @@ class Executor {
   LinearAllocator<> &get_main_or_local_allocator()
   {
     if (this->use_multi_threading()) {
-      return local_allocators_.local();
+      return thread_locals_->local().allocator;
     }
-    return *main_local_allocator_;
+    return main_allocator_;
   }
 };