[Bf-blender-cvs] [e206a0ae960] master: Geometry Nodes: reduce thread switching in evaluator

Jacques Lucke noreply at git.blender.org
Wed Nov 24 17:28:20 CET 2021


Commit: e206a0ae960c2c62df6ece863bf855dda581d4f1
Author: Jacques Lucke
Date:   Wed Nov 24 17:22:02 2021 +0100
Branches: master
https://developer.blender.org/rBe206a0ae960c2c62df6ece863bf855dda581d4f1

Geometry Nodes: reduce thread switching in evaluator

When a node is executed, it usually schedules other nodes.
Right now, those newly scheduled nodes are added to a
task pool so that another thread can start working on them
immediately.

However, that leads to the situation where sometimes each
node in a simple chain is executed by a different thread. That
leads to additional threading overhead and reduced cache
efficiency (for caches that are not shared between cores).

Now, when a node is executed and schedules other nodes,
the first of those newly scheduled nodes will always be
executed on the same thread once the current node is done.
If it schedules more than one other node, those will be
added to the task pool as before.

The speedup achieved by this is hard to measure. I found it
to be a couple percent faster in some extreme cases, not
much to get excited about. It's nice though that the number
of tasks added to the task pool is commonly reduced by a
factor of 4 or 5.

===================================================================

M	source/blender/modifiers/intern/MOD_nodes_evaluator.cc

===================================================================

diff --git a/source/blender/modifiers/intern/MOD_nodes_evaluator.cc b/source/blender/modifiers/intern/MOD_nodes_evaluator.cc
index 6d7abb084e4..33a5da7ccb7 100644
--- a/source/blender/modifiers/intern/MOD_nodes_evaluator.cc
+++ b/source/blender/modifiers/intern/MOD_nodes_evaluator.cc
@@ -386,14 +386,23 @@ static bool node_supports_laziness(const DNode node)
   return node->typeinfo()->geometry_node_execute_supports_laziness;
 }
 
+struct NodeTaskRunState {
+  /** The node that should be run on the same thread after the current node finished. */
+  DNode next_node_to_run;
+};
+
 /** Implements the callbacks that might be called when a node is executed. */
 class NodeParamsProvider : public nodes::GeoNodeExecParamsProvider {
  private:
   GeometryNodesEvaluator &evaluator_;
   NodeState &node_state_;
+  NodeTaskRunState *run_state_;
 
  public:
-  NodeParamsProvider(GeometryNodesEvaluator &evaluator, DNode dnode, NodeState &node_state);
+  NodeParamsProvider(GeometryNodesEvaluator &evaluator,
+                     DNode dnode,
+                     NodeState &node_state,
+                     NodeTaskRunState *run_state);
 
   bool can_get_input(StringRef identifier) const override;
   bool can_set_output(StringRef identifier) const override;
@@ -645,7 +654,7 @@ class GeometryNodesEvaluator {
         value.destruct();
         continue;
       }
-      this->forward_output(socket, value);
+      this->forward_output(socket, value, nullptr);
     }
   }
 
@@ -654,7 +663,7 @@ class GeometryNodesEvaluator {
     for (const DInputSocket &socket : params_.output_sockets) {
       const DNode node = socket.node();
       NodeState &node_state = this->get_node_state(node);
-      this->with_locked_node(node, node_state, [&](LockedNode &locked_node) {
+      this->with_locked_node(node, node_state, nullptr, [&](LockedNode &locked_node) {
         /* Setting an input as required will schedule any linked node. */
         this->set_input_required(locked_node, socket);
       });
@@ -662,7 +671,7 @@ class GeometryNodesEvaluator {
     for (const DSocket socket : params_.force_compute_sockets) {
       const DNode node = socket.node();
       NodeState &node_state = this->get_node_state(node);
-      this->with_locked_node(node, node_state, [&](LockedNode &locked_node) {
+      this->with_locked_node(node, node_state, nullptr, [&](LockedNode &locked_node) {
         if (socket->is_input()) {
           this->set_input_required(locked_node, DInputSocket(socket));
         }
@@ -707,12 +716,24 @@ class GeometryNodesEvaluator {
   {
     void *user_data = BLI_task_pool_user_data(task_pool);
     GeometryNodesEvaluator &evaluator = *(GeometryNodesEvaluator *)user_data;
-    const NodeWithState *node_with_state = (const NodeWithState *)task_data;
-
-    evaluator.node_task_run(node_with_state->node, *node_with_state->state);
+    const NodeWithState *root_node_with_state = (const NodeWithState *)task_data;
+
+    /* First, the node provided by the task pool is executed. During the execution other nodes
+     * might be scheduled. One of those nodes is not added to the task pool but is executed in the
+     * loop below directly. This has two main benefits:
+     * - Fewer round trips through the task pool which add threading overhead.
+     * - Helps with cpu cache efficiency, because a thread is more likely to process data that it
+     *   has processed shortly before.
+     */
+    DNode next_node_to_run = root_node_with_state->node;
+    while (next_node_to_run) {
+      NodeTaskRunState run_state;
+      evaluator.node_task_run(next_node_to_run, &run_state);
+      next_node_to_run = run_state.next_node_to_run;
+    }
   }
 
-  void node_task_run(const DNode node, NodeState &node_state)
+  void node_task_run(const DNode node, NodeTaskRunState *run_state)
   {
     /* These nodes are sometimes scheduled. We could also check for them in other places, but
      * it's the easiest to do it here. */
@@ -720,21 +741,25 @@ class GeometryNodesEvaluator {
       return;
     }
 
-    const bool do_execute_node = this->node_task_preprocessing(node, node_state);
+    NodeState &node_state = *node_states_.lookup_key_as(node).state;
+
+    const bool do_execute_node = this->node_task_preprocessing(node, node_state, run_state);
 
     /* Only execute the node if all prerequisites are met. There has to be an output that is
      * required and all required inputs have to be provided already. */
     if (do_execute_node) {
-      this->execute_node(node, node_state);
+      this->execute_node(node, node_state, run_state);
     }
 
-    this->node_task_postprocessing(node, node_state, do_execute_node);
+    this->node_task_postprocessing(node, node_state, do_execute_node, run_state);
   }
 
-  bool node_task_preprocessing(const DNode node, NodeState &node_state)
+  bool node_task_preprocessing(const DNode node,
+                               NodeState &node_state,
+                               NodeTaskRunState *run_state)
   {
     bool do_execute_node = false;
-    this->with_locked_node(node, node_state, [&](LockedNode &locked_node) {
+    this->with_locked_node(node, node_state, run_state, [&](LockedNode &locked_node) {
       BLI_assert(node_state.schedule_state == NodeScheduleState::Scheduled);
       node_state.schedule_state = NodeScheduleState::Running;
 
@@ -893,7 +918,7 @@ class GeometryNodesEvaluator {
    * Actually execute the node. All the required inputs are available and at least one output is
    * required.
    */
-  void execute_node(const DNode node, NodeState &node_state)
+  void execute_node(const DNode node, NodeState &node_state, NodeTaskRunState *run_state)
   {
     const bNode &bnode = *node->bnode();
 
@@ -907,25 +932,25 @@ class GeometryNodesEvaluator {
 
     /* Use the geometry node execute callback if it exists. */
     if (bnode.typeinfo->geometry_node_execute != nullptr) {
-      this->execute_geometry_node(node, node_state);
+      this->execute_geometry_node(node, node_state, run_state);
       return;
     }
 
     /* Use the multi-function implementation if it exists. */
     const nodes::NodeMultiFunctions::Item &fn_item = params_.mf_by_node->try_get(node);
     if (fn_item.fn != nullptr) {
-      this->execute_multi_function_node(node, fn_item, node_state);
+      this->execute_multi_function_node(node, fn_item, node_state, run_state);
       return;
     }
 
-    this->execute_unknown_node(node, node_state);
+    this->execute_unknown_node(node, node_state, run_state);
   }
 
-  void execute_geometry_node(const DNode node, NodeState &node_state)
+  void execute_geometry_node(const DNode node, NodeState &node_state, NodeTaskRunState *run_state)
   {
     const bNode &bnode = *node->bnode();
 
-    NodeParamsProvider params_provider{*this, node, node_state};
+    NodeParamsProvider params_provider{*this, node, node_state, run_state};
     GeoNodeExecParams params{params_provider};
     if (node->idname().find("Legacy") != StringRef::not_found) {
       params.error_message_add(geo_log::NodeWarningType::Legacy,
@@ -944,11 +969,12 @@ class GeometryNodesEvaluator {
 
   void execute_multi_function_node(const DNode node,
                                    const nodes::NodeMultiFunctions::Item &fn_item,
-                                   NodeState &node_state)
+                                   NodeState &node_state,
+                                   NodeTaskRunState *run_state)
   {
     if (node->idname().find("Legacy") != StringRef::not_found) {
       /* Create geometry nodes params just for creating an error message. */
-      NodeParamsProvider params_provider{*this, node, node_state};
+      NodeParamsProvider params_provider{*this, node, node_state, run_state};
       GeoNodeExecParams params{params_provider};
       params.error_message_add(geo_log::NodeWarningType::Legacy,
                                TIP_("Legacy node will be removed before Blender 4.0"));
@@ -980,11 +1006,11 @@ class GeometryNodesEvaluator {
 
     if (any_input_is_field) {
       this->execute_multi_function_node__field(
-          node, fn_item, node_state, allocator, input_values, input_types);
+          node, fn_item, node_state, allocator, input_values, input_types, run_state);
     }
     else {
       this->execute_multi_function_node__value(
-          node, *fn_item.fn, node_state, allocator, input_values, input_types);
+          node, *fn_item.fn, node_state, allocator, input_values, input_types, run_state);
     }
   }
 
@@ -993,7 +1019,8 @@ class GeometryNodesEvaluator {
                                           NodeState &node_state,
                                           LinearAllocator<> &allocator,
                                           Span<const void *> input_values,
-                                          Span<const ValueOrFieldCPPType *> input_types)
+                                          Span<const ValueOrFieldCPPType *> input_types,
+                                          NodeTaskRunState *run_state)
   {
     Vector<GField> input_fields;
     for (const int i : input_values.index_range()) {
@@ -1023,7 +1050,7 @@ class GeometryNodesEvaluator {
       GField new_field{operation, output_index};
       void *buffer = allocator.allocate(cpp_type->size(), cpp_type->alignment());
       cpp_type->construct_from_field(buffer, std::move(new_field));
-      this->forward_output(socket, {cpp_type, buffer});
+      this->forward_output(socket, {cpp_type, buffer}, run_state);
       output_state.has_been_computed = true;
       output_index++;
     }
@@ -1034,7 +1061,8 @@ class GeometryNodesEvaluator {
                                           NodeState &node_state,
                                           LinearAllocator<> &allocator,
                                           Span<const void *> input_values,
-                                          Span<const ValueOrFieldCPPType *> input_types)
+                                          Span<const ValueOrFieldCPPType *> input_types,
+                                          NodeTaskRunState *run_state)
   {
     MFParamsBuilder params{fn, 1};
     for (const int i : input_values.index_range()) {
@@ -1073,14 +1101,14 @@ class GeometryNodesEvaluator {
         cont

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list