[Bf-blender-cvs] [a6eae9213f2] cycles-x: Cycles X: Multi-device re-balancing

Sergey Sharybin noreply at git.blender.org
Fri Jul 2 18:01:22 CEST 2021


Commit: a6eae9213f237650b289751df37e3e4b67360c31
Author: Sergey Sharybin
Date:   Wed Jun 30 18:19:33 2021 +0200
Branches: cycles-x
https://developer.blender.org/rBa6eae9213f237650b289751df37e3e4b67360c31

Cycles X: Multi-device re-balancing

This is an initial implementation which seems to give better
device utilization here when using two non-matched GPUs, as
well as multi-GPU and CPU.

The general idea is to balance the amount of work based on the
observed performance of the devices, and "re-slice" the big tile.

Things which are known to be not final but considered a further
development:

- The balancing algorithm might need some tweaks to the
  objective function and the weight modification to converge to
  the ideal balance quicker.

- The "re-slicing" might also be optimized memory-wise.

- Headless rendering needs to give a few iterations of smaller
  work to allow the multi-device to settle into the balance.

The balancing logic is in its own little file, which simplifies
the process of experimentation.

Differential Revision: https://developer.blender.org/D11774

===================================================================

M	intern/cycles/integrator/path_trace.cpp
M	intern/cycles/integrator/render_scheduler.cpp
M	intern/cycles/integrator/work_balancer.cpp

===================================================================

diff --git a/intern/cycles/integrator/path_trace.cpp b/intern/cycles/integrator/path_trace.cpp
index 366030df6aa..3d434c5beda 100644
--- a/intern/cycles/integrator/path_trace.cpp
+++ b/intern/cycles/integrator/path_trace.cpp
@@ -516,23 +516,56 @@ void PathTrace::update_display(const RenderWork &render_work)
 
 void PathTrace::rebalance(const RenderWork &render_work)
 {
+  static const int kLogLevel = 3;
+
+  scoped_timer timer;
+
+  const int num_works = path_trace_works_.size();
+
   if (!render_work.rebalance) {
     return;
   }
 
-  if (path_trace_works_.size() == 1) {
-    VLOG(3) << "Ignoring rebalance work due to single device render.";
+  if (num_works == 1) {
+    VLOG(kLogLevel) << "Ignoring rebalance work due to single device render.";
     return;
   }
 
-  VLOG(3) << "Perform rebalance work.";
+  if (VLOG_IS_ON(kLogLevel)) {
+    VLOG(kLogLevel) << "Perform rebalance work.";
+    VLOG(kLogLevel) << "Per-device path tracing time (seconds):";
+    for (int i = 0; i < num_works; ++i) {
+      VLOG(kLogLevel) << path_trace_works_[i]->get_device()->info.description << ": "
+                      << work_balance_infos_[i].time_spent;
+    }
+  }
+
+  const bool did_rebalance = work_balance_do_rebalance(work_balance_infos_);
 
-  if (!work_balance_do_rebalance(work_balance_infos_)) {
-    VLOG(3) << "Balance in path trace works did not change.";
+  if (VLOG_IS_ON(kLogLevel)) {
+    VLOG(kLogLevel) << "Calculated per-device weights for works:";
+    for (int i = 0; i < num_works; ++i) {
+      LOG(INFO) << path_trace_works_[i]->get_device()->info.description << ": "
+                << work_balance_infos_[i].weight;
+    }
+  }
+
+  if (!did_rebalance) {
+    VLOG(kLogLevel) << "Balance in path trace works did not change.";
     return;
   }
 
-  /* TODO(sergey): Update buffer allocation, and copy data across devices as needed. */
+  TempCPURenderBuffers big_tile_cpu_buffers(device_);
+  big_tile_cpu_buffers.buffers->reset(render_state_.effective_big_tile_params);
+
+  copy_to_render_buffers(big_tile_cpu_buffers.buffers.get());
+
+  render_state_.need_reset_params = true;
+  update_work_buffer_params_if_needed(render_work);
+
+  copy_from_render_buffers(big_tile_cpu_buffers.buffers.get());
+
+  VLOG(kLogLevel) << "Rebalance time (seconds): " << timer.get_time();
 }
 
 void PathTrace::cancel()
diff --git a/intern/cycles/integrator/render_scheduler.cpp b/intern/cycles/integrator/render_scheduler.cpp
index 14b1ba69fdd..e9586075a5b 100644
--- a/intern/cycles/integrator/render_scheduler.cpp
+++ b/intern/cycles/integrator/render_scheduler.cpp
@@ -712,7 +712,7 @@ bool RenderScheduler::work_need_rebalance()
 {
   /* This is the minimum time, as the rebalancing can not happen more often than the path trace
    * work. */
-  static const double kRebalanceIntervalInSeconds = 5;
+  static const double kRebalanceIntervalInSeconds = 1;
 
   if (state_.resolution_divider != pixel_size_) {
     /* Don't rebalance at a non-final resolution divider. Some reasons for this:
@@ -721,6 +721,10 @@ bool RenderScheduler::work_need_rebalance()
     return false;
   }
 
+  if (state_.num_rendered_samples == 1) {
+    return true;
+  }
+
   return (time_dt() - state_.last_rebalance_time) > kRebalanceIntervalInSeconds;
 }
 
diff --git a/intern/cycles/integrator/work_balancer.cpp b/intern/cycles/integrator/work_balancer.cpp
index b2a69866982..3edb8ba5598 100644
--- a/intern/cycles/integrator/work_balancer.cpp
+++ b/intern/cycles/integrator/work_balancer.cpp
@@ -16,6 +16,8 @@
 
 #include "integrator/work_balancer.h"
 
+#include "util/util_math.h"
+
 CCL_NAMESPACE_BEGIN
 
 void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
@@ -27,17 +29,108 @@ void work_balance_do_initial(vector<WorkBalanceInfo> &work_balance_infos)
     return;
   }
 
+  /* There is no statistics available, so start with an equal distribution. */
   const double weight = 1.0 / num_infos;
   for (WorkBalanceInfo &balance_info : work_balance_infos) {
     balance_info.weight = weight;
   }
 }
 
+/* Calculate time which takes for every work to complete a unit of work.
+ * The result times are normalized so that their sum is 1. */
+static vector<double> calculate_normalized_times_per_unit(
+    const vector<WorkBalanceInfo> &work_balance_infos)
+{
+  const int num_infos = work_balance_infos.size();
+
+  vector<double> times_per_unit;
+  times_per_unit.reserve(num_infos);
+
+  double total_time_per_unit = 0;
+  for (const WorkBalanceInfo &work_balance_info : work_balance_infos) {
+    /* The work did `total_work * weight`, and the time per unit is
+     * `time_spent / (total_work * weight)`. The total amount of work is not known here, but it
+     * gets cancelled out during normalization anyway.
+     *
+     * Note that in some degenerated cases (when amount of work is smaller than amount of workers)
+     * it is possible that the time and/or weight of the work is 0. */
+    const double time_per_unit = work_balance_info.weight != 0 ?
+                                     work_balance_info.time_spent / work_balance_info.weight :
+                                     0;
+    times_per_unit.push_back(time_per_unit);
+    total_time_per_unit += time_per_unit;
+  }
+
+  const double total_time_per_unit_inv = 1.0 / total_time_per_unit;
+  for (double &time_per_unit : times_per_unit) {
+    time_per_unit *= total_time_per_unit_inv;
+  }
+
+  return times_per_unit;
+}
+
+/* Calculate weights for the more ideal distribution of work.
+ * The calculation here is based on an observed performance of every worker: the amount of work
+ * scheduled is proportional to the performance of the worker. Performance of the worker is an
+ * inverse of the time-per-unit-work. */
+static vector<double> calculate_normalized_weights(
+    const vector<WorkBalanceInfo> &work_balance_infos)
+{
+  const int num_infos = work_balance_infos.size();
+
+  const vector<double> times_per_unit = calculate_normalized_times_per_unit(work_balance_infos);
+
+  vector<double> weights;
+  weights.reserve(num_infos);
+
+  double total_weight = 0;
+  for (double time_per_unit : times_per_unit) {
+    /* Note that in some degenerated cases (when amount of work is smaller than amount of workers)
+     * it is possible that the time and/or weight of the work is 0. */
+    const double weight = time_per_unit != 0 ? 1.0 / time_per_unit : 0;
+    total_weight += weight;
+    weights.push_back(weight);
+  }
+
+  const double total_weight_inv = 1.0 / total_weight;
+  for (double &weight : weights) {
+    weight *= total_weight_inv;
+  }
+
+  return weights;
+}
+
+static bool apply_new_weights(vector<WorkBalanceInfo> &work_balance_infos,
+                              const vector<double> &new_weights)
+{
+  const int num_infos = work_balance_infos.size();
+
+  bool has_big_difference = false;
+  for (int i = 0; i < num_infos; ++i) {
+    /* Apparently, there is no `ccl::fabs()`. */
+    if (std::fabs(work_balance_infos[i].weight - new_weights[i]) > 0.02) {
+      has_big_difference = true;
+    }
+  }
+
+  if (!has_big_difference) {
+    return false;
+  }
+
+  for (int i = 0; i < num_infos; ++i) {
+    WorkBalanceInfo &info = work_balance_infos[i];
+    info.weight = new_weights[i];
+    info.time_spent = 0;
+  }
+
+  return true;
+}
+
 bool work_balance_do_rebalance(vector<WorkBalanceInfo> &work_balance_infos)
 {
-  /* TODO(sergey): Needs implementation. */
-  (void)work_balance_infos;
-  return false;
+  const vector<double> new_weights = calculate_normalized_weights(work_balance_infos);
+
+  return apply_new_weights(work_balance_infos, new_weights);
 }
 
 CCL_NAMESPACE_END



More information about the Bf-blender-cvs mailing list