[Bf-blender-cvs] [e50f1ddc654] master: Cycles: use TBB for task pools and task scheduler
Brecht Van Lommel
noreply at git.blender.org
Mon Jun 22 13:28:15 CEST 2020
Commit: e50f1ddc6540680d2aafc1c76f8339d69350f84a
Author: Brecht Van Lommel
Date: Fri Jun 5 16:39:57 2020 +0200
Branches: master
https://developer.blender.org/rBe50f1ddc6540680d2aafc1c76f8339d69350f84a
Cycles: use TBB for task pools and task scheduler
No significant performance improvement is expected, but it means we have a
single thread pool throughout Blender. And it should make adding more
parallellization in the future easier.
After previous refactoring commits this is basically a drop-in replacement.
One difference is that the task pool had a mechanism for scheduling tasks to
the front of the queue to minimize memory usage. TBB has a smarter algorithm
to balance depth-first and breadth-first scheduling of tasks and we assume that
removes the need to manually provide hints to the scheduler.
Fixes T77533
===================================================================
M intern/cycles/bvh/bvh_build.cpp
M intern/cycles/bvh/bvh_sort.cpp
M intern/cycles/device/device.cpp
M intern/cycles/device/device.h
M intern/cycles/util/util_task.cpp
M intern/cycles/util/util_task.h
===================================================================
diff --git a/intern/cycles/bvh/bvh_build.cpp b/intern/cycles/bvh/bvh_build.cpp
index 116576b101d..0235ac33c77 100644
--- a/intern/cycles/bvh/bvh_build.cpp
+++ b/intern/cycles/bvh/bvh_build.cpp
@@ -626,8 +626,8 @@ BVHNode *BVHBuild::build_node(const BVHObjectBinning &range, int level)
/* Threaded build */
inner = new InnerNode(bounds);
- task_pool.push([=] { thread_build_node(inner, 0, left, level + 1); }, true);
- task_pool.push([=] { thread_build_node(inner, 1, right, level + 1); }, true);
+ task_pool.push([=] { thread_build_node(inner, 0, left, level + 1); });
+ task_pool.push([=] { thread_build_node(inner, 1, right, level + 1); });
}
if (do_unalinged_split) {
@@ -742,16 +742,12 @@ BVHNode *BVHBuild::build_node(const BVHRange &range,
/* Create tasks for left and right nodes, using copy for most arguments and
* move for reference to avoid memory copies. */
- task_pool.push(
- [=, refs = std::move(left_references)]() mutable {
- thread_build_spatial_split_node(inner, 0, left, refs, level + 1);
- },
- true);
- task_pool.push(
- [=, refs = std::move(right_references)]() mutable {
- thread_build_spatial_split_node(inner, 1, right, refs, level + 1);
- },
- true);
+ task_pool.push([=, refs = std::move(left_references)]() mutable {
+ thread_build_spatial_split_node(inner, 0, left, refs, level + 1);
+ });
+ task_pool.push([=, refs = std::move(right_references)]() mutable {
+ thread_build_spatial_split_node(inner, 1, right, refs, level + 1);
+ });
}
if (do_unalinged_split) {
diff --git a/intern/cycles/bvh/bvh_sort.cpp b/intern/cycles/bvh/bvh_sort.cpp
index 5bdded354bc..b01785b547a 100644
--- a/intern/cycles/bvh/bvh_sort.cpp
+++ b/intern/cycles/bvh/bvh_sort.cpp
@@ -147,7 +147,7 @@ static void bvh_reference_sort_threaded(TaskPool *task_pool,
if (left < end) {
if (start < right) {
task_pool->push(
- function_bind(bvh_reference_sort_threaded, task_pool, data, left, end, compare), true);
+ function_bind(bvh_reference_sort_threaded, task_pool, data, left, end, compare));
}
else {
start = left;
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 41dd7894d93..263d3d24b13 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -77,7 +77,7 @@ std::ostream &operator<<(std::ostream &os, const DeviceRequestedFeatures &reques
/* Device */
-Device::~Device()
+Device::~Device() noexcept(false)
{
if (!background) {
if (vertex_buffer != 0) {
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index dff981080a5..67828103394 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -319,7 +319,8 @@ class Device {
virtual void mem_free_sub_ptr(device_ptr /*ptr*/){};
public:
- virtual ~Device();
+ /* noexcept needed to silence TBB warning. */
+ virtual ~Device() noexcept(false);
/* info */
DeviceInfo info;
diff --git a/intern/cycles/util/util_task.cpp b/intern/cycles/util/util_task.cpp
index eb07ec0bfa0..4fb61392e92 100644
--- a/intern/cycles/util/util_task.cpp
+++ b/intern/cycles/util/util_task.cpp
@@ -20,28 +20,12 @@
#include "util/util_system.h"
#include "util/util_time.h"
-//#define THREADING_DEBUG_ENABLED
-
-#ifdef THREADING_DEBUG_ENABLED
-# include <stdio.h>
-# define THREADING_DEBUG(...) \
- do { \
- printf(__VA_ARGS__); \
- fflush(stdout); \
- } while (0)
-#else
-# define THREADING_DEBUG(...)
-#endif
-
CCL_NAMESPACE_BEGIN
/* Task Pool */
-TaskPool::TaskPool()
+TaskPool::TaskPool() : start_time(time_dt()), num_tasks_handled(0)
{
- num_tasks_handled = 0;
- num = 0;
- do_cancel = false;
}
TaskPool::~TaskPool()
@@ -49,66 +33,15 @@ TaskPool::~TaskPool()
cancel();
}
-void TaskPool::push(TaskRunFunction &&task, bool front)
+void TaskPool::push(TaskRunFunction &&task)
{
- TaskScheduler::Entry entry;
-
- entry.task = new TaskRunFunction(std::move(task));
- entry.pool = this;
-
- TaskScheduler::push(entry, front);
+ tbb_group.run(std::move(task));
+ num_tasks_handled++;
}
void TaskPool::wait_work(Summary *stats)
{
- thread_scoped_lock num_lock(num_mutex);
-
- while (num != 0) {
- num_lock.unlock();
-
- thread_scoped_lock queue_lock(TaskScheduler::queue_mutex);
-
- /* find task from this pool. if we get a task from another pool,
- * we can get into deadlock */
- TaskScheduler::Entry work_entry;
- bool found_entry = false;
- list<TaskScheduler::Entry>::iterator it;
-
- for (it = TaskScheduler::queue.begin(); it != TaskScheduler::queue.end(); it++) {
- TaskScheduler::Entry &entry = *it;
-
- if (entry.pool == this) {
- work_entry = entry;
- found_entry = true;
- TaskScheduler::queue.erase(it);
- break;
- }
- }
-
- queue_lock.unlock();
-
- /* if found task, do it, otherwise wait until other tasks are done */
- if (found_entry) {
- /* run task */
- (*work_entry.task)();
-
- /* delete task */
- delete work_entry.task;
-
- /* notify pool task was done */
- num_decrease(1);
- }
-
- num_lock.lock();
- if (num == 0)
- break;
-
- if (!found_entry) {
- THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::wait_work !found_entry\n", num);
- num_cond.wait(num_lock);
- THREADING_DEBUG("num==%d, condition wait done in TaskPool::wait_work !found_entry\n", num);
- }
- }
+ tbb_group.wait();
if (stats != NULL) {
stats->time_total = time_dt() - start_time;
@@ -118,180 +51,21 @@ void TaskPool::wait_work(Summary *stats)
void TaskPool::cancel()
{
- do_cancel = true;
-
- TaskScheduler::clear(this);
-
- {
- thread_scoped_lock num_lock(num_mutex);
-
- while (num) {
- THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::cancel\n", num);
- num_cond.wait(num_lock);
- THREADING_DEBUG("num==%d condition wait done in TaskPool::cancel\n", num);
- }
- }
-
- do_cancel = false;
+ tbb_group.cancel();
+ tbb_group.wait();
}
bool TaskPool::canceled()
{
- return do_cancel;
-}
-
-void TaskPool::num_decrease(int done)
-{
- num_mutex.lock();
- num -= done;
-
- assert(num >= 0);
- if (num == 0) {
- THREADING_DEBUG("num==%d, notifying all in TaskPool::num_decrease\n", num);
- num_cond.notify_all();
- }
-
- num_mutex.unlock();
-}
-
-void TaskPool::num_increase()
-{
- thread_scoped_lock num_lock(num_mutex);
- if (num_tasks_handled == 0) {
- start_time = time_dt();
- }
- num++;
- num_tasks_handled++;
- THREADING_DEBUG("num==%d, notifying all in TaskPool::num_increase\n", num);
- num_cond.notify_all();
+ return tbb_group.is_canceling();
}
/* Task Scheduler */
thread_mutex TaskScheduler::mutex;
int TaskScheduler::users = 0;
-vector<thread *> TaskScheduler::threads;
-bool TaskScheduler::do_exit = false;
-
-list<TaskScheduler::Entry> TaskScheduler::queue;
-thread_mutex TaskScheduler::queue_mutex;
-thread_condition_variable TaskScheduler::queue_cond;
-
-namespace {
-
-/* Get number of processors on each of the available nodes. The result is sized
- * by the highest node index, and element corresponds to number of processors on
- * that node.
- * If node is not available, then the corresponding number of processors is
- * zero. */
-void get_per_node_num_processors(vector<int> *num_per_node_processors)
-{
- const int num_nodes = system_cpu_num_numa_nodes();
- if (num_nodes == 0) {
- LOG(ERROR) << "Zero available NUMA nodes, is not supposed to happen.";
- return;
- }
- num_per_node_processors->resize(num_nodes);
- for (int node = 0; node < num_nodes; ++node) {
- if (!system_cpu_is_numa_node_available(node)) {
- (*num_per_node_processors)[node] = 0;
- continue;
- }
- (*num_per_node_processors)[node] = system_cpu_num_numa_node_processors(node);
- }
-}
-
-/* Calculate total number of processors on all available nodes.
- * This is similar to system_cpu_thread_count(), but uses pre-calculated number
- * of processors on each of the node, avoiding extra system calls and checks for
- * the node availability. */
-int get_num_total_processors(const vector<int> &num_per_node_processors)
-{
- int num_total_processors = 0;
- foreach (int num_node_processors, num_per_node_processors) {
- num_total_processors += num_node_processors;
- }
- return num_total_processors;
-}
-
-/* Compute NUMA node for every thread to run on, for the best performance. */
-vector<int> distribute_threads_on_nodes(const int num_threads)
-{
- /* Start with all threads unassigned to any specific NUMA node. */
- vector<int> thread_nodes(num_threads, -1);
- const int num_active_group_processors = system_cpu_num_active_group_processors();
- VLOG(1) << "Detected " << num_active_group_processors << " processors "
- << "in active group.";
- if (num_active_group_processors >= num_threads) {
- /* If the current thread is set up in a way that its affinity allows to
- * use at least requested number of threads we do not explicitly set
- * affinity to the worker threads.
- * This way we allow users to manually edit affinity of the parent
- * thread, and here we follow that affinity. This way it's possible to
- * have two Cycles/Blender instances running manually set to a different
- * dies on a CPU. */
- VLOG(1) << "Not setting thread group affinity.";
- return thread_nodes;
- }
- vector<int> num_per_node_processors;
- get_per_node_num_processors(&num_per_node_processors);
- if (num_per_node_processors.size() == 0) {
- /* Error was already reported, here we can't do anything, so we simply
- * leave default affinity to all the worker threads. */
- return thread_nodes;
- }
- const int num_nodes = num_per_node_processors.size();
- int thread_index = 0;
- /* First pass: fill in all the nodes to their maximum.
- *
- * If there is less threads than the overall nodes capacity, some of the
- * nodes or parts of them will idle.
- *
- * TODO(sergey): Consider picking up fastest nodes if number of threads
- * fits on them. For example, on Thread
@@ Diff output truncated at 10240 characters. @@
More information about the Bf-blender-cvs
mailing list