[Bf-blender-cvs] [c3d2a10e505] asset-browser-poselib: Refactor: Draw Cache: use 'BLI_task_parallel_range'

Germano Cavalcante noreply at git.blender.org
Fri Jun 11 16:38:00 CEST 2021


Commit: c3d2a10e50584de876a5d27aa0c38329b768ac7a
Author: Germano Cavalcante
Date:   Thu Jun 10 11:01:36 2021 -0300
Branches: asset-browser-poselib
https://developer.blender.org/rBc3d2a10e50584de876a5d27aa0c38329b768ac7a

Refactor: Draw Cache: use 'BLI_task_parallel_range'

This is an adaptation of {D11488}.

A disadvantage of manually setting the iteration ranges per thread is
that we don't know how many threads are already running in the
background, so we don't know how best to distribute the ranges.

To solve this limitation we can use `parallel_reduce` and thus let the
driver choose the best distribution of ranges among the threads.

This proved to be especially beneficial for computers with few cores.

**Benchmarking:**
Here's the result on a 4-core laptop:
||master:|PATCH:
|---|---|---|
|large_mesh_editing:|Average: 5.203638 FPS|Average: 5.398925 FPS
||rdata 15ms iter 43ms (frame 193ms)|rdata 14ms iter 36ms (frame 187ms)

Here's the result on an 8-core PC:
||master:|PATCH:
|---|---|---|
|large_mesh_editing:|Average: 15.267482 FPS|Average: 15.906881 FPS
||rdata 9ms iter 28ms (frame 65ms)|rdata 9ms iter 25ms (frame 63ms)
|large_mesh_editing_ledge: |Average: 15.145966 FPS|Average: 15.520474 FPS
||rdata 9ms iter 29ms (frame 65ms)|rdata 9ms iter 25ms (frame 64ms)
|looptris_test:|Average: 4.001917 FPS|Average: 4.061105 FPS
||rdata 12ms iter 90ms (frame 236ms)|rdata 12ms iter 87ms (frame 230ms)
|subdiv_mesh_cage_and_final:|Average: 1.917769 FPS|Average: 1.971790 FPS
||rdata 7ms iter 37ms (frame 261ms)|rdata 7ms iter 31ms (frame 258ms)
||rdata 7ms iter 38ms (frame 252ms)|rdata 7ms iter 33ms (frame 249ms)
|subdiv_mesh_final_only:|Average: 6.387240 FPS|Average: 6.591251 FPS
||rdata 3ms iter 25ms (frame 151ms)|rdata 3ms iter 16ms (frame 145ms)
|subdiv_mesh_final_only_ledge:|Average: 6.247393 FPS|Average: 6.596024 FPS
||rdata 3ms iter 26ms (frame 158ms)|rdata 3ms iter 16ms (frame 148ms)

**Notes:**
- The improvement can only be noticed if all extracts are multithreaded.
- This patch touches different areas of the code, so it can be split into another patch if the idea is accepted.

These screenshots show how the threads behave on a quad-core machine:
Master:
{F10164664}
Patch:
{F10164666}

Differential Revision: https://developer.blender.org/D11558

===================================================================

M	source/blender/draw/intern/draw_cache_extract_mesh.cc
M	source/blender/draw/intern/draw_cache_extract_mesh_extractors.c
M	source/blender/draw/intern/draw_cache_extract_mesh_private.h
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_edituv.cc
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_fdots.cc
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_lines.cc
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_lines_adjacency.cc
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_lines_paint_mask.cc
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_points.cc
M	source/blender/draw/intern/mesh_extractors/extract_mesh_ibo_tris.cc
M	source/blender/gpu/GPU_index_buffer.h
M	source/blender/gpu/intern/gpu_index_buffer.cc

===================================================================

diff --git a/source/blender/draw/intern/draw_cache_extract_mesh.cc b/source/blender/draw/intern/draw_cache_extract_mesh.cc
index 6b5877e6759..c6303d541b3 100644
--- a/source/blender/draw/intern/draw_cache_extract_mesh.cc
+++ b/source/blender/draw/intern/draw_cache_extract_mesh.cc
@@ -50,7 +50,7 @@
 #  include "PIL_time_utildefines.h"
 #endif
 
-#define CHUNK_SIZE 8192
+#define CHUNK_SIZE 1024
 
 namespace blender::draw {
 
@@ -65,26 +65,12 @@ struct ExtractorRunData {
   const MeshExtract *extractor;
   /* During iteration the VBO/IBO that is being build. */
   void *buffer = nullptr;
-  /* User data during iteration. Created in MeshExtract.init and passed along to other MeshExtract
-   * functions. */
-  void *user_data = nullptr;
-  std::optional<Array<void *>> task_user_datas;
+  uint32_t data_offset = 0;
 
   ExtractorRunData(const MeshExtract *extractor) : extractor(extractor)
   {
   }
 
-  void init_task_user_datas(const TaskLen task_len)
-  {
-    task_user_datas = Array<void *>(task_len);
-  }
-
-  void *&operator[](const TaskId task_id)
-  {
-    BLI_assert(task_user_datas);
-    return (*task_user_datas)[task_id];
-  }
-
 #ifdef WITH_CXX_GUARDEDALLOC
   MEM_CXX_CLASS_ALLOC_FUNCS("DRAW:ExtractorRunData")
 #endif
@@ -157,6 +143,16 @@ class ExtractorRunDatas : public Vector<ExtractorRunData> {
     return data_type;
   }
 
+  size_t data_size_total()
+  {
+    size_t data_size = 0;
+    for (const ExtractorRunData &data : *this) {
+      const MeshExtract *extractor = data.extractor;
+      data_size += extractor->data_size;
+    }
+    return data_size;
+  }
+
 #ifdef WITH_CXX_GUARDEDALLOC
   MEM_CXX_CLASS_ALLOC_FUNCS("DRAW:ExtractorRunDatas")
 #endif
@@ -165,446 +161,334 @@ class ExtractorRunDatas : public Vector<ExtractorRunData> {
 /** \} */
 
 /* ---------------------------------------------------------------------- */
-/** \name Extract
+/** \name ExtractTaskData
+ * \{ */
+struct ExtractTaskData {
+  const MeshRenderData *mr = nullptr;
+  MeshBatchCache *cache = nullptr;
+  ExtractorRunDatas *extractors = nullptr;
+  MeshBufferCache *mbc = nullptr;
+
+  eMRIterType iter_type;
+  bool use_threading = false;
+
+  ExtractTaskData(const MeshRenderData *mr,
+                  struct MeshBatchCache *cache,
+                  ExtractorRunDatas *extractors,
+                  MeshBufferCache *mbc,
+                  const bool use_threading)
+      : mr(mr), cache(cache), extractors(extractors), mbc(mbc), use_threading(use_threading)
+  {
+    iter_type = extractors->iter_types();
+  };
+
+  ExtractTaskData(const ExtractTaskData &src) = default;
+
+  ~ExtractTaskData()
+  {
+    delete extractors;
+  }
+
+#ifdef WITH_CXX_GUARDEDALLOC
+  MEM_CXX_CLASS_ALLOC_FUNCS("DRW:ExtractTaskData")
+#endif
+};
+
+static void extract_task_data_free(void *data)
+{
+  ExtractTaskData *task_data = static_cast<ExtractTaskData *>(data);
+  delete task_data;
+}
+
+/** \} */
+
+/* ---------------------------------------------------------------------- */
+/** \name Extract Init and Finish
  * \{ */
 
 BLI_INLINE void extract_init(const MeshRenderData *mr,
                              struct MeshBatchCache *cache,
                              ExtractorRunDatas &extractors,
-                             MeshBufferCache *mbc)
+                             MeshBufferCache *mbc,
+                             void *data_stack)
 {
-  /* Multi thread. */
+  uint32_t data_offset = 0;
   for (ExtractorRunData &run_data : extractors) {
     const MeshExtract *extractor = run_data.extractor;
     run_data.buffer = mesh_extract_buffer_get(extractor, mbc);
-    run_data.user_data = extractor->init(mr, cache, run_data.buffer);
+    run_data.data_offset = data_offset;
+    extractor->init(mr, cache, run_data.buffer, POINTER_OFFSET(data_stack, data_offset));
+    data_offset += (uint32_t)extractor->data_size;
   }
 }
 
-BLI_INLINE void extract_iter_looptri_bm(const MeshRenderData *mr,
-                                        const ExtractTriBMesh_Params *params,
-                                        const ExtractorRunDatas &all_extractors,
-                                        const TaskId task_id)
+BLI_INLINE void extract_finish(const MeshRenderData *mr,
+                               struct MeshBatchCache *cache,
+                               const ExtractorRunDatas &extractors,
+                               void *data_stack)
 {
-  ExtractorRunDatas extractors;
-  all_extractors.filter_into(extractors, MR_ITER_LOOPTRI);
-
-  EXTRACT_TRIS_LOOPTRI_FOREACH_BM_BEGIN(elt, elt_index, params)
-  {
-    for (ExtractorRunData &run_data : extractors) {
-      run_data.extractor->iter_looptri_bm(mr, elt, elt_index, run_data[task_id]);
+  for (const ExtractorRunData &run_data : extractors) {
+    const MeshExtract *extractor = run_data.extractor;
+    if (extractor->finish) {
+      extractor->finish(
+          mr, cache, run_data.buffer, POINTER_OFFSET(data_stack, run_data.data_offset));
     }
   }
-  EXTRACT_TRIS_LOOPTRI_FOREACH_BM_END;
 }
 
-BLI_INLINE void extract_iter_looptri_mesh(const MeshRenderData *mr,
-                                          const ExtractTriMesh_Params *params,
-                                          const ExtractorRunDatas &all_extractors,
-                                          const TaskId task_id)
-{
+/** \} */
+
+/* ---------------------------------------------------------------------- */
+/** \name Extract In Parallel Ranges
+ * \{ */
+
+struct ExtractorIterData {
   ExtractorRunDatas extractors;
-  all_extractors.filter_into(extractors, MR_ITER_LOOPTRI);
+  const MeshRenderData *mr = nullptr;
+  const void *elems = nullptr;
+  const int *loose_elems = nullptr;
 
-  EXTRACT_TRIS_LOOPTRI_FOREACH_MESH_BEGIN(mlt, mlt_index, params)
-  {
-    for (ExtractorRunData &run_data : extractors) {
-      run_data.extractor->iter_looptri_mesh(mr, mlt, mlt_index, run_data[task_id]);
+#ifdef WITH_CXX_GUARDEDALLOC
+  MEM_CXX_CLASS_ALLOC_FUNCS("DRW:MeshRenderDataUpdateTaskData")
+#endif
+};
+
+static void extract_task_reduce(const void *__restrict userdata,
+                                void *__restrict chunk_to,
+                                void *__restrict chunk_from)
+{
+  const ExtractorIterData *data = static_cast<const ExtractorIterData *>(userdata);
+  for (const ExtractorRunData &run_data : data->extractors) {
+    const MeshExtract *extractor = run_data.extractor;
+    if (extractor->task_reduce) {
+      extractor->task_reduce(POINTER_OFFSET(chunk_to, run_data.data_offset),
+                             POINTER_OFFSET(chunk_from, run_data.data_offset));
     }
   }
-  EXTRACT_TRIS_LOOPTRI_FOREACH_MESH_END;
 }
 
-BLI_INLINE void extract_iter_poly_bm(const MeshRenderData *mr,
-                                     const ExtractPolyBMesh_Params *params,
-                                     const ExtractorRunDatas &all_extractors,
-                                     const TaskId task_id)
+static void extract_range_iter_looptri_bm(void *__restrict userdata,
+                                          const int iter,
+                                          const TaskParallelTLS *__restrict tls)
 {
-  ExtractorRunDatas extractors;
-  all_extractors.filter_into(extractors, MR_ITER_POLY);
-
-  EXTRACT_POLY_FOREACH_BM_BEGIN(f, f_index, params, mr)
-  {
-    for (ExtractorRunData &run_data : extractors) {
-      run_data.extractor->iter_poly_bm(mr, f, f_index, run_data[task_id]);
-    }
+  const ExtractorIterData *data = static_cast<ExtractorIterData *>(userdata);
+  void *extract_data = tls->userdata_chunk;
+  const MeshRenderData *mr = data->mr;
+  BMLoop **elt = ((BMLoop * (*)[3]) data->elems)[iter];
+  for (const ExtractorRunData &run_data : data->extractors) {
+    run_data.extractor->iter_looptri_bm(
+        mr, elt, iter, POINTER_OFFSET(extract_data, run_data.data_offset));
   }
-  EXTRACT_POLY_FOREACH_BM_END;
 }
 
-BLI_INLINE void extract_iter_poly_mesh(const MeshRenderData *mr,
-                                       const ExtractPolyMesh_Params *params,
-                                       const ExtractorRunDatas &all_extractors,
-                                       const TaskId task_id)
+static void extract_range_iter_looptri_mesh(void *__restrict userdata,
+                                            const int iter,
+                                            const TaskParallelTLS *__restrict tls)
 {
-  ExtractorRunDatas extractors;
-  all_extractors.filter_into(extractors, MR_ITER_POLY);
+  void *extract_data = tls->userdata_chunk;
 
-  EXTRACT_POLY_FOREACH_MESH_BEGIN(mp, mp_index, params, mr)
-  {
-    for (ExtractorRunData &run_data : extractors) {
-      run_data.extractor->iter_poly_mesh(mr, mp, mp_index, run_data[task_id]);
-    }
+  const ExtractorIterData *data = static_cast<ExtractorIterData *>(userdata);
+  const MeshRenderData *mr = data->mr;
+  const MLoopTri *mlt = &((const MLoopTri *)data->elems)[iter];
+  for (const ExtractorRunData &run_data : data->extractors) {
+    run_data.extractor->iter_looptri_mesh(
+        mr, mlt, iter, POINTER_OFFSET(extract_data, run_data.data_offset));
   }
-  EXTRACT_POLY_FOREACH_MESH_END;
 }
 
-BLI_INLINE void extract_iter_ledge_bm(const MeshRenderData *mr,
-                                      const ExtractLEdgeBMesh_Params *params,
-                                      const ExtractorRunDatas &all_extractors,
-                                      const TaskId task_id)
+static void extract_range_iter_poly_bm(void *__restrict userdata,
+                                       const int iter,
+                                       const TaskParallelTLS *__restrict tls)
 {
-  ExtractorRunDatas extractors;
-  all_extractors.filter_into(extractors, MR_ITER_LEDGE);
+  void *extract_data = tls->userdata_chunk;
 
-  EXTRACT_LEDGE_FOREACH_BM_BEGIN(eed, ledge_index, params)
-  {
-    for (ExtractorRunData &run_data : extractors) {
-      run_data.extractor->iter_ledge_bm(mr, eed, ledge_index, run_data[task_id]);
-    }
+  const ExtractorIterData *data = static_cast<ExtractorIterData *>(userdata);
+  const MeshRenderData *mr = data->mr;
+  const BMFace *f = ((const BMFace **)data->elems)[iter];
+  for (const ExtractorRunData &run_data : data->extractors) {
+    run_data.extracto

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list