[Bf-blender-cvs] [78f28b55d39] master: Allocator: improve multi-threaded allocation performance

Jacques Lucke noreply at git.blender.org
Wed Jan 4 14:57:13 CET 2023


Commit: 78f28b55d39288926634d0cc7ceaa005937cc528
Author: Jacques Lucke
Date:   Wed Jan 4 14:55:21 2023 +0100
Branches: master
https://developer.blender.org/rB78f28b55d39288926634d0cc7ceaa005937cc528

Allocator: improve multi-threaded allocation performance

Both the guarded and the lockfree allocator keep track of current
and peak memory usage. Even the lockfree allocator used a global
atomic variable for the memory usage. When multiple threads use the
allocator at the same time, this variable is highly contended, which
can result in significant slowdowns, as shown in D16862.

While specific cases could always be optimized by reducing the number
of allocations, having this synchronization point in functions used by
almost every part of Blender is not great.

The solution is to use thread-local memory counters which are only added
together when the memory usage is actually requested. For more details,
see the in-code comments and D16862.

Differential Revision: https://developer.blender.org/D16862
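
To illustrate the idea, here is a minimal, self-contained C++ sketch of per-thread
counters that are only summed when the total is requested. The names (ThreadCounter,
track_alloc, total_mem_in_use) are illustrative and not taken from the commit; the
actual implementation lives in intern/guardedalloc/intern/memory_usage.cc and
additionally handles thread exit, block counts and peak tracking.

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <vector>

struct ThreadCounter {
  /* Atomic because other threads read it when the total is computed. */
  std::atomic<int64_t> mem_in_use{0};
};

static std::mutex g_counters_mutex;
static std::vector<ThreadCounter *> g_counters;

static ThreadCounter &local_counter()
{
  /* One counter per thread, registered once on first use. Intentionally leaked
   * in this sketch; the real code unregisters on thread exit and folds the
   * remaining counts into global fallback counters. */
  thread_local ThreadCounter *counter = [] {
    ThreadCounter *c = new ThreadCounter();
    std::lock_guard<std::mutex> lock(g_counters_mutex);
    g_counters.push_back(c);
    return c;
  }();
  return *counter;
}

void track_alloc(size_t size)
{
  /* Uncontended in the common case: each thread only touches its own counter. */
  local_counter().mem_in_use.fetch_add(int64_t(size), std::memory_order_relaxed);
}

void track_free(size_t size)
{
  /* May drive this thread's counter negative when the block was allocated by a
   * different thread; only the sum over all threads is meaningful. */
  local_counter().mem_in_use.fetch_sub(int64_t(size), std::memory_order_relaxed);
}

int64_t total_mem_in_use()
{
  /* Only this (comparatively rare) query pays for cross-thread synchronization. */
  std::lock_guard<std::mutex> lock(g_counters_mutex);
  int64_t total = 0;
  for (const ThreadCounter *c : g_counters) {
    total += c->mem_in_use.load(std::memory_order_relaxed);
  }
  return total;
}

Because each thread writes only to its own (cache-line-aligned) counter, the hot
allocation path avoids the contended global atomic entirely.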

===================================================================

M	intern/guardedalloc/CMakeLists.txt
M	intern/guardedalloc/intern/leak_detector.cc
M	intern/guardedalloc/intern/mallocn_intern.h
M	intern/guardedalloc/intern/mallocn_lockfree_impl.c
A	intern/guardedalloc/intern/memory_usage.cc
M	source/blender/makesdna/intern/CMakeLists.txt
M	source/blender/makesrna/intern/CMakeLists.txt

===================================================================

diff --git a/intern/guardedalloc/CMakeLists.txt b/intern/guardedalloc/CMakeLists.txt
index 0d16879adb5..5d766d8543d 100644
--- a/intern/guardedalloc/CMakeLists.txt
+++ b/intern/guardedalloc/CMakeLists.txt
@@ -20,6 +20,7 @@ set(SRC
   ./intern/mallocn.c
   ./intern/mallocn_guarded_impl.c
   ./intern/mallocn_lockfree_impl.c
+  ./intern/memory_usage.cc
 
   MEM_guardedalloc.h
   ./intern/mallocn_inline.h
diff --git a/intern/guardedalloc/intern/leak_detector.cc b/intern/guardedalloc/intern/leak_detector.cc
index 5b565b15920..0213376682b 100644
--- a/intern/guardedalloc/intern/leak_detector.cc
+++ b/intern/guardedalloc/intern/leak_detector.cc
@@ -53,6 +53,9 @@ class MemLeakPrinter {
 
 void MEM_init_memleak_detection()
 {
+  /* Calling this ensures that the memory usage counters outlive the memory leak detection. */
+  memory_usage_init();
+
   /**
    * This variable is constructed when this function is first called. This should happen as soon as
    * possible when the program starts.
diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h
index 1e9883f42c8..c2e9f9117bc 100644
--- a/intern/guardedalloc/intern/mallocn_intern.h
+++ b/intern/guardedalloc/intern/mallocn_intern.h
@@ -89,6 +89,14 @@ void aligned_free(void *ptr);
 extern bool leak_detector_has_run;
 extern char free_after_leak_detection_message[];
 
+void memory_usage_init(void);
+void memory_usage_block_alloc(size_t size);
+void memory_usage_block_free(size_t size);
+size_t memory_usage_block_num(void);
+size_t memory_usage_current(void);
+size_t memory_usage_peak(void);
+void memory_usage_peak_reset(void);
+
 /* Prototypes for counted allocator functions */
 size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
 void MEM_lockfree_freeN(void *vmemh);
diff --git a/intern/guardedalloc/intern/mallocn_lockfree_impl.c b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
index 5a969186b19..2c4761c74f0 100644
--- a/intern/guardedalloc/intern/mallocn_lockfree_impl.c
+++ b/intern/guardedalloc/intern/mallocn_lockfree_impl.c
@@ -30,8 +30,6 @@ typedef struct MemHeadAligned {
   size_t len;
 } MemHeadAligned;
 
-static unsigned int totblock = 0;
-static size_t mem_in_use = 0, peak_mem = 0;
 static bool malloc_debug_memset = false;
 
 static void (*error_callback)(const char *) = NULL;
@@ -46,18 +44,6 @@ enum {
 #define MEMHEAD_IS_ALIGNED(memhead) ((memhead)->len & (size_t)MEMHEAD_ALIGN_FLAG)
 #define MEMHEAD_LEN(memhead) ((memhead)->len & ~((size_t)(MEMHEAD_ALIGN_FLAG)))
 
-/* Uncomment this to have proper peak counter. */
-#define USE_ATOMIC_MAX
-
-MEM_INLINE void update_maximum(size_t *maximum_value, size_t value)
-{
-#ifdef USE_ATOMIC_MAX
-  atomic_fetch_and_update_max_z(maximum_value, value);
-#else
-  *maximum_value = value > *maximum_value ? value : *maximum_value;
-#endif
-}
-
 #ifdef __GNUC__
 __attribute__((format(printf, 1, 2)))
 #endif
@@ -103,8 +89,7 @@ void MEM_lockfree_freeN(void *vmemh)
   MemHead *memh = MEMHEAD_FROM_PTR(vmemh);
   size_t len = MEMHEAD_LEN(memh);
 
-  atomic_sub_and_fetch_u(&totblock, 1);
-  atomic_sub_and_fetch_z(&mem_in_use, len);
+  memory_usage_block_free(len);
 
   if (UNLIKELY(malloc_debug_memset && len)) {
     memset(memh + 1, 255, len);
@@ -224,16 +209,14 @@ void *MEM_lockfree_callocN(size_t len, const char *str)
 
   if (LIKELY(memh)) {
     memh->len = len;
-    atomic_add_and_fetch_u(&totblock, 1);
-    atomic_add_and_fetch_z(&mem_in_use, len);
-    update_maximum(&peak_mem, mem_in_use);
+    memory_usage_block_alloc(len);
 
     return PTR_FROM_MEMHEAD(memh);
   }
   print_error("Calloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
               SIZET_ARG(len),
               str,
-              (uint)mem_in_use);
+              (uint)memory_usage_current());
   return NULL;
 }
 
@@ -247,7 +230,7 @@ void *MEM_lockfree_calloc_arrayN(size_t len, size_t size, const char *str)
         SIZET_ARG(len),
         SIZET_ARG(size),
         str,
-        (unsigned int)mem_in_use);
+        (unsigned int)memory_usage_current());
     abort();
     return NULL;
   }
@@ -269,16 +252,14 @@ void *MEM_lockfree_mallocN(size_t len, const char *str)
     }
 
     memh->len = len;
-    atomic_add_and_fetch_u(&totblock, 1);
-    atomic_add_and_fetch_z(&mem_in_use, len);
-    update_maximum(&peak_mem, mem_in_use);
+    memory_usage_block_alloc(len);
 
     return PTR_FROM_MEMHEAD(memh);
   }
   print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
               SIZET_ARG(len),
               str,
-              (uint)mem_in_use);
+              (uint)memory_usage_current());
   return NULL;
 }
 
@@ -292,7 +273,7 @@ void *MEM_lockfree_malloc_arrayN(size_t len, size_t size, const char *str)
         SIZET_ARG(len),
         SIZET_ARG(size),
         str,
-        (uint)mem_in_use);
+        (uint)memory_usage_current());
     abort();
     return NULL;
   }
@@ -340,16 +321,14 @@ void *MEM_lockfree_mallocN_aligned(size_t len, size_t alignment, const char *str
 
     memh->len = len | (size_t)MEMHEAD_ALIGN_FLAG;
     memh->alignment = (short)alignment;
-    atomic_add_and_fetch_u(&totblock, 1);
-    atomic_add_and_fetch_z(&mem_in_use, len);
-    update_maximum(&peak_mem, mem_in_use);
+    memory_usage_block_alloc(len);
 
     return PTR_FROM_MEMHEAD(memh);
   }
   print_error("Malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
               SIZET_ARG(len),
               str,
-              (uint)mem_in_use);
+              (uint)memory_usage_current());
   return NULL;
 }
 
@@ -369,8 +348,8 @@ void MEM_lockfree_callbackmemlist(void (*func)(void *))
 
 void MEM_lockfree_printmemlist_stats(void)
 {
-  printf("\ntotal memory len: %.3f MB\n", (double)mem_in_use / (double)(1024 * 1024));
-  printf("peak memory len: %.3f MB\n", (double)peak_mem / (double)(1024 * 1024));
+  printf("\ntotal memory len: %.3f MB\n", (double)memory_usage_current() / (double)(1024 * 1024));
+  printf("peak memory len: %.3f MB\n", (double)memory_usage_peak() / (double)(1024 * 1024));
   printf(
       "\nFor more detailed per-block statistics run Blender with memory debugging command line "
       "argument.\n");
@@ -398,23 +377,23 @@ void MEM_lockfree_set_memory_debug(void)
 
 size_t MEM_lockfree_get_memory_in_use(void)
 {
-  return mem_in_use;
+  return memory_usage_current();
 }
 
 uint MEM_lockfree_get_memory_blocks_in_use(void)
 {
-  return totblock;
+  return (uint)memory_usage_block_num();
 }
 
 /* dummy */
 void MEM_lockfree_reset_peak_memory(void)
 {
-  peak_mem = mem_in_use;
+  memory_usage_peak_reset();
 }
 
 size_t MEM_lockfree_get_peak_memory(void)
 {
-  return peak_mem;
+  return memory_usage_peak();
 }
 
 #ifndef NDEBUG
diff --git a/intern/guardedalloc/intern/memory_usage.cc b/intern/guardedalloc/intern/memory_usage.cc
new file mode 100644
index 00000000000..71987ac38d9
--- /dev/null
+++ b/intern/guardedalloc/intern/memory_usage.cc
@@ -0,0 +1,258 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <iostream>
+#include <mutex>
+#include <vector>
+
+#include "MEM_guardedalloc.h"
+#include "mallocn_intern.h"
+
+#include "../../source/blender/blenlib/BLI_strict_flags.h"
+
+namespace {
+
+/**
+ * This is stored per thread. Align to cache line size to avoid false sharing.
+ */
+struct alignas(64) Local {
+  /** Helps to find bugs during program shutdown. */
+  bool destructed = false;
+  /**
+   * This is true for the first created #Local, which lives on the main thread. When the main
+   * local data is destructed, we know that Blender is quitting and that we can't rely on
+   * thread locals still being available.
+   */
+  bool is_main = false;
+  /**
+   * Number of bytes. This can be negative when e.g. one thread allocates a lot of memory, and
+   * another frees it. It has to be an atomic, because it may be accessed by other threads when the
+   * total memory usage is counted.
+   */
+  std::atomic<int64_t> mem_in_use = 0;
+  /**
+   * Number of allocated blocks. Can be negative and is atomic for the same reason as above.
+   */
+  std::atomic<int64_t> blocks_num = 0;
+  /**
+   * Amount of memory used when the peak was last updated. This is used so that we don't have to
+   * update the peak memory usage after every memory allocation. Instead it's only updated when "a
+   * lot" of new memory has been allocated. This makes the peak memory usage a little bit less
+   * accurate, but it's still good enough for practical purposes.
+   */
+  std::atomic<int64_t> mem_in_use_during_peak_update = 0;
+
+  Local();
+  ~Local();
+};
+
+/**
+ * This is a singleton that stores global data.
+ */
+struct Global {
+  /**
+   * Mutex that protects the vector below.
+   */
+  std::mutex locals_mutex;
+  /**
+   * All currently constructed #Local. This must only be accessed when the mutex above is
+   * locked. Individual threads insert and remove themselves here.
+   */
+  std::vector<Local *> locals;
+  /**
+   * Number of bytes that are not tracked by #Local. This is necessary because when a thread exits,
+   * its #Local data is freed. The memory counts stored there would be lost. The memory counts may
+   * be non-zero during thread destruction, if the thread did an unequal amount of allocations and
+   * frees (which is perfectly valid behavior as long as other threads have the responsibility to
+   * free any memory that the thread allocated).
+   *
+   * To solve this, the memory counts are added to these global counters when the thread
+   * exits. The global counters are also used when the entire process starts to exit, because the
+   * #Local data of the main thread is already destructed when the leak detection happens (during
+   * destruction of static variables, which happens after destruction of thread locals).
+   */
+  std::atomic<int64_t> mem_in_use_outside_locals = 0;
+  /**
+   * Number of blocks that are not tracked by #Local, for the same reason as above.
+   */
+  std::atomic<int64_t> blocks_num_outside_locals = 0;
+  /**
+   * Peak memory usage since the last reset.
+   */
+  std::atomic<size_t> peak = 0;
+};
+
+}  // namespace
+
+/**
+ * This is true for most of the lifetime of the program. Only when it starts exiting t

@@ Diff output truncated at 10240 characters. @@
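
The truncated part of memory_usage.cc contains the actual counting functions. As a
rough, hedged sketch of the approximate peak tracking described in the
mem_in_use_during_peak_update comment above, and of folding a thread's counts into
the global fallback counters on thread exit: the threshold value, the LocalSketch
type and the helper names below are assumptions for illustration, not the committed
code.

#include <atomic>
#include <cstddef>
#include <cstdint>

/* Simplified stand-ins for the per-thread and global state in the commit. */
struct LocalSketch {
  std::atomic<int64_t> mem_in_use{0};
  std::atomic<int64_t> mem_in_use_during_peak_update{0};
};

static std::atomic<size_t> g_peak{0};
static constexpr int64_t peak_update_threshold = 1 << 20; /* Assumed value, ~1 MiB. */

int64_t total_mem_in_use(); /* Sums all per-thread counters, as in the earlier sketch. */

void sketch_block_alloc(LocalSketch &local, size_t size)
{
  local.mem_in_use.fetch_add(int64_t(size), std::memory_order_relaxed);

  /* Only recompute the peak when this thread has allocated "a lot" since the
   * last peak update, keeping the hot path free of the global atomic max. */
  const int64_t current = local.mem_in_use.load(std::memory_order_relaxed);
  const int64_t last = local.mem_in_use_during_peak_update.load(std::memory_order_relaxed);
  if (current - last > peak_update_threshold) {
    local.mem_in_use_during_peak_update.store(current, std::memory_order_relaxed);
    const int64_t total = total_mem_in_use();
    if (total > 0) {
      /* Lock-free atomic max on the global peak. */
      size_t peak = g_peak.load(std::memory_order_relaxed);
      while (size_t(total) > peak &&
             !g_peak.compare_exchange_weak(peak, size_t(total), std::memory_order_relaxed)) {
      }
    }
  }
}

void sketch_thread_exit(LocalSketch &local, std::atomic<int64_t> &mem_in_use_outside_locals)
{
  /* When a thread exits, move its (possibly negative) count into the global
   * fallback counter so the totals stay correct after its local data is freed. */
  mem_in_use_outside_locals.fetch_add(local.mem_in_use.load(std::memory_order_relaxed),
                                      std::memory_order_relaxed);
}

This trades a small amount of peak accuracy for far fewer updates of shared state,
matching the comment in the diff that the peak becomes "a little bit less accurate,
but it's still good enough for practical purposes".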



More information about the Bf-blender-cvs mailing list