[Bf-blender-cvs] [a87fb34] master: Use advantage of SSE2 instructions in gaussian blur node

Sergey Sharybin noreply at git.blender.org
Fri Jun 13 20:39:28 CEST 2014


Commit: a87fb34edaf1a10f5527b6dc8a506a1c9ecbc683
Author: Sergey Sharybin
Date:   Sat Jun 14 00:30:13 2014 +0600
https://developer.blender.org/rBa87fb34edaf1a10f5527b6dc8a506a1c9ecbc683

Use advantage of SSE2 instructions in gaussian blur node

This gives around a 30% speedup for the gaussian blur node.

The implementation inside the node itself is pretty much
straightforward, but some additional things needed to be implemented:

- Aligned malloc. It's needed to load data into SSE registers
  faster. It is based on the aligned_malloc() from Libmv, with
  some additional trickery going on to support arbitrary
  alignment (this magic is needed because of MemHead).

  In practice only 16-byte alignment is supported because
  of the lack of an aligned malloc with arbitrary alignment
  on OSX. Not a big deal for now because we only need 16-byte
  alignment at this moment. Could be tweaked further
  later.

- Memory buffers in the compositor are now aligned to 16 bytes.
  This should be harmless for non-SSE cases too; just mentioning it.

Reviewers: campbellbarton, lukastoenne, jbakker

Reviewed By: campbellbarton

CC: lockal

Differential Revision: https://developer.blender.org/D564

===================================================================

M	intern/guardedalloc/MEM_guardedalloc.h
M	intern/guardedalloc/intern/mallocn.c
M	intern/guardedalloc/intern/mallocn_guarded_impl.c
M	intern/guardedalloc/intern/mallocn_intern.h
M	intern/guardedalloc/intern/mallocn_lockfree_impl.c
M	source/blender/compositor/intern/COM_MemoryBuffer.cpp
M	source/blender/compositor/operations/COM_BlurBaseOperation.cpp
M	source/blender/compositor/operations/COM_BlurBaseOperation.h
M	source/blender/compositor/operations/COM_GaussianXBlurOperation.cpp
M	source/blender/compositor/operations/COM_GaussianXBlurOperation.h
M	source/blender/compositor/operations/COM_GaussianYBlurOperation.cpp
M	source/blender/compositor/operations/COM_GaussianYBlurOperation.h

===================================================================

diff --git a/intern/guardedalloc/MEM_guardedalloc.h b/intern/guardedalloc/MEM_guardedalloc.h
index 4fb6896..8c5ad77 100644
--- a/intern/guardedalloc/MEM_guardedalloc.h
+++ b/intern/guardedalloc/MEM_guardedalloc.h
@@ -120,6 +120,12 @@ extern "C" {
 	extern void *(*MEM_mallocN)(size_t len, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(2);
 
 	/**
+	 * Allocate an aligned block of memory of size len, with tag name str. The
+	 * name must be a static, because only a pointer to it is stored !
+	 * */
+	extern void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) /* ATTR_MALLOC */ ATTR_WARN_UNUSED_RESULT ATTR_ALLOC_SIZE(1) ATTR_NONNULL(3);
+
+	/**
 	 * Same as callocN, clears memory and uses mmap (disk cached) if supported.
 	 * Can be free'd with MEM_freeN as usual.
 	 * */
diff --git a/intern/guardedalloc/intern/mallocn.c b/intern/guardedalloc/intern/mallocn.c
index e85fba7..b0d252c 100644
--- a/intern/guardedalloc/intern/mallocn.c
+++ b/intern/guardedalloc/intern/mallocn.c
@@ -41,6 +41,7 @@ void *(*MEM_reallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfre
 void *(*MEM_recallocN_id)(void *vmemh, size_t len, const char *str) = MEM_lockfree_recallocN_id;
 void *(*MEM_callocN)(size_t len, const char *str) = MEM_lockfree_callocN;
 void *(*MEM_mallocN)(size_t len, const char *str) = MEM_lockfree_mallocN;
+void *(*MEM_mallocN_aligned)(size_t len, size_t alignment, const char *str) = MEM_lockfree_mallocN_aligned;
 void *(*MEM_mapallocN)(size_t len, const char *str) = MEM_lockfree_mapallocN;
 void (*MEM_printmemlist_pydict)(void) = MEM_lockfree_printmemlist_pydict;
 void (*MEM_printmemlist)(void) = MEM_lockfree_printmemlist;
@@ -60,6 +61,40 @@ uintptr_t (*MEM_get_peak_memory)(void) = MEM_lockfree_get_peak_memory;
 const char *(*MEM_name_ptr)(void *vmemh) = MEM_lockfree_name_ptr;
 #endif
 
+void *aligned_malloc(size_t size, size_t alignment)
+{
+#ifdef _WIN32
+	return _aligned_malloc(size, alignment);
+#elif defined(__APPLE__)
+	/* On Mac OS X, both the heap and the stack are guaranteed 16-byte aligned so
+	 * they work natively with SSE types with no further work.
+	 */
+	assert(alignment == 16);
+	return malloc(size);
+#elif defined(__FreeBSD__) || defined(__NetBSD__)
+	void *result;
+
+	if (posix_memalign(&result, alignment, size)) {
+		/* non-zero means allocation error
+		 * either no allocation or bad alignment value
+		 */
+		return NULL;
+	}
+	return result;
+#else  /* This is for Linux. */
+	return memalign(alignment, size);
+#endif
+}
+
+void aligned_free(void *ptr)
+{
+#ifdef _WIN32
+	_aligned_free(ptr);
+#else
+	free(ptr);
+#endif
+}
+
 void MEM_use_guarded_allocator(void)
 {
 	MEM_allocN_len = MEM_guarded_allocN_len;
@@ -69,6 +104,7 @@ void MEM_use_guarded_allocator(void)
 	MEM_recallocN_id = MEM_guarded_recallocN_id;
 	MEM_callocN = MEM_guarded_callocN;
 	MEM_mallocN = MEM_guarded_mallocN;
+	MEM_mallocN_aligned = MEM_guarded_mallocN_aligned;
 	MEM_mapallocN = MEM_guarded_mapallocN;
 	MEM_printmemlist_pydict = MEM_guarded_printmemlist_pydict;
 	MEM_printmemlist = MEM_guarded_printmemlist;
diff --git a/intern/guardedalloc/intern/mallocn_guarded_impl.c b/intern/guardedalloc/intern/mallocn_guarded_impl.c
index 172c79d5..206390e 100644
--- a/intern/guardedalloc/intern/mallocn_guarded_impl.c
+++ b/intern/guardedalloc/intern/mallocn_guarded_impl.c
@@ -113,7 +113,10 @@ typedef struct MemHead {
 	const char *name;
 	const char *nextname;
 	int tag2;
-	int mmap;  /* if true, memory was mmapped */
+	short mmap;  /* if true, memory was mmapped */
+	short alignment;  /* if non-zero aligned alloc was used
+	                   * and alignment is stored here.
+	                   */
 #ifdef DEBUG_MEMCOUNTER
 	int _count;
 #endif
@@ -128,6 +131,8 @@ typedef struct MemHead {
 #endif
 } MemHead;
 
+typedef MemHead MemHeadAligned;
+
 /* for openmp threading asserts, saves time troubleshooting
  * we may need to extend this if blender code starts using MEM_
  * functions inside OpenMP correctly with omp_set_lock() */
@@ -187,7 +192,7 @@ static const char *check_memlist(MemHead *memh);
 
 #define MEMNEXT(x) \
 	((MemHead *)(((char *) x) - ((char *) &(((MemHead *)0)->next))))
-	
+
 /* --------------------------------------------------------------------- */
 /* vars                                                                  */
 /* --------------------------------------------------------------------- */
@@ -325,10 +330,12 @@ void *MEM_guarded_dupallocN(const void *vmemh)
 		memh--;
 
 #ifndef DEBUG_MEMDUPLINAME
-		if (memh->mmap)
+		if (UNLIKELY(memh->mmap))
+			newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc");
+		else if (LIKELY(memh->alignment == 0))
 			newp = MEM_guarded_mapallocN(memh->len, "dupli_mapalloc");
 		else
-			newp = MEM_guarded_mallocN(memh->len, "dupli_alloc");
+			newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, "dupli_alloc");
 
 		if (newp == NULL) return NULL;
 #else
@@ -336,14 +343,18 @@ void *MEM_guarded_dupallocN(const void *vmemh)
 			MemHead *nmemh;
 			char *name = malloc(strlen(memh->name) + 24);
 
-			if (memh->mmap) {
+			if (UNLIKELY(memh->mmap)) {
 				sprintf(name, "%s %s", "dupli_mapalloc", memh->name);
 				newp = MEM_guarded_mapallocN(memh->len, name);
 			}
-			else {
+			else if (LIKELY(memh->alignment == 0)) {
 				sprintf(name, "%s %s", "dupli_alloc", memh->name);
 				newp = MEM_guarded_mallocN(memh->len, name);
 			}
+			else {
+				sprintf(name, "%s %s", "dupli_alloc", memh->name);
+				newp = MEM_guarded_mallocN_aligned(memh->len, (size_t) memh->alignment, name);
+			}
 
 			if (newp == NULL) return NULL;
 
@@ -368,7 +379,13 @@ void *MEM_guarded_reallocN_id(void *vmemh, size_t len, const char *str)
 		MemHead *memh = vmemh;
 		memh--;
 
-		newp = MEM_guarded_mallocN(len, memh->name);
+		if (LIKELY(memh->alignment == 0)) {
+			newp = MEM_guarded_mallocN(len, memh->name);
+		}
+		else {
+			newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name);
+		}
+
 		if (newp) {
 			if (len < memh->len) {
 				/* shrink */
@@ -397,7 +414,13 @@ void *MEM_guarded_recallocN_id(void *vmemh, size_t len, const char *str)
 		MemHead *memh = vmemh;
 		memh--;
 
-		newp = MEM_guarded_mallocN(len, memh->name);
+		if (LIKELY(memh->alignment == 0)) {
+			newp = MEM_guarded_mallocN(len, memh->name);
+		}
+		else {
+			newp = MEM_guarded_mallocN_aligned(len, (size_t) memh->alignment, memh->name);
+		}
+
 		if (newp) {
 			if (len < memh->len) {
 				/* shrink */
@@ -464,6 +487,7 @@ static void make_memhead_header(MemHead *memh, size_t len, const char *str)
 	memh->nextname = NULL;
 	memh->len = len;
 	memh->mmap = 0;
+	memh->alignment = 0;
 	memh->tag2 = MEMTAG2;
 
 #ifdef DEBUG_MEMDUPLINAME
@@ -514,6 +538,54 @@ void *MEM_guarded_mallocN(size_t len, const char *str)
 	return NULL;
 }
 
+void *MEM_guarded_mallocN_aligned(size_t len, size_t alignment, const char *str)
+{
+	MemHead *memh;
+
+	/* It's possible that MemHead's size is not properly aligned,
+	 * do extra padding to deal with this.
+	 *
+	 * We only support small alignments which fits into short in
+	 * order to save some bits in MemHead structure.
+	 */
+	short extra_padding = (short)MEMHEAD_ALIGN_PADDING(alignment);
+
+	/* Huge alignment values doesn't make sense and they
+	 * wouldn't fit into 'short' used in the MemHead.
+	 */
+	assert(alignment < 1024);
+
+	/* We only support alignment to a power of two. */
+	assert(IS_POW2(alignment));
+
+	len = SIZET_ALIGN_4(len);
+
+	memh = (MemHead *)aligned_malloc(len + (size_t)extra_padding + sizeof(MemHead) + sizeof(MemTail), alignment);
+
+	if (LIKELY(memh)) {
+		/* We keep padding in the beginning of MemHead,
+		 * this way it's always possible to get MemHead
+		 * from the data pointer.
+		 */
+		memh = (MemHead *)((char *)memh + extra_padding);
+
+		make_memhead_header(memh, len, str);
+		memh->alignment = (short) alignment;
+		if (UNLIKELY(malloc_debug_memset && len))
+			memset(memh + 1, 255, len);
+
+#ifdef DEBUG_MEMCOUNTER
+		if (_mallocn_count == DEBUG_MEMCOUNTER_ERROR_VAL)
+			memcount_raise(__func__);
+		memh->_count = _mallocn_count++;
+#endif
+		return (++memh);
+	}
+	print_error("aligned_malloc returns null: len=" SIZET_FORMAT " in %s, total %u\n",
+	            SIZET_ARG(len), str, (unsigned int) mem_in_use);
+	return NULL;
+}
+
 void *MEM_guarded_callocN(size_t len, const char *str)
 {
 	MemHead *memh;
@@ -953,7 +1025,12 @@ static void rem_memblock(MemHead *memh)
 	else {
 		if (UNLIKELY(malloc_debug_memset && memh->len))
 			memset(memh + 1, 255, memh->len);
-		free(memh);
+		if (LIKELY(memh->alignment == 0)) {
+			free(memh);
+		}
+		else {
+			aligned_free(MEMHEAD_REAL_PTR(memh));
+		}
 	}
 }
 
diff --git a/intern/guardedalloc/intern/mallocn_intern.h b/intern/guardedalloc/intern/mallocn_intern.h
index 7c8922d..a69bcf3 100644
--- a/intern/guardedalloc/intern/mallocn_intern.h
+++ b/intern/guardedalloc/intern/mallocn_intern.h
@@ -85,6 +85,35 @@
 #  define UNLIKELY(x)     (x)
 #endif
 
+#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__NetBSD__)
+// Needed for memalign on Linux and _aligned_alloc on Windows.
+#  ifdef FREE_WINDOWS
+/* make sure _aligned_malloc is included */
+#    ifdef __MSVCRT_VERSION__
+#      undef __MSVCRT_VERSION__
+#    endif
+
+#    define __MSVCRT_VERSION__ 0x0700
+#  endif  // FREE_WINDOWS
+
+#  include <malloc.h>
+#else
+// Apple's malloc is 16-byte aligned, and does not have malloc.h, so include
+// stdilb instead.
+#  include <cstdlib>
+#endif
+
+#define IS_POW2(a) (((a) & ((a) - 1)) == 0)
+
+/* Extra padding which needs to be applied on MemHead to make it aligned. */
+#define MEMHEAD_ALIGN_PADDING(alignment) ((size_t)alignment - (sizeof(MemHeadAligned) % (size_t)alignment))
+
+/* Real pointer returned by the malloc or aligned_alloc. */
+#define MEMHEAD_REAL_PTR(memh) ((char *)memh - MEMHEAD_ALIGN_PADDING(memh->alignment))
+
+void *aligned_malloc(size_t size, size_t alignment);
+void aligned_free(void *ptr);
+
 /* Prototypes for counted allocator functions */
 size_t MEM_lockfree_allocN_len(const void *vmemh) ATTR_WARN_UNUSED_RESULT;
 void MEM_lockfree_freeN(void *vmemh);
@@ -93,6 +122,7 @@ void *MEM_lockfree_reallocN_id(void *vmemh, size_t len, cons

@@ Diff output truncated at 10240 characters. @@




More information about the Bf-blender-cvs mailing list