[Bf-blender-cvs] [2f12b83dbdb] gsoc-2018-many-light-sampling: Cycles Denoising: Don't use atomics in the accumulation kernel on CPUs

Fri Jun 1 16:17:30 CEST 2018

Commit: 2f12b83dbdbcd17b7964cde5f552518ce554ec9e
Author: Lukas Stockner
Date:   Sun Apr 1 02:10:27 2018 +0200
Branches: gsoc-2018-many-light-sampling
https://developer.blender.org/rB2f12b83dbdbcd17b7964cde5f552518ce554ec9e

Cycles Denoising: Don't use atomics in the accumulation kernel on CPUs

The GPU kernel needs to use atomics for accumulation since all offsets are processed in
parallel, but on CPUs that's not the case, so we can disable them there for a considerable speedup.

===================================================================

M	intern/cycles/kernel/filter/filter_reconstruction.h
M	intern/cycles/util/util_math_matrix.h

===================================================================

diff --git a/intern/cycles/kernel/filter/filter_reconstruction.h b/intern/cycles/kernel/filter/filter_reconstruction.h
index b7bf322f9ce..58740d5b06a 100644
--- a/intern/cycles/kernel/filter/filter_reconstruction.h
+++ b/intern/cycles/kernel/filter/filter_reconstruction.h
@@ -61,8 +61,13 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
 	                                make_int2(x+dx, y+dy), buffer + q_offset,
 	                                pass_stride, *rank, design_row, transform, stride);
 
+#ifdef __KERNEL_GPU__
 	math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
 	math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
+#else
+	math_trimatrix_add_gramian(XtWX, (*rank)+1, design_row, weight);
+	math_vec3_add(XtWY, (*rank)+1, design_row, weight * q_color);
+#endif
 }
 
 ccl_device_inline void kernel_filter_finalize(int x, int y,
diff --git a/intern/cycles/util/util_math_matrix.h b/intern/cycles/util/util_math_matrix.h
index 382dad64ea5..9ffcb9659b2 100644
--- a/intern/cycles/util/util_math_matrix.h
+++ b/intern/cycles/util/util_math_matrix.h
@@ -144,6 +144,18 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
 	}
 }
 
+ccl_device_inline void math_trimatrix_add_gramian(ccl_global float *A,  /* Accumulates weight * v[row] * v[col] into A for every pair with col <= row < n. */
+                                                  int n,
+                                                  const float *ccl_restrict v,
+                                                  float weight)
+{
+	for(int row = 0; row < n; row++) {
+		for(int col = 0; col <= row; col++) {  /* Only (row, col) pairs with col <= row are visited. */
+			MATHS(A, row, col, 1) += v[row]*v[col]*weight;  /* Plain, non-atomic +=; last MATHS argument is fixed at 1 (presumably the stride - confirm against the macro). */
+		}
+	}
+}
+
 /* Transpose matrix A inplace. */
 ccl_device_inline void math_matrix_transpose(ccl_global float *A, int n, int stride)
 {