[Bf-blender-cvs] [04c0e7b] soc-2016-cycles_denoising: Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising

Tue Dec 20 16:06:50 CET 2016

Commit: 04c0e7b2d71a3925f1019c365342c1ea08904be1
Author: Lukas Stockner
Date:   Tue Dec 6 21:13:06 2016 +0100
Branches: soc-2016-cycles_denoising
https://developer.blender.org/rB04c0e7b2d71a3925f1019c365342c1ea08904be1

Merge remote-tracking branch 'origin/master' into soc-2016-cycles_denoising

Conflicts:
	intern/cycles/device/device_cpu.cpp
	intern/cycles/device/device_cuda.cpp
	intern/cycles/render/tile.cpp
	intern/cycles/render/tile.h

===================================================================



===================================================================

diff --cc intern/cycles/app/cycles_standalone.cpp
index 95d0120,9816d61..0987d4b

--- a/intern/cycles/app/cycles_standalone.cpp
+++ b/intern/cycles/app/cycles_standalone.cpp
@@@ -63,10 -70,8 +63,8 @@@ static void session_print(const string
  	fflush(stdout);
  }
  
 -static void session_print_status()
 +void session_print_status()
  {
- 	int sample, tile;
- 	double total_time, sample_time, render_time;
  	string status, substatus;
  
  	/* get status */
diff --cc intern/cycles/device/device_cpu.cpp
index 0f5ac8e,c8e001e..4e713e8
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@@ -276,361 -213,74 +281,362 @@@ public
  		}
  	};
  
 -	void thread_path_trace(DeviceTask& task)
 +	float* denoise_fill_buffer(KernelGlobals *kg, int sample, int4 rect, float** buffers, int* tile_x, int* tile_y, int *offsets, int *strides, int frames, int *frame_strides)
  	{
 -		if(task_pool.canceled()) {
 -			if(task.need_finish_queue == false)
 -				return;
 -		}
 +		bool cross_denoise = kg->__data.film.denoise_cross;
 +		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
 +		int pass_stride = w*h*frames;
 +		int passes = cross_denoise? 28:22;
 +		float *filter_buffers = new float[passes*pass_stride];
 +		memset(filter_buffers, 0, sizeof(float)*passes*pass_stride);
 +
 +
 +		for(int frame = 0; frame < frames; frame++) {
 +			float *filter_buffer = filter_buffers + w*h*frame;
 +			float *buffer[9];
 +			for(int i = 0; i < 9; i++) {
 +				buffer[i] = buffers[i] + frame_strides[i]*frame;
 +			}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +			DenoiseDebug debug((rect.z - rect.x), h, 34);
 +#endif
 +			/* ==== Step 1: Prefilter general features. ==== */
 +			{
  
 -		KernelGlobals kg = thread_kernel_globals_init();
 -		RenderTile tile;
 +				float *unfiltered = filter_buffer + 16*pass_stride;
 +				/* Order in render buffers:
 +				 *   Normal[X, Y, Z] NormalVar[X, Y, Z] Albedo[R, G, B] AlbedoVar[R, G, B ] Depth DepthVar
 +				 *          0  1  2            3  4  5         6  7  8            9  10 11  12    13
 +				 *
 +				 * Order in denoise buffer:
 +				 *   Normal[X, XVar, Y, YVar, Z, ZVar] Depth DepthVar Shadow ShadowVar Albedo[R, RVar, G, GVar, B, BVar] Color[R, RVar, G, GVar, B, BVar]
 +				 *          0  1     2  3     4  5     6     7        8      9                10 11    12 13    14 15          16 17    18 19    20 21
 +				 *
 +				 * Order of processing: |NormalXYZ|AlbedoRGB|Depth|
 +				 *                      |         |         |     | */
 +				int mean_from[]      = { 0, 1, 2,   6,    7,  8, 12 };
 +				int variance_from[]  = { 3, 4, 5,   9,   10, 11, 13 };
 +				int offset_to[]      = { 0, 2, 4,  10,   12, 14,  6 };
 +				for(int i = 0; i < 7; i++) {
 +					for(int y = rect.y; y < rect.w; y++) {
 +						for(int x = rect.x; x < rect.z; x++) {
 +							filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
 +						}
 +					}
 +					for(int y = rect.y; y < rect.w; y++) {
 +						for(int x = rect.x; x < rect.z; x++) {
 +							filter_non_local_means_kernel()(x, y, unfiltered, unfiltered, filter_buffer + (offset_to[i]+1)*pass_stride, filter_buffer + offset_to[i]*pass_stride, &rect.x, 2, 2, 1, 0.25f);
 +						}
 +					}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("f%d_%s", i, name), var, 1, w);
 +					WRITE_DEBUG("unfiltered", unfiltered);
 +					WRITE_DEBUG("sampleV", filter_buffer + (offset_to[i]+1)*pass_stride);
 +					WRITE_DEBUG("filtered", filter_buffer + offset_to[i]*pass_stride);
 +#undef WRITE_DEBUG
 +#endif
 +				}
 +			}
  
 -		void(*path_trace_kernel)(KernelGlobals*, float*, unsigned int*, int, int, int, int, int);
  
 -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
 -		if(system_cpu_support_avx2()) {
 -			path_trace_kernel = kernel_cpu_avx2_path_trace;
 -		}
 -		else
 +
 +			/* ==== Step 2: Prefilter shadow feature. ==== */
 +			{
 +				/* Reuse some passes of the filter_buffer for temporary storage. */
 +				float *sampleV = filter_buffer + 16*pass_stride, *sampleVV = filter_buffer + 17*pass_stride, *bufferV = filter_buffer + 18*pass_stride, *cleanV = filter_buffer + 19*pass_stride;
 +				float *unfiltered = filter_buffer + 20*pass_stride;
 +
 +				/* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the sample variance and the buffer variance. */
 +				for(int y = rect.y; y < rect.w; y++) {
 +					for(int x = rect.x; x < rect.z; x++) {
 +						filter_divide_shadow_kernel()(kg, sample, buffer, x, y, tile_x, tile_y, offsets, strides, unfiltered, sampleV, sampleVV, bufferV, &rect.x);
 +					}
 +				}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +#define WRITE_DEBUG(name, var) debug.add_pass(string_printf("shadow_%s", name), var, 1, w);
 +				WRITE_DEBUG("unfilteredA", unfiltered);
 +				WRITE_DEBUG("unfilteredB", unfiltered + pass_stride);
 +				WRITE_DEBUG("bufferV", bufferV);
 +				WRITE_DEBUG("sampleV", sampleV);
 +				WRITE_DEBUG("sampleVV", sampleVV);
  #endif
 -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
 -		if(system_cpu_support_avx()) {
 -			path_trace_kernel = kernel_cpu_avx_path_trace;
 -		}
 -		else
 +
 +				/* Smooth the (generally pretty noisy) buffer variance using the spatial information from the sample variance. */
 +				for(int y = rect.y; y < rect.w; y++) {
 +					for(int x = rect.x; x < rect.z; x++) {
 +						filter_non_local_means_kernel()(x, y, bufferV, sampleV, sampleVV, cleanV, &rect.x, 6, 3, 4, 1.0f);
 +					}
 +				}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +			WRITE_DEBUG("cleanV", cleanV);
  #endif
 -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
 -		if(system_cpu_support_sse41()) {
 -			path_trace_kernel = kernel_cpu_sse41_path_trace;
 -		}
 -		else
 +
 +				/* Use the smoothed variance to filter the two shadow half images using each other for weight calculation. */
 +				for(int y = rect.y; y < rect.w; y++) {
 +					for(int x = rect.x; x < rect.z; x++) {
 +						filter_non_local_means_kernel()(x, y, unfiltered, unfiltered + pass_stride, cleanV, sampleV, &rect.x, 5, 3, 1, 0.25f);
 +						filter_non_local_means_kernel()(x, y, unfiltered + pass_stride, unfiltered, cleanV, bufferV, &rect.x, 5, 3, 1, 0.25f);
 +					}
 +				}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +				WRITE_DEBUG("filteredA", sampleV);
 +				WRITE_DEBUG("filteredB", bufferV);
  #endif
 -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
 -		if(system_cpu_support_sse3()) {
 -			path_trace_kernel = kernel_cpu_sse3_path_trace;
 -		}
 -		else
 +
 +				/* Estimate the residual variance between the two filtered halves. */
 +				for(int y = rect.y; y < rect.w; y++) {
 +					for(int x = rect.x; x < rect.z; x++) {
 +						filter_combine_halves_kernel()(x, y, NULL, sampleVV, sampleV, bufferV, &rect.x, 2);
 +					}
 +				}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +				WRITE_DEBUG("residualV", sampleVV);
  #endif
 -#ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
 -		if(system_cpu_support_sse2()) {
 -			path_trace_kernel = kernel_cpu_sse2_path_trace;
 +
 +				/* Use the residual variance for a second filter pass. */
 +				for(int y = rect.y; y < rect.w; y++) {
 +					for(int x = rect.x; x < rect.z; x++) {
 +						filter_non_local_means_kernel()(x, y, sampleV, bufferV, sampleVV, unfiltered              , &rect.x, 4, 2, 1, 0.5f);
 +						filter_non_local_means_kernel()(x, y, bufferV, sampleV, sampleVV, unfiltered + pass_stride, &rect.x, 4, 2, 1, 0.5f);
 +					}
 +				}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +				WRITE_DEBUG("finalA", unfiltered);
 +				WRITE_DEBUG("finalB", unfiltered + pass_stride);
 +#endif
 +
 +				/* Combine the two double-filtered halves to a final shadow feature image and associated variance. */
 +				for(int y = rect.y; y < rect.w; y++) {
 +					for(int x = rect.x; x < rect.z; x++) {
 +						filter_combine_halves_kernel()(x, y, filter_buffer + 8*pass_stride, filter_buffer + 9*pass_stride, unfiltered, unfiltered + pass_stride, &rect.x, 0);
 +					}
 +				}
 +#ifdef WITH_CYCLES_DEBUG_FILTER
 +				WRITE_DEBUG("final", filter_buffer + 8*pass_stride);
 +				WRITE_DEBUG("finalV", filter_buffer + 9 * pass_stride);
 +				debug.write(string_printf("debugf_%dx%d.exr", tile_x[1], tile_y[1]));
 +#undef WRITE_DEBUG
 +#endif
 +			}
 +
 +
 +
 +			/* ==== Step 3: Copy combined color pass. ==== */
 +			{
 +				if(cross_denoise) {
 +					int mean_from[]      = {20, 21, 22, 26, 27, 28};
 +					int variance_from[]  = {23, 24, 25, 29, 30, 31};
 +					int offset_to[]      = {16, 18, 20, 22, 24, 26};
 +					for(int i = 0; i < 6; i++) {
 +						for(int y = rect.y; y < rect.w; y++) {
 +							for(int x = rect.x; x < rect.z; x++) {
 +								filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, filter_buffer + offset_to[i]*pass_stride, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
 +							}
 +						}
 +					}
 +				}
 +				else {
 +					int mean_from[]      = {20, 21, 22};
 +					int variance_from[]  = {23, 24, 25};
 +					int offset_to[]      = {16, 18, 20};
 +					for(int i = 0; i < 3; i++) {
 +						for(int y = rect.y; y < rect.w; y++) {
 +							for(int x = rect.x; x < rect.z; x++) {
 +								filter_get_feature_kernel()(kg, sample, buffer, mean_from[i], variance_from[i], x, y, tile_x, tile_y, offsets, strides, filter_buffer + offset_to[i]*pass_stride, filter_buffer + (offset_to[i]+1)*pass_stride, &rect.x);
 +							}
 +						}
 +					}
 +				}
 +			}
  		}
 -		else
 +
 +		return filter_buffers;
 +	}
 +
 +	void denoise_run(KernelGlobals *kg, int sample, float *filter_buffer, int4 filter_area, int4 rect, int offset, int stride, float *buffers)
 +	{
 +		bool only_nlm_filter = getenv("ONLY_NLM_FILTER");
 +		bool use_gradients = kg->__data.integrator.use_gradients;
 +		bool nlm_weights = kg->__data.integrator.use_nlm_weights;
 +
 +		int hw = kg->__data.integrator.half_window;
 +		FilterStorage *storage = new FilterStorage[filter_area.z*filter_area.w];
 +		float *weight_cache = new float[(2*hw+1)*(2*hw+1)];
 +
 +		int w = align_up(rect.z - rect.x, 4), h = (rect.w - rect.y);
 +		int pass_stride = w*h;
 +
 +		if(only_nlm_filter) {
 +			float *img[3] = {filter_buffer + 16*pass_stride, filter_buffer + 18*pa

@@ Diff output truncated at 10240 characters. @@