[Bf-blender-cvs] [7fa6f72084b] master: Cycles: Add sample-based runtime profiler that measures time spent in various parts of the CPU kernel

Lukas Stockner noreply at git.blender.org
Thu Nov 29 02:49:41 CET 2018


Commit: 7fa6f72084b1364cddfbef4f06bbb244210d6967
Author: Lukas Stockner
Date:   Thu Nov 29 02:06:30 2018 +0100
Branches: master
https://developer.blender.org/rB7fa6f72084b1364cddfbef4f06bbb244210d6967

Cycles: Add sample-based runtime profiler that measures time spent in various parts of the CPU kernel

This commit adds a sample-based profiler that runs during CPU rendering and collects statistics on time spent in different parts of the kernel (ray intersection, shader evaluation, etc.) as well as time spent per material and object.

The results are not yet exposed in the user interface or through the Python API. To see the stats on the console, pass the "--cycles-print-stats" argument to Cycles (e.g. "./blender -- --cycles-print-stats").

Unfortunately, there is no clear way to extend this functionality to CUDA or OpenCL, so it is CPU-only for now.

Reviewers: brecht, sergey, swerner

Reviewed By: brecht, swerner

Differential Revision: https://developer.blender.org/D3892

===================================================================

M	intern/cycles/blender/blender_session.cpp
M	intern/cycles/blender/blender_sync.cpp
M	intern/cycles/device/device.cpp
M	intern/cycles/device/device.h
M	intern/cycles/device/device_cpu.cpp
M	intern/cycles/device/device_denoising.cpp
M	intern/cycles/device/device_denoising.h
M	intern/cycles/kernel/CMakeLists.txt
M	intern/cycles/kernel/bvh/bvh.h
M	intern/cycles/kernel/kernel_globals.h
M	intern/cycles/kernel/kernel_passes.h
M	intern/cycles/kernel/kernel_path.h
M	intern/cycles/kernel/kernel_path_subsurface.h
M	intern/cycles/kernel/kernel_path_surface.h
A	intern/cycles/kernel/kernel_profiling.h
M	intern/cycles/kernel/kernel_shader.h
M	intern/cycles/render/object.cpp
M	intern/cycles/render/object.h
M	intern/cycles/render/session.cpp
M	intern/cycles/render/session.h
M	intern/cycles/render/stats.cpp
M	intern/cycles/render/stats.h
M	intern/cycles/util/CMakeLists.txt
A	intern/cycles/util/util_profiling.cpp
A	intern/cycles/util/util_profiling.h
M	intern/cycles/util/util_stats.h

===================================================================

diff --git a/intern/cycles/blender/blender_session.cpp b/intern/cycles/blender/blender_session.cpp
index 75c7dcee05e..30ae0bc813d 100644
--- a/intern/cycles/blender/blender_session.cpp
+++ b/intern/cycles/blender/blender_session.cpp
@@ -516,7 +516,7 @@ void BlenderSession::render()
 
 		if(!b_engine.is_preview() && background && print_render_stats) {
 			RenderStats stats;
-			session->scene->collect_statistics(&stats);
+			session->collect_statistics(&stats);
 			printf("Render statistics:\n%s\n", stats.full_report().c_str());
 		}
 
diff --git a/intern/cycles/blender/blender_sync.cpp b/intern/cycles/blender/blender_sync.cpp
index 832847c179f..49b046d0a88 100644
--- a/intern/cycles/blender/blender_sync.cpp
+++ b/intern/cycles/blender/blender_sync.cpp
@@ -874,7 +874,8 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 	}
 
 	/* tiles */
-	if(params.device.type != DEVICE_CPU && !background) {
+	const bool is_cpu = (params.device.type == DEVICE_CPU);
+	if(!is_cpu && !background) {
 		/* currently GPU could be much slower than CPU when using tiles,
 		 * still need to be investigated, but meanwhile make it possible
 		 * to work in viewport smoothly
@@ -960,6 +961,9 @@ SessionParams BlenderSync::get_session_params(BL::RenderEngine& b_engine,
 		params.progressive_update_timeout = 0.1;
 	}
 
+	params.use_profiling = params.device.has_profiling && !b_engine.is_preview() &&
+	                       background && BlenderSession::print_render_stats;
+
 	return params;
 }
 
diff --git a/intern/cycles/device/device.cpp b/intern/cycles/device/device.cpp
index 7e20bb449c3..54ffd4bc4df 100644
--- a/intern/cycles/device/device.cpp
+++ b/intern/cycles/device/device.cpp
@@ -362,6 +362,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
 	info.has_half_images = true;
 	info.has_volume_decoupled = true;
 	info.has_osl = true;
+	info.has_profiling = true;
 
 	foreach(const DeviceInfo &device, subdevices) {
 		/* Ensure CPU device does not slow down GPU. */
@@ -396,6 +397,7 @@ DeviceInfo Device::get_multi_device(const vector<DeviceInfo>& subdevices, int th
 		info.has_half_images &= device.has_half_images;
 		info.has_volume_decoupled &= device.has_volume_decoupled;
 		info.has_osl &= device.has_osl;
+		info.has_profiling &= device.has_profiling;
 	}
 
 	return info;
diff --git a/intern/cycles/device/device.h b/intern/cycles/device/device.h
index f3fb338e638..071f61a7566 100644
--- a/intern/cycles/device/device.h
+++ b/intern/cycles/device/device.h
@@ -60,6 +60,7 @@ public:
 	bool has_volume_decoupled;      /* Decoupled volume shading. */
 	bool has_osl;                   /* Support Open Shading Language. */
 	bool use_split_kernel;          /* Use split or mega kernel. */
+	bool has_profiling;             /* Supports runtime collection of profiling info. */
 	int cpu_threads;
 	vector<DeviceInfo> multi_devices;
 
@@ -75,6 +76,7 @@ public:
 		has_volume_decoupled = false;
 		has_osl = false;
 		use_split_kernel = false;
+		has_profiling = false;
 	}
 
 	bool operator==(const DeviceInfo &info) {
diff --git a/intern/cycles/device/device_cpu.cpp b/intern/cycles/device/device_cpu.cpp
index 36fe9bfc92b..f0a6fd6e3f4 100644
--- a/intern/cycles/device/device_cpu.cpp
+++ b/intern/cycles/device/device_cpu.cpp
@@ -477,6 +477,8 @@ public:
 	bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
 	                               DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_NON_LOCAL_MEANS);
+
 		int4 rect = task->rect;
 		int   r   = task->nlm_state.r;
 		int   f   = task->nlm_state.f;
@@ -529,6 +531,8 @@ public:
 
 	bool denoising_construct_transform(DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_CONSTRUCT_TRANSFORM);
+
 		for(int y = 0; y < task->filter_area.w; y++) {
 			for(int x = 0; x < task->filter_area.z; x++) {
 				filter_construct_transform_kernel()((float*) task->buffer.mem.device_pointer,
@@ -551,6 +555,8 @@ public:
 	                           device_ptr output_ptr,
 	                           DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_RECONSTRUCT);
+
 		mem_zero(task->storage.XtWX);
 		mem_zero(task->storage.XtWY);
 
@@ -609,8 +615,10 @@ public:
 
 	bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
 	                              device_ptr mean_ptr, device_ptr variance_ptr,
-	                              int r, int4 rect, DenoisingTask * /*task*/)
+	                              int r, int4 rect, DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_COMBINE_HALVES);
+
 		for(int y = rect.y; y < rect.w; y++) {
 			for(int x = rect.x; x < rect.z; x++) {
 				filter_combine_halves_kernel()(x, y,
@@ -629,6 +637,8 @@ public:
 	                             device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
 	                             device_ptr buffer_variance_ptr, DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DIVIDE_SHADOW);
+
 		for(int y = task->rect.y; y < task->rect.w; y++) {
 			for(int x = task->rect.x; x < task->rect.z; x++) {
 				filter_divide_shadow_kernel()(task->render_buffer.samples,
@@ -653,6 +663,8 @@ public:
 	                           device_ptr variance_ptr,
 	                           DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_GET_FEATURE);
+
 		for(int y = task->rect.y; y < task->rect.w; y++) {
 			for(int x = task->rect.x; x < task->rect.z; x++) {
 				filter_get_feature_kernel()(task->render_buffer.samples,
@@ -676,6 +688,8 @@ public:
 	                               device_ptr output_ptr,
 	                               DenoisingTask *task)
 	{
+		ProfilingHelper profiling(task->profiler, PROFILING_DENOISING_DETECT_OUTLIERS);
+
 		for(int y = task->rect.y; y < task->rect.w; y++) {
 			for(int x = task->rect.x; x < task->rect.z; x++) {
 				filter_detect_outliers_kernel()(x, y,
@@ -735,6 +749,8 @@ public:
 
 	void denoise(DenoisingTask& denoising, RenderTile &tile)
 	{
+		ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
+
 		tile.sample = tile.start_sample + tile.num_samples;
 
 		denoising.functions.construct_transform = function_bind(&CPUDevice::denoising_construct_transform, this, &denoising);
@@ -765,6 +781,8 @@ public:
 
 		KernelGlobals *kg = new ((void*) kgbuffer.device_pointer) KernelGlobals(thread_kernel_globals_init());
 
+		stats.profiler.add_state(&kg->profiler);
+
 		CPUSplitKernel *split_kernel = NULL;
 		if(use_split_kernel) {
 			split_kernel = new CPUSplitKernel(this);
@@ -778,6 +796,7 @@ public:
 
 		RenderTile tile;
 		DenoisingTask denoising(this, task);
+		denoising.profiler = &kg->profiler;
 
 		while(task.acquire_tile(this, tile)) {
 			if(tile.task == RenderTile::PATH_TRACE) {
@@ -802,6 +821,8 @@ public:
 			}
 		}
 
+		stats.profiler.remove_state(&kg->profiler);
+
 		thread_kernel_globals_free((KernelGlobals*)kgbuffer.device_pointer);
 		kg->~KernelGlobals();
 		kgbuffer.free();
@@ -1061,6 +1082,7 @@ void device_cpu_info(vector<DeviceInfo>& devices)
 	info.has_volume_decoupled = true;
 	info.has_osl = true;
 	info.has_half_images = true;
+	info.has_profiling = true;
 
 	devices.insert(devices.begin(), info);
 }
diff --git a/intern/cycles/device/device_denoising.cpp b/intern/cycles/device/device_denoising.cpp
index 78c65a3d22d..433cbd3c265 100644
--- a/intern/cycles/device/device_denoising.cpp
+++ b/intern/cycles/device/device_denoising.cpp
@@ -22,6 +22,7 @@ CCL_NAMESPACE_BEGIN
 
 DenoisingTask::DenoisingTask(Device *device, const DeviceTask &task)
 : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
+  profiler(NULL),
   storage(device),
   buffer(device),
   device(device)
diff --git a/intern/cycles/device/device_denoising.h b/intern/cycles/device/device_denoising.h
index 8e0666d0e59..beae60c220f 100644
--- a/intern/cycles/device/device_denoising.h
+++ b/intern/cycles/device/device_denoising.h
@@ -23,6 +23,8 @@
 
 #include "kernel/filter/filter_defines.h"
 
+#include "util/util_profiling.h"
+
 CCL_NAMESPACE_BEGIN
 
 class DenoisingTask {
@@ -51,6 +53,8 @@ public:
 	TileInfo *tile_info;
 	device_vector<int> tile_info_mem;
 
+	ProfilingState *profiler;
+
 	int4 rect;
 	int4 filter_area;
 
diff --git a/intern/cycles/kernel/CMakeLists.txt b/intern/cycles/kernel/CMakeLists.txt
index 92cb66bdec9..d4145225b77 100644
--- a/intern/cycles/kernel/CMakeLists.txt
+++ b/intern/cycles/kernel/CMakeLists.txt
@@ -110,6 +110,7 @@ set(SRC_HEADERS
 	kernel_path_surface.h
 	kernel_path_subsurface.h
 	kernel_path_volume.h
+	kernel_profiling.h
 	kernel_projection.h
 	kernel_queues.h
 	kernel_random.h
diff --git a/intern/cycles/kernel/bvh/bvh.h b/intern/cycles/kernel/bvh/bvh.h
index 6708a3efac1..284b1e9208c 100644
--- a/intern/cycles/kernel/bvh/bvh.h
+++ b/intern/cycles/kernel/bvh/bvh.h
@@ -186,6 +186,8 @@ ccl_device_intersect bool scene_intersect(KernelGlobals *kg,
                                           float difl,
                                           float extmax)
 {
+	PROFILING_INIT(kg, PROFILING_INTERSECT);
+
 	if(!scene_intersect_valid(&ray)) {
 		return false;
 	}
@@ -248,6 +250,8 @@ ccl_device_intersect bool scene_intersect_local(KernelGlobals *kg,
                                                 uint *lcg_state,
                                                 int max_hits)
 {
+	PROFILING_INIT(kg, PROFILING_INTERSECT_LOCAL);
+
 	if(!scene_intersect_valid(&ray)) {
 		return false;
 	}
@@ -327,6 +331,8 @@ ccl_device_intersect bool scene_intersect_shadow_all(KernelGlobals *kg,
                                                      uint max_hits,
                                                      uint *num_hits)
 {
+	PROFILING_INIT(kg, PROFILING_INTERSECT_SHADOW_ALL);
+
 	if(!scene_intersect_valid(ray)) {
 		return false;
 	}
@@ -407,6 +413,8 @@ ccl_device_intersect bool scene_intersect_volume(KernelGlobals *kg,
                                                  Intersection *isect,
                                                  const uint visibility)
 {
+	PROFILING_INIT(kg, PROF

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list