[Bf-blender-cvs] [c5eba46d7f4] blender2.8: Gawain: Refactor: VAOs caching AND use new VAOs manager.

Clément Foucault noreply at git.blender.org
Wed Feb 21 15:28:41 CET 2018


Commit: c5eba46d7f4ddfcdf372a3f4968e4d170ee0a002
Author: Clément Foucault
Date:   Tue Feb 20 01:55:19 2018 +0100
Branches: blender2.8
https://developer.blender.org/rBc5eba46d7f4ddfcdf372a3f4968e4d170ee0a002

Gawain: Refactor: VAOs caching AND use new VAOs manager.

A major bottleneck of the current implementation is the call to create_bindings() for basically every drawcall.
This is due to the VAO being tagged dirty whenever a new shader is assigned to the Batch, which defeats the purpose of the Batch (reusing it for drawing).

Since managing hundreds of batches in the DrawManager and DrawCache did not seem like much fun to me, I preferred rewriting the batches themselves.

--- Batch changes ---
For this to happen I needed to make instancing part of the Batch itself, rather than another batch supplied at draw time.
The Gwn_VertBufs are copied from the batch being instanced, and a new Gwn_VertBuf is supplied for the instancing attribs.
This means a VAO can be generated and cached for this instancing case.

A Batch can also be rendered with instancing, without instancing attribs and without the need for a new VAO, by using GWN_batch_draw_range_ex with the force_instance parameter set to true.
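
For illustration, here is a hypothetical caller-side sketch of the attrib-driven case (not part of this commit). GWN_batch_instbuf_set, GWN_batch_program_set and GWN_batch_draw_range_ex come from the header changes below; the wrapper function and the assumption that v_first/v_count address instances once an instance buffer is attached are illustrative.

/* Hypothetical usage sketch, assuming v_first/v_count refer to instances
 * once an instance buffer is attached. Only the GWN_* calls appear in this
 * commit; the wrapper itself is illustrative. */
void draw_batch_instanced(Gwn_Batch* batch, Gwn_VertBuf* inst_attribs,
                          GLuint program, const Gwn_ShaderInterface* iface,
                          int instance_ct)
	{
	/* The batch takes ownership of the instancing attribs buffer. */
	GWN_batch_instbuf_set(batch, inst_attribs, true);

	/* With the new cache, a VAO is looked up (or created) per shader
	 * interface instead of being rebuilt on every drawcall. */
	GWN_batch_program_set(batch, program, iface);

	/* GWN_batch_draw_range_ex does not bind the shader or call
	 * gpuBindMatrices(); the caller is expected to do that. */
	GWN_batch_draw_range_ex(batch, 0, instance_ct, false);
	}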

--- Draw manager changes ---
The downside of this approach is that we must track the validity of the instanced batch (the original one). The only way I could think of is to set a callback for when the batch gets freed.
This means a bit of refactoring in the DrawManager, separating batching and instancing Batches.
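
As a rough sketch of the mechanism (the real DrawManager code differs), a cache entry keyed on the original batch can register a free callback through the new GWN_batch_callback_free_set; the InstanceEntry struct and instance_batch_free_cb below are illustrative names only.

/* Illustrative only: invalidate the cached instancing batch when the
 * original batch it was built from gets freed. */
typedef struct InstanceEntry {
	Gwn_Batch* instance_batch; /* duplicated batch carrying the instancing attribs */
} InstanceEntry;

static void instance_batch_free_cb(Gwn_Batch* batch_src, void* user_data)
	{
	InstanceEntry* entry = (InstanceEntry*)user_data;
	(void)batch_src; /* the original batch, currently being freed */
	GWN_batch_discard(entry->instance_batch);
	entry->instance_batch = NULL;
	}

/* Registered once, when the cache entry is created:
 *   GWN_batch_callback_free_set(original_batch, instance_batch_free_cb, entry); */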

--- VAO cache ---
Each VAO is generated for a given ShaderInterface. This means we can keep it alive as long as the shader interface lives.
If a ShaderInterface is discarded, it needs to destroy every VAO associated with it. Otherwise a new ShaderInterface could be allocated at the same address and reuse a VAO with incorrect bindings.
The VAO cache itself uses a mix of a static array of VAOs and a dynamic array, used when there is not enough space in the static one.
This hybrid approach is a bit more performant than a dynamic array alone.
The array never shrinks, but empty entries are reused, so it is unlikely to grow excessively. Shrinking could be done on the next allocation if needed.
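
A minimal sketch of the lookup half of this hybrid cache, based on the fields added to Gwn_Batch in gwn_batch.h below (the real code in gwn_batch.c also inserts into a free slot and switches to the dynamic array when the static one is full; the function name here is illustrative):

/* Sketch: return the VAO cached for this shader interface, searching the
 * static slots first and the dynamic array once the batch has outgrown
 * GWN_BATCH_VAO_STATIC_LEN interfaces. Returns 0 if not cached yet. */
static GLuint batch_vao_lookup(const Gwn_Batch* batch, const Gwn_ShaderInterface* interface)
	{
	if (!batch->is_dynamic_vao_count)
		{
		for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
			if (batch->static_vaos.interfaces[i] == interface)
				return batch->static_vaos.vao_ids[i];
		}
	else
		{
		for (unsigned i = 0; i < batch->dynamic_vaos.count; ++i)
			if (batch->dynamic_vaos.interfaces[i] == interface)
				return batch->dynamic_vaos.vao_ids[i];
		}
	return 0; /* not cached: caller allocates a VAO and registers the batch on the interface */
	}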

--- Results ---
Using cached VAOs means we no longer query every vertex attrib of every VBO for every drawcall, on every redraw!
In a CPU limited test scene (10000 cubes in Clay engine) I get a reduction of CPU drawing time from ~20ms to 13ms.

The only area that does not cache VAOs yet is instancing from particles (see the comment in DRW_shgroup_instance_batch).

===================================================================

M	intern/gawain/CMakeLists.txt
M	intern/gawain/gawain/gwn_batch.h
M	intern/gawain/gawain/gwn_buffer_id.h
M	intern/gawain/gawain/gwn_shader_interface.h
M	intern/gawain/gawain/gwn_vertex_array_id.h
M	intern/gawain/src/gwn_batch.c
M	intern/gawain/src/gwn_buffer_id.cpp
M	intern/gawain/src/gwn_immediate.c
M	intern/gawain/src/gwn_shader_interface.c
M	intern/gawain/src/gwn_vertex_array_id.cpp
M	source/blender/draw/intern/DRW_render.h
M	source/blender/draw/intern/draw_instance_data.c
M	source/blender/draw/intern/draw_instance_data.h
M	source/blender/draw/intern/draw_manager.c
M	source/blender/draw/modes/object_mode.c

===================================================================

diff --git a/intern/gawain/CMakeLists.txt b/intern/gawain/CMakeLists.txt
index 9924daa8cd1..424b364ae8e 100644
--- a/intern/gawain/CMakeLists.txt
+++ b/intern/gawain/CMakeLists.txt
@@ -16,6 +16,7 @@ set(SRC
 	src/gwn_imm_util.c
 	src/gwn_primitive.c
 	src/gwn_shader_interface.c
+	src/gwn_vertex_array_id.cpp
 	src/gwn_vertex_buffer.c
 	src/gwn_vertex_format.c
 
@@ -30,6 +31,7 @@ set(SRC
 	gawain/gwn_primitive.h
 	gawain/gwn_primitive_private.h
 	gawain/gwn_shader_interface.h
+	gawain/gwn_vertex_array_id.h
 	gawain/gwn_vertex_buffer.h
 	gawain/gwn_vertex_format.h
 	gawain/gwn_vertex_format_private.h
diff --git a/intern/gawain/gawain/gwn_batch.h b/intern/gawain/gawain/gwn_batch.h
index 94cd893f09e..c676cfef119 100644
--- a/intern/gawain/gawain/gwn_batch.h
+++ b/intern/gawain/gawain/gwn_batch.h
@@ -23,34 +23,61 @@ typedef enum {
 } Gwn_BatchPhase;
 
 #define GWN_BATCH_VBO_MAX_LEN 3
+#define GWN_BATCH_VAO_STATIC_LEN 3
+#define GWN_BATCH_VAO_DYN_ALLOC_COUNT 16
 
 typedef struct Gwn_Batch {
 	// geometry
 	Gwn_VertBuf* verts[GWN_BATCH_VBO_MAX_LEN]; // verts[0] is required, others can be NULL
+	Gwn_VertBuf* inst; // instance attribs
 	Gwn_IndexBuf* elem; // NULL if element list not needed
-	Gwn_PrimType prim_type;
 	GLenum gl_prim_type;
 
+	// cached values (avoid dereferencing later)
+	GLuint vao_id;
+	GLuint program;
+	const struct Gwn_ShaderInterface* interface;
+
 	// book-keeping
-	GLuint vao_id; // remembers all geometry state (vertex attrib bindings & element buffer)
+	unsigned owns_flag;
+	struct Gwn_Context *context; // used to free all vaos. this implies all vaos were created under the same context.
 	Gwn_BatchPhase phase;
-	bool program_dirty;
 	bool program_in_use;
-	unsigned owns_flag;
 
-	// state
-	GLuint program;
-	const Gwn_ShaderInterface* interface;
+	// Vao management: remembers all geometry state (vertex attrib bindings & element buffer)
+	// for each shader interface. Start with a static number of vaos and fallback to dynamic count
+	// if necessary. Once a batch goes dynamic it does not go back.
+	bool is_dynamic_vao_count;
+	union {
+		// Static handle count
+		struct {
+			const struct Gwn_ShaderInterface* interfaces[GWN_BATCH_VAO_STATIC_LEN];
+			GLuint vao_ids[GWN_BATCH_VAO_STATIC_LEN];
+		} static_vaos;
+		// Dynamic handle count
+		struct {
+			unsigned count;
+			const struct Gwn_ShaderInterface** interfaces;
+			GLuint* vao_ids;
+		} dynamic_vaos;
+	};
+
+	// XXX This is the only solution if we want to have some data structure using
+	// batches as key to identify nodes. We must destroy these nodes with this callback.
+	void (*free_callback)(struct Gwn_Batch*, void*);
+	void* callback_data;
 } Gwn_Batch;
 
 enum {
 	GWN_BATCH_OWNS_VBO = (1 << 0),
 	/* each vbo index gets bit-shifted */
+	GWN_BATCH_OWNS_INSTANCES = (1 << 30),
 	GWN_BATCH_OWNS_INDEX = (1 << 31),
 };
 
 Gwn_Batch* GWN_batch_create_ex(Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
 void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, unsigned owns_flag);
+Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src);
 
 #define GWN_batch_create(prim, verts, elem) \
 	GWN_batch_create_ex(prim, verts, elem, 0)
@@ -59,11 +86,18 @@ void GWN_batch_init_ex(Gwn_Batch*, Gwn_PrimType, Gwn_VertBuf*, Gwn_IndexBuf*, un
 
 void GWN_batch_discard(Gwn_Batch*); // verts & elem are not discarded
 
+void GWN_batch_callback_free_set(Gwn_Batch*, void (*callback)(Gwn_Batch*, void*), void*);
+
+void GWN_batch_instbuf_set(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo); // Instancing
+
 int GWN_batch_vertbuf_add_ex(Gwn_Batch*, Gwn_VertBuf*, bool own_vbo);
 
 #define GWN_batch_vertbuf_add(batch, verts) \
 	GWN_batch_vertbuf_add_ex(batch, verts, false)
 
+// This is a private function
+void GWN_batch_remove_interface_ref(Gwn_Batch*, const Gwn_ShaderInterface*);
+
 void GWN_batch_program_set(Gwn_Batch*, GLuint program, const Gwn_ShaderInterface*);
 void GWN_batch_program_unset(Gwn_Batch*);
 // Entire batch draws with one shader program, but can be redrawn later with another program.
@@ -84,11 +118,14 @@ void GWN_batch_uniform_4fv(Gwn_Batch*, const char* name, const float data[4]);
 
 void GWN_batch_draw(Gwn_Batch*);
 
+// This does not bind/unbind shader and does not call gpuBindMatrices()
+void GWN_batch_draw_range_ex(Gwn_Batch*, int v_first, int v_count, bool force_instance);
 
-void GWN_batch_draw_stupid(Gwn_Batch*, int v_first, int v_count);
-void GWN_batch_draw_stupid_instanced(Gwn_Batch*, Gwn_Batch*, int instance_first, int instance_count);
-void GWN_batch_draw_procedural(Gwn_Batch*, Gwn_PrimType, int v_count);
+#define GWN_batch_draw_range(batch, first, count) \
+	GWN_batch_draw_range_ex(batch, first, count, false)
 
+// Does not even need batch
+void GWN_draw_primitive(Gwn_PrimType, int v_count);
 
 #if 0 // future plans
 
diff --git a/intern/gawain/gawain/gwn_buffer_id.h b/intern/gawain/gawain/gwn_buffer_id.h
index db5df99f526..6f51ca6905d 100644
--- a/intern/gawain/gawain/gwn_buffer_id.h
+++ b/intern/gawain/gawain/gwn_buffer_id.h
@@ -25,10 +25,6 @@ extern "C" {
 GLuint GWN_buf_id_alloc(void);
 void GWN_buf_id_free(GLuint buffer_id);
 
-GLuint GWN_vao_alloc(void);
-void GWN_vao_free(GLuint vao_id);
-
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/intern/gawain/gawain/gwn_shader_interface.h b/intern/gawain/gawain/gwn_shader_interface.h
index 345ad8d389b..3bca541d6e8 100644
--- a/intern/gawain/gawain/gwn_shader_interface.h
+++ b/intern/gawain/gawain/gwn_shader_interface.h
@@ -54,6 +54,7 @@ typedef struct Gwn_ShaderInput {
 } Gwn_ShaderInput;
 
 #define GWN_NUM_SHADERINTERFACE_BUCKETS 257
+#define GWN_SHADERINTERFACE_REF_ALLOC_COUNT 16
 
 typedef struct Gwn_ShaderInterface {
 	GLint program;
@@ -63,6 +64,8 @@ typedef struct Gwn_ShaderInterface {
 	Gwn_ShaderInput* ubo_buckets[GWN_NUM_SHADERINTERFACE_BUCKETS];
 	Gwn_ShaderInput* builtin_uniforms[GWN_NUM_UNIFORMS];
 	char* name_buffer;
+	struct Gwn_Batch** batches; // references to batches using this interface
+	unsigned batches_ct;
 } Gwn_ShaderInterface;
 
 Gwn_ShaderInterface* GWN_shaderinterface_create(GLint program_id);
@@ -72,3 +75,7 @@ const Gwn_ShaderInput* GWN_shaderinterface_uniform(const Gwn_ShaderInterface*, c
 const Gwn_ShaderInput* GWN_shaderinterface_uniform_builtin(const Gwn_ShaderInterface*, Gwn_UniformBuiltin);
 const Gwn_ShaderInput* GWN_shaderinterface_ubo(const Gwn_ShaderInterface*, const char* name);
 const Gwn_ShaderInput* GWN_shaderinterface_attr(const Gwn_ShaderInterface*, const char* name);
+
+// keep track of batches using this interface
+void GWN_shaderinterface_add_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
+void GWN_shaderinterface_remove_batch_ref(Gwn_ShaderInterface*, struct Gwn_Batch*);
diff --git a/intern/gawain/gawain/gwn_vertex_array_id.h b/intern/gawain/gawain/gwn_vertex_array_id.h
index 6d2a059b9bd..1c093d428ce 100644
--- a/intern/gawain/gawain/gwn_vertex_array_id.h
+++ b/intern/gawain/gawain/gwn_vertex_array_id.h
@@ -26,8 +26,8 @@ extern "C" {
 #include "gwn_context.h"
 
 GLuint GWN_vao_default(void);
-GLuint GWN_vao_alloc_new(void);
-void GWN_vao_free_new(GLuint vao_id, Gwn_Context*);
+GLuint GWN_vao_alloc(void);
+void GWN_vao_free(GLuint vao_id, Gwn_Context*);
 
 #ifdef __cplusplus
 }
diff --git a/intern/gawain/src/gwn_batch.c b/intern/gawain/src/gwn_batch.c
index ec3f98e348c..098c547c662 100644
--- a/intern/gawain/src/gwn_batch.c
+++ b/intern/gawain/src/gwn_batch.c
@@ -11,12 +11,48 @@
 
 #include "gwn_batch.h"
 #include "gwn_buffer_id.h"
+#include "gwn_vertex_array_id.h"
 #include "gwn_primitive_private.h"
 #include <stdlib.h>
+#include <string.h>
 
 // necessary functions from matrix API
 extern void gpuBindMatrices(const Gwn_ShaderInterface* shaderface);
-extern bool gpuMatricesDirty(void); // how best to use this here?
+
+static void batch_update_program_bindings(Gwn_Batch* batch, unsigned int v_first);
+
+static void Batch_vao_cache_clear(Gwn_Batch* batch)
+	{
+	if (batch->is_dynamic_vao_count)
+		{
+		for (int i = 0; i < batch->dynamic_vaos.count; ++i)
+			{
+			if (batch->dynamic_vaos.vao_ids[i])
+				GWN_vao_free(batch->dynamic_vaos.vao_ids[i], batch->context);
+			if (batch->dynamic_vaos.interfaces[i])
+				GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->dynamic_vaos.interfaces[i], batch);
+			}
+		free(batch->dynamic_vaos.interfaces);
+		free(batch->dynamic_vaos.vao_ids);
+		}
+	else
+		{
+		for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+			{
+			if (batch->static_vaos.vao_ids[i])
+				GWN_vao_free(batch->static_vaos.vao_ids[i], batch->context);
+			if (batch->static_vaos.interfaces[i])
+				GWN_shaderinterface_remove_batch_ref((Gwn_ShaderInterface *)batch->static_vaos.interfaces[i], batch);
+			}
+		}
+
+	batch->is_dynamic_vao_count = false;
+	for (int i = 0; i < GWN_BATCH_VAO_STATIC_LEN; ++i)
+		{
+		batch->static_vaos.vao_ids[i] = 0;
+		batch->static_vaos.interfaces[i] = NULL;
+		}
+	}
 
 Gwn_Batch* GWN_batch_create_ex(
         Gwn_PrimType prim_type, Gwn_VertBuf* verts, Gwn_IndexBuf* elem,
@@ -40,11 +76,25 @@ void GWN_batch_init_ex(
 	batch->verts[0] = verts;
 	for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
 		batch->verts[v] = NULL;
+	batch->inst = NULL;
 	batch->elem = elem;
-	batch->prim_type = prim_type;
 	batch->gl_prim_type = convert_prim_type_to_gl(prim_type);
 	batch->phase = GWN_BATCH_READY_TO_DRAW;
+	batch->is_dynamic_vao_count = false;
 	batch->owns_flag = owns_flag;
+	batch->free_callback = NULL;
+	}
+
+// This will share the VBOs with the new batch
+Gwn_Batch* GWN_batch_duplicate(Gwn_Batch* batch_src)
+	{
+	Gwn_Batch* batch = GWN_batch_create_ex(GWN_PRIM_POINTS, batch_src->verts[0], batch_src->elem, 0);
+
+	batch->gl_prim_type = batch_src->gl_prim_type;
+	for (int v = 1; v < GWN_BATCH_VBO_MAX_LEN; ++v)
+		batch->verts[v] = batch_src->verts[v];
+
+	return batch;
 	}
 
 void GWN_batch_discard(Gwn_Batch* batch)
@@ -52,6 +102,9 @@ void GWN_batch_discard(Gwn_Batch* batch)
 	if (batch->owns_flag & GWN_BATCH_OWNS_INDEX)
 		GWN_indexbuf_discard(batch->elem);
 
+	if (batch->owns_flag & GWN_BATCH_OWNS_INSTANCES)
+		GWN_vertbuf_discard(batch->inst);
+
 	if ((batch->owns_flag & ~GWN_BATCH_OWNS_INDEX) != 0)
 		{
 		for (int v = 0; v < GWN_BATCH_VBO_MAX_LEN; ++v)
@@ -63,12 +116,39 @@ void GWN_batch_discard(Gwn_Batch* ba

@@ Diff output truncated at 10240 characters. @@


