[Bf-blender-cvs] [d9a7ec3947e] tmp-drw-callbatching: DRW: Use new GPUDrawList to speedup instancing

Clément Foucault noreply at git.blender.org
Sat Aug 17 14:50:33 CEST 2019


Commit: d9a7ec3947eb7056bb3662cd8ce4a9b420ab3065
Author: Clément Foucault
Date:   Wed Jun 19 16:01:02 2019 +0200
Branches: tmp-drw-callbatching
https://developer.blender.org/rBd9a7ec3947eb7056bb3662cd8ce4a9b420ab3065

DRW: Use new GPUDrawList to speedup instancing

This improves the performance of random instancing even further.
Test 30K objects:
42 fps -> 52 fps
24 ms  -> 19 ms

===================================================================

M	source/blender/draw/intern/draw_manager.c
M	source/blender/draw/intern/draw_manager.h
M	source/blender/draw/intern/draw_manager_exec.c

===================================================================

diff --git a/source/blender/draw/intern/draw_manager.c b/source/blender/draw/intern/draw_manager.c
index 8a43ddbc4d5..6037d1e6435 100644
--- a/source/blender/draw/intern/draw_manager.c
+++ b/source/blender/draw/intern/draw_manager.c
@@ -724,6 +724,10 @@ static void drw_viewport_var_init(void)
     G_draw.view_ubo = DRW_uniformbuffer_create(sizeof(DRWViewUboStorage), NULL);
   }
 
+  if (DST.draw_list == NULL) {
+    DST.draw_list = GPU_draw_list_create(DRW_DRAWLIST_LEN);
+  }
+
   memset(DST.object_instance_data, 0x0, sizeof(DST.object_instance_data));
 }
 
@@ -2908,6 +2912,10 @@ void DRW_engines_free(void)
 
   MEM_SAFE_FREE(DST.uniform_names.buffer);
 
+  if (DST.draw_list) {
+    GPU_draw_list_discard(DST.draw_list);
+  }
+
   DRW_opengl_context_disable();
 }
 
diff --git a/source/blender/draw/intern/draw_manager.h b/source/blender/draw/intern/draw_manager.h
index 54e35c7e5c4..1b4cb06a59c 100644
--- a/source/blender/draw/intern/draw_manager.h
+++ b/source/blender/draw/intern/draw_manager.h
@@ -286,14 +286,6 @@ struct DRWView {
   void *user_data;
 };
 
-/* TODO(fclem): Future awaits */
-#if 0
-typedef struct ModelUboStorage {
-  float model[4][4];
-  float modelinverse[4][4];
-} ModelUboStorage;
-#endif
-
 /* ------------ Data Chunks --------------- */
 /**
  * In order to keep a cache friendly data structure,
@@ -347,6 +339,7 @@ typedef struct DRWDebugSphere {
 #define DST_MAX_SLOTS 64  /* Cannot be changed without modifying RST.bound_tex_slots */
 #define MAX_CLIP_PLANES 6 /* GL_MAX_CLIP_PLANES is at least 6 */
 #define STENCIL_UNDEFINED 256
+#define DRW_DRAWLIST_LEN 256
 typedef struct DRWManager {
   /* TODO clean up this struct a bit */
   /* Cache generation */
@@ -431,6 +424,8 @@ typedef struct DRWManager {
   /** Mutex to lock the drw manager and avoid concurrent context usage. */
   TicketMutex *gl_context_mutex;
 
+  GPUDrawList *draw_list;
+
   /** GPU Resource State: Memory storage between drawing. */
   struct {
     /* High end GPUs supports up to 32 binds per shader stage.
diff --git a/source/blender/draw/intern/draw_manager_exec.c b/source/blender/draw/intern/draw_manager_exec.c
index 107c80d7619..e140d005796 100644
--- a/source/blender/draw/intern/draw_manager_exec.c
+++ b/source/blender/draw/intern/draw_manager_exec.c
@@ -597,6 +597,23 @@ BLI_INLINE void draw_legacy_matrix_update(DRWShadingGroup *shgroup,
   }
 }
 
+BLI_INLINE void draw_geometry_bind(DRWShadingGroup *shgroup, GPUBatch *geom)
+{
+  /* XXX hacking gawain. we don't want to call glUseProgram! (huge performance loss) */
+  if (DST.batch) {
+    DST.batch->program_in_use = false;
+  }
+
+  DST.batch = geom;
+
+  GPU_batch_program_set_no_use(
+      geom, GPU_shader_get_program(shgroup->shader), GPU_shader_get_interface(shgroup->shader));
+
+  geom->program_in_use = true;
+
+  GPU_batch_bind(geom);
+}
+
 BLI_INLINE void draw_geometry_execute(DRWShadingGroup *shgroup,
                                       GPUBatch *geom,
                                       int vert_first,
@@ -617,20 +634,33 @@ BLI_INLINE void draw_geometry_execute(DRWShadingGroup *shgroup,
 
   /* bind vertex array */
   if (DST.batch != geom) {
-    DST.batch = geom;
-
-    GPU_batch_program_set_no_use(
-        geom, GPU_shader_get_program(shgroup->shader), GPU_shader_get_interface(shgroup->shader));
-
-    GPU_batch_bind(geom);
+    draw_geometry_bind(shgroup, geom);
   }
 
-  /* XXX hacking gawain. we don't want to call glUseProgram! (huge performance loss) */
-  geom->program_in_use = true;
-
   GPU_batch_draw_advanced(geom, vert_first, vert_count, inst_first, inst_count);
+}
 
-  geom->program_in_use = false; /* XXX hacking gawain */
+BLI_INLINE void draw_indirect_call(DRWShadingGroup *shgroup,
+                                   GPUBatch *geom,
+                                   int vert_first,
+                                   int vert_count,
+                                   int inst_first,
+                                   int inst_count,
+                                   int baseinst_loc)
+{
+  if (baseinst_loc == -1) {
+    /* bind vertex array */
+    if (DST.batch != geom) {
+      GPU_draw_list_submit(DST.draw_list);
+      draw_geometry_bind(shgroup, geom);
+    }
+    GPU_draw_list_command_add(DST.draw_list, vert_first, vert_count, inst_first, inst_count);
+  }
+  /* Fallback when unsupported */
+  else if (inst_count > 0) {
+    draw_geometry_execute(
+        shgroup, geom, vert_first, vert_count, inst_first, inst_count, baseinst_loc);
+  }
 }
 
 enum {
@@ -992,6 +1022,8 @@ typedef struct DRWCommandsState {
   int resource_chunk;
   int base_inst;
   int inst_count;
+  int v_first;
+  int v_count;
   GPUBatch *batch;
   bool neg_scale;
 } DRWCommandsState;
@@ -1040,9 +1072,18 @@ static bool draw_call_do_batching(DRWShadingGroup *shgroup,
     * where any of the above conditions are true. */
     BLI_assert(state->inst_count == 0);
     if (state->inst_count > 0) {
-      draw_geometry_execute(
-          shgroup, state->batch, 0, 0, state->base_inst, state->inst_count, baseinst_loc);
+      /* We need to draw the pending instances. */
+      draw_indirect_call(shgroup,
+                         state->batch,
+                         state->v_first,
+                         state->v_count,
+                         state->base_inst,
+                         state->inst_count,
+                         baseinst_loc);
     }
+    /* Submit the pending commands. */
+    /* NOTE/TODO: We could allow command list usage in this case. */
+    GPU_draw_list_submit(DST.draw_list);
     /* We cannot pack in this situation. */
     state->inst_count = 0;
     state->base_inst = 0;
@@ -1051,20 +1092,44 @@ static bool draw_call_do_batching(DRWShadingGroup *shgroup,
   }
   else {
     /* See if any condition requires interrupting the packing. */
-    if ((call->handle.id != state->base_inst + state->inst_count) || /* Is the id consecutive? */
-        (call->handle.negative_scale != state->neg_scale) ||         /* */
-        (call->handle.chunk != state->resource_chunk) ||             /* */
-        (call->batch != state->batch)                                /* */
+    if ((call->handle.negative_scale != state->neg_scale) || /* Need to change state. */
+        (call->handle.chunk != state->resource_chunk) ||     /* Need to change UBOs. */
+        (call->batch != state->batch)                        /* Need to change VAO. */
     ) {
       /* We need to draw the pending instances. */
-      if (state->inst_count > 0) {
-        draw_geometry_execute(
-            shgroup, state->batch, 0, 0, state->base_inst, state->inst_count, baseinst_loc);
-      }
+      draw_indirect_call(shgroup,
+                         state->batch,
+                         state->v_first,
+                         state->v_count,
+                         state->base_inst,
+                         state->inst_count,
+                         baseinst_loc);
+      /* Submit the pending commands. */
+      GPU_draw_list_submit(DST.draw_list);
+
+      state->batch = call->batch;
+      state->v_first = 0;
+      state->v_count = (call->batch->elem) ? call->batch->elem->index_len :
+                                             call->batch->verts[0]->vertex_len;
       state->inst_count = 1;
       state->base_inst = call->handle.id;
-      state->batch = call->batch;
+
       draw_call_resource_bind(state, call->handle, obmats_loc, obinfos_loc, chunkid_loc);
+
+      GPU_draw_list_init(DST.draw_list, state->batch);
+    }
+    /* Is the id consecutive? */
+    else if (call->handle.id != state->base_inst + state->inst_count) {
+      /* We need to add a draw command for the pending instances. */
+      draw_indirect_call(shgroup,
+                         state->batch,
+                         state->v_first,
+                         state->v_count,
+                         state->base_inst,
+                         state->inst_count,
+                         baseinst_loc);
+      state->inst_count = 1;
+      state->base_inst = call->handle.id;
     }
     else {
       state->inst_count++;
@@ -1072,6 +1137,25 @@ static bool draw_call_do_batching(DRWShadingGroup *shgroup,
     return true;
   }
 }
+
+/* Flush remaining pending drawcalls. */
+static void draw_call_batching_finish(DRWShadingGroup *shgroup,
+                                      DRWCommandsState *state,
+                                      int baseinst_loc)
+{
+  if (state->inst_count > 0) {
+    /* Add last instance call if there was any in preparation. */
+    draw_indirect_call(shgroup,
+                       state->batch,
+                       state->v_first,
+                       state->v_count,
+                       state->base_inst,
+                       state->inst_count,
+                       baseinst_loc);
+  }
+  /* Flush the last pending drawcalls batched together. */
+  GPU_draw_list_submit(DST.draw_list);
+}
 #endif
 
 static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
@@ -1095,6 +1179,10 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
     }
     GPU_shader_bind(shgroup->shader);
     DST.shader = shgroup->shader;
+    /* XXX hacking gawain */
+    if (DST.batch) {
+      DST.batch->program_in_use = false;
+    }
     DST.batch = NULL;
   }
 
@@ -1122,6 +1210,10 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
   {
     GPUBatch *first_batch = (shgroup->calls.first) ? shgroup->calls.first->calls[0].batch : NULL;
 
+    if (first_batch) {
+      GPU_draw_list_init(DST.draw_list, first_batch);
+    }
+
     DRWCallIterator iter;
     draw_call_iter_begin(&iter, shgroup);
     DRWCall *call;
@@ -1132,6 +1224,10 @@ static void draw_shgroup(DRWShadingGroup *shgroup, DRWState pass_state)
         .base_inst = 0,
         .inst_count = 0,
         .callid = 0,
+        .v_first = 0,
+        .v_count = (first_batch ? (first_batch->elem ? first_batch->elem->index_len :
+                                                       first_batch->verts[0]->vertex_len) :
+                                  0),
         .batch = first_batch,
     };
     while ((call = draw_call_iter_step(&iter))) {
@@ -1167,13 +1263,8 @@ static void draw_shgroup(DRW

@@ Diff output truncated at 10240 characters. @@



More information about the Bf-blender-cvs mailing list