[Bf-blender-cvs] [396fe9a98af] temp-texture-painting-gpu: Researching bottlenecks.

Fri Oct 14 14:51:26 CEST 2022

Commit: 396fe9a98af41805ade2e10a1a36f08e1e59bda6
Author: Jeroen Bakker
Date:   Fri Oct 14 14:51:10 2022 +0200
Branches: temp-texture-painting-gpu
https://developer.blender.org/rB396fe9a98af41805ade2e10a1a36f08e1e59bda6

Researching bottlenecks.

===================================================================

M	source/blender/blenkernel/BKE_pbvh_pixels.hh
M	source/blender/blenkernel/intern/pbvh_pixels.cc
M	source/blender/editors/sculpt_paint/sculpt_paint_image.cc
M	source/blender/gpu/GPU_sculpt_shader_shared.h
M	source/blender/gpu/shaders/sculpt_paint/infos/sculpt_paint_image_info.hh
M	source/blender/gpu/shaders/sculpt_paint/sculpt_paint_image_comp.glsl
M	source/blender/gpu/shaders/sculpt_paint/sculpt_paint_image_merge_comp.glsl
M	source/blender/gpu/shaders/sculpt_paint/sculpt_paint_tile_lib.glsl

===================================================================

diff --git a/source/blender/blenkernel/BKE_pbvh_pixels.hh b/source/blender/blenkernel/BKE_pbvh_pixels.hh
index bff6bee41e4..1eed7aab423 100644
--- a/source/blender/blenkernel/BKE_pbvh_pixels.hh
+++ b/source/blender/blenkernel/BKE_pbvh_pixels.hh
@@ -106,14 +106,12 @@ struct UDIMTilePixels {
   Vector<PackedPixelRow> pixel_rows;
   int64_t gpu_buffer_offset;
   /* Region of the tile that can be painted on by this node. Size of a subtile is determined by TEXTURE_STREAMING_TILE_SIZE. */
-  /* TODO: use list of sub_tile_ids to not overcommit texture usage. */
-  rcti gpu_sub_tiles;
+  Vector<int2> gpu_sub_tiles;
 
   UDIMTilePixels()
   {
     flags.dirty = false;
     BLI_rcti_init_minmax(&dirty_region);
-    BLI_rcti_init_minmax(&gpu_sub_tiles);
   }
 
   void mark_dirty(const PackedPixelRow &pixel_row)
diff --git a/source/blender/blenkernel/intern/pbvh_pixels.cc b/source/blender/blenkernel/intern/pbvh_pixels.cc
index a87a29019b3..38fae311d04 100644
--- a/source/blender/blenkernel/intern/pbvh_pixels.cc
+++ b/source/blender/blenkernel/intern/pbvh_pixels.cc
@@ -15,6 +15,8 @@
 #include "BLI_math.h"
 #include "BLI_task.h"
 
+#include "PIL_time_utildefines.h"
+
 #include "BKE_image_wrappers.hh"
 
 #include "bmesh.h"
@@ -79,19 +81,37 @@ void NodeData::build_pixels_gpu_buffer()
 
 void UDIMTilePixels::init_gpu_sub_tiles()
 {
-  BLI_rcti_init_minmax(&gpu_sub_tiles);
+  BLI_assert(gpu_sub_tiles.is_empty());
+  const int max_sub_tiles = 16;
+  bool sub_tiles_hit[max_sub_tiles][max_sub_tiles];
+  for (int x = 0; x < max_sub_tiles; x++) {
+    for (int y = 0; y < max_sub_tiles; y++) {
+      sub_tiles_hit[x][y] = false;
+    }
+  }
+
+  int2 max_sub_tile_len(0, 0);
   for (const PackedPixelRow &elements : pixel_rows) {
     int2 subtile_from = int2(elements.start_image_coordinate / TEXTURE_STREAMING_TILE_SIZE);
     int2 coord_to = int2(elements.start_image_coordinate) + int2(elements.num_pixels + 1, 1);
     int2 subtile_to = int2(coord_to / TEXTURE_STREAMING_TILE_SIZE);
+    for (int x = subtile_from.x; x < subtile_to.x; x++) {
+      sub_tiles_hit[x][subtile_from.y] = true;
+    }
+  }
 
-    BLI_rcti_do_minmax_v(&gpu_sub_tiles, subtile_from);
-    BLI_rcti_do_minmax_v(&gpu_sub_tiles, subtile_to);
+  for (int x = 0; x < max_sub_tiles; x++) {
+    for (int y = 0; y < max_sub_tiles; y++) {
+      if (sub_tiles_hit[x][y]) {
+        gpu_sub_tiles.append(int2(x, y));
+      }
+    }
   }
 }
 
 void NodeData::init_gpu_sub_tiles()
 {
+  printf("%s\n", __func__);
   for (UDIMTilePixels &tile : tiles) {
     tile.init_gpu_sub_tiles();
   }
diff --git a/source/blender/editors/sculpt_paint/sculpt_paint_image.cc b/source/blender/editors/sculpt_paint/sculpt_paint_image.cc
index ddcf91d76e4..88f4aca67d8 100644
--- a/source/blender/editors/sculpt_paint/sculpt_paint_image.cc
+++ b/source/blender/editors/sculpt_paint/sculpt_paint_image.cc
@@ -553,7 +553,7 @@ static void init_paint_brush(const SculptSession &ss,
  * - Only tiles that are painted on are loaded in memory, painted on and merged back to the actual
  * texture.
  */
-template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
+template<int32_t Size, int32_t Depth = 16> class GPUSubTileTexture {
   struct Info {
     struct {
       bool in_use_stroke : 1;
@@ -572,7 +572,7 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
   std::array<int32_t, Depth> layer_lookup_;
 
   GPUTexture *gpu_texture_ = nullptr;
-  GPUStorageBuf *tile_buf_ = nullptr;
+  GPUStorageBuf *paint_tile_buf_ = nullptr;
   int64_t tile_buf_size_ = 0;
 
  public:
@@ -593,17 +593,9 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
       gpu_texture_ = nullptr;
     }
 
-    if (tile_buf_) {
-      GPU_storagebuf_free(tile_buf_);
-      tile_buf_ = nullptr;
-    }
-  }
-
-  void reset_usage()
-  {
-    printf("%s\n", __func__);
-    for (Info &info : infos_) {
-      info.flags.in_use = false;
+    if (paint_tile_buf_) {
+      GPU_storagebuf_free(paint_tile_buf_);
+      paint_tile_buf_ = nullptr;
     }
   }
 
@@ -766,6 +758,7 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
   {
     BLI_assert(gpu_texture_);
     float *buffer = nullptr;
+    bool tiles_updated = false;
     for (int64_t index : infos_.index_range()) {
       Info &info = infos_[index];
       PaintTileData &tile = paint_tiles_[index];
@@ -781,11 +774,22 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
         buffer = static_cast<float *>(MEM_callocN(Size * Size * 4 * sizeof(float), __func__));
       }
 
+      printf("%s: initializing tile {tile:%d, sub_tile:%d,%d, layer_id:%d}\n",
+             __func__,
+             tile.tile_number,
+             UNPACK2(tile.sub_tile_id),
+             tile.layer_id);
+
       /* TODO: Copy correct data from ImBuf.*/
 
-      // GPU_texture_update_sub(
-      //    gpu_texture_, GPU_DATA_FLOAT, buffer, 0, 0, tile.layer_id, Size, Size, 1);
+      GPU_texture_update_sub(
+          gpu_texture_, GPU_DATA_FLOAT, buffer, 0, 0, tile.layer_id, Size, Size, 1);
       info.flags.needs_update = false;
+      tiles_updated = true;
+    }
+
+    if (tiles_updated) {
+      GPU_memory_barrier(GPU_BARRIER_TEXTURE_UPDATE);
     }
 
     if (buffer) {
@@ -798,32 +802,42 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
     return gpu_texture_;
   }
 
-  void ensure_tile_buf()
+  void ensure_paint_tile_buf()
   {
     int64_t needed_size = paint_tiles_.capacity() * sizeof(PaintTileData);
 
     /* Reuse previous buffer only when exact size, due to potential out-of-bounds read errors. */
-    if (tile_buf_ && tile_buf_size_ == needed_size) {
+    if (paint_tile_buf_ && tile_buf_size_ == needed_size) {
       return;
     }
 
-    if (tile_buf_) {
-      GPU_storagebuf_free(tile_buf_);
-      tile_buf_ = nullptr;
+    if (paint_tile_buf_) {
+      GPU_storagebuf_free(paint_tile_buf_);
+      paint_tile_buf_ = nullptr;
+    }
+    paint_tile_buf_ = GPU_storagebuf_create(needed_size);
+  }
+
+  void update_paint_tile_buf()
+  {
+    BLI_assert(paint_tile_buf_);
+    for (PaintTileData &tile : paint_tiles_) {
+      tile.in_use_frame = false;
     }
-    tile_buf_ = GPU_storagebuf_create(needed_size);
+    GPU_storagebuf_update(paint_tile_buf_, paint_tiles_.data());
   }
 
-  void update_tile_buf()
+  void read_back_paint_tile_buf()
   {
-    BLI_assert(tile_buf_);
-    GPU_storagebuf_update(tile_buf_, paint_tiles_.data());
+    BLI_assert(paint_tile_buf_);
+    // GPU_memory_barrier(GPU_BARRIER_SHADER_STORAGE);
+    GPU_storagebuf_read(paint_tile_buf_, paint_tiles_.data());
   }
 
-  GPUStorageBuf *tile_buf_get()
+  GPUStorageBuf *paint_tile_buf_get()
   {
-    BLI_assert(tile_buf_);
-    return tile_buf_;
+    BLI_assert(paint_tile_buf_);
+    return paint_tile_buf_;
   }
 
   int32_t paint_tiles_len()
@@ -835,7 +849,7 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
   {
     GPU_texture_image_bind(gpu_texture_get(),
                            GPU_shader_get_texture_binding(shader, "paint_tiles_img"));
-    GPU_storagebuf_bind(tile_buf_get(), GPU_shader_get_ssbo(shader, "paint_tile_buf"));
+    GPU_storagebuf_bind(paint_tile_buf_get(), GPU_shader_get_ssbo(shader, "paint_tile_buf"));
     GPU_shader_uniform_1i(shader, "paint_tile_buf_len", paint_tiles_len());
   }
 
@@ -847,6 +861,11 @@ template<int32_t Size, int32_t Depth = 512> class GPUSubTileTexture {
       if (!info.flags.in_use_frame) {
         continue;
       }
+      /*
+      PaintTileData &paint_tile = paint_tiles_[index];
+      if (!paint_tile.in_use_frame) {
+        continue;
+      }*/
       predicate(paint_tiles_[index]);
     }
   }
@@ -1029,19 +1048,15 @@ static void gpu_painting_image_merge(GPUSculptPaintData &batches,
                                      ImageUser &image_user,
                                      ImBuf &image_buffer)
 {
+  GPU_memory_barrier(GPU_BARRIER_SHADER_IMAGE_ACCESS);
   GPUTexture *canvas_tex = BKE_image_get_gpu_texture(&image, &image_user, &image_buffer);
   GPUShader *shader = SCULPT_shader_paint_image_merge_get();
   GPU_shader_bind(shader);
   batches.tile_texture.bind(shader);
   GPU_texture_image_bind(canvas_tex, GPU_shader_get_texture_binding(shader, "texture_img"));
   batches.tile_texture.foreach_in_frame([shader](PaintTileData &paint_tile) {
-    printf("%s: merging tile stored on layer %d {tile:%d sub_tile:%d,%d} \n",
-           __func__,
-           paint_tile.layer_id,
-           paint_tile.tile_number,
-           UNPACK2(paint_tile.sub_tile_id));
     GPU_shader_uniform_1i(shader, "layer_id", paint_tile.layer_id);
-    GPU_compute_dispatch(shader, TEXTURE_STREAMING_TILE_SIZE, TEXTURE_STREAMING_TILE_SIZE, 1);
+    GPU_compute_dispatch(shader, TEXTURE_STREAMING_TILE_SIZE / 32, TEXTURE_STREAMING_TILE_SIZE, 1);
   });
 }
 
@@ -1064,7 +1079,7 @@ static void init_paint_step(const SculptSession &ss,
   }
 }
 
-static void dispatch_gpu_painting(TexturePaintingUserData &data)
+static void add_paint_step(TexturePaintingUserData &data)
 {
   SculptSession &ss = *data.ob->sculpt;
 
@@ -1073,6 +1088,7 @@ static void dispatch_gpu_painting(TexturePaintingUserData &data)
   PaintStepData paint_step;
   init_paint_step(ss, *data.brush, paint_step);
   batches.steps.append(paint_step);
+  PIL_sleep_ms(1);
 }
 
 /* This should be done based on the frame_selection nodes, otherwise we might be over
@@ -1086,11 +1102,8 @@ static void paint_tiles_mark_used(TexturePaintingUserData &data)
   for (PBVHNode *node : MutableSpan<PBVHNode *>(data.nodes, data.nodes_len)) {
     NodeData &node_data = BKE_pbvh_pixels_node_data_get(*node);
     for (UDIMTilePixels &tile : node_data.tiles) {
-      for (int x = tile.gpu_sub_tiles.xmin; x <= tile.gpu_sub_tiles.xmax; x++) {
-        for (int y = tile.gpu_sub_tiles.ymin; y <= tile.gpu_sub_tiles.ymax; y++) {
-          int2 sub_tile_id(x, y);
-          batches.tile_texture.mark_usage(tile.tile_number, sub_tile_id);
-        }
+      for (int2 &sub_tile_id : tile.gpu_sub_tiles) {
+        batches.tile_texture.mark_usage(tile.tile_number, sub_tile_id);
       }
     }
   }
@@ -1120,7 +1133,7 @@ static TileNumbers collect_active_tile_numbers(const TexturePaintingUserData &da
   return result;
 }
 
-sta

@@ Diff output truncated at 10240 characters. @@