Introduce chunked_vector; it has somewhat more overhead than strictly needed, but let's see if it helps

This commit is contained in:
Stefanos Kornilios Mitsis Poiitidis
2025-03-22 22:50:11 +02:00
parent 7e7d7de546
commit 020fe7b5b5

View File

@@ -816,13 +816,188 @@ void beginUpdate(Camera* cam) {
}
std::vector<atomic_context_t> atomicContexts;
std::vector<mesh_context_t> meshContexts;
std::vector<skin_context_t> skinContexts;
std::vector<matfx_context_t> matfxContexts;
std::vector<std::function<void()>> opCallbacks;
std::vector<std::function<void()>> blendCallbacks;
std::vector<std::function<void()>> ptCallbacks;
// chunked_vector<T>: append-only storage built from fixed-size (chunk_size
// bytes) blocks linked into a list. Unlike std::vector, growing never
// relocates existing elements, so pointers handed out by emplace_many()
// remain valid until clear()/destruction. clear() destroys the elements but
// keeps the allocated chunks, so a refill does no heap allocation.
// Trade-offs: operator[] and size() are O(number of chunks), not O(1).
template<typename T>
struct chunked_vector {
	static constexpr size_t chunk_size = 8192;
	struct chunk;
	// Bookkeeping stored at the front of every chunk.
	struct chunk_header {
		chunk* prev;   // previous chunk in the list (nullptr in the first chunk)
		chunk* next;   // next chunk in the list (nullptr in the last chunk)
		size_t used;   // number of constructed elements in this chunk
		size_t free;   // remaining element slots in this chunk
	};
	struct chunk {
		// Elements that fit in one chunk after the header.
		static constexpr size_t item_count = (chunk_size - sizeof(chunk_header)) / sizeof(T);
		union {
			struct {
				chunk_header header;
				T items[item_count];
			};
			uint8_t data[chunk_size]; // pads the chunk to exactly chunk_size
		};
	};

	// In-object first chunk: the common small case never touches the heap.
	chunk first_chunk;
	chunk* first;
	chunk* last; // chunk that receives the next emplace

	chunked_vector()
		: first_chunk{ { nullptr, nullptr, 0, chunk::item_count } },
		first(&first_chunk), last(&first_chunk)
	{
		static_assert(sizeof(chunk) == chunk_size, "chunk size mismatch");
	}

	// Non-copyable / non-movable: `first`/`last` point at the in-object
	// first_chunk, so the compiler-generated copy or move would leave the new
	// object aliasing the source object's storage.
	chunked_vector(const chunked_vector&) = delete;
	chunked_vector& operator=(const chunked_vector&) = delete;

	// Destroy all elements, then release every heap-allocated chunk
	// (first_chunk lives inside the object and is not freed).
	~chunked_vector() {
		clear();
		chunk* curr = first_chunk.header.next;
		while (curr) {
			chunk* next = curr->header.next;
			::free(curr);
			curr = next;
		}
	}

	// Ensure `last` has room for amt more elements, advancing to the next
	// (retained, fully empty) chunk or allocating a fresh one as needed. Any
	// unused tail slots of the previous `last` are skipped, never reclaimed.
	// Shared by emplace_many()/emplace_back() — previously three copies.
	void reserve_in_last(size_t amt) {
		if (last->header.free >= amt)
			return;
		if (last->header.next && last->header.next->header.free >= amt) {
			// Reuse a chunk kept alive by an earlier clear().
			last = last->header.next;
			return;
		}
		chunk* new_chunk = static_cast<chunk*>(malloc(sizeof(chunk)));
		assert(new_chunk && "malloc failed in chunked_vector");
		new_chunk->header.prev = last;
		// Splice into the list instead of overwriting last->next: overwriting
		// would orphan (leak) any retained chunks that follow `last`.
		new_chunk->header.next = last->header.next;
		if (new_chunk->header.next)
			new_chunk->header.next->header.prev = new_chunk;
		new_chunk->header.used = 0;
		new_chunk->header.free = chunk::item_count;
		last->header.next = new_chunk;
		last = new_chunk;
	}

	// Return a reference to the last element. Precondition: not empty.
	T& back() {
		assert(last->header.used > 0 && "back() called on empty vector");
		return last->items[last->header.used - 1];
	}

	// Random access by walking the chunk list: O(number of chunks).
	T& operator[](size_t idx) {
		for (chunk* curr = first; curr; curr = curr->header.next) {
			if (idx < curr->header.used)
				return curr->items[idx];
			idx -= curr->header.used;
		}
		assert(0 && "Index out of range");
		return first->items[0]; // unreachable; silences missing-return warnings
	}

	// Default-construct amt elements in one contiguous run (a run never
	// straddles a chunk boundary) and return a pointer to the first of them.
	// amt must not exceed a single chunk's capacity.
	T* emplace_many(size_t amt) {
		assert(amt <= chunk::item_count && "emplace_many: amt exceeds a single chunk's capacity");
		reserve_in_last(amt);
		T* start_ptr = &last->items[last->header.used];
		for (size_t i = 0; i < amt; ++i) {
			new (&last->items[last->header.used]) T();
			last->header.used++;
			last->header.free--;
		}
		return start_ptr;
	}

	// Total element count across all chunks: O(number of chunks).
	size_t size() const {
		size_t total = 0;
		for (const chunk* curr = first; curr; curr = curr->header.next)
			total += curr->header.used;
		return total;
	}

	// Destroy all elements and mark every chunk empty. Chunk memory is kept
	// (not freed) so the container can be refilled without allocating.
	void clear() {
		for (chunk* curr = first; curr; curr = curr->header.next) {
			for (size_t i = 0; i < curr->header.used; ++i)
				curr->items[i].~T();
			curr->header.used = 0;
			curr->header.free = chunk::item_count;
		}
		last = first; // restart filling from the in-object chunk
	}

	// Append a default-constructed element.
	void emplace_back() {
		emplace_many(1);
	}

	// Append an element by moving it into place.
	void emplace_back(T&& v) {
		reserve_in_last(1);
		// v is a named rvalue reference (not a forwarding reference), so
		// std::move — not std::forward — expresses the intent.
		new (&last->items[last->header.used]) T(std::move(v));
		last->header.used++;
		last->header.free--;
	}

	// Invoke cb on every element, in insertion order. Templated so both
	// plain function pointers and (capturing) lambdas work, and the call
	// can be inlined.
	template<typename F>
	void forEach(F&& cb) {
		for (chunk* curr = first; curr; curr = curr->header.next)
			for (size_t i = 0; i < curr->header.used; ++i)
				cb(curr->items[i]);
	}
};
// Per-frame recording lists. chunked_vector never relocates elements on
// growth, so pointers obtained while recording (e.g. from emplace_many())
// stay valid until clear().
chunked_vector<atomic_context_t> atomicContexts;
chunked_vector<mesh_context_t> meshContexts;
chunked_vector<skin_context_t> skinContexts;
// emplace_many() requires each request to fit in one chunk; skin allocations
// request skin->numBones contexts at once, so guarantee at least 64 fit
// (assumes numBones <= 64 — TODO confirm against the model pipeline).
static_assert(chunked_vector<skin_context_t>::chunk::item_count >= 64);
chunked_vector<matfx_context_t> matfxContexts;
// Deferred draw callbacks, replayed per PVR list: opaque (OP), blended (TR),
// punch-through (PT) — see endUpdate().
chunked_vector<std::function<void()>> opCallbacks;
chunked_vector<std::function<void()>> blendCallbacks;
chunked_vector<std::function<void()>> ptCallbacks;
void dcMotionBlur_v1(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
@@ -1125,26 +1300,26 @@ void endUpdate(Camera* cam) {
pvr_list_begin(PVR_LIST_OP_POLY);
enter_oix();
if (opCallbacks.size()) {
for (auto&& cb: opCallbacks) {
opCallbacks.forEach([](auto &cb) {
cb();
}
});
}
pvr_list_finish();
if (ptCallbacks.size()) {
PVR_SET(0x11C, 64); // PT Alpha test value
pvr_dr_init(&drState);
pvr_list_begin(PVR_LIST_PT_POLY);
for (auto&& cb: ptCallbacks) {
ptCallbacks.forEach([](auto &cb) {
cb();
}
});
pvr_list_finish();
}
pvr_list_begin(PVR_LIST_TR_POLY);
if (blendCallbacks.size()) {
pvr_dr_init(&drState);
for (auto&& cb: blendCallbacks) {
blendCallbacks.forEach([](auto &cb) {
cb();
}
});
}
if (vertexOverflown()) {
@@ -3567,8 +3742,8 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
size_t skinContextOffset = skinContexts.size();
bool skinMatrix0Identity = false;
if (skin) {
skinContexts.resize(skinContextOffset + skin->numBones);
skinMatrix0Identity = uploadSkinMatrices(atomic, &(skinContexts.data() + skinContextOffset)->mtx);
auto allocation = skinContexts.emplace_many(skin->numBones);
skinMatrix0Identity = uploadSkinMatrices(atomic, &allocation->mtx);
}
atomicContexts.emplace_back();
@@ -3621,7 +3796,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
if (doEnvironmentMaps && matfx && matfx->type == MatFX::ENVMAP && matfx->fx[0].env.tex != nil && matfx->fx[0].env.coefficient != 0.0f) {
isMatFX = true;
matfxCoefficient = matfx->fx[0].env.coefficient;
matfxContexts.resize(matfxContexts.size() + 1);
matfxContexts.emplace_back();
// N.B. world here gets converted to a 3x3 matrix
// this is fine, as we only use it for env mapping from now on
uploadEnvMatrix(matfx->fx[0].env.frame, &world, &matfxContexts.back().mtx);
@@ -3841,7 +4016,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
bool small_xyz = selector & 8;
unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2;
tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &(skinContexts.data() + acp->skinContextOffset)->mtx);
tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset], normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &skinContexts[acp->skinContextOffset].mtx);
mat_load(&mtx);
tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64);
@@ -4744,6 +4919,13 @@ driverOpen(void *o, int32, int32)
}
}
#endif
dbglog(DBG_CRITICAL, "atomicContexts: %d per %d allocation\n", decltype(atomicContexts)::chunk::item_count, decltype(atomicContexts)::chunk_size);
dbglog(DBG_CRITICAL, "skinContexts: %d per %d allocation\n", decltype(skinContexts)::chunk::item_count, decltype(atomicContexts)::chunk_size);
dbglog(DBG_CRITICAL, "matfxContexts: %d per %d allocation\n", decltype(matfxContexts)::chunk::item_count, decltype(atomicContexts)::chunk_size);
dbglog(DBG_CRITICAL, "opCallbacks: %d per %d allocation\n", decltype(opCallbacks)::chunk::item_count, decltype(atomicContexts)::chunk_size);
dbglog(DBG_CRITICAL, "blendCallbacks: %d per %d allocation\n", decltype(blendCallbacks)::chunk::item_count, decltype(atomicContexts)::chunk_size);
dbglog(DBG_CRITICAL, "ptCallbacks: %d per %d allocation\n", decltype(ptCallbacks)::chunk::item_count, decltype(atomicContexts)::chunk_size);
pvr_init(&pvr_params);