Merge branch 'skmp/oom-mitigation-3' into 'main'

Memory usage improvements and memleak fixes. See merge request skmp/dca3-game!72
2025-09-01 18:52:58 +02:00 · 2025-03-25 18:07:56 +00:00
parent ad0e0ac6fb cfaab31a71
commit 6eba303fb0
26 changed files with 922 additions and 349 deletions
--- a/src/liberty/animation/AnimManager.cpp
+++ b/src/liberty/animation/AnimManager.cpp
@@ -11,6 +11,8 @@
 #include "AnimBlendAssocGroup.h"
 #include "AnimManager.h"

+void* re3StreamingAlloc(size_t size);
+
 CAnimBlock CAnimManager::ms_aAnimBlocks[NUMANIMBLOCKS];
 CAnimBlendHierarchy CAnimManager::ms_aAnimations[NUMANIMATIONS];
 int32 CAnimManager::ms_numAnimBlocks;
@@ -837,7 +839,7 @@ CAnimManager::LoadAnimFile(int fd, bool compress)
 			uint16_t flags;
 			CFileMgr::Read(fd, (char*)&flags, sizeof(flags));

-			seq->keyFrames = RwMalloc(dataSize);
+			seq->keyFrames = re3StreamingAlloc(dataSize);
 			assert(seq->keyFrames);
 			CFileMgr::Read(fd, (char*)seq->keyFrames, dataSize - sizeof(flags));
 			seq->type = flags;
--- a/src/liberty/audio/sampman_dc.cpp
+++ b/src/liberty/audio/sampman_dc.cpp
@@ -175,6 +175,12 @@ file_t fdPedSfx;
 volatile uint32 nPedSfxReqReadId = 1;
 volatile uint32 nPedSfxReqNextId = 1;

+// Staging buffer shared by the audio loaders in this file, serialized by stagingBufferMtx (wasteful and temporary)
+#define BANK_STAGE_SIZE (16 * 2048) // parenthesized so the macro is safe inside larger expressions
+static  uint8_t stagingBufferBank[BANK_STAGE_SIZE] __attribute__((aligned(32)));
+std::mutex stagingBufferMtx;
+
+
 static int32 DCStreamedLength[TOTAL_STREAMED_SOUNDS];

 struct WavHeader {
@@ -568,16 +574,19 @@ cSampleManager::LoadSampleBank(uint8 nBank)
 		// TODO: Split per-bank sfx file
 		int fd = fs_open(SampleBankDataFilename, O_RDONLY);
 		assert(fd >= 0);
-		// this is very wasteful and temporary
-		void* stagingBuffer = memalign(32, 32 * 2048);
-		assert(stagingBuffer != 0);
+		
+		
+		{
+			std::lock_guard lk(stagingBufferMtx); // for stagingBufferBank
+		
+			void* stagingBuffer = stagingBufferBank;

 			// Ideally, we'd suspend the CdStream thingy here or read via that instead
 			uintptr_t loadOffset = bank.base;
 			fs_seek(fd, fileStart, SEEK_SET);

 			while (fileSize > 0) {
-			size_t readSize = fileSize > 32 * 2048 ? 32 * 2048 : fileSize;
+				size_t readSize = fileSize > sizeof(stagingBufferBank) ? sizeof(stagingBufferBank) : fileSize;
 				int rs = fs_read(fd, stagingBuffer, readSize);
 				debugf("Read %d bytes, expected %d\n", rs, readSize);
 				assert(rs == readSize);
@@ -586,8 +595,8 @@ cSampleManager::LoadSampleBank(uint8 nBank)
 				fileSize -= readSize;
 				debugf("Loaded %d bytes, %d remaining\n", readSize, fileSize);
 			}
+		}
 		fs_close(fd);
-		free(stagingBuffer);
 		

 		for (int nSfx = BankStartOffset[nBank]; nSfx < BankStartOffset[nBank+1]; nSfx++) {
@@ -736,7 +745,10 @@ cSampleManager::LoadPedComment(uint32 nComment)
 		// TODO: When we can dma directly to AICA, we can use this instead
 		// fs_read(fdPedSfx, SPU_BASE_U8 + (uintptr_t)cmd->dest, cmd->size);

-		void* stagingBuffer = memalign(32, cmd->size);
+		assert(cmd->size < sizeof(stagingBufferBank));
+		{
+			std::lock_guard lk(stagingBufferMtx); // for stagingBufferBank
+			void* stagingBuffer = stagingBufferBank;
 			assert(stagingBuffer != 0);
 			debugf("Allocated %d bytes at %p\n", cmd->size, stagingBuffer);
 			int rs = fs_read(fdPedSfx, stagingBuffer, cmd->size);
@@ -744,7 +756,8 @@ cSampleManager::LoadPedComment(uint32 nComment)
 			assert(rs == cmd->size);
 	
 			spu_memload((uintptr_t)cmd->dest, stagingBuffer, cmd->size);
-		free(stagingBuffer);
+		}
+		
 		nPedSfxReqReadId = nPedSfxReqReadId + 1;
 	});

@@ -1268,6 +1281,8 @@ cSampleManager::InitialiseSampleBanks(void)
 		assert(m_aSamples[nComment].nByteSize <= PED_BLOCKSIZE_ADPCM);
 	}

+	assert(PED_BLOCKSIZE_ADPCM <= BANK_STAGE_SIZE);
+
 	LoadSampleBank(SFX_BANK_0);
 	
 	return TRUE;
--- a/src/liberty/collision/ColModel.cpp
+++ b/src/liberty/collision/ColModel.cpp
@@ -2,6 +2,9 @@
 #include "ColModel.h"
 #include "Game.h"
 #include "MemoryHeap.h"
+#include "Collision.h"
+
+void* re3StreamingAlloc(size_t size);

 CColModel::CColModel(void)
 {
@@ -22,12 +25,12 @@ CColModel::CColModel(void)
 CColModel::~CColModel(void)
 {
 	RemoveCollisionVolumes();
-	RemoveTrianglePlanes();
 }

 void
 CColModel::RemoveCollisionVolumes(void)
 {
+	CCollision::RemoveTrianglePlanes(this);
 	if(ownsCollisionVolumes){
 		RwFree(spheres);
 		RwFree(lines);
@@ -93,6 +96,8 @@ CColModel::operator=(const CColModel &other)
 	int i;
 	int numVerts;

+	CCollision::RemoveTrianglePlanes(this);
+
 	boundingSphere = other.boundingSphere;
 	boundingBox = other.boundingBox;

@@ -163,7 +168,7 @@ CColModel::operator=(const CColModel &other)
 		if(vertices)
 			RwFree(vertices);
 		if(numVerts){
-			vertices = (CompressedVector*)RwMalloc(numVerts*sizeof(CompressedVector));
+			vertices = (CompressedVector*)re3StreamingAlloc(numVerts*sizeof(CompressedVector));
 			for(i = 0; i < numVerts; i++)
 				vertices[i] = other.vertices[i];
 		}
@@ -173,7 +178,7 @@ CColModel::operator=(const CColModel &other)
 			numTriangles = other.numTriangles;
 			if(triangles)
 				RwFree(triangles);
-			triangles = (CColTriangle*)RwMalloc(numTriangles*sizeof(CColTriangle));
+			triangles = (CColTriangle*)re3StreamingAlloc(numTriangles*sizeof(CColTriangle));
 		}
 		for(i = 0; i < numTriangles; i++)
 			triangles[i] = other.triangles[i];
--- a/src/liberty/collision/Collision.cpp
+++ b/src/liberty/collision/Collision.cpp
@@ -2287,6 +2287,15 @@ CCollision::DistToLine(const CVector *l0, const CVector *l1, const CVector *poin
 	return (*point - closest).Magnitude();
 }

+void
+CCollision::RemoveTrianglePlanes(CColModel *model)
+{
+	if(model->trianglePlanes == nil)
+		return;	// no triangle planes on this model, nothing to release
+	ms_colModelCache.Remove(model->GetLinkPtr());	// drop the model's link from ms_colModelCache first
+	model->RemoveTrianglePlanes();
+}
+
 void
 CCollision::CalculateTrianglePlanes(CColModel *model)
 {
--- a/src/liberty/collision/Collision.h
+++ b/src/liberty/collision/Collision.h
@@ -41,6 +41,7 @@ public:
 	static void DrawColModel(const CMatrix &mat, const CColModel &colModel);
 	static void DrawColModel_Coloured(const CMatrix &mat, const CColModel &colModel, int32 id);

+	static void RemoveTrianglePlanes(CColModel *model);
 	static void CalculateTrianglePlanes(CColModel *model);

 	// all these return true if there's a collision
--- a/src/liberty/core/FileLoader.cpp
+++ b/src/liberty/core/FileLoader.cpp
@@ -28,6 +28,8 @@

 #include <kos/dbglog.h>

+void* re3StreamingAlloc(size_t size);
+
 char CFileLoader::ms_line[256];

 const char*
@@ -221,7 +223,7 @@ CFileLoader::LoadCollisionFile(const char *filename)

 		mi = CModelInfo::GetModelInfo(modelname, nil);
 		if(mi){
-			if(mi->GetColModel()){
+			if(mi->GetColModel() && mi->DoesOwnColModel()){
 				LoadCollisionModel(work_buff+24, *mi->GetColModel(), modelname);
 			}else{
 				CColModel *model = new CColModel;
@@ -255,6 +257,24 @@ CFileLoader::LoadCollisionModel(uint8 *buf, CColModel &model, char *modelname)
 	model.boundingBox.max.z = *(float*)(buf+36);
 	model.numSpheres = *(int16*)(buf+40);
 	buf += 44;
+	if (model.spheres) {
+		RwFree(model.spheres);
+	}
+	if (model.lines) {
+		RwFree(model.lines);
+	}
+	if (model.boxes) {
+		RwFree(model.boxes);
+	}
+	if (model.vertices) {
+		RwFree(model.vertices);
+	}
+	if (model.triangles) {
+		RwFree(model.triangles);
+	}
+	if (model.trianglePlanes) {
+		CCollision::RemoveTrianglePlanes(&model);
+	}
 	if(model.numSpheres > 0){
 		model.spheres = (CColSphere*)RwMalloc(model.numSpheres*sizeof(CColSphere));
 		REGISTER_MEMPTR(&model.spheres);
@@ -292,7 +312,7 @@ CFileLoader::LoadCollisionModel(uint8 *buf, CColModel &model, char *modelname)
 	int32 numVertices = *(int16*)buf;
 	buf += 4;
 	if(numVertices > 0){
-		model.vertices = (CompressedVector*)RwMalloc(numVertices*sizeof(CompressedVector));
+		model.vertices = (CompressedVector*)re3StreamingAlloc(numVertices*sizeof(CompressedVector));
 		REGISTER_MEMPTR(&model.vertices);
 		for(i = 0; i < numVertices; i++){
 			model.vertices[i].SetFixed(*(int16*)buf, *(int16*)(buf+2), *(int16*)(buf+4));
@@ -304,7 +324,7 @@ CFileLoader::LoadCollisionModel(uint8 *buf, CColModel &model, char *modelname)
 	model.numTriangles = *(int16*)buf;
 	buf += 4;
 	if(model.numTriangles > 0){
-		model.triangles = (CColTriangle*)RwMalloc(model.numTriangles*sizeof(CColTriangle));
+		model.triangles = (CColTriangle*)re3StreamingAlloc(model.numTriangles*sizeof(CColTriangle));
 		REGISTER_MEMPTR(&model.triangles);
 		for(i = 0; i < model.numTriangles; i++){
 			model.triangles[i].Set(model.vertices, *(uint16*)buf, *(uint16*)(buf+2), *(uint16*)(buf+4), buf[6], buf[7]);
--- a/src/liberty/core/Streaming.cpp
+++ b/src/liberty/core/Streaming.cpp
@@ -1170,6 +1170,24 @@ bool re3EmergencyRemoveModel() {
 	return usedmem != CStreaming::ms_memoryUsed;
 }

+// Allocate `size` bytes through RwMalloc, removing streamed models on failure.
+// After each failed attempt re3RemoveLeastUsedModel() is tried first and,
+// only if that returns false, re3EmergencyRemoveModel(); the allocation is
+// then retried. Returns nil once neither call reports that it removed anything.
+void* re3StreamingAlloc(size_t size) {
+	for (;;) {
+		auto mem = RwMalloc(size);
+		if (mem != nil) {
+			return mem;
+		}
+		// Short-circuit keeps the call order: the emergency removal runs
+		// only when the least-used removal returned false.
+		if (!re3RemoveLeastUsedModel() && !re3EmergencyRemoveModel()) {
+			return nil;
+		}
+	}
+}
+
 bool
 CStreaming::RemoveLeastUsedModel(void)
 {
--- a/src/liberty/modelinfo/BaseModelInfo.cpp
+++ b/src/liberty/modelinfo/BaseModelInfo.cpp
@@ -38,6 +38,14 @@ CBaseModelInfo::DeleteCollisionModel(void)
 	}
 }

+void CBaseModelInfo::SetColModel(CColModel *col, bool owns) { // replaces m_colModel; 'owns' becomes m_bOwnsColModel
+	if (m_bOwnsColModel && m_colModel != col) { // free the outgoing owned model, never the one being installed
+		delete m_colModel;
+	}
+	m_colModel = col;
+	m_bOwnsColModel = owns;
+}
+
 void
 CBaseModelInfo::AddRef(void)
 {
--- a/src/liberty/modelinfo/BaseModelInfo.h
+++ b/src/liberty/modelinfo/BaseModelInfo.h
@@ -56,8 +56,7 @@ public:
 	}
 	char *GetModelName(void) { return m_name; }
 	void SetModelName(const char *name) { strncpy(m_name, name, MAX_MODEL_NAME); }
-	void SetColModel(CColModel *col, bool owns = false){
-		m_colModel = col; m_bOwnsColModel = owns; }
+	void SetColModel(CColModel *col, bool owns = false);
 	CColModel *GetColModel(void) { return m_colModel; }
 	bool DoesOwnColModel(void) { return m_bOwnsColModel; }
 	void DeleteCollisionModel(void);
--- a/src/liberty/objects/CutsceneHead.cpp
+++ b/src/liberty/objects/CutsceneHead.cpp
@@ -197,6 +197,10 @@ CCutsceneHead::PlayAnimation(const char *animName)
 		RwStreamSkip(stream, offset*2048);
 		if(RwStreamFindChunk(stream, rwID_HANIMANIMATION, nil, nil)){
 			anim = RpHAnimAnimationStreamRead(stream);
+			if (hier->interpolator->currentAnim) {
+				RpHAnimAnimationDestroy(hier->interpolator->currentAnim);
+				hier->interpolator->currentAnim = nil;
+			}
 			RpHAnimHierarchySetCurrentAnim(hier, anim);
 		}

--- a/src/liberty/save/PCSave.cpp
+++ b/src/liberty/save/PCSave.cpp
@@ -17,6 +17,8 @@

 #include "vmu/vmu.h"

+void* re3StreamingAlloc(size_t size);
+
 const char* _psGetUserFilesFolder();

 C_PcSave PcSaveHelper;
@@ -93,16 +95,17 @@ uint32_t C_PcSave::PcClassLoadRoutine(int32 file, uint8 *data) {
 		return size;
 	} else {
 		size &= ~0x80000000;
-		uint8* compressed = (uint8*)malloc(size);
+		uint8* compressed = (uint8*)re3StreamingAlloc(size);
+		assert(compressed);
 		err = CFileMgr::Read(file, (const char*)compressed, size) != size;
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
-			free(compressed);
+			RwFree(compressed);
 			return 0;
 		}

 		lzo_uint decompressed_size = 0;
 		auto crv = lzo1x_decompress(compressed, size, data, &decompressed_size, NULL);
-		free(compressed);
+		RwFree(compressed);
 		if (crv != LZO_E_OK) {
 			return 0;
 		}
@@ -117,31 +120,37 @@ uint32_t C_PcSave::PcClassLoadRoutine(int32 file, uint8 *data) {
 bool
 C_PcSave::PcClassSaveRoutine(int32 file, uint8 *data, uint32 size)
 {
-	void* wrkmem = malloc(LZO1X_1_MEM_COMPRESS);
-	uint8* compressed = (uint8*)malloc(size*2);
+	void* wrkmem = re3StreamingAlloc(LZO1X_1_MEM_COMPRESS);
+	assert(wrkmem);
+	uint8* compressed = (uint8*)re3StreamingAlloc(size*2);
+	assert(compressed);
 	lzo_uint compressed_size;
 	int crv = lzo1x_1_compress(data, size, compressed, &compressed_size, wrkmem);
-	free(wrkmem);
+	RwFree(wrkmem);
+
+	if (crv == LZO_E_OK && compressed_size >= size) {
+		crv = LZO_E_NOT_COMPRESSIBLE;
+	}

 	if (crv == LZO_E_OK) {
 		uint32_t compressed_size32 = compressed_size | 0x80000000;
 		bool err = CFileMgr::Write(file, (const char*)&compressed_size32, sizeof(compressed_size32)) != sizeof(compressed_size32);
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
-			free(compressed);
+			RwFree(compressed);
 			nErrorCode = SAVESTATUS_ERR_SAVE_WRITE;
 			strncpy(SaveFileNameJustSaved, ValidSaveName, sizeof(ValidSaveName) - 1);
 			return false;
 		}

 		err = CFileMgr::Write(file, (const char*)compressed, compressed_size) != compressed_size;
-		free(compressed);
+		RwFree(compressed);
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
 			nErrorCode = SAVESTATUS_ERR_SAVE_WRITE;
 			strncpy(SaveFileNameJustSaved, ValidSaveName, sizeof(ValidSaveName) - 1);
 			return false;
 		}
 	} else if (crv == LZO_E_NOT_COMPRESSIBLE) {
-		free(compressed);
+		RwFree(compressed);
 		uint32_t compressed_size32 = size;
 		bool err = CFileMgr::Write(file, (const char*)&compressed_size32, sizeof(compressed_size32)) != sizeof(compressed_size32);
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
@@ -156,7 +165,7 @@ C_PcSave::PcClassSaveRoutine(int32 file, uint8 *data, uint32 size)
 			return false;
 		}
 	} else {
-		free(compressed);
+		RwFree(compressed);
 		return false;
 	}

--- a/src/miami/animation/AnimManager.cpp
+++ b/src/miami/animation/AnimManager.cpp
@@ -12,6 +12,8 @@
 #include "AnimManager.h"
 #include "Streaming.h"

+void* re3StreamingAlloc(size_t size);
+
 CAnimBlock CAnimManager::ms_aAnimBlocks[NUMANIMBLOCKS];
 CAnimBlendHierarchy CAnimManager::ms_aAnimations[NUMANIMATIONS];
 int32 CAnimManager::ms_numAnimBlocks;
@@ -1312,7 +1314,7 @@ CAnimManager::LoadAnimFile(RwStream *stream, bool compress, char (*uncompressedA
 			uint16_t flags;
 			RwStreamRead(stream, &flags, sizeof(flags));

-			seq->keyFrames = RwMalloc(dataSize);
+			seq->keyFrames = re3StreamingAlloc(dataSize);
 			assert(seq->keyFrames);
 			RwStreamRead(stream, seq->keyFrames, dataSize - sizeof(flags));
 			seq->type = flags;
--- a/src/miami/animation/CutsceneMgr.cpp
+++ b/src/miami/animation/CutsceneMgr.cpp
@@ -419,7 +419,8 @@ CCutsceneMgr::DeleteCutsceneData(void)
 		CBaseModelInfo *minfo = CModelInfo::GetModelInfo(i);
 		CColModel *colModel = minfo->GetColModel();
 		if (colModel != &CTempColModels::ms_colModelPed1) {
-			delete colModel;
+			// no need to delete anymore, SetColModel will do it (~skmp)
+			//delete colModel;
 			minfo->SetColModel(&CTempColModels::ms_colModelPed1);
 		}
 	}
--- a/src/miami/audio/sampman_dc.cpp
+++ b/src/miami/audio/sampman_dc.cpp
@@ -182,6 +182,12 @@ uintptr_t gPlayerTalkData = 0;
 uint32 gPlayerTalkReqId = 0;
 #endif

+// Staging buffer shared by the audio loaders in this file, serialized by stagingBufferMtx (wasteful and temporary)
+#define BANK_STAGE_SIZE (16 * 2048) // parenthesized so the macro is safe inside larger expressions
+static  uint8_t stagingBufferBank[BANK_STAGE_SIZE] __attribute__((aligned(32)));
+std::mutex stagingBufferMtx;
+
+
 static int32 DCStreamedLength[TOTAL_STREAMED_SOUNDS];

 struct WavHeader {
@@ -581,16 +587,19 @@ cSampleManager::LoadSampleBank(uint8 nBank)
 		// TODO: Split per-bank sfx file
 		int fd = fs_open(SampleBankDataFilename, O_RDONLY);
 		assert(fd >= 0);
-		// this is very wasteful and temporary
-		void* stagingBuffer = memalign(32, 8 * 2048);
+		
+		fs_seek(fd, fileStart, SEEK_SET);
+		{ 
+			std::lock_guard lk(stagingBufferMtx); // for stagingBufferBank
+		
+			void* stagingBuffer = stagingBufferBank;
 			assert(stagingBuffer != 0);

 			// Ideally, we'd suspend the CdStream thingy here or read via that instead
 			uintptr_t loadOffset = bank.base;
-		fs_seek(fd, fileStart, SEEK_SET);

 			while (fileSize > 0) {
-			size_t readSize = fileSize > 8 * 2048 ? 8 * 2048 : fileSize;
+				size_t readSize = fileSize > sizeof(stagingBufferBank) ? sizeof(stagingBufferBank) : fileSize;
 				int rs = fs_read(fd, stagingBuffer, readSize);
 				debugf("Read %d bytes, expected %d\n", rs, readSize);
 				assert(rs == readSize);
@@ -599,8 +608,8 @@ cSampleManager::LoadSampleBank(uint8 nBank)
 				fileSize -= readSize;
 				debugf("Loaded %d bytes, %d remaining\n", readSize, fileSize);
 			}
+		}
 		fs_close(fd);
-		free(stagingBuffer);
 		

 		for (int nSfx = BankStartOffset[nBank]; nSfx < BankStartOffset[nBank+1]; nSfx++) {
@@ -693,7 +702,10 @@ cSampleManager::LoadMissionAudio(uint8 nSlot, uint32 nSample)
 		// TODO: When we can dma directly to AICA, we can use this instead
 		// fs_read(fdPedSfx, SPU_BASE_U8 + (uintptr_t)cmd->dest, cmd->size);

-		void* stagingBuffer = memalign(32, cmd->size);
+		assert(cmd->size < sizeof(stagingBufferBank));
+		{
+			std::lock_guard lk(stagingBufferMtx); // for stagingBufferBank
+			void* stagingBuffer = stagingBufferBank;
 			assert(stagingBuffer != 0);
 			debugf("Allocated %d bytes at %p\n", cmd->size, stagingBuffer);
 			int rs = fs_read(fdPedSfx, stagingBuffer, cmd->size);
@@ -701,7 +713,8 @@ cSampleManager::LoadMissionAudio(uint8 nSlot, uint32 nSample)
 			assert(rs == cmd->size);
 	
 			spu_memload((uintptr_t)cmd->dest, stagingBuffer, cmd->size);
-		free(stagingBuffer);
+		}
+		
 		nPedSfxReqReadId = nPedSfxReqReadId + 1;
 	});
 	
@@ -787,7 +800,10 @@ cSampleManager::LoadPedComment(uint32 nComment)
 		// TODO: When we can dma directly to AICA, we can use this instead
 		// fs_read(fdPedSfx, SPU_BASE_U8 + (uintptr_t)cmd->dest, cmd->size);

-		void* stagingBuffer = memalign(32, cmd->size);
+		assert(cmd->size < sizeof(stagingBufferBank));
+		{
+			std::lock_guard lk(stagingBufferMtx); // for stagingBufferBank
+			void* stagingBuffer = stagingBufferBank;
 			assert(stagingBuffer != 0);
 			debugf("Allocated %d bytes at %p\n", cmd->size, stagingBuffer);
 			int rs = fs_read(fdPedSfx, stagingBuffer, cmd->size);
@@ -795,7 +811,8 @@ cSampleManager::LoadPedComment(uint32 nComment)
 			assert(rs == cmd->size);

 			spu_memload((uintptr_t)cmd->dest, stagingBuffer, cmd->size);
-		free(stagingBuffer);
+		}
+
 		nPedSfxReqReadId = nPedSfxReqReadId + 1;
 	});

@@ -1349,16 +1366,21 @@ cSampleManager::InitialiseSampleBanks(void)
 	for (uint32 nComment = SAMPLEBANK_PED_START; nComment <= SAMPLEBANK_PED_END; nComment++) {
 		pedBlocksizeMax = Max(pedBlocksizeMax, m_aSamples[nComment].nByteSize);
 	}
+	assert(pedBlocksizeMax <= BANK_STAGE_SIZE);
 	debugf("Max ped comment size: %d\n", pedBlocksizeMax);

 #ifdef FIX_BUGS

 	// Find biggest player comment
 	uint32 nMaxPlayerSize = 0;
-	for (uint32 i = PLAYER_COMMENTS_START; i <= PLAYER_COMMENTS_END; i++)
+	for (uint32 i = PLAYER_COMMENTS_START; i <= PLAYER_COMMENTS_END; i++) {
 		nMaxPlayerSize = Max(nMaxPlayerSize, m_aSamples[i].nByteSize);
+	}

 	debugf("Max player comment size: %d\n", nMaxPlayerSize);
+
+	assert(nMaxPlayerSize < sizeof(stagingBufferBank));
+
 	gPlayerTalkData = snd_mem_malloc(nMaxPlayerSize);
 	ASSERT(gPlayerTalkData != 0);

--- a/src/miami/collision/ColModel.cpp
+++ b/src/miami/collision/ColModel.cpp
@@ -5,6 +5,8 @@
 #include "MemoryHeap.h"
 #include "Pools.h"

+void* re3StreamingAlloc(size_t size);
+
 CColModel::CColModel(void)
 {
 	numSpheres = 0;
@@ -43,13 +45,13 @@ CColModel::operator delete(void *p, size_t) throw()
 void
 CColModel::RemoveCollisionVolumes(void)
 {
+	CCollision::RemoveTrianglePlanes(this);
 	if(ownsCollisionVolumes){
 		RwFree(spheres);
 		RwFree(lines);
 		RwFree(boxes);
 		RwFree(vertices);
 		RwFree(triangles);
-		CCollision::RemoveTrianglePlanes(this);
 	}
 	numSpheres = 0;
 	numLines = 0;
@@ -109,6 +111,8 @@ CColModel::operator=(const CColModel &other)
 	int i;
 	int numVerts;

+	CCollision::RemoveTrianglePlanes(this);
+
 	boundingSphere = other.boundingSphere;
 	boundingBox = other.boundingBox;

@@ -179,7 +183,7 @@ CColModel::operator=(const CColModel &other)
 		if(vertices)
 			RwFree(vertices);
 		if(numVerts){
-			vertices = (CompressedVector*)RwMalloc(numVerts*sizeof(CompressedVector));
+			vertices = (CompressedVector*)re3StreamingAlloc(numVerts*sizeof(CompressedVector));
 			for(i = 0; i < numVerts; i++)
 				vertices[i] = other.vertices[i];
 		}
@@ -189,7 +193,7 @@ CColModel::operator=(const CColModel &other)
 			numTriangles = other.numTriangles;
 			if(triangles)
 				RwFree(triangles);
-			triangles = (CColTriangle*)RwMalloc(numTriangles*sizeof(CColTriangle));
+			triangles = (CColTriangle*)re3StreamingAlloc(numTriangles*sizeof(CColTriangle));
 		}
 		for(i = 0; i < numTriangles; i++)
 			triangles[i] = other.triangles[i];
--- a/src/miami/core/FileLoader.cpp
+++ b/src/miami/core/FileLoader.cpp
@@ -30,6 +30,8 @@
 #include "ColStore.h"
 #include "Occlusion.h"

+void* re3StreamingAlloc(size_t size);
+
 char CFileLoader::ms_line[256];

 const char*
@@ -303,6 +305,24 @@ CFileLoader::LoadCollisionModel(uint8 *buf, CColModel &model, char *modelname)
 	model.boundingBox.max.z = *(float*)(buf+36);
 	model.numSpheres = *(int16*)(buf+40);
 	buf += 44;
+	if (model.spheres) {
+		RwFree(model.spheres);
+	}
+	if (model.lines) {
+		RwFree(model.lines);
+	}
+	if (model.boxes) {
+		RwFree(model.boxes);
+	}
+	if (model.vertices) {
+		RwFree(model.vertices);
+	}
+	if (model.triangles) {
+		RwFree(model.triangles);
+	}
+	if (model.trianglePlanes) {
+		CCollision::RemoveTrianglePlanes(&model);
+	}
 	if(model.numSpheres > 0){
 		model.spheres = (CColSphere*)RwMalloc(model.numSpheres*sizeof(CColSphere));
 		REGISTER_MEMPTR(&model.spheres);
@@ -360,7 +380,7 @@ CFileLoader::LoadCollisionModel(uint8 *buf, CColModel &model, char *modelname)
 	model.numTriangles = *(int16*)buf;
 	buf += 4;
 	if(model.numTriangles > 0){
-		model.triangles = (CColTriangle*)RwMalloc(model.numTriangles*sizeof(CColTriangle));
+		model.triangles = (CColTriangle*)re3StreamingAlloc(model.numTriangles*sizeof(CColTriangle));
 		REGISTER_MEMPTR(&model.triangles);
 		for(i = 0; i < model.numTriangles; i++){
 			model.triangles[i].Set(*(uint16*)buf, *(uint16*)(buf+2), *(uint16*)(buf+4), buf[6]);
--- a/src/miami/core/Streaming.cpp
+++ b/src/miami/core/Streaming.cpp
@@ -1386,6 +1386,24 @@ bool re3EmergencyRemoveModel() {
 	return usedmem != CStreaming::ms_memoryUsed;
 }

+// Allocate `size` bytes through RwMalloc, removing streamed models on failure.
+// After each failed attempt re3RemoveLeastUsedModel() is tried first and,
+// only if that returns false, re3EmergencyRemoveModel(); the allocation is
+// then retried. Returns nil once neither call reports that it removed anything.
+void* re3StreamingAlloc(size_t size) {
+	for (;;) {
+		auto mem = RwMalloc(size);
+		if (mem != nil) {
+			return mem;
+		}
+		// Short-circuit keeps the call order: the emergency removal runs
+		// only when the least-used removal returned false.
+		if (!re3RemoveLeastUsedModel() && !re3EmergencyRemoveModel()) {
+			return nil;
+		}
+	}
+}
+
 bool
 CStreaming::RemoveLeastUsedModel(uint32 excludeMask)
 {
--- a/src/miami/modelinfo/BaseModelInfo.cpp
+++ b/src/miami/modelinfo/BaseModelInfo.cpp
@@ -40,6 +40,14 @@ CBaseModelInfo::DeleteCollisionModel(void)
 	}
 }

+void CBaseModelInfo::SetColModel(CColModel *col, bool owns) { // replaces m_colModel; 'owns' becomes m_bOwnsColModel
+	if (m_bOwnsColModel && m_colModel != col) { // free the outgoing owned model, never the one being installed
+		delete m_colModel;
+	}
+	m_colModel = col;
+	m_bOwnsColModel = owns;
+}
+
 void
 CBaseModelInfo::AddRef(void)
 {
--- a/src/miami/modelinfo/BaseModelInfo.h
+++ b/src/miami/modelinfo/BaseModelInfo.h
@@ -52,8 +52,7 @@ public:
 	bool IsClump(void) { return m_type == MITYPE_CLUMP || m_type == MITYPE_PED || m_type == MITYPE_VEHICLE;	}
 	char *GetModelName(void) { return m_name; }
 	void SetModelName(const char *name) { strncpy(m_name, name, MAX_MODEL_NAME); }
-	void SetColModel(CColModel *col, bool owns = false){
-		m_colModel = col; m_bOwnsColModel = owns; }
+	void SetColModel(CColModel *col, bool owns = false);
 	CColModel *GetColModel(void) { return m_colModel; }
 	bool DoesOwnColModel(void) { return m_bOwnsColModel; }
 	void DeleteCollisionModel(void);
--- a/src/miami/renderer/ShadowCamera.cpp
+++ b/src/miami/renderer/ShadowCamera.cpp
@@ -271,13 +271,13 @@ CShadowCamera::InvertRaster()
 	RwIm2DVertexSetIntRGBA     (&vx[1], 255, 255, 255, 255);

 	RwIm2DVertexSetScreenX     (&vx[2], crw);
-	RwIm2DVertexSetScreenY     (&vx[2], 0.0f);
+	RwIm2DVertexSetScreenY     (&vx[2], crh);
 	RwIm2DVertexSetScreenZ     (&vx[2], RwIm2DGetNearScreenZ());
 	RwIm2DVertexSetRecipCameraZ(&vx[2], recipZ);
 	RwIm2DVertexSetIntRGBA     (&vx[2], 255, 255, 255, 255);

 	RwIm2DVertexSetScreenX     (&vx[3], crw);
-	RwIm2DVertexSetScreenY     (&vx[3], crh);
+	RwIm2DVertexSetScreenY     (&vx[3], 0.0f);
 	RwIm2DVertexSetScreenZ     (&vx[3], RwIm2DGetNearScreenZ());
 	RwIm2DVertexSetRecipCameraZ(&vx[3], recipZ);
 	RwIm2DVertexSetIntRGBA     (&vx[3], 255, 255, 255, 255);
@@ -289,7 +289,7 @@ CShadowCamera::InvertRaster()
 	RwRenderStateSet(rwRENDERSTATESRCBLEND,          (void *)rwBLENDINVDESTCOLOR);
 	RwRenderStateSet(rwRENDERSTATEDESTBLEND,         (void *)rwBLENDZERO);

-	RwIm2DRenderPrimitive(rwPRIMTYPETRISTRIP, vx, 4);
+	RwIm2DRenderPrimitive(rwPRIMTYPETRIFAN, vx, 4);

 	RwRenderStateSet(rwRENDERSTATEZTESTENABLE,       (void *)TRUE);
 	RwRenderStateSet(rwRENDERSTATESRCBLEND,          (void *)rwBLENDSRCALPHA);
--- a/src/miami/rw/RwHelper.cpp
+++ b/src/miami/rw/RwHelper.cpp
@@ -385,22 +385,22 @@ RwBool Im2DRenderQuad(RwReal x1, RwReal y1, RwReal x2, RwReal y2, RwReal z, RwRe
    RwIm2DVertexSetV(&vx[1], 1.0f + uvOffset, recipCamZ);

 	RwIm2DVertexSetScreenX(&vx[2], x2);
-    RwIm2DVertexSetScreenY(&vx[2], y1);
+    RwIm2DVertexSetScreenY(&vx[2], y2);
    RwIm2DVertexSetScreenZ(&vx[2], z);
    RwIm2DVertexSetIntRGBA(&vx[2], 255, 255, 255, 255);
    RwIm2DVertexSetRecipCameraZ(&vx[2], recipCamZ);
    RwIm2DVertexSetU(&vx[2], 1.0f + uvOffset, recipCamZ);
-    RwIm2DVertexSetV(&vx[2], uvOffset, recipCamZ);
+    RwIm2DVertexSetV(&vx[2], 1.0f + uvOffset, recipCamZ);
 	
    RwIm2DVertexSetScreenX(&vx[3], x2);
-    RwIm2DVertexSetScreenY(&vx[3], y2);
+    RwIm2DVertexSetScreenY(&vx[3], y1);
    RwIm2DVertexSetScreenZ(&vx[3], z);
    RwIm2DVertexSetIntRGBA(&vx[3], 255, 255, 255, 255);
    RwIm2DVertexSetRecipCameraZ(&vx[3], recipCamZ);
    RwIm2DVertexSetU(&vx[3], 1.0f + uvOffset, recipCamZ);
-    RwIm2DVertexSetV(&vx[3], 1.0f + uvOffset, recipCamZ);
+    RwIm2DVertexSetV(&vx[3], uvOffset, recipCamZ);

-    RwIm2DRenderPrimitive(rwPRIMTYPETRISTRIP, vx, 4);
+    RwIm2DRenderPrimitive(rwPRIMTYPETRIFAN, vx, 4);

    return TRUE;
 }
--- a/src/miami/save/PCSave.cpp
+++ b/src/miami/save/PCSave.cpp
@@ -17,6 +17,8 @@

 #include "vmu/vmu.h"

+void* re3StreamingAlloc(size_t size);
+
 const char* _psGetUserFilesFolder();

 C_PcSave PcSaveHelper;
@@ -76,31 +78,37 @@ C_PcSave::SaveSlot(int32 slot)
 bool
 C_PcSave::PcClassSaveRoutine(int32 file, uint8 *data, uint32 size)
 {
-	void* wrkmem = malloc(LZO1X_1_MEM_COMPRESS);
-	uint8* compressed = (uint8*)malloc(size*2);
+	void* wrkmem = re3StreamingAlloc(LZO1X_1_MEM_COMPRESS);
+	assert(wrkmem);
+	uint8* compressed = (uint8*)re3StreamingAlloc(size*2);
+	assert(compressed);
 	lzo_uint compressed_size;
 	int crv = lzo1x_1_compress(data, size, compressed, &compressed_size, wrkmem);
-	free(wrkmem);
+	RwFree(wrkmem);
+
+	if (crv == LZO_E_OK && compressed_size >= size) {
+		crv = LZO_E_NOT_COMPRESSIBLE;
+	}

 	if (crv == LZO_E_OK) {
 		uint32_t compressed_size32 = compressed_size | 0x80000000;
 		bool err = CFileMgr::Write(file, (const char*)&compressed_size32, sizeof(compressed_size32)) != sizeof(compressed_size32);
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
-			free(compressed);
+			RwFree(compressed);
 			nErrorCode = SAVESTATUS_ERR_SAVE_WRITE;
 			strncpy(SaveFileNameJustSaved, ValidSaveName, sizeof(ValidSaveName) - 1);
 			return false;
 		}

 		err = CFileMgr::Write(file, (const char*)compressed, compressed_size) != compressed_size;
-		free(compressed);
+		RwFree(compressed);
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
 			nErrorCode = SAVESTATUS_ERR_SAVE_WRITE;
 			strncpy(SaveFileNameJustSaved, ValidSaveName, sizeof(ValidSaveName) - 1);
 			return false;
 		}
 	} else if (crv == LZO_E_NOT_COMPRESSIBLE) {
-		free(compressed);
+		RwFree(compressed);
 		uint32_t compressed_size32 = size;
 		bool err = CFileMgr::Write(file, (const char*)&compressed_size32, sizeof(compressed_size32)) != sizeof(compressed_size32);
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
@@ -115,7 +123,7 @@ C_PcSave::PcClassSaveRoutine(int32 file, uint8 *data, uint32 size)
 			return false;
 		}
 	} else {
-		free(compressed);
+		RwFree(compressed);
 		return false;
 	}

@@ -153,16 +161,17 @@ uint32_t C_PcSave::PcClassLoadRoutine(int32 file, uint8 *data) {
 		return size;
 	} else {
 		size &= ~0x80000000;
-		uint8* compressed = (uint8*)malloc(size);
+		uint8* compressed = (uint8*)re3StreamingAlloc(size);
+		assert(compressed);
 		err = CFileMgr::Read(file, (const char*)compressed, size) != size;
 		if (err || CFileMgr::GetErrorReadWrite(file)) {
-			free(compressed);
+			RwFree(compressed);
 			return 0;
 		}

 		lzo_uint decompressed_size = 0;
 		auto crv = lzo1x_decompress(compressed, size, data, &decompressed_size, NULL);
-		free(compressed);
+		RwFree(compressed);
 		if (crv != LZO_E_OK) {
 			return 0;
 		}
--- a/src/tools/texconv.cpp
+++ b/src/tools/texconv.cpp
@@ -53,6 +53,9 @@ uint32_t pvr_map32(uint32_t offset32) {return 0;}
 void Hackpresent() { }
 void re3RemoveLeastUsedModel() { assert(false); }
 void re3EmergencyRemoveModel() { assert(false); }
+void* re3StreamingAlloc(size_t sz) { // texconv stub: no eviction retry, forwards straight to RwMalloc
+	return RwMalloc(sz);
+}
 void RwTexDictionaryGtaStreamRead1(rw::Stream*){ assert(false); }
 void RwTexDictionaryGtaStreamRead2(rw::Stream*, rw::TexDictionary*) { assert(false);  }
 void pvr_ta_data(void* data, int size) {
--- a/vendor/emu/emu/window.cpp
+++ b/vendor/emu/emu/window.cpp
@@ -125,6 +125,8 @@ void x11_window_create()
    x11_win = (void*)x11Window;
    x11_vis = (void*)x11Visual->visual;

+    delete x11Visual;
+
    x11_window_set_text("GTA3dc");
 }

--- a/vendor/librw/src/anim.cpp
+++ b/vendor/librw/src/anim.cpp
@@ -221,6 +221,7 @@ AnimInterpolator::setCurrentAnim(Animation *anim)
 {
 	int32 i;
 	AnimInterpolatorInfo *interpInfo = anim->interpInfo;
+	assert(this->currentAnim == nil || this->currentAnim  == anim);
 	this->currentAnim = anim;
 	this->currentTime = 0.0f;
 	int32 maxkf = this->maxInterpKeyFrameSize;
--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@@ -43,6 +43,7 @@ extern const char* currentFile;
 #define logf(...) // printf(__VA_ARGS__)
 bool re3RemoveLeastUsedModel();
 bool re3EmergencyRemoveModel();
+void* re3StreamingAlloc(size_t size);

 // #include "rwdcimpl.h"

@@ -627,13 +628,11 @@ struct alignas(8) UniformObject
 // So we provide default ctors. We lose the POD status but win
 // in perf for std::vector.

-struct mesh_context_t {
-	mesh_context_t() { }
+struct matfx_context_t {
+	matfx_context_t() { }

-	RGBA color;
-	float32 ambient;
-	float32 diffuse;
-	size_t matfxContextOffset;
+	matrix_t mtx;
+	float32 coefficient;

 	uint32_t hdr_cmd;
 	uint32_t hdr_mode1;
@@ -641,11 +640,13 @@ struct mesh_context_t {
 	uint32_t hdr_mode3;
 };

-struct matfx_context_t {
-	matfx_context_t() { }
+struct mesh_context_t {
+	mesh_context_t() { }

-	matrix_t mtx;
-	float32 coefficient;
+	RGBA color;
+	float32 ambient;
+	float32 diffuse;
+	matfx_context_t* matfxContextPointer;

 	uint32_t hdr_cmd;
 	uint32_t hdr_mode1;
@@ -664,17 +665,16 @@ static_assert(sizeof(skin_context_t) == sizeof(Matrix));
 struct atomic_context_t {
 	atomic_context_t() { }

-	size_t meshContextOffset;
-	size_t skinContextOffset;
+	matrix_t mtx;
+	UniformObject uniform;
+	
+	skin_context_t* skinContextPointer;
 	Atomic* atomic;
 	Geometry* geo;
 	Camera* cam;

 	bool global_needsNoClip;
 	bool skinMatrix0Identity;
-
-	matrix_t worldView, mtx;
-	UniformObject uniform;
 };
 /* END Ligting Structs and Defines */

@@ -815,13 +815,283 @@ void beginUpdate(Camera* cam)  {
 }


-std::vector<atomic_context_t> atomicContexts;
-std::vector<mesh_context_t> meshContexts;
-std::vector<skin_context_t> skinContexts;
-std::vector<matfx_context_t> matfxContexts;
-std::vector<std::function<void()>> opCallbacks;
-std::vector<std::function<void()>> blendCallbacks;
-std::vector<std::function<void()>> ptCallbacks;
+// Append-only container built from a list of fixed-size malloc'ed chunks. Elements are never relocated, so pointers to them stay valid until clear().
+template<typename T>
+struct chunked_vector {
+    static constexpr size_t chunk_size = 8192;
+
+	struct chunk;
+
+    struct chunk_header {
+        chunk* prev;
+        chunk* next;
+        size_t used;
+        size_t free;
+    };
+
+    struct chunk {
+        static constexpr size_t item_count = (chunk_size - sizeof(chunk_header)) / sizeof(T);
+        union {
+            struct {
+                chunk_header header;
+                T items[item_count];
+            };
+            uint8_t data[chunk_size];
+        };
+    };
+
+    // Head and tail of the chunk list (the first chunk is heap-allocated by the constructor).
+    chunk* first;
+    chunk* last;
+
+    // Constructor: allocate the first chunk, initialize its header and set pointers.
+    chunked_vector()
+    {
+		first = last = static_cast<chunk*>(malloc(sizeof(chunk)));
+		assert(first && "malloc failed in chunked_vector()");
+		first->header.prev = nullptr;
+		first->header.next = nullptr;
+		first->header.used = 0;
+		first->header.free = chunk::item_count;
+
+        static_assert(sizeof(chunk) == chunk_size, "chunk size mismatch");
+    }
+
+    // Destructor: clear() destroys the elements and frees every extra chunk,
+    // so only the first chunk is left to release here.
+    ~chunked_vector() {
+        clear();
+        free(first);
+    }
+
+    // Non-copyable: a copy would share the chunk list and free it twice.
+    chunked_vector(const chunked_vector&) = delete;
+    chunked_vector& operator=(const chunked_vector&) = delete;
+
+    // Return a reference to the last element. (Precondition: not empty.)
+    T& back() {
+        assert(last->header.used > 0 && "back() called on empty vector");
+        return last->items[last->header.used - 1];
+    }
+
+    // // Random-access: iterate through chunks until the correct index is found.
+    // T& operator[](size_t idx) {
+    //     chunk* curr = first;
+    //     while (curr) {
+    //         if (idx < curr->header.used)
+    //             return curr->items[idx];
+    //         idx -= curr->header.used;
+    //         curr = curr->header.next;
+    //     }
+    //     assert(0 && "Index out of range");
+    //     // Should never reach here.
+    //     return first->items[0];
+    // }
+
+    // Emplace amt default-constructed elements in a contiguous block (within one chunk)
+    // and return a pointer to the first new element. Slots left in the previous chunk are skipped.
+    T* emplace_many(size_t amt) {
+        // Assert that amt is not greater than one chunk's capacity.
+        assert(amt <= chunk::item_count && "emplace_many: amt exceeds a single chunk's capacity");
+
+        // Ensure the current chunk has enough free space.
+        if (last->header.free < amt) {
+            if (last->header.next && last->header.next->header.free >= amt) {
+                last = last->header.next;
+            } else {
+                // Allocate a new chunk.
+                chunk* new_chunk = static_cast<chunk*>(malloc(sizeof(chunk)));
+                assert(new_chunk && "malloc failed in emplace_many");
+                new_chunk->header.prev = last;
+                new_chunk->header.next = nullptr;
+                new_chunk->header.used = 0;
+                new_chunk->header.free = chunk::item_count;
+                last->header.next = new_chunk;
+                last = new_chunk;
+            }
+        }
+        T* start_ptr = &last->items[last->header.used];
+        for (size_t i = 0; i < amt; ++i) {
+            new (&last->items[last->header.used]) T();
+            last->header.used++;
+            last->header.free--;
+        }
+        return start_ptr;
+    }
+
+    // // Return total number of elements across all chunks.
+    // size_t size() const {
+    //     size_t total = 0;
+    //     for (chunk* curr = first; curr; curr = curr->header.next) {
+    //         total += curr->header.used;
+    //     }
+    //     return total;
+    // }
+	bool empty() const {
+		return first->header.used == 0; // the first chunk is always filled first, so this covers all chunks
+	}
+
+    // Clear all elements: call destructors and reset used/free counters.
+    // Note: every chunk except the first is freed, so pointers to elements become invalid.
+    void clear() {
+        for (chunk* curr = first; curr; curr = curr->header.next) {
+            for (size_t i = 0; i < curr->header.used; ++i) {
+                curr->items[i].~T();
+            }
+            curr->header.used = 0;
+            curr->header.free = chunk::item_count;
+        }
+		// Free all chunks except first chunk.
+		chunk* curr = first->header.next;
+		while (curr) {
+			chunk* next = curr->header.next;
+			free(curr);
+			curr = next;
+		}
+		first->header.next = nullptr;
+        // Reset last pointer to first
+        last = first;
+    }
+
+    // Emplace a default-constructed element at the end.
+    void emplace_back() {
+        if (last->header.free == 0) {
+            if (last->header.next) {
+                last = last->header.next;
+            } else {
+                chunk* new_chunk = static_cast<chunk*>(malloc(sizeof(chunk)));
+                assert(new_chunk && "malloc failed in emplace_back");
+                new_chunk->header.prev = last;
+                new_chunk->header.next = nullptr;
+                new_chunk->header.used = 0;
+                new_chunk->header.free = chunk::item_count;
+                last->header.next = new_chunk;
+                last = new_chunk;
+            }
+        }
+        new (&last->items[last->header.used]) T();
+        last->header.used++;
+        last->header.free--;
+    }
+
+    // Emplace an element by moving it into the container.
+    void emplace_back(T&& v) {
+        if (last->header.free == 0) {
+            if (last->header.next) {
+                last = last->header.next;
+            } else {
+                chunk* new_chunk = static_cast<chunk*>(malloc(sizeof(chunk)));
+                assert(new_chunk && "malloc failed in emplace_back(T&&)");
+                new_chunk->header.prev = last;
+                new_chunk->header.next = nullptr;
+                new_chunk->header.used = 0;
+                new_chunk->header.free = chunk::item_count;
+                last->header.next = new_chunk;
+                last = new_chunk;
+            }
+        }
+        new (&last->items[last->header.used]) T(std::forward<T>(v));
+        last->header.used++;
+        last->header.free--;
+    }
+
+    // Iterate over each element and invoke the callback.
+    void forEach(void(*cb)(T&)) {
+        for (chunk* curr = first; curr; curr = curr->header.next) {
+            for (size_t i = 0; i < curr->header.used; ++i) {
+                cb(curr->items[i]);
+            }
+        }
+    }
+};
+
+// Move-only owner of a malloc'ed buffer: frees it on destruction unless ownership was moved away.
+template<typename T>
+struct free_pointer_t {
+	T* ptr; // owned buffer, or nullptr after being moved from
+	free_pointer_t(T* p) : ptr(p) { }
+	free_pointer_t(free_pointer_t&& other) noexcept : ptr(other.ptr) { other.ptr = nullptr; }
+	free_pointer_t(const free_pointer_t&) = delete;
+	free_pointer_t& operator=(const free_pointer_t&) = delete;
+	~free_pointer_t() {
+		free(ptr); // free(nullptr) is a no-op
+	}
+};
+
+chunked_vector<atomic_context_t> atomicContexts; // defaultRenderCB appends one entry per call
+chunked_vector<mesh_context_t> meshContexts;
+chunked_vector<skin_context_t> skinContexts; // filled via emplace_many(skin->numBones) for skinned atomics
+static_assert(chunked_vector<skin_context_t>::chunk::item_count >= 64); // emplace_many() must fit a whole request in one chunk; NOTE(review): 64 is presumably the max bone count - confirm
+chunked_vector<matfx_context_t> matfxContexts; // defaultRenderCB appends one entry per env-mapped mesh
+
+// A basic move-only function wrapper for callables with signature R(Args...)
+template <typename>
+class move_only_function; // primary template not defined
+
+template <typename R, typename... Args>
+class move_only_function<R(Args...)> {
+public:
+    // Default constructor creates an empty callable.
+    move_only_function() noexcept : callable_(nullptr) {}
+
+    // Accept any callable BY VALUE so F is always an object type: an lvalue is copied, never stored as a reference.
+    template <typename F>
+    move_only_function(F f)
+        : callable_(new model<F>(std::move(f))) {}
+
+    // Move constructor.
+    move_only_function(move_only_function&& other) noexcept
+        : callable_(other.callable_) {
+        other.callable_ = nullptr;
+    }
+
+    // Move assignment operator.
+    move_only_function& operator=(move_only_function&& other) noexcept {
+        if (this != &other) {
+            delete callable_;
+            callable_ = other.callable_;
+            other.callable_ = nullptr;
+        }
+        return *this;
+    }
+
+    // Delete copy constructor and copy assignment operator.
+    move_only_function(const move_only_function&) = delete;
+    move_only_function& operator=(const move_only_function&) = delete;
+
+    // Destructor.
+    ~move_only_function() {
+        delete callable_;
+    }
+
+    // Invoke the stored callable. (Precondition: not empty / not moved-from.)
+    R operator()(Args... args) {
+        return callable_->invoke(std::forward<Args>(args)...);
+    }
+
+private:
+    // Base class for type erasure.
+    struct concept_t {
+        virtual ~concept_t() = default;
+        virtual R invoke(Args&&... args) = 0;
+    };
+
+    // Derived template class that stores the actual callable (F is always a non-reference type).
+    template <typename F>
+    struct model : concept_t {
+        F f;
+        explicit model(F&& f) : f(std::move(f)) {}
+        R invoke(Args&&... args) override {
+            return f(std::forward<Args>(args)...);
+        }
+    };
+
+    concept_t* callable_;
+};
+
+chunked_vector<move_only_function<void()>> opCallbacks; // run by endUpdate inside the PVR_LIST_OP_POLY list
+chunked_vector<move_only_function<void()>> blendCallbacks; // run by endUpdate inside the PVR_LIST_TR_POLY list
+chunked_vector<move_only_function<void()>> ptCallbacks; // run by endUpdate inside the PVR_LIST_PT_POLY list

 void dcMotionBlur_v1(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
 	
@@ -1123,27 +1393,27 @@ void endUpdate(Camera* cam) {
 		pvr_dr_init(&drState);
 		pvr_list_begin(PVR_LIST_OP_POLY);
 		enter_oix();
-		if (opCallbacks.size()) {
-			for (auto&& cb: opCallbacks) {
+		if (!opCallbacks.empty()) {
+			opCallbacks.forEach([](auto &cb) {
 				cb();
-			}
+			});
 		}
 		pvr_list_finish();
-		if (ptCallbacks.size()) {
+		if (!ptCallbacks.empty()) {
 			PVR_SET(0x11C, 64); // PT Alpha test value
 			pvr_dr_init(&drState);
 			pvr_list_begin(PVR_LIST_PT_POLY);
-			for (auto&& cb: ptCallbacks) {
+			ptCallbacks.forEach([](auto &cb) {
 				cb();
-			}
+			});
 			pvr_list_finish();
 		}
 		pvr_list_begin(PVR_LIST_TR_POLY);
-		if (blendCallbacks.size()) {
+		if (!blendCallbacks.empty()) {
 			pvr_dr_init(&drState);
-			for (auto&& cb: blendCallbacks) {
+			blendCallbacks.forEach([](auto &cb) {
 				cb();
-			}
+			});
 		}

 		if (vertexOverflown()) {
@@ -1480,22 +1750,6 @@ pvr_ptr_t pvrTexturePointer(Raster *r) {
 void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVertices) {
 	auto *verts = reinterpret_cast<Im2DVertex *>(vertices);

-	auto renderCB = 
-		[=,
-			current_raster = dc::current_raster,
-			blend_enabled  = dc::blendEnabled,
-			src_blend      = dc::srcBlend,
-			dst_blend      = dc::dstBlend,
-			z_function     = dc::zFunction,
-			z_write        = dc::zWrite,
-			cull_mode_pvr  = dc::cullModePvr,
-			addressingU    = dc::addressingU,
-			addressingV    = dc::addressingV,
-            fog_func_pvr   = dc::fogFuncPvr]
-		(const Im2DVertex* vtx) __attribute__((always_inline)) 
-	{
-
-		auto pvrHeaderSubmit = [=]() __attribute__((always_inline)) {
 	pvr_poly_cxt_t cxt;

 	if (current_raster) [[likely]] {
@@ -1511,9 +1765,9 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 		pvr_poly_cxt_col(&cxt, PVR_LIST_TR_POLY);
 	}

-			if (blend_enabled) [[likely]] {
-				cxt.blend.src = src_blend;
-				cxt.blend.dst = dst_blend;
+	if (blendEnabled) [[likely]] {
+		cxt.blend.src = srcBlend;
+		cxt.blend.dst = dstBlend;
 	} else {
 		// non blended sprites are also submitted in TR lists
 		// so we need to reset the blend mode
@@ -1521,14 +1775,35 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 		cxt.blend.dst = PVR_BLEND_ZERO;
 	}

-			cxt.gen.culling      = cull_mode_pvr;
-			cxt.depth.comparison = z_function;
-			cxt.depth.write      = z_write;
+	cxt.gen.culling      = cullModePvr;
+	cxt.depth.comparison = zFunction;
+	cxt.depth.write      = zWrite;

-			cxt.gen.fog_type = fog_func_pvr;
+	cxt.gen.fog_type = fogFuncPvr;

+	pvr_poly_hdr_t hdr;
+	pvr_poly_compile(&hdr, &cxt);
+
+	assert(primType == PRIMTYPETRILIST || primType == PRIMTYPETRIFAN);
+	
+	auto renderCB = 
+		[
+			primType,
+			numVertices,
+			cmd = hdr.cmd,
+			mode1 = hdr.mode1,
+			mode2 = hdr.mode2,
+			mode3 = hdr.mode3
+		]
+		(const Im2DVertex* vtx) __attribute__((always_inline)) 
+	{
+
+		auto pvrHeaderSubmit = [=]() __attribute__((always_inline)) {
 			auto* hdr = reinterpret_cast<pvr_poly_hdr_t *>(pvr_dr_target(drState));
-			pvr_poly_compile(hdr, &cxt);
+			hdr->cmd = cmd;
+			hdr->mode1 = mode1;
+			hdr->mode2 = mode2;
+			hdr->mode3 = mode3;
 			pvr_dr_commit(hdr);
 		};

@@ -1584,26 +1859,130 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 		}
 	};

-	std::vector<Im2DVertex> vertData(verts, verts + numVertices);
-	blendCallbacks.emplace_back([=, data = std::move(vertData)]() {
-		renderCB(&data[0]);
+	Im2DVertex* vertData = (Im2DVertex*)malloc(numVertices * sizeof(Im2DVertex));
+	assert(vertData);
+	memcpy(vertData, verts, numVertices * sizeof(Im2DVertex));
+	blendCallbacks.emplace_back([renderCB, vertData=free_pointer_t{vertData}]() {
+		renderCB(vertData.ptr);
 	});
 }

 void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 numVertices, void *indices, int32 numIndices) {
 	auto idx = (unsigned short*)indices;
-	auto vtx = (Im2DVertex*)vertices;
+	auto verts = (Im2DVertex*)vertices;

-    std::vector<Im2DVertex> vertData(numIndices);
+	pvr_poly_cxt_t cxt; // polygon context built from the current render state; compiled into hdr below

-	for (int32 i = 0; i < numIndices; i++) {
-		vertData[i] = vtx[idx[i]];
+	if (current_raster) [[likely]] {
+		pvr_poly_cxt_txr(&cxt, 
+						PVR_LIST_TR_POLY, 
+						pvrFormatForRaster(current_raster), 
+						current_raster->width, 
+						current_raster->height,
+						pvrTexturePointer(current_raster), 
+						PVR_FILTER_BILINEAR);
+		pvrTexAddress(&cxt, addressingU, addressingV);
+	} else { 
+		pvr_poly_cxt_col(&cxt, PVR_LIST_TR_POLY);
 	}

-	im2DRenderPrimitive(primType, &vertData[0], vertData.size());
+	if (blendEnabled) [[likely]] {
+		cxt.blend.src = srcBlend;
+		cxt.blend.dst = dstBlend;
+	} else {
+		// non blended sprites are also submitted in TR lists
+		// so we need to reset the blend mode
+		cxt.blend.src = PVR_BLEND_ONE;
+		cxt.blend.dst = PVR_BLEND_ZERO;
 	}

-static std::vector<Im3DVertex> im3dVertices; 
+	cxt.gen.culling      = cullModePvr;
+	cxt.depth.comparison = zFunction;
+	cxt.depth.write      = zWrite;
+
+	cxt.gen.fog_type = fogFuncPvr;
+
+	pvr_poly_hdr_t hdr;
+	pvr_poly_compile(&hdr, &cxt);
+
+	assert(primType == PRIMTYPETRILIST);
+	// The closure captures only the compiled header words and the counts; it is run later from blendCallbacks.
+	auto renderCB = 
+		[
+			primType,
+			numIndices,
+			cmd = hdr.cmd,
+			mode1 = hdr.mode1,
+			mode2 = hdr.mode2,
+			mode3 = hdr.mode3
+		]
+		(const Im2DVertex* vtx, const uint16_t* idx) __attribute__((always_inline)) 
+	{
+
+		auto pvrHeaderSubmit = [=]() __attribute__((always_inline)) {
+			auto* hdr = reinterpret_cast<pvr_poly_hdr_t *>(pvr_dr_target(drState));
+			hdr->cmd = cmd;
+			hdr->mode1 = mode1;
+			hdr->mode2 = mode2;
+			hdr->mode3 = mode3;
+			pvr_dr_commit(hdr);
+		};
+
+		auto pvrVertexSubmit = [](const Im2DVertex &gtaVert, unsigned flags) 
+		__attribute__((always_inline)) 
+		{
+			auto *pvrVert  = pvr_dr_target(drState); 
+			pvrVert->flags = flags;
+			pvrVert->x 	   = gtaVert.x * VIDEO_MODE_SCALE_X;
+			pvrVert->y	   = gtaVert.y;
+			pvrVert->z 	   = MATH_Fast_Invert(gtaVert.w); // this is perfect for almost every case...
+			pvrVert->u 	   = gtaVert.u;
+			pvrVert->v 	   = gtaVert.v;
+			pvrVert->argb  = (gtaVert.a << 24) |
+							 (gtaVert.r << 16) |
+							 (gtaVert.g <<  8) |
+							 (gtaVert.b <<  0);
+			pvr_dr_commit(pvrVert);
+		};
+
+		switch(primType) {
+			case PRIMTYPETRILIST:
+				pvrHeaderSubmit();
+				dcache_pref_block(vtx);
+				for(int i = 0; i < numIndices; i += 3) [[likely]] {
+					dcache_pref_block(&vtx[idx[i + 1]]);
+					pvrVertexSubmit(vtx[idx[i + 0]], PVR_CMD_VERTEX);
+					dcache_pref_block(&vtx[idx[i + 2]]);
+					pvrVertexSubmit(vtx[idx[i + 1]], PVR_CMD_VERTEX);
+					if (i + 3 < numIndices) dcache_pref_block(&vtx[idx[i + 3]]); // idx[] holds numIndices entries: do not read past its end on the last triangle
+					pvrVertexSubmit(vtx[idx[i + 2]], PVR_CMD_VERTEX_EOL);
+				}
+			break;
+		default:
+			UNIMPL_LOGV("primType: %d, vertices: %p, numVertices: %d", primType, vertices, numVertices);
+		}
+	};
+	// The callback runs after this function returns, so it is given private copies of the vertices and indices.
+	Im2DVertex* vertData = (Im2DVertex*)malloc(numVertices * sizeof(Im2DVertex));
+	assert(vertData);
+	memcpy(vertData, verts, numVertices * sizeof(Im2DVertex));
+	uint16_t* idxData = (uint16_t*)malloc(numIndices * sizeof(uint16_t));
+	assert(idxData);
+	memcpy(idxData, idx, numIndices * sizeof(uint16_t));
+	blendCallbacks.emplace_back([renderCB, vertData=free_pointer_t(vertData), idxData=free_pointer_t(idxData)]() {
+		renderCB(vertData.ptr, idxData.ptr);
+	});
+
+    // std::vector<Im2DVertex> vertData(numIndices);
+
+	// for (int32 i = 0; i < numIndices; i++) {
+	// 	vertData[i] = vtx[idx[i]];
+	// }
+
+	// im2DRenderPrimitive(primType, &vertData[0], vertData.size());
+}
+
+static Im3DVertex* im3dVertices; 
 void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 flags) {
    // UNIMPL_LOGV("start %d", numVertices);
    if(worldMat == nil){
@@ -1621,7 +2000,12 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f
 	rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW);
 	// mat_load(&DCE_MAT_SCREENVIEW);     // ~11 cycles.
 	mat_load(( matrix_t*)&mtx.right);  // Number of cycles: ~32.
-    im3dVertices.resize(numVertices);
+    if (im3dVertices) {
+		free(im3dVertices);
+	}
+
+	im3dVertices = (Im3DVertex*)malloc(numVertices * sizeof(Im3DVertex));
+	assert(im3dVertices);

    auto vtx = (Im3DVertex*)vertices;

@@ -1649,22 +2033,9 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,
 								void         *indices, 
 								int32_t       numIndices) 
 {
-	auto renderCB = 
-		[=,
-		 current_raster = dc::current_raster,
-		 cull_mode_pvr  = dc::cullModePvr,
-		 src_blend      = dc::srcBlend,
-		 dst_blend      = dc::dstBlend,
-		 blend_enabled  = dc::blendEnabled,
-		 z_function     = dc::zFunction,
-		 z_write        = dc::zWrite,
-		 addressingU    = dc::addressingU,
-		 addressingV    = dc::addressingV,
-		 fog_func_pvr   = dc::fogFuncPvr]
-		 (const void* indices, const Im3DVertex *im3dVertices) __attribute__((always_inline)) 
-		
-	{
-		auto pvrHeaderSubmit = [=]() __attribute__((always_inline)) {
+	if (primType == PRIMTYPELINELIST || primType == PRIMTYPEPOLYLINE) {
+		return;
+	}
 	pvr_poly_cxt_t cxt;

 	if (current_raster) [[likely]] {
@@ -1678,20 +2049,40 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,
 		pvrTexAddress(&cxt, addressingU, addressingV);
 	} else pvr_poly_cxt_col(&cxt, blendEnabled? PVR_LIST_TR_POLY : PVR_LIST_OP_POLY);		

-			if (blend_enabled) [[likely]] {
-				cxt.blend.src = src_blend;
-				cxt.blend.dst = dst_blend;
+	if (blendEnabled) [[likely]] {
+		cxt.blend.src = srcBlend;
+		cxt.blend.dst = dstBlend;
 	}

-			cxt.gen.culling      = cull_mode_pvr;
-			cxt.depth.comparison = z_function;
-			cxt.depth.write      = z_write;
+	cxt.gen.culling      = cullModePvr;
+	cxt.depth.comparison = zFunction;
+	cxt.depth.write      = zWrite;


-			cxt.gen.fog_type = fog_func_pvr;
+	cxt.gen.fog_type = fogFuncPvr;

+	pvr_poly_hdr_t hdr;
+	pvr_poly_compile(&hdr, &cxt);
+
+	assert(primType == PRIMTYPETRILIST);
+
+	auto renderCB = 
+		[
+			numIndices,
+			cmd = hdr.cmd,
+			mode1 = hdr.mode1,
+			mode2 = hdr.mode2,
+			mode3 = hdr.mode3
+		]
+		 (const void* indices, const Im3DVertex *im3dVertices) __attribute__((always_inline)) 
+		
+	{
+		auto pvrHeaderSubmit = [=]() __attribute__((always_inline)) {
 			auto* hdr = reinterpret_cast<pvr_poly_hdr_t *>(pvr_dr_target(drState));
-			pvr_poly_compile(hdr, &cxt);
+			hdr->cmd = cmd;
+			hdr->mode1 = mode1;
+			hdr->mode2 = mode2;
+			hdr->mode3 = mode3;
 			pvr_dr_commit(hdr);
 		};

@@ -1740,7 +2131,6 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,
 			DCE_RenderSubmitVertex(&pvrVert, flags);
 		};

-		if(primType == PRIMTYPETRILIST) [[likely]] {
 		const auto *idx = reinterpret_cast<const uint16 *>(indices);
 		
 		pvrHeaderSubmit();
@@ -1813,25 +2203,23 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,
 					break;
 			}
 		}
-		} 
-		else UNIMPL_LOGV("primType: %d", primType);
 	};

+	assert(im3dVertices);
+	auto vtxData = im3dVertices;
+	im3dVertices = nullptr;
+
+	auto *idxData = (uint16_t*)malloc(numIndices * sizeof(uint16_t));
+	assert(idxData);
+	memcpy(idxData, indices, numIndices * sizeof(uint16_t));
+
 	if (blendEnabled) {
-		auto *idx = reinterpret_cast<uint16_t *>(indices);
-		std::vector<uint16_t> indexBuffer(idx, idx + numIndices);
-		blendCallbacks.emplace_back([=, 
-								 data = std::move(indexBuffer), 
-								 vtxData = im3dVertices](){
-				renderCB(&data[0], &vtxData[0]);
+		blendCallbacks.emplace_back([renderCB, idxData = free_pointer_t(idxData), vtxData = free_pointer_t(vtxData)](){
+				renderCB(idxData.ptr, vtxData.ptr);
 		});
 	} else {
-		auto *idx = reinterpret_cast<uint16_t *>(indices);
-		std::vector<uint16_t> indexBuffer(idx, idx + numIndices);
-		opCallbacks.emplace_back([=, 
-								 data = std::move(indexBuffer), 
-								 vtxData = im3dVertices](){
-				renderCB(&data[0], &vtxData[0]);
+		opCallbacks.emplace_back([renderCB, idxData = free_pointer_t(idxData), vtxData = free_pointer_t(vtxData)](){
+			renderCB(idxData.ptr, vtxData.ptr);
 		});
 	}

@@ -1839,7 +2227,10 @@ void im3DRenderIndexedPrimitive(PrimitiveType primType,

 void im3DEnd(void) {
    // UNIMPL_LOG();
-    im3dVertices.resize(0);
+    // Release vertices from im3DTransform that no draw call took ownership of.
+    // free(nullptr) is a no-op, so the pointer needs no null check first.
+	free(im3dVertices);
+	im3dVertices = nullptr;
 }

 template<typename Vin, typename Vout>
@@ -3563,18 +3954,17 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 	int32 numMeshes = geo->meshHeader->numMeshes;

-	size_t skinContextOffset = skinContexts.size();
+	skin_context_t* skinContextPointer = nullptr;
 	bool skinMatrix0Identity = false;
 	if (skin) {
-		skinContexts.resize(skinContextOffset + skin->numBones);
-		skinMatrix0Identity = uploadSkinMatrices(atomic, &(skinContexts.data() + skinContextOffset)->mtx);
+		skinContextPointer = skinContexts.emplace_many(skin->numBones);
+		skinMatrix0Identity = uploadSkinMatrices(atomic, &skinContextPointer->mtx);
 	}

 	atomicContexts.emplace_back();
 	auto ac = &atomicContexts.back();

-	ac->meshContextOffset = meshContexts.size();
-	ac->skinContextOffset = skinContextOffset;
+	ac->skinContextPointer = skinContextPointer;
 	ac->atomic = atomic;
 	ac->geo = geo;
 	ac->cam = cam;
@@ -3589,18 +3979,11 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 	rw::convMatrix(&world, atomic->getFrame()->getLTM());
 	

-	mat_load((matrix_t*)&cam->devView);
-	mat_apply((matrix_t*)&world);
-	mat_store((matrix_t*)&atomicContexts.back().worldView);
-
 	mat_load((matrix_t*)&cam->devProjScreen);
-	mat_apply((matrix_t*)&atomicContexts.back().worldView);
+	mat_apply((matrix_t*)&cam->devView);
+	mat_apply((matrix_t*)&world);
 	mat_store((matrix_t*)&atomicContexts.back().mtx);

-	int16_t contextId = atomicContexts.size() - 1;
-
-	assert(numMeshes <= 32767);
-	assert(atomicContexts.size() <= 32767);
 	auto meshes = geo->meshHeader->getMeshes();

 	for (int16_t n = 0; n < numMeshes; n++) {
@@ -3614,17 +3997,16 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 		MatFX *matfx = MatFX::get(meshes[n].material);

-		bool isMatFX = false;
-		float matfxCoefficient = 0.0f;
-		size_t matfxContextOffset = matfxContexts.size();
+		matfx_context_t* matfxContextPointer = nullptr;
+
 		if (doEnvironmentMaps && matfx && matfx->type == MatFX::ENVMAP && matfx->fx[0].env.tex != nil && matfx->fx[0].env.coefficient != 0.0f) {
-			isMatFX = true;
-			matfxCoefficient = matfx->fx[0].env.coefficient;
-			matfxContexts.resize(matfxContexts.size() + 1);
+			float matfxCoefficient = matfx->fx[0].env.coefficient;
+			matfxContexts.emplace_back();
+			matfxContextPointer = &matfxContexts.back();
 			// N.B. world here gets converted to a 3x3 matrix
 			// 		this is fine, as we only use it for env mapping from now on
 			uploadEnvMatrix(matfx->fx[0].env.frame, &world, &matfxContexts.back().mtx);
-			matfxContexts.back().coefficient = matfxCoefficient;
+			matfxContextPointer->coefficient = matfxCoefficient;
 			
 			pvr_poly_cxt_t cxt;

@@ -3647,15 +4029,15 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {

 			pvr_poly_hdr_t hdr;
 			pvr_poly_compile(&hdr, &cxt);
-			matfxContexts.back().hdr_cmd = hdr.cmd;
-			matfxContexts.back().hdr_mode1 = hdr.mode1;
-			matfxContexts.back().hdr_mode2 = hdr.mode2;
-			matfxContexts.back().hdr_mode3 = hdr.mode3;
+			matfxContextPointer->hdr_cmd = hdr.cmd;
+			matfxContextPointer->hdr_mode1 = hdr.mode1;
+			matfxContextPointer->hdr_mode2 = hdr.mode2;
+			matfxContextPointer->hdr_mode3 = hdr.mode3;
 		}

 		pvr_poly_cxt_t cxt;
 		int pvrList;
-		if (doBlend || isMatFX) {
+		if (doBlend || matfxContextPointer) {
 			if (doAlphaTest && !doBlendMaterial) {
 				pvrList = PVR_LIST_PT_POLY;
 			} else {
@@ -3685,8 +4067,8 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 				PVR_UVFMT_16BIT,

 				PVR_CLRFMT_4FLOATS,
-				isMatFX ? PVR_BLEND_SRCALPHA : doBlend ? srcBlend : PVR_BLEND_ONE,
-				isMatFX ? PVR_BLEND_INVSRCALPHA : doBlend ? dstBlend : PVR_BLEND_ZERO,
+				matfxContextPointer ? PVR_BLEND_SRCALPHA : doBlend ? srcBlend : PVR_BLEND_ONE,
+				matfxContextPointer ? PVR_BLEND_INVSRCALPHA : doBlend ? dstBlend : PVR_BLEND_ZERO,
 				zFunction,
 				zWrite,
 				cullModePvr,
@@ -3698,8 +4080,8 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 				pvrList,

 				PVR_CLRFMT_4FLOATS,
-				isMatFX ? PVR_BLEND_SRCALPHA : doBlend ? srcBlend : PVR_BLEND_ONE,
-				isMatFX ? PVR_BLEND_INVSRCALPHA : doBlend ? dstBlend : PVR_BLEND_ZERO,
+				matfxContextPointer ? PVR_BLEND_SRCALPHA : doBlend ? srcBlend : PVR_BLEND_ONE,
+				matfxContextPointer ? PVR_BLEND_INVSRCALPHA : doBlend ? dstBlend : PVR_BLEND_ZERO,
 				zFunction,
 				zWrite,
 				cullModePvr,
@@ -3713,7 +4095,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 		mc->color = meshes[n].material->color;
 		mc->ambient = meshes[n].material->surfaceProps.ambient;
 		mc->diffuse = meshes[n].material->surfaceProps.diffuse;
-		mc->matfxContextOffset = isMatFX ? matfxContextOffset : SIZE_MAX;
+		mc->matfxContextPointer = matfxContextPointer;

 		mc->hdr_cmd = hdr.cmd;
 		mc->hdr_mode1 = hdr.mode1;
@@ -3721,20 +4103,17 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 		mc->hdr_mode3 = hdr.mode3;

 		// clipping performed per meshlet
-		auto renderCB = [contextId, n] {
+		auto renderCB = [acp = (const atomic_context_t*) ac , meshContext = (const mesh_context_t*) mc, n] () {
 			if (vertexBufferFree() < freeVertexTarget) {
 				return;
 			}
-			const atomic_context_t* acp = &atomicContexts[contextId];
 			auto geo = acp->geo;
 			auto mesh = geo->meshHeader->getMeshes() + n;
 			const auto& global_needsNoClip = acp->global_needsNoClip;
 			const auto& uniformObject = acp->uniform;
 			const auto& mtx = acp->mtx;
-			const auto& worldView = acp->worldView;
 			const auto& atomic = acp->atomic;
 			const auto& cam = acp->cam;
-			const auto meshContext = &meshContexts[acp->meshContextOffset + n];
 			Skin* skin = Skin::get(geo);

 			bool textured = geo->numTexCoordSets && mesh->material->texture;
@@ -3799,7 +4178,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 						}
 					}

-					if (meshContext->matfxContextOffset != SIZE_MAX) {
+					if (meshContext->matfxContextPointer) {
 						auto* hdr = reinterpret_cast<pvr_poly_hdr_t *>(pvr_dr_target(drState));
 						hdr->cmd = meshContext->hdr_cmd;
 						hdr->mode1 = meshContext->hdr_mode1;
@@ -3840,7 +4219,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 						
 						bool small_xyz = selector & 8;
 						unsigned skinSelector = small_xyz + acp->skinMatrix0Identity*2;
-						tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset],  normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &(skinContexts.data() + acp->skinContextOffset)->mtx);
+						tnlMeshletSkinVerticesSelector[skinSelector](OCR_SPACE, normalDst, &dcModel->data[meshlet->vertexOffset],  normalSrc, &dcModel->data[meshlet->skinWeightOffset], &dcModel->data[meshlet->skinIndexOffset], meshlet->vertexCount, meshlet->vertexSize, &acp->skinContextPointer->mtx);
 						
 						mat_load(&mtx);
 						tnlMeshletTransformSelector[clippingRequired * 2](OCR_SPACE, OCR_SPACE + 4, meshlet->vertexCount, 64);
@@ -3927,9 +4306,9 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 						clipAndsubmitMeshletSelector[textured](OCR_SPACE, indexData, meshlet->indexCount);
 					}

-					if (meshContext->matfxContextOffset != SIZE_MAX) {
+					if (meshContext->matfxContextPointer) {
 						assert(!skin);
-						auto matfxContext = &matfxContexts[meshContext->matfxContextOffset];
+						auto matfxContext = meshContext->matfxContextPointer;

 						auto* hdr = reinterpret_cast<pvr_poly_hdr_t *>(pvr_dr_target(drState));
 						hdr->cmd = matfxContext->hdr_cmd;
@@ -4020,7 +4399,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 			}
 		};

-		if (doBlend || isMatFX) {
+		if (doBlend || matfxContextPointer) {
 			if (doAlphaTest && !doBlendMaterial) {
 				ptCallbacks.emplace_back(std::move(renderCB));
 			} else {
@@ -4744,6 +5123,14 @@ driverOpen(void *o, int32, int32)
 	}
 	#endif

+	#if !defined(DC_TEXCONV)
+	dbglog(DBG_CRITICAL, "atomicContexts: %d per %d allocation\n", decltype(atomicContexts)::chunk::item_count, decltype(atomicContexts)::chunk_size);
+	dbglog(DBG_CRITICAL, "skinContexts: %d per %d allocation\n", decltype(skinContexts)::chunk::item_count, decltype(atomicContexts)::chunk_size);
+	dbglog(DBG_CRITICAL, "matfxContexts: %d per %d allocation\n", decltype(matfxContexts)::chunk::item_count, decltype(atomicContexts)::chunk_size);
+	dbglog(DBG_CRITICAL, "opCallbacks: %d per %d allocation\n", decltype(opCallbacks)::chunk::item_count, decltype(atomicContexts)::chunk_size);
+	dbglog(DBG_CRITICAL, "blendCallbacks: %d per %d allocation\n", decltype(blendCallbacks)::chunk::item_count, decltype(atomicContexts)::chunk_size);
+	dbglog(DBG_CRITICAL, "ptCallbacks: %d per %d allocation\n", decltype(ptCallbacks)::chunk::item_count, decltype(atomicContexts)::chunk_size);
+	#endif

    pvr_init(&pvr_params);

@@ -4782,6 +5169,8 @@ driverClose(void *o, int32, int32)

 	pvr_shutdown();

+	engine->driver[PLATFORM_DC]->defaultPipeline->destroy();
+	engine->driver[PLATFORM_DC]->defaultPipeline = nil;
 	return o;
 }

@@ -4837,6 +5226,11 @@ readNativeTexture(Stream *stream)
 	
 	auto cached = cachedRasters.find(pvr_id);

+	assert(natras->raster != nil);
+	assert(natras->raster->texaddr == nil);
+	assert(natras->raster->refs == 1);
+	free(natras->raster);
+
 	if (pvr_id != 0 && cached != cachedRasters.end()) {
 		cached->second->refs++;
 		natras->raster = cached->second;
@@ -4985,7 +5379,7 @@ readNativeData(Stream *stream, int32 length, void *object, int32, int32)
 		return nil;
 	}

-	DCModelDataHeader *header = (DCModelDataHeader *)rwNew(sizeof(DCModelDataHeader) + chunkLen - 8, MEMDUR_EVENT | ID_GEOMETRY);
+	DCModelDataHeader *header = (DCModelDataHeader *)re3StreamingAlloc(sizeof(DCModelDataHeader) + chunkLen - 8 /*, MEMDUR_EVENT | ID_GEOMETRY*/);
 	geo->instData = header;
 	stream->read32(&header->platform, 4);
 	uint32_t version;