From 2361fcd88298feeb114ee708140b7a6401ae62a6 Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Sun, 27 Apr 2025 14:40:30 -0500
Subject: [PATCH 1/6] Collision + Vec optimizations (liberty only).

---
 src/liberty/core/World.cpp        | 28 +++++++++++++--
 src/liberty/core/ZoneCull.cpp     | 57 +++++++++++++++++++++++--------
 src/liberty/math/Vector.cpp       |  3 +-
 vendor/librw/src/dc/rwdc_common.h | 40 +++++++++++++++++++++-
 4 files changed, 107 insertions(+), 21 deletions(-)

diff --git a/src/liberty/core/World.cpp b/src/liberty/core/World.cpp
index 1c34a633..17a3532f 100644
--- a/src/liberty/core/World.cpp
+++ b/src/liberty/core/World.cpp
@@ -1215,6 +1215,9 @@ CWorld::FindObjectsIntersectingAngledCollisionBox(const CColBox &boundingBox, co
 	const int32 nStartY = Max(GetSectorIndexY(fStartY), 0);
 	const int32 nEndX = Min(GetSectorIndexX(fEndX), NUMSECTORS_X - 1);
 	const int32 nEndY = Min(GetSectorIndexY(fEndY), NUMSECTORS_Y - 1);
+#ifdef DC_SH4
+    mat_load_transpose(matrix);
+#endif
 	for(int32 y = nStartY; y <= nEndY; y++) {
 		for(int32 x = nStartX; x <= nEndX; x++) {
 			CSector *pSector = GetSector(x, y);
@@ -1268,14 +1271,19 @@ CWorld::FindObjectsIntersectingAngledCollisionBoxSectorList(CPtrList &list, cons
                                                             int16 *nEntitiesFound, int16 maxEntitiesToFind,
                                                             CEntity **aEntities)
 {
-	for(CPtrNode *pNode = list.first; pNode; pNode = pNode->next) {
+    for(CPtrNode *pNode = list.first; pNode; pNode = pNode->next) {
 		CEntity *pEntity = (CEntity *)pNode->item;
 		if(pEntity->m_scanCode != GetCurrentScanCode()) {
 			pEntity->m_scanCode = GetCurrentScanCode();
 			CColSphere sphere;
 			CVector vecDistance = pEntity->GetPosition() - position;
 			sphere.radius = pEntity->GetBoundRadius();
+#ifndef DC_SH4
 			sphere.center = Multiply3x3(vecDistance, matrix);
+#else   // Transposed matrix was already loaded by the callee, so no need to reload!
+            mat_trans_normal3_nomod(vecDistance.x, vecDistance.y, vecDistance.z,
+                                    sphere.center.x, sphere.center.y, sphere.center.z);
+#endif
 			if(CCollision::TestSphereBox(sphere, boundingBox) && *nEntitiesFound < maxEntitiesToFind) {
 				if(aEntities) aEntities[*nEntitiesFound] = pEntity;
 				++*nEntitiesFound;
@@ -1444,13 +1452,27 @@ CWorld::CallOffChaseForAreaSectorListVehicles(CPtrList &list, float x1, float y1
 				pVehicle->AutoPilot.m_nTimeTempAction = CTimer::GetTimeInMilliseconds() + 2000;
 				CColModel *pColModel = pVehicle->GetColModel();
 				bool bInsideSphere = false;
+#ifdef DC_SH4
+                mat_load2(pVehicle->GetMatrix());
+#endif
 				for(int32 i = 0; i < pColModel->numSpheres; i++) {
+#ifndef DC_SH4
 					CVector pos = pVehicle->GetMatrix() * pColModel->spheres[i].center;
-					float fRadius = pColModel->spheres[i].radius;
+#else
+                    CVector pos;
+                    auto &center = pColModel->spheres[i].center;
+                    mat_trans_single3_nodiv_nomod(center.x, center.y, center.z,
+                                                  pos.x, pos.y, pos.z);
+#endif
+                    float fRadius = pColModel->spheres[i].radius;
 					if(pos.x + fRadius > x1 && pos.x - fRadius < x2 && pos.y + fRadius > y1 &&
-					   pos.y - fRadius < y2)
+					    pos.y - fRadius < y2) {
 						bInsideSphere = true;
 					// Maybe break the loop when bInsideSphere is set to true?
+#ifdef DC_SH4       // Don't see why not!
+                        break;
+#endif
+                    }
 				}
 				if(bInsideSphere) {
 					if(pVehicle->GetPosition().x <= (x1 + x2) * 0.5f)
diff --git a/src/liberty/core/ZoneCull.cpp b/src/liberty/core/ZoneCull.cpp
index 5a76e5ed..44e3d4b7 100644
--- a/src/liberty/core/ZoneCull.cpp
+++ b/src/liberty/core/ZoneCull.cpp
@@ -949,11 +949,15 @@ CCullZone::FindTestPoints()
 	if(ElementsY > 32) ElementsY = 32;
 	if(ElementsZ > 32) ElementsZ = 32;
 	Memsize = ElementsX * ElementsY * ElementsZ;
-	StepX = (maxx-minx)/(ElementsX-1);
-	StepY = (maxy-miny)/(ElementsY-1);
-	StepZ = (maxz-minz)/(ElementsZ-1);
+	StepX = Div<true, false>(maxx-minx, ElementsX-1);
+	StepY = Div<true, false>(maxy-miny, ElementsY-1);
+	StepZ = Div<true, false>(maxz-minz, ElementsZ-1);
 
+#ifndef DC_SH4
 	pMem = new uint8[Memsize];
+#else
+    pMem = reinterpret_cast<uint8 *>(alloca(Memsize));
+#endif
 	memset(pMem, 0, Memsize);
 
 	// indices of center
@@ -1496,13 +1500,28 @@ CCullZone::TestEntityVisibilityFromCullZone(CEntity *entity, float extraDist, CE
 	else
 		boundMaxZ += extraDist;
 
+#ifndef DC_SH4
 	CVector vecMin = entity->GetMatrix() * CVector(boundMinX, boundMinY, boundMinZ);
 	CVector vecMaxX = entity->GetMatrix() * CVector(boundMaxX, boundMinY, boundMinZ);
 	CVector vecMaxY = entity->GetMatrix() * CVector(boundMinX, boundMaxY, boundMinZ);
 	CVector vecMaxZ = entity->GetMatrix() * CVector(boundMinX, boundMinY, boundMaxZ);
-	CVector dirx = vecMaxX - vecMin;
-	CVector diry = vecMaxY - vecMin;
-	CVector dirz = vecMaxZ - vecMin;
+#else
+    mat_load2(entity->GetMatrix());
+
+    CVector vecMin, vecMaxX, vecMaxY, vecMaxZ;
+    mat_trans_single3_nodiv_nomod(boundMinX, boundMinY, boundMinZ,
+                                  vecMin.x, vecMin.y, vecMin.z);
+    mat_trans_single3_nodiv_nomod(boundMaxX, boundMinY, boundMinZ,
+                                  vecMaxX.x, vecMaxX.y, vecMaxX.z);
+    mat_trans_single3_nodiv_nomod(boundMinX, boundMaxY, boundMinZ,
+                                  vecMaxY.x, vecMaxY.y, vecMaxY.z);
+    mat_trans_single3_nodiv_nomod(boundMinX, boundMinY, boundMaxZ,
+                                  vecMaxZ.x, vecMaxZ.y, vecMaxZ.z);
+#endif
+
+    CVector dirx = vecMaxX - vecMin;
+    CVector diry = vecMaxY - vecMin;
+    CVector dirz = vecMaxZ - vecMin;
 
 	// If building intersects zone at all, it's visible
 	int x, y, z;
@@ -1520,22 +1539,30 @@ CCullZone::TestEntityVisibilityFromCullZone(CEntity *entity, float extraDist, CE
 
 	float distToZone = CalcDistToCullZone(entity->GetPosition().x, entity->GetPosition().y)/15.0f;
 	distToZone = Max(distToZone, 7.0f);
-	int numX = (boundMaxX - boundMinX)/distToZone + 2.0f;
-	int numY = (boundMaxY - boundMinY)/distToZone + 2.0f;
-	int numZ = (boundMaxZ - boundMinZ)/distToZone + 2.0f;
+    float invDistToZone = Invert<true, false>(distToZone);
+    int numX = (boundMaxX - boundMinX)*invDistToZone + 2.0f;
+	int numY = (boundMaxY - boundMinY)*invDistToZone + 2.0f;
+	int numZ = (boundMaxZ - boundMinZ)*invDistToZone + 2.0f;
 
-	float stepX = 1.0f/(numX-1);
-	float stepY = 1.0f/(numY-1);
-	float stepZ = 1.0f/(numZ-1);
+	float stepX = Invert<true, false>(numX-1);
+	float stepY = Invert<true, false>(numY-1);
+	float stepZ = Invert<true, false>(numZ-1);
 	float midX = (boundMaxX + boundMinX)/2.0f;
 	float midY = (boundMaxY + boundMinY)/2.0f;
 	float midZ = (boundMaxZ + boundMinZ)/2.0f;
 
 	// check both xy planes
-	for(int i = 0; i < NumTestPoints; i++){
+#ifndef DC_SH4
+    CVector mid = entity->GetMatrix() * CVector(midX, midY, midZ);
+#else
+    CVector mid;
+    mat_trans_single3_nodiv_nomod(midX, midY, midZ,
+                                  mid.x, mid.y, mid.z);
+#endif
+    mid.z += 0.1f;
+    for(int i = 0; i < NumTestPoints; i++){
 		CVector testPoint = aTestPoints[i];
-		CVector mid = entity->GetMatrix() * CVector(midX, midY, midZ);
-		mid.z += 0.1f;
+
 		if(DoThoroughLineTest(testPoint, mid, entity))
 			return true;
 
diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp
index 4f5a21ed..797b22d0 100644
--- a/src/liberty/math/Vector.cpp
+++ b/src/liberty/math/Vector.cpp
@@ -50,8 +50,7 @@ Multiply3x3(const CVector &vec, const CMatrix &mat)
 	               mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z);
 #else
 	CVector out;
-	dc::mat_load2(mat);
-	mat_transpose();
+	dc::mat_load_transpose(mat);
 	mat_trans_normal3_nomod(vec.x, vec.y, vec.z,
 	                        out.x, out.y, out.z);
 	return out;
diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h
index 9c8f673a..dc532776 100644
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -185,7 +185,7 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
         x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
     } while(false)
 
-inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) {
+inline __hot __icache_aligned void mat_load2(const matrix_t *mtx) {
     asm volatile(
         R"(
             fschg
@@ -208,6 +208,44 @@ inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) {
     );
 }
 
+inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) {
+    asm volatile(
+        R"(
+            frchg
+
+            fmov.s  @%[mtx]+, fr0
+
+            add     #32, %[mtx]
+            pref    @%[mtx]
+            add     #-(32 - 4), %[mtx]
+
+            fmov.s  @%[mtx]+, fr4
+            fmov.s  @%[mtx]+, fr8
+            fmov.s  @%[mtx]+, fr12
+
+            fmov.s  @%[mtx]+, fr1
+            fmov.s  @%[mtx]+, fr5
+            fmov.s  @%[mtx]+, fr9
+            fmov.s  @%[mtx]+, fr13
+
+            fmov.s  @%[mtx]+, fr2
+            fmov.s  @%[mtx]+, fr6
+            fmov.s  @%[mtx]+, fr10
+            fmov.s  @%[mtx]+, fr14
+
+            fmov.s  @%[mtx]+, fr3
+            fmov.s  @%[mtx]+, fr7
+            fmov.s  @%[mtx]+, fr11
+            fmov.s  @%[mtx]+, fr15
+
+            frchg
+        )"
+        : [mtx] "+r" (mtx)
+        :
+        :
+    );
+}
+
 inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
     asm volatile(
         R"(

From 056fe395675d6e6d3a017776b0ec63008f3f9615 Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Sun, 27 Apr 2025 14:59:40 -0500
Subject: [PATCH 2/6] Accelerated rw::RawMatrix layer.

---
 vendor/librw/src/base.cpp    | 23 +++++++++++++++++++----
 vendor/librw/src/dc/rwdc.cpp |  4 ++--
 vendor/librw/src/rwbase.h    | 32 +++++++++++++++++++++++++++++---
 3 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp
index 9f45b459..dcc01dfd 100644
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -151,8 +151,9 @@ slerp(const Quat &q, const Quat &p, float32 a)
 	float32 phi = acosf(c);
 	if(phi > 0.00001f){
 		float32 s = sinf(phi);
-		return add(scale(q1, sinf((1.0f-a)*phi)/s),
-		           scale(p,  sinf(a*phi)/s));
+        float invS = dc::Invert<true, false>(s);
+		return add(scale(q1, sinf((1.0f-a)*phi) * invS),
+		           scale(p,  sinf(a*phi) * invS));
 	}
 	return q1;
 }
@@ -202,6 +203,7 @@ V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 void
 RawMatrix::mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2)
 {
+#ifndef DC_SH4
 	dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x + src1->rightw*src2->pos.x;
 	dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y + src1->rightw*src2->pos.y;
 	dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z + src1->rightw*src2->pos.z;
@@ -218,11 +220,15 @@ RawMatrix::mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2)
 	dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src1->posw*src2->pos.y;
 	dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src1->posw*src2->pos.z;
 	dst->posw    = src1->pos.x*src2->rightw    + src1->pos.y*src2->upw    + src1->pos.z*src2->atw  + src1->posw*src2->posw;
+#else
+    dc::mat_mult(*dst, *src2, *src1);
+#endif
 }
 
 void
 RawMatrix::transpose(RawMatrix *dst, RawMatrix *src)
 {
+#ifndef DC_SH4
 	dst->right.x = src->right.x;
 	dst->up.x = src->right.y;
 	dst->at.x = src->right.z;
@@ -239,18 +245,27 @@ RawMatrix::transpose(RawMatrix *dst, RawMatrix *src)
 	dst->upw = src->pos.y;
 	dst->atw = src->pos.z;
 	dst->posw = src->posw;
+#else
+    dc::mat_load_transpose(*src);
+    dc::mat_store2(*dst);
+#endif
 }
 
 void
 RawMatrix::setIdentity(RawMatrix *dst)
 {
-	static RawMatrix identity = {
+#ifndef DC_SH4
+	static RawMatrix identity = {{
 		{ 1.0f, 0.0f, 0.0f }, 0.0f,
 		{ 0.0f, 1.0f, 0.0f }, 0.0f,
 		{ 0.0f, 0.0f, 1.0f }, 0.0f,
 		{ 0.0f, 0.0f, 0.0f }, 1.0f
-	};
+	}};
 	*dst = identity;
+#else
+    dc::mat_identity2();
+    dc::mat_store2(*dst);
+#endif
 }
 
 //
diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp
index 2849461f..3793f59b 100644
--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@@ -3530,12 +3530,12 @@ uploadSkinMatrices(Atomic *a, Matrix* skinMatrices)
 	return skinMatrices[0].identityError() < 0.01f;
 }
 
-static RawMatrix normal2texcoord = {
+static RawMatrix normal2texcoord = {{
 	{ 0.5f / 127,  0.0f, 0.0f }, 0.0f,
 	{ 0.0f, -0.5f / 127, 0.0f }, 0.0f,
 	{ 0.0f,  0.0f, 1.0f }, 0.0f,
 	{ 0.5f,  0.5f, 0.0f }, 1.0f
-};
+}};
 
 void
 uploadEnvMatrix(Frame *frame, RawMatrix *world, matrix_t* envMatrix)
diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h
index 8b567709..941fb989 100644
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -339,9 +339,9 @@ inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.
 Quat lerp(const Quat &q, const Quat &p, float32 r);
 Quat slerp(const Quat &q, const Quat &p, float32 a);
 
-struct __attribute__((aligned(8))) RawMatrix 
+struct alignas(8) RawMatrixBase
 {
-	V3d right;
+    V3d right;
 	float32 rightw;
 	V3d up;
 	float32 upw;
@@ -349,6 +349,32 @@ struct __attribute__((aligned(8))) RawMatrix
 	float32 atw;
 	V3d pos;
 	float32 posw;
+};
+
+struct RawMatrix: public RawMatrixBase
+{
+    RawMatrix() {}
+
+    RawMatrix(RawMatrixBase &&aggregate):
+	    RawMatrixBase{aggregate}
+    {}
+
+    RawMatrix(const RawMatrix &rhs) {
+        *this = rhs;
+    }
+
+    operator matrix_t *() {
+        return reinterpret_cast<matrix_t *>(this);
+    }
+
+    operator const matrix_t *() const {
+        return reinterpret_cast<const matrix_t *>(this);
+    }
+
+    RawMatrix &operator=(const RawMatrix &rhs) {
+        dc::mat_copy(*this, rhs);
+        return *this;
+    }
 
 	// NB: this is dst = src2*src1, i.e. src1 is applied first, then src2
 	static void mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2);
@@ -356,7 +382,7 @@ struct __attribute__((aligned(8))) RawMatrix
 	static void setIdentity(RawMatrix *dst);
 };
 
-struct Matrix
+struct alignas(8) Matrix
 {
 	enum Type {
 		TYPENORMAL	= 1,

From c3454ac7ec4f12ae7f6db536864541fcd87834d6 Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Sun, 27 Apr 2025 15:25:54 -0500
Subject: [PATCH 3/6] Basic rw::Matrix acceleration.

---
 vendor/librw/src/base.cpp    | 35 +++++++++++++++++-----
 vendor/librw/src/dc/rwdc.cpp | 12 ++++++++
 vendor/librw/src/rwbase.h    | 58 +++++++++++++++++++++++++++++-------
 3 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp
index dcc01dfd..dd4f168b 100644
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -45,12 +45,17 @@ int32 build = 0xFFFF;
 bool32 streamAppendFrames = 0;
 char *debugFile = nil;
 
-static Matrix identMat = {
-	{ 1.0f, 0.0f, 0.0f }, Matrix::IDENTITY|Matrix::TYPEORTHONORMAL,
-	{ 0.0f, 1.0f, 0.0f }, 0,
-	{ 0.0f, 0.0f, 1.0f }, 0,
-	{ 0.0f, 0.0f, 0.0f }, 0
-};
+static Matrix identMat = {{
+    .right 	= { 1.0f, 0.0f, 0.0f }, 
+    .flags 	= Matrix::IDENTITY|Matrix::TYPEORTHONORMAL,
+	.pad0   = 0,
+    .up 	= { 0.0f, 1.0f, 0.0f }, 
+    .upw 	= 0.0f,
+    .at 	= { 0.0f, 0.0f, 1.0f }, 
+    .atw 	= 0.0f,
+    .pos 	= { 0.0f, 0.0f, 0.0f }, 
+    .posw 	= 1.0f
+}};
 
 // lazy implementation
 int
@@ -174,19 +179,27 @@ void
 V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
 {
 	int32 i;
-	V3d tmp;
-	for(i = 0; i < n; i++){
+#ifndef DC_SH4
+    V3d tmp;
+    for(i = 0; i < n; i++){
 		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
 		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
 		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
 		out[i] = tmp;
 	}
+#else
+    dc::mat_load2(*m);
+    for(i = 0; i < n; i++)
+        mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
+                                      out[i].x, out[i].y, out[i].z);
+#endif
 }
 
 void
 V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 {
 	int32 i;
+#ifndef DC_SH4
 	V3d tmp;
 	for(i = 0; i < n; i++){
 		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
@@ -194,6 +207,12 @@ V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
 		out[i] = tmp;
 	}
+#else
+    dc::mat_load2(*m);
+    for(i = 0; i < n; i++)
+        mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
+                                out[i].x, out[i].y, out[i].z);
+#endif
 }
 
 //
diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp
index 3793f59b..a5ddacbb 100644
--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@@ -6247,6 +6247,18 @@ writeNativeSkin(Stream *stream, int32 len, void *object, int32 offset)
 
 	stream->write8(&skin->numBones, 4);
 
+	for(int32 i = 0; i < skin->numBones; i++){
+		Matrix &m = *reinterpret_cast<Matrix *>(
+			&skin->inverseMatrices[i * 16]);
+
+		if(m.flags & MatrixBase::IDENTITY_OLD)
+			m.flags |= MatrixBase::IDENTITY;
+		m.pad0 = 0;
+		m.upw = 0.0f;
+		m.atw = 0.0f;
+		m.posw = 1.0f;
+	}
+
 	stream->write32(skin->inverseMatrices, skin->numBones*64);
 	return stream;
 }
diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h
index 941fb989..4965f2fd 100644
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -382,7 +382,7 @@ struct RawMatrix: public RawMatrixBase
 	static void setIdentity(RawMatrix *dst);
 };
 
-struct alignas(8) Matrix
+struct alignas(8) MatrixBase
 {
 	enum Type {
 		TYPENORMAL	= 1,
@@ -391,22 +391,60 @@ struct alignas(8) Matrix
 		TYPEMASK = 3
 	};
 	enum Flags {
-		IDENTITY = 0x20000
+		IDENTITY = 0x4,
+        IDENTITY_OLD = 0x20000
 	};
+
+	V3d right;
+	union {
+		struct {
+			uint32_t flags: 3 = TYPEORTHONORMAL|IDENTITY;
+			uint32_t pad0: 29 = 0;
+		};
+		float rightw;
+	};
+	V3d up;
+	union {
+		uint32 pad1;
+		float  upw = 0.0f;
+	};
+	V3d at;
+	union {
+		uint32 pad2;
+		float  atw = 0.0f;
+	};
+	V3d pos;
+	union {
+		uint32 pad3;
+		float  posw = 1.0f;
+	};
+};
+
+struct Matrix: public MatrixBase
+{
 	struct Tolerance {
 		float32 normal;
 		float32 orthogonal;
 		float32 identity;
 	};
 
-	V3d right;
-	uint32 flags;
-	V3d up;
-	uint32 pad1;
-	V3d at;
-	uint32 pad2;
-	V3d pos;
-	uint32 pad3;
+    Matrix() {}
+
+    Matrix(MatrixBase &&aggregate):
+	    MatrixBase{aggregate}
+    {}
+
+    Matrix(const Matrix &rhs) {
+        *this = rhs;
+    }
+
+    Matrix &operator=(const RawMatrix &rhs) {
+        dc::mat_copy(*this, rhs);
+        return *this;
+    }
+
+    operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
+    operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
 
 	static Matrix *create(void);
 	void destroy(void);

From f59f84c1334daca71e4ab747eda9c2f22adf578e Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Tue, 29 Apr 2025 10:03:49 -0500
Subject: [PATCH 4/6] Accelerated lots of RW math + Coronas (liberty)

- lot of the RW matrix stuff has become accelerated
- went through and accelerated liberty's coronas/reflections
! apparently introduced a bug somewhere along the lines that cause boats
  to freak out and do summersaults when trying to drive. Will resolve
later.
---
 src/liberty/renderer/Coronas.cpp  |  33 ++--
 vendor/librw/src/base.cpp         | 232 ++++++++++++-------------
 vendor/librw/src/dc/rwdc_common.h | 272 +++++++++++++++++++-----------
 vendor/librw/src/rwbase.h         |  51 +++++-
 4 files changed, 343 insertions(+), 245 deletions(-)

diff --git a/src/liberty/renderer/Coronas.cpp b/src/liberty/renderer/Coronas.cpp
index e9f9e662..7f7de9cc 100644
--- a/src/liberty/renderer/Coronas.cpp
+++ b/src/liberty/renderer/Coronas.cpp
@@ -300,9 +300,9 @@ CCoronas::Render(void)
 
 
 			if(aCoronas[i].fadeAlpha && spriteCoors.z < aCoronas[i].drawDist){
-				float recipz = 1.0f/spriteCoors.z;
+				float recipz = dc::Invert<true, false>(spriteCoors.z);
 				float fadeDistance = aCoronas[i].drawDist / 2.0f;
-				float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
+				float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
 				int totalFade = aCoronas[i].fadeAlpha * distanceFade;
 
 				if(aCoronas[i].LOScheck)
@@ -313,6 +313,7 @@ CCoronas::Render(void)
 				// render corona itself
 				if(aCoronas[i].texture){
 					float fogscale = CWeather::Foggyness*Min(spriteCoors.z, 40.0f)/40.0f + 1.0f;
+                    float invFogScale = dc::Invert<true, false>(fogscale);
 					if(CCoronas::aCoronas[i].id == SUN_CORE)
 						spriteCoors.z = 0.95f * RwCameraGetFarClipPlane(Scene.camera);
 					RwRenderStateSet(rwRENDERSTATETEXTURERASTER, RwTextureGetRaster(aCoronas[i].texture));
@@ -328,9 +329,9 @@ CCoronas::Render(void)
 						CSprite::RenderOneXLUSprite(spriteCoors.x, spriteCoors.y, spriteCoors.z,
 							spritew * aCoronas[i].size * wscale,
 							spriteh * aCoronas[i].size * fogscale * hscale,
-							CCoronas::aCoronas[i].red / fogscale,
-							CCoronas::aCoronas[i].green / fogscale,
-							CCoronas::aCoronas[i].blue / fogscale,
+							CCoronas::aCoronas[i].red * invFogScale,
+							CCoronas::aCoronas[i].green * invFogScale,
+							CCoronas::aCoronas[i].blue * invFogScale,
 							totalFade,
 							recipz,
 							255);
@@ -339,9 +340,9 @@ CCoronas::Render(void)
 							spriteCoors.x, spriteCoors.y, spriteCoors.z,
 							spritew * aCoronas[i].size * fogscale,
 							spriteh * aCoronas[i].size * fogscale,
-							CCoronas::aCoronas[i].red / fogscale,
-							CCoronas::aCoronas[i].green / fogscale,
-							CCoronas::aCoronas[i].blue / fogscale,
+							CCoronas::aCoronas[i].red * invFogScale,
+							CCoronas::aCoronas[i].green * invFogScale,
+							CCoronas::aCoronas[i].blue * invFogScale,
 							totalFade,
 							recipz,
 							20.0f * recipz,
@@ -365,7 +366,7 @@ CCoronas::Render(void)
 							(spriteCoors.x - (screenw/2)) * flare->position + (screenw/2),
 							(spriteCoors.y - (screenh/2)) * flare->position + (screenh/2),
 							spriteCoors.z,
-							4.0f*flare->size * spritew/spriteh,
+							4.0f*flare->size * dc::Div<true, false>(spritew, spriteh),
 							4.0f*flare->size,
 							(flare->red * aCoronas[i].red)>>8,
 							(flare->green * aCoronas[i].green)>>8,
@@ -480,9 +481,9 @@ CCoronas::RenderReflections(void)
 					drawDist = Min(drawDist, 55.0f);
 					if(spriteCoors.z < drawDist){
 						float fadeDistance = drawDist / 2.0f;
-						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
+						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
 						distanceFade = Clamp(distanceFade, 0.0f, 1.0f);
-						float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera);
+						float recipz = dc::Invert<true, false>(RwCameraGetNearClipPlane(Scene.camera));
 						float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f;
 						int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads;
 
@@ -606,7 +607,9 @@ CEntity::ProcessLightsForEntity(void)
 	flashTimer1 = 0;
 	flashTimer2 = 0;
 	flashTimer3 = 0;
-
+#ifdef DC_SH4
+    dc:mat_load2(GetMatrix());
+#endif
 	n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects();
 	for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){
 		effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i);
@@ -614,8 +617,12 @@ CEntity::ProcessLightsForEntity(void)
 		if(effect->type != EFFECT_LIGHT)
 			continue;
 
+#ifndef DC_SH4
 		pos = GetMatrix() * effect->pos;
-
+#else
+        mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z,
+                                      pos.x, pos.y, pos.z);
+#endif
 		lightOn = false;
 		lightFlickering = false;
 		switch(effect->light.lightType){
diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp
index dd4f168b..2e3a22cc 100644
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -91,24 +91,6 @@ strncmp_ci(const char *s1, const char *s2, int n)
 	return 0;
 }
 
-Quat
-mult(const Quat &q, const Quat &p)
-{
-#ifndef DC_SH4
-	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
-	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
-	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
-	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
-#else
-	Quat o;
-	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
-	              reinterpret_cast<const dc::quaternion_t &>(q),
-				  reinterpret_cast<const dc::quaternion_t &>(p));
-	return o;
-#endif
-}
-
-
 Quat*
 Quat::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
@@ -166,53 +148,39 @@ slerp(const Quat &q, const Quat &p, float32 a)
 //
 // V3d
 //
-
-V3d
-cross(const V3d &a, const V3d &b)
-{
-	return makeV3d(a.y*b.z - a.z*b.y,
-	               a.z*b.x - a.x*b.z,
-	               a.x*b.y - a.y*b.x);
+void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) {
+    int32 i;
+    #ifndef DC_SH4
+        V3d tmp;
+        for(i = 0; i < n; i++){
+            tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
+            tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
+            tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
+            out[i] = tmp;
+        }
+    #else
+        dc::mat_load2(*m);
+        for(i = 0; i < n; i++)
+            mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
+                                          out[i].x, out[i].y, out[i].z);
+    #endif
 }
-
-void
-V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
-{
-	int32 i;
-#ifndef DC_SH4
-    V3d tmp;
-    for(i = 0; i < n; i++){
-		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
-		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
-		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
-		out[i] = tmp;
-	}
-#else
-    dc::mat_load2(*m);
-    for(i = 0; i < n; i++)
-        mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
-                                      out[i].x, out[i].y, out[i].z);
-#endif
-}
-
-void
-V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
-{
-	int32 i;
-#ifndef DC_SH4
-	V3d tmp;
-	for(i = 0; i < n; i++){
-		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
-		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y;
-		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
-		out[i] = tmp;
-	}
-#else
-    dc::mat_load2(*m);
-    for(i = 0; i < n; i++)
-        mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
-                                out[i].x, out[i].y, out[i].z);
-#endif
+void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) {
+    int32 i;
+    #ifndef DC_SH4
+        V3d tmp;
+        for(i = 0; i < n; i++){
+            tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
+            tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y;
+            tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
+            out[i] = tmp;
+        }
+    #else
+        dc::mat_load2(*m);
+        for(i = 0; i < n; i++)
+            mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
+                                    out[i].x, out[i].y, out[i].z);
+    #endif
 }
 
 //
@@ -343,9 +311,10 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2)
 		*dst = *src2;
 	else if(src2->flags & IDENTITY)
 		*dst = *src1;
-	else{
+	else {
+        uint8_t flags = src1->flags & src2->flags;
 		mult_(dst, src1, src2);
-		dst->flags = src1->flags & src2->flags;
+		dst->flags = flags;
 	}
 	return dst;
 }
@@ -366,7 +335,8 @@ Matrix::invert(Matrix *dst, const Matrix *src)
 Matrix*
 Matrix::transpose(Matrix *dst, const Matrix *src)
 {
-	if(src->flags & IDENTITY)
+#ifndef DC_SH4
+	if(src->flags & IDENTITY) 
 		*dst = *src;
 	dst->right.x = src->right.x;
 	dst->up.x = src->right.y;
@@ -380,25 +350,31 @@ Matrix::transpose(Matrix *dst, const Matrix *src)
 	dst->pos.x = 0.0;
 	dst->pos.y = 0.0;
 	dst->pos.z = 0.0;
+#else
+    if(src->flags & IDENTITY)
+        *dst = *src;
+    else {
+        dc::mat_load_transpose(*src);
+        dc::mat_store2(*dst);
+    }
+#endif
 	return dst;
 }
 
 Matrix*
 Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
-	Matrix tmp, rot;
-	makeRotation(&rot, axis, angle);
+	Matrix rot;
+    makeRotation(&rot, axis, angle);
 	switch(op){
 	case COMBINEREPLACE:
 		*this = rot;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
-		*this = tmp;
+		mult(this, &rot, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
-		*this = tmp;
+        mult(this, this, &rot);
 		break;
 	}
 	return this;
@@ -407,27 +383,25 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 Matrix*
 Matrix::rotate(const Quat &q, CombineOp op)
 {
-	Matrix tmp, rot;
-	makeRotation(&rot, q);
+	Matrix rot;
+    makeRotation(&rot, q);
 	switch(op){
 	case COMBINEREPLACE:
 		*this = rot;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
-		*this = tmp;
+        mult(this, &rot, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
-		*this = tmp;
+        mult(this, this, &rot);
 		break;
 	}
 	return this;
 }
+
 Matrix*
 Matrix::translate(const V3d *translation, CombineOp op)
 {
-	Matrix tmp;
 	Matrix trans = identMat;
 	trans.pos = *translation;
 	trans.flags &= ~IDENTITY;
@@ -436,12 +410,10 @@ Matrix::translate(const V3d *translation, CombineOp op)
 		*this = trans;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &trans, this);
-		*this = tmp;
+		mult(this, &trans, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &trans);
-		*this = tmp;
+		mult(this, this, &trans);
 		break;
 	}
 	return this;
@@ -450,7 +422,6 @@ Matrix::translate(const V3d *translation, CombineOp op)
 Matrix*
 Matrix::scale(const V3d *scale, CombineOp op)
 {
-	Matrix tmp;
 	Matrix scl = identMat;
 	scl.right.x = scale->x;
 	scl.up.y = scale->y;
@@ -461,12 +432,10 @@ Matrix::scale(const V3d *scale, CombineOp op)
 		*this = scl;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &scl, this);
-		*this = tmp;
+		mult(this, &scl, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &scl);
-		*this = tmp;
+		mult(this, this, &scl);
 		break;
 	}
 	return this;
@@ -475,18 +444,15 @@ Matrix::scale(const V3d *scale, CombineOp op)
 Matrix*
 Matrix::transform(const Matrix *mat, CombineOp op)
 {
-	Matrix tmp;
 	switch(op){
 	case COMBINEREPLACE:
 		*this = *mat;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, mat, this);
-		*this = tmp;
+		mult(this, mat, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, mat);
-		*this = tmp;
+		mult(this, this, mat);
 		break;
 	}
 	return this;
@@ -501,27 +467,31 @@ Matrix::getRotation(void)
 	if(tr > 0.0f){
 		s = sqrtf(1.0f + tr) * 2.0f;
 		q.w = s / 4.0f;
-		q.x = (up.z - at.y) / s;
-		q.y = (at.x - right.z) / s;
-		q.z = (right.y - up.x) / s;
+        float invS = dc::Invert<true, false>(s);
+		q.x = (up.z - at.y) * invS;
+		q.y = (at.x - right.z) * invS;
+		q.z = (right.y - up.x) * invS;
 	}else if(right.x > up.y && right.x > at.z){
 		s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f;
-		q.w = (up.z - at.y) / s;
-		q.x = s / 4.0f;
-		q.y = (up.x + right.y) / s;
-		q.z = (at.x + right.z) / s;
+        q.x = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
+        q.w = (up.z - at.y) * invS;
+		q.y = (up.x + right.y) * invS;
+		q.z = (at.x + right.z) * invS;
 	}else if(up.y > at.z){
 		s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f;
-		q.w = (at.x - right.z) / s;
-		q.x = (up.x + right.y) / s;
-		q.y = s / 4.0f;
-		q.z = (at.y + up.z) / s;
+        q.y = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
+        q.w = (at.x - right.z) * invS;
+		q.x = (up.x + right.y) * invS;
+		q.z = (at.y + up.z) * invS;
 	}else{
 		s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f;
-		q.w = (right.y - up.x) / s;
-		q.x = (at.x + right.z) / s;
-		q.y = (at.y + up.z) / s;
-		q.z = s / 4.0f;
+        q.z = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
+        q.w = (right.y - up.x) * invS;
+		q.x = (at.x + right.z) * invS;
+		q.y = (at.y + up.z) * invS;
 	}
 	return q;
 }
@@ -543,20 +513,7 @@ Matrix::lookAt(const V3d &dir, const V3d &up)
 void
 Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2)
 {
-	#if !defined(DC_TEXCONV) && !defined(DC_SIM)
-	dst->right.x = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->right.y = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->right.z = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->up.x    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->up.y    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->up.z    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->at.x    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->at.y    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->at.z    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->pos.x   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.x, src2->up.x, src2->at.x, src2->pos.x);
-	dst->pos.y   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  	 	  src2->right.y, src2->up.y, src2->at.y, src2->pos.y);
-	dst->pos.z   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.z, src2->up.z, src2->at.z, src2->pos.z);
-	#else
+#ifndef DC_SH4
 	dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x;
 	dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y;
 	dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z;
@@ -569,12 +526,15 @@ Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const M
 	dst->pos.x   = src1->pos.x*src2->right.x   + src1->pos.y*src2->up.x   + src1->pos.z*src2->at.x + src2->pos.x;
 	dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src2->pos.y;
 	dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src2->pos.z;
-	#endif
+#else
+    dc::mat_mult(*dst, *src2, *src1);
+#endif
 }
 
 void
 Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 {
+#if 1
 	dst->right.x = src->right.x;
 	dst->right.y = src->up.x;
 	dst->right.z = src->at.x;
@@ -593,7 +553,12 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 	dst->pos.z = -(src->pos.x*src->at.x +
 	               src->pos.y*src->at.y +
 	               src->pos.z*src->at.z);
-	dst->flags = TYPEORTHONORMAL;
+#else
+    dc::mat_load_transpose(*src);
+    dc::mat_invert_tranpose();
+    dc::mat_store2(*dst);
+#endif
+    dst->flags = TYPEORTHONORMAL;
 }
 
 Matrix*
@@ -688,7 +653,11 @@ Matrix::normalError(void)
 	x = dot(right, right) - 1.0f;
 	y = dot(up, up) - 1.0f;
 	z = dot(at, at) - 1.0f;
+#ifndef DC_SH4
 	return x*x + y*y + z*z;
+#else
+    return fipr_magnitude_sqr(x, y, z, 0.0f);
+#endif
 }
 
 float32
@@ -698,16 +667,27 @@ Matrix::orthogonalError(void)
 	x = dot(at, up);
 	y = dot(at, right);
 	z = dot(up, right);
+#ifndef DC_SH4
 	return x*x + y*y + z*z;
+#else
+    return fipr_magnitude_sqr(x, y, z, 0.0f);
+#endif
 }
 
 float32
 Matrix::identityError(void)
 {
-	V3d r = { right.x-1.0f, right.y, right.z };
+    V3d r = { right.x-1.0f, right.y, right.z };
 	V3d u = { up.x, up.y-1.0f, up.z };
 	V3d a = { at.x, at.y, at.z-1.0f };
+#ifndef DC_SH4
 	return dot(r,r) + dot(u,u) + dot(a,a) + dot(pos,pos);
+#else
+    return fipr_magnitude_sqr(r.x, r.y, r.z, 0.0f)    +
+           fipr_magnitude_sqr(u.x, u.y, u.z, 0.0f)    +
+           fipr_magnitude_sqr(at.x, at.y, at.z, 0.0f) +
+           fipr_magnitude_sqr(pos.x, pos.y, pos.z, 0.0f);
+#endif
 }
 
 void
diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h
index dc532776..0c40af52 100644
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -246,6 +246,83 @@ inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) {
     );
 }
 
+inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) {
+    asm volatile(
+        R"(
+            frchg
+
+            fmov.s  @%[mtx]+, fr0
+
+            add     #32, %[mtx]
+            pref    @%[mtx]
+            add     #-(32 - 4), %[mtx]
+
+            fmov.s  @%[mtx]+, fr4
+            fmov.s  @%[mtx]+, fr8
+            fldi0   fr12
+            add     #4, %[mtx]
+
+            fmov.s  @%[mtx]+, fr1
+            fmov.s  @%[mtx]+, fr5
+            fmov.s  @%[mtx]+, fr9
+            fldi0   fr13
+            add     #4, %[mtx]
+
+            fmov.s  @%[mtx]+, fr2
+            fmov.s  @%[mtx]+, fr6
+            fmov.s  @%[mtx]+, fr10
+            fldi0   fr14
+
+            fldi0  fr3
+            fldi0  fr7
+            fmov   fr3, fr11
+            fldi1  fr15
+
+            frchg
+        )"
+        : [mtx] "+r" (mtx)
+        :
+        :
+    );
+}
+
+inline __hot __icache_aligned void mat_invert_tranpose() {
+	asm volatile(
+		"frchg\n\t"
+		"fneg	fr12\n\t"
+		"fneg	fr13\n\t"
+		"fneg	fr14\n\t"
+		"fldi0	fr15\n\t"
+		"fldi0	fr3\n\t"
+		"fipr	fv12, fv0\n\t"
+		"fldi0	fr7\n\t"
+		"fipr	fv12, fv4\n\t"
+		"fldi0	fr11\n\t"
+		"fipr	fv12, fv8\n\t"
+
+		"fmov	fr3, fr12\n\t"
+		"fmov	fr7, fr13\n\t"
+		"fmov	fr11, fr14\n\t"
+		"fmov	fr1, fr15\n\t"
+		"fmov	fr4, fr1\n\t"
+		"fmov	fr15, fr4\n\t"
+		"fmov	fr2, fr15\n\t"
+		"fmov	fr8, fr2\n\t"
+		"fmov	fr15, fr2\n\t"
+		"fmov	fr6, fr15\n\t"
+		"fmov	fr9, fr6\n\t"
+		"fmov	fr15, fr9\n\t"
+
+		"fldi0	fr3\n\t"
+		"fldi0	fr7\n\t"
+		"fldi0	fr11\n\t"
+		"fldi1	fr15\n\t"
+		"frchg\n"
+		:
+		:
+		:);
+}
+
 inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
     asm volatile(
         R"(
@@ -449,103 +526,6 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src)
       :);
 }
 
-//TODO: FIXME FOR VC (AND USE FTRV)
-template<bool FAST_APPROX=false>
-__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
-    if(FAST_APPROX && !std::is_constant_evaluated()) {
-    /*
-        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
-        // q2 then needs to be rotated after each inner product
-        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
-        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
-        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
-        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
-    */
-        // keep q1 in fv4
-        register float q1x __asm__ ("fr4") = (q1.x);
-        register float q1y __asm__ ("fr5") = (q1.y);
-        register float q1z __asm__ ("fr6") = (q1.z);
-        register float q1w __asm__ ("fr7") = (q1.w);
-
-        // load q2 into fv8, use it to get the shuffled reorder into fv0
-        register float q2x __asm__ ("fr8")  = (q2.x);
-        register float q2y __asm__ ("fr9")  = (q2.y);
-        register float q2z __asm__ ("fr10") = (q2.z);
-        register float q2w __asm__ ("fr11") = (q2.w);
-
-        // temporary operand / result in fv0
-        register float t1x __asm__ ("fr0");
-        register float t1y __asm__ ("fr1");
-        register float t1z __asm__ ("fr2");
-        register float t1w __asm__ ("fr3");
-
-        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
-        t1x = q2w;
-        t1y = q2z;
-        t1z = -q2y;
-        t1w = q2w;
-        __asm__ ("\n"
-            " fipr	fv4,fv0\n"
-            : "+f" (t1w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (t1x), "f" (t1y), "f" (t1z)
-        );
-        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
-
-        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
-        t1x = -q2z;
-        t1y = q2w;
-        t1z = q2x;
-        __atomic_thread_fence(1);
-        r->x = t1w;   // get previous result
-        t1w = q2y;
-        __asm__ ("\n"
-            "	fipr	fv4,fv0\n"
-            : "+f" (t1w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (t1x), "f" (t1y), "f" (t1z)
-        );
-        //y = t1w;
-
-        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
-        t1x = q2y;
-        t1y = -q2x;
-        t1z = q2w;
-        __atomic_thread_fence(1);
-        r->y = t1w;   // get previous result
-        t1w = q2z;
-        __asm__ ("\n"
-            "	fipr	fv4,fv0\n"
-            : "+f" (t1w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (t1x), "f" (t1y), "f" (t1z)
-        );
-        //z = t1w;
-        __atomic_thread_fence(1);
-
-        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
-        q2x = -q2x;
-        q2y = -q2y;
-        q2z = -q2z;
-        __asm__ ("\n"
-            "	fipr	fv4,fv8\n"
-            : "+f" (q2w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (q2x), "f" (q2y), "f" (q2z)
-        );
-
-        __atomic_thread_fence(1);
-        r->z = t1w;
-        __atomic_thread_fence(1);
-        r->w = q2w;
-    } else {
-        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
-        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
-        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
-        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
-    }
-}
-
 __hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
     unsigned int prefetch_scratch;
 
@@ -669,6 +649,104 @@ __hot inline void mat_apply_rotate_z(float z) {
         : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
 }
 
+
+//TODO: FIXME FOR VC (AND USE FTRV)
+template<bool FAST_APPROX=false>
+__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+    /*
+        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
+        // q2 then needs to be rotated after each inner product
+        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+    */
+        // keep q1 in fv4
+        register float q1x __asm__ ("fr4") = (q1.x);
+        register float q1y __asm__ ("fr5") = (q1.y);
+        register float q1z __asm__ ("fr6") = (q1.z);
+        register float q1w __asm__ ("fr7") = (q1.w);
+
+        // load q2 into fv8, use it to get the shuffled reorder into fv0
+        register float q2x __asm__ ("fr8")  = (q2.x);
+        register float q2y __asm__ ("fr9")  = (q2.y);
+        register float q2z __asm__ ("fr10") = (q2.z);
+        register float q2w __asm__ ("fr11") = (q2.w);
+
+        // temporary operand / result in fv0
+        register float t1x __asm__ ("fr0");
+        register float t1y __asm__ ("fr1");
+        register float t1z __asm__ ("fr2");
+        register float t1w __asm__ ("fr3");
+
+        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        t1x = q2w;
+        t1y = q2z;
+        t1z = -q2y;
+        t1w = q2w;
+        __asm__ ("\n"
+            " fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
+
+        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        t1x = -q2z;
+        t1y = q2w;
+        t1z = q2x;
+        __atomic_thread_fence(1);
+        r->x = t1w;   // get previous result
+        t1w = q2y;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //y = t1w;
+
+        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        t1x = q2y;
+        t1y = -q2x;
+        t1z = q2w;
+        __atomic_thread_fence(1);
+        r->y = t1w;   // get previous result
+        t1w = q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //z = t1w;
+        __atomic_thread_fence(1);
+
+        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+        q2x = -q2x;
+        q2y = -q2y;
+        q2z = -q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv8\n"
+            : "+f" (q2w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (q2x), "f" (q2y), "f" (q2z)
+        );
+
+        __atomic_thread_fence(1);
+        r->z = t1w;
+        __atomic_thread_fence(1);
+        r->w = q2w;
+    } else {
+        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+    }
+}
+
 #   else
 #       ifdef DC_TEXCONV
 #           define mat_apply(a)
diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h
index 4965f2fd..1004e0e7 100644
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -238,8 +238,8 @@ inline V2d neg(const V2d &a) { return makeV2d(-a.x, -a.y); }
 inline V2d add(const V2d &a, const V2d &b) { return makeV2d(a.x+b.x, a.y+b.y); }
 inline V2d sub(const V2d &a, const V2d &b) { return makeV2d(a.x-b.x, a.y-b.y); }
 inline V2d scale(const V2d &a, float32 r) { return makeV2d(a.x*r, a.y*r); }
-inline float32 length(const V2d &v) { return sqrtf(v.x*v.x + v.y*v.y); }
-inline V2d normalize(const V2d &v) { return scale(v, 1.0f/length(v)); }
+inline float32 length(const V2d &v) { return dc::Sqrt(v.x*v.x + v.y*v.y); }
+inline V2d normalize(const V2d &v) { return scale(v, dc::RecipSqrt(v.x*v.x + v.y*v.y)); }
 
 struct V3d
 {
@@ -265,10 +265,22 @@ inline float32 length(const V3d &v) {
 	return len;
 #endif
 }
-inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); }
-inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); }
-V3d cross(const V3d &a, const V3d &b);
-inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) {
+inline V3d normalize(const V3d &v) {
+    float invLen; 
+#ifndef DC_SH4
+    invLen = 1.0f / length(v);
+#else
+    invLen = dc::RecipSqrt(fipr_magnitude_sqr(v.x, v.y, v.z, 0.0f));
+#endif
+    return scale(v, invLen); 
+}
+inline V3d setlength(const V3d &v, float32 l) { return scale(v, dc::Div<true, false>(l, length(v))); }
+inline V3d cross(const V3d &a, const V3d &b) {
+    return makeV3d(a.y*b.z - a.z*b.y,
+        a.z*b.x - a.x*b.z,
+        a.x*b.y - a.y*b.x);
+}
+inline float32 dot(const V3d &a, const V3d &b) {
 #ifdef DC_SH4
 	return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f);
 #else
@@ -329,12 +341,33 @@ inline float32 length(const Quat &q) {
 #ifndef DC_SH4
 	return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
 #else
-	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
+	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, 0.0f));
 #endif
 }
-inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
+inline Quat normalize(const Quat &q) {
+    float invLen; 
+#ifndef DC_SH4
+    invLen = 1.0f / length(q);
+#else
+    invLen = dc::RecipSqrt(fipr_magnitude_sqr(q.x, q.y, q.z, 0.0f));
+#endif
+    return scale(q, invLen);
+}
 inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
-Quat mult(const Quat &q, const Quat &p);
+inline Quat mult(const Quat &q, const Quat &p) {
+#ifndef DC_SH4
+	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
+	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
+	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
+	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
+#else
+    Quat o;
+    dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
+	              reinterpret_cast<const dc::quaternion_t &>(q),
+                  reinterpret_cast<const dc::quaternion_t &>(p));
+	return o;
+#endif
+}
 inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); }
 Quat lerp(const Quat &q, const Quat &p, float32 r);
 Quat slerp(const Quat &q, const Quat &p, float32 a);

From 92f8e04aa7f848a0d1fa69bb63d760b0c7d9c356 Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Tue, 29 Apr 2025 17:05:15 -0500
Subject: [PATCH 5/6] Saving state, liberty + RW T&L optimizations.

---
 src/liberty/math/Matrix.cpp       |  23 +++++
 src/liberty/math/Vector.h         |   8 +-
 src/miami/math/Vector.h           |  14 +++
 vendor/librw/src/base.cpp         |  19 ++++
 vendor/librw/src/dc/rwdc.cpp      | 164 ++++++++++++++----------------
 vendor/librw/src/dc/rwdc_common.h |  92 ++++++++++++++++-
 6 files changed, 228 insertions(+), 92 deletions(-)

diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp
index a4b6c73f..a282a211 100644
--- a/src/liberty/math/Matrix.cpp
+++ b/src/liberty/math/Matrix.cpp
@@ -54,20 +54,28 @@ CMatrix::Detach(void)
 void
 CMatrix::Update(void)
 {
+#if 1
 	GetRight() = m_attachment->right;
 	GetForward() = m_attachment->up;
 	GetUp() = m_attachment->at;
 	GetPosition() = m_attachment->pos;
+#else
+    mat_copy(*this, *m_attachment);
+#endif
 }
 
 void
 CMatrix::UpdateRW(void)
 {
 	if (m_attachment) {
+#if 1
 		m_attachment->right = GetRight();
 		m_attachment->up = GetForward();
 		m_attachment->at = GetUp();
 		m_attachment->pos = GetPosition();
+#else
+        mat_copy(*m_attachment, *this);
+#endif
 		RwMatrixUpdate(m_attachment);
 	}
 }
@@ -99,6 +107,7 @@ CMatrix::operator+=(CMatrix const &rhs)
 void
 CMatrix::SetUnity(void)
 {
+#ifndef DC_SH4
 	rx = 1.0f;
 	ry = 0.0f;
 	rz = 0.0f;
@@ -111,6 +120,10 @@ CMatrix::SetUnity(void)
 	px = 0.0f;
 	py = 0.0f;
 	pz = 0.0f;
+#else
+    dc::mat_identity2();
+    dc::mat_store2(*this);
+#endif
 }
 
 void
@@ -130,6 +143,7 @@ CMatrix::ResetOrientation(void)
 void
 CMatrix::SetScale(float s)
 {
+#ifndef DC_SH4
 	rx = s;
 	ry = 0.0f;
 	rz = 0.0f;
@@ -145,11 +159,16 @@ CMatrix::SetScale(float s)
 	px = 0.0f;
 	py = 0.0f;
 	pz = 0.0f;
+#else
+    mat_set_scale(s);
+    mat_store2(*this);
+#endif
 }
 
 void
 CMatrix::SetTranslate(float x, float y, float z)
 {
+#if 1
 	rx = 1.0f;
 	ry = 0.0f;
 	rz = 0.0f;
@@ -165,6 +184,10 @@ CMatrix::SetTranslate(float x, float y, float z)
 	px = x;
 	py = y;
 	pz = z;
+#else
+    mat_set_translation(x, y, z);
+    mat_store2(*this);
+#endif
 }
 
 void
diff --git a/src/liberty/math/Vector.h b/src/liberty/math/Vector.h
index 1501512d..63d6fee9 100644
--- a/src/liberty/math/Vector.h
+++ b/src/liberty/math/Vector.h
@@ -1,5 +1,7 @@
 #pragma once 
 
+#include "maths.h"
+
 class CVector : public RwV3d
 {
 public:
@@ -30,7 +32,7 @@ public:
 }
   __always_inline float MagnitudeSqr(void) const {
 #ifdef DC_SH4
-    return fipr_magnitude_sqr(x, y,z, 0.0f);
+    return fipr_magnitude_sqr(x, y, z, 0.0f);
 #else
     return x*x + y*y + z*z;
 #endif
@@ -68,7 +70,7 @@ public:
   }
 
   const CVector &operator/=(float right) {
-    right = Invert<false>(right);
+    right = Invert<true, true>(right);
     x *= right;
     y *= right;
     z *= right;
@@ -112,7 +114,7 @@ inline CVector operator*(float left, const CVector &right)
 
 inline CVector operator/(const CVector &left, float right)
 {
-  right = Invert<false>(right);
+  right = Invert<true, true>(right);
   return CVector(left.x * right, left.y * right, left.z * right);
 }
 
diff --git a/src/miami/math/Vector.h b/src/miami/math/Vector.h
index 04287ece..2e8aace1 100644
--- a/src/miami/math/Vector.h
+++ b/src/miami/math/Vector.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "maths.h"
+
 class CVector : public RwV3d
 {
 public:
@@ -68,9 +70,16 @@ __always_inline float Heading(void) const { return Atan2(-x, y); }
 	}
 
 	const CVector &operator/=(float right) {
+#ifndef DC_SH4
 		x /= right;
 		y /= right;
 		z /= right;
+#else
+        right = dc::Invert<true, true>(right);
+        x *= right;
+        y *= right;
+        z *= right;
+#endif
 		return *this;
 	}
 
@@ -111,7 +120,12 @@ inline CVector operator*(float left, const CVector &right)
 
 inline CVector operator/(const CVector &left, float right)
 {
+#ifndef DC_SH4
 	return CVector(left.x / right, left.y / right, left.z / right);
+#else
+    right = dc::Invert<true, true>(right);
+    return CVector(left.x * right, left.y * right, left.z * right);
+#endif
 }
 
 inline float
diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp
index 2e3a22cc..ff78eddb 100644
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -402,9 +402,16 @@ Matrix::rotate(const Quat &q, CombineOp op)
 Matrix*
 Matrix::translate(const V3d *translation, CombineOp op)
 {
+#if 1
 	Matrix trans = identMat;
 	trans.pos = *translation;
 	trans.flags &= ~IDENTITY;
+#else
+    Matrix trans;
+    dc::mat_set_translation(translation->x, translation->y, translation->z);
+    dc::mat_store2(trans);
+    trans.flags = Matrix::TYPEORTHONORMAL;
+#endif
 	switch(op){
 	case COMBINEREPLACE:
 		*this = trans;
@@ -422,11 +429,18 @@ Matrix::translate(const V3d *translation, CombineOp op)
 Matrix*
 Matrix::scale(const V3d *scale, CombineOp op)
 {
+#ifndef DC_SH4
 	Matrix scl = identMat;
 	scl.right.x = scale->x;
 	scl.up.y = scale->y;
 	scl.at.z = scale->z;
 	scl.flags &= ~IDENTITY;
+#else
+    Matrix scl;
+    dc::mat_set_scale(scale->x, scale->y, scale->z);
+    dc::mat_store2(scl);
+    scl.flags = Matrix::TYPEORTHONORMAL;
+#endif
 	switch(op){
 	case COMBINEREPLACE:
 		*this = scl;
@@ -594,8 +608,13 @@ void
 Matrix::makeRotation(Matrix *dst, const V3d *axis, float32 angle)
 {
 //	V3d v = normalize(*axis);
+#ifndef DC_SH4
 	float32 len = dot(*axis, *axis);
 	if(len != 0.0f) len = 1.0f/sqrtf(len);
+#else
+    float len = fipr_magnitude_sqr(axis->x, axis->y, axis->z, 0.0f);
+    if(len != 0.0f) len = dc::RecipSqrt(len);
+#endif
 	V3d v = rw::scale(*axis, len);
 	angle = angle*(float)M_PI/180.0f;
 	float32 s = sinf(angle);
diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp
index a5ddacbb..aa8a7c14 100644
--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@@ -563,7 +563,7 @@ Camera* rwdcCam;
 
 void beginUpdate(Camera* cam)  {
 	rwdcCam = cam;
-	float view[16], proj[16];
+	alignas(8) float view[16], proj[16];
 
 	// View Matrix
 	Matrix inv;
@@ -586,7 +586,7 @@ void beginUpdate(Camera* cam)  {
 	view[13] =  -inv.pos.y;
 	view[14] =  inv.pos.z;
 	view[15] =  1.0f;
-	memcpy4(&cam->devView, view, sizeof(RawMatrix));
+	mat_copy(cam->devView, reinterpret_cast<const matrix_t *>(view));
 //	d3ddevice->SetTransform(D3DTS_VIEW, (D3DMATRIX*)view);
 
 	// Projection Matrix
@@ -620,8 +620,8 @@ void beginUpdate(Camera* cam)  {
 		proj[15] = 1.0f;
 	}
 	proj[14] = -cam->nearPlane*proj[10];
-	memcpy4(&cam->devProj, proj, sizeof(RawMatrix));
-	
+    mat_copy(cam->devProj, reinterpret_cast<const matrix_t *>(proj));
+
 	DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height);
 	
 	mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj);
@@ -945,8 +945,6 @@ void dcMotionBlur_v1(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
 		auto addr2 = (pvr_ptr_t)&emu_vram[addr64b + 640 * 2];
 	#endif
 
-
-
 		PVR_SET(PVR_TEXTURE_MODULO, 640/32);
 
 		auto doquad = [=](float x, float y, float w, float h, float tx, float ty, float tw, float th) {
@@ -1666,13 +1664,13 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 		switch(primType) {
 			case PRIMTYPETRILIST:
 				pvrHeaderSubmit();
-				dcache_pref_block(vtx);
+				__builtin_prefetch(vtx);
 				for(int i = 0; i < numVertices; i += 3) [[likely]] {
-					dcache_pref_block(&vtx[i + 1]);
+					__builtin_prefetch(&vtx[i + 1]);
 					pvrVertexSubmit(vtx[i + 0], PVR_CMD_VERTEX);
-					dcache_pref_block(&vtx[i + 2]);
+					__builtin_prefetch(&vtx[i + 2]);
 					pvrVertexSubmit(vtx[i + 1], PVR_CMD_VERTEX);
-					dcache_pref_block(&vtx[i + 3]);
+					__builtin_prefetch(&vtx[i + 3]);
 					pvrVertexSubmit(vtx[i + 2], PVR_CMD_VERTEX_EOL);
 				}
 			break;
@@ -1680,14 +1678,11 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert
 				pvrHeaderSubmit();
 				const auto *vtxA = vtx + 0;
 				const auto *vtxB = vtx + 1;
-				dcache_pref_block(vtxA);
+				__builtin_prefetch(vtxA);
 				for(int i = 2; i < numVertices; ++i) [[likely]] {
 					const auto *vtxC = vtx + i;
-					dcache_pref_block(vtxB);
 					pvrVertexSubmit(*vtxA, PVR_CMD_VERTEX);
-					dcache_pref_block(vtxC);
 					pvrVertexSubmit(*vtxB, PVR_CMD_VERTEX);
-					dcache_pref_block(&vtx[i]);
 					pvrVertexSubmit(*vtxC, PVR_CMD_VERTEX_EOL);
 					vtxB = vtxC;
 				}
@@ -1787,13 +1782,13 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu
 		switch(primType) {
 			case PRIMTYPETRILIST:
 				pvrHeaderSubmit();
-				dcache_pref_block(vtx);
+				__builtin_prefetch(vtx);
 				for(int i = 0; i < numIndices; i += 3) [[likely]] {
-					dcache_pref_block(&vtx[idx[i + 1]]);
+					__builtin_prefetch(&vtx[idx[i + 1]]);
 					pvrVertexSubmit(vtx[idx[i + 0]], PVR_CMD_VERTEX);
-					dcache_pref_block(&vtx[idx[i + 2]]);
+					__builtin_prefetch(&vtx[idx[i + 2]]);
 					pvrVertexSubmit(vtx[idx[i + 1]], PVR_CMD_VERTEX);
-					dcache_pref_block(&vtx[idx[i + 3]]);
+					__builtin_prefetch(&vtx[idx[i + 3]]);
 					pvrVertexSubmit(vtx[idx[i + 2]], PVR_CMD_VERTEX_EOL);
 				}
 			break;
@@ -1830,15 +1825,21 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f
 		worldMat = &ident;
 	}
 	
-	rw::RawMatrix mtx, proj, world, worldview;
 	rw::Camera *cam = engine->currentCamera;
-
-	rw::convMatrix(&world, worldMat);
+#ifndef DC_SH4
+    rw::RawMatrix mtx, proj, world, worldview;
+    rw::convMatrix(&world, worldMat);
 	rw::RawMatrix::mult(&worldview, &world, &cam->devView);
 	rw::RawMatrix::mult(&proj, &worldview, &cam->devProj);
 	rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW);
 	// mat_load2(&DCE_MAT_SCREENVIEW);     // ~11 cycles.
 	mat_load2(( matrix_t*)&mtx.right);  // Number of cycles: ~32.
+#else
+    mat_load_apply(&DCE_MAT_SCREENVIEW, cam->devProj);
+    mat_apply(cam->devView);
+    mat_apply(*worldMat);
+#endif
+
     if (im3dVertices) {
 		free(im3dVertices);
 	}
@@ -2136,39 +2137,36 @@ static_assert(sizeof(MeshletInfo) == 40); // or 32 if !skin
 
  inline __attribute__((always_inline))  void setLights(Atomic *atomic, WorldLights *lightData, UniformObject &uniformObject)
 {
-	int n = 0;
+    int i = 0;
 
 	uniformObject.ambLight = lightData->ambient;
 
 	if (lightData->numDirectionals) {
 		Matrix mat;
 		Matrix matsrc = *atomic->getFrame()->getLTM();
-		matsrc.pos = V3d {0,0,0};
+		matsrc.pos = V3d {0.0f, 0.0f, 0.0f};
+        matsrc.posw = 0.0f;
 		Matrix::invert(&mat, &matsrc);
-		
-		n = 0;
-		for(int i = 0; i < lightData->numDirectionals && i < MAX_LIGHTS; i++){
+		mat_load2(mat);
+
+		for(; i < lightData->numDirectionals && i < MAX_LIGHTS; i++){
 			Light *l = lightData->directionals[i];
-			uniformObject.col[n] = scale(l->color, 255);
+			uniformObject.col[i] = scale(l->color, 255);
 			V3d at = l->getFrame()->getLTM()->at;
 
 			V3d dir;
 
-			V3d::transformVectors(&dir, &at, 1, &mat);
+			mat_trans_normal3_nomod(at.x, at.y, at.z,
+                                    dir.x, dir.y, dir.z);
 
-			uniformObject.dir[n>>2][0][n&3] = -dir.x / 127.0f;
-			uniformObject.dir[n>>2][1][n&3] = -dir.y / 127.0f;
-			uniformObject.dir[n>>2][2][n&3] = -dir.z / 127.0f;
-			uniformObject.dir[n>>2][3][n&3] = 0;
-
-			n++;
-			if(n >= MAX_LIGHTS)
-				goto out;
+			uniformObject.dir[i>>2][0][i&3] = -dir.x / 127.0f;
+			uniformObject.dir[i>>2][1][i&3] = -dir.y / 127.0f;
+			uniformObject.dir[i>>2][2][i&3] = -dir.z / 127.0f;
+			uniformObject.dir[i>>2][3][i&3] = 0;
 		}
 	}
 
-out:
-	uniformObject.lightCount = n;
+	uniformObject.lightCount = i;
 }
 
 
@@ -2835,7 +2833,7 @@ void* interpolateAndSubmit(void* dst, const void* src1, const void* src2, uint32
 	float y = v1->o_r + t * (v2->o_r - v1->o_r);
 	float w = v1->o_g + t * (v2->o_g - v1->o_g);
 
-	w = frsqrt(w * w);
+	w = Invert<true, false>(w);
 
 	v->x = x * w;
 	v->y = y * w;
@@ -3209,15 +3207,14 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 		auto skinningIndexData = (int16_t*)skinIndexes;
 		auto skinningWeightData = (uint8_t*)skinWeights;
 
-		if (!matrix0Identity) {
-			mat_load_4x4(&skinMatrices[0]);
-			if (small_xyz) {
-				mat_apply(&DCE_MESHLET_MAT_DECODE);
-			}
-		} else {
-			if (small_xyz) {
-				mat_load2(&DCE_MESHLET_MAT_DECODE);
-			}
+		if constexpr (!matrix0Identity) {
+			if (!small_xyz)
+                mat_load2(skinMatrices[0]);
+			else 
+                mat_load_apply(skinMatrices[0], &DCE_MESHLET_MAT_DECODE);
+			
+		} else if constexpr (small_xyz) {
+			mat_load2(&DCE_MESHLET_MAT_DECODE);
 		}
 
 		for(;;) {
@@ -3227,7 +3224,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				int count = *skinningIndexData++;
 				uint8_t* dstVertexBytes = dest + *skinningIndexData++;
 
-				if (matrix0Identity && !small_xyz) {
+				if constexpr (matrix0Identity && !small_xyz) {
 					do {
 						const V3d* srcVtx = (const V3d*)(srcVtxBytes);
 						srcVtxBytes += vertexSize;
@@ -3246,11 +3243,8 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 						srcVtxBytes += vertexSize;
 						V3d* dstVertex = (V3d*)(dstVertexBytes);
 						dstVertexBytes += 64;
-						float x, y, z, w;
-						mat_trans_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, x, y, z, w);
-						dstVertex->x = x;
-						dstVertex->y = y;
-						dstVertex->z = z;
+						mat_trans_single3_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z,
+                                                      dstVertex->x, dstVertex->y, dstVertex->z);
 					} while(--count != 0);
 				}
 			} else if (!(flags & 0x80)) {
@@ -3280,10 +3274,10 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				break;
 			}
 
-			mat_load_4x4(currentMatrix);
-			if (small_xyz){
-				mat_apply(&DCE_MESHLET_MAT_DECODE);
-			}
+			if constexpr(!small_xyz)
+                mat_load2(*currentMatrix);
+            else
+				mat_load_apply(*currentMatrix, &DCE_MESHLET_MAT_DECODE);
 
 			do {
 				auto srcOffset = *skinningIndexData++;
@@ -3297,9 +3291,9 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 					srcVtx = &tmpSrc;
 				}
 				auto dstVtx = (V3d*)(dest + dstOffset);
-				float x, y, z, w;
-				mat_trans_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, x, y, z, w);
-				V3d tmp = { x, y, z };
+				V3d tmp;
+				mat_trans_single3_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, 
+                                              tmp.x, tmp.y, tmp.z);
 				tmp = scale(tmp, *skinningWeightData++ / 255.0f);
 				*dstVtx = add(*dstVtx, tmp);
 			} while (--count != 0);
@@ -3312,8 +3306,8 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 		auto skinningIndexData = (int16_t*)skinIndexes;
 		auto skinningWeightData = (uint8_t*)skinWeights;
 
-		if (!matrix0Identity) {
-			mat_load_3x3(&skinMatrices[0]);
+		if constexpr(!matrix0Identity) {
+			mat_load2(skinMatrices[0]);
 		}
 
 		for(;;) {
@@ -3323,7 +3317,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				int count = *skinningIndexData++;
 				uint8_t* dstNormalBytes = destNormal + *skinningIndexData++;
 
-				if (matrix0Identity) {
+				if constexpr (matrix0Identity) {
 					do {
 						V3d srcNormal = { static_cast<float32>(srcNormalBytes[0]), static_cast<float32>(srcNormalBytes[1]), static_cast<float32>(srcNormalBytes[2]) };
 						
@@ -3339,9 +3333,9 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 						srcNormalBytes += vertexSize;
 						V3d* dstNormal = (V3d*)(dstNormalBytes);
 						dstNormalBytes += 64;
-						float x, y, z, w;
-						mat_trans_nodiv_nomod_zerow(srcNormal.x, srcNormal.y, srcNormal.z, x, y, z, w);
-						*dstNormal = { x, y, z };
+						float x, y, z;
+						mat_trans_normal3_nomod(srcNormal.x, srcNormal.y, srcNormal.z,
+                                                dstNormal->x, dstNormal->y, dstNormal->z);
 					} while(--count != 0);
 				}
 			} else if (!(flags & 0x80)) {
@@ -3371,7 +3365,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				break;
 			}
 
-			mat_load_3x3(currentMatrix);
+			mat_load2(*currentMatrix);
 
 			do {
 				auto srcOffset = *skinningIndexData++;
@@ -3383,8 +3377,8 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve
 				auto dstNormal = (V3d*)(destNormal + dstOffset);
 
 				V3d tmp;
-				float w;
-				mat_trans_nodiv_nomod_zerow(srcNormal.x, srcNormal.y, srcNormal.z, tmp.x, tmp.y, tmp.z, w);
+				mat_trans_normal3_nomod(srcNormal.x, srcNormal.y, srcNormal.z,
+                                        tmp.x, tmp.y, tmp.z);
 				tmp = scale(tmp, *skinningWeightData++ / 255.0f);
 				*dstNormal = add(*dstNormal, tmp);
 			} while (--count != 0);
@@ -3498,22 +3492,20 @@ uploadSkinMatrices(Atomic *a, Matrix* skinMatrices)
 
 	if(hier){
 		Matrix *invMats = (Matrix*)skin->inverseMatrices;
-		Matrix tmp;
 
 		assert(skin->numBones == hier->numNodes);
 		if(hier->flags & HAnimHierarchy::LOCALSPACEMATRICES){
 			for(i = 0; i < hier->numNodes; i++){
-				invMats[i].flags = 0;
-				Matrix::mult(m, &invMats[i], &hier->matrices[i]);
+				mat_mult(*m, invMats[i], hier->matrices[i]);
 				m++;
 			}
 		}else{
 			Matrix invAtmMat;
 			Matrix::invert(&invAtmMat, a->getFrame()->getLTM());
 			for(i = 0; i < hier->numNodes; i++){
-				invMats[i].flags = 0;
-				Matrix::mult(&tmp, &hier->matrices[i], &invAtmMat);
-				Matrix::mult(m, &invMats[i], &tmp);
+				mat_load_apply(invAtmMat, hier->matrices[i]);
+				mat_apply(invMats[i]);
+                mat_store2(*m);
 				m++;
 			}
 		}
@@ -3547,13 +3539,13 @@ uploadEnvMatrix(Frame *frame, RawMatrix *world, matrix_t* envMatrix)
 	RawMatrix *envMtx = (RawMatrix*)envMatrix;
 	{
 
-		RawMatrix invMtx;
+		//RawMatrix invMtx;
 		Matrix::invert(&invMat, frame->getLTM());
-		convMatrix(&invMtx, &invMat);
-		invMtx.pos.set(0.0f, 0.0f, 0.0f);
+		//convMatrix(&invMtx, &invMat);
+		//invMtx.pos.set(0.0f, 0.0f, 0.0f);
 		float uscale = fabs(normal2texcoord.right.x);
 		normal2texcoord.right.x = MatFX::envMapFlipU ? -uscale : uscale;
-		
+#if 0
 		RawMatrix tmpMtx;
 
 		RawMatrix::mult(&tmpMtx, &invMtx, &normal2texcoord);
@@ -3563,6 +3555,11 @@ uploadEnvMatrix(Frame *frame, RawMatrix *world, matrix_t* envMatrix)
 		world->upw = 0;
 		world->atw = 0;
 		RawMatrix::mult(envMtx, world, &tmpMtx);
+#else
+        mat_load_apply(normal2texcoord, invMat);
+        mat_apply(*world);
+        mat_store2(envMatrix);
+#endif
 	}
 }
 
@@ -3812,12 +3809,9 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 	ac->skinMatrix0Identity = skinMatrix0Identity;
 
 	lightingCB(atomic, ac->uniform);
-
-	rw::RawMatrix world;
-	rw::convMatrix(&world, atomic->getFrame()->getLTM());
 	
 	mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView);
-	mat_apply((matrix_t*)&world);
+	mat_apply(*atomic->getFrame()->getLTM());
 	mat_store2((matrix_t*)&atomicContexts.back().mtx);
 
 	auto meshes = geo->meshHeader->getMeshes();
@@ -3841,7 +3835,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) {
 			matfxContextPointer = &matfxContexts.back();
 			// N.B. world here gets converted to a 3x3 matrix
 			// 		this is fine, as we only use it for env mapping from now on
-			uploadEnvMatrix(matfx->fx[0].env.frame, &world, &matfxContexts.back().mtx);
+			uploadEnvMatrix(matfx->fx[0].env.frame, reinterpret_cast<rw::RawMatrix*>(atomic->getFrame()->getLTM()), &matfxContexts.back().mtx);
 			matfxContextPointer->coefficient = matfxCoefficient;
 			
 			pvr_poly_cxt_t cxt;
diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h
index 0c40af52..fbd7b9ea 100644
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -384,17 +384,71 @@ inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) {
             fmov	dr2, dr12
             fldi0	fr14
             fschg
+            fmov.s	@%[x], fr0
+            fmov.s	@%[y], fr5
+            fmov.s	@%[z], fr10
+            fldi1   fr15
             frchg
-            fmov    %[x], xf0
-            fmov    %[y], xf5
-            fmov    %[z], xf10
         )"
         :
-        : [x] "r" (x), [y] "r" (y), [z] "r" (z)
+        : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)
         :
     );
 }
 
+inline __hot __icache_aligned void mat_set_scale(float s) {
+    asm volatile(
+        R"(
+            frchg
+            fldi0	fr1
+            fschg
+            fldi0	fr2
+            fldi0	fr3
+            fldi0	fr4
+            fmov	dr2, dr6
+            fmov	dr2, dr8
+            fldi0	fr11
+            fmov	dr2, dr12
+            fldi0	fr14
+            fschg
+            fmov.s	@%[s], fr0
+            fmov    fr0, fr5
+            fmov	fr0, fr10
+            fldi1   fr15
+            frchg
+        )"
+        :
+        : [s] "r" (&s)
+        :
+    );
+}
+
+inline __hot __icache_aligned void mat_set_translation(float x, float y, float z) {
+    asm volatile(
+        R"(
+            frchg
+            fldi1	fr0
+            fschg
+            fldi0	fr1
+            fldi0	fr2
+            fldi0	fr3
+            fldi0	fr4
+            fldi1	fr5
+            fmov	dr2,dr6
+            fmov	dr2,dr8
+            fmov	dr0,dr10
+            fschg
+            fmov.s  @%[x], fr12
+            fmov.s  @%[y], fr13
+            fmov.s  @%[z], fr14
+            fldi1   fr15
+            frchg
+        )"
+        :
+        : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)
+    );
+}
+
 // no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
 inline __hot void mat_load_3x3(const rw::Matrix* mtx) {
     __asm__ __volatile__ (
@@ -747,6 +801,30 @@ __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, c
     }
 }
 
+template<bool FAST_APPROX=true>
+__always_inline constexpr float fipr2D(float x1, float y1, float x2, float y2) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+        register float v1x asm("fr0") = x1;
+        register float v1y asm("fr1") = y1;
+        register float res asm("fr3");
+
+        register float v2x asm("fr4") = x2;
+        register float v2y asm("fr5") = y2;
+
+        asm volatile(R"(
+                fldi0   fr2
+                fldi0   fr3
+                fipr    fv4, fv0
+            )"
+            : "=f" (res)
+            : "f" (v1x), "f" (v1y), "f" (v2x), "f" (v2y));
+
+        return res;
+    } else {
+        return (x1 * x2 + y1 * y2);
+    }
+}
+
 #   else
 #       ifdef DC_TEXCONV
 #           define mat_apply(a)
@@ -778,6 +856,12 @@ __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, c
         y2 = tmp.y; \
     } while(false)
 
+#define mat_trans_normal3_nomod(x_, y_, z_, x2, y2, z2) do { \
+        vector_t tmp = { x_, y_, z_, 0.0f }; \
+        mat_transform(&tmp, &tmp, 1, 0); \
+        x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \
+    } while(false)
+
 #define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
         vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \
         mat_transform(&tmp1233123, &tmp1233123, 1, 0); \

From f0b38ff86ff93b924d9ab1b829f66303724b78da Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Sat, 3 May 2025 14:24:46 -0500
Subject: [PATCH 6/6] Saving state.

- Good place for gainz in liberty
- Cutting off PR
- known issue: boats' rotations while driving are incorrect.
---
 liberty/Makefile                        |   24 +-
 src/liberty/animation/AnimBlendNode.cpp |    6 +-
 src/liberty/core/Cam.cpp                |   18 +-
 src/liberty/core/Camera.cpp             |    2 +-
 src/liberty/math/Matrix.cpp             |   66 +-
 src/liberty/renderer/PointLights.cpp    |    2 +-
 src/liberty/renderer/Renderer.cpp       |    3 +-
 src/liberty/renderer/Renderer.h         |    3 +-
 vendor/librw/src/base.cpp               |    4 +-
 vendor/librw/src/camera.cpp             |   56 +-
 vendor/librw/src/dc/rwdc.cpp            |    3 +-
 vendor/librw/src/dc/rwdc_common.h       | 1064 ++++++++++++++---------
 vendor/librw/src/rwbase.h               |   17 +-
 13 files changed, 741 insertions(+), 527 deletions(-)

diff --git a/liberty/Makefile b/liberty/Makefile
index 3be60a74..82c42c51 100644
--- a/liberty/Makefile
+++ b/liberty/Makefile
@@ -104,6 +104,8 @@ OBJS_TEXCONV += \
 OBJS_O3 = \
 	../vendor/librw/src/dc/rwdc.o \
 	../src/liberty/core/World.o \
+	../src/liberty/core/Zones.o \
+	../src/liberty/core/ZoneCull.o \
 	../src/liberty/collision/Collision.o \
 	../src/liberty/math/math.o \
 	../src/liberty/math/Matrix.o \
@@ -111,11 +113,22 @@ OBJS_O3 = \
 	../src/liberty/math/Rect.o \
 	../src/liberty/math/Vector.o \
 	../vendor/librw/src/base.o \
-	../src/liberty/renderer/Shadows.o
-
-OBJS_NO_FAST_MATH = \
+	../src/liberty/renderer/Shadows.o \
+	../src/liberty/renderer/Renderer.o \
+	../src/liberty/animation/FrameUpdate.o \
+	../src/liberty/animation/RpAnimblend.o \
+	../src/liberty/control/PathFind.o \
 	../src/liberty/core/Cam.o \
-	../src/liberty/core/Camera.o
+	../src/liberty/peds/Ped.o \
+	../src/liberty/peds/PedAI.o \
+	../src/liberty/vehicles/Automobile.o
+
+# ICE list with with -O3
+OBJS_O2 = \
+	../src/liberty/animation/AnimBlendNode.o
+
+# ICE list with -ffast-math
+OBJS_NO_FAST_MATH =
 
 KOS_CPPFLAGS += -fbuiltin -ffast-math -ffp-contract=fast \
 	-mfsrra -mfsca
@@ -169,6 +182,9 @@ clean:
 $(OBJS_O3): %.o: %.cpp
 	kos-c++ $(CXXFLAGS) $(CPPFLAGS) -O3 -c $< -o $@
 
+$(OBJS_O2): %.o: %.cpp
+	kos-c++ $(CXXFLAGS) $(CPPFLAGS) -O2 -c $< -o $@
+
 $(OBJS_NO_FAST_MATH): %.o: %.cpp
 	kos-c++ $(CXXFLAGS) $(CPPFLAGS) -O3 -c $< -o $@ -fno-fast-math
 
diff --git a/src/liberty/animation/AnimBlendNode.cpp b/src/liberty/animation/AnimBlendNode.cpp
index c54a83a6..7223b2b2 100644
--- a/src/liberty/animation/AnimBlendNode.cpp
+++ b/src/liberty/animation/AnimBlendNode.cpp
@@ -39,7 +39,7 @@ CAnimBlendNode::Update(CVector &trans, CQuaternion &rot, float weight)
 	float blend = association->GetBlendAmount(weight);
 	if(blend > 0.0f){
 		float kfAdt = player->GetNextTimeDelta();
-		float t = kfAdt == 0.0f ? 0.0f : (kfAdt - remainingTime)/kfAdt;
+		float t = kfAdt == 0.0f ? 0.0f : dc::Div<true, false>(kfAdt - remainingTime, kfAdt);
 		if(player->type & CAnimBlendSequence::KF_TRANS){
 			auto kfdAt = player->GetNextTranslationDelta();
 			auto kfBt = player->GetPrevTranslation();
@@ -153,7 +153,7 @@ CAnimBlendNode::CalcDeltas(void)
 	if(cos > 1.0f)
 		cos = 1.0f;
 	theta = Acos(cos);
-	invSin = theta == 0.0f ?  0.0f : 1.0f/Sin(theta);
+	invSin = theta == 0.0f ?  0.0f : dc::Invert<true, false>(Sin(theta));
 }
 
 void
@@ -164,7 +164,7 @@ CAnimBlendNode::GetCurrentTranslation(CVector &trans, float weight)
 	float blend = association->GetBlendAmount(weight);
 	if(blend > 0.0f){
 		auto kfAdt = player->GetNextTimeDelta();
-		float t = kfAdt == 0.0f ? 0.0f : (kfAdt - remainingTime)/kfAdt;
+		float t = kfAdt == 0.0f ? 0.0f : dc::Div<true, false>(kfAdt - remainingTime, kfAdt);
 		if(player->type & CAnimBlendSequence::KF_TRANS){
 			auto kfdAt = player->GetNextTranslationDelta();
 			auto kfBt = player->GetPrevTranslation();
diff --git a/src/liberty/core/Cam.cpp b/src/liberty/core/Cam.cpp
index ff35b984..ab2dedc0 100644
--- a/src/liberty/core/Cam.cpp
+++ b/src/liberty/core/Cam.cpp
@@ -3267,24 +3267,24 @@ CCam::Process_BehindBoat(const CVector &CameraTarget, float TargetOrientation, f
 	if(m_bCollisionChecksOn || ResetStatics){
 		CVector TestPoint;
 		// Weird calculations here, also casting bool to float...
-		c = Cos(TargetOrientation);
-		s = Sin(TargetOrientation);
+		c = Cos<false>(TargetOrientation);
+		s = Sin<false>(TargetOrientation);
 		TestPoint = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) +
 			(TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) +
 			TargetCoors;
 		TestPoint.z = WaterLevel + TheCamera.CarZoomValueSmooth;
 		float Test1 = CWorld::GetIsLineOfSightClear(TestPoint, TargetCoors, true, false, false, true, false, true, true);
 
-		c = Cos(TargetOrientation + 0.8f);
-		s = Sin(TargetOrientation + DEGTORAD(40.0f));
+		c = Cos<false>(TargetOrientation + 0.8f);
+		s = Sin<false>(TargetOrientation + DEGTORAD(40.0f));
 		TestPoint = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) +
 			(TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) +
 			TargetCoors;
 		TestPoint.z = WaterLevel + TheCamera.CarZoomValueSmooth;
 		float Test2 = CWorld::GetIsLineOfSightClear(TestPoint, TargetCoors, true, false, false, true, false, true, true);
 
-		c = Cos(TargetOrientation - 0.8);
-		s = Sin(TargetOrientation - DEGTORAD(40.0f));
+		c = Cos<false>(TargetOrientation - 0.8);
+		s = Sin<false>(TargetOrientation - DEGTORAD(40.0f));
 		TestPoint = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) +
 			(TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) +
 			TargetCoors;
@@ -3307,8 +3307,7 @@ CCam::Process_BehindBoat(const CVector &CameraTarget, float TargetOrientation, f
 			DeltaBeta = TargetOrientation - Beta;
 		}
 
-		c = Cos(Beta);
-		s = Sin(Beta);
+		auto [s, c] = SinCos<false>(Beta);
 		TestPoint.x = TheCamera.CarZoomValueSmooth * -c +
 			(TheCamera.CarZoomValueSmooth + 7.0f) * -c +
 			TargetCoors.x;
@@ -3333,8 +3332,7 @@ CCam::Process_BehindBoat(const CVector &CameraTarget, float TargetOrientation, f
 		// inlined
 		WellBufferMe(TargetWhenChecksWereOn, &Beta, &BetaSpeed, 0.07f, 0.015f, true);
 
-		s = Sin(Beta);
-		c = Cos(Beta);
+		auto [s, c] = SinCos<false>(Beta);
 		Source = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) +
 			(TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) +
 			TargetCoors;
diff --git a/src/liberty/core/Camera.cpp b/src/liberty/core/Camera.cpp
index bad96834..a2a52abf 100644
--- a/src/liberty/core/Camera.cpp
+++ b/src/liberty/core/Camera.cpp
@@ -3694,7 +3694,7 @@ CCamera::IsBoxVisible(CVUVECTOR *box, const CMatrix *mat)
 #ifdef GTA_PS2
 	TransformPoints(box, 8, *mat, box);
 #else
-	#ifdef FIX_BUGS
+	#if defined(FIX_BUGS) && !defined(DC_SH4)
 		for (i = 0; i < 8; i++)
 			box[i] = *mat * box[i];
 	#else
diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp
index a282a211..9f61c35c 100644
--- a/src/liberty/math/Matrix.cpp
+++ b/src/liberty/math/Matrix.cpp
@@ -54,7 +54,7 @@ CMatrix::Detach(void)
 void
 CMatrix::Update(void)
 {
-#if 1
+#ifndef DC_SH4
 	GetRight() = m_attachment->right;
 	GetForward() = m_attachment->up;
 	GetUp() = m_attachment->at;
@@ -68,15 +68,15 @@ void
 CMatrix::UpdateRW(void)
 {
 	if (m_attachment) {
-#if 1
+#ifndef DC_SH4
 		m_attachment->right = GetRight();
 		m_attachment->up = GetForward();
 		m_attachment->at = GetUp();
 		m_attachment->pos = GetPosition();
+		RwMatrixUpdate(m_attachment);
 #else
         mat_copy(*m_attachment, *this);
 #endif
-		RwMatrixUpdate(m_attachment);
 	}
 }
 
@@ -84,8 +84,10 @@ void
 CMatrix::operator=(CMatrix const &rhs)
 {
 	mat_copy(*this, rhs);
+#ifndef DC_SH4
 	if (m_attachment)
 		UpdateRW();
+#endif
 }
 
 void
@@ -168,7 +170,7 @@ CMatrix::SetScale(float s)
 void
 CMatrix::SetTranslate(float x, float y, float z)
 {
-#if 1
+#ifndef DC_SH4
 	rx = 1.0f;
 	ry = 0.0f;
 	rz = 0.0f;
@@ -247,34 +249,52 @@ CMatrix::SetRotateZOnly(float angle)
 void
 CMatrix::SetRotateX(float angle)
 {
+#ifndef DC_SH4
 	SetRotateXOnly(angle);
 	px = 0.0f;
 	py = 0.0f;
 	pz = 0.0f;
+#else
+    dc::mat_identity2();
+    dc::mat_apply_rotate_x(angle);
+    dc::mat_store2(*this);
+#endif
 }
 
-
 void
 CMatrix::SetRotateY(float angle)
 {
+#ifndef DC_SH4
 	SetRotateYOnly(angle);
 	px = 0.0f;
 	py = 0.0f;
 	pz = 0.0f;
+#else
+    dc::mat_identity2();
+    dc::mat_apply_rotate_y(angle);
+    dc::mat_store2(*this);
+#endif
 }
 
 void
 CMatrix::SetRotateZ(float angle)
 {
+#ifndef DC_SH4
 	SetRotateZOnly(angle);
 	px = 0.0f;
 	py = 0.0f;
 	pz = 0.0f;
+#else
+    dc::mat_identity2();
+    dc::mat_apply_rotate_z(angle);
+    dc::mat_store2(*this);
+#endif
 }
 
 void
 CMatrix::SetRotate(float xAngle, float yAngle, float zAngle)
 {
+#if 1
 	auto [sX, cX] = SinCos(xAngle);
 	auto [sY, cY] = SinCos(yAngle);
 	auto [sZ, cZ] = SinCos(zAngle);
@@ -294,15 +314,19 @@ CMatrix::SetRotate(float xAngle, float yAngle, float zAngle)
 	px = 0.0f;
 	py = 0.0f;
 	pz = 0.0f;
+#else
+    dc::mat_set_rotate(xAngle, yAngle, zAngle);
+    dc::mat_store2(*this);
+#endif
 }
 
 void
 CMatrix::RotateX(float x)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
-       mat_rotate_x(x);
-       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
+#if 0// this is bugged and does not yield correct results
+    dc::mat_set_rotate_x(x);
+    mat_apply(*this);
+    dc::mat_store2(*this);
 #else
 	auto [s, c] = SinCos(x);
 
@@ -329,10 +353,10 @@ CMatrix::RotateX(float x)
 void
 CMatrix::RotateY(float y)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
-       mat_rotate_y(y);
-       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
+#if 0 // this is bugged and does not yield correct results
+    dc::mat_set_rotate_y(y);
+    mat_apply(*this);
+    dc::mat_store2(*this);
 #else
 	auto [s, c] = SinCos(y);
 
@@ -359,10 +383,10 @@ CMatrix::RotateY(float y)
 void
 CMatrix::RotateZ(float z)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
-       mat_rotate_z(z);
-       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
+#if 0// this is bugged and does not yield correct results
+       dc::mat_set_rotate_z(z);
+       mat_apply(*this);
+       dc::mat_store2(*this);
 #else	
 	auto [s, c] = SinCos(z);
 
@@ -389,10 +413,10 @@ CMatrix::RotateZ(float z)
 void
 CMatrix::Rotate(float x, float y, float z)
 {
-#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results
-       dc::mat_load2(reinterpret_cast<matrix_t *>(this));
-       mat_rotate(x, y, z);
-       dc::mat_store2(reinterpret_cast<matrix_t *>(this));
+#if 0 // this is bugged and does not yield correct results
+    dc::mat_set_rotate(x, y, z);
+    mat_apply(*this);
+    dc::mat_store2(*this);
 #else
 	auto [sX, cX] = SinCos(x);
 	auto [sY, cY] = SinCos(y);
diff --git a/src/liberty/renderer/PointLights.cpp b/src/liberty/renderer/PointLights.cpp
index 84ac4ab2..716089dc 100644
--- a/src/liberty/renderer/PointLights.cpp
+++ b/src/liberty/renderer/PointLights.cpp
@@ -80,7 +80,7 @@ CPointLights::GenerateLightsAffectingObject(Const CVector *objCoors)
 			distance = dist.Magnitude();
 			if(distance < radius){
 
-				float distNorm = distance/radius;
+				float distNorm = Div<true, false>(distance, radius);
 				if(aLights[i].type == LIGHT_DARKEN){
 					// darken the object the closer it is
 					ret *= distNorm;
diff --git a/src/liberty/renderer/Renderer.cpp b/src/liberty/renderer/Renderer.cpp
index cfdae34a..ba02e0ad 100644
--- a/src/liberty/renderer/Renderer.cpp
+++ b/src/liberty/renderer/Renderer.cpp
@@ -1315,8 +1315,9 @@ CalcNewDelta(RwV2d *a, RwV2d *b)
 #define TOINT(x) ((int)(x))
 #endif
 
+template<typename F>
 void
-CRenderer::ScanSectorPoly(RwV2d *poly, int32 numVertices, void (*scanfunc)(CPtrList *))
+CRenderer::ScanSectorPoly(RwV2d *poly, int32 numVertices, F&& scanfunc)
 {
 	float miny, maxy;
 	int y, yend;
diff --git a/src/liberty/renderer/Renderer.h b/src/liberty/renderer/Renderer.h
index 0322939c..3517a8da 100644
--- a/src/liberty/renderer/Renderer.h
+++ b/src/liberty/renderer/Renderer.h
@@ -90,7 +90,8 @@ public:
 	static void ConstructRenderList(void);
 	static void ScanWorld(void);
 	static void RequestObjectsInFrustum(void);
-	static void ScanSectorPoly(RwV2d *poly, int32 numVertices, void (*scanfunc)(CPtrList *));
+    template<typename F>
+	static void ScanSectorPoly(RwV2d *poly, int32 numVertices, F &&scanfunc);
 	static void ScanBigBuildingList(CPtrList &list);
 	static void ScanSectorList(CPtrList *lists);
 	static void ScanSectorList_Priority(CPtrList *lists);
diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp
index ff78eddb..851f8f4a 100644
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -410,7 +410,7 @@ Matrix::translate(const V3d *translation, CombineOp op)
     Matrix trans;
     dc::mat_set_translation(translation->x, translation->y, translation->z);
     dc::mat_store2(trans);
-    trans.flags = Matrix::TYPEORTHONORMAL;
+    trans.flags = TYPEORTHONORMAL;
 #endif
 	switch(op){
 	case COMBINEREPLACE:
@@ -439,7 +439,7 @@ Matrix::scale(const V3d *scale, CombineOp op)
     Matrix scl;
     dc::mat_set_scale(scale->x, scale->y, scale->z);
     dc::mat_store2(scl);
-    scl.flags = Matrix::TYPEORTHONORMAL;
+    scl.flags = TYPEORTHONORMAL;
 #endif
 	switch(op){
 	case COMBINEREPLACE:
diff --git a/vendor/librw/src/camera.cpp b/vendor/librw/src/camera.cpp
index 0d2d55af..a9b3f8e5 100644
--- a/vendor/librw/src/camera.cpp
+++ b/vendor/librw/src/camera.cpp
@@ -493,33 +493,7 @@ Camera::frustumTestSphere(const Sphere *s) const
 	static_assert(offsetof (decltype (p[0].plane), distance)
 				 -offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
 
-	asm volatile (R"(
-		frchg
-
-		fmov.s  @%0+,fr0
-		fmov.s  @%1+,fr1
-		fmov.s  @%2+,fr2
-		fmov.s  @%3+,fr3
-
-		fmov.s  @%0+,fr4
-		fmov.s  @%1+,fr5
-		fmov.s  @%2+,fr6
-		fmov.s  @%3+,fr7
-
-		fmov.s  @%0+,fr8
-		fmov.s  @%1+,fr9
-		fmov.s  @%2+,fr10
-		fmov.s  @%3+,fr11
-
-		fmov.s  @%0,fr12
-		fmov.s  @%1,fr13
-		fmov.s  @%2,fr14
-		fmov.s  @%3,fr15
-
-		frchg
-	)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
-	:
-	: );
+    mat_load_rows(base_ptr0, base_ptr1, base_ptr2, base_ptr3);
 
 	float dists[4];
 	mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
@@ -643,33 +617,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const
 	static_assert(offsetof (decltype (p[0].plane), distance)
 				 -offsetof (decltype (p[0].plane.normal), z) == sizeof (float));
 
-	asm volatile (R"(
-		frchg
-
-		fmov.s  @%0+,fr0
-		fmov.s  @%1+,fr1
-		fmov.s  @%2+,fr2
-		fmov.s  @%3+,fr3
-
-		fmov.s  @%0+,fr4
-		fmov.s  @%1+,fr5
-		fmov.s  @%2+,fr6
-		fmov.s  @%3+,fr7
-
-		fmov.s  @%0+,fr8
-		fmov.s  @%1+,fr9
-		fmov.s  @%2+,fr10
-		fmov.s  @%3+,fr11
-
-		fmov.s  @%0,fr12
-		fmov.s  @%1,fr13
-		fmov.s  @%2,fr14
-		fmov.s  @%3,fr15
-
-		frchg
-	)" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3)
-	:
-	: );
+    mat_load_rows(base_ptr0, base_ptr1, base_ptr2, base_ptr3);
 
 	float dists[4];
 	mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw,
diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp
index aa8a7c14..c6c8e29a 100644
--- a/vendor/librw/src/dc/rwdc.cpp
+++ b/vendor/librw/src/dc/rwdc.cpp
@@ -811,7 +811,8 @@ struct chunked_vector {
     }
 
     // Iterate over each element and invoke the callback.
-    void forEach(void(*cb)(T&)) {
+    template<typename F>
+    void forEach(F&& cb) {
         for (chunk* curr = first; curr; curr = curr->header.next) {
             for (size_t i = 0; i < curr->header.used; ++i) {
                 cb(curr->items[i]);
diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h
index fbd7b9ea..eab4b277 100644
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -66,9 +66,38 @@ struct quaternion_t {
     float x, y, z, w;
 };
 
-__always_inline __hot constexpr float Sin(float x) { return sinf(x); }
-__always_inline __hot constexpr float Cos(float x) { return cosf(x); }
-__always_inline __hot constexpr auto  SinCos(float x) { return std::pair { Sin(x), Cos(x) }; }
+template<bool BUILTIN=true>
+__always_inline __hot constexpr float Sin(float x) {
+#ifdef DC_SH4
+    if constexpr(!BUILTIN)
+        return fsin(x);
+    else
+#endif
+    return sinf(x);
+}
+
+template<bool BUILTIN=true>
+__always_inline __hot constexpr float Cos(float x) {
+#ifdef DC_SH4
+    if constexpr(!BUILTIN)
+        return fcos(x);
+    else
+#endif
+    return cosf(x);
+}
+
+template<bool BUILTIN=true>
+__always_inline __hot constexpr auto SinCos(float x) {
+#ifdef DC_SH4
+    if constexpr(!BUILTIN) {
+        std::pair<float, float> result;
+        fsincosr(x, &std::get<0>(result), &std::get<1>(result));
+        return result;
+    } else
+#endif
+    return std::pair { Sin(x), Cos(x) };
+}
+
 __always_inline __hot constexpr float Tan(float x) { return tanf(x); }
 __always_inline __hot constexpr float Atan(float x) { return atanf(x); }
 __always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); }
@@ -78,8 +107,73 @@ __always_inline __hot constexpr float Abs(float x) { return fabsf(x); }
 __always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); }
 __always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); }
 __always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); }
-__always_inline __hot constexpr float Floor(float x) { return floorf(x); }
-__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); }
+
+template<bool FAST_APPROX=false>
+__always_inline __hot constexpr float Floor(float x) {
+#ifdef DC_SH4
+    if(!std::is_constant_evaluated() && FAST_APPROX) {
+        float output_float;
+        unsigned int scratch_reg;
+        unsigned int scratch_reg2;
+
+        asm volatile (R"(
+            mov     #0x4f, %[scratch]
+            shll16  %[scratch]
+            shll8   %[scratch]
+            lds     %[scratch], fpul
+            mov     #1, %[scratch2]
+            fsts    fpul, %[float_out]
+            fadd    %[floatx], %[float_out]
+            rotr    %[scratch2]
+            ftrc    %[float_out], fpul
+            sts     fpul, %[scratch]
+            add     %[scratch2], %[scratch]
+            lds     %[scratch], fpul
+            float   fpul, %[float_out]
+        )"
+        : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float)
+        : [floatx] "f" (x)
+        : "fpul", "t");
+
+        return output_float;
+    } else
+#endif
+    return floorf(x);
+}
+
+template<bool FAST_APPROX=false>
+__always_inline __hot constexpr float Ceil(float x) {
+#ifdef DC_SH4
+    if(!std::is_constant_evaluated() && FAST_APPROX) {
+        float output_float;
+        unsigned int scratch_reg;
+        unsigned int scratch_reg2;
+
+        asm volatile (R"(
+            mov     #0x4f, %[scratch]
+            shll16  %[scratch]
+            shll8   %[scratch]
+            lds     %[scratch], fpul
+            mov     #1, %[scratch2]
+            fsts    fpul, %[float_out]
+            fsub    %[floatx], %[float_out]
+            rotr    %[scratch2]
+            ftrc    %[float_out], fpul
+            sts     fpul, %[scratch]
+            add     %[scratch2], %[scratch]
+            lds     %[scratch], fpul
+            float   fpul, %[float_out]
+            fneg    %[float_out]
+        )"
+        : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float)
+        : [floatx] "f" (x)
+        : "fpul", "t");
+
+        return output_float;
+    } else
+#endif
+    return ceilf(x);
+}
 __always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; }
 __always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); }
 __always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); }
@@ -94,7 +188,7 @@ __always_inline __hot constexpr float RecipSqrt(float x) {
     return 1.0f / Sqrt(x); 
 }
 
-template<typename T>
+template<typename T, bool FAST_APPROX=true>
 __always_inline __hot constexpr T Clamp(T v, auto low, auto high) {
     return std::clamp(v, static_cast<T>(low), static_cast<T>(high));
 }
@@ -185,108 +279,133 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) {
         x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
     } while(false)
 
-inline __hot __icache_aligned void mat_load2(const matrix_t *mtx) {
-    asm volatile(
-        R"(
-            fschg
-            fmov.d	@%[mtx],xd0
-            add	    #32,%[mtx]
-            pref	@%[mtx]
-            add	    #-(32-8),%[mtx]
-            fmov.d	@%[mtx]+,xd2
-            fmov.d	@%[mtx]+,xd4
-            fmov.d	@%[mtx]+,xd6
-            fmov.d	@%[mtx]+,xd8
-            fmov.d	@%[mtx]+,xd10
-            fmov.d	@%[mtx]+,xd12
-            fmov.d	@%[mtx]+,xd14
-            fschg
-        )"
-        : [mtx] "+r" (mtx)
-        :
-        :
-    );
+inline __hot __icache_aligned
+void mat_load2(const matrix_t *mtx) {
+    asm volatile(R"(
+        fschg
+        fmov.d	@%[mtx],xd0
+        add	    #32,%[mtx]
+        pref	@%[mtx]
+        add	    #-(32-8),%[mtx]
+        fmov.d	@%[mtx]+,xd2
+        fmov.d	@%[mtx]+,xd4
+        fmov.d	@%[mtx]+,xd6
+        fmov.d	@%[mtx]+,xd8
+        fmov.d	@%[mtx]+,xd10
+        fmov.d	@%[mtx]+,xd12
+        fmov.d	@%[mtx]+,xd14
+        fschg
+    )"
+    : [mtx] "+r" (mtx));
 }
 
-inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) {
-    asm volatile(
-        R"(
-            frchg
+inline __hot __icache_aligned
+void mat_load_transpose(const matrix_t *mtx) {
+    asm volatile(R"(
+        frchg
 
-            fmov.s  @%[mtx]+, fr0
+        fmov.s  @%[mtx]+, fr0
 
-            add     #32, %[mtx]
-            pref    @%[mtx]
-            add     #-(32 - 4), %[mtx]
+        add     #32, %[mtx]
+        pref    @%[mtx]
+        add     #-(32 - 4), %[mtx]
 
-            fmov.s  @%[mtx]+, fr4
-            fmov.s  @%[mtx]+, fr8
-            fmov.s  @%[mtx]+, fr12
+        fmov.s  @%[mtx]+, fr4
+        fmov.s  @%[mtx]+, fr8
+        fmov.s  @%[mtx]+, fr12
 
-            fmov.s  @%[mtx]+, fr1
-            fmov.s  @%[mtx]+, fr5
-            fmov.s  @%[mtx]+, fr9
-            fmov.s  @%[mtx]+, fr13
+        fmov.s  @%[mtx]+, fr1
+        fmov.s  @%[mtx]+, fr5
+        fmov.s  @%[mtx]+, fr9
+        fmov.s  @%[mtx]+, fr13
 
-            fmov.s  @%[mtx]+, fr2
-            fmov.s  @%[mtx]+, fr6
-            fmov.s  @%[mtx]+, fr10
-            fmov.s  @%[mtx]+, fr14
+        fmov.s  @%[mtx]+, fr2
+        fmov.s  @%[mtx]+, fr6
+        fmov.s  @%[mtx]+, fr10
+        fmov.s  @%[mtx]+, fr14
 
-            fmov.s  @%[mtx]+, fr3
-            fmov.s  @%[mtx]+, fr7
-            fmov.s  @%[mtx]+, fr11
-            fmov.s  @%[mtx]+, fr15
+        fmov.s  @%[mtx]+, fr3
+        fmov.s  @%[mtx]+, fr7
+        fmov.s  @%[mtx]+, fr11
+        fmov.s  @%[mtx]+, fr15
 
-            frchg
-        )"
-        : [mtx] "+r" (mtx)
-        :
-        :
-    );
+        frchg
+    )"
+    : [mtx] "+r" (mtx));
 }
 
-inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) {
-    asm volatile(
-        R"(
-            frchg
+inline __hot __icache_aligned
+void mat_load_rows(const float *r1, const float *r2, const float *r3, const float *r4) {
+    asm volatile (R"(
+		frchg
 
-            fmov.s  @%[mtx]+, fr0
+        pref    @%1
+		fmov.s  @%0+,fr0
+		fmov.s  @%0+,fr4
+		fmov.s  @%0+,fr8
+		fmov.s  @%0, fr12
 
-            add     #32, %[mtx]
-            pref    @%[mtx]
-            add     #-(32 - 4), %[mtx]
+        pref    @%2
+        fmov.s  @%1+,fr1
+		fmov.s  @%1+,fr5
+		fmov.s  @%1+,fr9
+		fmov.s  @%1, fr13
 
-            fmov.s  @%[mtx]+, fr4
-            fmov.s  @%[mtx]+, fr8
-            fldi0   fr12
-            add     #4, %[mtx]
+        pref    @%3
+        fmov.s  @%2+,fr2
+		fmov.s  @%2+,fr6
+		fmov.s  @%2+,fr10
+		fmov.s  @%2,fr14
 
-            fmov.s  @%[mtx]+, fr1
-            fmov.s  @%[mtx]+, fr5
-            fmov.s  @%[mtx]+, fr9
-            fldi0   fr13
-            add     #4, %[mtx]
+        fmov.s  @%3+,fr3
+		fmov.s  @%3+,fr7
+		fmov.s  @%3+,fr11
+		fmov.s  @%3,fr15
 
-            fmov.s  @%[mtx]+, fr2
-            fmov.s  @%[mtx]+, fr6
-            fmov.s  @%[mtx]+, fr10
-            fldi0   fr14
-
-            fldi0  fr3
-            fldi0  fr7
-            fmov   fr3, fr11
-            fldi1  fr15
-
-            frchg
-        )"
-        : [mtx] "+r" (mtx)
-        :
-        :
-    );
+		frchg
+	)"
+    : "+&r" (r1), "+&r" (r2), "+&r" (r3), "+&r" (r4));
 }
 
-inline __hot __icache_aligned void mat_invert_tranpose() {
+inline __hot __icache_aligned
+void mat_load_3x3_transpose(const matrix_t *mtx) {
+    asm volatile(R"(
+        frchg
+
+        fmov.s  @%[mtx]+, fr0
+
+        add     #32, %[mtx]
+        pref    @%[mtx]
+        add     #-(32 - 4), %[mtx]
+
+        fmov.s  @%[mtx]+, fr4
+        fmov.s  @%[mtx]+, fr8
+        fldi0   fr12
+        add     #4, %[mtx]
+
+        fmov.s  @%[mtx]+, fr1
+        fmov.s  @%[mtx]+, fr5
+        fmov.s  @%[mtx]+, fr9
+        fldi0   fr13
+        add     #4, %[mtx]
+
+        fmov.s  @%[mtx]+, fr2
+        fmov.s  @%[mtx]+, fr6
+        fmov.s  @%[mtx]+, fr10
+        fldi0   fr14
+
+        fldi0  fr3
+        fldi0  fr7
+        fmov   fr3, fr11
+        fldi1  fr15
+
+        frchg
+    )"
+    : [mtx] "+r" (mtx));
+}
+
+inline __hot __icache_aligned
+void mat_invert_tranpose() {
 	asm volatile(
 		"frchg\n\t"
 		"fneg	fr12\n\t"
@@ -323,235 +442,249 @@ inline __hot __icache_aligned void mat_invert_tranpose() {
 		:);
 }
 
-inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
-    asm volatile(
-        R"(
-            fschg
-            add	    #64-8,%[mtx]
-            fmov.d	xd14,@%[mtx]
-            add	    #-32,%[mtx]
-            pref	@%[mtx]
-            add	    #32,%[mtx]
-            fmov.d	xd12,@-%[mtx]
-            fmov.d	xd10,@-%[mtx]
-            fmov.d	xd8,@-%[mtx]
-            fmov.d	xd6,@-%[mtx]
-            fmov.d	xd4,@-%[mtx]
-            fmov.d	xd2,@-%[mtx]
-            fmov.d	xd0,@-%[mtx]
-            fschg
-        )"
-        : [mtx] "+&r" (mtx), "=m" (*mtx)
-        :
-        :
-    );
+inline __hot __icache_aligned
+void mat_store2(matrix_t *mtx) {
+    asm volatile(R"(
+        fschg
+        add	    #64-8,%[mtx]
+        fmov.d	xd14,@%[mtx]
+        add	    #-32,%[mtx]
+        pref	@%[mtx]
+        add	    #32,%[mtx]
+        fmov.d	xd12,@-%[mtx]
+        fmov.d	xd10,@-%[mtx]
+        fmov.d	xd8,@-%[mtx]
+        fmov.d	xd6,@-%[mtx]
+        fmov.d	xd4,@-%[mtx]
+        fmov.d	xd2,@-%[mtx]
+        fmov.d	xd0,@-%[mtx]
+        fschg
+    )"
+    : [mtx] "+&r" (mtx), "=m" (*mtx));
 }
 
-inline __hot __icache_aligned void mat_identity2(void) {
-    asm volatile(
-        R"(
-            frchg
-            fldi1	fr0
-            fschg
-            fldi0	fr1
-            fldi0	fr2
-            fldi0	fr3
-            fldi0	fr4
-            fldi1	fr5
-            fmov	dr2,dr6
-            fmov	dr2,dr8
-            fmov	dr0,dr10
-            fmov	dr2,dr12
-            fmov	dr4,dr14
-            fschg
-            frchg
-        )"
-    );
+inline __hot __icache_aligned
+void mat_identity2(void) {
+    asm volatile(R"(
+        frchg
+        fldi1	fr0
+        fschg
+        fldi0	fr1
+        fldi0	fr2
+        fldi0	fr3
+        fldi0	fr4
+        fldi1	fr5
+        fmov	dr2,dr6
+        fmov	dr2,dr8
+        fmov	dr0,dr10
+        fmov	dr2,dr12
+        fmov	dr4,dr14
+        fschg
+        frchg
+    )");
 }
 
-inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) {
-    asm volatile(
-        R"(
-            frchg
-            fldi0	fr1
-            fschg
-            fldi0	fr2
-            fldi0	fr3
-            fldi0	fr4
-            fmov	dr2, dr6
-            fmov	dr2, dr8
-            fldi0	fr11
-            fmov	dr2, dr12
-            fldi0	fr14
-            fschg
-            fmov.s	@%[x], fr0
-            fmov.s	@%[y], fr5
-            fmov.s	@%[z], fr10
-            fldi1   fr15
-            frchg
-        )"
-        :
-        : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)
-        :
-    );
+inline __hot __icache_aligned
+void mat_set_scale(float x, float y, float z) {
+    asm volatile(R"(
+        frchg
+        fldi0	fr1
+        fschg
+        fldi0	fr2
+        fldi0	fr3
+        fldi0	fr4
+        fmov	dr2, dr6
+        fmov	dr2, dr8
+        fldi0	fr11
+        fmov	dr2, dr12
+        fldi0	fr14
+        fschg
+        fmov.s	@%[x], fr0
+        fmov.s	@%[y], fr5
+        fmov.s	@%[z], fr10
+        fldi1   fr15
+        frchg
+    )"
+    :
+    : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z));
 }
 
-inline __hot __icache_aligned void mat_set_scale(float s) {
-    asm volatile(
-        R"(
-            frchg
-            fldi0	fr1
-            fschg
-            fldi0	fr2
-            fldi0	fr3
-            fldi0	fr4
-            fmov	dr2, dr6
-            fmov	dr2, dr8
-            fldi0	fr11
-            fmov	dr2, dr12
-            fldi0	fr14
-            fschg
-            fmov.s	@%[s], fr0
-            fmov    fr0, fr5
-            fmov	fr0, fr10
-            fldi1   fr15
-            frchg
-        )"
-        :
-        : [s] "r" (&s)
-        :
-    );
+inline __hot __icache_aligned
+void mat_set_scale(float s) {
+    asm volatile(R"(
+        frchg
+        fldi0	fr1
+        fschg
+        fldi0	fr2
+        fldi0	fr3
+        fldi0	fr4
+        fmov	dr2, dr6
+        fmov	dr2, dr8
+        fldi0	fr11
+        fmov	dr2, dr12
+        fldi0	fr14
+        fschg
+        fmov.s	@%[s], fr0
+        fmov    fr0, fr5
+        fmov	fr0, fr10
+        fldi1   fr15
+        frchg
+    )"
+    :
+    : [s] "r" (&s));
 }
 
-inline __hot __icache_aligned void mat_set_translation(float x, float y, float z) {
-    asm volatile(
-        R"(
-            frchg
-            fldi1	fr0
-            fschg
-            fldi0	fr1
-            fldi0	fr2
-            fldi0	fr3
-            fldi0	fr4
-            fldi1	fr5
-            fmov	dr2,dr6
-            fmov	dr2,dr8
-            fmov	dr0,dr10
-            fschg
-            fmov.s  @%[x], fr12
-            fmov.s  @%[y], fr13
-            fmov.s  @%[z], fr14
-            fldi1   fr15
-            frchg
-        )"
-        :
-        : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)
-    );
+// Don't multiply anything by fr3, since not loading from vector????
+inline __hot __icache_aligned
+void mat_apply_scale(float x, float y, float z) {
+    asm volatile(R"(
+        fschg
+        fmov	xd0, dr4
+        fmov	xd2, dr6
+        fschg
+
+        frchg
+        fmov.s  @%[x], fr0
+        fmov.s  @%[y], fr1
+        fmov.s  @%[z], fr2
+
+        fmul	fr0, fr4
+        fmul	fr0, fr8
+        fmul	fr0, fr12
+        fmul	fr1, fr5
+        fmul	fr1, fr9
+        fmul	fr1, fr13
+        fmul	fr2, fr6
+        fmul	fr2, fr10
+        fmul	fr2, fr14
+
+        fschg
+        fmov	dr4, xd0
+        fmul	fr3, fr7
+        fmov	dr6, xd2
+        fmul	fr3, fr11
+        fmov	xd4, dr4
+        fmul	fr3, fr15
+        fmov	xd6, dr6
+        fschg
+
+        fmul	fr4, fr0
+
+        fmul	fr5, fr1
+        fmul	fr6, fr2
+        fmul	fr7, fr3
+
+        fschg
+        fmov	xd0, dr4
+        fmov	xd2, dr6
+        fschg
+
+        frchg
+    )"
+    :
+    : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)
+    : "fr0", "fr1", "fr2");
 }
 
-// no declspec naked, so can't do rts / fschg. instead compiler pads with nop?
-inline __hot void mat_load_3x3(const rw::Matrix* mtx) {
-    __asm__ __volatile__ (
-        R"(
-            fschg
-            frchg
+inline __hot __icache_aligned
+void mat_apply_translation(float x, float y, float z) {
+    asm volatile(R"(
+        fschg
+        fmov	xd12, dr4
+        fmov	xd14, dr6
+        fschg
 
-            fmov        @%[mtx]+, dr0
+        fmov.s  @%[x], fr0
+        fmov.s  @%[y], fr1
+        fmov.s  @%[z], fr2
 
-            fldi0 		fr12
-            fldi0 		fr13
+        fadd	fr0, fr4
+        fadd	fr1, fr5
+        fadd	fr2, fr6
 
-            fmov        @%[mtx]+, dr2
-            fmov        @%[mtx]+, dr4
-            fmov        @%[mtx]+, dr6
-            fmov        @%[mtx]+, dr8
-            fmov        @%[mtx]+, dr10
-
-            fldi0	    fr3
-            fldi0	    fr7
-            fldi0	    fr11
-            fmov        dr12, dr14
-
-            fschg
-            frchg
-        )"
-        : [mtx] "+r" (mtx)
-    );
+        fschg
+        fmov	dr4, xd12
+        fmov	dr6, xd14
+        fschg
+    )"
+    :
+    : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)
+    : "fr0", "fr1", "fr2", "fr4", "fr5", "fr6", "fr7");
 }
 
-// sets pos.w to 1
-inline __hot void mat_load_4x4(const rw::Matrix* mtx) {
-		__asm__ __volatile__ (
-			R"(
-				fschg
-				frchg
-				fmov        @%[mtx]+, dr0
-
-				fmov        @%[mtx]+, dr2
-				fmov        @%[mtx]+, dr4
-				fmov        @%[mtx]+, dr6
-				fmov        @%[mtx]+, dr8
-				fmov        @%[mtx]+, dr10
-				fmov        @%[mtx]+, dr12
-				fmov        @%[mtx]+, dr14
-				fldi1 	 	fr15
-
-				fschg
-				frchg
-			)"
-			: [mtx] "+r" (mtx)
-		);
-	}
-
-__hot inline void mat_transpose(void) {
-    asm volatile (
-        "frchg\n\t" // fmov for singles only works on front bank
-        // FR0, FR5, FR10, and FR15 are already in place
-        // swap FR1 and FR4
-        "flds FR1, FPUL\n\t"
-        "fmov FR4, FR1\n\t"
-        "fsts FPUL, FR4\n\t"
-        // swap FR2 and FR8
-        "flds FR2, FPUL\n\t"
-        "fmov FR8, FR2\n\t"
-        "fsts FPUL, FR8\n\t"
-        // swap FR3 and FR12
-        "flds FR3, FPUL\n\t"
-        "fmov FR12, FR3\n\t"
-        "fsts FPUL, FR12\n\t"
-        // swap FR6 and FR9
-        "flds FR6, FPUL\n\t"
-        "fmov FR9, FR6\n\t"
-        "fsts FPUL, FR9\n\t"
-        // swap FR7 and FR13
-        "flds FR7, FPUL\n\t"
-        "fmov FR13, FR7\n\t"
-        "fsts FPUL, FR13\n\t"
-        // swap FR11 and FR14
-        "flds FR11, FPUL\n\t"
-        "fmov FR14, FR11\n\t"
-        "fsts FPUL, FR14\n\t"
-        // restore XMTRX to back bank
-        "frchg\n"
-        : // no outputs
-        : // no inputs
-        : "fpul" // clobbers
-    );
+inline __hot __icache_aligned
+void mat_set_translation(float x, float y, float z) {
+    asm volatile(R"(
+        frchg
+        fldi1	fr0
+        fschg
+        fldi0	fr1
+        fldi0	fr2
+        fldi0	fr3
+        fldi0	fr4
+        fldi1	fr5
+        fmov	dr2,dr6
+        fmov	dr2,dr8
+        fmov	dr0,dr10
+        fschg
+        fmov.s  @%[x], fr12
+        fmov.s  @%[y], fr13
+        fmov.s  @%[z], fr14
+        fldi1   fr15
+        frchg
+    )"
+    :
+    : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z));
 }
 
-__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) {
+inline __hot __icache_aligned
+void mat_transpose(void) {
+    asm volatile (R"(
+        frchg
+
+        flds    fr1, fpul
+        fmov    fr4, fr1
+        fsts    fpul, fr4
+
+        flds    fr2, fpul
+        fmov    fr8, fr2
+        fsts    fpul, fr8
+
+        flds    fr3, fpul
+        fmov    fr12, fr3
+        fsts    fpul, fr12
+
+        flds    fr6, fpul
+        fmov    fr9, fr6
+        fsts    fpul, fr9
+
+        flds    fr7, fpul
+        fmov    fr13, fr7
+        fsts    fpul, fr13
+
+        flds    fr11, fpUL
+        fmov    fr14, fr11
+        fsts    fpul, fr14
+
+        frchg
+    )"
+    :
+    :
+    : "fpul");
+}
+
+__hot __icache_aligned inline
+void mat_copy(matrix_t *dst, const matrix_t *src) {
     asm volatile(R"(
         fschg
 
         pref    @%[dst]
-
         fmov.d  @%[src]+, xd0
         fmov.d  @%[src]+, xd2
         fmov.d  @%[src]+, xd4
         fmov.d  @%[src]+, xd6
     
         pref    @%[src]
-
         add     #32, %[dst]
 
         fmov.d  xd6, @-%[dst]
@@ -568,141 +701,243 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src)
         fmov.d  @%[src]+, xd6
 
         add     #32, %[dst]
-
         fmov.d  xd6, @-%[dst]
         fmov.d  xd4, @-%[dst]
         fmov.d  xd2, @-%[dst]
         fmov.d  xd0, @-%[dst]
 
         fschg
-    )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst)
-      :
-      :);
+    )"
+    : [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst));
 }
 
-__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
+constexpr float fsca_scale = 10430.37835f;
+
+__hot __icache_aligned inline
+void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
     unsigned int prefetch_scratch;
 
-    asm volatile (
-        "mov %[bmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE)
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // back matrix
-        "fmov.d @%[bmtrx]+, XD0\n\t" // (LS)
-        "fmov.d @%[bmtrx]+, XD2\n\t"
-        "fmov.d @%[bmtrx]+, XD4\n\t"
-        "fmov.d @%[bmtrx]+, XD6\n\t"
-        "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS)
-        "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here
-        "fmov.d @%[bmtrx]+, XD10\n\t"
-        "fmov.d @%[bmtrx]+, XD12\n\t"
-        "mov %[fmtrx], %[pref_scratch]\n\t" // (MT)
-        "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle)
-        "fmov.d @%[bmtrx], XD14\n\t"
-        "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS)
-        // front matrix
-        // interleave loads and matrix multiply 4x4
-        "fmov.d @%[fmtrx]+, DR0\n\t"
-        "fmov.d @%[fmtrx]+, DR2\n\t"
-        "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec
-        "ftrv XMTRX, FV0\n\t" // (FE)
+    asm volatile (R"(
+        mov     %[m1], %[prefscr]
+        add     #32, %[prefscr]
+        fschg
+        pref    @%[prefscr]
 
-        "fmov.d @%[fmtrx]+, DR6\n\t"
-        "fmov.d @%[fmtrx]+, DR8\n\t"
-        "ftrv XMTRX, FV4\n\t"
+        fmov.d  @%[m1]+, xd0
+        fmov.d  @%[m1]+, xd2
+        fmov.d  @%[m1]+, xd4
+        fmov.d  @%[m1]+, xd6
+        pref    @%[m1]
+        fmov.d  @%[m1]+, xd8
+        fmov.d  @%[m1]+, xd10
+        fmov.d  @%[m1]+, xd12
+        mov     %[m2], %[prefscr]
+        add     #32, %[prefscr]
+        fmov.d  @%[m1], xd14
+        pref    @%[prefscr]
 
-        "fmov.d @%[fmtrx]+, DR10\n\t"
-        "fmov.d @%[fmtrx]+, DR12\n\t"
-        "ftrv XMTRX, FV8\n\t"
+        fmov.d  @%[m2]+, dr0
+        fmov.d  @%[m2]+, dr2
+        fmov.d  @%[m2]+, dr4
+        ftrv    xmtrx, fv0
 
-        "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles)
-        "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE)
-        "ftrv XMTRX, FV12\n\t" // (FE)
-        // Save output in XF regs
-        "frchg\n"
-        : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
-        : // no inputs
-        : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered)
-    );
+        fmov.d  @%[m2]+, dr6
+        fmov.d  @%[m2]+, dr8
+        ftrv    xmtrx, fv4
+
+        fmov.d  @%[m2]+, dr10
+        fmov.d  @%[m2]+, dr12
+        ftrv    xmtrx, fv8
+
+        fmov.d  @%[m2], dr14
+        fschg
+        ftrv    xmtrx, fv12
+        frchg
+    )"
+    : [m1] "+&r" (matrix1), [m2] "+r" (matrix2), [prefscr] "=&r" (prefetch_scratch)
+    :
+    : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15");
 }
 
-__hot inline void mat_apply_rotate_x(float x) {
-    x *= 10430.37835f;
-    asm volatile(
-        "ftrc	%[a], fpul\n\t"
-        "fsca 	fpul, dr4\n\t"
-        "fldi0	fr8\n\t"
-        "fldi0	fr11\n\t"
-        "fmov	fr5, fr10\n\t"
-        "fmov	fr4, fr9\n\t"
-        "fneg	fr9\n\t"
-        "ftrv	xmtrx, fv8\n\t"
-        "fmov	fr4, fr6\n\t"
-        "fldi0	fr7\n\t"
-        "fldi0	fr4\n\t"
-        "ftrv	xmtrx, fv4\n\t"
-        "fschg\n\t"
-        "fmov	dr8, xd8\n\t"
-        "fmov	dr10, xd10\n\t"
-        "fmov	dr4, xd4\n\t"
-        "fmov	dr6, xd6\n\t"
-        "fschg\n"
-        :
-        : [a] "f"(x)
-        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+__hot __icache_aligned inline void mat_set_rotate_x(float x) {
+    x *= fsca_scale;
+    asm volatile(R"(
+        ftrc    %[x], fpul
+		frchg
+        fldi0   fr1
+		fldi0	fr2
+		fldi0	fr3
+        fldi0	fr7
+		fldi0	fr8
+		fldi0	fr12
+		fldi0	fr13
+		fsca	fpul, dr0
+		fldi0	fr4
+		fldi0	fr11
+		fldi0	fr14
+		fldi1	fr15
+		fmov	fr1, fr5
+		fmov	fr1, fr10
+		fmov	fr0, fr9
+		fmov	fr0, fr6
+        fneg    fr6
+		fldi1	fr0
+		fldi0	fr1
+		frchg
+    )"
+    :
+    : [x] "f" (x)
+    : "fpul");
 }
 
-__hot inline void mat_apply_rotate_y(float y) {
-    y *= 10430.37835f;
-    asm volatile(
-        "ftrc	%[a], fpul\n\t"
-        "fsca   fpul, dr6\n\t"
-        "fldi0	fr9\n\t"
-        "fldi0	fr11\n\t"
-        "fmov	fr6, fr8\n\t"
-        "fmov	fr7, fr10\n\t"
-        "ftrv	xmtrx, fv8\n\t"
-        "fmov	fr7, fr4\n\t"
-        "fldi0	fr5\n\t"
-        "fneg	fr6\n\t"
-        "fldi0	fr7\n\t"
-        "ftrv	xmtrx, fv4\n\t"
-        "fschg\n\t"
-        "fmov	dr8, xd8\n\t"
-        "fmov	dr10, xd10\n\t"
-        "fmov	dr4, xd0\n\t"
-        "fmov	dr6, xd2\n\t"
-        "fschg\n"
-        :
-        : [a] "f"(y)
-        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+__hot __icache_aligned inline void mat_apply_rotate_x(float x) {
+    x *= fsca_scale;
+    asm volatile(R"(
+        ftrc	%[x], fpul
+        fsca 	fpul, dr4
+        fldi0	fr8
+        fldi0	fr11
+        fmov	fr5, fr10
+        fmov	fr4, fr9
+        fneg	fr9
+        ftrv	xmtrx, fv8
+        fmov	fr4, fr6
+        fldi0	fr7
+        fldi0	fr4
+        ftrv	xmtrx, fv4
+        fschg
+        fmov	dr8, xd8
+        fmov	dr10, xd10
+        fmov	dr4, xd4
+        fmov	dr6, xd6
+        fschg
+    )"
+    :
+    : [x] "f"(x)
+    : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
 }
 
-__hot inline void mat_apply_rotate_z(float z) {
-    z *= 10430.37835f;
-    asm volatile(
-        "ftrc	%[a], fpul\n\t"
-        "fsca   fpul, dr8\n\t"
-        "fldi0	fr10\n\t"
-        "fldi0	fr11\n\t"
-        "fmov	fr8, fr5\n\t"
-        "fneg	fr8\n\t"
-        "ftrv	xmtrx, fv8\n\t"
-        "fmov	fr9, fr4\n\t"
-        "fschg\n\t"
-        "fmov	dr10, dr6\n\t"
-        "ftrv	xmtrx, fv4\n\t"
-        "fmov	dr8, xd4\n\t"
-        "fmov	dr10, xd6\n\t"
-        "fmov	dr4, xd0\n\t"
-        "fmov	dr6, xd2\n\t"
-        "fschg\n"
-        :
-        : [a] "f"(z)
-        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+
+__hot __icache_aligned inline void mat_set_rotate_y(float y) {
+    y *= fsca_scale;
+    asm volatile(R"(
+        ftrc    %[y], fpul
+        frchg
+        fldi0	fr3
+        fldi1	fr5
+        fldi0	fr6
+        fldi0	fr7
+        fldi0	fr12
+        fldi0	fr13
+        fsca	fpul, dr0
+        fldi0	fr4
+        fldi0	fr9
+        fldi0	fr11
+        fldi0	fr14
+        fldi1	fr15
+        fmov	fr1, fr10
+        fmov	fr0, fr8
+        fneg	fr8
+        fmov	fr0, fr2
+        fmov	fr1, fr0
+        fldi0	fr1
+        frchg
+    )"
+    :
+    : [y] "f" (y)
+    : "fpul");
 }
 
+__hot __icache_aligned inline void mat_apply_rotate_y(float y) {
+    y *= fsca_scale;
+    asm volatile(R"(
+        ftrc	%[y], fpul
+        fsca    fpul, dr6
+        fldi0	fr9
+        fldi0	fr11
+        fmov	fr6, fr8
+        fmov	fr7, fr10
+        ftrv	xmtrx, fv8
+        fmov	fr7, fr4
+        fldi0	fr5
+        fneg	fr6
+        fldi0	fr7
+        ftrv	xmtrx, fv4
+        fschg
+        fmov	dr8, xd8
+        fmov	dr10, xd10
+        fmov	dr4, xd0
+        fmov	dr6, xd2
+        fschg
+    )"
+    :
+    : [y] "f" (y)
+    : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+__hot __icache_aligned inline void mat_apply_rotate_z(float z) {
+    z *= fsca_scale;
+    asm volatile(R"(
+        ftrc	%[z], fpul
+        fsca    fpul, dr8
+        fldi0	fr10
+        fldi0	fr11
+        fmov	fr8, fr5
+        fneg	fr8
+        ftrv	xmtrx, fv8
+        fmov	fr9, fr4
+        fschg
+        fmov	dr10, dr6
+        ftrv	xmtrx, fv4
+        fmov	dr8, xd4
+        fmov	dr10, xd6
+        fmov	dr4, xd0
+        fmov	dr6, xd2
+        fschg
+    )"
+    :
+    : [z] "f" (z)
+    : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
+}
+
+__hot __icache_aligned inline void mat_set_rotate_z(float z) {
+    z *= fsca_scale; 
+    asm volatile(R"(
+        ftrc    %[z], fpul
+		frchg
+		fldi0	fr2
+		fldi0	fr3
+		fldi1	fr10
+		fldi0	fr11
+		fsca	fpul, dr4
+		fschg
+		fmov	dr2, dr6
+		fmov	dr2, dr8
+		fmov	dr2, dr12
+		fldi0	fr14
+		fldi1	fr15
+		fschg
+		fmov	fr5, fr0
+		fmov	fr4, fr1
+		fneg	fr1
+		frchg
+    )"
+    :
+    : [z] "f" (z)
+    : "fpul");
+}
+
+__hot __icache_aligned inline void mat_set_rotate(float x, float y, float z) {
+    mat_set_rotate_x(x);
+    mat_apply_rotate_y(y);
+    mat_apply_rotate_z(z);
+}
+
+__hot __icache_aligned inline void mat_apply_rotate(float x, float y, float z) {
+    mat_apply_rotate_x(x);
+    mat_apply_rotate_y(y);
+    mat_apply_rotate_z(z);
+}
 
 //TODO: FIXME FOR VC (AND USE FTRV)
 template<bool FAST_APPROX=false>
@@ -880,23 +1115,6 @@ __always_inline constexpr float fipr2D(float x1, float y1, float x2, float y2) {
         w_ = tmp1233123.w; \
     } while(false)
 
-inline void mat_load_3x3(const rw::Matrix* mtx) {
-    memcpy(XMTRX, mtx, sizeof(matrix_t));
-    XMTRX[0][3] = 0.0f;
-    XMTRX[1][3] = 0.0f;
-    XMTRX[2][3] = 0.0f;
-
-    XMTRX[3][0] = 0.0f;
-    XMTRX[3][1] = 0.0f;
-    XMTRX[3][2] = 0.0f;
-    XMTRX[3][3] = 0.0f;
-}
-
-inline void mat_load_4x4(const rw::Matrix* mtx) {
-    memcpy(XMTRX, mtx, sizeof(matrix_t));
-    XMTRX[3][3] = 1.0f;
-}
-
 inline void mat_transpose(void) {
     matrix_t tmp;
 
diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h
index 1004e0e7..35334bc0 100644
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -451,6 +451,9 @@ struct alignas(8) MatrixBase
 		uint32 pad3;
 		float  posw = 1.0f;
 	};
+
+    operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
+    operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
 };
 
 struct Matrix: public MatrixBase
@@ -463,9 +466,9 @@ struct Matrix: public MatrixBase
 
     Matrix() {}
 
-    Matrix(MatrixBase &&aggregate):
-	    MatrixBase{aggregate}
-    {}
+    Matrix(MatrixBase &&aggregate){
+        *this = aggregate;
+    }
 
     Matrix(const Matrix &rhs) {
         *this = rhs;
@@ -476,8 +479,10 @@ struct Matrix: public MatrixBase
         return *this;
     }
 
-    operator matrix_t *() { return reinterpret_cast<matrix_t *>(this); }
-    operator const matrix_t *() const { return reinterpret_cast<const matrix_t *>(this); }
+    Matrix &operator=(const MatrixBase &rhs) {
+        dc::mat_copy(*this, rhs);
+        return *this;
+    }
 
 	static Matrix *create(void);
 	void destroy(void);
@@ -514,10 +519,12 @@ inline void convMatrix(Matrix *dst, RawMatrix *src){
 
 inline void convMatrix(RawMatrix *dst, Matrix *src){
 	*dst = *(RawMatrix*)src;
+#ifndef DC_SH4
 	dst->rightw = 0.0;
 	dst->upw = 0.0;
 	dst->atw = 0.0;
 	dst->posw = 1.0;
+#endif
 }
 
 struct Line