Accelerated lots of RW math + Coronas (liberty)

- A lot of the RW matrix code is now accelerated. - Went through and accelerated liberty's coronas/reflections. ! Apparently introduced a bug somewhere along the line that causes boats to freak out and do somersaults when driven. Will resolve later.
2025-09-01 10:42:34 +02:00 · 2025-04-29 10:03:49 -05:00
parent c3454ac7ec
commit f59f84c133
4 changed files with 343 additions and 245 deletions
--- a/src/liberty/renderer/Coronas.cpp
+++ b/src/liberty/renderer/Coronas.cpp
@@ -300,9 +300,9 @@ CCoronas::Render(void)


 			if(aCoronas[i].fadeAlpha && spriteCoors.z < aCoronas[i].drawDist){
-				float recipz = 1.0f/spriteCoors.z;
+				float recipz = dc::Invert<true, false>(spriteCoors.z);
 				float fadeDistance = aCoronas[i].drawDist / 2.0f;
-				float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
+				float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
 				int totalFade = aCoronas[i].fadeAlpha * distanceFade;

 				if(aCoronas[i].LOScheck)
@@ -313,6 +313,7 @@ CCoronas::Render(void)
 				// render corona itself
 				if(aCoronas[i].texture){
 					float fogscale = CWeather::Foggyness*Min(spriteCoors.z, 40.0f)/40.0f + 1.0f;
+                    float invFogScale = dc::Invert<true, false>(fogscale);
 					if(CCoronas::aCoronas[i].id == SUN_CORE)
 						spriteCoors.z = 0.95f * RwCameraGetFarClipPlane(Scene.camera);
 					RwRenderStateSet(rwRENDERSTATETEXTURERASTER, RwTextureGetRaster(aCoronas[i].texture));
@@ -328,9 +329,9 @@ CCoronas::Render(void)
 						CSprite::RenderOneXLUSprite(spriteCoors.x, spriteCoors.y, spriteCoors.z,
 							spritew * aCoronas[i].size * wscale,
 							spriteh * aCoronas[i].size * fogscale * hscale,
-							CCoronas::aCoronas[i].red / fogscale,
-							CCoronas::aCoronas[i].green / fogscale,
-							CCoronas::aCoronas[i].blue / fogscale,
+							CCoronas::aCoronas[i].red * invFogScale,
+							CCoronas::aCoronas[i].green * invFogScale,
+							CCoronas::aCoronas[i].blue * invFogScale,
 							totalFade,
 							recipz,
 							255);
@@ -339,9 +340,9 @@ CCoronas::Render(void)
 							spriteCoors.x, spriteCoors.y, spriteCoors.z,
 							spritew * aCoronas[i].size * fogscale,
 							spriteh * aCoronas[i].size * fogscale,
-							CCoronas::aCoronas[i].red / fogscale,
-							CCoronas::aCoronas[i].green / fogscale,
-							CCoronas::aCoronas[i].blue / fogscale,
+							CCoronas::aCoronas[i].red * invFogScale,
+							CCoronas::aCoronas[i].green * invFogScale,
+							CCoronas::aCoronas[i].blue * invFogScale,
 							totalFade,
 							recipz,
 							20.0f * recipz,
@@ -365,7 +366,7 @@ CCoronas::Render(void)
 							(spriteCoors.x - (screenw/2)) * flare->position + (screenw/2),
 							(spriteCoors.y - (screenh/2)) * flare->position + (screenh/2),
 							spriteCoors.z,
-							4.0f*flare->size * spritew/spriteh,
+							4.0f*flare->size * dc::Div<true, false>(spritew, spriteh),
 							4.0f*flare->size,
 							(flare->red * aCoronas[i].red)>>8,
 							(flare->green * aCoronas[i].green)>>8,
@@ -480,9 +481,9 @@ CCoronas::RenderReflections(void)
 					drawDist = Min(drawDist, 55.0f);
 					if(spriteCoors.z < drawDist){
 						float fadeDistance = drawDist / 2.0f;
-						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
+						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance); // linear fade: 1 at fadeDistance, 0 at 2*fadeDistance
 						distanceFade = Clamp(distanceFade, 0.0f, 1.0f);
-						float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera);
+						float recipz = dc::Invert<true, false>(RwCameraGetNearClipPlane(Scene.camera));
 						float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f;
 						int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads;

@@ -606,7 +607,9 @@ CEntity::ProcessLightsForEntity(void)
 	flashTimer1 = 0;
 	flashTimer2 = 0;
 	flashTimer3 = 0;
-
+#ifdef DC_SH4
+    // Call dc::mat_load2 on the entity matrix once, before the effect loop; the per-effect
+    // mat_trans_single3_nodiv_nomod() call below is given no matrix argument.
+    // NOTE(review): presumably nothing inside the loop may replace the loaded matrix - confirm.
+    dc::mat_load2(GetMatrix());
+#endif
 	n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects();
 	for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){
 		effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i);
@@ -614,8 +617,12 @@ CEntity::ProcessLightsForEntity(void)
 		if(effect->type != EFFECT_LIGHT)
 			continue;

+#ifndef DC_SH4
 		pos = GetMatrix() * effect->pos;
-
+#else
+        mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z,
+                                      pos.x, pos.y, pos.z);
+#endif
 		lightOn = false;
 		lightFlickering = false;
 		switch(effect->light.lightType){
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -91,24 +91,6 @@ strncmp_ci(const char *s1, const char *s2, int n)
 	return 0;
 }

-Quat
-mult(const Quat &q, const Quat &p)
-{
-#ifndef DC_SH4
-	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
-	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
-	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
-	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
-#else
-	Quat o;
-	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
-	              reinterpret_cast<const dc::quaternion_t &>(q),
-				  reinterpret_cast<const dc::quaternion_t &>(p));
-	return o;
-#endif
-}
-
-
 Quat*
 Quat::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
@@ -166,53 +148,39 @@ slerp(const Quat &q, const Quat &p, float32 a)
 //
 // V3d
 //
-
-V3d
-cross(const V3d &a, const V3d &b)
-{
-	return makeV3d(a.y*b.z - a.z*b.y,
-	               a.z*b.x - a.x*b.z,
-	               a.x*b.y - a.y*b.x);
+// Transforms n points from in[] into out[] using the right/up/at rows of m plus its pos row.
+void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) {
+#ifdef DC_SH4
+    // dc::mat_load2 is called once up front; each point then goes through the transform helper.
+    dc::mat_load2(*m);
+    for(int32 idx = 0; idx < n; idx++) {
+        mat_trans_single3_nodiv_nomod(in[idx].x, in[idx].y, in[idx].z,
+                                      out[idx].x, out[idx].y, out[idx].z);
+    }
+#else
+    for(int32 idx = 0; idx < n; idx++) {
+        // Compute into a temporary first so that out[idx] may be the same object as in[idx].
+        V3d res;
+        res.x = in[idx].x*m->right.x + in[idx].y*m->up.x + in[idx].z*m->at.x + m->pos.x;
+        res.y = in[idx].x*m->right.y + in[idx].y*m->up.y + in[idx].z*m->at.y + m->pos.y;
+        res.z = in[idx].x*m->right.z + in[idx].y*m->up.z + in[idx].z*m->at.z + m->pos.z;
+        out[idx] = res;
+    }
+#endif
 }
-
-void
-V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
-{
-	int32 i;
-#ifndef DC_SH4
-    V3d tmp;
-    for(i = 0; i < n; i++){
-		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
-		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
-		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
-		out[i] = tmp;
-	}
-#else
-    dc::mat_load2(*m);
-    for(i = 0; i < n; i++)
-        mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
-                                      out[i].x, out[i].y, out[i].z);
-#endif
-}
-
-void
-V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
-{
-	int32 i;
-#ifndef DC_SH4
-	V3d tmp;
-	for(i = 0; i < n; i++){
-		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
-		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y;
-		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
-		out[i] = tmp;
-	}
-#else
-    dc::mat_load2(*m);
-    for(i = 0; i < n; i++)
-        mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
-                                out[i].x, out[i].y, out[i].z);
-#endif
+// Transforms n direction vectors from in[] into out[] using only the right/up/at rows of m
+// (the pos row is not added).
+void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) {
+#ifdef DC_SH4
+    // dc::mat_load2 is called once up front; each vector then goes through the normal helper.
+    dc::mat_load2(*m);
+    for(int32 idx = 0; idx < n; idx++) {
+        mat_trans_normal3_nomod(in[idx].x, in[idx].y, in[idx].z,
+                                out[idx].x, out[idx].y, out[idx].z);
+    }
+#else
+    for(int32 idx = 0; idx < n; idx++) {
+        // Compute into a temporary first so that out[idx] may be the same object as in[idx].
+        V3d res;
+        res.x = in[idx].x*m->right.x + in[idx].y*m->up.x + in[idx].z*m->at.x;
+        res.y = in[idx].x*m->right.y + in[idx].y*m->up.y + in[idx].z*m->at.y;
+        res.z = in[idx].x*m->right.z + in[idx].y*m->up.z + in[idx].z*m->at.z;
+        out[idx] = res;
+    }
+#endif
 }

 //
@@ -343,9 +311,10 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2)
 		*dst = *src2;
 	else if(src2->flags & IDENTITY)
 		*dst = *src1;
-	else{
+	else {
+        uint8_t flags = src1->flags & src2->flags;
 		mult_(dst, src1, src2);
-		dst->flags = src1->flags & src2->flags;
+		dst->flags = flags;
 	}
 	return dst;
 }
@@ -366,7 +335,8 @@ Matrix::invert(Matrix *dst, const Matrix *src)
 Matrix*
 Matrix::transpose(Matrix *dst, const Matrix *src)
 {
-	if(src->flags & IDENTITY)
+#ifndef DC_SH4
+	if(src->flags & IDENTITY) 
 		*dst = *src;
 	dst->right.x = src->right.x;
 	dst->up.x = src->right.y;
@@ -380,25 +350,31 @@ Matrix::transpose(Matrix *dst, const Matrix *src)
 	dst->pos.x = 0.0;
 	dst->pos.y = 0.0;
 	dst->pos.z = 0.0;
+#else
+    if(src->flags & IDENTITY)
+        *dst = *src;
+    else {
+        dc::mat_load_transpose(*src);
+        dc::mat_store2(*dst);
+    }
+#endif
 	return dst;
 }

 Matrix*
 Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
-	Matrix tmp, rot;
-	makeRotation(&rot, axis, angle);
+	Matrix rot;
+    makeRotation(&rot, axis, angle);
 	switch(op){
 	case COMBINEREPLACE:
 		*this = rot;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
-		*this = tmp;
+		mult(this, &rot, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
-		*this = tmp;
+        mult(this, this, &rot);
 		break;
 	}
 	return this;
@@ -407,27 +383,25 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 Matrix*
 Matrix::rotate(const Quat &q, CombineOp op)
 {
-	Matrix tmp, rot;
-	makeRotation(&rot, q);
+	Matrix rot;
+    makeRotation(&rot, q);
 	switch(op){
 	case COMBINEREPLACE:
 		*this = rot;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
-		*this = tmp;
+        mult(this, &rot, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
-		*this = tmp;
+        mult(this, this, &rot);
 		break;
 	}
 	return this;
 }
+
 Matrix*
 Matrix::translate(const V3d *translation, CombineOp op)
 {
-	Matrix tmp;
 	Matrix trans = identMat;
 	trans.pos = *translation;
 	trans.flags &= ~IDENTITY;
@@ -436,12 +410,10 @@ Matrix::translate(const V3d *translation, CombineOp op)
 		*this = trans;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &trans, this);
-		*this = tmp;
+		mult(this, &trans, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &trans);
-		*this = tmp;
+		mult(this, this, &trans);
 		break;
 	}
 	return this;
@@ -450,7 +422,6 @@ Matrix::translate(const V3d *translation, CombineOp op)
 Matrix*
 Matrix::scale(const V3d *scale, CombineOp op)
 {
-	Matrix tmp;
 	Matrix scl = identMat;
 	scl.right.x = scale->x;
 	scl.up.y = scale->y;
@@ -461,12 +432,10 @@ Matrix::scale(const V3d *scale, CombineOp op)
 		*this = scl;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &scl, this);
-		*this = tmp;
+		mult(this, &scl, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &scl);
-		*this = tmp;
+		mult(this, this, &scl);
 		break;
 	}
 	return this;
@@ -475,18 +444,15 @@ Matrix::scale(const V3d *scale, CombineOp op)
 Matrix*
 Matrix::transform(const Matrix *mat, CombineOp op)
 {
-	Matrix tmp;
 	switch(op){
 	case COMBINEREPLACE:
 		*this = *mat;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, mat, this);
-		*this = tmp;
+		mult(this, mat, this);
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, mat);
-		*this = tmp;
+		mult(this, this, mat);
 		break;
 	}
 	return this;
@@ -501,27 +467,31 @@ Matrix::getRotation(void)
 	if(tr > 0.0f){
 		s = sqrtf(1.0f + tr) * 2.0f;
 		q.w = s / 4.0f;
-		q.x = (up.z - at.y) / s;
-		q.y = (at.x - right.z) / s;
-		q.z = (right.y - up.x) / s;
+        float invS = dc::Invert<true, false>(s);
+		q.x = (up.z - at.y) * invS;
+		q.y = (at.x - right.z) * invS;
+		q.z = (right.y - up.x) * invS;
 	}else if(right.x > up.y && right.x > at.z){
 		s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f;
-		q.w = (up.z - at.y) / s;
-		q.x = s / 4.0f;
-		q.y = (up.x + right.y) / s;
-		q.z = (at.x + right.z) / s;
+        q.x = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
+        q.w = (up.z - at.y) * invS;
+		q.y = (up.x + right.y) * invS;
+		q.z = (at.x + right.z) * invS;
 	}else if(up.y > at.z){
 		s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f;
-		q.w = (at.x - right.z) / s;
-		q.x = (up.x + right.y) / s;
-		q.y = s / 4.0f;
-		q.z = (at.y + up.z) / s;
+        q.y = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
+        q.w = (at.x - right.z) * invS;
+		q.x = (up.x + right.y) * invS;
+		q.z = (at.y + up.z) * invS;
 	}else{
 		s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f;
-		q.w = (right.y - up.x) / s;
-		q.x = (at.x + right.z) / s;
-		q.y = (at.y + up.z) / s;
-		q.z = s / 4.0f;
+        q.z = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
+        q.w = (right.y - up.x) * invS;
+		q.x = (at.x + right.z) * invS;
+		q.y = (at.y + up.z) * invS;
 	}
 	return q;
 }
@@ -543,20 +513,7 @@ Matrix::lookAt(const V3d &dir, const V3d &up)
 void
 Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2)
 {
-	#if !defined(DC_TEXCONV) && !defined(DC_SIM)
-	dst->right.x = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->right.y = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->right.z = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->up.x    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->up.y    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->up.z    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->at.x    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
-	dst->at.y    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
-	dst->at.z    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
-	dst->pos.x   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.x, src2->up.x, src2->at.x, src2->pos.x);
-	dst->pos.y   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  	 	  src2->right.y, src2->up.y, src2->at.y, src2->pos.y);
-	dst->pos.z   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.z, src2->up.z, src2->at.z, src2->pos.z);
-	#else
+#ifndef DC_SH4
 	dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x;
 	dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y;
 	dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z;
@@ -569,12 +526,15 @@ Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const M
 	dst->pos.x   = src1->pos.x*src2->right.x   + src1->pos.y*src2->up.x   + src1->pos.z*src2->at.x + src2->pos.x;
 	dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src2->pos.y;
 	dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src2->pos.z;
-	#endif
+#else
+    dc::mat_mult(*dst, *src2, *src1);
+#endif
 }

 void
 Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 {
+#if 1
 	dst->right.x = src->right.x;
 	dst->right.y = src->up.x;
 	dst->right.z = src->at.x;
@@ -593,7 +553,12 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 	dst->pos.z = -(src->pos.x*src->at.x +
 	               src->pos.y*src->at.y +
 	               src->pos.z*src->at.z);
-	dst->flags = TYPEORTHONORMAL;
+#else
+    dc::mat_load_transpose(*src);
+    dc::mat_invert_tranpose();
+    dc::mat_store2(*dst);
+#endif
+    dst->flags = TYPEORTHONORMAL;
 }

 Matrix*
@@ -688,7 +653,11 @@ Matrix::normalError(void)
 	x = dot(right, right) - 1.0f;
 	y = dot(up, up) - 1.0f;
 	z = dot(at, at) - 1.0f;
+#ifndef DC_SH4
 	return x*x + y*y + z*z;
+#else
+    return fipr_magnitude_sqr(x, y, z, 0.0f);
+#endif
 }

 float32
@@ -698,16 +667,27 @@ Matrix::orthogonalError(void)
 	x = dot(at, up);
 	y = dot(at, right);
 	z = dot(up, right);
+#ifndef DC_SH4
 	return x*x + y*y + z*z;
+#else
+    return fipr_magnitude_sqr(x, y, z, 0.0f);
+#endif
 }

 float32
 Matrix::identityError(void)
 {
-	V3d r = { right.x-1.0f, right.y, right.z };
+    // Sum of squared differences between this matrix and the identity:
+    // r/u/a are the right/up/at rows minus the matching identity row; pos is compared to zero.
+    V3d r = { right.x-1.0f, right.y, right.z };
 	V3d u = { up.x, up.y-1.0f, up.z };
 	V3d a = { at.x, at.y, at.z-1.0f };
+#ifndef DC_SH4
 	return dot(r,r) + dot(u,u) + dot(a,a) + dot(pos,pos);
+#else
+    // Use the local difference vector a here, not the member at, to match the scalar path.
+    return fipr_magnitude_sqr(r.x, r.y, r.z, 0.0f)    +
+           fipr_magnitude_sqr(u.x, u.y, u.z, 0.0f)    +
+           fipr_magnitude_sqr(a.x, a.y, a.z, 0.0f)    +
+           fipr_magnitude_sqr(pos.x, pos.y, pos.z, 0.0f);
+#endif
 }

 void
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -246,6 +246,83 @@ inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) {
    );
 }

+// Between the two frchg instructions, reads the floats at byte offsets 0/4/8, 16/20/24 and
+// 32/36/40 of *mtx into fr0/fr4/fr8, fr1/fr5/fr9 and fr2/fr6/fr10 (each group of three source
+// floats is spread across registers four apart, i.e. stored transposed). The float after each
+// group is skipped. fr12-fr14 and fr3/fr7/fr11 are set to 0 and fr15 to 1.
+// The first load post-increments mtx by 4, so the pointer is advanced by 32 for the prefetch
+// and then rewound by the same 32 to continue at offset 4.
+inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) {
+    asm volatile(
+        R"(
+            frchg
+
+            fmov.s  @%[mtx]+, fr0
+
+            add     #32, %[mtx]
+            pref    @%[mtx]
+            add     #-32, %[mtx]
+
+            fmov.s  @%[mtx]+, fr4
+            fmov.s  @%[mtx]+, fr8
+            fldi0   fr12
+            add     #4, %[mtx]
+
+            fmov.s  @%[mtx]+, fr1
+            fmov.s  @%[mtx]+, fr5
+            fmov.s  @%[mtx]+, fr9
+            fldi0   fr13
+            add     #4, %[mtx]
+
+            fmov.s  @%[mtx]+, fr2
+            fmov.s  @%[mtx]+, fr6
+            fmov.s  @%[mtx]+, fr10
+            fldi0   fr14
+
+            fldi0  fr3
+            fldi0  fr7
+            fmov   fr3, fr11
+            fldi1  fr15
+
+            frchg
+        )"
+        : [mtx] "+r" (mtx)
+        :
+        :
+    );
+}
+
+// Works on the register bank selected by the leading frchg (switched back by the trailing one):
+//  1. negates fr12-fr14 and clears fr15;
+//  2. takes the inner product of fv12 with fv0, fv4 and fv8 (results land in fr3, fr7, fr11)
+//     and moves them into fr12, fr13, fr14;
+//  3. swaps fr1<->fr4, fr2<->fr8 and fr6<->fr9, using fr15 as scratch;
+//  4. resets fr3/fr7/fr11 to 0 and fr15 to 1.
+inline __hot __icache_aligned void mat_invert_tranpose() {
+	asm volatile(
+		"frchg\n\t"
+		"fneg	fr12\n\t"
+		"fneg	fr13\n\t"
+		"fneg	fr14\n\t"
+		"fldi0	fr15\n\t"
+		"fldi0	fr3\n\t"
+		"fipr	fv12, fv0\n\t"
+		"fldi0	fr7\n\t"
+		"fipr	fv12, fv4\n\t"
+		"fldi0	fr11\n\t"
+		"fipr	fv12, fv8\n\t"
+
+		"fmov	fr3, fr12\n\t"
+		"fmov	fr7, fr13\n\t"
+		"fmov	fr11, fr14\n\t"
+		"fmov	fr1, fr15\n\t"
+		"fmov	fr4, fr1\n\t"
+		"fmov	fr15, fr4\n\t"
+		"fmov	fr2, fr15\n\t"
+		"fmov	fr8, fr2\n\t"
+		"fmov	fr15, fr8\n\t"
+		"fmov	fr6, fr15\n\t"
+		"fmov	fr9, fr6\n\t"
+		"fmov	fr15, fr9\n\t"
+
+		"fldi0	fr3\n\t"
+		"fldi0	fr7\n\t"
+		"fldi0	fr11\n\t"
+		"fldi1	fr15\n\t"
+		"frchg\n"
+		:
+		:
+		:);
+}
+
 inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
    asm volatile(
        R"(
@@ -449,103 +526,6 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src)
      :);
 }

-//TODO: FIXME FOR VC (AND USE FTRV)
-template<bool FAST_APPROX=false>
-__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
-    if(FAST_APPROX && !std::is_constant_evaluated()) {
-    /*
-        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
-        // q2 then needs to be rotated after each inner product
-        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
-        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
-        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
-        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
-    */
-        // keep q1 in fv4
-        register float q1x __asm__ ("fr4") = (q1.x);
-        register float q1y __asm__ ("fr5") = (q1.y);
-        register float q1z __asm__ ("fr6") = (q1.z);
-        register float q1w __asm__ ("fr7") = (q1.w);
-
-        // load q2 into fv8, use it to get the shuffled reorder into fv0
-        register float q2x __asm__ ("fr8")  = (q2.x);
-        register float q2y __asm__ ("fr9")  = (q2.y);
-        register float q2z __asm__ ("fr10") = (q2.z);
-        register float q2w __asm__ ("fr11") = (q2.w);
-
-        // temporary operand / result in fv0
-        register float t1x __asm__ ("fr0");
-        register float t1y __asm__ ("fr1");
-        register float t1z __asm__ ("fr2");
-        register float t1w __asm__ ("fr3");
-
-        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
-        t1x = q2w;
-        t1y = q2z;
-        t1z = -q2y;
-        t1w = q2w;
-        __asm__ ("\n"
-            " fipr	fv4,fv0\n"
-            : "+f" (t1w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (t1x), "f" (t1y), "f" (t1z)
-        );
-        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
-
-        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
-        t1x = -q2z;
-        t1y = q2w;
-        t1z = q2x;
-        __atomic_thread_fence(1);
-        r->x = t1w;   // get previous result
-        t1w = q2y;
-        __asm__ ("\n"
-            "	fipr	fv4,fv0\n"
-            : "+f" (t1w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (t1x), "f" (t1y), "f" (t1z)
-        );
-        //y = t1w;
-
-        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
-        t1x = q2y;
-        t1y = -q2x;
-        t1z = q2w;
-        __atomic_thread_fence(1);
-        r->y = t1w;   // get previous result
-        t1w = q2z;
-        __asm__ ("\n"
-            "	fipr	fv4,fv0\n"
-            : "+f" (t1w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (t1x), "f" (t1y), "f" (t1z)
-        );
-        //z = t1w;
-        __atomic_thread_fence(1);
-
-        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
-        q2x = -q2x;
-        q2y = -q2y;
-        q2z = -q2z;
-        __asm__ ("\n"
-            "	fipr	fv4,fv8\n"
-            : "+f" (q2w)
-            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
-              "f" (q2x), "f" (q2y), "f" (q2z)
-        );
-
-        __atomic_thread_fence(1);
-        r->z = t1w;
-        __atomic_thread_fence(1);
-        r->w = q2w;
-    } else {
-        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
-        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
-        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
-        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
-    }
-}
-
 __hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
    unsigned int prefetch_scratch;

@@ -669,6 +649,104 @@ __hot inline void mat_apply_rotate_z(float z) {
        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
 }

+
+//TODO: FIXME FOR VC (AND USE FTRV)
+// Writes the quaternion product of q1 and q2 into *r.
+// With FAST_APPROX (and outside constant evaluation) each output component is produced by one
+// fipr of q1 (pinned to fr4-fr7) against a reordered/negated copy of q2; otherwise the plain
+// scalar formula in the else branch is used.
+template<bool FAST_APPROX=false>
+__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
+    if(FAST_APPROX && !std::is_constant_evaluated()) {
+    /*
+        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
+        // q2 then needs to be rotated after each inner product
+        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+    */
+        // keep q1 in fv4
+        register float q1x __asm__ ("fr4") = (q1.x);
+        register float q1y __asm__ ("fr5") = (q1.y);
+        register float q1z __asm__ ("fr6") = (q1.z);
+        register float q1w __asm__ ("fr7") = (q1.w);
+
+        // load q2 into fv8, use it to get the shuffled reorder into fv0
+        register float q2x __asm__ ("fr8")  = (q2.x);
+        register float q2y __asm__ ("fr9")  = (q2.y);
+        register float q2z __asm__ ("fr10") = (q2.z);
+        register float q2w __asm__ ("fr11") = (q2.w);
+
+        // temporary operand / result in fv0
+        register float t1x __asm__ ("fr0");
+        register float t1y __asm__ ("fr1");
+        register float t1z __asm__ ("fr2");
+        register float t1w __asm__ ("fr3");
+
+        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
+        t1x = q2w;
+        t1y = q2z;
+        t1z = -q2y;
+        t1w = q2x;    // pairs with q1.w in the inner product, so it must be q2.x (see formula above)
+        __asm__ ("\n"
+            " fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
+
+        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
+        t1x = -q2z;
+        t1y = q2w;
+        t1z = q2x;
+        __atomic_thread_fence(1);
+        r->x = t1w;   // get previous result
+        t1w = q2y;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //y = t1w;
+
+        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
+        t1x = q2y;
+        t1y = -q2x;
+        t1z = q2w;
+        __atomic_thread_fence(1);
+        r->y = t1w;   // get previous result
+        t1w = q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv0\n"
+            : "+f" (t1w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (t1x), "f" (t1y), "f" (t1z)
+        );
+        //z = t1w;
+        __atomic_thread_fence(1);
+
+        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
+        q2x = -q2x;
+        q2y = -q2y;
+        q2z = -q2z;
+        __asm__ ("\n"
+            "	fipr	fv4,fv8\n"
+            : "+f" (q2w)
+            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
+              "f" (q2x), "f" (q2y), "f" (q2z)
+        );
+
+        __atomic_thread_fence(1);
+        r->z = t1w;
+        __atomic_thread_fence(1);
+        r->w = q2w;
+    } else {
+        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
+        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
+        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
+        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
+    }
+}
+
 #   else
 #       ifdef DC_TEXCONV
 #           define mat_apply(a)
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -238,8 +238,8 @@ inline V2d neg(const V2d &a) { return makeV2d(-a.x, -a.y); }
 inline V2d add(const V2d &a, const V2d &b) { return makeV2d(a.x+b.x, a.y+b.y); }
 inline V2d sub(const V2d &a, const V2d &b) { return makeV2d(a.x-b.x, a.y-b.y); }
 inline V2d scale(const V2d &a, float32 r) { return makeV2d(a.x*r, a.y*r); }
-inline float32 length(const V2d &v) { return sqrtf(v.x*v.x + v.y*v.y); }
-inline V2d normalize(const V2d &v) { return scale(v, 1.0f/length(v)); }
+// Length of a 2D vector: dc::Sqrt of the sum of the squared components.
+inline float32 length(const V2d &v) {
+    const float32 lenSq = v.x*v.x + v.y*v.y;
+    return dc::Sqrt(lenSq);
+}
+// Scales v by dc::RecipSqrt of its squared length.
+inline V2d normalize(const V2d &v) {
+    const float32 lenSq = v.x*v.x + v.y*v.y;
+    return scale(v, dc::RecipSqrt(lenSq));
+}

 struct V3d
 {
@@ -265,10 +265,22 @@ inline float32 length(const V3d &v) {
 	return len;
 #endif
 }
-inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); }
-inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); }
-V3d cross(const V3d &a, const V3d &b);
-inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) {
+// Scales v by the reciprocal of its length (x, y, z only; the fourth fipr lane is fed 0).
+inline V3d normalize(const V3d &v) {
+#ifdef DC_SH4
+    const float rcpLen = dc::RecipSqrt(fipr_magnitude_sqr(v.x, v.y, v.z, 0.0f));
+#else
+    const float rcpLen = 1.0f / length(v);
+#endif
+    return scale(v, rcpLen);
+}
+// Scales v by l divided by its current length.
+inline V3d setlength(const V3d &v, float32 l) {
+    const float32 factor = dc::Div<true, false>(l, length(v));
+    return scale(v, factor);
+}
+// Cross product of a and b.
+inline V3d cross(const V3d &a, const V3d &b) {
+    const float32 cx = a.y*b.z - a.z*b.y;
+    const float32 cy = a.z*b.x - a.x*b.z;
+    const float32 cz = a.x*b.y - a.y*b.x;
+    return makeV3d(cx, cy, cz);
+}
+inline float32 dot(const V3d &a, const V3d &b) {
 #ifdef DC_SH4
 	return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f);
 #else
@@ -329,12 +341,33 @@ inline float32 length(const Quat &q) {
 #ifndef DC_SH4
 	return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
 #else
 	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w)); // all four components, matching the scalar path above
 #endif
 }
-inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
+// Scales q by the reciprocal of its length. The length covers all four components
+// (x, y, z and w), the same as length(const Quat&) on the non-SH4 path.
+inline Quat normalize(const Quat &q) {
+    float invLen;
+#ifndef DC_SH4
+    invLen = 1.0f / length(q);
+#else
+    invLen = dc::RecipSqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
+#endif
+    return scale(q, invLen);
+}
 inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
-Quat mult(const Quat &q, const Quat &p);
+// Returns the quaternion product of q and p.
+inline Quat mult(const Quat &q, const Quat &p) {
+#ifdef DC_SH4
+    // Hand both operands to dc::quat_mult, viewing Quat storage as dc::quaternion_t.
+    Quat product;
+    dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&product),
+                  reinterpret_cast<const dc::quaternion_t &>(q),
+                  reinterpret_cast<const dc::quaternion_t &>(p));
+    return product;
+#else
+    return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
+                    q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
+                    q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
+                    q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
+#endif
+}
 inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); }
 Quat lerp(const Quat &q, const Quat &p, float32 r);
 Quat slerp(const Quat &q, const Quat &p, float32 a);