Accelerated lots of RW math + Coronas (liberty)

- A lot of the RW matrix code is now accelerated. - Went through and accelerated liberty's coronas/reflections. ! Apparently introduced a bug somewhere along the line that causes boats to freak out and do somersaults when trying to drive. Will resolve later.
2025-09-01 18:52:58 +02:00 · 2025-04-29 10:03:49 -05:00
parent c3454ac7ec
commit f59f84c133
4 changed files with 343 additions and 245 deletions
--- a/src/liberty/renderer/Coronas.cpp
+++ b/src/liberty/renderer/Coronas.cpp
@@ -300,9 +300,9 @@ CCoronas::Render(void)
 			if(aCoronas[i].fadeAlpha && spriteCoors.z < aCoronas[i].drawDist){
-				float recipz = 1.0f/spriteCoors.z;
+				float recipz = dc::Invert<true, false>(spriteCoors.z);
 				float fadeDistance = aCoronas[i].drawDist / 2.0f;
-				float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
+				float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
 				int totalFade = aCoronas[i].fadeAlpha * distanceFade;
 				if(aCoronas[i].LOScheck)
@@ -313,6 +313,7 @@ CCoronas::Render(void)
 				// render corona itself
 				if(aCoronas[i].texture){
 					float fogscale = CWeather::Foggyness*Min(spriteCoors.z, 40.0f)/40.0f + 1.0f;
                    float invFogScale = dc::Invert<true, false>(fogscale);
 					if(CCoronas::aCoronas[i].id == SUN_CORE)
 						spriteCoors.z = 0.95f * RwCameraGetFarClipPlane(Scene.camera);
 					RwRenderStateSet(rwRENDERSTATETEXTURERASTER, RwTextureGetRaster(aCoronas[i].texture));
@@ -328,9 +329,9 @@ CCoronas::Render(void)
 						CSprite::RenderOneXLUSprite(spriteCoors.x, spriteCoors.y, spriteCoors.z,
 							spritew * aCoronas[i].size * wscale,
 							spriteh * aCoronas[i].size * fogscale * hscale,
-							CCoronas::aCoronas[i].red / fogscale,
+							CCoronas::aCoronas[i].red * invFogScale,
-							CCoronas::aCoronas[i].green / fogscale,
+							CCoronas::aCoronas[i].green * invFogScale,
-							CCoronas::aCoronas[i].blue / fogscale,
+							CCoronas::aCoronas[i].blue * invFogScale,
 							totalFade,
 							recipz,
 							255);
@@ -339,9 +340,9 @@ CCoronas::Render(void)
 							spriteCoors.x, spriteCoors.y, spriteCoors.z,
 							spritew * aCoronas[i].size * fogscale,
 							spriteh * aCoronas[i].size * fogscale,
-							CCoronas::aCoronas[i].red / fogscale,
+							CCoronas::aCoronas[i].red * invFogScale,
-							CCoronas::aCoronas[i].green / fogscale,
+							CCoronas::aCoronas[i].green * invFogScale,
-							CCoronas::aCoronas[i].blue / fogscale,
+							CCoronas::aCoronas[i].blue * invFogScale,
 							totalFade,
 							recipz,
 							20.0f * recipz,
@@ -365,7 +366,7 @@ CCoronas::Render(void)
 							(spriteCoors.x - (screenw/2)) * flare->position + (screenw/2),
 							(spriteCoors.y - (screenh/2)) * flare->position + (screenh/2),
 							spriteCoors.z,
-							4.0f*flare->size * spritew/spriteh,
+							4.0f*flare->size * dc::Div<true, false>(spritew, spriteh),
 							4.0f*flare->size,
 							(flare->red * aCoronas[i].red)>>8,
 							(flare->green * aCoronas[i].green)>>8,
@@ -480,9 +481,9 @@ CCoronas::RenderReflections(void)
 					drawDist = Min(drawDist, 55.0f);
 					if(spriteCoors.z < drawDist){
 						float fadeDistance = drawDist / 2.0f;
-						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
+						// Div is qualified with dc:: to match the dc::Div / dc::Invert calls used elsewhere in this file.
+						float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
 						distanceFade = Clamp(distanceFade, 0.0f, 1.0f);
-						float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera);
+						float recipz = dc::Invert<true, false>(RwCameraGetNearClipPlane(Scene.camera));
 						float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f;
 						int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads;
@@ -606,7 +607,9 @@ CEntity::ProcessLightsForEntity(void)
 	flashTimer1 = 0;
 	flashTimer2 = 0;
 	flashTimer3 = 0;
-
+#ifdef DC_SH4
    // Load the entity matrix once before the 2d-effect loop; the SH4 branch inside the loop
    // transforms effect->pos with mat_trans_single3_nodiv_nomod (presumably against this loaded matrix).
    // Note: "dc:" with a single colon would be parsed as a goto label, not a namespace qualifier.
    dc::mat_load2(GetMatrix());
 #endif
 	n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects();
 	for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){
 		effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i);
@@ -614,8 +617,12 @@ CEntity::ProcessLightsForEntity(void)
 		if(effect->type != EFFECT_LIGHT)
 			continue;
 #ifndef DC_SH4
 		pos = GetMatrix() * effect->pos;
-
+#else
        mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z,
                                      pos.x, pos.y, pos.z);
 #endif
 		lightOn = false;
 		lightFlickering = false;
 		switch(effect->light.lightType){
--- a/vendor/librw/src/base.cpp
+++ b/vendor/librw/src/base.cpp
@@ -91,24 +91,6 @@ strncmp_ci(const char *s1, const char *s2, int n)
 	return 0;
 }
 // Quaternion product of q and p (q on the left).
 // Portable build: expands the product term by term; the first makeQuat argument is the
 // w (scalar) term, followed by the x, y and z terms.
 // DC_SH4 build: forwards to dc::quat_mult, reinterpreting Quat as dc::quaternion_t
 // (assumes both types share the same member layout -- TODO confirm).
 Quat
 mult(const Quat &q, const Quat &p)
 {
 #ifndef DC_SH4
 	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
 	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
 	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
 	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
 #else
 	// Separate output object: the result is written into o, never into q or p.
 	Quat o;
 	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
 	              reinterpret_cast<const dc::quaternion_t &>(q),
 				  reinterpret_cast<const dc::quaternion_t &>(p));
 	return o;
 #endif
 }
 Quat*
 Quat::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
@@ -166,53 +148,39 @@ slerp(const Quat &q, const Quat &p, float32 a)
 //
 // V3d
 //
-
+void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) {
-V3d
+    int32 i;
-cross(const V3d &a, const V3d &b)
+    #ifndef DC_SH4
-{
+        V3d tmp;
-	return makeV3d(a.y*b.z - a.z*b.y,
+        for(i = 0; i < n; i++){
-	               a.z*b.x - a.x*b.z,
+            tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
-	               a.x*b.y - a.y*b.x);
+            tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
            tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
            out[i] = tmp;
        }
    #else
        dc::mat_load2(*m);
        for(i = 0; i < n; i++)
            mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
                                          out[i].x, out[i].y, out[i].z);
    #endif
 }
-
+void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) {
-void
+    int32 i;
-V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
+    #ifndef DC_SH4
-{
+        V3d tmp;
-	int32 i;
+        for(i = 0; i < n; i++){
-#ifndef DC_SH4
+            tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
-    V3d tmp;
+            tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y;
-    for(i = 0; i < n; i++){
+            tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
-		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
+            out[i] = tmp;
-		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
+        }
-		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
+    #else
-		out[i] = tmp;
+        dc::mat_load2(*m);
-	}
+        for(i = 0; i < n; i++)
-#else
+            mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
-    dc::mat_load2(*m);
+                                    out[i].x, out[i].y, out[i].z);
-    for(i = 0; i < n; i++)
+    #endif
        mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
                                      out[i].x, out[i].y, out[i].z);
 #endif
 }
 // Transforms n vectors from in[] to out[] by the right/up/at rows of m.
 // Unlike a point transform, m->pos is not added in the portable branch.
 void
 V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
 {
 	int32 i;
 #ifndef DC_SH4
 	// Each result is built in tmp before being stored, so out[i] may alias in[i].
 	V3d tmp;
 	for(i = 0; i < n; i++){
 		tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
 		tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y;
 		tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
 		out[i] = tmp;
 	}
 #else
    // Load m once, then apply it to every element.
    // NOTE(review): mat_trans_normal3_nomod is presumably the translation-free transform
    // matching the portable branch -- confirm against its definition.
    dc::mat_load2(*m);
    for(i = 0; i < n; i++)
        mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
                                out[i].x, out[i].y, out[i].z);
 #endif
 }
 //
@@ -343,9 +311,10 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2)
 		*dst = *src2;
 	else if(src2->flags & IDENTITY)
 		*dst = *src1;
-	else{
+	else {
        uint8_t flags = src1->flags & src2->flags;
 		mult_(dst, src1, src2);
-		dst->flags = src1->flags & src2->flags;
+		dst->flags = flags;
 	}
 	return dst;
 }
@@ -366,7 +335,8 @@ Matrix::invert(Matrix *dst, const Matrix *src)
 Matrix*
 Matrix::transpose(Matrix *dst, const Matrix *src)
 {
-	if(src->flags & IDENTITY)
+#ifndef DC_SH4
 	if(src->flags & IDENTITY) 
 		*dst = *src;
 	dst->right.x = src->right.x;
 	dst->up.x = src->right.y;
@@ -380,25 +350,31 @@ Matrix::transpose(Matrix *dst, const Matrix *src)
 	dst->pos.x = 0.0;
 	dst->pos.y = 0.0;
 	dst->pos.z = 0.0;
 #else
    if(src->flags & IDENTITY)
        *dst = *src;
    else {
        dc::mat_load_transpose(*src);
        dc::mat_store2(*dst);
    }
 #endif
 	return dst;
 }
 Matrix*
 Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 {
-	Matrix tmp, rot;
+	Matrix rot;
-	makeRotation(&rot, axis, angle);
+    makeRotation(&rot, axis, angle);
 	switch(op){
 	case COMBINEREPLACE:
 		*this = rot;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
+		mult(this, &rot, this);
 		*this = tmp;
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
+        mult(this, this, &rot);
 		*this = tmp;
 		break;
 	}
 	return this;
@@ -407,27 +383,25 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
 Matrix*
 Matrix::rotate(const Quat &q, CombineOp op)
 {
-	Matrix tmp, rot;
+	Matrix rot;
-	makeRotation(&rot, q);
+    makeRotation(&rot, q);
 	switch(op){
 	case COMBINEREPLACE:
 		*this = rot;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &rot, this);
+        mult(this, &rot, this);
 		*this = tmp;
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &rot);
+        mult(this, this, &rot);
 		*this = tmp;
 		break;
 	}
 	return this;
 }
 Matrix*
 Matrix::translate(const V3d *translation, CombineOp op)
 {
 	Matrix tmp;
 	Matrix trans = identMat;
 	trans.pos = *translation;
 	trans.flags &= ~IDENTITY;
@@ -436,12 +410,10 @@ Matrix::translate(const V3d *translation, CombineOp op)
 		*this = trans;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &trans, this);
+		mult(this, &trans, this);
 		*this = tmp;
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &trans);
+		mult(this, this, &trans);
 		*this = tmp;
 		break;
 	}
 	return this;
@@ -450,7 +422,6 @@ Matrix::translate(const V3d *translation, CombineOp op)
 Matrix*
 Matrix::scale(const V3d *scale, CombineOp op)
 {
 	Matrix tmp;
 	Matrix scl = identMat;
 	scl.right.x = scale->x;
 	scl.up.y = scale->y;
@@ -461,12 +432,10 @@ Matrix::scale(const V3d *scale, CombineOp op)
 		*this = scl;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, &scl, this);
+		mult(this, &scl, this);
 		*this = tmp;
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, &scl);
+		mult(this, this, &scl);
 		*this = tmp;
 		break;
 	}
 	return this;
@@ -475,18 +444,15 @@ Matrix::scale(const V3d *scale, CombineOp op)
 Matrix*
 Matrix::transform(const Matrix *mat, CombineOp op)
 {
 	Matrix tmp;
 	switch(op){
 	case COMBINEREPLACE:
 		*this = *mat;
 		break;
 	case COMBINEPRECONCAT:
-		mult(&tmp, mat, this);
+		mult(this, mat, this);
 		*this = tmp;
 		break;
 	case COMBINEPOSTCONCAT:
-		mult(&tmp, this, mat);
+		mult(this, this, mat);
 		*this = tmp;
 		break;
 	}
 	return this;
@@ -501,27 +467,31 @@ Matrix::getRotation(void)
 	if(tr > 0.0f){
 		s = sqrtf(1.0f + tr) * 2.0f;
 		q.w = s / 4.0f;
-		q.x = (up.z - at.y) / s;
+        float invS = dc::Invert<true, false>(s);
-		q.y = (at.x - right.z) / s;
+		q.x = (up.z - at.y) * invS;
-		q.z = (right.y - up.x) / s;
+		q.y = (at.x - right.z) * invS;
 		q.z = (right.y - up.x) * invS;
 	}else if(right.x > up.y && right.x > at.z){
 		s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f;
-		q.w = (up.z - at.y) / s;
+        q.x = s / 4.0f;
-		q.x = s / 4.0f;
+        float invS = dc::Invert<true, false>(s);
-		q.y = (up.x + right.y) / s;
+        q.w = (up.z - at.y) * invS;
-		q.z = (at.x + right.z) / s;
+		q.y = (up.x + right.y) * invS;
 		q.z = (at.x + right.z) * invS;
 	}else if(up.y > at.z){
 		s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f;
-		q.w = (at.x - right.z) / s;
+        q.y = s / 4.0f;
-		q.x = (up.x + right.y) / s;
+        float invS = dc::Invert<true, false>(s);
-		q.y = s / 4.0f;
+        q.w = (at.x - right.z) * invS;
-		q.z = (at.y + up.z) / s;
+		q.x = (up.x + right.y) * invS;
 		q.z = (at.y + up.z) * invS;
 	}else{
 		s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f;
-		q.w = (right.y - up.x) / s;
+        q.z = s / 4.0f;
-		q.x = (at.x + right.z) / s;
+        float invS = dc::Invert<true, false>(s);
-		q.y = (at.y + up.z) / s;
+        q.w = (right.y - up.x) * invS;
-		q.z = s / 4.0f;
+		q.x = (at.x + right.z) * invS;
 		q.y = (at.y + up.z) * invS;
 	}
 	return q;
 }
@@ -543,20 +513,7 @@ Matrix::lookAt(const V3d &dir, const V3d &up)
 void
 Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2)
 {
-	#if !defined(DC_TEXCONV) && !defined(DC_SIM)
+#ifndef DC_SH4
 	dst->right.x = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.x, src2->up.x, src2->at.x, 0);
 	dst->right.y = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.y, src2->up.y, src2->at.y, 0);
 	dst->right.z = fipr(src1->right.x, src1->right.y,  src1->right.z, 0, 		  src2->right.z, src2->up.z, src2->at.z, 0);
 	dst->up.x    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
 	dst->up.y    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
 	dst->up.z    = fipr(src1->up.x,    src1->up.y,  src1->up.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
 	dst->at.x    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.x, src2->up.x, src2->at.x, 0);
 	dst->at.y    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.y, src2->up.y, src2->at.y, 0);
 	dst->at.z    = fipr(src1->at.x,    src1->at.y,  src1->at.z, 0, 				  src2->right.z, src2->up.z, src2->at.z, 0);
 	dst->pos.x   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.x, src2->up.x, src2->at.x, src2->pos.x);
 	dst->pos.y   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  	 	  src2->right.y, src2->up.y, src2->at.y, src2->pos.y);
 	dst->pos.z   = fipr(src1->pos.x,   src1->pos.y,  src1->pos.z, 1, 	  		  src2->right.z, src2->up.z, src2->at.z, src2->pos.z);
 	#else
 	dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x;
 	dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y;
 	dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z;
@@ -569,12 +526,15 @@ Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const M
 	dst->pos.x   = src1->pos.x*src2->right.x   + src1->pos.y*src2->up.x   + src1->pos.z*src2->at.x + src2->pos.x;
 	dst->pos.y   = src1->pos.x*src2->right.y   + src1->pos.y*src2->up.y   + src1->pos.z*src2->at.y + src2->pos.y;
 	dst->pos.z   = src1->pos.x*src2->right.z   + src1->pos.y*src2->up.z   + src1->pos.z*src2->at.z + src2->pos.z;
-	#endif
+#else
    dc::mat_mult(*dst, *src2, *src1);
 #endif
 }
 void
 Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 {
 #if 1
 	dst->right.x = src->right.x;
 	dst->right.y = src->up.x;
 	dst->right.z = src->at.x;
@@ -593,7 +553,12 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
 	dst->pos.z = -(src->pos.x*src->at.x +
 	               src->pos.y*src->at.y +
 	               src->pos.z*src->at.z);
-	dst->flags = TYPEORTHONORMAL;
+#else
    dc::mat_load_transpose(*src);
    dc::mat_invert_tranpose();
    dc::mat_store2(*dst);
 #endif
    dst->flags = TYPEORTHONORMAL;
 }
 Matrix*
@@ -688,7 +653,11 @@ Matrix::normalError(void)
 	x = dot(right, right) - 1.0f;
 	y = dot(up, up) - 1.0f;
 	z = dot(at, at) - 1.0f;
 #ifndef DC_SH4
 	return x*x + y*y + z*z;
 #else
    return fipr_magnitude_sqr(x, y, z, 0.0f);
 #endif
 }
 float32
@@ -698,16 +667,27 @@ Matrix::orthogonalError(void)
 	x = dot(at, up);
 	y = dot(at, right);
 	z = dot(up, right);
 #ifndef DC_SH4
 	return x*x + y*y + z*z;
 #else
    return fipr_magnitude_sqr(x, y, z, 0.0f);
 #endif
 }
 // Returns the summed squared deviation of this matrix from identity:
 // |right - (1,0,0)|^2 + |up - (0,1,0)|^2 + |at - (0,0,1)|^2 + |pos|^2.
 float32
 Matrix::identityError(void)
 {
-	V3d r = { right.x-1.0f, right.y, right.z };
+    V3d r = { right.x-1.0f, right.y, right.z };
 	V3d u = { up.x, up.y-1.0f, up.z };
 	V3d a = { at.x, at.y, at.z-1.0f };
 #ifndef DC_SH4
 	return dot(r,r) + dot(u,u) + dot(a,a) + dot(pos,pos);
 #else
    // Use the local deviation vector a (at minus the Z axis), not the member at,
    // so this branch computes the same value as the portable branch.
    return fipr_magnitude_sqr(r.x, r.y, r.z, 0.0f)    +
           fipr_magnitude_sqr(u.x, u.y, u.z, 0.0f)    +
           fipr_magnitude_sqr(a.x, a.y, a.z, 0.0f)    +
           fipr_magnitude_sqr(pos.x, pos.y, pos.z, 0.0f);
 #endif
 }
 void
--- a/vendor/librw/src/dc/rwdc_common.h
+++ b/vendor/librw/src/dc/rwdc_common.h
@@ -246,6 +246,83 @@ inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) {
    );
 }
 // Loads the upper-left 3x3 of *mtx, transposed, into the other FP register bank
 // (the code between the two frchg instructions addresses that bank as fr0..fr15).
 // Floats 0,1,2 of mtx go to fr0,fr4,fr8; floats 4,5,6 to fr1,fr5,fr9; floats 8,9,10 to
 // fr2,fr6,fr10 (the fourth float of each group is skipped with "add #4").
 // fr12,fr13,fr14 and fr3,fr7,fr11 are set to 0 and fr15 to 1.
 // The first load must NOT post-increment: the following "add #32 / pref / add #-(32 - 4)"
 // sequence prefetches mtx+32 and is written to land on mtx+4 starting from mtx itself.
 // With a post-incrementing first load the pointer ended up on mtx+8 and every later
 // load was shifted by one float.
 inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) {
    asm volatile(
        R"(
            frchg
            fmov.s  @%[mtx], fr0
            add     #32, %[mtx]
            pref    @%[mtx]
            add     #-(32 - 4), %[mtx]
            fmov.s  @%[mtx]+, fr4
            fmov.s  @%[mtx]+, fr8
            fldi0   fr12
            add     #4, %[mtx]
            fmov.s  @%[mtx]+, fr1
            fmov.s  @%[mtx]+, fr5
            fmov.s  @%[mtx]+, fr9
            fldi0   fr13
            add     #4, %[mtx]
            fmov.s  @%[mtx]+, fr2
            fmov.s  @%[mtx]+, fr6
            fmov.s  @%[mtx]+, fr10
            fldi0   fr14
            fldi0  fr3
            fldi0  fr7
            fmov   fr3, fr11
            fldi1  fr15
            frchg
        )"
        : [mtx] "+r" (mtx)
        :
        :
    );
 }
 // Operates in place on the FP register bank selected by frchg (addressed as fr0..fr15 here).
 // Steps, in order:
 //   1. negate fr12..fr14 and zero fr15;
 //   2. take the inner product of fv12 with fv0, fv4 and fv8 (fr3/fr7/fr11 are zeroed just
 //      before each fipr, and the results are then read back from fr3/fr7/fr11);
 //   3. move those three results into fr12..fr14;
 //   4. swap the off-diagonal pairs fr1<->fr4, fr2<->fr8 and fr6<->fr9 using fr15 as scratch;
 //   5. restore fr3, fr7, fr11 to 0 and fr15 to 1, then switch the bank back.
 // The fr2<->fr8 swap previously finished with "fmov fr15, fr2", which restored fr2 and left
 // fr8 untouched, so that pair was never exchanged; it now writes the saved value to fr8.
 inline __hot __icache_aligned void mat_invert_tranpose() {
 	asm volatile(
 		"frchg\n\t"
 		"fneg	fr12\n\t"
 		"fneg	fr13\n\t"
 		"fneg	fr14\n\t"
 		"fldi0	fr15\n\t"
 		"fldi0	fr3\n\t"
 		"fipr	fv12, fv0\n\t"
 		"fldi0	fr7\n\t"
 		"fipr	fv12, fv4\n\t"
 		"fldi0	fr11\n\t"
 		"fipr	fv12, fv8\n\t"
 		"fmov	fr3, fr12\n\t"
 		"fmov	fr7, fr13\n\t"
 		"fmov	fr11, fr14\n\t"
 		"fmov	fr1, fr15\n\t"
 		"fmov	fr4, fr1\n\t"
 		"fmov	fr15, fr4\n\t"
 		"fmov	fr2, fr15\n\t"
 		"fmov	fr8, fr2\n\t"
 		"fmov	fr15, fr8\n\t"
 		"fmov	fr6, fr15\n\t"
 		"fmov	fr9, fr6\n\t"
 		"fmov	fr15, fr9\n\t"
 		"fldi0	fr3\n\t"
 		"fldi0	fr7\n\t"
 		"fldi0	fr11\n\t"
 		"fldi1	fr15\n\t"
 		"frchg\n"
 		:
 		:
 		:);
 }
 inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
    asm volatile(
        R"(
@@ -449,103 +526,6 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src)
      :);
 }
 //TODO: FIXME FOR VC (AND USE FTRV)
 // Writes the quaternion product q1 * q2 into *r (components x, y, z, w).
 // FAST_APPROX=true (and not constant-evaluated): each component is one SH4 fipr inner
 // product of q1 (pinned in fr4..fr7) with a shuffled/negated copy of q2; the result of
 // every fipr is read back one step later, as the inline comments explain, to avoid a stall.
 // Otherwise: plain scalar arithmetic. In that branch r->x is stored before q1/q2 are read
 // again, so r must not alias q1 or q2.
 template<bool FAST_APPROX=false>
 __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
    if(FAST_APPROX && !std::is_constant_evaluated()) {
    /*
        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
        // q2 then needs to be rotated after each inner product
        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
    */
        // keep q1 in fv4
        register float q1x __asm__ ("fr4") = (q1.x);
        register float q1y __asm__ ("fr5") = (q1.y);
        register float q1z __asm__ ("fr6") = (q1.z);
        register float q1w __asm__ ("fr7") = (q1.w);
        // load q2 into fv8, use it to get the shuffled reorder into fv0
        register float q2x __asm__ ("fr8")  = (q2.x);
        register float q2y __asm__ ("fr9")  = (q2.y);
        register float q2z __asm__ ("fr10") = (q2.z);
        register float q2w __asm__ ("fr11") = (q2.w);
        // temporary operand / result in fv0
        register float t1x __asm__ ("fr0");
        register float t1y __asm__ ("fr1");
        register float t1z __asm__ ("fr2");
        register float t1w __asm__ ("fr3");
        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
        t1x = q2w;
        t1y = q2z;
        t1z = -q2y;
        t1w = q2x;    // pairs with q1.w in the formula above (was q2w, giving a wrong x)
        __asm__ ("\n"
            " fipr	fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
        t1x = -q2z;
        t1y = q2w;
        t1z = q2x;
        __atomic_thread_fence(1);
        r->x = t1w;   // get previous result
        t1w = q2y;
        __asm__ ("\n"
            "	fipr	fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        //y = t1w;
        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
        t1x = q2y;
        t1y = -q2x;
        t1z = q2w;
        __atomic_thread_fence(1);
        r->y = t1w;   // get previous result
        t1w = q2z;
        __asm__ ("\n"
            "	fipr	fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        //z = t1w;
        __atomic_thread_fence(1);
        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
        q2x = -q2x;
        q2y = -q2y;
        q2z = -q2z;
        __asm__ ("\n"
            "	fipr	fv4,fv8\n"
            : "+f" (q2w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (q2x), "f" (q2y), "f" (q2z)
        );
        __atomic_thread_fence(1);
        r->z = t1w;
        __atomic_thread_fence(1);
        r->w = q2w;
    } else {
        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
    }
 }
 __hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
    unsigned int prefetch_scratch;
@@ -669,6 +649,104 @@ __hot inline void mat_apply_rotate_z(float z) {
        : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
 }
 //TODO: FIXME FOR VC (AND USE FTRV)
 // Writes the quaternion product q1 * q2 into *r (components x, y, z, w).
 // FAST_APPROX=true (and not constant-evaluated): each component is one SH4 fipr inner
 // product of q1 (pinned in fr4..fr7) with a shuffled/negated copy of q2; the result of
 // every fipr is read back one step later, as the inline comments explain, to avoid a stall.
 // Otherwise: plain scalar arithmetic. In that branch r->x is stored before q1/q2 are read
 // again, so r must not alias q1 or q2.
 template<bool FAST_APPROX=false>
 __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
    if(FAST_APPROX && !std::is_constant_evaluated()) {
    /*
        // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
        // q2 then needs to be rotated after each inner product
        x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
        y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
        z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
        w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
    */
        // keep q1 in fv4
        register float q1x __asm__ ("fr4") = (q1.x);
        register float q1y __asm__ ("fr5") = (q1.y);
        register float q1z __asm__ ("fr6") = (q1.z);
        register float q1w __asm__ ("fr7") = (q1.w);
        // load q2 into fv8, use it to get the shuffled reorder into fv0
        register float q2x __asm__ ("fr8")  = (q2.x);
        register float q2y __asm__ ("fr9")  = (q2.y);
        register float q2z __asm__ ("fr10") = (q2.z);
        register float q2w __asm__ ("fr11") = (q2.w);
        // temporary operand / result in fv0
        register float t1x __asm__ ("fr0");
        register float t1y __asm__ ("fr1");
        register float t1z __asm__ ("fr2");
        register float t1w __asm__ ("fr3");
        // x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
        t1x = q2w;
        t1y = q2z;
        t1z = -q2y;
        t1w = q2x;    // pairs with q1.w in the formula above (was q2w, giving a wrong x)
        __asm__ ("\n"
            " fipr	fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        // x = t1w;  try to avoid the stall by not reading the fipr result immediately
        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
        t1x = -q2z;
        t1y = q2w;
        t1z = q2x;
        __atomic_thread_fence(1);
        r->x = t1w;   // get previous result
        t1w = q2y;
        __asm__ ("\n"
            "	fipr	fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        //y = t1w;
        // z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
        t1x = q2y;
        t1y = -q2x;
        t1z = q2w;
        __atomic_thread_fence(1);
        r->y = t1w;   // get previous result
        t1w = q2z;
        __asm__ ("\n"
            "	fipr	fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        //z = t1w;
        __atomic_thread_fence(1);
        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
        q2x = -q2x;
        q2y = -q2y;
        q2z = -q2z;
        __asm__ ("\n"
            "	fipr	fv4,fv8\n"
            : "+f" (q2w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (q2x), "f" (q2y), "f" (q2z)
        );
        __atomic_thread_fence(1);
        r->z = t1w;
        __atomic_thread_fence(1);
        r->w = q2w;
    } else {
        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
    }
 }
 #   else
 #       ifdef DC_TEXCONV
 #           define mat_apply(a)
--- a/vendor/librw/src/rwbase.h
+++ b/vendor/librw/src/rwbase.h
@@ -238,8 +238,8 @@ inline V2d neg(const V2d &a) { return makeV2d(-a.x, -a.y); }
 inline V2d add(const V2d &a, const V2d &b) { return makeV2d(a.x+b.x, a.y+b.y); }
 inline V2d sub(const V2d &a, const V2d &b) { return makeV2d(a.x-b.x, a.y-b.y); }
 inline V2d scale(const V2d &a, float32 r) { return makeV2d(a.x*r, a.y*r); }
-inline float32 length(const V2d &v) { return sqrtf(v.x*v.x + v.y*v.y); }
+inline float32 length(const V2d &v) { return dc::Sqrt(v.x*v.x + v.y*v.y); }
-inline V2d normalize(const V2d &v) { return scale(v, 1.0f/length(v)); }
+inline V2d normalize(const V2d &v) { return scale(v, dc::RecipSqrt(v.x*v.x + v.y*v.y)); }
 struct V3d
 {
@@ -265,10 +265,22 @@ inline float32 length(const V3d &v) {
 	return len;
 #endif
 }
-inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); }
+inline V3d normalize(const V3d &v) {
-inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); }
+    float invLen; 
-V3d cross(const V3d &a, const V3d &b);
+#ifndef DC_SH4
-inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) {
+    invLen = 1.0f / length(v);
 #else
    invLen = dc::RecipSqrt(fipr_magnitude_sqr(v.x, v.y, v.z, 0.0f));
 #endif
    return scale(v, invLen); 
 }
 // Returns v scaled by l / length(v), i.e. v rescaled to length l.
 // The quotient is computed with dc::Div<true, false>.
 // NOTE(review): the meaning of the two template flags is not visible here -- confirm
 // how a zero-length v is handled.
 inline V3d setlength(const V3d &v, float32 l) { return scale(v, dc::Div<true, false>(l, length(v))); }
 // Cross product of a and b, returned as a new V3d built with makeV3d.
 inline V3d cross(const V3d &a, const V3d &b) {
    const float32 cx = a.y*b.z - a.z*b.y;
    const float32 cy = a.z*b.x - a.x*b.z;
    const float32 cz = a.x*b.y - a.y*b.x;
    return makeV3d(cx, cy, cz);
 }
 inline float32 dot(const V3d &a, const V3d &b) {
 #ifdef DC_SH4
 	return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f);
 #else
@@ -329,12 +341,33 @@ inline float32 length(const Quat &q) {
 #ifndef DC_SH4
 	return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
 #else
 	// The quaternion norm includes w, exactly as in the portable branch above;
 	// passing 0.0f as the fourth component would drop w*w from the sum.
-	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
+	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
 #endif
 }
-inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
+// Returns q scaled by the reciprocal of its length (all four components x, y, z, w).
+// Portable build: 1 / length(q). DC_SH4 build: reciprocal square root of the squared
+// magnitude; w must be part of that magnitude or the result is not unit length
+// whenever w != 0.
+inline Quat normalize(const Quat &q) {
    float invLen;
 #ifndef DC_SH4
    invLen = 1.0f / length(q);
 #else
    invLen = dc::RecipSqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
 #endif
    return scale(q, invLen);
 }
 inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
-Quat mult(const Quat &q, const Quat &p);
+// Quaternion product of q and p (q on the left), defined inline in the header.
+// Portable build: expands the product term by term; the first makeQuat argument is the
+// w (scalar) term, followed by the x, y and z terms.
+// DC_SH4 build: forwards to dc::quat_mult, reinterpreting Quat as dc::quaternion_t
+// (assumes both types share the same member layout -- TODO confirm).
+inline Quat mult(const Quat &q, const Quat &p) {
 #ifndef DC_SH4
 	return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
 	                q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
 	                q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
 	                q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
 #else
    // Separate output object: the result is written into o, never into q or p.
    Quat o;
    dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
 	              reinterpret_cast<const dc::quaternion_t &>(q),
                  reinterpret_cast<const dc::quaternion_t &>(p));
 	return o;
 #endif
 }
 inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); }
 Quat lerp(const Quat &q, const Quat &p, float32 r);
 Quat slerp(const Quat &q, const Quat &p, float32 a);