From 0ba1c595cd8aac94d3312cd3aaa7825a1294c02e Mon Sep 17 00:00:00 2001
From: Falco Girgis <gyrovorbis@gmail.com>
Date: Fri, 28 Mar 2025 00:17:25 -0500
Subject: [PATCH] Acceleration working in miami.

---
 miami/Makefile                    |   1 +
 src/miami/collision/ColLine.h     |   2 +-
 src/miami/collision/ColPoint.h    |   2 +-
 src/miami/collision/ColSphere.h   |   7 +-
 src/miami/collision/Collision.cpp | 133 +++++++++++++++++++++++++++--
 src/miami/math/Matrix.cpp         |  12 +--
 src/miami/math/Matrix.h           |  16 ++--
 src/miami/math/VuVector.h         | 134 ++++++++++++++++++++++++++++--
 src/miami/math/math.cpp           | 113 -------------------------
 src/miami/math/maths.h            |  26 ++++++
 10 files changed, 302 insertions(+), 144 deletions(-)

diff --git a/miami/Makefile b/miami/Makefile
index da7af878..45f11bbd 100644
--- a/miami/Makefile
+++ b/miami/Makefile
@@ -118,6 +118,7 @@ OBJS_NO_FAST_MATH = \
 	../src/miami/core/Cam.o \
 	../src/miami/core/Camera.o \
 	../src/miami/vehicles/Bike.o \
+	../src/miami/vehicles/Boat.o \
 	../src/miami/renderer/Particle.o
 
 KOS_CPPFLAGS += -fbuiltin -ffast-math -ffp-contract=fast \
diff --git a/src/miami/collision/ColLine.h b/src/miami/collision/ColLine.h
index 21587a06..fc942143 100644
--- a/src/miami/collision/ColLine.h
+++ b/src/miami/collision/ColLine.h
@@ -1,6 +1,6 @@
 #pragma once
 
-struct CColLine
+struct alignas(8) CColLine
 {
 	// NB: this has to be compatible with two CVuVectors
 	CVector p0;
diff --git a/src/miami/collision/ColPoint.h b/src/miami/collision/ColPoint.h
index a15b2345..31a9bde4 100644
--- a/src/miami/collision/ColPoint.h
+++ b/src/miami/collision/ColPoint.h
@@ -1,6 +1,6 @@
 #pragma once
 
-struct CColPoint
+struct alignas(8) CColPoint
 {
 	CVector point;
 	int pad1;
diff --git a/src/miami/collision/ColSphere.h b/src/miami/collision/ColSphere.h
index f86b282a..906fc1a3 100644
--- a/src/miami/collision/ColSphere.h
+++ b/src/miami/collision/ColSphere.h
@@ -2,7 +2,7 @@
 
 #include "SurfaceTable.h"
 
-struct CSphere
+struct alignas(8) CSphere
 {
 	// NB: this has to be compatible with a CVuVector
 	CVector center;
@@ -15,6 +15,11 @@ struct CColSphere : public CSphere
 	uint8 surface;
 	uint8 piece;
 
+	void Set(float radius, uint8 surf = SURFACE_DEFAULT, uint8 piece = 0) {	// sets radius, surface and piece only; center is left untouched
+		this->radius = radius;
+		this->surface = surf;
+		this->piece = piece;	// this-> is needed here: the parameter shadows the member of the same name
+	}
 	void Set(float radius, const CVector &center, uint8 surf, uint8 piece);
 	bool IntersectRay(CVector const &from, CVector const &dir, CVector &entry, CVector &exit);
 	using CSphere::Set;
diff --git a/src/miami/collision/Collision.cpp b/src/miami/collision/Collision.cpp
index f39f3f35..f4627f8e 100644
--- a/src/miami/collision/Collision.cpp
+++ b/src/miami/collision/Collision.cpp
@@ -24,6 +24,10 @@
 #include "Camera.h"
 #include "ColStore.h"
 
+#ifdef DC_SH4
+#include "VuCollision.h"
+#endif
+
 #ifdef VU_COLLISION
 #include "VuCollision.h"
 
@@ -572,7 +576,12 @@ CCollision::TestLineOfSight(const CColLine &line, const CMatrix &matrix, CColMod
 
 	// transform line to model space
 	Invert(matrix, matTransform);
-	CColLine newline(matTransform * line.p0, matTransform * line.p1);
+	CColLine newline;
+#ifndef DC_SH4
+	newline.Set(matTransform * line.p0, matTransform * line.p1);
+#else
+	TransformPoints(reinterpret_cast<CVuVector*>(&newline), 2, matTransform, &line.p0, sizeof(CColLine)/2);
+#endif
 
 	// If we don't intersect with the bounding box, no chance on the rest
 	if(!TestLineBox(newline, model.boundingBox))
@@ -1428,7 +1437,12 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 
 	// transform line to model space
 	Invert(matrix, matTransform);
-	CColLine newline(matTransform * line.p0, matTransform * line.p1);
+	CColLine newline;
+#ifdef DC_SH4
+	TransformPoints(reinterpret_cast<CVuVector*>(&newline), 2, matTransform, &line.p0, sizeof(CColLine)/2);
+#else
+	newline.Set(matTransform * line.p0, matTransform * line.p1);
+#endif
 
 	// If we don't intersect with the bounding box, no chance on the rest
 	if(!TestLineBox(newline, model.boundingBox))
@@ -1455,9 +1469,18 @@ CCollision::ProcessLineOfSight(const CColLine &line,
 	}
 
 	if(coldist < mindist){
+#ifndef DC_SH4
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
-		mindist = coldist;
+#else
+		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		mat_trans_single3_nodiv(point.point.x, point.point.y, point.point.z);
+		mat_trans_normal3(point.normal.x, point.normal.y, point.normal.z);
+#endif
+		// Publish the new closest distance on every platform, as
+		// ProcessVerticalLine does; otherwise mindist is never lowered and
+		// farther hits tested later would still pass coldist < mindist.
+		mindist = coldist;
 		return true;
 	}
 	return false;
@@ -1593,7 +1616,14 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 
 	// transform line to model space
 	// Why does the game seem to do this differently than above?
-	CColLine newline(MultiplyInverse(matrix, line.p0), MultiplyInverse(matrix, line.p1));
+	CMatrix matTransform;
+	Invert(matrix, matTransform);
+	CColLine newline;
+#ifndef DC_SH4
+	newline.Set(matTransform * line.p0, matTransform * line.p1);
+#else
+	TransformPoints(reinterpret_cast<CVuVector*>(&newline), 2, matTransform, &line.p0, sizeof(CColLine)/2);
+#endif
 
 	if(!TestLineBox(newline, model.boundingBox))
 		return false;
@@ -1618,13 +1648,29 @@ CCollision::ProcessVerticalLine(const CColLine &line,
 	}
 
 	if(coldist < mindist){
+#ifndef DC_SH4
 		point.point = matrix * point.point;
 		point.normal = Multiply3x3(matrix, point.normal);
+#else
+		mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrix)));
+		mat_trans_single3_nodiv(point.point.x,
+		                        point.point.y,
+		                        point.point.z);
+		mat_trans_normal3(point.normal.x,
+		                  point.normal.y,
+		                  point.normal.z);
+#endif
 		if(TempStoredPoly.valid && poly){
 			*poly = TempStoredPoly;
+#ifndef DC_SH4
 			poly->verts[0] = matrix * poly->verts[0];
 			poly->verts[1] = matrix * poly->verts[1];
 			poly->verts[2] = matrix * poly->verts[2];
+#else
+			mat_trans_single3_nodiv(poly->verts[0].x, poly->verts[0].y, poly->verts[0].z);
+			mat_trans_single3_nodiv(poly->verts[1].x, poly->verts[1].y, poly->verts[1].z);
+			mat_trans_single3_nodiv(poly->verts[2].x, poly->verts[2].y, poly->verts[2].z);
+#endif
 		}
 		mindist = coldist;
 		return true;
@@ -1981,21 +2027,52 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 
 	CColSphere bsphereAB;	// bounding sphere of A in B space
 	bsphereAB.radius = modelA.boundingSphere.radius;
+#ifndef DC_SH4
 	bsphereAB.center = matAB * modelA.boundingSphere.center;
+#else
+	/* No need to reload the matrix, since it's already banked.
+	mat_load(reinterpret_cast<const matrix_t *>(&matAB)); */
+	mat_trans_single3_nodiv_nomod(modelA.boundingSphere.center.x,
+	                              modelA.boundingSphere.center.y,
+	                              modelA.boundingSphere.center.z,
+	                              bsphereAB.center.x,
+	                              bsphereAB.center.y,
+	                              bsphereAB.center.z);
+#endif
 	if(!TestSphereBox(bsphereAB, modelB.boundingBox))
 		return 0;
-	// B to A space
-	matBA = Invert(matrixA, matBA);
-	matBA *= matrixB;
 
 	// transform modelA's spheres and lines to B space
 	for(i = 0; i < modelA.numSpheres; i++){
 		CColSphere &s = modelA.spheres[i];
+#ifndef DC_SH4
 		aSpheresA[i].Set(s.radius, matAB * s.center, s.surface, s.piece);
+#else	// no mat_load here: this path relies on matAB still being the loaded matrix
+		auto &d = aSpheresA[i];
+		mat_trans_single3_nodiv_nomod(s.center.x, s.center.y, s.center.z,
+		                              d.center.x, d.center.y, d.center.z);
+		d.Set(s.radius, s.surface, s.piece);
+#endif
 	}
-	for(i = 0; i < modelA.numLines; i++)
-		aLinesA[i].Set(matAB * modelA.lines[i].p0, matAB * modelA.lines[i].p1);
 
+	for(i = 0; i < modelA.numLines; i++) {
+#ifndef DC_SH4
+		aLinesA[i].Set(matAB * modelA.lines[i].p0, matAB * modelA.lines[i].p1);
+#else
+		mat_trans_single3_nodiv_nomod(modelA.lines[i].p0.x,
+		                              modelA.lines[i].p0.y,
+		                              modelA.lines[i].p0.z,
+		                              aLinesA[i].p0.x,
+		                              aLinesA[i].p0.y,
+		                              aLinesA[i].p0.z);
+		mat_trans_single3_nodiv_nomod(modelA.lines[i].p1.x,
+		                              modelA.lines[i].p1.y,
+		                              modelA.lines[i].p1.z,
+		                              aLinesA[i].p1.x,
+		                              aLinesA[i].p1.y,
+		                              aLinesA[i].p1.z);
+#endif
+	}
 	// Test them against model B's bounding volumes
 	int numSpheresA = 0;
 	int numLinesA = 0;
@@ -2013,9 +2090,25 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	int numSpheresB = 0;
 	int numBoxesB = 0;
 	int numTrianglesB = 0;
+	// B to A space
+	matBA = Invert(matrixA, matBA);
+	matBA *= matrixB;
+#ifdef DC_SH4
+	/* No need to reload the matrix, since it's already banked.
+	mat_load(reinterpret_cast<const matrix_t *>(&matBA)); */
+#endif
 	for(i = 0; i < modelB.numSpheres; i++){
 		s.radius = modelB.spheres[i].radius;
+#ifndef DC_SH4
 		s.center = matBA * modelB.spheres[i].center;
+#else
+		mat_trans_single3_nodiv_nomod(modelB.spheres[i].center.x,
+		                              modelB.spheres[i].center.y,
+		                              modelB.spheres[i].center.z,
+		                              s.center.x,
+		                              s.center.y,
+		                              s.center.z);
+#endif
 		if(TestSphereBox(s, modelA.boundingBox))
 			aSphereIndicesB[numSpheresB++] = i;
 	}
@@ -2062,9 +2155,22 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 		if(hasCollided)
 			numCollisions++;
 	}
+
+#ifdef DC_SH4
+	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&matrixB)));
+#endif
 	for(i = 0; i < numCollisions; i++){
+#ifndef DC_SH4
 		spherepoints[i].point = matrixB * spherepoints[i].point;
 		spherepoints[i].normal = Multiply3x3(matrixB, spherepoints[i].normal);
+#else
+		mat_trans_single3_nodiv(spherepoints[i].point.x,
+		                        spherepoints[i].point.y,
+		                        spherepoints[i].point.z);
+		mat_trans_normal3(spherepoints[i].normal.x,
+		                  spherepoints[i].normal.y,
+		                  spherepoints[i].normal.z);
+#endif
 	}
 
 	// And the same thing for the lines in A
@@ -2095,8 +2201,17 @@ CCollision::ProcessColModels(const CMatrix &matrixA, CColModel &modelA,
 	for(i = 0; i < numLinesA; i++)
 		if(aCollided[i]){
 			j = aLineIndicesA[i];
+#ifndef DC_SH4
 			linepoints[j].point = matrixB * linepoints[j].point;
 			linepoints[j].normal = Multiply3x3(matrixB, linepoints[j].normal);
+#else
+			mat_trans_single3_nodiv(linepoints[j].point.x,
+			                        linepoints[j].point.y,
+			                        linepoints[j].point.z);
+			mat_trans_normal3(linepoints[j].normal.x,
+			                  linepoints[j].normal.y,
+			                  linepoints[j].normal.z);
+#endif
 		}
 
 	return numCollisions;	// sphere collisions
diff --git a/src/miami/math/Matrix.cpp b/src/miami/math/Matrix.cpp
index c0d909cb..62b7029c 100644
--- a/src/miami/math/Matrix.cpp
+++ b/src/miami/math/Matrix.cpp
@@ -1,11 +1,5 @@
 #include "common.h"
 
-CMatrix::CMatrix(void)
-{
-	m_attachment = nil;
-	m_hasRwMatrix = false;
-}
-
 CMatrix::CMatrix(CMatrix const &m)
 {
 	m_attachment = nil;
@@ -434,6 +428,11 @@ operator*(const CMatrix &m1, const CMatrix &m2)
 {
 	// TODO: VU0 code
 	CMatrix out;
+#if defined(RW_DC)
+	mat_load(reinterpret_cast<const matrix_t *>(&m1));
+	mat_apply(reinterpret_cast<const matrix_t *>(&m2));
+	mat_store(reinterpret_cast<matrix_t *>(&out));
+#else
 	out.rx = m1.rx * m2.rx + m1.fx * m2.ry + m1.ux * m2.rz;
 	out.ry = m1.ry * m2.rx + m1.fy * m2.ry + m1.uy * m2.rz;
 	out.rz = m1.rz * m2.rx + m1.fz * m2.ry + m1.uz * m2.rz;
@@ -446,6 +445,7 @@ operator*(const CMatrix &m1, const CMatrix &m2)
 	out.px = m1.rx * m2.px + m1.fx * m2.py + m1.ux * m2.pz + m1.px;
 	out.py = m1.ry * m2.px + m1.fy * m2.py + m1.uy * m2.pz + m1.py;
 	out.pz = m1.rz * m2.px + m1.fz * m2.py + m1.uz * m2.pz + m1.pz;
+#endif
 	return out;
 }
 
diff --git a/src/miami/math/Matrix.h b/src/miami/math/Matrix.h
index 0adcf32c..39a476d6 100644
--- a/src/miami/math/Matrix.h
+++ b/src/miami/math/Matrix.h
@@ -1,6 +1,6 @@
 #pragma once
 
-class CMatrix
+class alignas(8) CMatrix
 {
 public:
 #ifdef GTA_PS2
@@ -23,18 +23,18 @@ public:
 		float f[4][4];
 		struct
 		{
-			float rx, ry, rz, rw;
-			float fx, fy, fz, fw;
-			float ux, uy, uz, uw;
-			float px, py, pz, pw;
+			float rx, ry, rz, rw=0.0f;
+			float fx, fy, fz, fw=0.0f;
+			float ux, uy, uz, uw=0.0f;
+			float px, py, pz, pw=1.0f;
 		};
 	};
 
-	RwMatrix *m_attachment;
-	bool m_hasRwMatrix;	// are we the owner?
+	RwMatrix *m_attachment=nil;
+	bool m_hasRwMatrix=false;	// are we the owner?
 #endif
 
-	CMatrix(void);
+	CMatrix(void)=default;
 	CMatrix(CMatrix const &m);
 	CMatrix(RwMatrix *matrix, bool owner = false);
 	CMatrix(float scale){
diff --git a/src/miami/math/VuVector.h b/src/miami/math/VuVector.h
index 41584095..2e9fbc28 100644
--- a/src/miami/math/VuVector.h
+++ b/src/miami/math/VuVector.h
@@ -1,6 +1,14 @@
 #pragma once
 
-class TYPEALIGN(16) CVuVector : public CVector
+#include "maths.h"
+
+#ifdef RW_DC
+#define VECTOR_ALIGN 8
+#else
+#define VECTOR_ALIGN 16
+#endif
+
+class TYPEALIGN(VECTOR_ALIGN) CVuVector : public CVector
 {
 public:
 	float w;
@@ -26,7 +34,123 @@ public:
 	// TODO: operator-
 };
 
-void TransformPoint(CVuVector &out, const CMatrix &mat, const CVuVector &in);
-void TransformPoint(CVuVector &out, const CMatrix &mat, const RwV3d &in);
-void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride);
-void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const CVuVector *in);
+__always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const CVuVector &in)
+{	// Transforms the point `in` by `mat` and stores the result in `out`.
+#ifdef GTA_PS2
+	__asm__ __volatile__("\n\
+		lqc2    vf01,0x0(%2)\n\
+		lqc2    vf02,0x0(%1)\n\
+		lqc2    vf03,0x10(%1)\n\
+		lqc2    vf04,0x20(%1)\n\
+		lqc2    vf05,0x30(%1)\n\
+		vmulax.xyz	ACC,   vf02,vf01\n\
+		vmadday.xyz	ACC,   vf03,vf01\n\
+		vmaddaz.xyz	ACC,   vf04,vf01\n\
+		vmaddw.xyz	vf06,vf05,vf00\n\
+		sqc2    vf06,0x0(%0)\n\
+		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
+#elif defined(DC_SH4)	// load mat, then transform (in.x, in.y, in.z, 1) through the macro
+	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	mat_trans_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z, out.w);	// 7th argument receives w; passing out.y there would overwrite y
+#else
+	out = mat * in;
+#endif
+}
+
+__always_inline void TransformPoint(CVuVector &out, const CMatrix &mat, const RwV3d &in)
+{	// Transforms the RwV3d point `in` by `mat` and stores the result in `out`.
+#ifdef GTA_PS2
+	__asm__ __volatile__("\n\
+		ldr	$8,0x0(%2)\n\
+		ldl	$8,0x7(%2)\n\
+		lw	$9,0x8(%2)\n\
+		pcpyld	$10,$9,$8\n\
+		qmtc2	$10,vf01\n\
+		lqc2    vf02,0x0(%1)\n\
+		lqc2    vf03,0x10(%1)\n\
+		lqc2    vf04,0x20(%1)\n\
+		lqc2    vf05,0x30(%1)\n\
+		vmulax.xyz	ACC,   vf02,vf01\n\
+		vmadday.xyz	ACC,   vf03,vf01\n\
+		vmaddaz.xyz	ACC,   vf04,vf01\n\
+		vmaddw.xyz	vf06,vf05,vf00\n\
+		sqc2    vf06,0x0(%0)\n\
+		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
+#elif defined(DC_SH4)	// load mat, then transform (in.x, in.y, in.z, 1) through the macro
+	mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	mat_trans_nodiv_nomod(in.x, in.y, in.z, out.x, out.y, out.z, out.w);	// 7th argument receives w; passing out.y there would overwrite y
+#else
+	out = mat * in;
+#endif
+}
+
+__always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
+{	// Transforms n points by mat; inputs are `stride` bytes apart, outputs are consecutive CVuVectors.
+#ifdef GTA_PS3	// NOTE(review): the sibling functions test GTA_PS2 -- confirm GTA_PS3 is intended here
+	__asm__ __volatile__("\n\
+		paddub	$3,%4,$0\n\
+		lqc2    vf02,0x0(%2)\n\
+		lqc2    vf03,0x10(%2)\n\
+		lqc2    vf04,0x20(%2)\n\
+		lqc2    vf05,0x30(%2)\n\
+		ldr	$8,0x0(%3)\n\
+		ldl	$8,0x7(%3)\n\
+		lw	$9,0x8(%3)\n\
+		pcpyld	$10,$9,$8\n\
+		qmtc2	$10,vf01\n\
+	1:	vmulax.xyz	ACC,   vf02,vf01\n\
+		vmadday.xyz	ACC,   vf03,vf01\n\
+		vmaddaz.xyz	ACC,   vf04,vf01\n\
+		vmaddw.xyz	vf06,vf05,vf00\n\
+		add	%3,%3,$3\n\
+		ldr	$8,0x0(%3)\n\
+		ldl	$8,0x7(%3)\n\
+		lw	$9,0x8(%3)\n\
+		pcpyld	$10,$9,$8\n\
+		qmtc2	$10,vf01\n\
+		addi	%1,%1,-1\n\
+		addiu	%0,%0,0x10\n\
+		sqc2    vf06,-0x10(%0)\n\
+		bnez	%1,1b\n\
+		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
+#elif defined(DC_SH4)	// load mat once, then transform each point in turn
+    mat_load(reinterpret_cast<matrix_t *>(const_cast<CMatrix *>(&mat)));
+	while(n--) {	// only out->x/y/z are passed to the macro; out->w is not written on this path
+		mat_trans_single3_nodiv_nomod(in->x, in->y, in->z, out->x, out->y, out->z);
+		in = reinterpret_cast<const RwV3d *>(reinterpret_cast<const uint8_t *>(in) + stride);	// advance by bytes, not by elements
+		++out;
+	}
+#else	// portable path: one matrix * vector product per point
+	while(n--){
+		*out = mat * *in;
+		in = (RwV3d*)((uint8*)in + stride);
+		out++;
+	}
+#endif
+}
+
+__always_inline void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const CVuVector *in)
+{	// Transforms n consecutive CVuVectors from `in` by mat into `out`.
+#ifdef GTA_PS2
+	__asm__ __volatile__("\n\
+		lqc2    vf02,0x0(%2)\n\
+		lqc2    vf03,0x10(%2)\n\
+		lqc2    vf04,0x20(%2)\n\
+		lqc2    vf05,0x30(%2)\n\
+		lqc2    vf01,0x0(%3)\n\
+		nop\n\
+	1:	vmulax.xyz	ACC,   vf02,vf01\n\
+		vmadday.xyz	ACC,   vf03,vf01\n\
+		vmaddaz.xyz	ACC,   vf04,vf01\n\
+		vmaddw.xyz	vf06,vf05,vf00\n\
+		lqc2	vf01,0x10(%3)\n\
+		addiu	%3,%3,0x10\n\
+		addi	%1,%1,-1\n\
+		addiu	%0,%0,0x10\n\
+		sqc2    vf06,-0x10(%0)\n\
+		bnez	%1,1b\n\
+		": : "r" (out) , "r" (n), "r" (&mat) ,"r" (in): "memory");
+#else	// every non-PS2 build forwards to the strided overload, stepping one CVuVector at a time
+    TransformPoints(out, n, mat, in, sizeof(CVuVector));
+#endif
+}
diff --git a/src/miami/math/math.cpp b/src/miami/math/math.cpp
index 8cb56dab..75b6def8 100644
--- a/src/miami/math/math.cpp
+++ b/src/miami/math/math.cpp
@@ -3,116 +3,3 @@
 #include "VuVector.h"
 
 // TODO: move more stuff into here
-
-
-void TransformPoint(CVuVector &out, const CMatrix &mat, const CVuVector &in)
-{
-#ifdef GTA_PS2
-	__asm__ __volatile__("\n\
-		lqc2    vf01,0x0(%2)\n\
-		lqc2    vf02,0x0(%1)\n\
-		lqc2    vf03,0x10(%1)\n\
-		lqc2    vf04,0x20(%1)\n\
-		lqc2    vf05,0x30(%1)\n\
-		vmulax.xyz	ACC,   vf02,vf01\n\
-		vmadday.xyz	ACC,   vf03,vf01\n\
-		vmaddaz.xyz	ACC,   vf04,vf01\n\
-		vmaddw.xyz	vf06,vf05,vf00\n\
-		sqc2    vf06,0x0(%0)\n\
-		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
-#else
-	out = mat * in;
-#endif
-}
-
-void TransformPoint(CVuVector &out, const CMatrix &mat, const RwV3d &in)
-{
-#ifdef GTA_PS2
-	__asm__ __volatile__("\n\
-		ldr	$8,0x0(%2)\n\
-		ldl	$8,0x7(%2)\n\
-		lw	$9,0x8(%2)\n\
-		pcpyld	$10,$9,$8\n\
-		qmtc2	$10,vf01\n\
-		lqc2    vf02,0x0(%1)\n\
-		lqc2    vf03,0x10(%1)\n\
-		lqc2    vf04,0x20(%1)\n\
-		lqc2    vf05,0x30(%1)\n\
-		vmulax.xyz	ACC,   vf02,vf01\n\
-		vmadday.xyz	ACC,   vf03,vf01\n\
-		vmaddaz.xyz	ACC,   vf04,vf01\n\
-		vmaddw.xyz	vf06,vf05,vf00\n\
-		sqc2    vf06,0x0(%0)\n\
-		": : "r" (&out) , "r" (&mat) ,"r" (&in): "memory");
-#else
-	out = mat * in;
-#endif
-}
-
-void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const RwV3d *in, int stride)
-{
-#ifdef GTA_PS3
-	__asm__ __volatile__("\n\
-		paddub	$3,%4,$0\n\
-		lqc2    vf02,0x0(%2)\n\
-		lqc2    vf03,0x10(%2)\n\
-		lqc2    vf04,0x20(%2)\n\
-		lqc2    vf05,0x30(%2)\n\
-		ldr	$8,0x0(%3)\n\
-		ldl	$8,0x7(%3)\n\
-		lw	$9,0x8(%3)\n\
-		pcpyld	$10,$9,$8\n\
-		qmtc2	$10,vf01\n\
-	1:	vmulax.xyz	ACC,   vf02,vf01\n\
-		vmadday.xyz	ACC,   vf03,vf01\n\
-		vmaddaz.xyz	ACC,   vf04,vf01\n\
-		vmaddw.xyz	vf06,vf05,vf00\n\
-		add	%3,%3,$3\n\
-		ldr	$8,0x0(%3)\n\
-		ldl	$8,0x7(%3)\n\
-		lw	$9,0x8(%3)\n\
-		pcpyld	$10,$9,$8\n\
-		qmtc2	$10,vf01\n\
-		addi	%1,%1,-1\n\
-		addiu	%0,%0,0x10\n\
-		sqc2    vf06,-0x10(%0)\n\
-		bnez	%1,1b\n\
-		": : "r" (out) , "r" (n), "r" (&mat), "r" (in), "r" (stride): "memory");
-#else
-	while(n--){
-		*out = mat * *in;
-		in = (RwV3d*)((uint8*)in + stride);
-		out++;
-	}
-#endif
-}
-
-void TransformPoints(CVuVector *out, int n, const CMatrix &mat, const CVuVector *in)
-{
-#ifdef GTA_PS2
-	__asm__ __volatile__("\n\
-		lqc2    vf02,0x0(%2)\n\
-		lqc2    vf03,0x10(%2)\n\
-		lqc2    vf04,0x20(%2)\n\
-		lqc2    vf05,0x30(%2)\n\
-		lqc2    vf01,0x0(%3)\n\
-		nop\n\
-	1:	vmulax.xyz	ACC,   vf02,vf01\n\
-		vmadday.xyz	ACC,   vf03,vf01\n\
-		vmaddaz.xyz	ACC,   vf04,vf01\n\
-		vmaddw.xyz	vf06,vf05,vf00\n\
-		lqc2	vf01,0x10(%3)\n\
-		addiu	%3,%3,0x10\n\
-		addi	%1,%1,-1\n\
-		addiu	%0,%0,0x10\n\
-		sqc2    vf06,-0x10(%0)\n\
-		bnez	%1,1b\n\
-		": : "r" (out) , "r" (n), "r" (&mat) ,"r" (in): "memory");
-#else
-	while(n--){
-		*out = mat * *in;
-		in++;
-		out++;
-	}
-#endif
-}
diff --git a/src/miami/math/maths.h b/src/miami/math/maths.h
index 6a228036..a4701d3e 100644
--- a/src/miami/math/maths.h
+++ b/src/miami/math/maths.h
@@ -1,5 +1,31 @@
 #pragma once
 
+#include "src/common_defines.h"
+
+
+#ifdef DC_SH4
+// Transforms (x, y, z, 1) with one ftrv against XMTRX and writes the four results to x2, y2, z2, w2 (in that order); inputs are copied to registers first.
+#define mat_trans_nodiv_nomod(x, y, z, x2, y2, z2, w2) do { \
+        register float __x __asm__("fr12") = (x); \
+        register float __y __asm__("fr13") = (y); \
+        register float __z __asm__("fr14") = (z); \
+        register float __w __asm__("fr15") = 1.0f; \
+        __asm__ __volatile__( "ftrv  xmtrx, fv12\n" \
+                              : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) \
+                              : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); \
+        x2 = __x; y2 = __y; z2 = __z; w2 = __w; \
+    } while(false)
+
+#else
+// Non-SH4 fallback: transforms (x_, y_, z_, 1) via mat_transform() on a temporary. NOTE(review): confirm mat_transform() applies no perspective divide.
+#define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \
+		vector_t __tmp = { (x_), (y_), (z_), 1.0f }; \
+		mat_transform(&__tmp, &__tmp, 1, 0); \
+		(x2) = __tmp.x; (y2) = __tmp.y; (z2) = __tmp.z; (w2) = __tmp.w; \
+	} while(false)
+#endif
+
+
 // wrapper around float versions of functions
 // in gta they are in CMaths but that makes the code rather noisy