From 2361fcd88298feeb114ee708140b7a6401ae62a6 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sun, 27 Apr 2025 14:40:30 -0500 Subject: [PATCH 1/6] Collision + Vec optimizations (liberty only). --- src/liberty/core/World.cpp | 28 +++++++++++++-- src/liberty/core/ZoneCull.cpp | 57 +++++++++++++++++++++++-------- src/liberty/math/Vector.cpp | 3 +- vendor/librw/src/dc/rwdc_common.h | 40 +++++++++++++++++++++- 4 files changed, 107 insertions(+), 21 deletions(-) diff --git a/src/liberty/core/World.cpp b/src/liberty/core/World.cpp index 1c34a633..17a3532f 100644 --- a/src/liberty/core/World.cpp +++ b/src/liberty/core/World.cpp @@ -1215,6 +1215,9 @@ CWorld::FindObjectsIntersectingAngledCollisionBox(const CColBox &boundingBox, co const int32 nStartY = Max(GetSectorIndexY(fStartY), 0); const int32 nEndX = Min(GetSectorIndexX(fEndX), NUMSECTORS_X - 1); const int32 nEndY = Min(GetSectorIndexY(fEndY), NUMSECTORS_Y - 1); +#ifdef DC_SH4 + mat_load_transpose(matrix); +#endif for(int32 y = nStartY; y <= nEndY; y++) { for(int32 x = nStartX; x <= nEndX; x++) { CSector *pSector = GetSector(x, y); @@ -1268,14 +1271,19 @@ CWorld::FindObjectsIntersectingAngledCollisionBoxSectorList(CPtrList &list, cons int16 *nEntitiesFound, int16 maxEntitiesToFind, CEntity **aEntities) { - for(CPtrNode *pNode = list.first; pNode; pNode = pNode->next) { + for(CPtrNode *pNode = list.first; pNode; pNode = pNode->next) { CEntity *pEntity = (CEntity *)pNode->item; if(pEntity->m_scanCode != GetCurrentScanCode()) { pEntity->m_scanCode = GetCurrentScanCode(); CColSphere sphere; CVector vecDistance = pEntity->GetPosition() - position; sphere.radius = pEntity->GetBoundRadius(); +#ifndef DC_SH4 sphere.center = Multiply3x3(vecDistance, matrix); +#else // Transposed matrix was already loaded by the callee, so no need to reload! + mat_trans_normal3_nomod(vecDistance.x, vecDistance.y, vecDistance.z, + sphere.center.x, sphere.center.y, sphere.center.z); +#endif if(CCollision::TestSphereBox(sphere, boundingBox) && *nEntitiesFound < maxEntitiesToFind) { if(aEntities) aEntities[*nEntitiesFound] = pEntity; ++*nEntitiesFound; @@ -1444,13 +1452,27 @@ CWorld::CallOffChaseForAreaSectorListVehicles(CPtrList &list, float x1, float y1 pVehicle->AutoPilot.m_nTimeTempAction = CTimer::GetTimeInMilliseconds() + 2000; CColModel *pColModel = pVehicle->GetColModel(); bool bInsideSphere = false; +#ifdef DC_SH4 + mat_load2(pVehicle->GetMatrix()); +#endif for(int32 i = 0; i < pColModel->numSpheres; i++) { +#ifndef DC_SH4 CVector pos = pVehicle->GetMatrix() * pColModel->spheres[i].center; - float fRadius = pColModel->spheres[i].radius; +#else + CVector pos; + auto ¢er = pColModel->spheres[i].center; + mat_trans_single3_nodiv_nomod(center.x, center.y, center.z, + pos.x, pos.y, pos.z); +#endif + float fRadius = pColModel->spheres[i].radius; if(pos.x + fRadius > x1 && pos.x - fRadius < x2 && pos.y + fRadius > y1 && - pos.y - fRadius < y2) + pos.y - fRadius < y2) { bInsideSphere = true; // Maybe break the loop when bInsideSphere is set to true? +#ifdef DC_SH4 // Don't see why not! + break; +#endif + } } if(bInsideSphere) { if(pVehicle->GetPosition().x <= (x1 + x2) * 0.5f) diff --git a/src/liberty/core/ZoneCull.cpp b/src/liberty/core/ZoneCull.cpp index 5a76e5ed..44e3d4b7 100644 --- a/src/liberty/core/ZoneCull.cpp +++ b/src/liberty/core/ZoneCull.cpp @@ -949,11 +949,15 @@ CCullZone::FindTestPoints() if(ElementsY > 32) ElementsY = 32; if(ElementsZ > 32) ElementsZ = 32; Memsize = ElementsX * ElementsY * ElementsZ; - StepX = (maxx-minx)/(ElementsX-1); - StepY = (maxy-miny)/(ElementsY-1); - StepZ = (maxz-minz)/(ElementsZ-1); + StepX = Div(maxx-minx, ElementsX-1); + StepY = Div(maxy-miny, ElementsY-1); + StepZ = Div(maxz-minz, ElementsZ-1); +#ifndef DC_SH4 pMem = new uint8[Memsize]; +#else + pMem = reinterpret_cast(alloca(Memsize)); +#endif memset(pMem, 0, Memsize); // indices of center @@ -1496,13 +1500,28 @@ CCullZone::TestEntityVisibilityFromCullZone(CEntity *entity, float extraDist, CE else boundMaxZ += extraDist; +#ifndef DC_SH4 CVector vecMin = entity->GetMatrix() * CVector(boundMinX, boundMinY, boundMinZ); CVector vecMaxX = entity->GetMatrix() * CVector(boundMaxX, boundMinY, boundMinZ); CVector vecMaxY = entity->GetMatrix() * CVector(boundMinX, boundMaxY, boundMinZ); CVector vecMaxZ = entity->GetMatrix() * CVector(boundMinX, boundMinY, boundMaxZ); - CVector dirx = vecMaxX - vecMin; - CVector diry = vecMaxY - vecMin; - CVector dirz = vecMaxZ - vecMin; +#else + mat_load2(entity->GetMatrix()); + + CVector vecMin, vecMaxX, vecMaxY, vecMaxZ; + mat_trans_single3_nodiv_nomod(boundMinX, boundMinY, boundMinZ, + vecMin.x, vecMin.y, vecMin.z); + mat_trans_single3_nodiv_nomod(boundMaxX, boundMinY, boundMinZ, + vecMaxX.x, vecMaxX.y, vecMaxX.z); + mat_trans_single3_nodiv_nomod(boundMinX, boundMaxY, boundMinZ, + vecMaxY.x, vecMaxY.y, vecMaxY.z); + mat_trans_single3_nodiv_nomod(boundMinX, boundMinY, boundMaxZ, + vecMaxZ.x, vecMaxZ.y, vecMaxZ.z); +#endif + + CVector dirx = vecMaxX - vecMin; + CVector diry = vecMaxY - vecMin; + CVector dirz = vecMaxZ - vecMin; // If building intersects zone at all, it's visible int x, y, z; @@ -1520,22 +1539,30 @@ CCullZone::TestEntityVisibilityFromCullZone(CEntity *entity, float extraDist, CE float distToZone = CalcDistToCullZone(entity->GetPosition().x, entity->GetPosition().y)/15.0f; distToZone = Max(distToZone, 7.0f); - int numX = (boundMaxX - boundMinX)/distToZone + 2.0f; - int numY = (boundMaxY - boundMinY)/distToZone + 2.0f; - int numZ = (boundMaxZ - boundMinZ)/distToZone + 2.0f; + float invDistToZone = Invert(distToZone); + int numX = (boundMaxX - boundMinX)*invDistToZone + 2.0f; + int numY = (boundMaxY - boundMinY)*invDistToZone + 2.0f; + int numZ = (boundMaxZ - boundMinZ)*invDistToZone + 2.0f; - float stepX = 1.0f/(numX-1); - float stepY = 1.0f/(numY-1); - float stepZ = 1.0f/(numZ-1); + float stepX = Invert(numX-1); + float stepY = Invert(numY-1); + float stepZ = Invert(numZ-1); float midX = (boundMaxX + boundMinX)/2.0f; float midY = (boundMaxY + boundMinY)/2.0f; float midZ = (boundMaxZ + boundMinZ)/2.0f; // check both xy planes - for(int i = 0; i < NumTestPoints; i++){ +#ifndef DC_SH4 + CVector mid = entity->GetMatrix() * CVector(midX, midY, midZ); +#else + CVector mid; + mat_trans_single3_nodiv_nomod(midX, midY, midZ, + mid.x, mid.y, mid.z); +#endif + mid.z += 0.1f; + for(int i = 0; i < NumTestPoints; i++){ CVector testPoint = aTestPoints[i]; - CVector mid = entity->GetMatrix() * CVector(midX, midY, midZ); - mid.z += 0.1f; + if(DoThoroughLineTest(testPoint, mid, entity)) return true; diff --git a/src/liberty/math/Vector.cpp b/src/liberty/math/Vector.cpp index 4f5a21ed..797b22d0 100644 --- a/src/liberty/math/Vector.cpp +++ b/src/liberty/math/Vector.cpp @@ -50,8 +50,7 @@ Multiply3x3(const CVector &vec, const CMatrix &mat) mat.ux * vec.x + mat.uy * vec.y + mat.uz * vec.z); #else CVector out; - dc::mat_load2(mat); - mat_transpose(); + dc::mat_load_transpose(mat); mat_trans_normal3_nomod(vec.x, vec.y, vec.z, out.x, out.y, out.z); return out; diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index 9c8f673a..dc532776 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -185,7 +185,7 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) { x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ } while(false) -inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) { +inline __hot __icache_aligned void mat_load2(const matrix_t *mtx) { asm volatile( R"( fschg @@ -208,6 +208,44 @@ inline __hot __icache_aligned void mat_load2(const matrix_t* mtx) { ); } +inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) { + asm volatile( + R"( + frchg + + fmov.s @%[mtx]+, fr0 + + add #32, %[mtx] + pref @%[mtx] + add #-(32 - 4), %[mtx] + + fmov.s @%[mtx]+, fr4 + fmov.s @%[mtx]+, fr8 + fmov.s @%[mtx]+, fr12 + + fmov.s @%[mtx]+, fr1 + fmov.s @%[mtx]+, fr5 + fmov.s @%[mtx]+, fr9 + fmov.s @%[mtx]+, fr13 + + fmov.s @%[mtx]+, fr2 + fmov.s @%[mtx]+, fr6 + fmov.s @%[mtx]+, fr10 + fmov.s @%[mtx]+, fr14 + + fmov.s @%[mtx]+, fr3 + fmov.s @%[mtx]+, fr7 + fmov.s @%[mtx]+, fr11 + fmov.s @%[mtx]+, fr15 + + frchg + )" + : [mtx] "+r" (mtx) + : + : + ); +} + inline __hot __icache_aligned void mat_store2(matrix_t *mtx) { asm volatile( R"( From 056fe395675d6e6d3a017776b0ec63008f3f9615 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sun, 27 Apr 2025 14:59:40 -0500 Subject: [PATCH 2/6] Accelerated rw::RawMatrix layer. --- vendor/librw/src/base.cpp | 23 +++++++++++++++++++---- vendor/librw/src/dc/rwdc.cpp | 4 ++-- vendor/librw/src/rwbase.h | 32 +++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index 9f45b459..dcc01dfd 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -151,8 +151,9 @@ slerp(const Quat &q, const Quat &p, float32 a) float32 phi = acosf(c); if(phi > 0.00001f){ float32 s = sinf(phi); - return add(scale(q1, sinf((1.0f-a)*phi)/s), - scale(p, sinf(a*phi)/s)); + float invS = dc::Invert(s); + return add(scale(q1, sinf((1.0f-a)*phi) * invS), + scale(p, sinf(a*phi) * invS)); } return q1; } @@ -202,6 +203,7 @@ V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) void RawMatrix::mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2) { +#ifndef DC_SH4 dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x + src1->rightw*src2->pos.x; dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y + src1->rightw*src2->pos.y; dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z + src1->rightw*src2->pos.z; @@ -218,11 +220,15 @@ RawMatrix::mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2) dst->pos.y = src1->pos.x*src2->right.y + src1->pos.y*src2->up.y + src1->pos.z*src2->at.y + src1->posw*src2->pos.y; dst->pos.z = src1->pos.x*src2->right.z + src1->pos.y*src2->up.z + src1->pos.z*src2->at.z + src1->posw*src2->pos.z; dst->posw = src1->pos.x*src2->rightw + src1->pos.y*src2->upw + src1->pos.z*src2->atw + src1->posw*src2->posw; +#else + dc::mat_mult(*dst, *src2, *src1); +#endif } void RawMatrix::transpose(RawMatrix *dst, RawMatrix *src) { +#ifndef DC_SH4 dst->right.x = src->right.x; dst->up.x = src->right.y; dst->at.x = src->right.z; @@ -239,18 +245,27 @@ RawMatrix::transpose(RawMatrix *dst, RawMatrix *src) dst->upw = src->pos.y; dst->atw = src->pos.z; dst->posw = src->posw; +#else + dc::mat_load_transpose(*src); + dc::mat_store2(*dst); +#endif } void RawMatrix::setIdentity(RawMatrix *dst) { - static RawMatrix identity = { +#ifndef DC_SH4 + static RawMatrix identity = {{ { 1.0f, 0.0f, 0.0f }, 0.0f, { 0.0f, 1.0f, 0.0f }, 0.0f, { 0.0f, 0.0f, 1.0f }, 0.0f, { 0.0f, 0.0f, 0.0f }, 1.0f - }; + }}; *dst = identity; +#else + dc::mat_identity2(); + dc::mat_store2(*dst); +#endif } // diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index 2849461f..3793f59b 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -3530,12 +3530,12 @@ uploadSkinMatrices(Atomic *a, Matrix* skinMatrices) return skinMatrices[0].identityError() < 0.01f; } -static RawMatrix normal2texcoord = { +static RawMatrix normal2texcoord = {{ { 0.5f / 127, 0.0f, 0.0f }, 0.0f, { 0.0f, -0.5f / 127, 0.0f }, 0.0f, { 0.0f, 0.0f, 1.0f }, 0.0f, { 0.5f, 0.5f, 0.0f }, 1.0f -}; +}}; void uploadEnvMatrix(Frame *frame, RawMatrix *world, matrix_t* envMatrix) diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 8b567709..941fb989 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -339,9 +339,9 @@ inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0. Quat lerp(const Quat &q, const Quat &p, float32 r); Quat slerp(const Quat &q, const Quat &p, float32 a); -struct __attribute__((aligned(8))) RawMatrix +struct alignas(8) RawMatrixBase { - V3d right; + V3d right; float32 rightw; V3d up; float32 upw; @@ -349,6 +349,32 @@ struct __attribute__((aligned(8))) RawMatrix float32 atw; V3d pos; float32 posw; +}; + +struct RawMatrix: public RawMatrixBase +{ + RawMatrix() {} + + RawMatrix(RawMatrixBase &&aggregate): + RawMatrixBase{aggregate} + {} + + RawMatrix(const RawMatrix &rhs) { + *this = rhs; + } + + operator matrix_t *() { + return reinterpret_cast(this); + } + + operator const matrix_t *() const { + return reinterpret_cast(this); + } + + RawMatrix &operator=(const RawMatrix &rhs) { + dc::mat_copy(*this, rhs); + return *this; + } // NB: this is dst = src2*src1, i.e. src1 is applied first, then src2 static void mult(RawMatrix *dst, RawMatrix *src1, RawMatrix *src2); @@ -356,7 +382,7 @@ struct __attribute__((aligned(8))) RawMatrix static void setIdentity(RawMatrix *dst); }; -struct Matrix +struct alignas(8) Matrix { enum Type { TYPENORMAL = 1, From c3454ac7ec4f12ae7f6db536864541fcd87834d6 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sun, 27 Apr 2025 15:25:54 -0500 Subject: [PATCH 3/6] Basic rw::Matrix acceleration. --- vendor/librw/src/base.cpp | 35 +++++++++++++++++----- vendor/librw/src/dc/rwdc.cpp | 12 ++++++++ vendor/librw/src/rwbase.h | 58 +++++++++++++++++++++++++++++------- 3 files changed, 87 insertions(+), 18 deletions(-) diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index dcc01dfd..dd4f168b 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -45,12 +45,17 @@ int32 build = 0xFFFF; bool32 streamAppendFrames = 0; char *debugFile = nil; -static Matrix identMat = { - { 1.0f, 0.0f, 0.0f }, Matrix::IDENTITY|Matrix::TYPEORTHONORMAL, - { 0.0f, 1.0f, 0.0f }, 0, - { 0.0f, 0.0f, 1.0f }, 0, - { 0.0f, 0.0f, 0.0f }, 0 -}; +static Matrix identMat = {{ + .right = { 1.0f, 0.0f, 0.0f }, + .flags = Matrix::IDENTITY|Matrix::TYPEORTHONORMAL, + .pad0 = 0, + .up = { 0.0f, 1.0f, 0.0f }, + .upw = 0.0f, + .at = { 0.0f, 0.0f, 1.0f }, + .atw = 0.0f, + .pos = { 0.0f, 0.0f, 0.0f }, + .posw = 1.0f +}}; // lazy implementation int @@ -174,19 +179,27 @@ void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) { int32 i; - V3d tmp; - for(i = 0; i < n; i++){ +#ifndef DC_SH4 + V3d tmp; + for(i = 0; i < n; i++){ tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x; tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y; tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z; out[i] = tmp; } +#else + dc::mat_load2(*m); + for(i = 0; i < n; i++) + mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z, + out[i].x, out[i].y, out[i].z); +#endif } void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) { int32 i; +#ifndef DC_SH4 V3d tmp; for(i = 0; i < n; i++){ tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x; @@ -194,6 +207,12 @@ V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z; out[i] = tmp; } +#else + dc::mat_load2(*m); + for(i = 0; i < n; i++) + mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z, + out[i].x, out[i].y, out[i].z); +#endif } // diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index 3793f59b..a5ddacbb 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -6247,6 +6247,18 @@ writeNativeSkin(Stream *stream, int32 len, void *object, int32 offset) stream->write8(&skin->numBones, 4); + for(int32 i = 0; i < skin->numBones; i++){ + Matrix &m = *reinterpret_cast( + &skin->inverseMatrices[i * 16]); + + if(m.flags & MatrixBase::IDENTITY_OLD) + m.flags |= MatrixBase::IDENTITY; + m.pad0 = 0; + m.upw = 0.0f; + m.atw = 0.0f; + m.posw = 1.0f; + } + stream->write32(skin->inverseMatrices, skin->numBones*64); return stream; } diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 941fb989..4965f2fd 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -382,7 +382,7 @@ struct RawMatrix: public RawMatrixBase static void setIdentity(RawMatrix *dst); }; -struct alignas(8) Matrix +struct alignas(8) MatrixBase { enum Type { TYPENORMAL = 1, @@ -391,22 +391,60 @@ struct alignas(8) Matrix TYPEMASK = 3 }; enum Flags { - IDENTITY = 0x20000 + IDENTITY = 0x4, + IDENTITY_OLD = 0x20000 }; + + V3d right; + union { + struct { + uint32_t flags: 3 = TYPEORTHONORMAL|IDENTITY; + uint32_t pad0: 29 = 0; + }; + float rightw; + }; + V3d up; + union { + uint32 pad1; + float upw = 0.0f; + }; + V3d at; + union { + uint32 pad2; + float atw = 0.0f; + }; + V3d pos; + union { + uint32 pad3; + float posw = 1.0f; + }; +}; + +struct Matrix: public MatrixBase +{ struct Tolerance { float32 normal; float32 orthogonal; float32 identity; }; - V3d right; - uint32 flags; - V3d up; - uint32 pad1; - V3d at; - uint32 pad2; - V3d pos; - uint32 pad3; + Matrix() {} + + Matrix(MatrixBase &&aggregate): + MatrixBase{aggregate} + {} + + Matrix(const Matrix &rhs) { + *this = rhs; + } + + Matrix &operator=(const RawMatrix &rhs) { + dc::mat_copy(*this, rhs); + return *this; + } + + operator matrix_t *() { return reinterpret_cast(this); } + operator const matrix_t *() const { return reinterpret_cast(this); } static Matrix *create(void); void destroy(void); From f59f84c1334daca71e4ab747eda9c2f22adf578e Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Tue, 29 Apr 2025 10:03:49 -0500 Subject: [PATCH 4/6] Accelerated lots of RW math + Coronas (liberty) - lot of the RW matrix stuff has become accelerated - went through and accelerated liberty's coronas/reflections ! apparently introduced a bug somewhere along the lines that cause boats to freak out and do summersaults when trying to drive. Will resolve later. --- src/liberty/renderer/Coronas.cpp | 33 ++-- vendor/librw/src/base.cpp | 232 ++++++++++++------------- vendor/librw/src/dc/rwdc_common.h | 272 +++++++++++++++++++----------- vendor/librw/src/rwbase.h | 51 +++++- 4 files changed, 343 insertions(+), 245 deletions(-) diff --git a/src/liberty/renderer/Coronas.cpp b/src/liberty/renderer/Coronas.cpp index e9f9e662..7f7de9cc 100644 --- a/src/liberty/renderer/Coronas.cpp +++ b/src/liberty/renderer/Coronas.cpp @@ -300,9 +300,9 @@ CCoronas::Render(void) if(aCoronas[i].fadeAlpha && spriteCoors.z < aCoronas[i].drawDist){ - float recipz = 1.0f/spriteCoors.z; + float recipz = dc::Invert(spriteCoors.z); float fadeDistance = aCoronas[i].drawDist / 2.0f; - float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance; + float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div((spriteCoors.z - fadeDistance), fadeDistance); int totalFade = aCoronas[i].fadeAlpha * distanceFade; if(aCoronas[i].LOScheck) @@ -313,6 +313,7 @@ CCoronas::Render(void) // render corona itself if(aCoronas[i].texture){ float fogscale = CWeather::Foggyness*Min(spriteCoors.z, 40.0f)/40.0f + 1.0f; + float invFogScale = dc::Invert(fogscale); if(CCoronas::aCoronas[i].id == SUN_CORE) spriteCoors.z = 0.95f * RwCameraGetFarClipPlane(Scene.camera); RwRenderStateSet(rwRENDERSTATETEXTURERASTER, RwTextureGetRaster(aCoronas[i].texture)); @@ -328,9 +329,9 @@ CCoronas::Render(void) CSprite::RenderOneXLUSprite(spriteCoors.x, spriteCoors.y, spriteCoors.z, spritew * aCoronas[i].size * wscale, spriteh * aCoronas[i].size * fogscale * hscale, - CCoronas::aCoronas[i].red / fogscale, - CCoronas::aCoronas[i].green / fogscale, - CCoronas::aCoronas[i].blue / fogscale, + CCoronas::aCoronas[i].red * invFogScale, + CCoronas::aCoronas[i].green * invFogScale, + CCoronas::aCoronas[i].blue * invFogScale, totalFade, recipz, 255); @@ -339,9 +340,9 @@ CCoronas::Render(void) spriteCoors.x, spriteCoors.y, spriteCoors.z, spritew * aCoronas[i].size * fogscale, spriteh * aCoronas[i].size * fogscale, - CCoronas::aCoronas[i].red / fogscale, - CCoronas::aCoronas[i].green / fogscale, - CCoronas::aCoronas[i].blue / fogscale, + CCoronas::aCoronas[i].red * invFogScale, + CCoronas::aCoronas[i].green * invFogScale, + CCoronas::aCoronas[i].blue * invFogScale, totalFade, recipz, 20.0f * recipz, @@ -365,7 +366,7 @@ CCoronas::Render(void) (spriteCoors.x - (screenw/2)) * flare->position + (screenw/2), (spriteCoors.y - (screenh/2)) * flare->position + (screenh/2), spriteCoors.z, - 4.0f*flare->size * spritew/spriteh, + 4.0f*flare->size * dc::Div(spritew, spriteh), 4.0f*flare->size, (flare->red * aCoronas[i].red)>>8, (flare->green * aCoronas[i].green)>>8, @@ -480,9 +481,9 @@ CCoronas::RenderReflections(void) drawDist = Min(drawDist, 55.0f); if(spriteCoors.z < drawDist){ float fadeDistance = drawDist / 2.0f; - float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance; + float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - Div((spriteCoors.z - fadeDistance), fadeDistance); distanceFade = Clamp(distanceFade, 0.0f, 1.0f); - float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera); + float recipz = dc::Invert(RwCameraGetNearClipPlane(Scene.camera)); float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f; int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads; @@ -606,7 +607,9 @@ CEntity::ProcessLightsForEntity(void) flashTimer1 = 0; flashTimer2 = 0; flashTimer3 = 0; - +#ifdef DC_SH4 + dc:mat_load2(GetMatrix()); +#endif n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects(); for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){ effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i); @@ -614,8 +617,12 @@ CEntity::ProcessLightsForEntity(void) if(effect->type != EFFECT_LIGHT) continue; +#ifndef DC_SH4 pos = GetMatrix() * effect->pos; - +#else + mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z, + pos.x, pos.y, pos.z); +#endif lightOn = false; lightFlickering = false; switch(effect->light.lightType){ diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index dd4f168b..2e3a22cc 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -91,24 +91,6 @@ strncmp_ci(const char *s1, const char *s2, int n) return 0; } -Quat -mult(const Quat &q, const Quat &p) -{ -#ifndef DC_SH4 - return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, - q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, - q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, - q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); -#else - Quat o; - dc::quat_mult(reinterpret_cast(&o), - reinterpret_cast(q), - reinterpret_cast(p)); - return o; -#endif -} - - Quat* Quat::rotate(const V3d *axis, float32 angle, CombineOp op) { @@ -166,53 +148,39 @@ slerp(const Quat &q, const Quat &p, float32 a) // // V3d // - -V3d -cross(const V3d &a, const V3d &b) -{ - return makeV3d(a.y*b.z - a.z*b.y, - a.z*b.x - a.x*b.z, - a.x*b.y - a.y*b.x); +void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) { + int32 i; + #ifndef DC_SH4 + V3d tmp; + for(i = 0; i < n; i++){ + tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x; + tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y; + tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z; + out[i] = tmp; + } + #else + dc::mat_load2(*m); + for(i = 0; i < n; i++) + mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z, + out[i].x, out[i].y, out[i].z); + #endif } - -void -V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) -{ - int32 i; -#ifndef DC_SH4 - V3d tmp; - for(i = 0; i < n; i++){ - tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x; - tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y; - tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z; - out[i] = tmp; - } -#else - dc::mat_load2(*m); - for(i = 0; i < n; i++) - mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z, - out[i].x, out[i].y, out[i].z); -#endif -} - -void -V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) -{ - int32 i; -#ifndef DC_SH4 - V3d tmp; - for(i = 0; i < n; i++){ - tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x; - tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y; - tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z; - out[i] = tmp; - } -#else - dc::mat_load2(*m); - for(i = 0; i < n; i++) - mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z, - out[i].x, out[i].y, out[i].z); -#endif +void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) { + int32 i; + #ifndef DC_SH4 + V3d tmp; + for(i = 0; i < n; i++){ + tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x; + tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y; + tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z; + out[i] = tmp; + } + #else + dc::mat_load2(*m); + for(i = 0; i < n; i++) + mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z, + out[i].x, out[i].y, out[i].z); + #endif } // @@ -343,9 +311,10 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2) *dst = *src2; else if(src2->flags & IDENTITY) *dst = *src1; - else{ + else { + uint8_t flags = src1->flags & src2->flags; mult_(dst, src1, src2); - dst->flags = src1->flags & src2->flags; + dst->flags = flags; } return dst; } @@ -366,7 +335,8 @@ Matrix::invert(Matrix *dst, const Matrix *src) Matrix* Matrix::transpose(Matrix *dst, const Matrix *src) { - if(src->flags & IDENTITY) +#ifndef DC_SH4 + if(src->flags & IDENTITY) *dst = *src; dst->right.x = src->right.x; dst->up.x = src->right.y; @@ -380,25 +350,31 @@ Matrix::transpose(Matrix *dst, const Matrix *src) dst->pos.x = 0.0; dst->pos.y = 0.0; dst->pos.z = 0.0; +#else + if(src->flags & IDENTITY) + *dst = *src; + else { + dc::mat_load_transpose(*src); + dc::mat_store2(*dst); + } +#endif return dst; } Matrix* Matrix::rotate(const V3d *axis, float32 angle, CombineOp op) { - Matrix tmp, rot; - makeRotation(&rot, axis, angle); + Matrix rot; + makeRotation(&rot, axis, angle); switch(op){ case COMBINEREPLACE: *this = rot; break; case COMBINEPRECONCAT: - mult(&tmp, &rot, this); - *this = tmp; + mult(this, &rot, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &rot); - *this = tmp; + mult(this, this, &rot); break; } return this; @@ -407,27 +383,25 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op) Matrix* Matrix::rotate(const Quat &q, CombineOp op) { - Matrix tmp, rot; - makeRotation(&rot, q); + Matrix rot; + makeRotation(&rot, q); switch(op){ case COMBINEREPLACE: *this = rot; break; case COMBINEPRECONCAT: - mult(&tmp, &rot, this); - *this = tmp; + mult(this, &rot, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &rot); - *this = tmp; + mult(this, this, &rot); break; } return this; } + Matrix* Matrix::translate(const V3d *translation, CombineOp op) { - Matrix tmp; Matrix trans = identMat; trans.pos = *translation; trans.flags &= ~IDENTITY; @@ -436,12 +410,10 @@ Matrix::translate(const V3d *translation, CombineOp op) *this = trans; break; case COMBINEPRECONCAT: - mult(&tmp, &trans, this); - *this = tmp; + mult(this, &trans, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &trans); - *this = tmp; + mult(this, this, &trans); break; } return this; @@ -450,7 +422,6 @@ Matrix::translate(const V3d *translation, CombineOp op) Matrix* Matrix::scale(const V3d *scale, CombineOp op) { - Matrix tmp; Matrix scl = identMat; scl.right.x = scale->x; scl.up.y = scale->y; @@ -461,12 +432,10 @@ Matrix::scale(const V3d *scale, CombineOp op) *this = scl; break; case COMBINEPRECONCAT: - mult(&tmp, &scl, this); - *this = tmp; + mult(this, &scl, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &scl); - *this = tmp; + mult(this, this, &scl); break; } return this; @@ -475,18 +444,15 @@ Matrix::scale(const V3d *scale, CombineOp op) Matrix* Matrix::transform(const Matrix *mat, CombineOp op) { - Matrix tmp; switch(op){ case COMBINEREPLACE: *this = *mat; break; case COMBINEPRECONCAT: - mult(&tmp, mat, this); - *this = tmp; + mult(this, mat, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, mat); - *this = tmp; + mult(this, this, mat); break; } return this; @@ -501,27 +467,31 @@ Matrix::getRotation(void) if(tr > 0.0f){ s = sqrtf(1.0f + tr) * 2.0f; q.w = s / 4.0f; - q.x = (up.z - at.y) / s; - q.y = (at.x - right.z) / s; - q.z = (right.y - up.x) / s; + float invS = dc::Invert(s); + q.x = (up.z - at.y) * invS; + q.y = (at.x - right.z) * invS; + q.z = (right.y - up.x) * invS; }else if(right.x > up.y && right.x > at.z){ s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f; - q.w = (up.z - at.y) / s; - q.x = s / 4.0f; - q.y = (up.x + right.y) / s; - q.z = (at.x + right.z) / s; + q.x = s / 4.0f; + float invS = dc::Invert(s); + q.w = (up.z - at.y) * invS; + q.y = (up.x + right.y) * invS; + q.z = (at.x + right.z) * invS; }else if(up.y > at.z){ s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f; - q.w = (at.x - right.z) / s; - q.x = (up.x + right.y) / s; - q.y = s / 4.0f; - q.z = (at.y + up.z) / s; + q.y = s / 4.0f; + float invS = dc::Invert(s); + q.w = (at.x - right.z) * invS; + q.x = (up.x + right.y) * invS; + q.z = (at.y + up.z) * invS; }else{ s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f; - q.w = (right.y - up.x) / s; - q.x = (at.x + right.z) / s; - q.y = (at.y + up.z) / s; - q.z = s / 4.0f; + q.z = s / 4.0f; + float invS = dc::Invert(s); + q.w = (right.y - up.x) * invS; + q.x = (at.x + right.z) * invS; + q.y = (at.y + up.z) * invS; } return q; } @@ -543,20 +513,7 @@ Matrix::lookAt(const V3d &dir, const V3d &up) void Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2) { - #if !defined(DC_TEXCONV) && !defined(DC_SIM) - dst->right.x = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.x, src2->up.x, src2->at.x, 0); - dst->right.y = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.y, src2->up.y, src2->at.y, 0); - dst->right.z = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.z, src2->up.z, src2->at.z, 0); - dst->up.x = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.x, src2->up.x, src2->at.x, 0); - dst->up.y = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.y, src2->up.y, src2->at.y, 0); - dst->up.z = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.z, src2->up.z, src2->at.z, 0); - dst->at.x = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.x, src2->up.x, src2->at.x, 0); - dst->at.y = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.y, src2->up.y, src2->at.y, 0); - dst->at.z = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.z, src2->up.z, src2->at.z, 0); - dst->pos.x = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.x, src2->up.x, src2->at.x, src2->pos.x); - dst->pos.y = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.y, src2->up.y, src2->at.y, src2->pos.y); - dst->pos.z = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.z, src2->up.z, src2->at.z, src2->pos.z); - #else +#ifndef DC_SH4 dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x; dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y; dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z; @@ -569,12 +526,15 @@ Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const M dst->pos.x = src1->pos.x*src2->right.x + src1->pos.y*src2->up.x + src1->pos.z*src2->at.x + src2->pos.x; dst->pos.y = src1->pos.x*src2->right.y + src1->pos.y*src2->up.y + src1->pos.z*src2->at.y + src2->pos.y; dst->pos.z = src1->pos.x*src2->right.z + src1->pos.y*src2->up.z + src1->pos.z*src2->at.z + src2->pos.z; - #endif +#else + dc::mat_mult(*dst, *src2, *src1); +#endif } void Matrix::invertOrthonormal(Matrix *dst, const Matrix *src) { +#if 1 dst->right.x = src->right.x; dst->right.y = src->up.x; dst->right.z = src->at.x; @@ -593,7 +553,12 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src) dst->pos.z = -(src->pos.x*src->at.x + src->pos.y*src->at.y + src->pos.z*src->at.z); - dst->flags = TYPEORTHONORMAL; +#else + dc::mat_load_transpose(*src); + dc::mat_invert_tranpose(); + dc::mat_store2(*dst); +#endif + dst->flags = TYPEORTHONORMAL; } Matrix* @@ -688,7 +653,11 @@ Matrix::normalError(void) x = dot(right, right) - 1.0f; y = dot(up, up) - 1.0f; z = dot(at, at) - 1.0f; +#ifndef DC_SH4 return x*x + y*y + z*z; +#else + return fipr_magnitude_sqr(x, y, z, 0.0f); +#endif } float32 @@ -698,16 +667,27 @@ Matrix::orthogonalError(void) x = dot(at, up); y = dot(at, right); z = dot(up, right); +#ifndef DC_SH4 return x*x + y*y + z*z; +#else + return fipr_magnitude_sqr(x, y, z, 0.0f); +#endif } float32 Matrix::identityError(void) { - V3d r = { right.x-1.0f, right.y, right.z }; + V3d r = { right.x-1.0f, right.y, right.z }; V3d u = { up.x, up.y-1.0f, up.z }; V3d a = { at.x, at.y, at.z-1.0f }; +#ifndef DC_SH4 return dot(r,r) + dot(u,u) + dot(a,a) + dot(pos,pos); +#else + return fipr_magnitude_sqr(r.x, r.y, r.z, 0.0f) + + fipr_magnitude_sqr(u.x, u.y, u.z, 0.0f) + + fipr_magnitude_sqr(at.x, at.y, at.z, 0.0f) + + fipr_magnitude_sqr(pos.x, pos.y, pos.z, 0.0f); +#endif } void diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index dc532776..0c40af52 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -246,6 +246,83 @@ inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) { ); } +inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) { + asm volatile( + R"( + frchg + + fmov.s @%[mtx]+, fr0 + + add #32, %[mtx] + pref @%[mtx] + add #-(32 - 4), %[mtx] + + fmov.s @%[mtx]+, fr4 + fmov.s @%[mtx]+, fr8 + fldi0 fr12 + add #4, %[mtx] + + fmov.s @%[mtx]+, fr1 + fmov.s @%[mtx]+, fr5 + fmov.s @%[mtx]+, fr9 + fldi0 fr13 + add #4, %[mtx] + + fmov.s @%[mtx]+, fr2 + fmov.s @%[mtx]+, fr6 + fmov.s @%[mtx]+, fr10 + fldi0 fr14 + + fldi0 fr3 + fldi0 fr7 + fmov fr3, fr11 + fldi1 fr15 + + frchg + )" + : [mtx] "+r" (mtx) + : + : + ); +} + +inline __hot __icache_aligned void mat_invert_tranpose() { + asm volatile( + "frchg\n\t" + "fneg fr12\n\t" + "fneg fr13\n\t" + "fneg fr14\n\t" + "fldi0 fr15\n\t" + "fldi0 fr3\n\t" + "fipr fv12, fv0\n\t" + "fldi0 fr7\n\t" + "fipr fv12, fv4\n\t" + "fldi0 fr11\n\t" + "fipr fv12, fv8\n\t" + + "fmov fr3, fr12\n\t" + "fmov fr7, fr13\n\t" + "fmov fr11, fr14\n\t" + "fmov fr1, fr15\n\t" + "fmov fr4, fr1\n\t" + "fmov fr15, fr4\n\t" + "fmov fr2, fr15\n\t" + "fmov fr8, fr2\n\t" + "fmov fr15, fr2\n\t" + "fmov fr6, fr15\n\t" + "fmov fr9, fr6\n\t" + "fmov fr15, fr9\n\t" + + "fldi0 fr3\n\t" + "fldi0 fr7\n\t" + "fldi0 fr11\n\t" + "fldi1 fr15\n\t" + "frchg\n" + : + : + :); +} + inline __hot __icache_aligned void mat_store2(matrix_t *mtx) { asm volatile( R"( @@ -449,103 +526,6 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) :); } -//TODO: FIXME FOR VC (AND USE FTRV) -template -__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - /* - // reorder the coefficients so that q1 stays in constant order {x,y,z,w} - // q2 then needs to be rotated after each inner product - x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); - y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); - z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); - w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); - */ - // keep q1 in fv4 - register float q1x __asm__ ("fr4") = (q1.x); - register float q1y __asm__ ("fr5") = (q1.y); - register float q1z __asm__ ("fr6") = (q1.z); - register float q1w __asm__ ("fr7") = (q1.w); - - // load q2 into fv8, use it to get the shuffled reorder into fv0 - register float q2x __asm__ ("fr8") = (q2.x); - register float q2y __asm__ ("fr9") = (q2.y); - register float q2z __asm__ ("fr10") = (q2.z); - register float q2w __asm__ ("fr11") = (q2.w); - - // temporary operand / result in fv0 - register float t1x __asm__ ("fr0"); - register float t1y __asm__ ("fr1"); - register float t1z __asm__ ("fr2"); - register float t1w __asm__ ("fr3"); - - // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); - t1x = q2w; - t1y = q2z; - t1z = -q2y; - t1w = q2w; - __asm__ ("\n" - " fipr fv4,fv0\n" - : "+f" (t1w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (t1x), "f" (t1y), "f" (t1z) - ); - // x = t1w; try to avoid the stall by not reading the fipr result immediately - - // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); - t1x = -q2z; - t1y = q2w; - t1z = q2x; - __atomic_thread_fence(1); - r->x = t1w; // get previous result - t1w = q2y; - __asm__ ("\n" - " fipr fv4,fv0\n" - : "+f" (t1w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (t1x), "f" (t1y), "f" (t1z) - ); - //y = t1w; - - // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); - t1x = q2y; - t1y = -q2x; - t1z = q2w; - __atomic_thread_fence(1); - r->y = t1w; // get previous result - t1w = q2z; - __asm__ ("\n" - " fipr fv4,fv0\n" - : "+f" (t1w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (t1x), "f" (t1y), "f" (t1z) - ); - //z = t1w; - __atomic_thread_fence(1); - - // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); - q2x = -q2x; - q2y = -q2y; - q2z = -q2z; - __asm__ ("\n" - " fipr fv4,fv8\n" - : "+f" (q2w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (q2x), "f" (q2y), "f" (q2z) - ); - - __atomic_thread_fence(1); - r->z = t1w; - __atomic_thread_fence(1); - r->w = q2w; - } else { - r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); - r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); - r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); - r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); - } -} - __hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { unsigned int prefetch_scratch; @@ -669,6 +649,104 @@ __hot inline void mat_apply_rotate_z(float z) { : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); } + +//TODO: FIXME FOR VC (AND USE FTRV) +template +__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + /* + // reorder the coefficients so that q1 stays in constant order {x,y,z,w} + // q2 then needs to be rotated after each inner product + x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + */ + // keep q1 in fv4 + register float q1x __asm__ ("fr4") = (q1.x); + register float q1y __asm__ ("fr5") = (q1.y); + register float q1z __asm__ ("fr6") = (q1.z); + register float q1w __asm__ ("fr7") = (q1.w); + + // load q2 into fv8, use it to get the shuffled reorder into fv0 + register float q2x __asm__ ("fr8") = (q2.x); + register float q2y __asm__ ("fr9") = (q2.y); + register float q2z __asm__ ("fr10") = (q2.z); + register float q2w __asm__ ("fr11") = (q2.w); + + // temporary operand / result in fv0 + register float t1x __asm__ ("fr0"); + register float t1y __asm__ ("fr1"); + register float t1z __asm__ ("fr2"); + register float t1w __asm__ ("fr3"); + + // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + t1x = q2w; + t1y = q2z; + t1z = -q2y; + t1w = q2w; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + // x = t1w; try to avoid the stall by not reading the fipr result immediately + + // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + t1x = -q2z; + t1y = q2w; + t1z = q2x; + __atomic_thread_fence(1); + r->x = t1w; // get previous result + t1w = q2y; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //y = t1w; + + // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + t1x = q2y; + t1y = -q2x; + t1z = q2w; + __atomic_thread_fence(1); + r->y = t1w; // get previous result + t1w = q2z; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //z = t1w; + __atomic_thread_fence(1); + + // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + q2x = -q2x; + q2y = -q2y; + q2z = -q2z; + __asm__ ("\n" + " fipr fv4,fv8\n" + : "+f" (q2w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (q2x), "f" (q2y), "f" (q2z) + ); + + __atomic_thread_fence(1); + r->z = t1w; + __atomic_thread_fence(1); + r->w = q2w; + } else { + r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); + } +} + # else # ifdef DC_TEXCONV # define mat_apply(a) diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 4965f2fd..1004e0e7 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -238,8 +238,8 @@ inline V2d neg(const V2d &a) { return makeV2d(-a.x, -a.y); } inline V2d add(const V2d &a, const V2d &b) { return makeV2d(a.x+b.x, a.y+b.y); } inline V2d sub(const V2d &a, const V2d &b) { return makeV2d(a.x-b.x, a.y-b.y); } inline V2d scale(const V2d &a, float32 r) { return makeV2d(a.x*r, a.y*r); } -inline float32 length(const V2d &v) { return sqrtf(v.x*v.x + v.y*v.y); } -inline V2d normalize(const V2d &v) { return scale(v, 1.0f/length(v)); } +inline float32 length(const V2d &v) { return dc::Sqrt(v.x*v.x + v.y*v.y); } +inline V2d normalize(const V2d &v) { return scale(v, dc::RecipSqrt(v.x*v.x + v.y*v.y)); } struct V3d { @@ -265,10 +265,22 @@ inline float32 length(const V3d &v) { return len; #endif } -inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); } -inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); } -V3d cross(const V3d &a, const V3d &b); -inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) { +inline V3d normalize(const V3d &v) { + float invLen; +#ifndef DC_SH4 + invLen = 1.0f / length(v); +#else + invLen = dc::RecipSqrt(fipr_magnitude_sqr(v.x, v.y, v.z, 0.0f)); +#endif + return scale(v, invLen); +} +inline V3d setlength(const V3d &v, float32 l) { return scale(v, dc::Div(l, length(v))); } +inline V3d cross(const V3d &a, const V3d &b) { + return makeV3d(a.y*b.z - a.z*b.y, + a.z*b.x - a.x*b.z, + a.x*b.y - a.y*b.x); +} +inline float32 dot(const V3d &a, const V3d &b) { #ifdef DC_SH4 return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f); #else @@ -329,12 +341,33 @@ inline float32 length(const Quat &q) { #ifndef DC_SH4 return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); #else - return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w)); + return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, 0.0f)); #endif } -inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); } +inline Quat normalize(const Quat &q) { + float invLen; +#ifndef DC_SH4 + invLen = 1.0f / length(q); +#else + invLen = dc::RecipSqrt(fipr_magnitude_sqr(q.x, q.y, q.z, 0.0f)); +#endif + return scale(q, invLen); +} inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); } -Quat mult(const Quat &q, const Quat &p); +inline Quat mult(const Quat &q, const Quat &p) { +#ifndef DC_SH4 + return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, + q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, + q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, + q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); +#else + Quat o; + dc::quat_mult(reinterpret_cast(&o), + reinterpret_cast(q), + reinterpret_cast(p)); + return o; +#endif +} inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); } Quat lerp(const Quat &q, const Quat &p, float32 r); Quat slerp(const Quat &q, const Quat &p, float32 a); From 92f8e04aa7f848a0d1fa69bb63d760b0c7d9c356 Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Tue, 29 Apr 2025 17:05:15 -0500 Subject: [PATCH 5/6] Saving state, liberty + RW T&L optimizations. --- src/liberty/math/Matrix.cpp | 23 +++++ src/liberty/math/Vector.h | 8 +- src/miami/math/Vector.h | 14 +++ vendor/librw/src/base.cpp | 19 ++++ vendor/librw/src/dc/rwdc.cpp | 164 ++++++++++++++---------------- vendor/librw/src/dc/rwdc_common.h | 92 ++++++++++++++++- 6 files changed, 228 insertions(+), 92 deletions(-) diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index a4b6c73f..a282a211 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -54,20 +54,28 @@ CMatrix::Detach(void) void CMatrix::Update(void) { +#if 1 GetRight() = m_attachment->right; GetForward() = m_attachment->up; GetUp() = m_attachment->at; GetPosition() = m_attachment->pos; +#else + mat_copy(*this, *m_attachment); +#endif } void CMatrix::UpdateRW(void) { if (m_attachment) { +#if 1 m_attachment->right = GetRight(); m_attachment->up = GetForward(); m_attachment->at = GetUp(); m_attachment->pos = GetPosition(); +#else + mat_copy(*m_attachment, *this); +#endif RwMatrixUpdate(m_attachment); } } @@ -99,6 +107,7 @@ CMatrix::operator+=(CMatrix const &rhs) void CMatrix::SetUnity(void) { +#ifndef DC_SH4 rx = 1.0f; ry = 0.0f; rz = 0.0f; @@ -111,6 +120,10 @@ CMatrix::SetUnity(void) px = 0.0f; py = 0.0f; pz = 0.0f; +#else + dc::mat_identity2(); + dc::mat_store2(*this); +#endif } void @@ -130,6 +143,7 @@ CMatrix::ResetOrientation(void) void CMatrix::SetScale(float s) { +#ifndef DC_SH4 rx = s; ry = 0.0f; rz = 0.0f; @@ -145,11 +159,16 @@ CMatrix::SetScale(float s) px = 0.0f; py = 0.0f; pz = 0.0f; +#else + mat_set_scale(s); + mat_store2(*this); +#endif } void CMatrix::SetTranslate(float x, float y, float z) { +#if 1 rx = 1.0f; ry = 0.0f; rz = 0.0f; @@ -165,6 +184,10 @@ CMatrix::SetTranslate(float x, float y, float z) px = x; py = y; pz = z; +#else + mat_set_translation(x, y, z); + mat_store2(*this); +#endif } void diff --git a/src/liberty/math/Vector.h b/src/liberty/math/Vector.h index 1501512d..63d6fee9 100644 --- a/src/liberty/math/Vector.h +++ b/src/liberty/math/Vector.h @@ -1,5 +1,7 @@ #pragma once +#include "maths.h" + class CVector : public RwV3d { public: @@ -30,7 +32,7 @@ public: } __always_inline float MagnitudeSqr(void) const { #ifdef DC_SH4 - return fipr_magnitude_sqr(x, y,z, 0.0f); + return fipr_magnitude_sqr(x, y, z, 0.0f); #else return x*x + y*y + z*z; #endif @@ -68,7 +70,7 @@ public: } const CVector &operator/=(float right) { - right = Invert(right); + right = Invert(right); x *= right; y *= right; z *= right; @@ -112,7 +114,7 @@ inline CVector operator*(float left, const CVector &right) inline CVector operator/(const CVector &left, float right) { - right = Invert(right); + right = Invert(right); return CVector(left.x * right, left.y * right, left.z * right); } diff --git a/src/miami/math/Vector.h b/src/miami/math/Vector.h index 04287ece..2e8aace1 100644 --- a/src/miami/math/Vector.h +++ b/src/miami/math/Vector.h @@ -1,5 +1,7 @@ #pragma once +#include "maths.h" + class CVector : public RwV3d { public: @@ -68,9 +70,16 @@ __always_inline float Heading(void) const { return Atan2(-x, y); } } const CVector &operator/=(float right) { +#ifndef DC_SH4 x /= right; y /= right; z /= right; +#else + right = dc::Invert(right); + x *= right; + y *= right; + z *= right; +#endif return *this; } @@ -111,7 +120,12 @@ inline CVector operator*(float left, const CVector &right) inline CVector operator/(const CVector &left, float right) { +#ifndef DC_SH4 return CVector(left.x / right, left.y / right, left.z / right); +#else + right = dc::Invert(right); + return CVector(left.x * right, left.y * right, left.z * right); +#endif } inline float diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index 2e3a22cc..ff78eddb 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -402,9 +402,16 @@ Matrix::rotate(const Quat &q, CombineOp op) Matrix* Matrix::translate(const V3d *translation, CombineOp op) { +#if 1 Matrix trans = identMat; trans.pos = *translation; trans.flags &= ~IDENTITY; +#else + Matrix trans; + dc::mat_set_translation(translation->x, translation->y, translation->z); + dc::mat_store2(trans); + trans.flags = Matrix::TYPEORTHONORMAL; +#endif switch(op){ case COMBINEREPLACE: *this = trans; @@ -422,11 +429,18 @@ Matrix::translate(const V3d *translation, CombineOp op) Matrix* Matrix::scale(const V3d *scale, CombineOp op) { +#ifndef DC_SH4 Matrix scl = identMat; scl.right.x = scale->x; scl.up.y = scale->y; scl.at.z = scale->z; scl.flags &= ~IDENTITY; +#else + Matrix scl; + dc::mat_set_scale(scale->x, scale->y, scale->z); + dc::mat_store2(scl); + scl.flags = Matrix::TYPEORTHONORMAL; +#endif switch(op){ case COMBINEREPLACE: *this = scl; @@ -594,8 +608,13 @@ void Matrix::makeRotation(Matrix *dst, const V3d *axis, float32 angle) { // V3d v = normalize(*axis); +#ifndef DC_SH4 float32 len = dot(*axis, *axis); if(len != 0.0f) len = 1.0f/sqrtf(len); +#else + float len = fipr_magnitude_sqr(axis->x, axis->y, axis->z, 0.0f); + if(len != 0.0f) len = dc::RecipSqrt(len); +#endif V3d v = rw::scale(*axis, len); angle = angle*(float)M_PI/180.0f; float32 s = sinf(angle); diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index a5ddacbb..aa8a7c14 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -563,7 +563,7 @@ Camera* rwdcCam; void beginUpdate(Camera* cam) { rwdcCam = cam; - float view[16], proj[16]; + alignas(8) float view[16], proj[16]; // View Matrix Matrix inv; @@ -586,7 +586,7 @@ void beginUpdate(Camera* cam) { view[13] = -inv.pos.y; view[14] = inv.pos.z; view[15] = 1.0f; - memcpy4(&cam->devView, view, sizeof(RawMatrix)); + mat_copy(cam->devView, reinterpret_cast(view)); // d3ddevice->SetTransform(D3DTS_VIEW, (D3DMATRIX*)view); // Projection Matrix @@ -620,8 +620,8 @@ void beginUpdate(Camera* cam) { proj[15] = 1.0f; } proj[14] = -cam->nearPlane*proj[10]; - memcpy4(&cam->devProj, proj, sizeof(RawMatrix)); - + mat_copy(cam->devProj, reinterpret_cast(proj)); + DCE_MatrixViewport(0, 0, cam->frameBuffer->width * VIDEO_MODE_SCALE_X, cam->frameBuffer->height); mat_load_apply((matrix_t*)&DCE_MAT_SCREENVIEW, (matrix_t*)&cam->devProj); @@ -945,8 +945,6 @@ void dcMotionBlur_v1(uint8_t a, uint8_t r, uint8_t g, uint8_t b) { auto addr2 = (pvr_ptr_t)&emu_vram[addr64b + 640 * 2]; #endif - - PVR_SET(PVR_TEXTURE_MODULO, 640/32); auto doquad = [=](float x, float y, float w, float h, float tx, float ty, float tw, float th) { @@ -1666,13 +1664,13 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert switch(primType) { case PRIMTYPETRILIST: pvrHeaderSubmit(); - dcache_pref_block(vtx); + __builtin_prefetch(vtx); for(int i = 0; i < numVertices; i += 3) [[likely]] { - dcache_pref_block(&vtx[i + 1]); + __builtin_prefetch(&vtx[i + 1]); pvrVertexSubmit(vtx[i + 0], PVR_CMD_VERTEX); - dcache_pref_block(&vtx[i + 2]); + __builtin_prefetch(&vtx[i + 2]); pvrVertexSubmit(vtx[i + 1], PVR_CMD_VERTEX); - dcache_pref_block(&vtx[i + 3]); + __builtin_prefetch(&vtx[i + 3]); pvrVertexSubmit(vtx[i + 2], PVR_CMD_VERTEX_EOL); } break; @@ -1680,14 +1678,11 @@ void im2DRenderPrimitive(PrimitiveType primType, void *vertices, int32_t numVert pvrHeaderSubmit(); const auto *vtxA = vtx + 0; const auto *vtxB = vtx + 1; - dcache_pref_block(vtxA); + __builtin_prefetch(vtxA); for(int i = 2; i < numVertices; ++i) [[likely]] { const auto *vtxC = vtx + i; - dcache_pref_block(vtxB); pvrVertexSubmit(*vtxA, PVR_CMD_VERTEX); - dcache_pref_block(vtxC); pvrVertexSubmit(*vtxB, PVR_CMD_VERTEX); - dcache_pref_block(&vtx[i]); pvrVertexSubmit(*vtxC, PVR_CMD_VERTEX_EOL); vtxB = vtxC; } @@ -1787,13 +1782,13 @@ void im2DRenderIndexedPrimitive(PrimitiveType primType, void *vertices, int32 nu switch(primType) { case PRIMTYPETRILIST: pvrHeaderSubmit(); - dcache_pref_block(vtx); + __builtin_prefetch(vtx); for(int i = 0; i < numIndices; i += 3) [[likely]] { - dcache_pref_block(&vtx[idx[i + 1]]); + __builtin_prefetch(&vtx[idx[i + 1]]); pvrVertexSubmit(vtx[idx[i + 0]], PVR_CMD_VERTEX); - dcache_pref_block(&vtx[idx[i + 2]]); + __builtin_prefetch(&vtx[idx[i + 2]]); pvrVertexSubmit(vtx[idx[i + 1]], PVR_CMD_VERTEX); - dcache_pref_block(&vtx[idx[i + 3]]); + __builtin_prefetch(&vtx[idx[i + 3]]); pvrVertexSubmit(vtx[idx[i + 2]], PVR_CMD_VERTEX_EOL); } break; @@ -1830,15 +1825,21 @@ void im3DTransform(void *vertices, int32 numVertices, Matrix *worldMat, uint32 f worldMat = &ident; } - rw::RawMatrix mtx, proj, world, worldview; rw::Camera *cam = engine->currentCamera; - - rw::convMatrix(&world, worldMat); +#ifndef DC_SH4 + rw::RawMatrix mtx, proj, world, worldview; + rw::convMatrix(&world, worldMat); rw::RawMatrix::mult(&worldview, &world, &cam->devView); rw::RawMatrix::mult(&proj, &worldview, &cam->devProj); rw::RawMatrix::mult(&mtx, &proj, (RawMatrix*)&DCE_MAT_SCREENVIEW); // mat_load2(&DCE_MAT_SCREENVIEW); // ~11 cycles. mat_load2(( matrix_t*)&mtx.right); // Number of cycles: ~32. +#else + mat_load_apply(&DCE_MAT_SCREENVIEW, cam->devProj); + mat_apply(cam->devView); + mat_apply(*worldMat); +#endif + if (im3dVertices) { free(im3dVertices); } @@ -2136,39 +2137,36 @@ static_assert(sizeof(MeshletInfo) == 40); // or 32 if !skin inline __attribute__((always_inline)) void setLights(Atomic *atomic, WorldLights *lightData, UniformObject &uniformObject) { - int n = 0; + int i = 0; uniformObject.ambLight = lightData->ambient; if (lightData->numDirectionals) { Matrix mat; Matrix matsrc = *atomic->getFrame()->getLTM(); - matsrc.pos = V3d {0,0,0}; + matsrc.pos = V3d {0.0f, 0.0f, 0.0f}; + matsrc.posw = 0.0f; Matrix::invert(&mat, &matsrc); - - n = 0; - for(int i = 0; i < lightData->numDirectionals && i < MAX_LIGHTS; i++){ + mat_load2(mat); + + for(; i < lightData->numDirectionals && i < MAX_LIGHTS; i++){ Light *l = lightData->directionals[i]; - uniformObject.col[n] = scale(l->color, 255); + uniformObject.col[i] = scale(l->color, 255); V3d at = l->getFrame()->getLTM()->at; V3d dir; - V3d::transformVectors(&dir, &at, 1, &mat); + mat_trans_normal3_nomod(at.x, at.y, at.z, + dir.x, dir.y, dir.z); - uniformObject.dir[n>>2][0][n&3] = -dir.x / 127.0f; - uniformObject.dir[n>>2][1][n&3] = -dir.y / 127.0f; - uniformObject.dir[n>>2][2][n&3] = -dir.z / 127.0f; - uniformObject.dir[n>>2][3][n&3] = 0; - - n++; - if(n >= MAX_LIGHTS) - goto out; + uniformObject.dir[i>>2][0][i&3] = -dir.x / 127.0f; + uniformObject.dir[i>>2][1][i&3] = -dir.y / 127.0f; + uniformObject.dir[i>>2][2][i&3] = -dir.z / 127.0f; + uniformObject.dir[i>>2][3][i&3] = 0; } } -out: - uniformObject.lightCount = n; + uniformObject.lightCount = i; } @@ -2835,7 +2833,7 @@ void* interpolateAndSubmit(void* dst, const void* src1, const void* src2, uint32 float y = v1->o_r + t * (v2->o_r - v1->o_r); float w = v1->o_g + t * (v2->o_g - v1->o_g); - w = frsqrt(w * w); + w = Invert(w); v->x = x * w; v->y = y * w; @@ -3209,15 +3207,14 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto skinningIndexData = (int16_t*)skinIndexes; auto skinningWeightData = (uint8_t*)skinWeights; - if (!matrix0Identity) { - mat_load_4x4(&skinMatrices[0]); - if (small_xyz) { - mat_apply(&DCE_MESHLET_MAT_DECODE); - } - } else { - if (small_xyz) { - mat_load2(&DCE_MESHLET_MAT_DECODE); - } + if constexpr (!matrix0Identity) { + if (!small_xyz) + mat_load2(skinMatrices[0]); + else + mat_load_apply(skinMatrices[0], &DCE_MESHLET_MAT_DECODE); + + } else if constexpr (small_xyz) { + mat_load2(&DCE_MESHLET_MAT_DECODE); } for(;;) { @@ -3227,7 +3224,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve int count = *skinningIndexData++; uint8_t* dstVertexBytes = dest + *skinningIndexData++; - if (matrix0Identity && !small_xyz) { + if constexpr (matrix0Identity && !small_xyz) { do { const V3d* srcVtx = (const V3d*)(srcVtxBytes); srcVtxBytes += vertexSize; @@ -3246,11 +3243,8 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve srcVtxBytes += vertexSize; V3d* dstVertex = (V3d*)(dstVertexBytes); dstVertexBytes += 64; - float x, y, z, w; - mat_trans_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, x, y, z, w); - dstVertex->x = x; - dstVertex->y = y; - dstVertex->z = z; + mat_trans_single3_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, + dstVertex->x, dstVertex->y, dstVertex->z); } while(--count != 0); } } else if (!(flags & 0x80)) { @@ -3280,10 +3274,10 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve break; } - mat_load_4x4(currentMatrix); - if (small_xyz){ - mat_apply(&DCE_MESHLET_MAT_DECODE); - } + if constexpr(!small_xyz) + mat_load2(*currentMatrix); + else + mat_load_apply(*currentMatrix, &DCE_MESHLET_MAT_DECODE); do { auto srcOffset = *skinningIndexData++; @@ -3297,9 +3291,9 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve srcVtx = &tmpSrc; } auto dstVtx = (V3d*)(dest + dstOffset); - float x, y, z, w; - mat_trans_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, x, y, z, w); - V3d tmp = { x, y, z }; + V3d tmp; + mat_trans_single3_nodiv_nomod(srcVtx->x, srcVtx->y, srcVtx->z, + tmp.x, tmp.y, tmp.z); tmp = scale(tmp, *skinningWeightData++ / 255.0f); *dstVtx = add(*dstVtx, tmp); } while (--count != 0); @@ -3312,8 +3306,8 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto skinningIndexData = (int16_t*)skinIndexes; auto skinningWeightData = (uint8_t*)skinWeights; - if (!matrix0Identity) { - mat_load_3x3(&skinMatrices[0]); + if constexpr(!matrix0Identity) { + mat_load2(skinMatrices[0]); } for(;;) { @@ -3323,7 +3317,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve int count = *skinningIndexData++; uint8_t* dstNormalBytes = destNormal + *skinningIndexData++; - if (matrix0Identity) { + if constexpr (matrix0Identity) { do { V3d srcNormal = { static_cast(srcNormalBytes[0]), static_cast(srcNormalBytes[1]), static_cast(srcNormalBytes[2]) }; @@ -3339,9 +3333,9 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve srcNormalBytes += vertexSize; V3d* dstNormal = (V3d*)(dstNormalBytes); dstNormalBytes += 64; - float x, y, z, w; - mat_trans_nodiv_nomod_zerow(srcNormal.x, srcNormal.y, srcNormal.z, x, y, z, w); - *dstNormal = { x, y, z }; + float x, y, z; + mat_trans_normal3_nomod(srcNormal.x, srcNormal.y, srcNormal.z, + dstNormal->x, dstNormal->y, dstNormal->z); } while(--count != 0); } } else if (!(flags & 0x80)) { @@ -3371,7 +3365,7 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve break; } - mat_load_3x3(currentMatrix); + mat_load2(*currentMatrix); do { auto srcOffset = *skinningIndexData++; @@ -3383,8 +3377,8 @@ void tnlMeshletSkinVertices(uint8_t *OCR, uint8_t *OCR_normal, const uint8_t* ve auto dstNormal = (V3d*)(destNormal + dstOffset); V3d tmp; - float w; - mat_trans_nodiv_nomod_zerow(srcNormal.x, srcNormal.y, srcNormal.z, tmp.x, tmp.y, tmp.z, w); + mat_trans_normal3_nomod(srcNormal.x, srcNormal.y, srcNormal.z, + tmp.x, tmp.y, tmp.z); tmp = scale(tmp, *skinningWeightData++ / 255.0f); *dstNormal = add(*dstNormal, tmp); } while (--count != 0); @@ -3498,22 +3492,20 @@ uploadSkinMatrices(Atomic *a, Matrix* skinMatrices) if(hier){ Matrix *invMats = (Matrix*)skin->inverseMatrices; - Matrix tmp; assert(skin->numBones == hier->numNodes); if(hier->flags & HAnimHierarchy::LOCALSPACEMATRICES){ for(i = 0; i < hier->numNodes; i++){ - invMats[i].flags = 0; - Matrix::mult(m, &invMats[i], &hier->matrices[i]); + mat_mult(*m, invMats[i], hier->matrices[i]); m++; } }else{ Matrix invAtmMat; Matrix::invert(&invAtmMat, a->getFrame()->getLTM()); for(i = 0; i < hier->numNodes; i++){ - invMats[i].flags = 0; - Matrix::mult(&tmp, &hier->matrices[i], &invAtmMat); - Matrix::mult(m, &invMats[i], &tmp); + mat_load_apply(invAtmMat, hier->matrices[i]); + mat_apply(invMats[i]); + mat_store2(*m); m++; } } @@ -3547,13 +3539,13 @@ uploadEnvMatrix(Frame *frame, RawMatrix *world, matrix_t* envMatrix) RawMatrix *envMtx = (RawMatrix*)envMatrix; { - RawMatrix invMtx; + //RawMatrix invMtx; Matrix::invert(&invMat, frame->getLTM()); - convMatrix(&invMtx, &invMat); - invMtx.pos.set(0.0f, 0.0f, 0.0f); + //convMatrix(&invMtx, &invMat); + //invMtx.pos.set(0.0f, 0.0f, 0.0f); float uscale = fabs(normal2texcoord.right.x); normal2texcoord.right.x = MatFX::envMapFlipU ? -uscale : uscale; - +#if 0 RawMatrix tmpMtx; RawMatrix::mult(&tmpMtx, &invMtx, &normal2texcoord); @@ -3563,6 +3555,11 @@ uploadEnvMatrix(Frame *frame, RawMatrix *world, matrix_t* envMatrix) world->upw = 0; world->atw = 0; RawMatrix::mult(envMtx, world, &tmpMtx); +#else + mat_load_apply(normal2texcoord, invMat); + mat_apply(*world); + mat_store2(envMatrix); +#endif } } @@ -3812,12 +3809,9 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { ac->skinMatrix0Identity = skinMatrix0Identity; lightingCB(atomic, ac->uniform); - - rw::RawMatrix world; - rw::convMatrix(&world, atomic->getFrame()->getLTM()); mat_load_apply((matrix_t*)&cam->devProjScreen, (matrix_t*)&cam->devView); - mat_apply((matrix_t*)&world); + mat_apply(*atomic->getFrame()->getLTM()); mat_store2((matrix_t*)&atomicContexts.back().mtx); auto meshes = geo->meshHeader->getMeshes(); @@ -3841,7 +3835,7 @@ void defaultRenderCB(ObjPipeline *pipe, Atomic *atomic) { matfxContextPointer = &matfxContexts.back(); // N.B. world here gets converted to a 3x3 matrix // this is fine, as we only use it for env mapping from now on - uploadEnvMatrix(matfx->fx[0].env.frame, &world, &matfxContexts.back().mtx); + uploadEnvMatrix(matfx->fx[0].env.frame, reinterpret_cast(atomic->getFrame()->getLTM()), &matfxContexts.back().mtx); matfxContextPointer->coefficient = matfxCoefficient; pvr_poly_cxt_t cxt; diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index 0c40af52..fbd7b9ea 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -384,17 +384,71 @@ inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) { fmov dr2, dr12 fldi0 fr14 fschg + fmov.s @%[x], fr0 + fmov.s @%[y], fr5 + fmov.s @%[z], fr10 + fldi1 fr15 frchg - fmov %[x], xf0 - fmov %[y], xf5 - fmov %[z], xf10 )" : - : [x] "r" (x), [y] "r" (y), [z] "r" (z) + : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z) : ); } +inline __hot __icache_aligned void mat_set_scale(float s) { + asm volatile( + R"( + frchg + fldi0 fr1 + fschg + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fmov dr2, dr6 + fmov dr2, dr8 + fldi0 fr11 + fmov dr2, dr12 + fldi0 fr14 + fschg + fmov.s @%[s], fr0 + fmov fr0, fr5 + fmov fr0, fr10 + fldi1 fr15 + frchg + )" + : + : [s] "r" (&s) + : + ); +} + +inline __hot __icache_aligned void mat_set_translation(float x, float y, float z) { + asm volatile( + R"( + frchg + fldi1 fr0 + fschg + fldi0 fr1 + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fldi1 fr5 + fmov dr2,dr6 + fmov dr2,dr8 + fmov dr0,dr10 + fschg + fmov.s @%[x], fr12 + fmov.s @%[y], fr13 + fmov.s @%[z], fr14 + fldi1 fr15 + frchg + )" + : + : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z) + ); +} + // no declspec naked, so can't do rts / fschg. instead compiler pads with nop? inline __hot void mat_load_3x3(const rw::Matrix* mtx) { __asm__ __volatile__ ( @@ -747,6 +801,30 @@ __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, c } } +template +__always_inline constexpr float fipr2D(float x1, float y1, float x2, float y2) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + register float v1x asm("fr0") = x1; + register float v1y asm("fr1") = y1; + register float res asm("fr3"); + + register float v2x asm("fr4") = x2; + register float v2y asm("fr5") = y2; + + asm volatile(R"( + fldi0 fr2 + fldi0 fr3 + fipr fv4, fv0 + )" + : "=f" (res) + : "f" (v1x), "f" (v1y), "f" (v2x), "f" (v2y)); + + return res; + } else { + return (x1 * x2 + y1 * y2); + } +} + # else # ifdef DC_TEXCONV # define mat_apply(a) @@ -778,6 +856,12 @@ __hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, c y2 = tmp.y; \ } while(false) +#define mat_trans_normal3_nomod(x_, y_, z_, x2, y2, z2) do { \ + vector_t tmp = { x_, y_, z_, 0.0f }; \ + mat_transform(&tmp, &tmp, 1, 0); \ + x2 = tmp.x; y2 = tmp.y; z2 = tmp.z; \ + } while(false) + #define mat_trans_nodiv_nomod(x_, y_, z_, x2, y2, z2, w2) do { \ vector_t tmp1233123 = { x_, y_, z_, 1.0f }; \ mat_transform(&tmp1233123, &tmp1233123, 1, 0); \ From f0b38ff86ff93b924d9ab1b829f66303724b78da Mon Sep 17 00:00:00 2001 From: Falco Girgis Date: Sat, 3 May 2025 14:24:46 -0500 Subject: [PATCH 6/6] Saving state. - Good place for gainz in liberty - Cutting off PR - known issue: boats' rotations while driving are incorrect. --- liberty/Makefile | 24 +- src/liberty/animation/AnimBlendNode.cpp | 6 +- src/liberty/core/Cam.cpp | 18 +- src/liberty/core/Camera.cpp | 2 +- src/liberty/math/Matrix.cpp | 66 +- src/liberty/renderer/PointLights.cpp | 2 +- src/liberty/renderer/Renderer.cpp | 3 +- src/liberty/renderer/Renderer.h | 3 +- vendor/librw/src/base.cpp | 4 +- vendor/librw/src/camera.cpp | 56 +- vendor/librw/src/dc/rwdc.cpp | 3 +- vendor/librw/src/dc/rwdc_common.h | 1064 ++++++++++++++--------- vendor/librw/src/rwbase.h | 17 +- 13 files changed, 741 insertions(+), 527 deletions(-) diff --git a/liberty/Makefile b/liberty/Makefile index 3be60a74..82c42c51 100644 --- a/liberty/Makefile +++ b/liberty/Makefile @@ -104,6 +104,8 @@ OBJS_TEXCONV += \ OBJS_O3 = \ ../vendor/librw/src/dc/rwdc.o \ ../src/liberty/core/World.o \ + ../src/liberty/core/Zones.o \ + ../src/liberty/core/ZoneCull.o \ ../src/liberty/collision/Collision.o \ ../src/liberty/math/math.o \ ../src/liberty/math/Matrix.o \ @@ -111,11 +113,22 @@ OBJS_O3 = \ ../src/liberty/math/Rect.o \ ../src/liberty/math/Vector.o \ ../vendor/librw/src/base.o \ - ../src/liberty/renderer/Shadows.o - -OBJS_NO_FAST_MATH = \ + ../src/liberty/renderer/Shadows.o \ + ../src/liberty/renderer/Renderer.o \ + ../src/liberty/animation/FrameUpdate.o \ + ../src/liberty/animation/RpAnimblend.o \ + ../src/liberty/control/PathFind.o \ ../src/liberty/core/Cam.o \ - ../src/liberty/core/Camera.o + ../src/liberty/peds/Ped.o \ + ../src/liberty/peds/PedAI.o \ + ../src/liberty/vehicles/Automobile.o + +# ICE list with with -O3 +OBJS_O2 = \ + ../src/liberty/animation/AnimBlendNode.o + +# ICE list with -ffast-math +OBJS_NO_FAST_MATH = KOS_CPPFLAGS += -fbuiltin -ffast-math -ffp-contract=fast \ -mfsrra -mfsca @@ -169,6 +182,9 @@ clean: $(OBJS_O3): %.o: %.cpp kos-c++ $(CXXFLAGS) $(CPPFLAGS) -O3 -c $< -o $@ +$(OBJS_O2): %.o: %.cpp + kos-c++ $(CXXFLAGS) $(CPPFLAGS) -O2 -c $< -o $@ + $(OBJS_NO_FAST_MATH): %.o: %.cpp kos-c++ $(CXXFLAGS) $(CPPFLAGS) -O3 -c $< -o $@ -fno-fast-math diff --git a/src/liberty/animation/AnimBlendNode.cpp b/src/liberty/animation/AnimBlendNode.cpp index c54a83a6..7223b2b2 100644 --- a/src/liberty/animation/AnimBlendNode.cpp +++ b/src/liberty/animation/AnimBlendNode.cpp @@ -39,7 +39,7 @@ CAnimBlendNode::Update(CVector &trans, CQuaternion &rot, float weight) float blend = association->GetBlendAmount(weight); if(blend > 0.0f){ float kfAdt = player->GetNextTimeDelta(); - float t = kfAdt == 0.0f ? 0.0f : (kfAdt - remainingTime)/kfAdt; + float t = kfAdt == 0.0f ? 0.0f : dc::Div(kfAdt - remainingTime, kfAdt); if(player->type & CAnimBlendSequence::KF_TRANS){ auto kfdAt = player->GetNextTranslationDelta(); auto kfBt = player->GetPrevTranslation(); @@ -153,7 +153,7 @@ CAnimBlendNode::CalcDeltas(void) if(cos > 1.0f) cos = 1.0f; theta = Acos(cos); - invSin = theta == 0.0f ? 0.0f : 1.0f/Sin(theta); + invSin = theta == 0.0f ? 0.0f : dc::Invert(Sin(theta)); } void @@ -164,7 +164,7 @@ CAnimBlendNode::GetCurrentTranslation(CVector &trans, float weight) float blend = association->GetBlendAmount(weight); if(blend > 0.0f){ auto kfAdt = player->GetNextTimeDelta(); - float t = kfAdt == 0.0f ? 0.0f : (kfAdt - remainingTime)/kfAdt; + float t = kfAdt == 0.0f ? 0.0f : dc::Div(kfAdt - remainingTime, kfAdt); if(player->type & CAnimBlendSequence::KF_TRANS){ auto kfdAt = player->GetNextTranslationDelta(); auto kfBt = player->GetPrevTranslation(); diff --git a/src/liberty/core/Cam.cpp b/src/liberty/core/Cam.cpp index ff35b984..ab2dedc0 100644 --- a/src/liberty/core/Cam.cpp +++ b/src/liberty/core/Cam.cpp @@ -3267,24 +3267,24 @@ CCam::Process_BehindBoat(const CVector &CameraTarget, float TargetOrientation, f if(m_bCollisionChecksOn || ResetStatics){ CVector TestPoint; // Weird calculations here, also casting bool to float... - c = Cos(TargetOrientation); - s = Sin(TargetOrientation); + c = Cos(TargetOrientation); + s = Sin(TargetOrientation); TestPoint = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) + (TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) + TargetCoors; TestPoint.z = WaterLevel + TheCamera.CarZoomValueSmooth; float Test1 = CWorld::GetIsLineOfSightClear(TestPoint, TargetCoors, true, false, false, true, false, true, true); - c = Cos(TargetOrientation + 0.8f); - s = Sin(TargetOrientation + DEGTORAD(40.0f)); + c = Cos(TargetOrientation + 0.8f); + s = Sin(TargetOrientation + DEGTORAD(40.0f)); TestPoint = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) + (TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) + TargetCoors; TestPoint.z = WaterLevel + TheCamera.CarZoomValueSmooth; float Test2 = CWorld::GetIsLineOfSightClear(TestPoint, TargetCoors, true, false, false, true, false, true, true); - c = Cos(TargetOrientation - 0.8); - s = Sin(TargetOrientation - DEGTORAD(40.0f)); + c = Cos(TargetOrientation - 0.8); + s = Sin(TargetOrientation - DEGTORAD(40.0f)); TestPoint = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) + (TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) + TargetCoors; @@ -3307,8 +3307,7 @@ CCam::Process_BehindBoat(const CVector &CameraTarget, float TargetOrientation, f DeltaBeta = TargetOrientation - Beta; } - c = Cos(Beta); - s = Sin(Beta); + auto [s, c] = SinCos(Beta); TestPoint.x = TheCamera.CarZoomValueSmooth * -c + (TheCamera.CarZoomValueSmooth + 7.0f) * -c + TargetCoors.x; @@ -3333,8 +3332,7 @@ CCam::Process_BehindBoat(const CVector &CameraTarget, float TargetOrientation, f // inlined WellBufferMe(TargetWhenChecksWereOn, &Beta, &BetaSpeed, 0.07f, 0.015f, true); - s = Sin(Beta); - c = Cos(Beta); + auto [s, c] = SinCos(Beta); Source = TheCamera.CarZoomValueSmooth * CVector(-c, -s, 0.0f) + (TheCamera.CarZoomValueSmooth+7.0f) * CVector(-c, -s, 0.0f) + TargetCoors; diff --git a/src/liberty/core/Camera.cpp b/src/liberty/core/Camera.cpp index bad96834..a2a52abf 100644 --- a/src/liberty/core/Camera.cpp +++ b/src/liberty/core/Camera.cpp @@ -3694,7 +3694,7 @@ CCamera::IsBoxVisible(CVUVECTOR *box, const CMatrix *mat) #ifdef GTA_PS2 TransformPoints(box, 8, *mat, box); #else - #ifdef FIX_BUGS + #if defined(FIX_BUGS) && !defined(DC_SH4) for (i = 0; i < 8; i++) box[i] = *mat * box[i]; #else diff --git a/src/liberty/math/Matrix.cpp b/src/liberty/math/Matrix.cpp index a282a211..9f61c35c 100644 --- a/src/liberty/math/Matrix.cpp +++ b/src/liberty/math/Matrix.cpp @@ -54,7 +54,7 @@ CMatrix::Detach(void) void CMatrix::Update(void) { -#if 1 +#ifndef DC_SH4 GetRight() = m_attachment->right; GetForward() = m_attachment->up; GetUp() = m_attachment->at; @@ -68,15 +68,15 @@ void CMatrix::UpdateRW(void) { if (m_attachment) { -#if 1 +#ifndef DC_SH4 m_attachment->right = GetRight(); m_attachment->up = GetForward(); m_attachment->at = GetUp(); m_attachment->pos = GetPosition(); + RwMatrixUpdate(m_attachment); #else mat_copy(*m_attachment, *this); #endif - RwMatrixUpdate(m_attachment); } } @@ -84,8 +84,10 @@ void CMatrix::operator=(CMatrix const &rhs) { mat_copy(*this, rhs); +#ifndef DC_SH4 if (m_attachment) UpdateRW(); +#endif } void @@ -168,7 +170,7 @@ CMatrix::SetScale(float s) void CMatrix::SetTranslate(float x, float y, float z) { -#if 1 +#ifndef DC_SH4 rx = 1.0f; ry = 0.0f; rz = 0.0f; @@ -247,34 +249,52 @@ CMatrix::SetRotateZOnly(float angle) void CMatrix::SetRotateX(float angle) { +#ifndef DC_SH4 SetRotateXOnly(angle); px = 0.0f; py = 0.0f; pz = 0.0f; +#else + dc::mat_identity2(); + dc::mat_apply_rotate_x(angle); + dc::mat_store2(*this); +#endif } - void CMatrix::SetRotateY(float angle) { +#ifndef DC_SH4 SetRotateYOnly(angle); px = 0.0f; py = 0.0f; pz = 0.0f; +#else + dc::mat_identity2(); + dc::mat_apply_rotate_y(angle); + dc::mat_store2(*this); +#endif } void CMatrix::SetRotateZ(float angle) { +#ifndef DC_SH4 SetRotateZOnly(angle); px = 0.0f; py = 0.0f; pz = 0.0f; +#else + dc::mat_identity2(); + dc::mat_apply_rotate_z(angle); + dc::mat_store2(*this); +#endif } void CMatrix::SetRotate(float xAngle, float yAngle, float zAngle) { +#if 1 auto [sX, cX] = SinCos(xAngle); auto [sY, cY] = SinCos(yAngle); auto [sZ, cZ] = SinCos(zAngle); @@ -294,15 +314,19 @@ CMatrix::SetRotate(float xAngle, float yAngle, float zAngle) px = 0.0f; py = 0.0f; pz = 0.0f; +#else + dc::mat_set_rotate(xAngle, yAngle, zAngle); + dc::mat_store2(*this); +#endif } void CMatrix::RotateX(float x) { -#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - dc::mat_load2(reinterpret_cast(this)); - mat_rotate_x(x); - dc::mat_store2(reinterpret_cast(this)); +#if 0// this is bugged and does not yield correct results + dc::mat_set_rotate_x(x); + mat_apply(*this); + dc::mat_store2(*this); #else auto [s, c] = SinCos(x); @@ -329,10 +353,10 @@ CMatrix::RotateX(float x) void CMatrix::RotateY(float y) { -#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - dc::mat_load2(reinterpret_cast(this)); - mat_rotate_y(y); - dc::mat_store2(reinterpret_cast(this)); +#if 0 // this is bugged and does not yield correct results + dc::mat_set_rotate_y(y); + mat_apply(*this); + dc::mat_store2(*this); #else auto [s, c] = SinCos(y); @@ -359,10 +383,10 @@ CMatrix::RotateY(float y) void CMatrix::RotateZ(float z) { -#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - dc::mat_load2(reinterpret_cast(this)); - mat_rotate_z(z); - dc::mat_store2(reinterpret_cast(this)); +#if 0// this is bugged and does not yield correct results + dc::mat_set_rotate_z(z); + mat_apply(*this); + dc::mat_store2(*this); #else auto [s, c] = SinCos(z); @@ -389,10 +413,10 @@ CMatrix::RotateZ(float z) void CMatrix::Rotate(float x, float y, float z) { -#if 0 && defined(DC_SH4) // this is bugged and does not yield correct results - dc::mat_load2(reinterpret_cast(this)); - mat_rotate(x, y, z); - dc::mat_store2(reinterpret_cast(this)); +#if 0 // this is bugged and does not yield correct results + dc::mat_set_rotate(x, y, z); + mat_apply(*this); + dc::mat_store2(*this); #else auto [sX, cX] = SinCos(x); auto [sY, cY] = SinCos(y); diff --git a/src/liberty/renderer/PointLights.cpp b/src/liberty/renderer/PointLights.cpp index 84ac4ab2..716089dc 100644 --- a/src/liberty/renderer/PointLights.cpp +++ b/src/liberty/renderer/PointLights.cpp @@ -80,7 +80,7 @@ CPointLights::GenerateLightsAffectingObject(Const CVector *objCoors) distance = dist.Magnitude(); if(distance < radius){ - float distNorm = distance/radius; + float distNorm = Div(distance, radius); if(aLights[i].type == LIGHT_DARKEN){ // darken the object the closer it is ret *= distNorm; diff --git a/src/liberty/renderer/Renderer.cpp b/src/liberty/renderer/Renderer.cpp index cfdae34a..ba02e0ad 100644 --- a/src/liberty/renderer/Renderer.cpp +++ b/src/liberty/renderer/Renderer.cpp @@ -1315,8 +1315,9 @@ CalcNewDelta(RwV2d *a, RwV2d *b) #define TOINT(x) ((int)(x)) #endif +template void -CRenderer::ScanSectorPoly(RwV2d *poly, int32 numVertices, void (*scanfunc)(CPtrList *)) +CRenderer::ScanSectorPoly(RwV2d *poly, int32 numVertices, F&& scanfunc) { float miny, maxy; int y, yend; diff --git a/src/liberty/renderer/Renderer.h b/src/liberty/renderer/Renderer.h index 0322939c..3517a8da 100644 --- a/src/liberty/renderer/Renderer.h +++ b/src/liberty/renderer/Renderer.h @@ -90,7 +90,8 @@ public: static void ConstructRenderList(void); static void ScanWorld(void); static void RequestObjectsInFrustum(void); - static void ScanSectorPoly(RwV2d *poly, int32 numVertices, void (*scanfunc)(CPtrList *)); + template + static void ScanSectorPoly(RwV2d *poly, int32 numVertices, F &&scanfunc); static void ScanBigBuildingList(CPtrList &list); static void ScanSectorList(CPtrList *lists); static void ScanSectorList_Priority(CPtrList *lists); diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index ff78eddb..851f8f4a 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -410,7 +410,7 @@ Matrix::translate(const V3d *translation, CombineOp op) Matrix trans; dc::mat_set_translation(translation->x, translation->y, translation->z); dc::mat_store2(trans); - trans.flags = Matrix::TYPEORTHONORMAL; + trans.flags = TYPEORTHONORMAL; #endif switch(op){ case COMBINEREPLACE: @@ -439,7 +439,7 @@ Matrix::scale(const V3d *scale, CombineOp op) Matrix scl; dc::mat_set_scale(scale->x, scale->y, scale->z); dc::mat_store2(scl); - scl.flags = Matrix::TYPEORTHONORMAL; + scl.flags = TYPEORTHONORMAL; #endif switch(op){ case COMBINEREPLACE: diff --git a/vendor/librw/src/camera.cpp b/vendor/librw/src/camera.cpp index 0d2d55af..a9b3f8e5 100644 --- a/vendor/librw/src/camera.cpp +++ b/vendor/librw/src/camera.cpp @@ -493,33 +493,7 @@ Camera::frustumTestSphere(const Sphere *s) const static_assert(offsetof (decltype (p[0].plane), distance) -offsetof (decltype (p[0].plane.normal), z) == sizeof (float)); - asm volatile (R"( - frchg - - fmov.s @%0+,fr0 - fmov.s @%1+,fr1 - fmov.s @%2+,fr2 - fmov.s @%3+,fr3 - - fmov.s @%0+,fr4 - fmov.s @%1+,fr5 - fmov.s @%2+,fr6 - fmov.s @%3+,fr7 - - fmov.s @%0+,fr8 - fmov.s @%1+,fr9 - fmov.s @%2+,fr10 - fmov.s @%3+,fr11 - - fmov.s @%0,fr12 - fmov.s @%1,fr13 - fmov.s @%2,fr14 - fmov.s @%3,fr15 - - frchg - )" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3) - : - : ); + mat_load_rows(base_ptr0, base_ptr1, base_ptr2, base_ptr3); float dists[4]; mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw, @@ -643,33 +617,7 @@ Camera::frustumTestSphereNear(const Sphere *s) const static_assert(offsetof (decltype (p[0].plane), distance) -offsetof (decltype (p[0].plane.normal), z) == sizeof (float)); - asm volatile (R"( - frchg - - fmov.s @%0+,fr0 - fmov.s @%1+,fr1 - fmov.s @%2+,fr2 - fmov.s @%3+,fr3 - - fmov.s @%0+,fr4 - fmov.s @%1+,fr5 - fmov.s @%2+,fr6 - fmov.s @%3+,fr7 - - fmov.s @%0+,fr8 - fmov.s @%1+,fr9 - fmov.s @%2+,fr10 - fmov.s @%3+,fr11 - - fmov.s @%0,fr12 - fmov.s @%1,fr13 - fmov.s @%2,fr14 - fmov.s @%3,fr15 - - frchg - )" : "+&r" (base_ptr0), "+&r" (base_ptr1), "+&r" (base_ptr2), "+&r" (base_ptr3) - : - : ); + mat_load_rows(base_ptr0, base_ptr1, base_ptr2, base_ptr3); float dists[4]; mat_trans_vec4_nodiv_nomod(sx, sy, sz, sw, diff --git a/vendor/librw/src/dc/rwdc.cpp b/vendor/librw/src/dc/rwdc.cpp index aa8a7c14..c6c8e29a 100644 --- a/vendor/librw/src/dc/rwdc.cpp +++ b/vendor/librw/src/dc/rwdc.cpp @@ -811,7 +811,8 @@ struct chunked_vector { } // Iterate over each element and invoke the callback. - void forEach(void(*cb)(T&)) { + template + void forEach(F&& cb) { for (chunk* curr = first; curr; curr = curr->header.next) { for (size_t i = 0; i < curr->header.used; ++i) { cb(curr->items[i]); diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index fbd7b9ea..eab4b277 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -66,9 +66,38 @@ struct quaternion_t { float x, y, z, w; }; -__always_inline __hot constexpr float Sin(float x) { return sinf(x); } -__always_inline __hot constexpr float Cos(float x) { return cosf(x); } -__always_inline __hot constexpr auto SinCos(float x) { return std::pair { Sin(x), Cos(x) }; } +template +__always_inline __hot constexpr float Sin(float x) { +#ifdef DC_SH4 + if constexpr(!BUILTIN) + return fsin(x); + else +#endif + return sinf(x); +} + +template +__always_inline __hot constexpr float Cos(float x) { +#ifdef DC_SH4 + if constexpr(!BUILTIN) + return fcos(x); + else +#endif + return cosf(x); +} + +template +__always_inline __hot constexpr auto SinCos(float x) { +#ifdef DC_SH4 + if constexpr(!BUILTIN) { + std::pair result; + fsincosr(x, &std::get<0>(result), &std::get<1>(result)); + return result; + } else +#endif + return std::pair { Sin(x), Cos(x) }; +} + __always_inline __hot constexpr float Tan(float x) { return tanf(x); } __always_inline __hot constexpr float Atan(float x) { return atanf(x); } __always_inline __hot constexpr float Atan2(float y, float x) { return atan2f(y, x); } @@ -78,8 +107,73 @@ __always_inline __hot constexpr float Abs(float x) { return fabsf(x); } __always_inline __hot constexpr float Sqrt(float x) { return sqrtf(x); } __always_inline __hot constexpr float RecipSqrt(float x, float y) { return x / Sqrt(y); } __always_inline __hot constexpr float Pow(float x, float y) { return powf(x, y); } -__always_inline __hot constexpr float Floor(float x) { return floorf(x); } -__always_inline __hot constexpr float Ceil(float x) { return ceilf(x); } + +template +__always_inline __hot constexpr float Floor(float x) { +#ifdef DC_SH4 + if(!std::is_constant_evaluated() && FAST_APPROX) { + float output_float; + unsigned int scratch_reg; + unsigned int scratch_reg2; + + asm volatile (R"( + mov #0x4f, %[scratch] + shll16 %[scratch] + shll8 %[scratch] + lds %[scratch], fpul + mov #1, %[scratch2] + fsts fpul, %[float_out] + fadd %[floatx], %[float_out] + rotr %[scratch2] + ftrc %[float_out], fpul + sts fpul, %[scratch] + add %[scratch2], %[scratch] + lds %[scratch], fpul + float fpul, %[float_out] + )" + : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float) + : [floatx] "f" (x) + : "fpul", "t"); + + return output_float; + } else +#endif + return floorf(x); +} + +template +__always_inline __hot constexpr float Ceil(float x) { +#ifdef DC_SH4 + if(!std::is_constant_evaluated() && FAST_APPROX) { + float output_float; + unsigned int scratch_reg; + unsigned int scratch_reg2; + + asm volatile (R"( + mov #0x4f, %[scratch] + shll16 %[scratch] + shll8 %[scratch] + lds %[scratch], fpul + mov #1, %[scratch2] + fsts fpul, %[float_out] + fsub %[floatx], %[float_out] + rotr %[scratch2] + ftrc %[float_out], fpul + sts fpul, %[scratch] + add %[scratch2], %[scratch] + lds %[scratch], fpul + float fpul, %[float_out] + fneg %[float_out] + )" + : [scratch] "=&r" (scratch_reg), [scratch2] "=&r" (scratch_reg2), [float_out] "=&f" (output_float) + : [floatx] "f" (x) + : "fpul", "t"); + + return output_float; + } else +#endif + return ceilf(x); +} __always_inline __hot constexpr float Fmac(auto a, auto b, auto c) { return a * b + c; } __always_inline __hot constexpr float Lerp(float a, float b, float t) { return Fmac(t, (b - a), a); } __always_inline __hot constexpr auto Max(auto a, auto b) { return ((a > b)? a : b); } @@ -94,7 +188,7 @@ __always_inline __hot constexpr float RecipSqrt(float x) { return 1.0f / Sqrt(x); } -template +template __always_inline __hot constexpr T Clamp(T v, auto low, auto high) { return std::clamp(v, static_cast(low), static_cast(high)); } @@ -185,108 +279,133 @@ __always_inline __hot constexpr auto Norm(auto value, auto min, auto max) { x2 = __x; y2 = __y; z2 = __z; w2 = __w; \ } while(false) -inline __hot __icache_aligned void mat_load2(const matrix_t *mtx) { - asm volatile( - R"( - fschg - fmov.d @%[mtx],xd0 - add #32,%[mtx] - pref @%[mtx] - add #-(32-8),%[mtx] - fmov.d @%[mtx]+,xd2 - fmov.d @%[mtx]+,xd4 - fmov.d @%[mtx]+,xd6 - fmov.d @%[mtx]+,xd8 - fmov.d @%[mtx]+,xd10 - fmov.d @%[mtx]+,xd12 - fmov.d @%[mtx]+,xd14 - fschg - )" - : [mtx] "+r" (mtx) - : - : - ); +inline __hot __icache_aligned +void mat_load2(const matrix_t *mtx) { + asm volatile(R"( + fschg + fmov.d @%[mtx],xd0 + add #32,%[mtx] + pref @%[mtx] + add #-(32-8),%[mtx] + fmov.d @%[mtx]+,xd2 + fmov.d @%[mtx]+,xd4 + fmov.d @%[mtx]+,xd6 + fmov.d @%[mtx]+,xd8 + fmov.d @%[mtx]+,xd10 + fmov.d @%[mtx]+,xd12 + fmov.d @%[mtx]+,xd14 + fschg + )" + : [mtx] "+r" (mtx)); } -inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) { - asm volatile( - R"( - frchg +inline __hot __icache_aligned +void mat_load_transpose(const matrix_t *mtx) { + asm volatile(R"( + frchg - fmov.s @%[mtx]+, fr0 + fmov.s @%[mtx]+, fr0 - add #32, %[mtx] - pref @%[mtx] - add #-(32 - 4), %[mtx] + add #32, %[mtx] + pref @%[mtx] + add #-(32 - 4), %[mtx] - fmov.s @%[mtx]+, fr4 - fmov.s @%[mtx]+, fr8 - fmov.s @%[mtx]+, fr12 + fmov.s @%[mtx]+, fr4 + fmov.s @%[mtx]+, fr8 + fmov.s @%[mtx]+, fr12 - fmov.s @%[mtx]+, fr1 - fmov.s @%[mtx]+, fr5 - fmov.s @%[mtx]+, fr9 - fmov.s @%[mtx]+, fr13 + fmov.s @%[mtx]+, fr1 + fmov.s @%[mtx]+, fr5 + fmov.s @%[mtx]+, fr9 + fmov.s @%[mtx]+, fr13 - fmov.s @%[mtx]+, fr2 - fmov.s @%[mtx]+, fr6 - fmov.s @%[mtx]+, fr10 - fmov.s @%[mtx]+, fr14 + fmov.s @%[mtx]+, fr2 + fmov.s @%[mtx]+, fr6 + fmov.s @%[mtx]+, fr10 + fmov.s @%[mtx]+, fr14 - fmov.s @%[mtx]+, fr3 - fmov.s @%[mtx]+, fr7 - fmov.s @%[mtx]+, fr11 - fmov.s @%[mtx]+, fr15 + fmov.s @%[mtx]+, fr3 + fmov.s @%[mtx]+, fr7 + fmov.s @%[mtx]+, fr11 + fmov.s @%[mtx]+, fr15 - frchg - )" - : [mtx] "+r" (mtx) - : - : - ); + frchg + )" + : [mtx] "+r" (mtx)); } -inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) { - asm volatile( - R"( - frchg +inline __hot __icache_aligned +void mat_load_rows(const float *r1, const float *r2, const float *r3, const float *r4) { + asm volatile (R"( + frchg - fmov.s @%[mtx]+, fr0 + pref @%1 + fmov.s @%0+,fr0 + fmov.s @%0+,fr4 + fmov.s @%0+,fr8 + fmov.s @%0, fr12 - add #32, %[mtx] - pref @%[mtx] - add #-(32 - 4), %[mtx] + pref @%2 + fmov.s @%1+,fr1 + fmov.s @%1+,fr5 + fmov.s @%1+,fr9 + fmov.s @%1, fr13 - fmov.s @%[mtx]+, fr4 - fmov.s @%[mtx]+, fr8 - fldi0 fr12 - add #4, %[mtx] + pref @%3 + fmov.s @%2+,fr2 + fmov.s @%2+,fr6 + fmov.s @%2+,fr10 + fmov.s @%2,fr14 - fmov.s @%[mtx]+, fr1 - fmov.s @%[mtx]+, fr5 - fmov.s @%[mtx]+, fr9 - fldi0 fr13 - add #4, %[mtx] + fmov.s @%3+,fr3 + fmov.s @%3+,fr7 + fmov.s @%3+,fr11 + fmov.s @%3,fr15 - fmov.s @%[mtx]+, fr2 - fmov.s @%[mtx]+, fr6 - fmov.s @%[mtx]+, fr10 - fldi0 fr14 - - fldi0 fr3 - fldi0 fr7 - fmov fr3, fr11 - fldi1 fr15 - - frchg - )" - : [mtx] "+r" (mtx) - : - : - ); + frchg + )" + : "+&r" (r1), "+&r" (r2), "+&r" (r3), "+&r" (r4)); } -inline __hot __icache_aligned void mat_invert_tranpose() { +inline __hot __icache_aligned +void mat_load_3x3_transpose(const matrix_t *mtx) { + asm volatile(R"( + frchg + + fmov.s @%[mtx]+, fr0 + + add #32, %[mtx] + pref @%[mtx] + add #-(32 - 4), %[mtx] + + fmov.s @%[mtx]+, fr4 + fmov.s @%[mtx]+, fr8 + fldi0 fr12 + add #4, %[mtx] + + fmov.s @%[mtx]+, fr1 + fmov.s @%[mtx]+, fr5 + fmov.s @%[mtx]+, fr9 + fldi0 fr13 + add #4, %[mtx] + + fmov.s @%[mtx]+, fr2 + fmov.s @%[mtx]+, fr6 + fmov.s @%[mtx]+, fr10 + fldi0 fr14 + + fldi0 fr3 + fldi0 fr7 + fmov fr3, fr11 + fldi1 fr15 + + frchg + )" + : [mtx] "+r" (mtx)); +} + +inline __hot __icache_aligned +void mat_invert_tranpose() { asm volatile( "frchg\n\t" "fneg fr12\n\t" @@ -323,235 +442,249 @@ inline __hot __icache_aligned void mat_invert_tranpose() { :); } -inline __hot __icache_aligned void mat_store2(matrix_t *mtx) { - asm volatile( - R"( - fschg - add #64-8,%[mtx] - fmov.d xd14,@%[mtx] - add #-32,%[mtx] - pref @%[mtx] - add #32,%[mtx] - fmov.d xd12,@-%[mtx] - fmov.d xd10,@-%[mtx] - fmov.d xd8,@-%[mtx] - fmov.d xd6,@-%[mtx] - fmov.d xd4,@-%[mtx] - fmov.d xd2,@-%[mtx] - fmov.d xd0,@-%[mtx] - fschg - )" - : [mtx] "+&r" (mtx), "=m" (*mtx) - : - : - ); +inline __hot __icache_aligned +void mat_store2(matrix_t *mtx) { + asm volatile(R"( + fschg + add #64-8,%[mtx] + fmov.d xd14,@%[mtx] + add #-32,%[mtx] + pref @%[mtx] + add #32,%[mtx] + fmov.d xd12,@-%[mtx] + fmov.d xd10,@-%[mtx] + fmov.d xd8,@-%[mtx] + fmov.d xd6,@-%[mtx] + fmov.d xd4,@-%[mtx] + fmov.d xd2,@-%[mtx] + fmov.d xd0,@-%[mtx] + fschg + )" + : [mtx] "+&r" (mtx), "=m" (*mtx)); } -inline __hot __icache_aligned void mat_identity2(void) { - asm volatile( - R"( - frchg - fldi1 fr0 - fschg - fldi0 fr1 - fldi0 fr2 - fldi0 fr3 - fldi0 fr4 - fldi1 fr5 - fmov dr2,dr6 - fmov dr2,dr8 - fmov dr0,dr10 - fmov dr2,dr12 - fmov dr4,dr14 - fschg - frchg - )" - ); +inline __hot __icache_aligned +void mat_identity2(void) { + asm volatile(R"( + frchg + fldi1 fr0 + fschg + fldi0 fr1 + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fldi1 fr5 + fmov dr2,dr6 + fmov dr2,dr8 + fmov dr0,dr10 + fmov dr2,dr12 + fmov dr4,dr14 + fschg + frchg + )"); } -inline __hot __icache_aligned void mat_set_scale(float x, float y, float z) { - asm volatile( - R"( - frchg - fldi0 fr1 - fschg - fldi0 fr2 - fldi0 fr3 - fldi0 fr4 - fmov dr2, dr6 - fmov dr2, dr8 - fldi0 fr11 - fmov dr2, dr12 - fldi0 fr14 - fschg - fmov.s @%[x], fr0 - fmov.s @%[y], fr5 - fmov.s @%[z], fr10 - fldi1 fr15 - frchg - )" - : - : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z) - : - ); +inline __hot __icache_aligned +void mat_set_scale(float x, float y, float z) { + asm volatile(R"( + frchg + fldi0 fr1 + fschg + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fmov dr2, dr6 + fmov dr2, dr8 + fldi0 fr11 + fmov dr2, dr12 + fldi0 fr14 + fschg + fmov.s @%[x], fr0 + fmov.s @%[y], fr5 + fmov.s @%[z], fr10 + fldi1 fr15 + frchg + )" + : + : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)); } -inline __hot __icache_aligned void mat_set_scale(float s) { - asm volatile( - R"( - frchg - fldi0 fr1 - fschg - fldi0 fr2 - fldi0 fr3 - fldi0 fr4 - fmov dr2, dr6 - fmov dr2, dr8 - fldi0 fr11 - fmov dr2, dr12 - fldi0 fr14 - fschg - fmov.s @%[s], fr0 - fmov fr0, fr5 - fmov fr0, fr10 - fldi1 fr15 - frchg - )" - : - : [s] "r" (&s) - : - ); +inline __hot __icache_aligned +void mat_set_scale(float s) { + asm volatile(R"( + frchg + fldi0 fr1 + fschg + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fmov dr2, dr6 + fmov dr2, dr8 + fldi0 fr11 + fmov dr2, dr12 + fldi0 fr14 + fschg + fmov.s @%[s], fr0 + fmov fr0, fr5 + fmov fr0, fr10 + fldi1 fr15 + frchg + )" + : + : [s] "r" (&s)); } -inline __hot __icache_aligned void mat_set_translation(float x, float y, float z) { - asm volatile( - R"( - frchg - fldi1 fr0 - fschg - fldi0 fr1 - fldi0 fr2 - fldi0 fr3 - fldi0 fr4 - fldi1 fr5 - fmov dr2,dr6 - fmov dr2,dr8 - fmov dr0,dr10 - fschg - fmov.s @%[x], fr12 - fmov.s @%[y], fr13 - fmov.s @%[z], fr14 - fldi1 fr15 - frchg - )" - : - : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z) - ); +// Don't multiply anything by fr3, since not loading from vector???? +inline __hot __icache_aligned +void mat_apply_scale(float x, float y, float z) { + asm volatile(R"( + fschg + fmov xd0, dr4 + fmov xd2, dr6 + fschg + + frchg + fmov.s @%[x], fr0 + fmov.s @%[y], fr1 + fmov.s @%[z], fr2 + + fmul fr0, fr4 + fmul fr0, fr8 + fmul fr0, fr12 + fmul fr1, fr5 + fmul fr1, fr9 + fmul fr1, fr13 + fmul fr2, fr6 + fmul fr2, fr10 + fmul fr2, fr14 + + fschg + fmov dr4, xd0 + fmul fr3, fr7 + fmov dr6, xd2 + fmul fr3, fr11 + fmov xd4, dr4 + fmul fr3, fr15 + fmov xd6, dr6 + fschg + + fmul fr4, fr0 + + fmul fr5, fr1 + fmul fr6, fr2 + fmul fr7, fr3 + + fschg + fmov xd0, dr4 + fmov xd2, dr6 + fschg + + frchg + )" + : + : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z) + : "fr0", "fr1", "fr2"); } -// no declspec naked, so can't do rts / fschg. instead compiler pads with nop? -inline __hot void mat_load_3x3(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg +inline __hot __icache_aligned +void mat_apply_translation(float x, float y, float z) { + asm volatile(R"( + fschg + fmov xd12, dr4 + fmov xd14, dr6 + fschg - fmov @%[mtx]+, dr0 + fmov.s @%[x], fr0 + fmov.s @%[y], fr1 + fmov.s @%[z], fr2 - fldi0 fr12 - fldi0 fr13 + fadd fr0, fr4 + fadd fr1, fr5 + fadd fr2, fr6 - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - - fldi0 fr3 - fldi0 fr7 - fldi0 fr11 - fmov dr12, dr14 - - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); + fschg + fmov dr4, xd12 + fmov dr6, xd14 + fschg + )" + : + : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z) + : "fr0", "fr1", "fr2", "fr4", "fr5", "fr6", "fr7"); } -// sets pos.w to 1 -inline __hot void mat_load_4x4(const rw::Matrix* mtx) { - __asm__ __volatile__ ( - R"( - fschg - frchg - fmov @%[mtx]+, dr0 - - fmov @%[mtx]+, dr2 - fmov @%[mtx]+, dr4 - fmov @%[mtx]+, dr6 - fmov @%[mtx]+, dr8 - fmov @%[mtx]+, dr10 - fmov @%[mtx]+, dr12 - fmov @%[mtx]+, dr14 - fldi1 fr15 - - fschg - frchg - )" - : [mtx] "+r" (mtx) - ); - } - -__hot inline void mat_transpose(void) { - asm volatile ( - "frchg\n\t" // fmov for singles only works on front bank - // FR0, FR5, FR10, and FR15 are already in place - // swap FR1 and FR4 - "flds FR1, FPUL\n\t" - "fmov FR4, FR1\n\t" - "fsts FPUL, FR4\n\t" - // swap FR2 and FR8 - "flds FR2, FPUL\n\t" - "fmov FR8, FR2\n\t" - "fsts FPUL, FR8\n\t" - // swap FR3 and FR12 - "flds FR3, FPUL\n\t" - "fmov FR12, FR3\n\t" - "fsts FPUL, FR12\n\t" - // swap FR6 and FR9 - "flds FR6, FPUL\n\t" - "fmov FR9, FR6\n\t" - "fsts FPUL, FR9\n\t" - // swap FR7 and FR13 - "flds FR7, FPUL\n\t" - "fmov FR13, FR7\n\t" - "fsts FPUL, FR13\n\t" - // swap FR11 and FR14 - "flds FR11, FPUL\n\t" - "fmov FR14, FR11\n\t" - "fsts FPUL, FR14\n\t" - // restore XMTRX to back bank - "frchg\n" - : // no outputs - : // no inputs - : "fpul" // clobbers - ); +inline __hot __icache_aligned +void mat_set_translation(float x, float y, float z) { + asm volatile(R"( + frchg + fldi1 fr0 + fschg + fldi0 fr1 + fldi0 fr2 + fldi0 fr3 + fldi0 fr4 + fldi1 fr5 + fmov dr2,dr6 + fmov dr2,dr8 + fmov dr0,dr10 + fschg + fmov.s @%[x], fr12 + fmov.s @%[y], fr13 + fmov.s @%[z], fr14 + fldi1 fr15 + frchg + )" + : + : [x] "r" (&x), [y] "r" (&y), [z] "r" (&z)); } -__hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) { +inline __hot __icache_aligned +void mat_transpose(void) { + asm volatile (R"( + frchg + + flds fr1, fpul + fmov fr4, fr1 + fsts fpul, fr4 + + flds fr2, fpul + fmov fr8, fr2 + fsts fpul, fr8 + + flds fr3, fpul + fmov fr12, fr3 + fsts fpul, fr12 + + flds fr6, fpul + fmov fr9, fr6 + fsts fpul, fr9 + + flds fr7, fpul + fmov fr13, fr7 + fsts fpul, fr13 + + flds fr11, fpUL + fmov fr14, fr11 + fsts fpul, fr14 + + frchg + )" + : + : + : "fpul"); +} + +__hot __icache_aligned inline +void mat_copy(matrix_t *dst, const matrix_t *src) { asm volatile(R"( fschg pref @%[dst] - fmov.d @%[src]+, xd0 fmov.d @%[src]+, xd2 fmov.d @%[src]+, xd4 fmov.d @%[src]+, xd6 pref @%[src] - add #32, %[dst] fmov.d xd6, @-%[dst] @@ -568,141 +701,243 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) fmov.d @%[src]+, xd6 add #32, %[dst] - fmov.d xd6, @-%[dst] fmov.d xd4, @-%[dst] fmov.d xd2, @-%[dst] fmov.d xd0, @-%[dst] fschg - )": [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst) - : - :); + )" + : [dst] "+&r" (dst), [src] "+&r" (src), "=m" (*dst)); } -__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { +constexpr float fsca_scale = 10430.37835f; + +__hot __icache_aligned inline +void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { unsigned int prefetch_scratch; - asm volatile ( - "mov %[bmtrx], %[pref_scratch]\n\t" // (MT) - "add #32, %[pref_scratch]\n\t" // offset by 32 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) - "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) (FE) - "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) - // back matrix - "fmov.d @%[bmtrx]+, XD0\n\t" // (LS) - "fmov.d @%[bmtrx]+, XD2\n\t" - "fmov.d @%[bmtrx]+, XD4\n\t" - "fmov.d @%[bmtrx]+, XD6\n\t" - "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait (LS) - "fmov.d @%[bmtrx]+, XD8\n\t" // bmtrx prefetch should work for here - "fmov.d @%[bmtrx]+, XD10\n\t" - "fmov.d @%[bmtrx]+, XD12\n\t" - "mov %[fmtrx], %[pref_scratch]\n\t" // (MT) - "add #32, %[pref_scratch]\n\t" // store offset by 32 in r0 (EX - flow dependency, but 'add' is actually parallelized since 'mov Rm, Rn' is 0-cycle) - "fmov.d @%[bmtrx], XD14\n\t" - "pref @%[pref_scratch]\n\t" // Get a head start prefetching the second half of the 64-byte data (LS) - // front matrix - // interleave loads and matrix multiply 4x4 - "fmov.d @%[fmtrx]+, DR0\n\t" - "fmov.d @%[fmtrx]+, DR2\n\t" - "fmov.d @%[fmtrx]+, DR4\n\t" // (LS) want to issue the next one before 'ftrv' for parallel exec - "ftrv XMTRX, FV0\n\t" // (FE) + asm volatile (R"( + mov %[m1], %[prefscr] + add #32, %[prefscr] + fschg + pref @%[prefscr] - "fmov.d @%[fmtrx]+, DR6\n\t" - "fmov.d @%[fmtrx]+, DR8\n\t" - "ftrv XMTRX, FV4\n\t" + fmov.d @%[m1]+, xd0 + fmov.d @%[m1]+, xd2 + fmov.d @%[m1]+, xd4 + fmov.d @%[m1]+, xd6 + pref @%[m1] + fmov.d @%[m1]+, xd8 + fmov.d @%[m1]+, xd10 + fmov.d @%[m1]+, xd12 + mov %[m2], %[prefscr] + add #32, %[prefscr] + fmov.d @%[m1], xd14 + pref @%[prefscr] - "fmov.d @%[fmtrx]+, DR10\n\t" - "fmov.d @%[fmtrx]+, DR12\n\t" - "ftrv XMTRX, FV8\n\t" + fmov.d @%[m2]+, dr0 + fmov.d @%[m2]+, dr2 + fmov.d @%[m2]+, dr4 + ftrv xmtrx, fv0 - "fmov.d @%[fmtrx], DR14\n\t" // (LS, but this will stall 'ftrv' for 3 cycles) - "fschg\n\t" // switch back to single moves (and avoid stalling 'ftrv') (FE) - "ftrv XMTRX, FV12\n\t" // (FE) - // Save output in XF regs - "frchg\n" - : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2), [pref_scratch] "=&r" (prefetch_scratch) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed - : // no inputs - : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) - ); + fmov.d @%[m2]+, dr6 + fmov.d @%[m2]+, dr8 + ftrv xmtrx, fv4 + + fmov.d @%[m2]+, dr10 + fmov.d @%[m2]+, dr12 + ftrv xmtrx, fv8 + + fmov.d @%[m2], dr14 + fschg + ftrv xmtrx, fv12 + frchg + )" + : [m1] "+&r" (matrix1), [m2] "+r" (matrix2), [prefscr] "=&r" (prefetch_scratch) + : + : "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15"); } -__hot inline void mat_apply_rotate_x(float x) { - x *= 10430.37835f; - asm volatile( - "ftrc %[a], fpul\n\t" - "fsca fpul, dr4\n\t" - "fldi0 fr8\n\t" - "fldi0 fr11\n\t" - "fmov fr5, fr10\n\t" - "fmov fr4, fr9\n\t" - "fneg fr9\n\t" - "ftrv xmtrx, fv8\n\t" - "fmov fr4, fr6\n\t" - "fldi0 fr7\n\t" - "fldi0 fr4\n\t" - "ftrv xmtrx, fv4\n\t" - "fschg\n\t" - "fmov dr8, xd8\n\t" - "fmov dr10, xd10\n\t" - "fmov dr4, xd4\n\t" - "fmov dr6, xd6\n\t" - "fschg\n" - : - : [a] "f"(x) - : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +__hot __icache_aligned inline void mat_set_rotate_x(float x) { + x *= fsca_scale; + asm volatile(R"( + ftrc %[x], fpul + frchg + fldi0 fr1 + fldi0 fr2 + fldi0 fr3 + fldi0 fr7 + fldi0 fr8 + fldi0 fr12 + fldi0 fr13 + fsca fpul, dr0 + fldi0 fr4 + fldi0 fr11 + fldi0 fr14 + fldi1 fr15 + fmov fr1, fr5 + fmov fr1, fr10 + fmov fr0, fr9 + fmov fr0, fr6 + fneg fr6 + fldi1 fr0 + fldi0 fr1 + frchg + )" + : + : [x] "f" (x) + : "fpul"); } -__hot inline void mat_apply_rotate_y(float y) { - y *= 10430.37835f; - asm volatile( - "ftrc %[a], fpul\n\t" - "fsca fpul, dr6\n\t" - "fldi0 fr9\n\t" - "fldi0 fr11\n\t" - "fmov fr6, fr8\n\t" - "fmov fr7, fr10\n\t" - "ftrv xmtrx, fv8\n\t" - "fmov fr7, fr4\n\t" - "fldi0 fr5\n\t" - "fneg fr6\n\t" - "fldi0 fr7\n\t" - "ftrv xmtrx, fv4\n\t" - "fschg\n\t" - "fmov dr8, xd8\n\t" - "fmov dr10, xd10\n\t" - "fmov dr4, xd0\n\t" - "fmov dr6, xd2\n\t" - "fschg\n" - : - : [a] "f"(y) - : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +__hot __icache_aligned inline void mat_apply_rotate_x(float x) { + x *= fsca_scale; + asm volatile(R"( + ftrc %[x], fpul + fsca fpul, dr4 + fldi0 fr8 + fldi0 fr11 + fmov fr5, fr10 + fmov fr4, fr9 + fneg fr9 + ftrv xmtrx, fv8 + fmov fr4, fr6 + fldi0 fr7 + fldi0 fr4 + ftrv xmtrx, fv4 + fschg + fmov dr8, xd8 + fmov dr10, xd10 + fmov dr4, xd4 + fmov dr6, xd6 + fschg + )" + : + : [x] "f"(x) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); } -__hot inline void mat_apply_rotate_z(float z) { - z *= 10430.37835f; - asm volatile( - "ftrc %[a], fpul\n\t" - "fsca fpul, dr8\n\t" - "fldi0 fr10\n\t" - "fldi0 fr11\n\t" - "fmov fr8, fr5\n\t" - "fneg fr8\n\t" - "ftrv xmtrx, fv8\n\t" - "fmov fr9, fr4\n\t" - "fschg\n\t" - "fmov dr10, dr6\n\t" - "ftrv xmtrx, fv4\n\t" - "fmov dr8, xd4\n\t" - "fmov dr10, xd6\n\t" - "fmov dr4, xd0\n\t" - "fmov dr6, xd2\n\t" - "fschg\n" - : - : [a] "f"(z) - : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); + +__hot __icache_aligned inline void mat_set_rotate_y(float y) { + y *= fsca_scale; + asm volatile(R"( + ftrc %[y], fpul + frchg + fldi0 fr3 + fldi1 fr5 + fldi0 fr6 + fldi0 fr7 + fldi0 fr12 + fldi0 fr13 + fsca fpul, dr0 + fldi0 fr4 + fldi0 fr9 + fldi0 fr11 + fldi0 fr14 + fldi1 fr15 + fmov fr1, fr10 + fmov fr0, fr8 + fneg fr8 + fmov fr0, fr2 + fmov fr1, fr0 + fldi0 fr1 + frchg + )" + : + : [y] "f" (y) + : "fpul"); } +__hot __icache_aligned inline void mat_apply_rotate_y(float y) { + y *= fsca_scale; + asm volatile(R"( + ftrc %[y], fpul + fsca fpul, dr6 + fldi0 fr9 + fldi0 fr11 + fmov fr6, fr8 + fmov fr7, fr10 + ftrv xmtrx, fv8 + fmov fr7, fr4 + fldi0 fr5 + fneg fr6 + fldi0 fr7 + ftrv xmtrx, fv4 + fschg + fmov dr8, xd8 + fmov dr10, xd10 + fmov dr4, xd0 + fmov dr6, xd2 + fschg + )" + : + : [y] "f" (y) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +__hot __icache_aligned inline void mat_apply_rotate_z(float z) { + z *= fsca_scale; + asm volatile(R"( + ftrc %[z], fpul + fsca fpul, dr8 + fldi0 fr10 + fldi0 fr11 + fmov fr8, fr5 + fneg fr8 + ftrv xmtrx, fv8 + fmov fr9, fr4 + fschg + fmov dr10, dr6 + ftrv xmtrx, fv4 + fmov dr8, xd4 + fmov dr10, xd6 + fmov dr4, xd0 + fmov dr6, xd2 + fschg + )" + : + : [z] "f" (z) + : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); +} + +__hot __icache_aligned inline void mat_set_rotate_z(float z) { + z *= fsca_scale; + asm volatile(R"( + ftrc %[z], fpul + frchg + fldi0 fr2 + fldi0 fr3 + fldi1 fr10 + fldi0 fr11 + fsca fpul, dr4 + fschg + fmov dr2, dr6 + fmov dr2, dr8 + fmov dr2, dr12 + fldi0 fr14 + fldi1 fr15 + fschg + fmov fr5, fr0 + fmov fr4, fr1 + fneg fr1 + frchg + )" + : + : [z] "f" (z) + : "fpul"); +} + +__hot __icache_aligned inline void mat_set_rotate(float x, float y, float z) { + mat_set_rotate_x(x); + mat_apply_rotate_y(y); + mat_apply_rotate_z(z); +} + +__hot __icache_aligned inline void mat_apply_rotate(float x, float y, float z) { + mat_apply_rotate_x(x); + mat_apply_rotate_y(y); + mat_apply_rotate_z(z); +} //TODO: FIXME FOR VC (AND USE FTRV) template @@ -880,23 +1115,6 @@ __always_inline constexpr float fipr2D(float x1, float y1, float x2, float y2) { w_ = tmp1233123.w; \ } while(false) -inline void mat_load_3x3(const rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[0][3] = 0.0f; - XMTRX[1][3] = 0.0f; - XMTRX[2][3] = 0.0f; - - XMTRX[3][0] = 0.0f; - XMTRX[3][1] = 0.0f; - XMTRX[3][2] = 0.0f; - XMTRX[3][3] = 0.0f; -} - -inline void mat_load_4x4(const rw::Matrix* mtx) { - memcpy(XMTRX, mtx, sizeof(matrix_t)); - XMTRX[3][3] = 1.0f; -} - inline void mat_transpose(void) { matrix_t tmp; diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 1004e0e7..35334bc0 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -451,6 +451,9 @@ struct alignas(8) MatrixBase uint32 pad3; float posw = 1.0f; }; + + operator matrix_t *() { return reinterpret_cast(this); } + operator const matrix_t *() const { return reinterpret_cast(this); } }; struct Matrix: public MatrixBase @@ -463,9 +466,9 @@ struct Matrix: public MatrixBase Matrix() {} - Matrix(MatrixBase &&aggregate): - MatrixBase{aggregate} - {} + Matrix(MatrixBase &&aggregate){ + *this = aggregate; + } Matrix(const Matrix &rhs) { *this = rhs; @@ -476,8 +479,10 @@ struct Matrix: public MatrixBase return *this; } - operator matrix_t *() { return reinterpret_cast(this); } - operator const matrix_t *() const { return reinterpret_cast(this); } + Matrix &operator=(const MatrixBase &rhs) { + dc::mat_copy(*this, rhs); + return *this; + } static Matrix *create(void); void destroy(void); @@ -514,10 +519,12 @@ inline void convMatrix(Matrix *dst, RawMatrix *src){ inline void convMatrix(RawMatrix *dst, Matrix *src){ *dst = *(RawMatrix*)src; +#ifndef DC_SH4 dst->rightw = 0.0; dst->upw = 0.0; dst->atw = 0.0; dst->posw = 1.0; +#endif } struct Line