diff --git a/src/liberty/renderer/Coronas.cpp b/src/liberty/renderer/Coronas.cpp index e9f9e662..7f7de9cc 100644 --- a/src/liberty/renderer/Coronas.cpp +++ b/src/liberty/renderer/Coronas.cpp @@ -300,9 +300,9 @@ CCoronas::Render(void) if(aCoronas[i].fadeAlpha && spriteCoors.z < aCoronas[i].drawDist){ - float recipz = 1.0f/spriteCoors.z; + float recipz = dc::Invert(spriteCoors.z); float fadeDistance = aCoronas[i].drawDist / 2.0f; - float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance; + float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div((spriteCoors.z - fadeDistance), fadeDistance); int totalFade = aCoronas[i].fadeAlpha * distanceFade; if(aCoronas[i].LOScheck) @@ -313,6 +313,7 @@ CCoronas::Render(void) // render corona itself if(aCoronas[i].texture){ float fogscale = CWeather::Foggyness*Min(spriteCoors.z, 40.0f)/40.0f + 1.0f; + float invFogScale = dc::Invert(fogscale); if(CCoronas::aCoronas[i].id == SUN_CORE) spriteCoors.z = 0.95f * RwCameraGetFarClipPlane(Scene.camera); RwRenderStateSet(rwRENDERSTATETEXTURERASTER, RwTextureGetRaster(aCoronas[i].texture)); @@ -328,9 +329,9 @@ CCoronas::Render(void) CSprite::RenderOneXLUSprite(spriteCoors.x, spriteCoors.y, spriteCoors.z, spritew * aCoronas[i].size * wscale, spriteh * aCoronas[i].size * fogscale * hscale, - CCoronas::aCoronas[i].red / fogscale, - CCoronas::aCoronas[i].green / fogscale, - CCoronas::aCoronas[i].blue / fogscale, + CCoronas::aCoronas[i].red * invFogScale, + CCoronas::aCoronas[i].green * invFogScale, + CCoronas::aCoronas[i].blue * invFogScale, totalFade, recipz, 255); @@ -339,9 +340,9 @@ CCoronas::Render(void) spriteCoors.x, spriteCoors.y, spriteCoors.z, spritew * aCoronas[i].size * fogscale, spriteh * aCoronas[i].size * fogscale, - CCoronas::aCoronas[i].red / fogscale, - CCoronas::aCoronas[i].green / fogscale, - CCoronas::aCoronas[i].blue / fogscale, + CCoronas::aCoronas[i].red * invFogScale, + CCoronas::aCoronas[i].green * invFogScale, + CCoronas::aCoronas[i].blue * invFogScale, totalFade, recipz, 20.0f * recipz, @@ -365,7 +366,7 @@ CCoronas::Render(void) (spriteCoors.x - (screenw/2)) * flare->position + (screenw/2), (spriteCoors.y - (screenh/2)) * flare->position + (screenh/2), spriteCoors.z, - 4.0f*flare->size * spritew/spriteh, + 4.0f*flare->size * dc::Div(spritew, spriteh), 4.0f*flare->size, (flare->red * aCoronas[i].red)>>8, (flare->green * aCoronas[i].green)>>8, @@ -480,9 +481,9 @@ CCoronas::RenderReflections(void) drawDist = Min(drawDist, 55.0f); if(spriteCoors.z < drawDist){ float fadeDistance = drawDist / 2.0f; - float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance; + float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - Div((spriteCoors.z - fadeDistance), fadeDistance); distanceFade = Clamp(distanceFade, 0.0f, 1.0f); - float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera); + float recipz = dc::Invert(RwCameraGetNearClipPlane(Scene.camera)); float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f; int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads; @@ -606,7 +607,9 @@ CEntity::ProcessLightsForEntity(void) flashTimer1 = 0; flashTimer2 = 0; flashTimer3 = 0; - +#ifdef DC_SH4 + dc:mat_load2(GetMatrix()); +#endif n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects(); for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){ effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i); @@ -614,8 +617,12 @@ CEntity::ProcessLightsForEntity(void) if(effect->type != EFFECT_LIGHT) continue; +#ifndef DC_SH4 pos = GetMatrix() * effect->pos; - +#else + mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z, + pos.x, pos.y, pos.z); +#endif lightOn = false; lightFlickering = false; switch(effect->light.lightType){ diff --git a/vendor/librw/src/base.cpp b/vendor/librw/src/base.cpp index dd4f168b..2e3a22cc 100644 --- a/vendor/librw/src/base.cpp +++ b/vendor/librw/src/base.cpp @@ -91,24 +91,6 @@ strncmp_ci(const char *s1, const char *s2, int n) return 0; } -Quat -mult(const Quat &q, const Quat &p) -{ -#ifndef DC_SH4 - return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, - q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, - q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, - q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); -#else - Quat o; - dc::quat_mult(reinterpret_cast(&o), - reinterpret_cast(q), - reinterpret_cast(p)); - return o; -#endif -} - - Quat* Quat::rotate(const V3d *axis, float32 angle, CombineOp op) { @@ -166,53 +148,39 @@ slerp(const Quat &q, const Quat &p, float32 a) // // V3d // - -V3d -cross(const V3d &a, const V3d &b) -{ - return makeV3d(a.y*b.z - a.z*b.y, - a.z*b.x - a.x*b.z, - a.x*b.y - a.y*b.x); +void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) { + int32 i; + #ifndef DC_SH4 + V3d tmp; + for(i = 0; i < n; i++){ + tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x; + tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y; + tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z; + out[i] = tmp; + } + #else + dc::mat_load2(*m); + for(i = 0; i < n; i++) + mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z, + out[i].x, out[i].y, out[i].z); + #endif } - -void -V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) -{ - int32 i; -#ifndef DC_SH4 - V3d tmp; - for(i = 0; i < n; i++){ - tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x; - tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y; - tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z; - out[i] = tmp; - } -#else - dc::mat_load2(*m); - for(i = 0; i < n; i++) - mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z, - out[i].x, out[i].y, out[i].z); -#endif -} - -void -V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) -{ - int32 i; -#ifndef DC_SH4 - V3d tmp; - for(i = 0; i < n; i++){ - tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x; - tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y; - tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z; - out[i] = tmp; - } -#else - dc::mat_load2(*m); - for(i = 0; i < n; i++) - mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z, - out[i].x, out[i].y, out[i].z); -#endif +void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) { + int32 i; + #ifndef DC_SH4 + V3d tmp; + for(i = 0; i < n; i++){ + tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x; + tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y; + tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z; + out[i] = tmp; + } + #else + dc::mat_load2(*m); + for(i = 0; i < n; i++) + mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z, + out[i].x, out[i].y, out[i].z); + #endif } // @@ -343,9 +311,10 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2) *dst = *src2; else if(src2->flags & IDENTITY) *dst = *src1; - else{ + else { + uint8_t flags = src1->flags & src2->flags; mult_(dst, src1, src2); - dst->flags = src1->flags & src2->flags; + dst->flags = flags; } return dst; } @@ -366,7 +335,8 @@ Matrix::invert(Matrix *dst, const Matrix *src) Matrix* Matrix::transpose(Matrix *dst, const Matrix *src) { - if(src->flags & IDENTITY) +#ifndef DC_SH4 + if(src->flags & IDENTITY) *dst = *src; dst->right.x = src->right.x; dst->up.x = src->right.y; @@ -380,25 +350,31 @@ Matrix::transpose(Matrix *dst, const Matrix *src) dst->pos.x = 0.0; dst->pos.y = 0.0; dst->pos.z = 0.0; +#else + if(src->flags & IDENTITY) + *dst = *src; + else { + dc::mat_load_transpose(*src); + dc::mat_store2(*dst); + } +#endif return dst; } Matrix* Matrix::rotate(const V3d *axis, float32 angle, CombineOp op) { - Matrix tmp, rot; - makeRotation(&rot, axis, angle); + Matrix rot; + makeRotation(&rot, axis, angle); switch(op){ case COMBINEREPLACE: *this = rot; break; case COMBINEPRECONCAT: - mult(&tmp, &rot, this); - *this = tmp; + mult(this, &rot, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &rot); - *this = tmp; + mult(this, this, &rot); break; } return this; @@ -407,27 +383,25 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op) Matrix* Matrix::rotate(const Quat &q, CombineOp op) { - Matrix tmp, rot; - makeRotation(&rot, q); + Matrix rot; + makeRotation(&rot, q); switch(op){ case COMBINEREPLACE: *this = rot; break; case COMBINEPRECONCAT: - mult(&tmp, &rot, this); - *this = tmp; + mult(this, &rot, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &rot); - *this = tmp; + mult(this, this, &rot); break; } return this; } + Matrix* Matrix::translate(const V3d *translation, CombineOp op) { - Matrix tmp; Matrix trans = identMat; trans.pos = *translation; trans.flags &= ~IDENTITY; @@ -436,12 +410,10 @@ Matrix::translate(const V3d *translation, CombineOp op) *this = trans; break; case COMBINEPRECONCAT: - mult(&tmp, &trans, this); - *this = tmp; + mult(this, &trans, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &trans); - *this = tmp; + mult(this, this, &trans); break; } return this; @@ -450,7 +422,6 @@ Matrix::translate(const V3d *translation, CombineOp op) Matrix* Matrix::scale(const V3d *scale, CombineOp op) { - Matrix tmp; Matrix scl = identMat; scl.right.x = scale->x; scl.up.y = scale->y; @@ -461,12 +432,10 @@ Matrix::scale(const V3d *scale, CombineOp op) *this = scl; break; case COMBINEPRECONCAT: - mult(&tmp, &scl, this); - *this = tmp; + mult(this, &scl, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, &scl); - *this = tmp; + mult(this, this, &scl); break; } return this; @@ -475,18 +444,15 @@ Matrix::scale(const V3d *scale, CombineOp op) Matrix* Matrix::transform(const Matrix *mat, CombineOp op) { - Matrix tmp; switch(op){ case COMBINEREPLACE: *this = *mat; break; case COMBINEPRECONCAT: - mult(&tmp, mat, this); - *this = tmp; + mult(this, mat, this); break; case COMBINEPOSTCONCAT: - mult(&tmp, this, mat); - *this = tmp; + mult(this, this, mat); break; } return this; @@ -501,27 +467,31 @@ Matrix::getRotation(void) if(tr > 0.0f){ s = sqrtf(1.0f + tr) * 2.0f; q.w = s / 4.0f; - q.x = (up.z - at.y) / s; - q.y = (at.x - right.z) / s; - q.z = (right.y - up.x) / s; + float invS = dc::Invert(s); + q.x = (up.z - at.y) * invS; + q.y = (at.x - right.z) * invS; + q.z = (right.y - up.x) * invS; }else if(right.x > up.y && right.x > at.z){ s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f; - q.w = (up.z - at.y) / s; - q.x = s / 4.0f; - q.y = (up.x + right.y) / s; - q.z = (at.x + right.z) / s; + q.x = s / 4.0f; + float invS = dc::Invert(s); + q.w = (up.z - at.y) * invS; + q.y = (up.x + right.y) * invS; + q.z = (at.x + right.z) * invS; }else if(up.y > at.z){ s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f; - q.w = (at.x - right.z) / s; - q.x = (up.x + right.y) / s; - q.y = s / 4.0f; - q.z = (at.y + up.z) / s; + q.y = s / 4.0f; + float invS = dc::Invert(s); + q.w = (at.x - right.z) * invS; + q.x = (up.x + right.y) * invS; + q.z = (at.y + up.z) * invS; }else{ s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f; - q.w = (right.y - up.x) / s; - q.x = (at.x + right.z) / s; - q.y = (at.y + up.z) / s; - q.z = s / 4.0f; + q.z = s / 4.0f; + float invS = dc::Invert(s); + q.w = (right.y - up.x) * invS; + q.x = (at.x + right.z) * invS; + q.y = (at.y + up.z) * invS; } return q; } @@ -543,20 +513,7 @@ Matrix::lookAt(const V3d &dir, const V3d &up) void Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2) { - #if !defined(DC_TEXCONV) && !defined(DC_SIM) - dst->right.x = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.x, src2->up.x, src2->at.x, 0); - dst->right.y = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.y, src2->up.y, src2->at.y, 0); - dst->right.z = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.z, src2->up.z, src2->at.z, 0); - dst->up.x = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.x, src2->up.x, src2->at.x, 0); - dst->up.y = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.y, src2->up.y, src2->at.y, 0); - dst->up.z = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.z, src2->up.z, src2->at.z, 0); - dst->at.x = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.x, src2->up.x, src2->at.x, 0); - dst->at.y = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.y, src2->up.y, src2->at.y, 0); - dst->at.z = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.z, src2->up.z, src2->at.z, 0); - dst->pos.x = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.x, src2->up.x, src2->at.x, src2->pos.x); - dst->pos.y = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.y, src2->up.y, src2->at.y, src2->pos.y); - dst->pos.z = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.z, src2->up.z, src2->at.z, src2->pos.z); - #else +#ifndef DC_SH4 dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x; dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y; dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z; @@ -569,12 +526,15 @@ Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const M dst->pos.x = src1->pos.x*src2->right.x + src1->pos.y*src2->up.x + src1->pos.z*src2->at.x + src2->pos.x; dst->pos.y = src1->pos.x*src2->right.y + src1->pos.y*src2->up.y + src1->pos.z*src2->at.y + src2->pos.y; dst->pos.z = src1->pos.x*src2->right.z + src1->pos.y*src2->up.z + src1->pos.z*src2->at.z + src2->pos.z; - #endif +#else + dc::mat_mult(*dst, *src2, *src1); +#endif } void Matrix::invertOrthonormal(Matrix *dst, const Matrix *src) { +#if 1 dst->right.x = src->right.x; dst->right.y = src->up.x; dst->right.z = src->at.x; @@ -593,7 +553,12 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src) dst->pos.z = -(src->pos.x*src->at.x + src->pos.y*src->at.y + src->pos.z*src->at.z); - dst->flags = TYPEORTHONORMAL; +#else + dc::mat_load_transpose(*src); + dc::mat_invert_tranpose(); + dc::mat_store2(*dst); +#endif + dst->flags = TYPEORTHONORMAL; } Matrix* @@ -688,7 +653,11 @@ Matrix::normalError(void) x = dot(right, right) - 1.0f; y = dot(up, up) - 1.0f; z = dot(at, at) - 1.0f; +#ifndef DC_SH4 return x*x + y*y + z*z; +#else + return fipr_magnitude_sqr(x, y, z, 0.0f); +#endif } float32 @@ -698,16 +667,27 @@ Matrix::orthogonalError(void) x = dot(at, up); y = dot(at, right); z = dot(up, right); +#ifndef DC_SH4 return x*x + y*y + z*z; +#else + return fipr_magnitude_sqr(x, y, z, 0.0f); +#endif } float32 Matrix::identityError(void) { - V3d r = { right.x-1.0f, right.y, right.z }; + V3d r = { right.x-1.0f, right.y, right.z }; V3d u = { up.x, up.y-1.0f, up.z }; V3d a = { at.x, at.y, at.z-1.0f }; +#ifndef DC_SH4 return dot(r,r) + dot(u,u) + dot(a,a) + dot(pos,pos); +#else + return fipr_magnitude_sqr(r.x, r.y, r.z, 0.0f) + + fipr_magnitude_sqr(u.x, u.y, u.z, 0.0f) + + fipr_magnitude_sqr(at.x, at.y, at.z, 0.0f) + + fipr_magnitude_sqr(pos.x, pos.y, pos.z, 0.0f); +#endif } void diff --git a/vendor/librw/src/dc/rwdc_common.h b/vendor/librw/src/dc/rwdc_common.h index dc532776..0c40af52 100644 --- a/vendor/librw/src/dc/rwdc_common.h +++ b/vendor/librw/src/dc/rwdc_common.h @@ -246,6 +246,83 @@ inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) { ); } +inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) { + asm volatile( + R"( + frchg + + fmov.s @%[mtx]+, fr0 + + add #32, %[mtx] + pref @%[mtx] + add #-(32 - 4), %[mtx] + + fmov.s @%[mtx]+, fr4 + fmov.s @%[mtx]+, fr8 + fldi0 fr12 + add #4, %[mtx] + + fmov.s @%[mtx]+, fr1 + fmov.s @%[mtx]+, fr5 + fmov.s @%[mtx]+, fr9 + fldi0 fr13 + add #4, %[mtx] + + fmov.s @%[mtx]+, fr2 + fmov.s @%[mtx]+, fr6 + fmov.s @%[mtx]+, fr10 + fldi0 fr14 + + fldi0 fr3 + fldi0 fr7 + fmov fr3, fr11 + fldi1 fr15 + + frchg + )" + : [mtx] "+r" (mtx) + : + : + ); +} + +inline __hot __icache_aligned void mat_invert_tranpose() { + asm volatile( + "frchg\n\t" + "fneg fr12\n\t" + "fneg fr13\n\t" + "fneg fr14\n\t" + "fldi0 fr15\n\t" + "fldi0 fr3\n\t" + "fipr fv12, fv0\n\t" + "fldi0 fr7\n\t" + "fipr fv12, fv4\n\t" + "fldi0 fr11\n\t" + "fipr fv12, fv8\n\t" + + "fmov fr3, fr12\n\t" + "fmov fr7, fr13\n\t" + "fmov fr11, fr14\n\t" + "fmov fr1, fr15\n\t" + "fmov fr4, fr1\n\t" + "fmov fr15, fr4\n\t" + "fmov fr2, fr15\n\t" + "fmov fr8, fr2\n\t" + "fmov fr15, fr2\n\t" + "fmov fr6, fr15\n\t" + "fmov fr9, fr6\n\t" + "fmov fr15, fr9\n\t" + + "fldi0 fr3\n\t" + "fldi0 fr7\n\t" + "fldi0 fr11\n\t" + "fldi1 fr15\n\t" + "frchg\n" + : + : + :); +} + inline __hot __icache_aligned void mat_store2(matrix_t *mtx) { asm volatile( R"( @@ -449,103 +526,6 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src) :); } -//TODO: FIXME FOR VC (AND USE FTRV) -template -__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { - if(FAST_APPROX && !std::is_constant_evaluated()) { - /* - // reorder the coefficients so that q1 stays in constant order {x,y,z,w} - // q2 then needs to be rotated after each inner product - x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); - y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); - z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); - w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); - */ - // keep q1 in fv4 - register float q1x __asm__ ("fr4") = (q1.x); - register float q1y __asm__ ("fr5") = (q1.y); - register float q1z __asm__ ("fr6") = (q1.z); - register float q1w __asm__ ("fr7") = (q1.w); - - // load q2 into fv8, use it to get the shuffled reorder into fv0 - register float q2x __asm__ ("fr8") = (q2.x); - register float q2y __asm__ ("fr9") = (q2.y); - register float q2z __asm__ ("fr10") = (q2.z); - register float q2w __asm__ ("fr11") = (q2.w); - - // temporary operand / result in fv0 - register float t1x __asm__ ("fr0"); - register float t1y __asm__ ("fr1"); - register float t1z __asm__ ("fr2"); - register float t1w __asm__ ("fr3"); - - // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); - t1x = q2w; - t1y = q2z; - t1z = -q2y; - t1w = q2w; - __asm__ ("\n" - " fipr fv4,fv0\n" - : "+f" (t1w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (t1x), "f" (t1y), "f" (t1z) - ); - // x = t1w; try to avoid the stall by not reading the fipr result immediately - - // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); - t1x = -q2z; - t1y = q2w; - t1z = q2x; - __atomic_thread_fence(1); - r->x = t1w; // get previous result - t1w = q2y; - __asm__ ("\n" - " fipr fv4,fv0\n" - : "+f" (t1w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (t1x), "f" (t1y), "f" (t1z) - ); - //y = t1w; - - // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); - t1x = q2y; - t1y = -q2x; - t1z = q2w; - __atomic_thread_fence(1); - r->y = t1w; // get previous result - t1w = q2z; - __asm__ ("\n" - " fipr fv4,fv0\n" - : "+f" (t1w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (t1x), "f" (t1y), "f" (t1z) - ); - //z = t1w; - __atomic_thread_fence(1); - - // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); - q2x = -q2x; - q2y = -q2y; - q2z = -q2z; - __asm__ ("\n" - " fipr fv4,fv8\n" - : "+f" (q2w) - : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), - "f" (q2x), "f" (q2y), "f" (q2z) - ); - - __atomic_thread_fence(1); - r->z = t1w; - __atomic_thread_fence(1); - r->w = q2w; - } else { - r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); - r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); - r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); - r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); - } -} - __hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) { unsigned int prefetch_scratch; @@ -669,6 +649,104 @@ __hot inline void mat_apply_rotate_z(float z) { : "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11"); } + +//TODO: FIXME FOR VC (AND USE FTRV) +template +__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) { + if(FAST_APPROX && !std::is_constant_evaluated()) { + /* + // reorder the coefficients so that q1 stays in constant order {x,y,z,w} + // q2 then needs to be rotated after each inner product + x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + */ + // keep q1 in fv4 + register float q1x __asm__ ("fr4") = (q1.x); + register float q1y __asm__ ("fr5") = (q1.y); + register float q1z __asm__ ("fr6") = (q1.z); + register float q1w __asm__ ("fr7") = (q1.w); + + // load q2 into fv8, use it to get the shuffled reorder into fv0 + register float q2x __asm__ ("fr8") = (q2.x); + register float q2y __asm__ ("fr9") = (q2.y); + register float q2z __asm__ ("fr10") = (q2.z); + register float q2w __asm__ ("fr11") = (q2.w); + + // temporary operand / result in fv0 + register float t1x __asm__ ("fr0"); + register float t1y __asm__ ("fr1"); + register float t1z __asm__ ("fr2"); + register float t1w __asm__ ("fr3"); + + // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x); + t1x = q2w; + t1y = q2z; + t1z = -q2y; + t1w = q2w; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + // x = t1w; try to avoid the stall by not reading the fipr result immediately + + // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y); + t1x = -q2z; + t1y = q2w; + t1z = q2x; + __atomic_thread_fence(1); + r->x = t1w; // get previous result + t1w = q2y; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //y = t1w; + + // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z); + t1x = q2y; + t1y = -q2x; + t1z = q2w; + __atomic_thread_fence(1); + r->y = t1w; // get previous result + t1w = q2z; + __asm__ ("\n" + " fipr fv4,fv0\n" + : "+f" (t1w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (t1x), "f" (t1y), "f" (t1z) + ); + //z = t1w; + __atomic_thread_fence(1); + + // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w); + q2x = -q2x; + q2y = -q2y; + q2z = -q2z; + __asm__ ("\n" + " fipr fv4,fv8\n" + : "+f" (q2w) + : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w), + "f" (q2x), "f" (q2y), "f" (q2z) + ); + + __atomic_thread_fence(1); + r->z = t1w; + __atomic_thread_fence(1); + r->w = q2w; + } else { + r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w); + r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w); + r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w); + r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z); + } +} + # else # ifdef DC_TEXCONV # define mat_apply(a) diff --git a/vendor/librw/src/rwbase.h b/vendor/librw/src/rwbase.h index 4965f2fd..1004e0e7 100644 --- a/vendor/librw/src/rwbase.h +++ b/vendor/librw/src/rwbase.h @@ -238,8 +238,8 @@ inline V2d neg(const V2d &a) { return makeV2d(-a.x, -a.y); } inline V2d add(const V2d &a, const V2d &b) { return makeV2d(a.x+b.x, a.y+b.y); } inline V2d sub(const V2d &a, const V2d &b) { return makeV2d(a.x-b.x, a.y-b.y); } inline V2d scale(const V2d &a, float32 r) { return makeV2d(a.x*r, a.y*r); } -inline float32 length(const V2d &v) { return sqrtf(v.x*v.x + v.y*v.y); } -inline V2d normalize(const V2d &v) { return scale(v, 1.0f/length(v)); } +inline float32 length(const V2d &v) { return dc::Sqrt(v.x*v.x + v.y*v.y); } +inline V2d normalize(const V2d &v) { return scale(v, dc::RecipSqrt(v.x*v.x + v.y*v.y)); } struct V3d { @@ -265,10 +265,22 @@ inline float32 length(const V3d &v) { return len; #endif } -inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); } -inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); } -V3d cross(const V3d &a, const V3d &b); -inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) { +inline V3d normalize(const V3d &v) { + float invLen; +#ifndef DC_SH4 + invLen = 1.0f / length(v); +#else + invLen = dc::RecipSqrt(fipr_magnitude_sqr(v.x, v.y, v.z, 0.0f)); +#endif + return scale(v, invLen); +} +inline V3d setlength(const V3d &v, float32 l) { return scale(v, dc::Div(l, length(v))); } +inline V3d cross(const V3d &a, const V3d &b) { + return makeV3d(a.y*b.z - a.z*b.y, + a.z*b.x - a.x*b.z, + a.x*b.y - a.y*b.x); +} +inline float32 dot(const V3d &a, const V3d &b) { #ifdef DC_SH4 return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f); #else @@ -329,12 +341,33 @@ inline float32 length(const Quat &q) { #ifndef DC_SH4 return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z); #else - return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w)); + return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, 0.0f)); #endif } -inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); } +inline Quat normalize(const Quat &q) { + float invLen; +#ifndef DC_SH4 + invLen = 1.0f / length(q); +#else + invLen = dc::RecipSqrt(fipr_magnitude_sqr(q.x, q.y, q.z, 0.0f)); +#endif + return scale(q, invLen); +} inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); } -Quat mult(const Quat &q, const Quat &p); +inline Quat mult(const Quat &q, const Quat &p) { +#ifndef DC_SH4 + return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z, + q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y, + q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z, + q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x); +#else + Quat o; + dc::quat_mult(reinterpret_cast(&o), + reinterpret_cast(q), + reinterpret_cast(p)); + return o; +#endif +} inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); } Quat lerp(const Quat &q, const Quat &p, float32 r); Quat slerp(const Quat &q, const Quat &p, float32 a);