Accelerated lots of RW math + Coronas (liberty)

- a lot of the RW matrix code is now accelerated
- went through and accelerated Liberty's coronas/reflections
! apparently introduced a bug somewhere along the line that causes boats
  to freak out and do somersaults when trying to drive them. Will resolve
later.
This commit is contained in:
Falco Girgis
2025-04-29 10:03:49 -05:00
parent c3454ac7ec
commit f59f84c133
4 changed files with 343 additions and 245 deletions

View File

@@ -300,9 +300,9 @@ CCoronas::Render(void)
if(aCoronas[i].fadeAlpha && spriteCoors.z < aCoronas[i].drawDist){
float recipz = 1.0f/spriteCoors.z;
float recipz = dc::Invert<true, false>(spriteCoors.z);
float fadeDistance = aCoronas[i].drawDist / 2.0f;
float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - dc::Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
int totalFade = aCoronas[i].fadeAlpha * distanceFade;
if(aCoronas[i].LOScheck)
@@ -313,6 +313,7 @@ CCoronas::Render(void)
// render corona itself
if(aCoronas[i].texture){
float fogscale = CWeather::Foggyness*Min(spriteCoors.z, 40.0f)/40.0f + 1.0f;
float invFogScale = dc::Invert<true, false>(fogscale);
if(CCoronas::aCoronas[i].id == SUN_CORE)
spriteCoors.z = 0.95f * RwCameraGetFarClipPlane(Scene.camera);
RwRenderStateSet(rwRENDERSTATETEXTURERASTER, RwTextureGetRaster(aCoronas[i].texture));
@@ -328,9 +329,9 @@ CCoronas::Render(void)
CSprite::RenderOneXLUSprite(spriteCoors.x, spriteCoors.y, spriteCoors.z,
spritew * aCoronas[i].size * wscale,
spriteh * aCoronas[i].size * fogscale * hscale,
CCoronas::aCoronas[i].red / fogscale,
CCoronas::aCoronas[i].green / fogscale,
CCoronas::aCoronas[i].blue / fogscale,
CCoronas::aCoronas[i].red * invFogScale,
CCoronas::aCoronas[i].green * invFogScale,
CCoronas::aCoronas[i].blue * invFogScale,
totalFade,
recipz,
255);
@@ -339,9 +340,9 @@ CCoronas::Render(void)
spriteCoors.x, spriteCoors.y, spriteCoors.z,
spritew * aCoronas[i].size * fogscale,
spriteh * aCoronas[i].size * fogscale,
CCoronas::aCoronas[i].red / fogscale,
CCoronas::aCoronas[i].green / fogscale,
CCoronas::aCoronas[i].blue / fogscale,
CCoronas::aCoronas[i].red * invFogScale,
CCoronas::aCoronas[i].green * invFogScale,
CCoronas::aCoronas[i].blue * invFogScale,
totalFade,
recipz,
20.0f * recipz,
@@ -365,7 +366,7 @@ CCoronas::Render(void)
(spriteCoors.x - (screenw/2)) * flare->position + (screenw/2),
(spriteCoors.y - (screenh/2)) * flare->position + (screenh/2),
spriteCoors.z,
4.0f*flare->size * spritew/spriteh,
4.0f*flare->size * dc::Div<true, false>(spritew, spriteh),
4.0f*flare->size,
(flare->red * aCoronas[i].red)>>8,
(flare->green * aCoronas[i].green)>>8,
@@ -480,9 +481,9 @@ CCoronas::RenderReflections(void)
drawDist = Min(drawDist, 55.0f);
if(spriteCoors.z < drawDist){
float fadeDistance = drawDist / 2.0f;
float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - (spriteCoors.z - fadeDistance)/fadeDistance;
float distanceFade = spriteCoors.z < fadeDistance ? 1.0f : 1.0f - Div<true, false>((spriteCoors.z - fadeDistance), fadeDistance);
distanceFade = Clamp(distanceFade, 0.0f, 1.0f);
float recipz = 1.0f/RwCameraGetNearClipPlane(Scene.camera);
float recipz = dc::Invert<true, false>(RwCameraGetNearClipPlane(Scene.camera));
float heightFade = (20.0f - aCoronas[i].heightAboveRoad)/20.0f;
int intensity = distanceFade*heightFade * 230.0 * CWeather::WetRoads;
@@ -606,7 +607,9 @@ CEntity::ProcessLightsForEntity(void)
flashTimer1 = 0;
flashTimer2 = 0;
flashTimer3 = 0;
#ifdef DC_SH4
	// Load the entity matrix once, ahead of the 2d-effect loop; the SH4 branch
	// inside the loop transforms effect->pos with mat_trans_single3_nodiv_nomod
	// instead of evaluating GetMatrix() * effect->pos.
	// Fixed: this was written "dc:mat_load2", which parses as a label named
	// `dc` followed by an unqualified call rather than a call into namespace dc.
	// NOTE(review): assumes nothing called inside the loop reloads the matrix
	// registers between iterations -- confirm.
	dc::mat_load2(GetMatrix());
#endif
n = CModelInfo::GetModelInfo(GetModelIndex())->GetNum2dEffects();
for(i = 0; i < n; i++, flashTimer1 += 0x80, flashTimer2 += 0x100, flashTimer3 += 0x200){
effect = CModelInfo::GetModelInfo(GetModelIndex())->Get2dEffect(i);
@@ -614,8 +617,12 @@ CEntity::ProcessLightsForEntity(void)
if(effect->type != EFFECT_LIGHT)
continue;
#ifndef DC_SH4
pos = GetMatrix() * effect->pos;
#else
mat_trans_single3_nodiv_nomod(effect->pos.x, effect->pos.y, effect->pos.z,
pos.x, pos.y, pos.z);
#endif
lightOn = false;
lightFlickering = false;
switch(effect->light.lightType){

View File

@@ -91,24 +91,6 @@ strncmp_ci(const char *s1, const char *s2, int n)
return 0;
}
Quat
mult(const Quat &q, const Quat &p)
{
#ifndef DC_SH4
return makeQuat(q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
#else
Quat o;
dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&o),
reinterpret_cast<const dc::quaternion_t &>(q),
reinterpret_cast<const dc::quaternion_t &>(p));
return o;
#endif
}
Quat*
Quat::rotate(const V3d *axis, float32 angle, CombineOp op)
{
@@ -166,53 +148,39 @@ slerp(const Quat &q, const Quat &p, float32 a)
//
// V3d
//
V3d
cross(const V3d &a, const V3d &b)
{
return makeV3d(a.y*b.z - a.z*b.y,
a.z*b.x - a.x*b.z,
a.x*b.y - a.y*b.x);
// Transform n points from `in` by matrix m (rotation rows plus the pos
// translation) and store them in `out`.
void V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m) {
#ifdef DC_SH4
	// Load the matrix once, then push every point through it.
	dc::mat_load2(*m);
	for(int32 idx = 0; idx < n; idx++)
		mat_trans_single3_nodiv_nomod(in[idx].x, in[idx].y, in[idx].z,
		                              out[idx].x, out[idx].y, out[idx].z);
#else
	for(int32 idx = 0; idx < n; idx++){
		// Finish the whole result before storing it, so `out` may alias `in`.
		const V3d &p = in[idx];
		V3d res;
		res.x = p.x*m->right.x + p.y*m->up.x + p.z*m->at.x + m->pos.x;
		res.y = p.x*m->right.y + p.y*m->up.y + p.z*m->at.y + m->pos.y;
		res.z = p.x*m->right.z + p.y*m->up.z + p.z*m->at.z + m->pos.z;
		out[idx] = res;
	}
#endif
}
void
V3d::transformPoints(V3d *out, const V3d *in, int32 n, const Matrix *m)
{
int32 i;
#ifndef DC_SH4
V3d tmp;
for(i = 0; i < n; i++){
tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x + m->pos.x;
tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y + m->pos.y;
tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z + m->pos.z;
out[i] = tmp;
}
#else
dc::mat_load2(*m);
for(i = 0; i < n; i++)
mat_trans_single3_nodiv_nomod(in[i].x, in[i].y, in[i].z,
out[i].x, out[i].y, out[i].z);
#endif
}
void
V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m)
{
int32 i;
#ifndef DC_SH4
V3d tmp;
for(i = 0; i < n; i++){
tmp.x = in[i].x*m->right.x + in[i].y*m->up.x + in[i].z*m->at.x;
tmp.y = in[i].x*m->right.y + in[i].y*m->up.y + in[i].z*m->at.y;
tmp.z = in[i].x*m->right.z + in[i].y*m->up.z + in[i].z*m->at.z;
out[i] = tmp;
}
#else
dc::mat_load2(*m);
for(i = 0; i < n; i++)
mat_trans_normal3_nomod(in[i].x, in[i].y, in[i].z,
out[i].x, out[i].y, out[i].z);
#endif
// Transform n direction vectors from `in` by the right/up/at rows of m
// (the pos translation is not applied) and store them in `out`.
void V3d::transformVectors(V3d *out, const V3d *in, int32 n, const Matrix *m) {
#ifdef DC_SH4
	// Load the matrix once, then push every vector through it.
	dc::mat_load2(*m);
	for(int32 idx = 0; idx < n; idx++)
		mat_trans_normal3_nomod(in[idx].x, in[idx].y, in[idx].z,
		                        out[idx].x, out[idx].y, out[idx].z);
#else
	for(int32 idx = 0; idx < n; idx++){
		// Finish the whole result before storing it, so `out` may alias `in`.
		const V3d &d = in[idx];
		V3d res;
		res.x = d.x*m->right.x + d.y*m->up.x + d.z*m->at.x;
		res.y = d.x*m->right.y + d.y*m->up.y + d.z*m->at.y;
		res.z = d.x*m->right.z + d.y*m->up.z + d.z*m->at.z;
		out[idx] = res;
	}
#endif
}
//
@@ -343,9 +311,10 @@ Matrix::mult(Matrix *dst, const Matrix *src1, const Matrix *src2)
*dst = *src2;
else if(src2->flags & IDENTITY)
*dst = *src1;
else{
else {
uint8_t flags = src1->flags & src2->flags;
mult_(dst, src1, src2);
dst->flags = src1->flags & src2->flags;
dst->flags = flags;
}
return dst;
}
@@ -366,7 +335,8 @@ Matrix::invert(Matrix *dst, const Matrix *src)
Matrix*
Matrix::transpose(Matrix *dst, const Matrix *src)
{
if(src->flags & IDENTITY)
#ifndef DC_SH4
if(src->flags & IDENTITY)
*dst = *src;
dst->right.x = src->right.x;
dst->up.x = src->right.y;
@@ -380,25 +350,31 @@ Matrix::transpose(Matrix *dst, const Matrix *src)
dst->pos.x = 0.0;
dst->pos.y = 0.0;
dst->pos.z = 0.0;
#else
if(src->flags & IDENTITY)
*dst = *src;
else {
dc::mat_load_transpose(*src);
dc::mat_store2(*dst);
}
#endif
return dst;
}
Matrix*
Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
{
Matrix tmp, rot;
makeRotation(&rot, axis, angle);
Matrix rot;
makeRotation(&rot, axis, angle);
switch(op){
case COMBINEREPLACE:
*this = rot;
break;
case COMBINEPRECONCAT:
mult(&tmp, &rot, this);
*this = tmp;
mult(this, &rot, this);
break;
case COMBINEPOSTCONCAT:
mult(&tmp, this, &rot);
*this = tmp;
mult(this, this, &rot);
break;
}
return this;
@@ -407,27 +383,25 @@ Matrix::rotate(const V3d *axis, float32 angle, CombineOp op)
// Combine this matrix with the rotation described by quaternion q.
//   COMBINEREPLACE    - this = rot
//   COMBINEPRECONCAT  - this = mult(rot, this)
//   COMBINEPOSTCONCAT - this = mult(this, rot)
// Returns this.
Matrix*
Matrix::rotate(const Quat &q, CombineOp op)
{
	// The product goes through a temporary: mult() forwards to mult_(), whose
	// pointers are declared __restrict__ and whose portable branch stores into
	// dst while it is still reading src1/src2. Passing `this` as both the
	// destination and a source therefore corrupts the result.
	Matrix tmp, rot;
	makeRotation(&rot, q);
	switch(op){
	case COMBINEREPLACE:
		*this = rot;
		break;
	case COMBINEPRECONCAT:
		mult(&tmp, &rot, this);
		*this = tmp;
		break;
	case COMBINEPOSTCONCAT:
		mult(&tmp, this, &rot);
		*this = tmp;
		break;
	}
	return this;
}
Matrix*
Matrix::translate(const V3d *translation, CombineOp op)
{
Matrix tmp;
Matrix trans = identMat;
trans.pos = *translation;
trans.flags &= ~IDENTITY;
@@ -436,12 +410,10 @@ Matrix::translate(const V3d *translation, CombineOp op)
*this = trans;
break;
case COMBINEPRECONCAT:
mult(&tmp, &trans, this);
*this = tmp;
mult(this, &trans, this);
break;
case COMBINEPOSTCONCAT:
mult(&tmp, this, &trans);
*this = tmp;
mult(this, this, &trans);
break;
}
return this;
@@ -450,7 +422,6 @@ Matrix::translate(const V3d *translation, CombineOp op)
Matrix*
Matrix::scale(const V3d *scale, CombineOp op)
{
Matrix tmp;
Matrix scl = identMat;
scl.right.x = scale->x;
scl.up.y = scale->y;
@@ -461,12 +432,10 @@ Matrix::scale(const V3d *scale, CombineOp op)
*this = scl;
break;
case COMBINEPRECONCAT:
mult(&tmp, &scl, this);
*this = tmp;
mult(this, &scl, this);
break;
case COMBINEPOSTCONCAT:
mult(&tmp, this, &scl);
*this = tmp;
mult(this, this, &scl);
break;
}
return this;
@@ -475,18 +444,15 @@ Matrix::scale(const V3d *scale, CombineOp op)
Matrix*
Matrix::transform(const Matrix *mat, CombineOp op)
{
Matrix tmp;
switch(op){
case COMBINEREPLACE:
*this = *mat;
break;
case COMBINEPRECONCAT:
mult(&tmp, mat, this);
*this = tmp;
mult(this, mat, this);
break;
case COMBINEPOSTCONCAT:
mult(&tmp, this, mat);
*this = tmp;
mult(this, this, mat);
break;
}
return this;
@@ -501,27 +467,31 @@ Matrix::getRotation(void)
if(tr > 0.0f){
s = sqrtf(1.0f + tr) * 2.0f;
q.w = s / 4.0f;
q.x = (up.z - at.y) / s;
q.y = (at.x - right.z) / s;
q.z = (right.y - up.x) / s;
float invS = dc::Invert<true, false>(s);
q.x = (up.z - at.y) * invS;
q.y = (at.x - right.z) * invS;
q.z = (right.y - up.x) * invS;
}else if(right.x > up.y && right.x > at.z){
s = sqrtf(1.0f + right.x - up.y - at.z) * 2.0f;
q.w = (up.z - at.y) / s;
q.x = s / 4.0f;
q.y = (up.x + right.y) / s;
q.z = (at.x + right.z) / s;
q.x = s / 4.0f;
float invS = dc::Invert<true, false>(s);
q.w = (up.z - at.y) * invS;
q.y = (up.x + right.y) * invS;
q.z = (at.x + right.z) * invS;
}else if(up.y > at.z){
s = sqrtf(1.0f + up.y - right.x - at.z) * 2.0f;
q.w = (at.x - right.z) / s;
q.x = (up.x + right.y) / s;
q.y = s / 4.0f;
q.z = (at.y + up.z) / s;
q.y = s / 4.0f;
float invS = dc::Invert<true, false>(s);
q.w = (at.x - right.z) * invS;
q.x = (up.x + right.y) * invS;
q.z = (at.y + up.z) * invS;
}else{
s = sqrtf(1.0f + at.z - right.x - up.y) * 2.0f;
q.w = (right.y - up.x) / s;
q.x = (at.x + right.z) / s;
q.y = (at.y + up.z) / s;
q.z = s / 4.0f;
q.z = s / 4.0f;
float invS = dc::Invert<true, false>(s);
q.w = (right.y - up.x) * invS;
q.x = (at.x + right.z) * invS;
q.y = (at.y + up.z) * invS;
}
return q;
}
@@ -543,20 +513,7 @@ Matrix::lookAt(const V3d &dir, const V3d &up)
void
Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const Matrix *__restrict__ src2)
{
#if !defined(DC_TEXCONV) && !defined(DC_SIM)
dst->right.x = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.x, src2->up.x, src2->at.x, 0);
dst->right.y = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.y, src2->up.y, src2->at.y, 0);
dst->right.z = fipr(src1->right.x, src1->right.y, src1->right.z, 0, src2->right.z, src2->up.z, src2->at.z, 0);
dst->up.x = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.x, src2->up.x, src2->at.x, 0);
dst->up.y = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.y, src2->up.y, src2->at.y, 0);
dst->up.z = fipr(src1->up.x, src1->up.y, src1->up.z, 0, src2->right.z, src2->up.z, src2->at.z, 0);
dst->at.x = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.x, src2->up.x, src2->at.x, 0);
dst->at.y = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.y, src2->up.y, src2->at.y, 0);
dst->at.z = fipr(src1->at.x, src1->at.y, src1->at.z, 0, src2->right.z, src2->up.z, src2->at.z, 0);
dst->pos.x = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.x, src2->up.x, src2->at.x, src2->pos.x);
dst->pos.y = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.y, src2->up.y, src2->at.y, src2->pos.y);
dst->pos.z = fipr(src1->pos.x, src1->pos.y, src1->pos.z, 1, src2->right.z, src2->up.z, src2->at.z, src2->pos.z);
#else
#ifndef DC_SH4
dst->right.x = src1->right.x*src2->right.x + src1->right.y*src2->up.x + src1->right.z*src2->at.x;
dst->right.y = src1->right.x*src2->right.y + src1->right.y*src2->up.y + src1->right.z*src2->at.y;
dst->right.z = src1->right.x*src2->right.z + src1->right.y*src2->up.z + src1->right.z*src2->at.z;
@@ -569,12 +526,15 @@ Matrix::mult_(Matrix *__restrict__ dst, const Matrix *__restrict__ src1, const M
dst->pos.x = src1->pos.x*src2->right.x + src1->pos.y*src2->up.x + src1->pos.z*src2->at.x + src2->pos.x;
dst->pos.y = src1->pos.x*src2->right.y + src1->pos.y*src2->up.y + src1->pos.z*src2->at.y + src2->pos.y;
dst->pos.z = src1->pos.x*src2->right.z + src1->pos.y*src2->up.z + src1->pos.z*src2->at.z + src2->pos.z;
#endif
#else
dc::mat_mult(*dst, *src2, *src1);
#endif
}
void
Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
{
#if 1
dst->right.x = src->right.x;
dst->right.y = src->up.x;
dst->right.z = src->at.x;
@@ -593,7 +553,12 @@ Matrix::invertOrthonormal(Matrix *dst, const Matrix *src)
dst->pos.z = -(src->pos.x*src->at.x +
src->pos.y*src->at.y +
src->pos.z*src->at.z);
dst->flags = TYPEORTHONORMAL;
#else
dc::mat_load_transpose(*src);
dc::mat_invert_tranpose();
dc::mat_store2(*dst);
#endif
dst->flags = TYPEORTHONORMAL;
}
Matrix*
@@ -688,7 +653,11 @@ Matrix::normalError(void)
x = dot(right, right) - 1.0f;
y = dot(up, up) - 1.0f;
z = dot(at, at) - 1.0f;
#ifndef DC_SH4
return x*x + y*y + z*z;
#else
return fipr_magnitude_sqr(x, y, z, 0.0f);
#endif
}
float32
@@ -698,16 +667,27 @@ Matrix::orthogonalError(void)
x = dot(at, up);
y = dot(at, right);
z = dot(up, right);
#ifndef DC_SH4
return x*x + y*y + z*z;
#else
return fipr_magnitude_sqr(x, y, z, 0.0f);
#endif
}
// Squared deviation of this matrix from identity: the sum of the squared
// lengths of (right - X axis), (up - Y axis), (at - Z axis) and pos.
float32
Matrix::identityError(void)
{
	V3d r = { right.x-1.0f, right.y, right.z };
	V3d u = { up.x, up.y-1.0f, up.z };
	V3d a = { at.x, at.y, at.z-1.0f };
#ifndef DC_SH4
	return dot(r,r) + dot(u,u) + dot(a,a) + dot(pos,pos);
#else
	// Fixed: the third term used the member `at` instead of the local `a`,
	// so it measured |at|^2 rather than |at - (0,0,1)|^2 and disagreed with
	// the portable branch above.
	return fipr_magnitude_sqr(r.x, r.y, r.z, 0.0f) +
	       fipr_magnitude_sqr(u.x, u.y, u.z, 0.0f) +
	       fipr_magnitude_sqr(a.x, a.y, a.z, 0.0f) +
	       fipr_magnitude_sqr(pos.x, pos.y, pos.z, 0.0f);
#endif
}
void

View File

@@ -246,6 +246,83 @@ inline __hot __icache_aligned void mat_load_transpose(const matrix_t *mtx) {
);
}
// Load the upper-left 3x3 of *mtx, transposed, into the floating-point
// register bank that FRCHG swaps in. The remaining elements are set to the
// identity pattern (fr3, fr7, fr11, fr12, fr13, fr14 = 0 and fr15 = 1).
//
// Each source row is read with three post-incrementing loads followed by a
// 4-byte skip over the row's fourth element; the three values of a row are
// scattered to registers four apart (fr0/fr4/fr8, fr1/fr5/fr9, fr2/fr6/fr10),
// which produces the transpose.
// NOTE(review): this assumes matrix_t is 16 contiguous floats in 4-float
// rows -- confirm against the matrix_t definition.
//
// Fixed: after the first post-incrementing load and the "add #32" used to
// form the prefetch address, the pointer was rewound by only 28 bytes. That
// left it on element 2, so element 1 was skipped and every later load was
// shifted by one float (row 0 read elements 0,2,3; rows 1 and 2 read
// elements 1,2,3). Rewinding by the full 32 bytes puts the pointer on
// element 1.
inline __hot __icache_aligned void mat_load_3x3_transpose(const matrix_t *mtx) {
    asm volatile(
        R"(
            frchg

            fmov.s  @%[mtx]+, fr0

            add     #32, %[mtx]
            pref    @%[mtx]
            add     #-32, %[mtx]

            fmov.s  @%[mtx]+, fr4
            fmov.s  @%[mtx]+, fr8
            fldi0   fr12
            add     #4, %[mtx]

            fmov.s  @%[mtx]+, fr1
            fmov.s  @%[mtx]+, fr5
            fmov.s  @%[mtx]+, fr9
            fldi0   fr13
            add     #4, %[mtx]

            fmov.s  @%[mtx]+, fr2
            fmov.s  @%[mtx]+, fr6
            fmov.s  @%[mtx]+, fr10
            fldi0   fr14

            fldi0   fr3
            fldi0   fr7
            fmov    fr3, fr11
            fldi1   fr15

            frchg
        )"
        : [mtx] "+r" (mtx)
        :
        :
    );
}
// Rewrites, in place, the 4x4 matrix held in the floating-point register bank
// that FRCHG swaps in. Takes no arguments and touches no memory; it is meant
// to run between a mat_load* call and mat_store2.
//
// Steps:
//   1. fv12 (fr12..fr15) becomes (-fr12, -fr13, -fr14, 0).
//   2. For each of fv0, fv4 and fv8 the fourth element is zeroed and FIPR
//      leaves dot(fv12, fvN) in that fourth element (fr3, fr7, fr11).
//   3. Those three dot products are moved into fr12..fr14.
//   4. The 3x3 part is transposed by swapping fr1/fr4, fr2/fr8 and fr6/fr9,
//      using fr15 as scratch.
//   5. fr3, fr7, fr11 are reset to 0 and fr15 to 1.
//
// Fixed: the fr2/fr8 swap ended with "fmov fr15, fr2", which wrote the saved
// value straight back into fr2 and left fr8 untouched, so that pair was never
// swapped. It now ends with "fmov fr15, fr8" like the other two swaps.
inline __hot __icache_aligned void mat_invert_tranpose() {
    asm volatile(
        "frchg\n\t"

        // fv12 = (-fr12, -fr13, -fr14, 0)
        "fneg   fr12\n\t"
        "fneg   fr13\n\t"
        "fneg   fr14\n\t"
        "fldi0  fr15\n\t"

        // dot(fv12, fvN) -> fr3 / fr7 / fr11
        "fldi0  fr3\n\t"
        "fipr   fv12, fv0\n\t"
        "fldi0  fr7\n\t"
        "fipr   fv12, fv4\n\t"
        "fldi0  fr11\n\t"
        "fipr   fv12, fv8\n\t"

        "fmov   fr3, fr12\n\t"
        "fmov   fr7, fr13\n\t"
        "fmov   fr11, fr14\n\t"

        // swap fr1 <-> fr4
        "fmov   fr1, fr15\n\t"
        "fmov   fr4, fr1\n\t"
        "fmov   fr15, fr4\n\t"

        // swap fr2 <-> fr8
        "fmov   fr2, fr15\n\t"
        "fmov   fr8, fr2\n\t"
        "fmov   fr15, fr8\n\t"

        // swap fr6 <-> fr9
        "fmov   fr6, fr15\n\t"
        "fmov   fr9, fr6\n\t"
        "fmov   fr15, fr9\n\t"

        // restore the constant elements
        "fldi0  fr3\n\t"
        "fldi0  fr7\n\t"
        "fldi0  fr11\n\t"
        "fldi1  fr15\n\t"

        "frchg\n"
        :
        :
        :);
}
inline __hot __icache_aligned void mat_store2(matrix_t *mtx) {
asm volatile(
R"(
@@ -449,103 +526,6 @@ __hot __icache_aligned inline void mat_copy(matrix_t *dst, const matrix_t *src)
:);
}
//TODO: FIXME FOR VC (AND USE FTRV)
template<bool FAST_APPROX=false>
__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
if(FAST_APPROX && !std::is_constant_evaluated()) {
/*
// reorder the coefficients so that q1 stays in constant order {x,y,z,w}
// q2 then needs to be rotated after each inner product
x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
*/
// keep q1 in fv4
register float q1x __asm__ ("fr4") = (q1.x);
register float q1y __asm__ ("fr5") = (q1.y);
register float q1z __asm__ ("fr6") = (q1.z);
register float q1w __asm__ ("fr7") = (q1.w);
// load q2 into fv8, use it to get the shuffled reorder into fv0
register float q2x __asm__ ("fr8") = (q2.x);
register float q2y __asm__ ("fr9") = (q2.y);
register float q2z __asm__ ("fr10") = (q2.z);
register float q2w __asm__ ("fr11") = (q2.w);
// temporary operand / result in fv0
register float t1x __asm__ ("fr0");
register float t1y __asm__ ("fr1");
register float t1z __asm__ ("fr2");
register float t1w __asm__ ("fr3");
// x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
t1x = q2w;
t1y = q2z;
t1z = -q2y;
t1w = q2w;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
// x = t1w; try to avoid the stall by not reading the fipr result immediately
// y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
t1x = -q2z;
t1y = q2w;
t1z = q2x;
__atomic_thread_fence(1);
r->x = t1w; // get previous result
t1w = q2y;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
//y = t1w;
// z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
t1x = q2y;
t1y = -q2x;
t1z = q2w;
__atomic_thread_fence(1);
r->y = t1w; // get previous result
t1w = q2z;
__asm__ ("\n"
" fipr fv4,fv0\n"
: "+f" (t1w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (t1x), "f" (t1y), "f" (t1z)
);
//z = t1w;
__atomic_thread_fence(1);
// w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
q2x = -q2x;
q2y = -q2y;
q2z = -q2z;
__asm__ ("\n"
" fipr fv4,fv8\n"
: "+f" (q2w)
: "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
"f" (q2x), "f" (q2y), "f" (q2z)
);
__atomic_thread_fence(1);
r->z = t1w;
__atomic_thread_fence(1);
r->w = q2w;
} else {
r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
}
}
__hot inline void mat_load_apply(const matrix_t* matrix1, const matrix_t* matrix2) {
unsigned int prefetch_scratch;
@@ -669,6 +649,104 @@ __hot inline void mat_apply_rotate_z(float z) {
: "fpul", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11");
}
//TODO: FIXME FOR VC (AND USE FTRV)
// Quaternion product: *r = q1 * q2.
//
// FAST_APPROX = false (the default), or any constant-evaluated call, uses the
// plain scalar expressions in the else branch. FAST_APPROX = true evaluates
// each output component with one FIPR inner product: q1 stays in fv4
// (fr4..fr7) and a re-ordered / sign-flipped copy of q2 is placed in fv0
// (or fv8 for w). FIPR leaves its result in the last register of the second
// operand (fr3 or fr11).
//
// Every result is stored only after the operands of the next product have
// been set up, to avoid stalling on the FIPR result.
//
// Fixed: the operand set-up for x loaded q2.w into t1w, so the last term of
// x was q1.w*q2.w instead of q1.w*q2.x as required by the formula below and
// by the scalar branch.
template<bool FAST_APPROX=false>
__hot constexpr inline void quat_mult(quaternion_t *r, const quaternion_t &q1, const quaternion_t &q2) {
    if(FAST_APPROX && !std::is_constant_evaluated()) {
        /*
            // reorder the coefficients so that q1 stays in constant order {x,y,z,w}
            // q2 then needs to be rotated after each inner product
            x =  (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
            y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
            z =  (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
            w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
        */
        // keep q1 in fv4
        register float q1x __asm__ ("fr4") = (q1.x);
        register float q1y __asm__ ("fr5") = (q1.y);
        register float q1z __asm__ ("fr6") = (q1.z);
        register float q1w __asm__ ("fr7") = (q1.w);
        // load q2 into fv8, use it to get the shuffled reorder into fv0
        register float q2x __asm__ ("fr8") = (q2.x);
        register float q2y __asm__ ("fr9") = (q2.y);
        register float q2z __asm__ ("fr10") = (q2.z);
        register float q2w __asm__ ("fr11") = (q2.w);
        // temporary operand / result in fv0
        register float t1x __asm__ ("fr0");
        register float t1y __asm__ ("fr1");
        register float t1z __asm__ ("fr2");
        register float t1w __asm__ ("fr3");
        // x = (q1.x * q2.w) + (q1.y * q2.z) - (q1.z * q2.y) + (q1.w * q2.x);
        t1x = q2w;
        t1y = q2z;
        t1z = -q2y;
        t1w = q2x;
        __asm__ ("\n"
            " fipr fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        // x = t1w; try to avoid the stall by not reading the fipr result immediately
        // y = -(q1.x * q2.z) + (q1.y * q2.w) + (q1.z * q2.x) + (q1.w * q2.y);
        t1x = -q2z;
        t1y = q2w;
        t1z = q2x;
        __atomic_thread_fence(1);
        r->x = t1w; // get previous result
        t1w = q2y;
        __asm__ ("\n"
            " fipr fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        //y = t1w;
        // z = (q1.x * q2.y) - (q1.y * q2.x) + (q1.z * q2.w) + (q1.w * q2.z);
        t1x = q2y;
        t1y = -q2x;
        t1z = q2w;
        __atomic_thread_fence(1);
        r->y = t1w; // get previous result
        t1w = q2z;
        __asm__ ("\n"
            " fipr fv4,fv0\n"
            : "+f" (t1w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (t1x), "f" (t1y), "f" (t1z)
        );
        //z = t1w;
        __atomic_thread_fence(1);
        // w = -(q1.x * q2.x) - (q1.y * q2.y) - (q1.z * q2.z) + (q1.w * q2.w);
        q2x = -q2x;
        q2y = -q2y;
        q2z = -q2z;
        __asm__ ("\n"
            " fipr fv4,fv8\n"
            : "+f" (q2w)
            : "f" (q1x), "f" (q1y), "f" (q1z), "f" (q1w),
              "f" (q2x), "f" (q2y), "f" (q2z)
        );
        __atomic_thread_fence(1);
        r->z = t1w;
        __atomic_thread_fence(1);
        r->w = q2w;
    } else {
        r->x = (q2.z * q1.y) - (q1.z * q2.y) + (q1.x * q2.w) + (q2.x * q1.w);
        r->y = (q2.x * q1.z) - (q1.x * q2.z) + (q1.y * q2.w) + (q2.y * q1.w);
        r->z = (q2.y * q1.x) - (q1.y * q2.x) + (q1.z * q2.w) + (q2.z * q1.w);
        r->w = (q2.w * q1.w) - (q2.x * q1.x) - (q2.y * q1.y) - (q2.z * q1.z);
    }
}
# else
# ifdef DC_TEXCONV
# define mat_apply(a)

View File

@@ -238,8 +238,8 @@ inline V2d neg(const V2d &a) { return makeV2d(-a.x, -a.y); }
// Component-wise V2d helpers; each returns a new vector built with makeV2d.
// add: a + b
inline V2d add(const V2d &a, const V2d &b) { return makeV2d(a.x+b.x, a.y+b.y); }
// sub: a - b
inline V2d sub(const V2d &a, const V2d &b) { return makeV2d(a.x-b.x, a.y-b.y); }
// scale: both components of a multiplied by r
inline V2d scale(const V2d &a, float32 r) { return makeV2d(a.x*r, a.y*r); }
inline float32 length(const V2d &v) { return sqrtf(v.x*v.x + v.y*v.y); }
inline V2d normalize(const V2d &v) { return scale(v, 1.0f/length(v)); }
// Euclidean length of a 2D vector, computed with dc::Sqrt.
inline float32 length(const V2d &v) {
	const float32 lenSq = v.x*v.x + v.y*v.y;
	return dc::Sqrt(lenSq);
}
// Scale v by the reciprocal square root of its squared length.
inline V2d normalize(const V2d &v) {
	const float32 lenSq = v.x*v.x + v.y*v.y;
	return scale(v, dc::RecipSqrt(lenSq));
}
struct V3d
{
@@ -265,10 +265,22 @@ inline float32 length(const V3d &v) {
return len;
#endif
}
inline V3d normalize(const V3d &v) { return scale(v, 1.0f/length(v)); }
inline V3d setlength(const V3d &v, float32 l) { return scale(v, l/length(v)); }
V3d cross(const V3d &a, const V3d &b);
inline __attribute__((always_inline)) float32 dot(const V3d &a, const V3d &b) {
// Scale v by the reciprocal of its length. The SH4 build takes the reciprocal
// square root of the squared magnitude directly; other builds divide by
// length(v).
inline V3d normalize(const V3d &v) {
#ifdef DC_SH4
	const float invLen = dc::RecipSqrt(fipr_magnitude_sqr(v.x, v.y, v.z, 0.0f));
#else
	const float invLen = 1.0f / length(v);
#endif
	return scale(v, invLen);
}
// Rescale v by the factor l / length(v), with the quotient computed by dc::Div.
inline V3d setlength(const V3d &v, float32 l) {
	const float32 factor = dc::Div<true, false>(l, length(v));
	return scale(v, factor);
}
// Cross product a x b.
inline V3d cross(const V3d &a, const V3d &b) {
	const float32 cx = a.y*b.z - a.z*b.y;
	const float32 cy = a.z*b.x - a.x*b.z;
	const float32 cz = a.x*b.y - a.y*b.x;
	return makeV3d(cx, cy, cz);
}
inline float32 dot(const V3d &a, const V3d &b) {
#ifdef DC_SH4
return fipr(a.x, a.y, a.z, 0.0f, b.x, b.y, b.z, 0.0f);
#else
@@ -329,12 +341,33 @@ inline float32 length(const Quat &q) {
#ifndef DC_SH4
	return sqrtf(q.w*q.w + q.x*q.x + q.y*q.y + q.z*q.z);
#else
	// Fixed: q.w must be passed as the fourth component. With 0.0f in its
	// place this branch returned the length of the vector part only and
	// disagreed with the portable branch above.
	return dc::Sqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
#endif
}
inline Quat normalize(const Quat &q) { return scale(q, 1.0f/length(q)); }
// Return q scaled to unit length.
// Fixed: the SH4 branch built the squared magnitude from x, y and z only
// (0.0f was passed where q.w belongs). The scale factor was therefore
// 1/|xyz| instead of 1/|q|, so the result was not a unit quaternion whenever
// w != 0. All four components are now included, matching the portable branch,
// which divides by length(q).
inline Quat normalize(const Quat &q) {
	float invLen;
#ifndef DC_SH4
	invLen = 1.0f / length(q);
#else
	invLen = dc::RecipSqrt(fipr_magnitude_sqr(q.x, q.y, q.z, q.w));
#endif
	return scale(q, invLen);
}
// Conjugate of q: w is kept, x, y and z are negated.
inline Quat conj(const Quat &q) { return makeQuat(q.w, -q.x, -q.y, -q.z); }
Quat mult(const Quat &q, const Quat &p);
// Quaternion product q * p.
// The SH4 build forwards to dc::quat_mult; other builds evaluate the four
// components directly.
inline Quat mult(const Quat &q, const Quat &p) {
#ifdef DC_SH4
	// NOTE(review): the casts assume Quat and dc::quaternion_t share the same
	// memory layout -- confirm.
	Quat product;
	dc::quat_mult(reinterpret_cast<dc::quaternion_t *>(&product),
	              reinterpret_cast<const dc::quaternion_t &>(q),
	              reinterpret_cast<const dc::quaternion_t &>(p));
	return product;
#else
	return makeQuat(
		q.w*p.w - q.x*p.x - q.y*p.y - q.z*p.z,
		q.w*p.x + q.x*p.w + q.y*p.z - q.z*p.y,
		q.w*p.y + q.y*p.w + q.z*p.x - q.x*p.z,
		q.w*p.z + q.z*p.w + q.x*p.y - q.y*p.x);
#endif
}
// Rotate v by q: builds the quaternion (0, v), evaluates q * (0, v) * conj(q)
// and returns the vector part of the result.
inline V3d rotate(const V3d &v, const Quat &q) { return mult(mult(q, makeQuat(0.0f, v)), conj(q)).vec(); }
Quat lerp(const Quat &q, const Quat &p, float32 r);
Quat slerp(const Quat &q, const Quat &p, float32 a);