1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-08-11 07:34:33 +02:00

#370 3DO: use ARM asm implementation to transform and project vertices (same speed, but saves 4k of RAM)

This commit is contained in:
XProger
2021-11-18 12:02:41 +03:00
parent 5467a82bf9
commit 7761a5db06
7 changed files with 200 additions and 256 deletions

View File

@@ -945,13 +945,6 @@ void matrixSetIdentity()
m.e21 = 0;
m.e22 = 0x4000;
m.e23 = 0;
#ifdef __3DO__
m.e30 = 0;
m.e31 = 0;
m.e32 = 0;
m.e33 = 0x4000;
#endif
}
void matrixSetView(const vec3i &pos, int32 angleX, int32 angleY)
@@ -978,13 +971,6 @@ void matrixSetView(const vec3i &pos, int32 angleX, int32 angleY)
m.e22 = (cx * cy) >> FIXED_SHIFT;
m.e23 = 0;
#ifdef __3DO__
m.e30 = 0;
m.e31 = 0;
m.e32 = 0;
m.e33 = 0x4000;
#endif
cameraViewPos = pos;
cameraViewOffset = _vec3i(0, 0, 0);
}

View File

@@ -124,7 +124,7 @@
#define IWRAM_MATRIX_LERP
// the maximum of active enemies
// #define MAX_ENEMIES 3
#define VIS_DIST (1024 * 10)
#define VIEW_DIST (1024 * 10)
#endif
#ifdef __3DO__
@@ -140,7 +140,7 @@
#define MAX_ENEMIES 3
// set the maximum number of simultaneously played channels
#define SND_CHANNELS 4
#define VIS_DIST (1024 * 10)
#define VIEW_DIST (1024 * 10)
#endif
#ifndef NAV_STEPS
@@ -428,16 +428,16 @@ extern int32 fps;
#define MAX_DYN_SECTORS (1024*3)
#define MAX_SAMPLES 180
#ifndef VIS_DIST
#define VIS_DIST (1024 * 16)
#ifndef VIEW_DIST
#define VIEW_DIST (1024 * 16)
#endif
#define FOV_SHIFT 3
#define FOG_SHIFT 1
#define FOG_MAX VIS_DIST
#define FOG_MAX VIEW_DIST
#define FOG_MIN (FOG_MAX - (8192 >> FOG_SHIFT))
#define VIEW_MIN_F (256 << FIXED_SHIFT)
#define VIEW_MAX_F (FOG_MAX << FIXED_SHIFT)
#define VIEW_MAX_F (VIEW_DIST << FIXED_SHIFT)
#define FRUSTUM_FAR_X (5 << 10)
#define FRUSTUM_FAR_Y (3 << 10)
@@ -490,7 +490,7 @@ extern int32 fps;
#define DP33(ax,ay,az,bx,by,bz) (ax * bx + ay * by + az * bz)
#ifdef USE_DIV_TABLE
#define DIV_TABLE_SIZE 1024
#define DIV_TABLE_SIZE 1025 // to compare with #1024 without extra LDR
#define FixedInvS(x) ((x < 0) ? -divTable[abs(x)] : divTable[x])
#define FixedInvU(x) divTable[x]
extern divTableInt divTable[DIV_TABLE_SIZE];
@@ -588,23 +588,11 @@ struct vec4i {
}
};
#ifdef __3DO__
#define F16_SHIFT (16 - FIXED_SHIFT) // for fix14<->fix16 conversion
#define DOT_SHIFT (FIXED_SHIFT + FIXED_SHIFT - 16 - F16_SHIFT)
#endif
// Transformation matrix made of 32-bit integer elements (eRC = row R, column C).
// The declaration order below is the in-memory order of the elements.
// NOTE(review): assembly routines presumably read this struct by address in
// groups of consecutive elements, so confirm before reordering any field.
struct Matrix
{
#ifdef __3DO__
// 3DO build: four rows (e0x..e3x), declared one column per line (transposed order)
int32 e00, e10, e20, e30;
int32 e01, e11, e21, e31;
int32 e02, e12, e22, e32;
int32 e03, e13, e23, e33;
#else
// other builds: three rows of four elements, declared one row per line
int32 e00, e01, e02, e03;
int32 e10, e11, e12, e13;
int32 e20, e21, e22, e23;
#endif
};
struct RoomQuad

View File

@@ -11,11 +11,13 @@ pdiv RN r2
m0 RN r3
m1 RN r4
m2 RN r5
n0 RN r6
n1 RN r12
n2 RN lr
tmp RN r4
m RN r7
m3 RN r6
n0 RN r7
n1 RN r8
n2 RN r9
n3 RN r12
m RN lr
tmp RN m0
MACRO
next_row
@@ -25,13 +27,13 @@ m RN r7
MACRO
load
ldmia m, {m0, m1, m2}
ldmia n, {n0, n1, n2}
ldmia m, {m0, m1, m2, m3}
ldmia n, {n0, n1, n2, n3}
MEND
MACRO
store
stmia m, {m0, m1, m2}
stmia m, {m0, m1, m2, m3}
MEND
MACRO ; a = (a + b) / 2
@@ -40,9 +42,11 @@ m RN r7
add m0, m0, n0
add m1, m1, n1
add m2, m2, n2
add m3, m3, n3
mov m0, m0, asr #1
mov m1, m1, asr #1
mov m2, m2, asr #1
mov m3, m3, asr #1
store
MEND
@@ -52,9 +56,11 @@ m RN r7
sub n0, n0, m0
sub n1, n1, m1
sub n2, n2, m2
sub n3, n3, m3
add m0, m0, n0, asr #2
add m1, m1, n1, asr #2
add m2, m2, n2, asr #2
add m3, m3, n3, asr #2
store
MEND
@@ -64,9 +70,11 @@ m RN r7
sub m0, n0, m0
sub m1, n1, m1
sub m2, n2, m2
sub m3, n3, m3
sub m0, n0, m0, asr #2
sub m1, n1, m1, asr #2
sub m2, n2, m2, asr #2
sub m3, n3, m3, asr #2
store
MEND
@@ -76,29 +84,30 @@ m RN r7
sub n0, n0, m0
sub n1, n1, m1
sub n2, n2, m2
sub n3, n3, m3
mul n0, pmul, n0
mul n1, pmul, n1
mul n2, pmul, n2
mul n3, pmul, n3
add m0, m0, n0, asr #8
add m1, m1, n1, asr #8
add m2, m2, n2, asr #8
add m3, m3, n3, asr #8
store
MEND
MACRO ; transposed (3x4)
MACRO
lerp $func
$func ; e00, e10, e20
$func ; e00, e01, e02, e03
next_row
$func ; e01, e11, e21
$func ; e10, e11, e12, e13
next_row
$func ; e02, e12, e22
next_row
$func ; e03, e13, e23
$func ; e20, e21, e22, e23
b done
MEND
matrixLerp_asm
stmfd sp!, {r4-r7, lr}
stmfd sp!, {r4-r9, lr}
ldr m, =matrixPtr
ldr m, [m]
@@ -128,5 +137,5 @@ m1_d4
m3_d4
lerp _3_4
done ldmfd sp!, {r4-r7, pc}
done ldmfd sp!, {r4-r9, pc}
END

View File

@@ -4,7 +4,7 @@
struct Vertex
{
int32 x, y, z, w; // for rooms z = (depth << CLIP_SHIFT) | ClipFlags
int32 x, y, z; // for rooms z = (depth << CLIP_SHIFT) | ClipFlags
};
uint16* gPalette;
@@ -174,30 +174,126 @@ enum ClipFlags {
CLIP_NEAR = 1 << 5
};
X_INLINE int32 classify(int32 x, int32 y, int32 x0, int32 y0, int32 x1, int32 y1)
#define USE_ASM
#ifdef USE_ASM
#define unpackRoom unpackRoom_asm
#define unpackMesh unpackMesh_asm
#define projectVertices projectVertices_asm
extern "C" void unpackRoom_asm(const RoomVertex* vertices, int32 vCount);
extern "C" void unpackMesh_asm(const MeshVertex* vertices, int32 vCount);
extern "C" void projectVertices_asm(int32 vCount);
#else
#define unpackRoom unpackRoom_c
#define unpackMesh unpackMesh_c
#define projectVertices projectVertices_c
void unpackRoom_c(const RoomVertex* vertices, int32 vCount)
{
return (x < x0 ? CLIP_LEFT : 0) |
(x > x1 ? CLIP_RIGHT : 0) |
(y < y0 ? CLIP_TOP : 0) |
(y > y1 ? CLIP_BOTTOM : 0);
Vertex* res = gVertices;
uint32 *v32 = (uint32*)vertices;
for (int32 i = 0; i < vCount; i += 4)
{
uint32 n0 = *v32++;
uint32 n1 = *v32++;
res->x = (n0 << 10) & 0x7C00;
res->y = (n0 << 3) & 0x3F00;
res->z = (n0 >> 1) & 0x7C00;
res++;
res->x = (n0 >> 6) & 0x7C00;
res->y = (n0 >> 13) & 0x3F00;
res->z = (n0 >> 17) & 0x7C00;
res++;
res->x = (n1 << 10) & 0x7C00;
res->y = (n1 << 3) & 0x3F00;
res->z = (n1 >> 1) & 0x7C00;
res++;
res->x = (n1 >> 6) & 0x7C00;
res->y = (n1 >> 13) & 0x3F00;
res->z = (n1 >> 17) & 0x7C00;
res++;
}
}
X_INLINE void transformVertices(Vertex* points, int32 count)
void unpackMesh_c(const MeshVertex* vertices, int32 vCount)
{
uint32 *v32 = (uint32*)vertices;
Vertex* res = gVertices;
for (int32 i = 0; i < vCount; i += 2)
{
uint32 n0 = *v32++;
uint32 n1 = *v32++;
uint32 n2 = *v32++;
res->x = int16(n0 >> 16);
res->y = int16(n0);
res->z = int16(n1 >> 16);
res++;
res->x = int16(n1);
res->y = int16(n2 >> 16);
res->z = int16(n2);
res++;
}
}
void projectVertices_c(int32 vCount)
{
Matrix& m = matrixGet();
int32 mx = m.e03;
int32 my = m.e13;
int32 mz = m.e23;
m.e03 >>= FIXED_SHIFT;
m.e13 >>= FIXED_SHIFT;
m.e23 >>= FIXED_SHIFT;
MulManyVec4Mat44_F16((vec4f16*)points, (vec4f16*)points, *(mat44f16*)&matrixGet(), count);
Vertex* v = gVertices;
m.e03 = mx;
m.e13 = my;
m.e23 = mz;
int32 minX = viewportRel.x0;
int32 minY = viewportRel.y0;
int32 maxX = viewportRel.x1;
int32 maxY = viewportRel.y1;
for (int32 i = 0; i < vCount; i++)
{
int32 vx = v->x;
int32 vy = v->y;
int32 vz = v->z;
int32 x = DP43(m.e00, m.e01, m.e02, m.e03, vx, vy, vz);
int32 y = DP43(m.e10, m.e11, m.e12, m.e13, vx, vy, vz);
int32 z = DP43(m.e20, m.e21, m.e22, m.e23, vx, vy, vz);
int32 clip = 0;
if (z < VIEW_MIN_F) {
z = VIEW_MIN_F;
clip = CLIP_NEAR;
} else if (z > VIEW_MAX_F) {
z = VIEW_MAX_F;
clip = CLIP_FAR;
}
x >>= FIXED_SHIFT;
y >>= FIXED_SHIFT;
z >>= FIXED_SHIFT;
PERSPECTIVE(x, y, z);
if (x < minX) clip |= CLIP_LEFT;
if (y < minY) clip |= CLIP_TOP;
if (x > maxX) clip |= CLIP_RIGHT;
if (y > maxY) clip |= CLIP_BOTTOM;
v->x = x;
v->y = y;
v->z = (z << CLIP_SHIFT) | clip;
v++;
}
}
#endif
bool transformBoxRect(const AABBs* box, RectMinMax* rect)
{
@@ -207,40 +303,37 @@ bool transformBoxRect(const AABBs* box, RectMinMax* rect)
return false;
}
AABBi b;
b.minX = (box->minX << F16_SHIFT);
b.maxX = (box->maxX << F16_SHIFT);
b.minY = (box->minY << F16_SHIFT);
b.maxY = (box->maxY << F16_SHIFT);
b.minZ = (box->minZ << F16_SHIFT);
b.maxZ = (box->maxZ << F16_SHIFT);
int32 minX = box->minX;
int32 maxX = box->maxX;
int32 minY = box->minY;
int32 maxY = box->maxY;
int32 minZ = box->minZ;
int32 maxZ = box->maxZ;
Vertex v[8] = {
{ b.minX, b.minY, b.minZ, 1 << 16 },
{ b.maxX, b.minY, b.minZ, 1 << 16 },
{ b.minX, b.maxY, b.minZ, 1 << 16 },
{ b.maxX, b.maxY, b.minZ, 1 << 16 },
{ b.minX, b.minY, b.maxZ, 1 << 16 },
{ b.maxX, b.minY, b.maxZ, 1 << 16 },
{ b.minX, b.maxY, b.maxZ, 1 << 16 },
{ b.maxX, b.maxY, b.maxZ, 1 << 16 }
};
gVertices[0].x = minX; gVertices[0].y = minY; gVertices[0].z = minZ;
gVertices[1].x = maxX; gVertices[1].y = minY; gVertices[1].z = minZ;
gVertices[2].x = minX; gVertices[2].y = maxY; gVertices[2].z = minZ;
gVertices[3].x = maxX; gVertices[3].y = maxY; gVertices[3].z = minZ;
gVertices[4].x = minX; gVertices[4].y = minY; gVertices[4].z = maxZ;
gVertices[5].x = maxX; gVertices[5].y = minY; gVertices[5].z = maxZ;
gVertices[6].x = minX; gVertices[6].y = maxY; gVertices[6].z = maxZ;
gVertices[7].x = maxX; gVertices[7].y = maxY; gVertices[7].z = maxZ;
transformVertices(v, 8);
projectVertices(8);
*rect = RectMinMax( INT_MAX, INT_MAX, INT_MIN, INT_MIN );
for (int32 i = 0; i < 8; i++)
Vertex* v = gVertices;
for (int32 i = 0; i < 8; i++, v++)
{
int32 x = v[i].x;
int32 y = v[i].y;
int32 z = v[i].z;
int32 x = v->x;
int32 y = v->y;
int32 z = v->z;
if (z < (VIEW_MIN_F >> FIXED_SHIFT) || z >= (VIEW_MAX_F >> FIXED_SHIFT))
if ((z & CLIP_MASK) & (CLIP_NEAR | CLIP_FAR))
continue;
PERSPECTIVE(x, y, z);
if (x < rect->x0) rect->x0 = x;
if (x > rect->x1) rect->x1 = x;
if (y < rect->y0) rect->y0 = y;
@@ -255,119 +348,6 @@ bool transformBoxRect(const AABBs* box, RectMinMax* rect)
return true;
}
#define USE_ASM
#ifdef USE_ASM
#define unpackRoom unpackRoom_asm
#define unpackMesh unpackMesh_asm
//#define ccbMap4 ccbMap4_asm
extern "C" void unpackRoom_asm(const RoomVertex* vertices, int32 vCount);
extern "C" void unpackMesh_asm(const MeshVertex* vertices, int32 vCount);
//extern "C" void ccbMap4_asm(Face* f, const Vertex* v0, const Vertex* v1, const Vertex* v2, const Vertex* v3, uint32 shift);
#else
#define unpackRoom unpackRoom_c
#define unpackMesh unpackMesh_c
void unpackRoom_c(const RoomVertex* vertices, int32 vCount)
{
Vertex* res = gVertices;
uint32 *v32 = (uint32*)vertices;
for (int32 i = 0; i < vCount; i += 4)
{
uint32 n0 = *v32++;
uint32 n1 = *v32++;
res->x = (n0 << 12) & 0x1F000;
res->y = (n0 << 5) & 0xFC00;
res->z = (n0 << 1) & 0x1F000;
res->w = 1 << 16;
res++;
res->x = (n0 >> 4) & 0x1F000;
res->y = (n0 >> 11) & 0xFC00;
res->z = (n0 >> 15) & 0x1F000;
res->w = 1 << 16;
res++;
res->x = (n1 << 12) & 0x1F000;
res->y = (n1 << 5) & 0xFC00;
res->z = (n1 << 1) & 0x1F000;
res->w = 1 << 16;
res++;
res->x = (n1 >> 4) & 0x1F000;
res->y = (n1 >> 11) & 0xFC00;
res->z = (n1 >> 15) & 0x1F000;
res->w = 1 << 16;
res++;
}
}
void unpackMesh_c(const MeshVertex* vertices, int32 vCount)
{
uint32 *v32 = (uint32*)vertices;
Vertex* res = gVertices;
for (int32 i = 0; i < vCount; i += 2)
{
// << F16_SHIFT should be already applied
uint32 n0 = *v32++;
uint32 n1 = *v32++;
uint32 n2 = *v32++;
res->x = int16(n0 >> 16);
res->y = int16(n0);
res->z = int16(n1 >> 16);
res->w = 1 << 16;
res++;
res->x = int16(n1);
res->y = int16(n2 >> 16);
res->z = int16(n2);
res->w = 1 << 16;
res++;
}
}
#endif
// Projects the first vCount entries of gVertices in place and tags each one
// with clip flags.
//
// For every vertex:
//   1. z is clamped to [VIEW_MIN_F >> FIXED_SHIFT, VIEW_MAX_F >> FIXED_SHIFT];
//      a clamp sets CLIP_NEAR or CLIP_FAR respectively.
//   2. PERSPECTIVE(x, y, z) is applied (defined elsewhere; presumably the
//      perspective divide of x and y - confirm against its definition).
//   3. classify() adds the flags for the viewportRel rectangle.
//   4. x and y are written back; z is stored as (z << CLIP_SHIFT) | flags.
void projectVertices(int32 vCount)
{
    // read the viewport bounds once, outside the loop
    int32 clipMinX = viewportRel.x0;
    int32 clipMinY = viewportRel.y0;
    int32 clipMaxX = viewportRel.x1;
    int32 clipMaxY = viewportRel.y1;

    Vertex* vert = gVertices;

    int32 idx = 0;
    while (idx < vCount)
    {
        int32 x = vert->x;
        int32 y = vert->y;
        int32 z = vert->z;

        // depth clamp decides the initial flag value
        int32 clip;
        if (z < (VIEW_MIN_F >> FIXED_SHIFT)) {
            clip = CLIP_NEAR;
            z = (VIEW_MIN_F >> FIXED_SHIFT);
        } else if (z > (VIEW_MAX_F >> FIXED_SHIFT)) {
            clip = CLIP_FAR;
            z = (VIEW_MAX_F >> FIXED_SHIFT);
        } else {
            clip = 0;
        }

        PERSPECTIVE(x, y, z);

        clip |= classify(x, y, clipMinX, clipMinY, clipMaxX, clipMaxY);

        vert->x = x;
        vert->y = y;
        vert->z = (z << CLIP_SHIFT) | clip; // flags share the word with depth

        vert++;
        idx++;
    }
}
void transformRoom(const Room* room)
{
int32 vCount = room->info->verticesCount;
@@ -376,8 +356,6 @@ void transformRoom(const Room* room)
unpackRoom(room->data.vertices, vCount);
transformVertices(gVertices, vCount);
projectVertices(vCount);
gVerticesCount += vCount;
@@ -387,8 +365,6 @@ void transformMesh(const MeshVertex* vertices, int32 vCount, const uint16* vInte
{
unpackMesh(vertices, vCount);
transformVertices(gVertices, vCount);
projectVertices(vCount);
gVerticesCount += vCount;
@@ -757,11 +733,6 @@ X_INLINE void faceAddMeshTriangleFlat(uint32 flags, uint32 indices, uint32 shade
void faceAddShadow(int32 x, int32 z, int32 sx, int32 sz)
{
x <<= F16_SHIFT;
z <<= F16_SHIFT;
sx <<= F16_SHIFT;
sz <<= F16_SHIFT;
int32 sx2 = sx << 1;
int32 sz2 = sz << 1;

View File

@@ -11,24 +11,20 @@ vCount RN r1
vx0 RN r1
vy0 RN r2
vz0 RN r3
vw0 RN r4
vx1 RN r5
vy1 RN r6
vz1 RN r7
vw1 RN r8
vx1 RN r4
vy1 RN r5
vz1 RN r6
n0 RN vy0
n1 RN vx1
n2 RN vz1
res RN r12
last RN lr
stmfd sp!, {r4-r8, lr}
stmfd sp!, {r4-r6, lr}
ldr res, =gVertices
; last = vertices + vCount * 6
add vCount, vCount, vCount, lsl #1
add last, vertices, vCount, lsl #1
mov vw0, #(1 << 16)
mov vw1, #(1 << 16)
loop ldmia vertices!, {n0, n1, n2} ; load two encoded vertices
cmp vertices, last
@@ -45,8 +41,8 @@ loop ldmia vertices!, {n0, n1, n2} ; load two encoded vertices
mov n2, n2, lsl #16
mov vz1, n2, asr #16 ; z
stmia res!, {vx0, vy0, vz0, vw0, vx1, vy1, vz1, vw1}
stmia res!, {vx0, vy0, vz0, vx1, vy1, vz1}
blt loop
ldmfd sp!, {r4-r8, pc}
ldmfd sp!, {r4-r6, pc}
END

View File

@@ -12,33 +12,27 @@ vCount RN r1
vx0 RN r1
vy0 RN r2
vz0 RN r3
vw0 RN r4
vx1 RN r5
vy1 RN r6
vz1 RN r7
vw1 RN r8
vx1 RN r4
vy1 RN r5
vz1 RN r6
vx2 RN vx0
vy2 RN vy0
vz2 RN vz0
vw2 RN vw0
vx3 RN vx1
vy3 RN vy1
vz3 RN vz1
vw3 RN vw1
n0 RN vz1
n1 RN r9
maskH RN r10
maskV RN r11
n1 RN r7
maskH RN r8
maskV RN r9
res RN r12
last RN lr
stmfd sp!, {r4-r11, lr}
stmfd sp!, {r4-r9, lr}
ldr res, =gVertices
add last, vertices, vCount, lsl #1 ; last = vertices + vCount * 2
mov vw0, #(1 << 16) ; vw2
mov vw1, #(1 << 16) ; vw3
mov maskH, #0x1F000
mov maskV, #0xFC00
mov maskH, #0x7C00
mov maskV, #0x3F00
loop ldmia vertices!, {n0, n1} ; load four encoded vertices
cmp vertices, last
@@ -47,32 +41,32 @@ loop ldmia vertices!, {n0, n1} ; load four encoded vertices
; n0 = z3:5, y3:6, x3:5, z2:5, y2:6, x2:5
; 1st vertex
and vx0, maskH, n0, lsl #12 ; decode x0
and vy0, maskV, n0, lsl #5 ; decode y0
and vz0, maskH, n0, lsl #1 ; decode z0
and vx0, maskH, n0, lsl #10 ; decode x0
and vy0, maskV, n0, lsl #3 ; decode y0
and vz0, maskH, n0, lsr #1 ; decode z0
; 2nd vertex
and vx1, maskH, n0, lsr #4 ; decode x0
and vy1, maskV, n0, lsr #11 ; decode y0
and vz1, maskH, n0, lsr #15 ; decode z0
and vx1, maskH, n0, lsr #6 ; decode x0
and vy1, maskV, n0, lsr #13 ; decode y0
and vz1, maskH, n0, lsr #17 ; decode z0
; store
stmia res!, {vx0, vy0, vz0, vw0, vx1, vy1, vz1, vw1}
stmia res!, {vx0, vy0, vz0, vx1, vy1, vz1}
; 3rd vertex
and vx2, maskH, n1, lsl #12 ; decode x0
and vy2, maskV, n1, lsl #5 ; decode y0
and vz2, maskH, n1, lsl #1 ; decode z0
and vx2, maskH, n1, lsl #10 ; decode x0
and vy2, maskV, n1, lsl #3 ; decode y0
and vz2, maskH, n1, lsr #1 ; decode z0
; 4th vertex
and vx3, maskH, n1, lsr #4 ; decode x0
and vy3, maskV, n1, lsr #11 ; decode y0
and vz3, maskH, n1, lsr #15 ; decode z0
and vx3, maskH, n1, lsr #6 ; decode x0
and vy3, maskV, n1, lsr #13 ; decode y0
and vz3, maskH, n1, lsr #17 ; decode z0
; store
stmia res!, {vx2, vy2, vz2, vw2, vx3, vy3, vz3, vw3}
stmia res!, {vx2, vy2, vz2, vx3, vy3, vz3}
blt loop
ldmfd sp!, {r4-r11, pc}
ldmfd sp!, {r4-r9, pc}
END

View File

@@ -4006,9 +4006,9 @@ struct LevelPC
int16 x, y, z;
} v;
v.x = vertices[j].x << 2; // F16_SHIFT
v.y = vertices[j].y << 2; // F16_SHIFT
v.z = vertices[j].z << 2; // F16_SHIFT
v.x = vertices[j].x;
v.y = vertices[j].y;
v.z = vertices[j].z;
f.write(v.x);
f.write(v.y);