From 4e9b92e5a46afcc4a573edb0e1b51c2935a77fc4 Mon Sep 17 00:00:00 2001 From: XProger Date: Sat, 24 Dec 2022 11:23:42 +0300 Subject: [PATCH] #407 32X optimizations, increase fog distance (2 blocks) --- src/fixed/common.h | 22 ++- src/platform/32x/asm/common.i | 18 +-- src/platform/32x/asm/faceAddMeshQuads.i | 68 ++++---- src/platform/32x/asm/faceAddMeshTriangles.i | 70 ++++---- src/platform/32x/asm/faceAddRoomQuads.i | 94 ++++++----- src/platform/32x/asm/faceAddRoomTriangles.i | 78 ++++----- src/platform/32x/asm/rasterize.i | 4 +- src/platform/32x/asm/rasterizeF.i | 83 ++++------ src/platform/32x/asm/rasterizeFT.i | 19 +-- src/platform/32x/asm/rasterizeGT.i | 19 +-- src/platform/32x/asm/rasterizeS.i | 77 ++++----- src/platform/32x/asm/transformMesh.i | 22 +-- src/platform/32x/asm/transformRoom.i | 170 ++++++++------------ src/platform/32x/rasterizer.h | 10 +- src/platform/32x/render.cpp | 103 +++++++----- 15 files changed, 392 insertions(+), 465 deletions(-) diff --git a/src/fixed/common.h b/src/fixed/common.h index 9b298b0..1cdeb75 100644 --- a/src/fixed/common.h +++ b/src/fixed/common.h @@ -637,9 +637,12 @@ struct Matrix struct RoomQuad { -#ifdef __3DO__ +#if defined(__3DO__) uint32 flags; uint16 indices[4]; +#elif defined(__32X__) + uint32 flags; + int8 indices[4]; #else int8 indices[4]; uint16 flags; @@ -649,9 +652,12 @@ struct RoomQuad struct RoomTriangle { -#ifdef __3DO__ +#if defined(__3DO__) uint32 flags; uint16 indices[4]; +#elif defined(__32X__) + uint16 flags; + uint16 indices[3]; #else uint16 indices[3]; uint16 flags; @@ -660,9 +666,12 @@ struct RoomTriangle struct MeshQuad { -#ifdef __3DO__ +#if defined(__3DO__) uint32 flags; uint32 indices; +#elif defined(__32X__) + uint16 flags; + uint8 indices[4]; #else int8 indices[4]; uint16 flags; @@ -672,9 +681,12 @@ struct MeshQuad struct MeshTriangle { -#ifdef __3DO__ +#if defined(__3DO__) uint32 flags; uint32 indices; +#elif defined(__32X__) + uint16 flags; + uint8 indices[4]; #else int8 indices[4]; uint16 flags; @@ -743,7 +755,7 @@ struct Face { uint32 flags; Face* next; - uint16 indices[4]; + int16 indices[4]; }; #endif diff --git a/src/platform/32x/asm/common.i b/src/platform/32x/asm/common.i index 85e1c5f..e9b85b5 100644 --- a/src/platform/32x/asm/common.i +++ b/src/platform/32x/asm/common.i @@ -54,22 +54,20 @@ #define FACE_SIZEOF 16 -#define VIEW_DIST (1024 * 10) // max = DIV_TABLE_END << PROJ_SHIFT -#define FOG_SHIFT 1 -#define FOG_MAX VIEW_DIST -#define FOG_MIN (FOG_MAX - (8192 >> FOG_SHIFT)) -#define VIEW_MIN (64) -#define VIEW_MAX (VIEW_DIST) -#define VIEW_OFF 4096 +#define VIEW_MIN 64 +#define VIEW_MAX (10 << 10) +#define FOG_SHIFT 4 +#define FOG_MIN (VIEW_MAX - 2048) + +#define OT_SHIFT 4 #define CLIP_FRAME (1 << 0) #define CLIP_LEFT (1 << 1) #define CLIP_RIGHT (1 << 2) #define CLIP_TOP (1 << 3) #define CLIP_BOTTOM (1 << 4) -#define CLIP_FAR (1 << 5) -#define CLIP_NEAR (1 << 6) -#define CLIP_DISCARD (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_FAR + CLIP_NEAR) +#define CLIP_PLANE (1 << 5) +#define CLIP_DISCARD (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_PLANE) #define VP_MINX 0 #define VP_MINY 4 diff --git a/src/platform/32x/asm/faceAddMeshQuads.i b/src/platform/32x/asm/faceAddMeshQuads.i index 6b0bfd7..ca67756 100644 --- a/src/platform/32x/asm/faceAddMeshQuads.i +++ b/src/platform/32x/asm/faceAddMeshQuads.i @@ -26,9 +26,9 @@ #define vz2 vg2 #define vz3 vg3 -#define depth vg0 // == vz0 +#define depth tmp #define next vg1 -#define ot tmp +#define ot vg0 .align 4 .global _faceAddMeshQuads_asm @@ -43,26 +43,30 @@ _faceAddMeshQuads_asm: mov.l r14, @-sp mov.l var_gVertices_fam, vertices + add #VERTEX_Z, vertices mov.l var_gVerticesBase_fam, vp mov.l @vp, vp mov.l var_gFacesBase_fam, face mov.l @face, face + nop .loop_famq: // read flags and indices mov.w @polys+, flags - mov.b @polys+, vp0 - mov.b @polys+, vp1 - mov.b @polys+, vp2 - mov.b @polys+, vp3 + mov.w @polys+, vp0 + mov.w @polys+, vp2 - extu.w flags, flags + extu.w flags, flags // TODO packer free high bit + + extu.b vp0, vp1 + shlr8 vp0 extu.b vp0, vp0 - extu.b vp1, vp1 + + extu.b vp2, vp3 + shlr8 vp2 extu.b vp2, vp2 - extu.b vp3, vp3 // p = gVerticesBase + index * VERTEX_SIZEOF shll2 vp0 @@ -111,50 +115,40 @@ _faceAddMeshQuads_asm: or tmp, flags .avg_z4_famq: - mov.w @vp0, vz0 + mov.w @vp0, depth mov.w @vp1, vz1 mov.w @vp2, vz2 mov.w @vp3, vz3 - add vz1, vz0 - add vz2, vz0 - add vz3, vz0 - shlr2 vz0 // div by 4 + add vz1, depth + add vz2, depth + add vz3, depth + shlr2 depth // depth /= 4 mov.l var_gOT_fam, ot .face_add_famq: - // index = (p - vertices) / VERTEX_SIZEOF + // offset = (p - vertices) sub vertices, vp0 sub vertices, vp1 sub vertices, vp2 sub vertices, vp3 - shlr2 vp0 - shlr2 vp1 - shlr2 vp2 - shlr2 vp3 - shlr vp0 - shlr vp1 - shlr vp2 - shlr vp3 - - // depth (vz0) >>= OT_SHIFT (4) - shlr2 depth - shlr2 depth shll2 depth - add ot, depth // depth = gOT[depth] - mov.l @depth, next - mov.l face, @depth + mov.l @(depth, ot), next + mov.l face, @(depth, ot) + shll16 vp3 + xtrct vp2, vp3 + shll16 vp1 + xtrct vp0, vp1 + + mov.l flags, @(0, face) + mov.l next, @(4, face) + mov.l vp1, @(8, face) + mov.l vp3, @(12, face) add #FACE_SIZEOF, face - mov face, tmp + nop - mov.w vp3, @-tmp - mov.w vp2, @-tmp - mov.w vp1, @-tmp - mov.w vp0, @-tmp - mov.l next, @-tmp - mov.l flags, @-tmp .skip_famq: dt count bf .loop_famq diff --git a/src/platform/32x/asm/faceAddMeshTriangles.i b/src/platform/32x/asm/faceAddMeshTriangles.i index 383fc48..fe4d869 100644 --- a/src/platform/32x/asm/faceAddMeshTriangles.i +++ b/src/platform/32x/asm/faceAddMeshTriangles.i @@ -25,7 +25,7 @@ #define vz1 vg1 #define vz2 vg2 -#define depth vg0 // == vz0 +#define depth tmp #define next vg1 .align 4 @@ -41,6 +41,7 @@ _faceAddMeshTriangles_asm: mov.l r14, @-sp mov.l var_gVertices_fam, vertices + add #VERTEX_Z, vertices mov.l var_gVerticesBase_fam, vp mov.l @vp, vp @@ -49,19 +50,20 @@ _faceAddMeshTriangles_asm: mov.l @face, face mov.l var_gOT_fam, ot - nop .loop_famt: // read flags and indices mov.w @polys+, flags - mov.b @polys+, vp0 - mov.b @polys+, vp1 - mov.b @polys+, vp2 - add #1, polys // skup 4th index + mov.w @polys+, vp0 + mov.w @polys+, vp2 - extu.w flags, flags + extu.w flags, flags // TODO packer free high bit + + extu.b vp0, vp1 + shlr8 vp0 extu.b vp0, vp0 - extu.b vp1, vp1 + + shlr8 vp2 extu.b vp2, vp2 // p = gVerticesBase + index * VERTEX_SIZEOF @@ -80,11 +82,9 @@ _faceAddMeshTriangles_asm: // check_backface ccw vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2 bt/s .skip_famt - mov.l const_FACE_TRIANGLE_fam, tmp // [delay slot] - or tmp, flags // fetch clip masks - mov #(VERTEX_CLIP - 4), tmp + mov #(VERTEX_CLIP - 4), tmp // [delay slot] mov.b @(tmp, vp0), vg0 mov.b @(tmp, vp1), vg1 mov.b @(tmp, vp2), vg2 @@ -95,8 +95,11 @@ _faceAddMeshTriangles_asm: tst #CLIP_DISCARD, tmp bf/s .skip_famt + mov.l const_FACE_TRIANGLE_fam, tmp // [delay slot] + or tmp, flags + // mark if should be clipped by frame - mov vg0, tmp // [delay slot] + mov vg0, tmp or vg1, tmp or vg2, tmp tst #CLIP_FRAME, tmp @@ -105,44 +108,35 @@ _faceAddMeshTriangles_asm: or tmp, flags .avg_z3_famt: - mov.w @vp0, vz0 + mov.w @vp0, depth mov.w @vp1, vz1 mov.w @vp2, vz2 - add vz1, vz0 - add vz2, vz0 - add vz2, vz0 // approx. - shlr2 vz0 // div by 4 + add vz1, depth + add vz2, depth + add vz2, depth // approx. + shlr2 depth // depth /= 4 .face_add_famt: - // index = (p - vertices) / VERTEX_SIZEOF + // offset = (p - vertices) sub vertices, vp0 sub vertices, vp1 sub vertices, vp2 - shlr2 vp0 - shlr2 vp1 - shlr2 vp2 - shlr vp0 - shlr vp1 - shlr vp2 - - // depth (vz0) >>= OT_SHIFT (4) - shlr2 depth - shlr2 depth shll2 depth - add ot, depth // depth = gOT[depth] - mov.l @depth, next - mov.l face, @depth + mov.l @(depth, ot), next + mov.l face, @(depth, ot) + shll16 vp2 + shll16 vp1 + xtrct vp0, vp1 + + mov.l flags, @(0, face) + mov.l next, @(4, face) + mov.l vp1, @(8, face) + mov.l vp2, @(12, face) add #FACE_SIZEOF, face - mov face, tmp - add #-2, tmp // skip 4th index + nop - mov.w vp2, @-tmp - mov.w vp1, @-tmp - mov.w vp0, @-tmp - mov.l next, @-tmp - mov.l flags, @-tmp .skip_famt: dt count bf .loop_famt diff --git a/src/platform/32x/asm/faceAddRoomQuads.i b/src/platform/32x/asm/faceAddRoomQuads.i index bb9b533..5897a7a 100644 --- a/src/platform/32x/asm/faceAddRoomQuads.i +++ b/src/platform/32x/asm/faceAddRoomQuads.i @@ -26,9 +26,9 @@ #define vz2 vg2 #define vz3 vg3 -#define depth vg0 // == vz0 +#define depth tmp #define next vg1 -#define ot tmp +#define ot vg0 .align 4 .global _faceAddRoomQuads_asm @@ -43,34 +43,44 @@ _faceAddRoomQuads_asm: mov.l r14, @-sp mov.l var_gVertices_far, vertices + add #VERTEX_Z, vertices mov.l var_gVerticesBase_far, vp mov.l @vp, vp mov.l var_gFacesBase_far, face mov.l @face, face + nop .loop_farq: // read flags and indices - mov.w @polys+, flags - mov.w @polys+, vp0 - mov.w @polys+, vp1 - mov.w @polys+, vp2 - mov.w @polys+, vp3 - extu.w flags, flags - // indices never exceed 32k, no need for extu.w + mov.l @polys+, flags + mov.l @polys+, vp0 - // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2) + exts.b vp0, vp3 + shlr8 vp0 + exts.b vp0, vp2 + shlr8 vp0 + exts.b vp0, vp1 + shlr8 vp0 + exts.b vp0, vp0 + + // index *= 8 (VERTEX_SIZEOF) shll2 vp0 shll2 vp1 shll2 vp2 shll2 vp3 + shll vp0 + shll vp1 + shll vp2 + shll vp3 // get vertex address add vp, vp0 - add vp, vp1 - add vp, vp2 - add vp, vp3 + add vp0, vp1 + add vp1, vp2 + add vp2, vp3 + mov vp3, vp // fetch ((g << 8) | clip) mov #VERTEX_G, tmp @@ -116,59 +126,45 @@ _faceAddRoomQuads_asm: add #VERTEX_Z, vp3 // [delay slot] ccw shifts p[0..2] address to VERTEX_Z, shift p3 too // max_z4 - mov.w @vp0, vz0 + mov.w @vp0, depth mov.w @vp1, vz1 // check_z1 - cmp/gt vz0, vz1 + cmp/gt depth, vz1 bf/s 3f mov.w @vp2, vz2 // [delay slot] - mov vz1, vz0 // if (z1 > z0) z0 = z1 + mov vz1, depth // if (z1 > z0) z0 = z1 3: // check_z2 - cmp/gt vz0, vz2 + cmp/gt depth, vz2 bf/s 4f mov.w @vp3, vz3 // [delay slot] - mov vz2, vz0 // if (z2 > z0) z0 = z2 + mov vz2, depth // if (z2 > z0) z0 = z2 4: // check_z3 - cmp/gt vz0, vz3 - bf .face_add_farq // TODO use delay slot but not for OT! ) - mov vz3, vz0 // if (z3 > z0) z0 = z3 + cmp/gt depth, vz3 + bf/s .face_add_farq + sub vertices, vp0 // [delay slot] get the first offset + mov vz3, depth // if (z3 > z0) z0 = z3 .face_add_farq: - mov.l var_gOT_far, ot // [delay slot] - // get absolute indices - // p address is 4 bytes ahead but it's fine for shlr3 - // index = (p - vertices) / VERTEX_SIZEOF - sub vertices, vp0 + mov.l var_gOT_far, ot + // offset = (p - vertices) sub vertices, vp1 sub vertices, vp2 sub vertices, vp3 - shlr2 vp0 - shlr2 vp1 - shlr2 vp2 - shlr2 vp3 - shlr vp0 - shlr vp1 - shlr vp2 - shlr vp3 - - // depth (vz0) >>= OT_SHIFT (4) - shlr2 depth - shlr2 depth shll2 depth - add ot, depth // depth = gOT[depth] - mov.l @depth, next - mov.l face, @depth + mov.l @(depth, ot), next + mov.l face, @(depth, ot) + shll16 vp3 + xtrct vp2, vp3 + shll16 vp1 + xtrct vp0, vp1 + + mov.l flags, @(0, face) + mov.l next, @(4, face) + mov.l vp1, @(8, face) + mov.l vp3, @(12, face) add #FACE_SIZEOF, face - mov face, tmp - - mov.w vp3, @-tmp - mov.w vp2, @-tmp - mov.w vp1, @-tmp - mov.w vp0, @-tmp - mov.l next, @-tmp - mov.l flags, @-tmp .skip_farq: dt count bf .loop_farq diff --git a/src/platform/32x/asm/faceAddRoomTriangles.i b/src/platform/32x/asm/faceAddRoomTriangles.i index 56580c6..6ad484b 100644 --- a/src/platform/32x/asm/faceAddRoomTriangles.i +++ b/src/platform/32x/asm/faceAddRoomTriangles.i @@ -25,7 +25,7 @@ #define vz1 vg1 #define vz2 vg2 -#define depth vg0 // == vz0 +#define depth tmp #define next vg1 .align 4 @@ -41,6 +41,7 @@ _faceAddRoomTriangles_asm: mov.l r14, @-sp mov.l var_gVertices_far, vertices + add #VERTEX_Z, vertices mov.l var_gVerticesBase_far, vp mov.l @vp, vp @@ -49,21 +50,19 @@ _faceAddRoomTriangles_asm: mov.l @face, face mov.l var_gOT_far, ot - nop .loop_fart: // read flags and indices - mov.w @polys+, flags - mov.w @polys+, vp0 - mov.w @polys+, vp1 - mov.w @polys+, vp2 - extu.w flags, flags - // indices never exceed 32k, no need for extu.w + mov.l @polys+, flags + mov.l @polys+, vp1 - // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2) - shll2 vp0 - shll2 vp1 - shll2 vp2 + extu.w flags, vp0 + shlr16 flags + + extu.w vp1, vp2 + shlr16 vp1 + + // vp[0..2] alreay multiplied by VERTEX_SIZEOF // get vertex address add vp, vp0 @@ -90,7 +89,7 @@ _faceAddRoomTriangles_asm: or vg2, tmp tst #CLIP_FRAME, tmp bt/s 1f - mov.l const_FACE_CLIPPED_far, tmp // [delay slot] + mov.l const_FACE_CLIPPED_far, tmp // [delay slot] mov #1, tmp; rotr x2 or tmp, flags 1: // compare VERTEX_G for gouraud rasterization @@ -100,60 +99,47 @@ _faceAddRoomTriangles_asm: shlr8 vg1 // shift down for g only tst vg1, vg1 bt/s 2f - mov.l const_FACE_GOURAUD_far, tmp // [delay slot] + mov.l const_FACE_GOURAUD_far, tmp // [delay slot] mov #128, tmp; shll8 add tmp, flags 2: // check_backface ccw vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2 bt/s .skip_fart - mov.l const_FACE_TRIANGLE_far, tmp // [delay slot] + mov.l const_FACE_TRIANGLE_far, tmp // [delay slot] mov #1, tmp; rotr or tmp, flags // max_z3 - mov.w @vp0, vz0 + mov.w @vp0, depth // depth = vz0 mov.w @vp1, vz1 // check_z1 - cmp/gt vz0, vz1 + cmp/gt depth, vz1 bf/s 3f mov.w @vp2, vz2 // [delay slot] - mov vz1, vz0 // if (z1 > z0) z0 = z1 + mov vz1, depth // if (z1 > depth) depth = z1 3: // check_z2 - cmp/gt vz0, vz2 - bf .face_add_fart // TODO use delay slot but not for OT! ) - mov vz2, vz0 // if (z2 > z0) z0 = z2 + cmp/gt depth, vz2 + bf/s .face_add_fart // TODO use delay slot but not for OT! ) + sub vertices, vp0 // [delay slot] get the first offset + mov vz2, depth // if (z2 > depth) depth = z2 .face_add_fart: - // get absolute indices - // p address is 4 bytes ahead but it's fine for shlr3 - // index = (p - vertices) / VERTEX_SIZEOF - sub vertices, vp0 + // offset = (p - vertices) sub vertices, vp1 sub vertices, vp2 - shlr2 vp0 - shlr2 vp1 - shlr2 vp2 - shlr vp0 - shlr vp1 - shlr vp2 - - // depth (vz0) >>= OT_SHIFT (4) - shlr2 depth - shlr2 depth shll2 depth - add ot, depth // depth = gOT[depth] - mov.l @depth, next - mov.l face, @depth + mov.l @(depth, ot), next + mov.l face, @(depth, ot) + shll16 vp2 + shll16 vp1 + xtrct vp0, vp1 + + mov.l flags, @(0, face) + mov.l next, @(4, face) + mov.l vp1, @(8, face) + mov.l vp2, @(12, face) add #FACE_SIZEOF, face - mov face, tmp - add #-2, tmp // skip 4th index - - mov.w vp2, @-tmp - mov.w vp1, @-tmp - mov.w vp0, @-tmp - mov.l next, @-tmp - mov.l flags, @-tmp .skip_fart: dt count bf .loop_fart diff --git a/src/platform/32x/asm/rasterize.i b/src/platform/32x/asm/rasterize.i index c92a552..c3e64f2 100644 --- a/src/platform/32x/asm/rasterize.i +++ b/src/platform/32x/asm/rasterize.i @@ -40,8 +40,8 @@ _rasterize_asm: .align 2 var_fb: - // overwrite image frame buffer address has the same - // write per but allow transparent write for byte & word + // overwrite image frame buffer address, it has the same + // write latency but allow transparent write for byte & word .long 0x24020200 var_table: #ifdef ON_CHIP_RENDER diff --git a/src/platform/32x/asm/rasterizeF.i b/src/platform/32x/asm/rasterizeF.i index 3c023fc..a1e5b22 100644 --- a/src/platform/32x/asm/rasterizeF.i +++ b/src/platform/32x/asm/rasterizeF.i @@ -5,25 +5,22 @@ #define pixel r4 // arg #define L r5 // arg #define index r6 // arg -#define gtile r7 // arg (unused) -#define N gtile +#define h r7 #define Lx r8 #define Rx r9 #define Ldx r10 #define Rdx r11 #define dup r12 // const #define inv r13 -#define divLUT r14 +#define R r14 -#define R index -#define h N +#define divLUT inv #define Ry inv #define Ly inv -#define Rptr R +#define Rptr index -#define iw inv #define ih inv #define LMAP inv @@ -38,7 +35,6 @@ mov.l @sp+, r9 rts mov.l @sp+, r8 - nop .global _rasterizeF_asm _rasterizeF_asm: @@ -63,37 +59,30 @@ _rasterizeF_asm: mov L, R - mov.l var_divTable_fs, divLUT - mov #0, Rh - mov #0, Lh -.loop_f: - tst Lh, Lh - bf/s .calc_left_end_f .calc_left_start_f: mov.b @(VERTEX_PREV, L), tmp // [delay slot] - mov tmp, N - shll2 N - shll2 N - add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT) + add L, tmp // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT) - mov.w @L+, Lx - mov.w @L+, Ly + mov.l @L, Lx + extu.w Lx, Ly + shlr16 Lx - mov N, tmp - mov.w @tmp+, Ldx - mov.w @tmp+, Lh + mov.l @tmp, Ldx + extu.w Ldx, Lh + shlr16 Ldx cmp/ge Ly, Lh bf/s .exit_f cmp/eq Ly, Lh // [delay slot] bt/s .calc_left_start_f // if (L->v.y == N->v.y) check next vertex - mov N, L // [delay slot] + mov tmp, L // [delay slot] sub Lx, Ldx sub Ly, Lh + mov.l var_divTable_fs, divLUT mov Lh, tmp shll tmp mov.w @(tmp, divLUT), ih @@ -104,31 +93,30 @@ _rasterizeF_asm: .calc_left_end_f: tst Rh, Rh - bf/s .calc_right_end_f + bf .calc_right_end_f .calc_right_start_f: - mov.b @(VERTEX_NEXT, R), tmp // [delay slot] - mov tmp, N - shll2 N - shll2 N - add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT) + mov.b @(VERTEX_NEXT, R), tmp + add R, tmp // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT) - mov.w @R+, Rx - mov.w @R+, Ry + mov.l @R, Rx + extu.w Rx, Ry + shlr16 Rx - mov N, tmp - mov.w @tmp+, Rdx - mov.w @tmp+, Rh + mov.l @tmp, Rdx + extu.w Rdx, Rh + shlr16 Rdx cmp/ge Ry, Rh bf/s .exit_f cmp/eq Ry, Rh // [delay slot] bt/s .calc_right_start_f // if (R->v.y == N->v.y) check next vertex - mov N, R // [delay slot] + mov tmp, R // [delay slot] sub Rx, Rdx sub Ry, Rh + mov.l var_divTable_fs, divLUT mov Rh, tmp shll tmp mov.w @(tmp, divLUT), ih @@ -148,8 +136,6 @@ _rasterizeF_asm: sub h, Lh sub h, Rh - mov.l R, @-sp - .scanline_start_f: mov Lx, Lptr mov Rx, Rptr @@ -160,12 +146,6 @@ _rasterizeF_asm: cmp/gt Lptr, Rptr // if (!(Rptr > Lptr)) skip zero length scanline bf/s .scanline_end_f - // iw = divTable[Rptr - Lptr] - mov Rptr, tmp // [delay slot] - sub Lptr, tmp - shll tmp - mov.w @(tmp, divLUT), iw - add pixel, Lptr // Lptr = pixel + (Lx >> 16) add pixel, Rptr // Rptr = pixel + (Rx >> 16) @@ -178,10 +158,10 @@ _rasterizeF_asm: mov.b dup, @Lptr add #1, Lptr - mov #1, tmp // tmp = 1 (for align_right) cmp/gt Lptr, Rptr bf/s .scanline_end_f tst tmp, Rptr + nop .align_right_f: bt .block_2px_f @@ -192,17 +172,20 @@ _rasterizeF_asm: .block_2px_f: mov.w dup, @-Rptr cmp/gt Lptr, Rptr - bt .block_2px_f + bt/s .block_2px_f + nop .scanline_end_f: dt h mov.w var_frameWidth_fs, tmp bf/s .scanline_start_f - add tmp, pixel // [delay slot] pixel += 120 + 120 + 80 + add tmp, pixel // [delay slot] pixel += FRAME_WIDTH - bra .loop_f - mov.l @sp+, R + tst Lh, Lh + bf .calc_right_start_f + bra .calc_left_start_f + nop #undef tmp #undef Lh @@ -211,7 +194,6 @@ _rasterizeF_asm: #undef pixel #undef L #undef index -#undef N #undef Lx #undef Rx #undef Ldx @@ -224,6 +206,5 @@ _rasterizeF_asm: #undef Ry #undef Ly #undef Rptr -#undef iw #undef ih #undef LMAP diff --git a/src/platform/32x/asm/rasterizeFT.i b/src/platform/32x/asm/rasterizeFT.i index f617727..48d5e05 100644 --- a/src/platform/32x/asm/rasterizeFT.i +++ b/src/platform/32x/asm/rasterizeFT.i @@ -66,6 +66,7 @@ mov.l @sp+, r9 rts mov.l @sp+, r8 + nop .global _rasterizeFT_asm _rasterizeFT_asm: @@ -95,14 +96,13 @@ _rasterizeFT_asm: tst Lh, Lh bf/s .calc_left_end_ft + nop .calc_left_start_ft: mov.b @(VERTEX_PREV, L), tmp // [delay slot] mov tmp, N mov.w @(VERTEX_Y, L), tmp - shll2 N - shll2 N add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT) mov tmp, Ly mov.w @(VERTEX_Y, N), tmp @@ -144,14 +144,13 @@ _rasterizeFT_asm: shlr16 Rh // Rh = (Rh >> 16) tst Rh, Rh bf/s .calc_right_end_ft + nop .calc_right_start_ft: mov.b @(VERTEX_NEXT, R), tmp // [delay slot] mov tmp, N mov.w @(VERTEX_Y, R), tmp - shll2 N - shll2 N add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT) mov tmp, Ry mov.w @(VERTEX_Y, N), tmp @@ -206,7 +205,8 @@ _rasterizeFT_asm: mov.l tmp, @(SP_H, sp) mov.l L, @(SP_L, sp) mov.l R, @(SP_R, sp) - + nop + .scanline_start_ft: mov Lx, Lptr mov Rx, Rptr @@ -263,15 +263,15 @@ _rasterizeFT_asm: cmp/gt Lptr, Rptr bf/s .scanline_end_ft + nop .block_prepare_ft: shll dtdx // [delay slot] optional + nop .block_2px_ft: - swap.b t, index // UUuuvvVV - swap.w index, index // vvVVUUuu - shll8 index // VVUUuu00 - shlr16 index // 0000VVUU + getUV t, index + mov.b @(index, TILE), index mov.b @(index, LMAP), index @@ -283,6 +283,7 @@ _rasterizeFT_asm: cmp/gt Lptr, Rptr bt/s .block_2px_ft sub dtdx, t // [delay slot] t -= dtdx + nop .scanline_end_ft: mov.l @(SP_LDX, sp), sLdx diff --git a/src/platform/32x/asm/rasterizeGT.i b/src/platform/32x/asm/rasterizeGT.i index 2f23cef..f149a33 100644 --- a/src/platform/32x/asm/rasterizeGT.i +++ b/src/platform/32x/asm/rasterizeGT.i @@ -93,8 +93,6 @@ _rasterizeGT_asm: add #-SP_SIZE, sp mov gtile, TILE - nop - mov #0, Rh .loop_gt: @@ -102,14 +100,13 @@ _rasterizeGT_asm: tst Lh, Lh bf/s .calc_left_end_gt + shlr16 Rh // [delay slot] Rh = (Rh >> 16) .calc_left_start_gt: - mov.b @(VERTEX_PREV, L), tmp // [delay slot] + mov.b @(VERTEX_PREV, L), tmp mov tmp, N mov.w @(VERTEX_Y, L), tmp - shll2 N - shll2 N add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT) mov tmp, Ly mov.w @(VERTEX_Y, N), tmp @@ -159,9 +156,9 @@ _rasterizeGT_asm: // calc Ldt scaleUV Ldt, tmp, ih mov.l tmp, @(SP_LDT, sp) + nop .calc_left_end_gt: - shlr16 Rh // Rh = (Rh >> 16) tst Rh, Rh bf/s .calc_right_end_gt @@ -170,8 +167,6 @@ _rasterizeGT_asm: mov tmp, N mov.w @(VERTEX_Y, R), tmp - shll2 N - shll2 N add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT) mov tmp, Ry mov.w @(VERTEX_Y, N), tmp @@ -221,6 +216,7 @@ _rasterizeGT_asm: // calc Rdt scaleUV Rdt, tmp, ih mov.l tmp, @(SP_RDT, sp) + nop .calc_right_end_gt: // bake gLightmap address into g value @@ -233,6 +229,7 @@ _rasterizeGT_asm: bf/s .scanline_prepare_gt mov Lh, h // [delay slot] mov Rh, h + nop .scanline_prepare_gt: sub h, Lh @@ -330,10 +327,8 @@ _rasterizeGT_asm: shll dgdx .block_2px_gt: - swap.b t, index // UUuuvvVV - swap.w index, index // vvVVUUuu - shll8 index // VVUUuu00 - shlr16 index // 0000VVUU + getUV t, index + mov.b @(index, TILE), index mov g, LMAP diff --git a/src/platform/32x/asm/rasterizeS.i b/src/platform/32x/asm/rasterizeS.i index a1ef8bc..985b41c 100644 --- a/src/platform/32x/asm/rasterizeS.i +++ b/src/platform/32x/asm/rasterizeS.i @@ -5,8 +5,7 @@ #define pixel r4 // arg #define L r5 // arg #define R r6 // arg -#define gtile r7 // arg (unused) -#define N gtile +#define h r7 #define Lx r8 #define Rx r9 #define Ldx r10 @@ -16,14 +15,12 @@ #define divLUT r14 #define index tmp -#define h N #define Ry inv #define Ly inv -#define Rptr R +#define Rptr inv -#define iw inv #define ih inv .align 4 @@ -37,7 +34,6 @@ mov.l @sp+, r9 rts mov.l @sp+, r8 - nop .global _rasterizeS_asm _rasterizeS_asm: @@ -58,30 +54,25 @@ _rasterizeS_asm: mov.l var_divTable_fs, divLUT mov #0, Rh - mov #0, Lh -.loop_s: - tst Lh, Lh - bf/s .calc_left_end_s + nop .calc_left_start_s: mov.b @(VERTEX_PREV, L), tmp // [delay slot] - mov tmp, N - shll2 N - shll2 N - add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT) + add L, tmp // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT) - mov.w @L+, Lx - mov.w @L+, Ly + mov.l @L, Lx + extu.w Lx, Ly + shlr16 Lx - mov N, tmp - mov.w @tmp+, Ldx - mov.w @tmp+, Lh + mov.l @tmp, Ldx + extu.w Ldx, Lh + shlr16 Ldx cmp/ge Ly, Lh bf/s .exit_s cmp/eq Ly, Lh // [delay slot] bt/s .calc_left_start_s // if (L->v.y == N->v.y) check next vertex - mov N, L // [delay slot] + mov tmp, L // [delay slot] sub Lx, Ldx sub Ly, Lh @@ -96,27 +87,26 @@ _rasterizeS_asm: .calc_left_end_s: tst Rh, Rh - bf/s .calc_right_end_s + bf .calc_right_end_s + nop .calc_right_start_s: - mov.b @(VERTEX_NEXT, R), tmp // [delay slot] - mov tmp, N - shll2 N - shll2 N - add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT) + mov.b @(VERTEX_NEXT, R), tmp + add R, tmp // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT) - mov.w @R+, Rx - mov.w @R+, Ry + mov.l @R, Rx + extu.w Rx, Ry + shlr16 Rx - mov N, tmp - mov.w @tmp+, Rdx - mov.w @tmp+, Rh + mov.l @tmp, Rdx + extu.w Rdx, Rh + shlr16 Rdx cmp/ge Ry, Rh bf/s .exit_s cmp/eq Ry, Rh // [delay slot] bt/s .calc_right_start_s // if (R->v.y == N->v.y) check next vertex - mov N, R // [delay slot] + mov tmp, R // [delay slot] sub Rx, Rdx sub Ry, Rh @@ -135,13 +125,12 @@ _rasterizeS_asm: bf/s .scanline_prepare_s mov Lh, h // [delay slot] mov Rh, h + nop .scanline_prepare_s: sub h, Lh sub h, Rh - mov.l R, @-sp - .scanline_start_s: mov Lx, Lptr mov Rx, Rptr @@ -152,14 +141,8 @@ _rasterizeS_asm: cmp/gt Lptr, Rptr // if (!(Rptr > Lptr)) skip zero length scanline bf/s .scanline_end_s - // iw = divTable[Rptr - Lptr] - mov Rptr, tmp // [delay slot] - sub Lptr, tmp - shll tmp - mov.w @(tmp, divLUT), iw - - add pixel, Lptr // Lptr = pixel + (Lx >> 16) - add pixel, Rptr // Rptr = pixel + (Rx >> 16) + add pixel, Lptr // Lptr = pixel + (Lx >> 16) + add pixel, Rptr // Rptr = pixel + (Rx >> 16) .shade_pixel_s: mov.b @Lptr, index @@ -174,10 +157,12 @@ _rasterizeS_asm: mov.w var_frameWidth_fs, tmp bf/s .scanline_start_s - add tmp, pixel // [delay slot] pixel += 120 + 120 + 80 + add tmp, pixel // [delay slot] pixel += FRAME_WIDTH - bra .loop_s - mov.l @sp+, R + tst Lh, Lh + bf .calc_right_start_s + bra .calc_left_start_s + nop #undef tmp #undef Lh @@ -186,7 +171,6 @@ _rasterizeS_asm: #undef pixel #undef L #undef R -#undef N #undef Lx #undef Rx #undef Ldx @@ -199,5 +183,4 @@ _rasterizeS_asm: #undef Ry #undef Ly #undef Rptr -#undef iw #undef ih diff --git a/src/platform/32x/asm/transformMesh.i b/src/platform/32x/asm/transformMesh.i index 75f2095..d0da8dc 100644 --- a/src/platform/32x/asm/transformMesh.i +++ b/src/platform/32x/asm/transformMesh.i @@ -78,10 +78,10 @@ _transformMesh_asm: // pre-transform the matrix offset add #M03, m mov.w @m+, mx - shll16 mx mov.w @m+, my - shll16 my mov.w @m+, mz + shll16 mx + shll16 my shll16 mz add #-MATRIX_SIZEOF, m @@ -99,22 +99,24 @@ _transformMesh_asm: // z clipping .clip_z_near_m: - mov #VIEW_MIN, minZ // 64 + mov #VIEW_MIN, minZ cmp/gt z, minZ bf/s .clip_z_far_m cmp/ge maxZ, z // [delay slot] mov minZ, z - add #CLIP_NEAR, vg + add #CLIP_PLANE, vg .clip_z_far_m: - bf/s .project_m - mov z, dz // [delay slot] dz = z + bf .project_m mov maxZ, z - add #CLIP_FAR, vg + add #CLIP_PLANE, vg .project_m: - // dz = divTable[z >> (PROJ_SHIFT = 4)] - shlr2 dz - shlr2 dz + // z >>= OT_SHIFT + shlr2 z + shlr2 z + + // dz = divTable[z] + mov z, dz shll dz mov.w @(dz, divLUT), dz diff --git a/src/platform/32x/asm/transformRoom.i b/src/platform/32x/asm/transformRoom.i index 4254a74..f737976 100644 --- a/src/platform/32x/asm/transformRoom.i +++ b/src/platform/32x/asm/transformRoom.i @@ -4,9 +4,9 @@ #define res r3 #define vertices r4 // arg #define count r5 // arg -#define stackVtx r6 -#define stackMtx r7 -#define vp r8 +#define vp r6 +#define m r7 +#define vg r8 #define x r9 #define y r10 #define z r11 @@ -18,13 +18,14 @@ #define minY tmp #define maxX tmp #define maxY tmp -#define minZ tmp +#define minZ x #define dz tmp -#define vg stackVtx -#define fog stackMtx -#define cnt stackVtx +#define stackVtx tmp +#define fog x +#define minFog y +#define maxG y -#define SP_SIZE (18 + 6) // mat3x3 + vec3 +#define SP_SIZE (8) // vec3s + padding .align 4 .global _transformRoom_asm @@ -37,7 +38,6 @@ _transformRoom_asm: mov.l r12, @-sp mov.l r13, @-sp mov.l r14, @-sp - mov sp, stackMtx add #-SP_SIZE, sp mov.l var_viewportRel, vp @@ -49,139 +49,111 @@ _transformRoom_asm: // store matrix into stack (in reverse order) mov.l var_gMatrixPtr, tmp - mov.l @tmp, tmp + mov.l @tmp, m - // copy 3x3 matrix rotation part - mov #9, cnt -.copyMtx_r: - mov.w @tmp+, mx - dt cnt - bf/s .copyMtx_r - mov.w mx, @-stackMtx // [delay slot] - - // prepare offsets (const) - mov.w @tmp+, mx - mov.w @tmp+, my - mov.w @tmp+, mz + // pre-transform the matrix offset + add #M03, m + mov.w @m+, mx + mov.w @m+, my + mov.w @m+, mz shll8 mx shll8 my shll8 mz + add #-12, m // offset to z-row + + // maxZ = VIEW_MAX = (1024 * 10) >> OT_SHIFT = (40 << 8) >> OT_SHIFT + mov #40, maxZ + shll2 maxZ + shll2 maxZ add #8, res // extra offset for @-Rn - nop .loop_r: // unpack vertex mov.b @vertices+, x mov.b @vertices+, y mov.b @vertices+, z - shll2 x shll2 y shll2 z - // upload vertex coords into stack (in reverse order) + // upload vertex coords into stack mov sp, stackVtx add #6, stackVtx - mov stackVtx, stackMtx - //shll16 x - //xtrct y, x - mov.w x, @-stackVtx - mov.w y, @-stackVtx mov.w z, @-stackVtx + mov.w y, @-stackVtx + mov.w x, @-stackVtx - //transform z +.transform_z: lds mz, MACL - mac.w @stackVtx+, @stackMtx+ - mac.w @stackVtx+, @stackMtx+ - mac.w @stackVtx+, @stackMtx+ + mac.w @stackVtx+, @m+ + mac.w @stackVtx+, @m+ + mac.w @stackVtx+, @m+ sts MACL, z add #-6, stackVtx + add #-18, m // offset to x-row shlr8 z + + // z >>= OT_SHIFT + shlr2 z + shlr2 z + exts.w z, z - - // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF] - // tmp = z + VIEW_OFF = z + 4096 - mov #16, tmp - shll8 tmp - add z, tmp - // maxZ = VIEW_OFF + VIEW_MAX + VIEW_OFF = 18432 - mov #72, maxZ - shll8 maxZ - // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF] - cmp/hi maxZ, tmp - bf/s .visible_r - mov #40, maxZ // [delay slot] maxZ = 40 - mov #(CLIP_NEAR + CLIP_FAR), vg - mov.w vg, @-res - add #1, vertices - dt count - bf/s .loop_r - add #10, res // [delay slot] - bra .done_r - nop - -.visible_r: - //transform y - lds my, MACL - mac.w @stackVtx+, @stackMtx+ - mac.w @stackVtx+, @stackMtx+ - mac.w @stackVtx+, @stackMtx+ - sts MACL, y - add #-6, stackVtx - shlr8 y - exts.w y, y - - //transform x - lds mx, MACL - mac.w @stackVtx+, @stackMtx+ - mac.w @stackVtx+, @stackMtx+ - mac.w @stackVtx+, @stackMtx+ - sts MACL, x - shll8 maxZ // maxZ = VIEW_MAX = (1024 * 10) = (40 << 8) - shlr8 x - exts.w x, x - - mov.b @vertices+, vg - - // tmp = FOG_MIN = 6144 = (24 << 8) - mov #24, tmp - shll8 tmp +.calc_fog: // if z <= FOG_MIN -> skip fog calc - cmp/gt tmp, z - bf/s .clip_z_near_r - mov z, fog // [delay slot] - sub tmp, fog // fog = z - FOG_MIN - shll fog // FOG_SHIFT - shlr8 fog // shift down to 0..31 range + mov #(32 >> OT_SHIFT), minFog // minFog = FOG_MIN >> OT_SHIFT + shll8 minFog + mov z, fog + subc minFog, fog // TODO need to clear T before? + bt/s .clip_z_near_r + mov.b @vertices+, vg // [delay slot] + shlr2 fog + shlr fog // shift down to 0..31 range add fog, vg // vg = min(vg, 31) - mov #31, tmp - cmp/gt tmp, vg + mov #31, maxG + cmp/gt maxG, vg bf .clip_z_near_r mov #31, vg // z clipping .clip_z_near_r: add #1, vg // +1 for signed lightmap fetch - mov #VIEW_MIN, minZ // minZ = VIEW_MIN = 64 + mov #(VIEW_MIN >> OT_SHIFT), minZ cmp/gt z, minZ bf/s .clip_z_far_r shll8 vg // [delay slot] clear lower 8-bits of vg for clipping flags mov minZ, z - add #CLIP_NEAR, vg + add #CLIP_PLANE, vg .clip_z_far_r: cmp/ge maxZ, z - bf/s .project_r - mov z, dz // [delay slot] + bf .transform_x mov maxZ, z - add #CLIP_FAR, vg + add #CLIP_PLANE, vg -.project_r: // dz = divTable[z >> (PROJ_SHIFT = 4)] - shlr2 dz - shlr2 dz +.transform_x: + lds mx, MACL + mac.w @stackVtx+, @m+ + mac.w @stackVtx+, @m+ + mac.w @stackVtx+, @m+ + sts MACL, x + add #-6, stackVtx + shlr8 x + exts.w x, x + +.transform_y: + lds my, MACL + mac.w @stackVtx+, @m+ + mac.w @stackVtx+, @m+ + mac.w @stackVtx+, @m+ + sts MACL, y + mov z, dz // [delay slot] + shlr8 y + exts.w y, y + +.project_r: // dz = divTable[z] shll dz mov.w @(dz, divLUT), dz @@ -266,7 +238,6 @@ _transformRoom_asm: #undef vertices #undef count #undef stackVtx -#undef stackMtx #undef vp #undef x #undef y @@ -282,5 +253,4 @@ _transformRoom_asm: #undef dz #undef vg #undef fog -#undef cnt #undef SP_SIZE \ No newline at end of file diff --git a/src/platform/32x/rasterizer.h b/src/platform/32x/rasterizer.h index ac93a66..4a89334 100644 --- a/src/platform/32x/rasterizer.h +++ b/src/platform/32x/rasterizer.h @@ -132,7 +132,7 @@ extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLin } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -239,7 +239,7 @@ extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLin } } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -380,7 +380,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi #endif } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -570,7 +570,7 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi #endif } - pixel += VRAM_WIDTH; + pixel += (FRAME_WIDTH >> 1); Lx += Ldx; Rx += Rdx; @@ -604,7 +604,7 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert if (L->v.y < 0) { - pixel -= L->v.y * VRAM_WIDTH; + pixel -= L->v.y * (FRAME_WIDTH >> 1); v -= L->v.y * dv; h += L->v.y; } diff --git a/src/platform/32x/render.cpp b/src/platform/32x/render.cpp index 254f008..a0c5c88 100644 --- a/src/platform/32x/render.cpp +++ b/src/platform/32x/render.cpp @@ -65,9 +65,8 @@ enum ClipFlags { CLIP_RIGHT = 1 << 2, CLIP_TOP = 1 << 3, CLIP_BOTTOM = 1 << 4, - CLIP_FAR = 1 << 5, - CLIP_NEAR = 1 << 6, - CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_FAR | CLIP_NEAR), + CLIP_PLANE = 1 << 5, + CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_PLANE) }; const MeshQuad gShadowQuads[] = { @@ -183,12 +182,12 @@ void transformRoom_c(const RoomVertex* vertices, int32 count) uint32 clip = 0; if (z <= VIEW_MIN_F) { - clip = CLIP_NEAR; + clip = CLIP_PLANE; z = VIEW_MIN_F; } if (z >= VIEW_MAX_F) { - clip = CLIP_FAR; + clip = CLIP_PLANE; z = VIEW_MAX_F; } @@ -330,12 +329,12 @@ void transformMesh_c(const MeshVertex* vertices, int32 count, int32 intensity) uint32 clip = 0; if (z <= (VIEW_MIN_F >> FIXED_SHIFT)) { - clip = CLIP_NEAR; + clip = CLIP_PLANE; z = VIEW_MIN_F >> FIXED_SHIFT; } if (z >= (VIEW_MAX_F >> FIXED_SHIFT)) { - clip = CLIP_FAR; + clip = CLIP_PLANE; z = VIEW_MAX_F >> FIXED_SHIFT; } @@ -598,25 +597,25 @@ int32 sphereIsVisible_c(int32 sx, int32 sy, int32 sz, int32 r) void flush_ot(int32 bit) { - VertexLink v[4 + 3]; + VertexLink v[4 + 4]; VertexLink* q = v; VertexLink* t = v + 4; // quad - q[0].prev = 3; - q[0].next = 1; - q[1].prev = -1; - q[1].next = 1; - q[2].prev = -1; - q[2].next = 1; - q[3].prev = -1; - q[3].next = -3; + q[0].prev = (3 << 4); + q[0].next = (1 << 4); + q[1].prev = -(1 << 4); + q[1].next = (1 << 4); + q[2].prev = -(1 << 4); + q[2].next = (1 << 4); + q[3].prev = -(1 << 4); + q[3].next = -(3 << 4); // triangle - t[0].prev = 2; - t[0].next = 1; - t[1].prev = -1; - t[1].next = 1; - t[2].prev = -1; - t[2].next = -2; + t[0].prev = (2 << 4); + t[0].next = (1 << 4); + t[1].prev = -(1 << 4); + t[1].next = (1 << 4); + t[2].prev = -(1 << 4); + t[2].next = -(2 << 4); int32 index = 0; const ColorIndex* tile = NULL; @@ -654,12 +653,29 @@ void flush_ot(int32 bit) ptr[3].t.t = 0xFF00FF00 & (tex.uv23 << 8); } - ptr[0].v = gVertices[face->indices[0]]; - ptr[1].v = gVertices[face->indices[1]]; - ptr[2].v = gVertices[face->indices[2]]; + #if 1 + uint8* vPtr = (uint8*)gVertices; + ((uint32*)&ptr[0].v)[0] = ((uint32*)(vPtr + face->indices[0]))[0]; + ((uint32*)&ptr[0].v)[1] = ((uint32*)(vPtr + face->indices[0]))[1]; + + ((uint32*)&ptr[1].v)[0] = ((uint32*)(vPtr + face->indices[1]))[0]; + ((uint32*)&ptr[1].v)[1] = ((uint32*)(vPtr + face->indices[1]))[1]; + + ((uint32*)&ptr[2].v)[0] = ((uint32*)(vPtr + face->indices[2]))[0]; + ((uint32*)&ptr[2].v)[1] = ((uint32*)(vPtr + face->indices[2]))[1]; + if (!(flags & FACE_TRIANGLE)) { - ptr[3].v = gVertices[face->indices[3]]; + ((uint32*)&ptr[3].v)[0] = ((uint32*)(vPtr + face->indices[3]))[0]; + ((uint32*)&ptr[3].v)[1] = ((uint32*)(vPtr + face->indices[3]))[1]; } + #else + ptr[0].v = gVertices[face->indices[0] >> 3]; + ptr[1].v = gVertices[face->indices[1] >> 3]; + ptr[2].v = gVertices[face->indices[2] >> 3]; + if (!(flags & FACE_TRIANGLE)) { + ptr[3].v = gVertices[face->indices[3] >> 3]; + } + #endif if (flags & FACE_CLIPPED) { drawPoly(flags, ptr, tile); @@ -855,10 +871,10 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde bool skip = (first->v.y == last->v.y); VertexLink* top = (first->v.y < last->v.y) ? first : last; - first->prev = count - 1; - first->next = 1; - last->prev = -1; - last->next = 1 - count; + first->prev = (count - 1) << 4; + first->next = (1 << 4); + last->prev = -(1 << 4); + last->next = (1 - count) << 4; for (int32 i = 1; i < count - 1; i++) { @@ -873,8 +889,8 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde skip = false; } - p->prev = -1; - p->next = 1; + p->prev = -(1 << 4); + p->next = (1 << 4); } if (skip) @@ -910,7 +926,7 @@ void clear() MARS_SYS_COMM4 = MARS_CMD_CLEAR; } -void renderRoom(const Room* room) +void renderRoom(Room* room) { int32 vCount = room->info->verticesCount; if (vCount <= 0) @@ -1225,14 +1241,8 @@ const int32 BAR_COLORS[BAR_MAX][5] = { { 43, 44, 43, 42, 41 }, }; -X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 shade, int32 color1, int32 color2, int32 z) +X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 color1, int32 color2, int32 z) { - // background - if (shade >= 0) { - renderFill(x + 1, y + 1, width - 2, height - 2, shade, z); - } - - // frame renderLine(x + 1, y, width - 2, 1, color1, z); renderLine(x + 1, y + height - 1, width - 2, 1, color2, z); renderLine(x, y, 1, height, color1, z); @@ -1242,9 +1252,9 @@ X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type) { // colored bar - int32 ix = x + 2; - int32 iy = y + 2; - int32 w = value * width >> 8; + int32 ix = x + 1; + int32 iy = y + 1; + int32 w = value* width >> 8; if (w > 0) { @@ -1254,7 +1264,12 @@ void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type) } } - renderBorder(x, y, width + 4, BAR_HEIGHT + 4, 27, 19, 17, 0); + if (w < width) + { + renderFill(x + 1 + w, y + 1, width - w, BAR_HEIGHT, 27, 0); + } + + renderBorder(x, y, width + 2, BAR_HEIGHT + 2, 19, 17, 0); } void renderBackground(const void* background)