1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-01-17 12:58:50 +01:00

#407 32X S/F/FT/FTA/GT/GTA SH-2 rasterization asm, transform optims, change input mapping

This commit is contained in:
XProger 2022-04-03 14:07:01 +03:00
parent efc069efd4
commit 8fe5d86a64
17 changed files with 1463 additions and 222 deletions

View File

@ -42,11 +42,7 @@ EWRAM_DATA ExtraInfoLara playersExtra[MAX_PLAYERS];
#define LEVEL_INFO(name, title, track, secrets) { #name, NULL, title, track, secrets } #define LEVEL_INFO(name, title, track, secrets) { #name, NULL, title, track, secrets }
#endif #endif
#ifdef __3DO__ // TODO fix the title scren on 3DO
EWRAM_DATA LevelID gLevelID = LVL_TR1_1;
#else
EWRAM_DATA LevelID gLevelID = LVL_TR1_TITLE; EWRAM_DATA LevelID gLevelID = LVL_TR1_TITLE;
#endif
const LevelInfo gLevelInfo[LVL_MAX] = { const LevelInfo gLevelInfo[LVL_MAX] = {
// TR1 // TR1
@ -97,7 +93,7 @@ int32 rand_draw()
#ifdef USE_DIV_TABLE #ifdef USE_DIV_TABLE
EWRAM_DATA ALIGN16 divTableInt divTable[DIV_TABLE_SIZE] = { // must be at EWRAM start EWRAM_DATA ALIGN16 divTableInt divTable[DIV_TABLE_SIZE] = { // must be at EWRAM start
0xFFFF, 0xFFFF, 0x8000, 0x5555, 0x4000, 0x3333, 0x2AAA, 0x2492, 0x0000, 0x7FFF, 0x7FFF, 0x5555, 0x4000, 0x3333, 0x2AAA, 0x2492,
0x2000, 0x1C71, 0x1999, 0x1745, 0x1555, 0x13B1, 0x1249, 0x1111, 0x2000, 0x1C71, 0x1999, 0x1745, 0x1555, 0x13B1, 0x1249, 0x1111,
0x1000, 0x0F0F, 0x0E38, 0x0D79, 0x0CCC, 0x0C30, 0x0BA2, 0x0B21, 0x1000, 0x0F0F, 0x0E38, 0x0D79, 0x0CCC, 0x0C30, 0x0BA2, 0x0B21,
0x0AAA, 0x0A3D, 0x09D8, 0x097B, 0x0924, 0x08D3, 0x0888, 0x0842, 0x0AAA, 0x0A3D, 0x09D8, 0x097B, 0x0924, 0x08D3, 0x0888, 0x0842,

View File

@ -9,7 +9,6 @@
#endif #endif
#if defined(_WIN32) #if defined(_WIN32)
#define MODE4
#define USE_DIV_TABLE #define USE_DIV_TABLE
#define MODE4 #define MODE4
@ -218,6 +217,12 @@ typedef unsigned int uint32;
typedef uint16 divTableInt; typedef uint16 divTableInt;
#endif #endif
#if defined(__32X__)
typedef int8 ColorIndex;
#else
typedef uint8 ColorIndex;
#endif
//#include <new> //#include <new>
inline void* operator new(size_t, void *ptr) inline void* operator new(size_t, void *ptr)
{ {
@ -826,8 +831,6 @@ struct RoomVertex
{ {
#if defined(__3DO__) #if defined(__3DO__)
uint16 xyz565; uint16 xyz565;
#elif defined(__32X__)
uint8 g, z, y, x;
#else #else
uint8 x, y, z, g; uint8 x, y, z, g;
#endif #endif

View File

@ -201,6 +201,7 @@ void gameLoadLevel(const void* data)
//resetLara(0, 9, _vec3i(49669, 7680, 57891), ANGLE_0); // first door //resetLara(0, 9, _vec3i(49669, 7680, 57891), ANGLE_0); // first door
//resetLara(0, 10, _vec3i(43063, 7168, 61198), ANGLE_0); // transp //resetLara(0, 10, _vec3i(43063, 7168, 61198), ANGLE_0); // transp
//resetLara(0, 14, _vec3i(20215, 6656, 52942), ANGLE_90 + ANGLE_45); // bridge //resetLara(0, 14, _vec3i(20215, 6656, 52942), ANGLE_90 + ANGLE_45); // bridge
//resetLara(0, 25, _vec3i(8789, 5632, 80173), 0); // portal
//resetLara(0, 17, _vec3i(16475, 6656, 59845), ANGLE_90); // bear //resetLara(0, 17, _vec3i(16475, 6656, 59845), ANGLE_90); // bear
//resetLara(0, 26, _vec3i(24475, 6912, 83505), ANGLE_90); // switch timer 1 //resetLara(0, 26, _vec3i(24475, 6912, 83505), ANGLE_90); // switch timer 1
//resetLara(0, 35, _vec3i(35149, 2048, 74189), ANGLE_90); // switch timer 2 //resetLara(0, 35, _vec3i(35149, 2048, 74189), ANGLE_90); // switch timer 2

View File

@ -29,6 +29,9 @@ int32 alignOffset(int32 a, int32 b)
void* soundPlay(int16 id, const vec3i* pos) void* soundPlay(int16 id, const vec3i* pos)
{ {
#ifdef __32X__ // TODO
return NULL;
#endif
if (!gSettings.audio_sfx) if (!gSettings.audio_sfx)
return NULL; return NULL;

View File

@ -2739,12 +2739,12 @@ struct Lara : ItemObj
} }
#elif defined(__32X__) #elif defined(__32X__)
// 6 buttons // 6 buttons
if (keys & IK_A) input |= IN_WEAPON; if (keys & IK_A) input |= IN_ACTION;
if (keys & IK_B) input |= IN_ACTION; if (keys & IK_B) input |= IN_JUMP;
if (keys & IK_C) input |= IN_JUMP; if (keys & IK_C) input |= IN_WEAPON;
if (keys & IK_X) input |= IN_LOOK; if (keys & IK_X) input |= IN_WALK;
if (keys & IK_Y) input |= IN_UP | IN_DOWN; if (keys & IK_Y) input |= IN_UP | IN_DOWN;
if (keys & IK_Z) input |= IN_WALK; if (keys & IK_Z) input |= IN_LOOK;
#elif defined(__GBA__) || defined(_WIN32) #elif defined(__GBA__) || defined(_WIN32)
int32 ikA, ikB; int32 ikA, ikB;

View File

@ -88,4 +88,4 @@ $(BUILD)/%.o: ../../fixed/%.cpp
$(SHXX) $(SHCCFLAGS) $(INCPATH) -o $@ $< $(SHXX) $(SHCCFLAGS) $(INCPATH) -o $@ $<
clean: clean:
$(RM) $(BUILD)/* *.32x *.elf $(RM) $(BUILD)/* $(TARGET).32x $(TARGET).elf

View File

@ -39,14 +39,18 @@
#define FACE_TYPE_F 1 #define FACE_TYPE_F 1
#define VERTEX_X 0 #define VERTEX_X 0
#define VERTEX_Y 2 #define VERTEX_Y 2
#define VERTEX_Z 4 #define VERTEX_Z 4
#define VERTEX_G 6 #define VERTEX_G 6
#define VERTEX_CLIP 7 #define VERTEX_CLIP 7
#define VERTEX_T 8 #define VERTEX_T 8
#define VERTEX_PREV 12 #define VERTEX_PREV 12
#define VERTEX_NEXT 13 #define VERTEX_NEXT 13
#define VERTEX_PADDING 14
#define VERTEX_SIZEOF_SHIFT 4
#define VERTEX_SIZEOF (1 << VERTEX_SIZEOF_SHIFT)
#define VIEW_DIST (1024 * 10) // max = DIV_TABLE_END << PROJ_SHIFT #define VIEW_DIST (1024 * 10) // max = DIV_TABLE_END << PROJ_SHIFT
#define FOG_SHIFT 1 #define FOG_SHIFT 1
@ -56,20 +60,70 @@
#define VIEW_MAX (VIEW_DIST) #define VIEW_MAX (VIEW_DIST)
#define VIEW_OFF 4096 #define VIEW_OFF 4096
#define CLIP_LEFT (1 << 0) #define CLIP_FRAME (1 << 0)
#define CLIP_RIGHT (1 << 1) #define CLIP_LEFT (1 << 1)
#define CLIP_TOP (1 << 2) #define CLIP_RIGHT (1 << 2)
#define CLIP_BOTTOM (1 << 3) #define CLIP_TOP (1 << 3)
#define CLIP_FAR (1 << 4) #define CLIP_BOTTOM (1 << 4)
#define CLIP_NEAR (1 << 5) #define CLIP_FAR (1 << 5)
#define CLIP_NEAR (1 << 6)
#define VP_MINX 0 #define VP_MINX 0
#define VP_MINY 4 #define VP_MINY 4
#define VP_MAXX 8 #define VP_MAXX 8
#define VP_MAXY 12 #define VP_MAXY 12
.macro shlr14 reg #define FRAME_WIDTH 320
shll2 \reg #define FRAME_HEIGHT 224
shlr16 \reg
//exts.w reg, reg // skip this because of mov.w .macro align_fetch
.endm .p2alignw 2, 0x0009
.endm
.macro shlr14 x
shll2 \x
shlr16 \x
//exts.w x, x // skip this because of mov.w
.endm
// int32 >> 12
// 1. shar x 12 => 12 op
// 2. (int32(int16(x >> 16)) << 4) | (x >> 12) => 8 op (require an extra register)
.macro shar12 x, t
swap.w \x, \t
exts.w \t, \t
shll2 \t
shll2 \t
shlr8 \x
shlr2 \x
shlr2 \x
or \t, \x
.endm
// out = uv * f
// uv and out regs must be different
// destructive for uv reg
.macro scaleUV uv, out, f
muls.w \uv, \f
shlr16 \uv
sts MACL, \out // v = int16(uv) * f (16-bit shift)
muls.w \uv, \f
sts MACL, \uv // u = int16(uv >> 16) * f (16-bit shift)
shlr16 \uv
xtrct \uv, \out // out = uint16(v >> 16) | (u & 0xFFFF0000)
.endm
// UUuuVVvv -> 0000VVUU
.macro getUV uv, index
swap.b \uv, \index // UUuuvvVV
swap.w \index, \index // vvVVUUuu
shll8 \index // VVUUuu00
shlr16 \index // 0000VVUU
.endm
// index (r0) = gLightmap[index]
// in index 0..255
// in lightmap one of 32 gLightmap slices
.macro lit lightmap, index
mov.b @(\index, \lightmap), \index
.endm

View File

@ -1,4 +1,5 @@
#include "common.i" #include "common.i"
SEG_RASTER
#define type r0 #define type r0
#define proc r1 #define proc r1
@ -9,45 +10,46 @@
#define pixel flags #define pixel flags
#define y type #define y type
.text
.align 4 .align 4
.global _rasterize_asm .global _rasterize_asm
_rasterize_asm: _rasterize_asm:
mov flags, type mov flags, type
shll2 type shll2 type
swap.w type, type shlr16 type
and #15, type extu.b type, proc
cmp/eq #FACE_TYPE_F, type cmp/eq #FACE_TYPE_F, type // cmp/eq #imm is 8-bit
bf/s 0f bf/s .getProc
mov L, R mov L, R
extu.b flags, R extu.b flags, R
0: // proc = table[type] .getProc: // proc = table[type]
mov type, proc
mova var_table, type mova var_table, type
shll2 proc shll2 proc
mov.l @(type, proc), proc mov.l @(type, proc), proc
// pixel = fb + y * 320 // pixel = fb + y * 320 = fb + y * 256 + y * 64
mov.w @(VERTEX_Y, L), y mov.w @(VERTEX_Y, L), y
mov.l var_fb, pixel mov.l var_fb, pixel
shll8 y shll8 y
add y, pixel // pixel += y * 256 add y, pixel // pixel += y * 256
shlr2 y shar y
shar y
jmp @proc jmp @proc
add y, pixel // pixel += y * 64 add y, pixel // pixel += y * 64
nop
.align 2
var_fb: var_fb:
.long 0x24000200 // overwrite image frame buffer address has the same
// write per but allow transparent write for byte & word
.long 0x24020200
var_table: var_table:
.long _rasterizeS_c .long _rasterizeS_asm
.long _rasterizeF_c .long _rasterizeF_asm
.long _rasterizeFT_c .long _rasterizeFT_asm
.long _rasterizeFTA_c .long _rasterizeFT_asm
.long _rasterizeGT_c .long _rasterizeGT_asm
.long _rasterizeGTA_c .long _rasterizeGT_asm
.long _rasterizeSprite_c .long _rasterizeSprite_c
.long _rasterizeFillS_c .long _rasterizeFillS_c
.long _rasterizeLineH_c .long _rasterizeLineH_c

View File

@ -0,0 +1,215 @@
#include "common.i"
SEG_RASTER
// Register allocation for _rasterizeF_asm (flat-colored polygon fill).
// The first group gives every physical register r0..r14 a primary name.
// The second group re-uses those registers under other names; an alias is
// only meaningful while the value of its primary name is dead.
#define tmp r0 // scratch
#define Lh r1 // scanlines left on the current left edge
#define Rh r2 // scanlines left on the current right edge
#define Lptr r3 // left x of the scanline, then left frame buffer pointer
#define pixel r4 // arg
#define L r5 // arg
#define index r6 // arg
#define N r7 // neighbour vertex (L->prev / R->next) during edge setup
#define Lx r8 // left edge x, kept shifted left by 16
#define Rx r9 // right edge x, kept shifted left by 16
#define Ldx r10 // per-scanline step added to Lx
#define Rdx r11 // per-scanline step added to Rx
#define dup r12 // const
#define inv r13 // short-lived temporary (see the aliases below)
#define divLUT r14 // pointer to divTable (16-bit entries)
// r6 arrives as the color index and becomes the right vertex pointer once
// the fill color has been computed
#define R index
// r7 is the scanline counter once edge setup no longer needs N
#define h N
// all of the following are short-lived values living in r13
#define Ry inv
#define Ly inv
// the right frame buffer pointer borrows r6; R is saved on the stack meanwhile
#define Rptr R
#define iw inv
#define ih inv
#define LMAP inv
// _rasterizeF_asm: fills a polygon with a single lit color.
//   pixel (r4) - frame buffer address used as the base of the first scanline
//   L     (r5) - first vertex; neighbours are reached through the signed
//                byte links at VERTEX_PREV / VERTEX_NEXT
//   index (r6) - color index that is looked up in the lightmap
// The left edge walks the PREV links and the right edge walks the NEXT links.
// The routine returns through .exit as soon as an edge would step upwards
// (next vertex y < current vertex y).
.align 4
.exit:
// pop: restore r14..r8 in reverse push order (r8 is restored in the rts delay slot)
mov.l @sp+, r14
mov.l @sp+, r13
mov.l @sp+, r12
mov.l @sp+, r11
mov.l @sp+, r10
mov.l @sp+, r9
rts
mov.l @sp+, r8
nop
.global _rasterizeF_asm
_rasterizeF_asm:
// push: save r8..r14, all of which are used below
mov.l r8, @-sp
mov.l r9, @-sp
mov.l r10, @-sp
mov.l r11, @-sp
mov.l r12, @-sp
mov.l r13, @-sp
mov.l r14, @-sp
// LMAP = gLightmap_base | ((L->g << 8) + index): address of the lit color
mov.l var_LMAP_ADDR, LMAP
mov.b @(VERTEX_G, L), tmp
shll8 tmp
add index, tmp
or tmp, LMAP
// fetch the lit color and replicate it into both bytes of a word so that
// two pixels can be stored with a single mov.w
mov.b @LMAP, dup
extu.b dup, dup
swap.b dup, index
or index, dup // dup = index | (index << 8)
// both edges start from the same vertex (r6 stops being "index" here)
mov L, R
mov.l var_divTable, divLUT
// zero remaining heights force both edges to be set up on the first pass
mov #0, Rh
mov #0, Lh
.loop:
// left edge still has scanlines left? then skip its setup
tst Lh, Lh
bf/s .calc_left_end
.calc_left_start:
mov.b @(VERTEX_PREV, L), tmp // [delay slot]
mov tmp, N
shll2 N
shll2 N
add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
// read x and y of the current vertex (post-increment clobbers L; it is
// reloaded from N below)
mov.w @L+, Lx
mov.w @L+, Ly
// read x and y of the next vertex
mov N, tmp
mov.w @tmp+, Ldx
mov.w @tmp+, Lh
// next vertex above the current one -> polygon is finished
cmp/ge Ly, Lh
bf/s .exit
cmp/eq Ly, Lh // [delay slot]
bt/s .calc_left_start // if (L->v.y == N->v.y) check next vertex
mov N, L // [delay slot]
sub Lx, Ldx // Ldx = N->v.x - L->v.x
sub Ly, Lh // Lh = N->v.y - L->v.y (edge height)
// ih = divTable[Lh] (16-bit entries, hence the index * 2)
mov Lh, tmp
shll tmp
mov.w @(tmp, divLUT), ih
muls.w ih, Ldx
// Lx <<= 16. NOTE(review): this is not a branch delay slot; presumably it is
// scheduled here to overlap with the multiply - confirm
shll16 Lx
sts MACL, Ldx // Ldx = dx * ih (per-scanline x step)
.calc_left_end:
// right edge: same procedure, following the NEXT links
tst Rh, Rh
bf/s .calc_right_end
.calc_right_start:
mov.b @(VERTEX_NEXT, R), tmp // [delay slot]
mov tmp, N
shll2 N
shll2 N
add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
mov.w @R+, Rx
mov.w @R+, Ry
mov N, tmp
mov.w @tmp+, Rdx
mov.w @tmp+, Rh
cmp/ge Ry, Rh
bf/s .exit
cmp/eq Ry, Rh // [delay slot]
bt/s .calc_right_start // if (R->v.y == N->v.y) check next vertex
mov N, R // [delay slot]
sub Rx, Rdx // Rdx = N->v.x - R->v.x
sub Ry, Rh // Rh = N->v.y - R->v.y (edge height)
mov Rh, tmp
shll tmp
mov.w @(tmp, divLUT), ih
muls.w ih, Rdx
// Rx <<= 16. NOTE(review): not a branch delay slot; presumably placed here to
// overlap with the multiply - confirm
shll16 Rx
sts MACL, Rdx // Rdx = dx * ih (per-scanline x step)
.calc_right_end:
// h = min(Lh, Rh)
cmp/gt Rh, Lh
bf/s .scanline_prepare
mov Lh, h // [delay slot]
mov Rh, h
.scanline_prepare:
// consume h scanlines from both edges
sub h, Lh
sub h, Rh
// r6 is about to be reused as Rptr, keep the right vertex pointer on the stack
mov.l R, @-sp
.scanline_start:
// take the current edge positions, then advance the edges for the next line
mov Lx, Lptr
mov Rx, Rptr
add Ldx, Lx
add Rdx, Rx
shlr16 Lptr // Lptr = (Lx >> 16)
shlr16 Rptr // Rptr = (Rx >> 16)
cmp/gt Lptr, Rptr // if (!(Rptr > Lptr)) skip zero length scanline
bf/s .scanline_end
// iw = divTable[Rptr - Lptr]
// NOTE(review): iw is loaded but never read anywhere in this routine; it
// looks like a leftover from the textured variants - confirm before removing
mov Rptr, tmp // [delay slot]
sub Lptr, tmp
shll tmp
mov.w @(tmp, divLUT), iw
add pixel, Lptr // Lptr = pixel + (Lx >> 16)
add pixel, Rptr // Rptr = pixel + (Rx >> 16)
.align_left:
// odd left address: store one byte so that the rest starts on an even address
mov #1, tmp
tst tmp, Lptr
bt/s .align_right
tst tmp, Rptr // [delay slot]
mov.b dup, @Lptr
add #1, Lptr
mov #1, tmp // tmp = 1 (for align_right)
cmp/gt Lptr, Rptr
bf/s .scanline_end
tst tmp, Rptr
.align_right:
// T was set by "tst tmp, Rptr": even right address -> go straight to word stores
bt .block_2px
mov.b dup, @-Rptr
cmp/gt Lptr, Rptr
bf .scanline_end
.block_2px:
// fill right-to-left, two pixels per store
mov.w dup, @-Rptr
cmp/gt Lptr, Rptr
bt .block_2px
.scanline_end:
dt h
mov.w var_frameWidth, tmp
bf/s .scanline_start
add tmp, pixel // [delay slot] pixel += FRAME_WIDTH (next scanline)
// one of the edges is exhausted: restore R and set up the next edge(s)
bra .loop
mov.l @sp+, R
// literal pool
var_frameWidth:
.word FRAME_WIDTH
.align 2
var_LMAP_ADDR:
.long _gLightmap_base
var_divTable:
.long _divTable

View File

@ -0,0 +1,319 @@
#include "common.i"
SEG_RASTER
// Register allocation for _rasterizeFT_asm (flat-lit textured polygon fill).
// The first group names the physical registers r0..r14; the second group
// re-uses them under other names while the primary value is dead or has been
// spilled to the stack frame described by the SP_* offsets below.
#define tmp r0 // scratch
#define Lh r1 // scanlines left on the current left edge
#define Rh r2 // scanlines left on the right edge; at .loop it holds both heights packed
#define LMAP r3 // const
#define pixel r4 // arg
#define L r5 // arg
#define R r6 // arg
#define N r7 // neighbour vertex (L->prev / R->next) during edge setup
#define Lx r8 // left edge x in the upper half-word
#define Rx r9 // right edge x in the upper half-word
#define Lt r10 // left edge texture coordinate (32-bit VERTEX_T value)
#define Rt r11 // right edge texture coordinate (32-bit VERTEX_T value)
#define dup r12 // pixel duplicated into both bytes of a word
#define TILE r13 // const
#define divLUT r14 // pointer to divTable (16-bit entries)
// r7: scanline counter and scratch for the edge deltas during edge setup
#define h N
#define Ldx h
#define Rdx h
#define Ldt h
#define Rdt h
// r8 / r9 hold short-lived values before Lx / Rx are initialised
#define Ry Rx
#define Ly Lx
#define Rv Rx
#define Lv Lx
// inside the scanline loop L and R are spilled (SP_L / SP_R) and their
// registers hold the frame buffer pointers
#define Lptr L
#define Rptr R
// inside the scanline loop the heights are spilled (SP_H) and their
// registers hold the running texture coordinate and its per-pixel step
#define t Lh
#define dtdx Rh
#define index tmp
#define iw dup
#define ih dup
// registers used to reload the per-scanline steps at the end of a scanline
#define sLdx L
#define sRdx R
#define sLdt Lh
#define sRdt Rh
// stack frame layout (byte offsets from sp)
SP_LDX = 0
SP_RDX = 4
SP_LDT = 8
SP_RDT = 12
SP_H = 16
SP_L = 20
SP_R = 24
SP_SIZE = 28
// _rasterizeFT_asm: fills a textured polygon lit with one lightmap slice.
//   pixel (r4) - frame buffer address used as the base of the first scanline
//   L     (r5) - start vertex of the left edge (walks the VERTEX_PREV links)
//   R     (r6) - start vertex of the right edge (walks the VERTEX_NEXT links)
// The lightmap slice is selected once from the VERTEX_G byte of the initial L.
// Texels are fetched from the tile pointed to by gTile. The routine returns
// through .exit as soon as an edge would step upwards.
.align 4
.exit:
// pop: release the local frame, then restore r14..r8 (r8 in the rts delay slot)
add #SP_SIZE, sp
mov.l @sp+, r14
mov.l @sp+, r13
mov.l @sp+, r12
mov.l @sp+, r11
mov.l @sp+, r10
mov.l @sp+, r9
rts
mov.l @sp+, r8
.global _rasterizeFT_asm
_rasterizeFT_asm:
// push: save r8..r14 and reserve SP_SIZE bytes of locals
mov.l r8, @-sp
mov.l r9, @-sp
mov.l r10, @-sp
mov.l r11, @-sp
mov.l r12, @-sp
mov.l r13, @-sp
mov.l r14, @-sp
add #-SP_SIZE, sp
// LMAP = gLightmap_base | (L->g << 8)
mov.l var_LMAP_ADDR, LMAP
mov.b @(VERTEX_G, L), tmp
shll8 tmp
or tmp, LMAP
mov.l var_divTable, divLUT
// TILE = *gTile
mov.l var_gTile, TILE
mov.l @TILE, TILE
// packed heights = 0 forces both edges to be set up on the first pass
mov #0, Rh
.loop:
// here Rh holds (Rh << 16) | Lh, as written to SP_H by .scanline_prepare
extu.w Rh, Lh // Lh = uint16(Rh) (extu.w zero-extends the low half-word)
tst Lh, Lh
bf/s .calc_left_end
.calc_left_start:
mov.b @(VERTEX_PREV, L), tmp // [delay slot]
mov tmp, N
mov.w @(VERTEX_Y, L), tmp
shll2 N
shll2 N
add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
mov tmp, Ly
mov.w @(VERTEX_Y, N), tmp
sub Ly, tmp // tmp = N->v.y - L->v.y
// negative height -> polygon is finished
cmp/pz tmp
bf/s .exit
tst tmp, tmp // [delay slot] T = (height == 0)
mov L, Lv // Lv = L
bt/s .calc_left_start // if (Lh == 0) check next vertex
mov N, L // [delay slot]
mov tmp, Lh
mov.l @(VERTEX_T, Lv), Lt
mov.w @(VERTEX_X, Lv), tmp
swap.w tmp, Lx // Lx = L->v.x << 16
// a one-scanline edge needs no steps: skip the delta computation
mov Lh, tmp
cmp/eq #1, tmp
bt/s .calc_left_end
shll tmp // [delay slot]
mov.w @(tmp, divLUT), ih // ih = divTable[Lh]
// calc Ldx
mov.w @(VERTEX_X, L), tmp
swap.w Lx, Ldx // start x back in the low half-word
sub Ldx, tmp // tmp = end x - start x
muls.w ih, tmp
mov.l @(VERTEX_T, L), Ldt
sts MACL, tmp
sub Lt, Ldt // Ldt = end t - start t
mov.l tmp, @(SP_LDX, sp)
// calc Ldt
scaleUV Ldt, tmp, ih
mov.l tmp, @(SP_LDT, sp)
.calc_left_end:
shlr16 Rh // Rh = (Rh >> 16)
tst Rh, Rh
bf/s .calc_right_end
.calc_right_start:
mov.b @(VERTEX_NEXT, R), tmp // [delay slot]
mov tmp, N
mov.w @(VERTEX_Y, R), tmp
shll2 N
shll2 N
add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
mov tmp, Ry
mov.w @(VERTEX_Y, N), tmp
sub Ry, tmp // tmp = N->v.y - R->v.y
cmp/pz tmp
bf/s .exit
tst tmp, tmp // [delay slot] T = (height == 0)
mov R, Rv // Rv = R
bt/s .calc_right_start // if (Rh == 0) check next vertex
mov N, R // [delay slot]
mov tmp, Rh
mov.l @(VERTEX_T, Rv), Rt
mov.w @(VERTEX_X, Rv), tmp
swap.w tmp, Rx // Rx = R->v.x << 16
mov Rh, tmp
cmp/eq #1, tmp
bt/s .calc_right_end
shll tmp // [delay slot]
mov.w @(tmp, divLUT), ih // ih = divTable[Rh]
// calc Rdx
mov.w @(VERTEX_X, R), tmp
swap.w Rx, Rdx // start x back in the low half-word
sub Rdx, tmp // tmp = end x - start x
muls.w ih, tmp
mov.l @(VERTEX_T, R), Rdt
sts MACL, tmp
sub Rt, Rdt // Rdt = end t - start t
mov.l tmp, @(SP_RDX, sp)
// calc Rdt
scaleUV Rdt, tmp, ih
mov.l tmp, @(SP_RDT, sp)
.calc_right_end:
// h = min(Lh, Rh)
cmp/gt Rh, Lh
bf/s .scanline_prepare
mov Lh, h // [delay slot]
mov Rh, h
.scanline_prepare:
// consume h scanlines from both edges and spill everything whose register
// is reused inside the scanline loop
sub h, Lh
sub h, Rh
swap.w Rh, tmp
or Lh, tmp // tmp = (Rh << 16) | Lh
mov.l tmp, @(SP_H, sp)
mov.l L, @(SP_L, sp)
mov.l R, @(SP_R, sp)
.scanline_start:
mov Lx, Lptr
mov Rx, Rptr
shlr16 Lptr // Lptr = (Lx >> 16)
shlr16 Rptr // Rptr = (Rx >> 16)
cmp/gt Lptr, Rptr // if (!(Rptr > Lptr)) skip zero length scanline
bf/s .scanline_end
// iw = divTable[Rptr - Lptr]
mov Rptr, tmp // [delay slot]
sub Lptr, tmp
shll tmp
mov.w @(tmp, divLUT), iw
// calc dtdx (same steps as the scaleUV macro, interleaved with the pointer setup)
mov Rt, tmp
sub Lt, tmp
muls.w tmp, iw
add pixel, Lptr // Lptr = pixel + (Lx >> 16)
sts MACL, dtdx // v = int16(uv) * f (16-bit shift)
shlr16 tmp
muls.w tmp, iw
add pixel, Rptr // Rptr = pixel + (Rx >> 16)
sts MACL, tmp // u = int16(uv >> 16) * f (16-bit shift)
mov Rt, t // pixels are written right-to-left, so t starts at the right edge
shlr16 tmp
xtrct tmp, dtdx // out = uint16(v >> 16) | (u & 0xFFFF0000)
.align_left:
// odd left address: draw a single pixel sampled at Lt
mov #1, tmp
tst tmp, Lptr
bt/s .align_right
tst tmp, Rptr // [delay slot]
getUV Lt, index
mov.b @(index, TILE), index
mov.b @(index, LMAP), index
mov.b index, @Lptr
add #1, Lptr
mov #1, tmp // tmp = 1 (for align_right)
cmp/gt Lptr, Rptr
bf/s .scanline_end
tst tmp, Rptr
.align_right:
// T was set by "tst tmp, Rptr": even right address -> go to the word loop.
// Only the first instruction of the getUV expansion sits in the delay slot;
// when the branch is taken it merely writes the scratch register index
bt/s .block_prepare
getUV t, index
mov.b @(index, TILE), index
mov.b @(index, LMAP), index
sub dtdx, t
mov.b index, @-Rptr
cmp/gt Lptr, Rptr
bf/s .scanline_end
.block_prepare:
shll dtdx // [delay slot] optional
.block_2px:
// one texel is fetched per iteration and written to two adjacent pixels
swap.b t, index // UUuuvvVV
swap.w index, index // vvVVUUuu
shll8 index // VVUUuu00
shlr16 index // 0000VVUU
mov.b @(index, TILE), index
mov.b @(index, LMAP), index
extu.b index, index
swap.b index, dup
or index, dup // dup = index | (index << 8)
mov.w dup, @-Rptr
cmp/gt Lptr, Rptr
bt/s .block_2px
sub dtdx, t // [delay slot] t -= dtdx
.scanline_end:
// reload the per-scanline steps and advance both edges
mov.l @(SP_LDX, sp), sLdx
mov.l @(SP_RDX, sp), sRdx
mov.l @(SP_LDT, sp), sLdt
mov.l @(SP_RDT, sp), sRdt
add sLdx, Lx
add sRdx, Rx
add sLdt, Lt
add sRdt, Rt
dt h
mov.w var_frameWidth, tmp
bf/s .scanline_start
add tmp, pixel // [delay slot] pixel += FRAME_WIDTH (next scanline)
// one of the edges is exhausted: restore the vertex pointers and the packed heights
mov.l @(SP_L, sp), L
mov.l @(SP_R, sp), R
bra .loop
mov.l @(SP_H, sp), Rh
// literal pool
var_frameWidth:
.word FRAME_WIDTH
.align 2
var_LMAP_ADDR:
.long _gLightmap_base
var_divTable:
.long _divTable
var_gTile:
.long _gTile

View File

@ -0,0 +1,402 @@
#include "common.i"
SEG_RASTER
// Register allocation for _rasterizeGT_asm (gouraud-lit textured polygon fill).
// The first group names the physical registers r0..r14; the second group
// re-uses them under other names while the primary value is dead, spilled to
// the stack frame (SP_* offsets below) or pushed for the duration of a scanline.
#define tmp r0 // scratch
#define Lh r1 // scanlines left on the current left edge
#define Rh r2 // scanlines left on the right edge; at .loop it holds both heights packed
#define dup r3 // pixel duplicated into both bytes of a word; also short-lived pointers
#define pixel r4 // arg
#define L r5 // arg
#define R r6 // arg
#define N r7 // neighbour vertex (L->prev / R->next) during edge setup
#define Lx r8 // left edge x in the upper half-word
#define Rx r9 // right edge x in the upper half-word
#define Lg r10 // left edge light value (g << 8), later OR-ed with the lightmap base
#define Rg r11 // right edge light value (g << 8), later OR-ed with the lightmap base
#define Lt r12 // left edge texture coordinate (32-bit VERTEX_T value)
#define Rt r13 // right edge texture coordinate (32-bit VERTEX_T value)
#define TILE r14 // const
// r7: scanline counter and scratch for the edge deltas during edge setup
#define h N
#define Ldx h
#define Rdx h
#define Ldt h
#define Rdt h
// r8 / r9 hold short-lived values before Lx / Rx are initialised
#define Ry Rx
#define Ly Lx
#define Rv Rx
#define Lv Lx
// scanline loop: frame buffer pointers (Rx is pushed while r9 serves as Rptr)
#define Lptr Lh
#define Rptr Rx
// scanline loop: running light / texture values (Rg and Rt are pushed meanwhile)
#define g Rg
#define dgdx R
#define t Rt
#define dtdx L
#define index tmp
// r3 is shared by several pointers/values, so the divTable address is
// re-loaded from the literal pool every time it is needed
#define LMAP dup
#define divLUT dup
#define iw dup
#define ih dup
#define dx dgdx
// scanline loop: r2 holds the 0xFFFFFF00 mask (heights are spilled to SP_H)
#define mask Rh
// registers used to reload the per-scanline steps at the end of a scanline
#define sLdx L
#define sRdx R
#define sLdt L
#define sRdt R
#define sLdg L
#define sRdg R
// stack frame layout (byte offsets from sp); LDG / RDG are 16-bit slots
SP_LDX = 0
SP_RDX = 4
SP_LDT = 8
SP_RDT = 12
SP_LDG = 16
SP_RDG = 18
SP_H = 20
SP_L = 24
SP_R = 28
SP_SIZE = 32
// _rasterizeGT_asm: fills a textured polygon with per-vertex lighting
// interpolated along the edges and across each scanline.
//   pixel (r4) - frame buffer address used as the base of the first scanline
//   L     (r5) - start vertex of the left edge (walks the VERTEX_PREV links)
//   R     (r6) - start vertex of the right edge (walks the VERTEX_NEXT links)
// Texels are fetched from the tile pointed to by gTile and remapped through
// the lightmap slice selected by the interpolated light value. The routine
// returns through .exit as soon as an edge would step upwards.
.align 4
.exit:
// pop: release the local frame, then restore r14..r8 (r8 in the rts delay slot)
add #SP_SIZE, sp
mov.l @sp+, r14
mov.l @sp+, r13
mov.l @sp+, r12
mov.l @sp+, r11
mov.l @sp+, r10
mov.l @sp+, r9
rts
mov.l @sp+, r8
nop
.global _rasterizeGT_asm
_rasterizeGT_asm:
// push: save r8..r14 and reserve SP_SIZE bytes of locals
mov.l r8, @-sp
mov.l r9, @-sp
mov.l r10, @-sp
mov.l r11, @-sp
mov.l r12, @-sp
mov.l r13, @-sp
mov.l r14, @-sp
add #-SP_SIZE, sp
// TILE = *gTile
mov.l var_gTile, TILE
mov.l @TILE, TILE
// packed heights = 0 forces both edges to be set up on the first pass
mov #0, Rh
.loop:
// here Rh holds (Rh << 16) | Lh, as written to SP_H by .scanline_prepare
extu.w Rh, Lh // Lh = uint16(Rh) (extu.w zero-extends the low half-word)
tst Lh, Lh
bf/s .calc_left_end
.calc_left_start:
mov.b @(VERTEX_PREV, L), tmp // [delay slot]
mov tmp, N
mov.w @(VERTEX_Y, L), tmp
shll2 N
shll2 N
add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
mov tmp, Ly
mov.w @(VERTEX_Y, N), tmp
sub Ly, tmp // tmp = N->v.y - L->v.y
// negative height -> polygon is finished
cmp/pz tmp
bf/s .exit
tst tmp, tmp // [delay slot] T = (height == 0)
mov L, Lv // Lv = L
bt/s .calc_left_start // if (Lh == 0) check next vertex
mov N, L // [delay slot]
mov tmp, Lh
mov.b @(VERTEX_G, Lv), tmp
mov.l @(VERTEX_T, Lv), Lt
mov tmp, Lg
mov.w @(VERTEX_X, Lv), tmp
shll8 Lg // Lg = L->g << 8
swap.w tmp, Lx // Lx = L->v.x << 16
// a one-scanline edge needs no steps: skip the delta computation
mov Lh, tmp
cmp/eq #1, tmp
bt/s .calc_left_end
shll tmp // [delay slot]
mov.l var_divTable, divLUT
mov.w @(tmp, divLUT), ih // ih = divTable[Lh] (overwrites the pointer in r3)
// calc Ldx
mov.w @(VERTEX_X, L), tmp
swap.w Lx, Ldx // start x back in the low half-word
sub Ldx, tmp // tmp = end x - start x
muls.w ih, tmp
mov.b @(VERTEX_G, L), tmp
sts MACL, Ldx
shll8 tmp // tmp = end g << 8
mov.l Ldx, @(SP_LDX, sp)
// calc Ldg
sub Lg, tmp
muls.w ih, tmp
mov.l @(VERTEX_T, L), Ldt
sts MACL, tmp
sub Lt, Ldt // Ldt = end t - start t
shlr16 tmp
mov.w tmp, @(SP_LDG, sp)
// calc Ldt
scaleUV Ldt, tmp, ih
mov.l tmp, @(SP_LDT, sp)
.calc_left_end:
shlr16 Rh // Rh = (Rh >> 16)
tst Rh, Rh
bf/s .calc_right_end
.calc_right_start:
mov.b @(VERTEX_NEXT, R), tmp // [delay slot]
mov tmp, N
mov.w @(VERTEX_Y, R), tmp
shll2 N
shll2 N
add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
mov tmp, Ry
mov.w @(VERTEX_Y, N), tmp
sub Ry, tmp // tmp = N->v.y - R->v.y
cmp/pz tmp
bf/s .exit
tst tmp, tmp // [delay slot] T = (height == 0)
mov R, Rv // Rv = R
bt/s .calc_right_start // if (Rh == 0) check next vertex
mov N, R // [delay slot]
mov tmp, Rh
mov.b @(VERTEX_G, Rv), tmp
mov.l @(VERTEX_T, Rv), Rt
mov tmp, Rg
mov.w @(VERTEX_X, Rv), tmp
shll8 Rg // Rg = R->g << 8
swap.w tmp, Rx // Rx = R->v.x << 16
mov Rh, tmp
cmp/eq #1, tmp
bt/s .calc_right_end
shll tmp // [delay slot]
mov.l var_divTable, divLUT
mov.w @(tmp, divLUT), ih // ih = divTable[Rh] (overwrites the pointer in r3)
// calc Rdx
mov.w @(VERTEX_X, R), tmp
swap.w Rx, Rdx // start x back in the low half-word
sub Rdx, tmp // tmp = end x - start x
muls.w ih, tmp
mov.b @(VERTEX_G, R), tmp
sts MACL, Rdx
shll8 tmp // tmp = end g << 8
mov.l Rdx, @(SP_RDX, sp)
// calc Rdg
sub Rg, tmp
muls.w ih, tmp
mov.l @(VERTEX_T, R), Rdt
sts MACL, tmp
sub Rt, Rdt // Rdt = end t - start t
shlr16 tmp
mov.w tmp, @(SP_RDG, sp)
// calc Rdt
scaleUV Rdt, tmp, ih
mov.l tmp, @(SP_RDT, sp)
.calc_right_end:
// bake gLightmap address into g value
mov.l var_LMAP_ADDR, tmp
or tmp, Lg
or tmp, Rg
// h = min(Lh, Rh)
cmp/gt Rh, Lh
bf/s .scanline_prepare
mov Lh, h // [delay slot]
mov Rh, h
.scanline_prepare:
// consume h scanlines from both edges and spill everything whose register
// is reused inside the scanline loop
sub h, Lh
sub h, Rh
swap.w Rh, tmp
or Lh, tmp // tmp = (Rh << 16) | Lh
mov.l tmp, @(SP_H, sp)
mov.l L, @(SP_L, sp)
mov.l R, @(SP_R, sp)
mov.l var_mask, mask // r2 = 0xFFFFFF00 for the whole scanline loop
.scanline_start:
// popped again at .scanline_end_fast
mov.l Rx, @-sp // alias Rptr
mov Lx, Lptr
shlr16 Lptr // Lptr = (Lx >> 16)
shlr16 Rptr // Rptr = (Rx >> 16)
cmp/gt Lptr, Rptr // if (!(Rptr > Lptr)) skip zero length scanline
bf/s .scanline_end_fast
// iw = divTable[Rptr - Lptr]
mov Rptr, tmp // [delay slot]
sub Lptr, tmp
mov.l var_divTable, divLUT
shll tmp
mov.w @(tmp, divLUT), iw
add pixel, Lptr // Lptr = pixel + (Lx >> 16)
add pixel, Rptr // Rptr = pixel + (Rx >> 16)
// popped again at .scanline_end
mov.l Rt, @-sp // alias t
mov.l Rg, @-sp // alias g
// calc dtdx
mov Rt, tmp
sub Lt, tmp
muls.w tmp, iw
shlr16 tmp
sts MACL, dtdx // v = int16(uv) * f (16-bit shift)
muls.w tmp, iw
mov Rg, tmp
sts MACL, dx // u = int16(uv >> 16) * f (16-bit shift)
sub Lg, tmp // tmp = Rg - Lg
shlr16 dx
xtrct dx, dtdx // out = uint16(v >> 16) | (u & 0xFFFF0000)
// calc dgdx
muls.w tmp, iw
mov #1, tmp
sts MACL, dgdx
tst tmp, Lptr // T = (Lptr is even), consumed by .align_left
shlr16 dgdx
exts.w dgdx, dgdx
.align_left:
// odd left address: draw a single pixel sampled at Lt / Lg
bt/s .align_right
tst tmp, Rptr // [delay slot]
getUV Lt, index
mov.b @(index, TILE), index
mov Lg, LMAP
and mask, LMAP
mov.b @(index, LMAP), index
mov.b index, @Lptr
add #1, Lptr
mov #1, tmp // tmp = 1 (for align_right)
cmp/gt Lptr, Rptr
bf/s .scanline_end
tst tmp, Rptr
.align_right:
// T was set by "tst tmp, Rptr": even right address -> go to the word loop
bt/s .block_prepare
mov g, LMAP // [delay slot]
getUV t, index
mov.b @(index, TILE), index
and mask, LMAP
sub dgdx, g
mov.b @(index, LMAP), index
sub dtdx, t
mov.b index, @-Rptr
cmp/gt Lptr, Rptr
bf/s .scanline_end
.block_prepare:
shll dtdx // [delay slot] optional
shll dgdx // both steps are doubled: the loop below advances two pixels at a time
.block_2px:
// one texel is fetched per iteration and written to two adjacent pixels
swap.b t, index // UUuuvvVV
swap.w index, index // vvVVUUuu
shll8 index // VVUUuu00
shlr16 index // 0000VVUU
mov.b @(index, TILE), index
mov g, LMAP
and mask, LMAP // LMAP = (g & 0xFFFFFF00)
mov.b @(index, LMAP), index
sub dgdx, g // g -= dgdx
extu.b index, index
swap.b index, dup
or index, dup // dup = index | (index << 8)
mov.w dup, @-Rptr
cmp/gt Lptr, Rptr
bt/s .block_2px
sub dtdx, t // [delay slot] t -= dtdx
.scanline_end:
// restore the right edge values that were used as running g / t
mov.l @sp+, Rg
mov.l @sp+, Rt
.scanline_end_fast:
mov.l @sp+, Rx
// sp is back at the frame base: walk the step slots in SP_* order
mov sp, tmp
mov.l @tmp+, sLdx
mov.l @tmp+, sRdx
add sLdx, Lx
add sRdx, Rx
mov.l @tmp+, sLdt
mov.l @tmp+, sRdt
add sLdt, Lt
add sRdt, Rt
mov.w @tmp+, sLdg
mov.w @tmp+, sRdg
add sLdg, Lg
add sRdg, Rg
dt h
mov.w var_frameWidth, tmp
bf/s .scanline_start
add tmp, pixel // [delay slot] pixel += FRAME_WIDTH (next scanline)
// one of the edges is exhausted: restore the vertex pointers and the packed heights
mov.l @(SP_L, sp), L
mov.l @(SP_R, sp), R
bra .loop
mov.l @(SP_H, sp), Rh
// literal pool
var_frameWidth:
.word FRAME_WIDTH
.align 2
var_LMAP_ADDR:
.long _gLightmap_base
var_mask:
.long 0xFFFFFF00
var_divTable:
.long _divTable
var_gTile:
.long _gTile

View File

@ -0,0 +1,190 @@
#include "common.i"
SEG_RASTER
// Register allocation for _rasterizeS_asm (re-shades pixels that are already
// in the frame buffer). The first group names the physical registers
// r0..r14; the second group re-uses them while the primary value is dead.
#define tmp r0 // scratch
#define Lh r1 // scanlines left on the current left edge
#define Rh r2 // scanlines left on the current right edge
#define Lptr r3 // left x of the scanline, then running frame buffer pointer
#define pixel r4 // arg
#define L r5 // arg
#define R r6 // arg
#define N r7 // neighbour vertex (L->prev / R->next) during edge setup
#define Lx r8 // left edge x, kept shifted left by 16
#define Rx r9 // right edge x, kept shifted left by 16
#define Ldx r10 // per-scanline step added to Lx
#define Rdx r11 // per-scanline step added to Rx
#define LMAP r12 // const
#define inv r13 // short-lived temporary (see the aliases below)
#define divLUT r14 // pointer to divTable (16-bit entries)
#define index tmp
// r7 is the scanline counter once edge setup no longer needs N
#define h N
#define Ry inv
#define Ly inv
// the end-of-scanline pointer borrows r6; R is saved on the stack meanwhile
#define Rptr R
#define iw inv
#define ih inv
// _rasterizeS_asm: walks a polygon and replaces every covered frame buffer
// pixel with its lightmap-remapped value, using lightmap slice 27.
//   pixel (r4) - frame buffer address used as the base of the first scanline
//   L     (r5) - start vertex of the left edge (walks the VERTEX_PREV links)
//   R     (r6) - start vertex of the right edge (walks the VERTEX_NEXT links)
// The routine returns through .exit as soon as an edge would step upwards.
.align 4
.exit:
// pop: restore r14..r8 in reverse push order (r8 is restored in the rts delay slot)
mov.l @sp+, r14
mov.l @sp+, r13
mov.l @sp+, r12
mov.l @sp+, r11
mov.l @sp+, r10
mov.l @sp+, r9
rts
mov.l @sp+, r8
nop
.global _rasterizeS_asm
_rasterizeS_asm:
// push: save r8..r14, all of which are used below
mov.l r8, @-sp
mov.l r9, @-sp
mov.l r10, @-sp
mov.l r11, @-sp
mov.l r12, @-sp
mov.l r13, @-sp
mov.l r14, @-sp
// LMAP = gLightmap_base | (27 << 8)
mov.l var_LMAP_ADDR, LMAP
mov #27, tmp
shll8 tmp
or tmp, LMAP
mov.l var_divTable, divLUT
// zero remaining heights force both edges to be set up on the first pass
mov #0, Rh
mov #0, Lh
.loop:
// left edge still has scanlines left? then skip its setup
tst Lh, Lh
bf/s .calc_left_end
.calc_left_start:
mov.b @(VERTEX_PREV, L), tmp // [delay slot]
mov tmp, N
shll2 N
shll2 N
add L, N // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
// read x and y of the current vertex (post-increment clobbers L; it is
// reloaded from N below)
mov.w @L+, Lx
mov.w @L+, Ly
// read x and y of the next vertex
mov N, tmp
mov.w @tmp+, Ldx
mov.w @tmp+, Lh
// next vertex above the current one -> polygon is finished
cmp/ge Ly, Lh
bf/s .exit
cmp/eq Ly, Lh // [delay slot]
bt/s .calc_left_start // if (L->v.y == N->v.y) check next vertex
mov N, L // [delay slot]
sub Lx, Ldx // Ldx = N->v.x - L->v.x
sub Ly, Lh // Lh = N->v.y - L->v.y (edge height)
// ih = divTable[Lh] (16-bit entries, hence the index * 2)
mov Lh, tmp
shll tmp
mov.w @(tmp, divLUT), ih
muls.w ih, Ldx
// Lx <<= 16. NOTE(review): this is not a branch delay slot; presumably it is
// scheduled here to overlap with the multiply - confirm
shll16 Lx
sts MACL, Ldx // Ldx = dx * ih (per-scanline x step)
.calc_left_end:
// right edge: same procedure, following the NEXT links
tst Rh, Rh
bf/s .calc_right_end
.calc_right_start:
mov.b @(VERTEX_NEXT, R), tmp // [delay slot]
mov tmp, N
shll2 N
shll2 N
add R, N // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
mov.w @R+, Rx
mov.w @R+, Ry
mov N, tmp
mov.w @tmp+, Rdx
mov.w @tmp+, Rh
cmp/ge Ry, Rh
bf/s .exit
cmp/eq Ry, Rh // [delay slot]
bt/s .calc_right_start // if (R->v.y == N->v.y) check next vertex
mov N, R // [delay slot]
sub Rx, Rdx // Rdx = N->v.x - R->v.x
sub Ry, Rh // Rh = N->v.y - R->v.y (edge height)
mov Rh, tmp
shll tmp
mov.w @(tmp, divLUT), ih
muls.w ih, Rdx
// Rx <<= 16. NOTE(review): not a branch delay slot; presumably placed here to
// overlap with the multiply - confirm
shll16 Rx
sts MACL, Rdx // Rdx = dx * ih (per-scanline x step)
.calc_right_end:
// h = min(Lh, Rh)
cmp/gt Rh, Lh
bf/s .scanline_prepare
mov Lh, h // [delay slot]
mov Rh, h
.scanline_prepare:
// consume h scanlines from both edges
sub h, Lh
sub h, Rh
// r6 is about to be reused as Rptr, keep the right vertex pointer on the stack
mov.l R, @-sp
.scanline_start:
// take the current edge positions, then advance the edges for the next line
mov Lx, Lptr
mov Rx, Rptr
add Ldx, Lx
add Rdx, Rx
shlr16 Lptr // Lptr = (Lx >> 16)
shlr16 Rptr // Rptr = (Rx >> 16)
cmp/gt Lptr, Rptr // if (!(Rptr > Lptr)) skip zero length scanline
bf/s .scanline_end
// iw = divTable[Rptr - Lptr]
// NOTE(review): iw is loaded but never read anywhere in this routine; it
// looks like a leftover from the textured variants - confirm before removing
mov Rptr, tmp // [delay slot]
sub Lptr, tmp
shll tmp
mov.w @(tmp, divLUT), iw
add pixel, Lptr // Lptr = pixel + (Lx >> 16)
add pixel, Rptr // Rptr = pixel + (Rx >> 16)
.shade_pixel:
// left-to-right, one byte at a time: *Lptr = LMAP[*Lptr]
mov.b @Lptr, index
mov.b @(index, LMAP), index
mov.b index, @Lptr
add #1, Lptr
cmp/gt Lptr, Rptr
bt .shade_pixel
.scanline_end:
dt h
mov.w var_frameWidth, tmp
bf/s .scanline_start
add tmp, pixel // [delay slot] pixel += FRAME_WIDTH (next scanline)
// one of the edges is exhausted: restore R and set up the next edge(s)
bra .loop
mov.l @sp+, R
// literal pool
var_frameWidth:
.word FRAME_WIDTH
.align 2
var_LMAP_ADDR:
.long _gLightmap_base
var_divTable:
.long _divTable

View File

@ -12,26 +12,24 @@ SEG_TRANS
#define x r8 #define x r8
#define y r9 #define y r9
#define z r10 #define z r10
#define minX r11 #define mx r11
#define minY r12 #define my r12
#define maxX r13 #define mz r13
#define maxY r14
#define vg intensity #define vg intensity
#define ambient tmp #define ambient tmp
#define dz tmp #define dz tmp
#define minZ tmp #define minZ tmp
.macro transform v .macro transform v, offset
clrmac lds \offset, MACL
mac.w @vertices+, @m+ mac.w @vertices+, @m+
mac.w @vertices+, @m+ mac.w @vertices+, @m+
mac.w @vertices+, @m+ mac.w @vertices+, @m+
sts MACL, tmp sts MACL, tmp
// v += tmp >> (FIXED_SHIFT + FP16_SHIFT) // v += tmp >> (FIXED_SHIFT + FP16_SHIFT)
shlr16 tmp shlr16 tmp
exts.w tmp, tmp exts.w tmp, \v
add tmp, \v
.endm .endm
.align 4 .align 4
@ -44,13 +42,6 @@ _transformMesh_asm:
mov.l r11, @-sp mov.l r11, @-sp
mov.l r12, @-sp mov.l r12, @-sp
mov.l r13, @-sp mov.l r13, @-sp
mov.l r14, @-sp
mov.l var_viewportRel, tmp
mov.w @tmp+, minX
mov.w @tmp+, minY
mov.w @tmp+, maxX
mov.w @tmp+, maxY
mov.l var_gVerticesBase, tmp mov.l var_gVerticesBase, tmp
mov.l @tmp, res mov.l @tmp, res
@ -71,7 +62,7 @@ _transformMesh_asm:
shlr8 ambient shlr8 ambient
exts.b ambient, vg exts.b ambient, vg
// vg = clamp(vg, 0, 31) // vg = clamp(vg, 0, 31) + 1
.vg_max: .vg_max:
mov #31, tmp mov #31, tmp
cmp/gt tmp, vg cmp/gt tmp, vg
@ -82,26 +73,32 @@ _transformMesh_asm:
subc tmp, tmp // tmp = -T subc tmp, tmp // tmp = -T
and tmp, vg and tmp, vg
add #1, vg // +1 for signed lightmap fetch
shll8 vg // lower 8 bits = vertex.clip flags shll8 vg // lower 8 bits = vertex.clip flags
add #8, res // extra offset for @-Rn add #8, res // extra offset for @-Rn
add #M03, m // extra offset to the matrix translation row
// pre-transform the matrix offset
add #M03, m
mov.w @m+, mx
shll16 mx
mov.w @m+, my
shll16 my
mov.w @m+, mz
shll16 mz
add #-MATRIX_SIZEOF, m
.loop: .loop:
// clear clipping flags // clear clipping flags
shlr8 vg shlr8 vg
shll8 vg shll8 vg
mov.w @m+, x
mov.w @m+, y
mov.w @m+, z
add #-MATRIX_SIZEOF, m
// transform to view space // transform to view space
transform x transform x, mx
add #-6, vertices // reset vertex ptr add #-6, vertices // reset vertex ptr
transform y transform y, my
add #-6, vertices // reset vertex ptr add #-6, vertices // reset vertex ptr
transform z transform z, mz
// z clipping // z clipping
.clip_z_near: .clip_z_near:
@ -124,6 +121,8 @@ _transformMesh_asm:
shll dz shll dz
mov.w @(dz, divLUT), dz mov.w @(dz, divLUT), dz
add #-M03, m // reset matrix ptr
// x = x * dz >> (16 - PROJ_SHIFT) // x = x * dz >> (16 - PROJ_SHIFT)
muls.w dz, x muls.w dz, x
sts MACL, x sts MACL, x
@ -140,42 +139,36 @@ _transformMesh_asm:
shlr16 y shlr16 y
exts.w y, y exts.w y, y
// viewport clipping .apply_offset:
.clip_vp_minX: // x += FRAME_WIDTH / 2 (160)
cmp/gt x, minX add #100, x // x += 100
bf/s .clip_vp_minY add #60, x // x += 60
cmp/ge y, minY // y += FRAME_HEIGHT / 2 (112)
add #CLIP_LEFT, vg add #112, y // y += 112
.clip_vp_minY:
bf/s .clip_vp_maxX .clip_frame_x: // 0 < x > FRAME_WIDTH
cmp/gt maxX, x mov #80, tmp
add #CLIP_TOP, vg shll2 tmp // tmp = 80 * 4 = 320 = FRAME_WIDTH
.clip_vp_maxX: cmp/hi tmp, x
bf/s .clip_vp_maxY bt/s .clip_frame
cmp/ge maxY, y add #-96, tmp // tmp = 320 - 96 = 224 = FRAME_HEIGHT (delay slot)
add #CLIP_RIGHT, vg .clip_frame_y: // 0 < y > FRAME_HEIGHT
.clip_vp_maxY: cmp/hi tmp, y
bf/s .store_vertex .clip_frame:
dt count movt tmp
add #CLIP_BOTTOM, vg or tmp, vg // vg |= CLIP_FRAME
.store_vertex: .store_vertex:
// x += FRAME_WIDTH / 2 (160)
add #100, x
add #60, x
// y += FRAME_HEIGHT / 2 (112)
add #112, y
mov.w vg, @-res mov.w vg, @-res
mov.w z, @-res mov.w z, @-res
mov.w y, @-res mov.w y, @-res
mov.w x, @-res mov.w x, @-res
dt count
bf/s .loop bf/s .loop
add #16, res add #16, res
// pop // pop
mov.l @sp+, r14
mov.l @sp+, r13 mov.l @sp+, r13
mov.l @sp+, r12 mov.l @sp+, r12
mov.l @sp+, r11 mov.l @sp+, r11
@ -185,8 +178,6 @@ _transformMesh_asm:
mov.l @sp+, r8 mov.l @sp+, r8
.align 2 .align 2
var_viewportRel:
.long _viewportRel
var_gVerticesBase: var_gVerticesBase:
.long _gVerticesBase .long _gVerticesBase
var_gMatrixPtr: var_gMatrixPtr:

View File

@ -7,15 +7,15 @@ SEG_TRANS
#define res r3 #define res r3
#define vertices r4 // arg #define vertices r4 // arg
#define count r5 // arg #define count r5 // arg
#define vp r6 #define stackVtx r6
#define m r7 #define stackMtx r7
#define x r8 #define vp r8
#define y r9 #define x r9
#define z r10 #define y r10
#define vx r11 #define z r11
#define vy r12 #define mx r12 // const
#define vz r13 #define my r13 // const
#define vg r14 #define mz r14 // const
#define minX tmp #define minX tmp
#define minY tmp #define minY tmp
@ -23,28 +23,21 @@ SEG_TRANS
#define maxY tmp #define maxY tmp
#define minZ tmp #define minZ tmp
#define dz tmp #define dz tmp
#define fog vx #define vg stackVtx
#define fog stackMtx
#define cnt stackVtx
.macro transform v, row #define SP_SIZE (18 + 6) // mat3x3 + vec3
mov.w @(\row * 6, m), tmp
muls.w vx, tmp .macro transform v, offset
lds \offset, MACL
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
add #-6, stackVtx
sts MACL, \v sts MACL, \v
mov.w @(\row * 6 + 2, m), tmp
muls.w vy, tmp
sts MACL, tmp
add tmp, \v
mov.w @(\row * 6 + 4, m), tmp
muls.w vz, tmp
sts MACL, tmp
add tmp, \v
mov.w @(\row * 2 + M03, m), tmp
shll2 \v
shlr8 \v shlr8 \v
exts.w \v, \v exts.w \v, \v
add tmp, \v
.endm .endm
.align 4 .align 4
@ -58,32 +51,71 @@ _transformRoom_asm:
mov.l r12, @-sp mov.l r12, @-sp
mov.l r13, @-sp mov.l r13, @-sp
mov.l r14, @-sp mov.l r14, @-sp
mov sp, stackMtx
add #-SP_SIZE, sp
mov.l var_viewportRel, vp mov.l var_viewportRel, vp
mov.l var_gVerticesBase, tmp mov.l var_gVerticesBase, tmp
mov.l @tmp, res mov.l @tmp, res
mov.l var_gMatrixPtr, tmp
mov.l @tmp, m
mov.l var_divTable, divLUT mov.l var_divTable, divLUT
// store matrix into stack (in reverse order)
mov.l var_gMatrixPtr, tmp
mov.l @tmp, tmp
// copy 3x3 matrix rotation part
mov #9, cnt
.copyMtx:
mov.w @tmp+, mx
dt cnt
bf/s .copyMtx
mov.w mx, @-stackMtx
// prepare offsets (const)
mov.w @tmp+, mx
mov.w @tmp+, my
mov.w @tmp+, mz
shll8 mx
shll8 my
shll8 mz
add #8, res // extra offset for @-Rn add #8, res // extra offset for @-Rn
.loop: .loop:
// unpack vertex // unpack vertex
mov.l @vertices+, vg mov.b @vertices+, x
extu.b vg, vx mov.b @vertices+, y
shlr8 vg mov.b @vertices+, z
extu.b vg, vy
shlr8 vg shll2 x
extu.b vg, vz shll2 y
shll2 z
// upload vertex coords into stack (in reverse order)
mov sp, stackVtx
add #6, stackVtx
mov stackVtx, stackMtx
mov.w x, @-stackVtx
mov.w y, @-stackVtx
mov.w z, @-stackVtx
// transform to view space // transform to view space
transform z, 2 //transform z, mz
.z_range_check: lds mz, MACL
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
add #-6, stackVtx
sts MACL, z
shlr8 z
exts.w z, z
.z_range_check: // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
// tmp = z + VIEW_OFF = z + 4096 // tmp = z + VIEW_OFF = z + 4096
mov #16, tmp mov #16, tmp
shll8 tmp shll8 tmp
@ -97,19 +129,37 @@ _transformRoom_asm:
mov #40, maxZ // maxZ = 40 (delay slot) mov #40, maxZ // maxZ = 40 (delay slot)
mov #(CLIP_NEAR + CLIP_FAR), vg mov #(CLIP_NEAR + CLIP_FAR), vg
mov.w vg, @-res mov.w vg, @-res
add #1, vertices
dt count dt count
bf/s .loop bf/s .loop
add #10, res add #10, res
bra .done bra .done
// delay slot from transform (mov.w) nop
.visible: .visible:
transform x, 0 //transform y, my
transform y, 1 lds my, MACL
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
add #-6, stackVtx
sts MACL, y
shlr8 y
exts.w y, y
//transform x, mx
lds mx, MACL
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
mac.w @stackVtx+, @stackMtx+
shll8 maxZ // maxZ = VIEW_MAX = (1024 * 10) = (40 << 8)
sts MACL, x
shlr8 x
exts.w x, x
mov.b @vertices+, vg
// maxZ = VIEW_MAX = (1024 * 10) = (40 << 8)
shll8 maxZ
shlr8 vg
// tmp = FOG_MIN = 6144 = (24 << 8) // tmp = FOG_MIN = 6144 = (24 << 8)
mov #24, tmp mov #24, tmp
shll8 tmp shll8 tmp
@ -129,43 +179,37 @@ _transformRoom_asm:
// z clipping // z clipping
.clip_z_near: .clip_z_near:
shll8 vg // clear lower 8-bits of vg for clipping flags add #1, vg // +1 for signed lightmap fetch
mov #VIEW_MIN, minZ // minZ = VIEW_MIN = 64 mov #VIEW_MIN, minZ // minZ = VIEW_MIN = 64
cmp/gt z, minZ cmp/gt z, minZ
bf/s .clip_z_far bf/s .clip_z_far
cmp/ge maxZ, z shll8 vg // clear lower 8-bits of vg for clipping flags (delay slot)
mov minZ, z mov minZ, z
add #CLIP_NEAR, vg add #CLIP_NEAR, vg
.clip_z_far: .clip_z_far:
cmp/ge maxZ, z
bf/s .project bf/s .project
mov z, dz // dz = z (delay slot) mov z, dz
mov maxZ, z mov maxZ, z
add #CLIP_FAR, vg add #CLIP_FAR, vg
.project: .project: // dz = divTable[z >> (PROJ_SHIFT = 4)]
// dz = divTable[z >> (PROJ_SHIFT = 4)]
shlr2 dz shlr2 dz
shlr2 dz shlr2 dz
shll dz shll dz
mov.w @(dz, divLUT), dz mov.w @(dz, divLUT), dz
// x = x * dz >> (16 - PROJ_SHIFT) .proj_x: // x = x * dz >> 12
muls.w dz, x muls.w dz, x
sts MACL, x sts MACL, x
shll2 x
shll2 x
shlr16 x
exts.w x, x
// y = y * dz >> (16 - PROJ_SHIFT) .proj_y: // y = y * dz >> 12
muls.w dz, y muls.w dz, y
shar12 x, tmp // do it here to hide muls.w latency
sts MACL, y sts MACL, y
shll2 y shar12 y, tmp
shll2 y
shlr16 y
exts.w y, y
// viewport clipping // portal rect clipping
.clip_vp_minX: .clip_vp_minX:
mov.w @(0, vp), minX mov.w @(0, vp), minX
cmp/gt x, minX cmp/gt x, minX
@ -184,26 +228,41 @@ _transformRoom_asm:
add #CLIP_RIGHT, vg add #CLIP_RIGHT, vg
.clip_vp_maxY: .clip_vp_maxY:
cmp/ge maxY, y cmp/ge maxY, y
bf/s .store_vertex bf/s .apply_offset
dt count mov #80, tmp // tmp = 80 (delay slot)
add #CLIP_BOTTOM, vg add #CLIP_BOTTOM, vg
.store_vertex: .apply_offset:
// x += FRAME_WIDTH / 2 (160) // x += FRAME_WIDTH / 2 (160)
add #100, x add #100, x // x += 100
add #60, x add #60, x // x += 60
// y += FRAME_HEIGHT / 2 (112) // y += FRAME_HEIGHT / 2 (112)
add #112, y add #112, y // y += 112
// frame rect clipping
.clip_frame_x: // x < 0 || x > FRAME_WIDTH (one unsigned compare)
shll2 tmp // tmp = 80 * 4 = 320 = FRAME_WIDTH
cmp/hi tmp, x
bt/s .clip_frame
add #-96, tmp // tmp = 320 - 96 = 224 = FRAME_HEIGHT (delay slot)
.clip_frame_y: // y < 0 || y > FRAME_HEIGHT (one unsigned compare)
cmp/hi tmp, y
.clip_frame:
movt tmp
or tmp, vg // vg |= CLIP_FRAME
.store_vertex:
mov.w vg, @-res mov.w vg, @-res
mov.w z, @-res mov.w z, @-res
mov.w y, @-res mov.w y, @-res
mov.w x, @-res mov.w x, @-res
dt count
bf/s .loop bf/s .loop
add #16, res add #16, res
.done: .done:
// pop // pop
add #SP_SIZE, sp
mov.l @sp+, r14 mov.l @sp+, r14
mov.l @sp+, r13 mov.l @sp+, r13
mov.l @sp+, r12 mov.l @sp+, r12
@ -218,7 +277,7 @@ var_viewportRel:
.long _viewportRel .long _viewportRel
var_gVerticesBase: var_gVerticesBase:
.long _gVerticesBase .long _gVerticesBase
var_gMatrixPtr:
.long _gMatrixPtr
var_divTable: var_divTable:
.long _divTable .long _divTable
var_gMatrixPtr:
.long _gMatrixPtr

View File

@ -191,8 +191,13 @@
.incbin "src-md/m68k.bin" /* all 68000 code & data, compiled to 0x880800/0xFF0000 */ .incbin "src-md/m68k.bin" /* all 68000 code & data, compiled to 0x880800/0xFF0000 */
.data
.global _gLightmap_base
.global _gLightmap .global _gLightmap
.data
_gLightmap_base:
.space 128
_gLightmap: _gLightmap:
.space 256 * 32 .space 256 * 32

View File

@ -11,8 +11,11 @@
#endif #endif
#endif #endif
#define CACHE_ON(ptr) ptr = &ptr[-0x20000000 / sizeof(ptr[0])];
#define CACHE_OFF(ptr) ptr = &ptr[0x20000000 / sizeof(ptr[0])];
extern uint8 gLightmap[256 * 32]; extern uint8 gLightmap[256 * 32];
extern const uint8* gTile; extern const ColorIndex* gTile;
extern "C" { extern "C" {
void rasterize_dummy_asm(uint16* pixel, const VertexLink* L, const VertexLink* R); void rasterize_dummy_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
@ -104,7 +107,7 @@ extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLin
if (width > 0) if (width > 0)
{ {
volatile uint8* ptr = (uint8*)pixel + x1; volatile ColorIndex* ptr = (uint8*)pixel + x1;
if (x1 & 1) if (x1 & 1)
{ {
@ -756,7 +759,7 @@ extern "C" void rasterizeGTA_c(uint16* pixel, const VertexLink* L, const VertexL
extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const VertexLink* R) extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{ {
R++; R++;
const uint8* ft_lightmap = &gLightmap[L->v.g << 8]; const uint8* ft_lightmap = &gLightmap[L->v.g << 8] + 128;
int32 w = R->v.x - L->v.x; int32 w = R->v.x - L->v.x;
if (w <= 0 || w >= DIV_TABLE_SIZE) return; if (w <= 0 || w >= DIV_TABLE_SIZE) return;
@ -817,50 +820,37 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert
for (int32 y = 0; y < h; y++) for (int32 y = 0; y < h; y++)
{ {
const uint8* xtile = gTile + (v & 0xFF00); const ColorIndex* xtile = (ColorIndex*)gTile + (v & 0xFF00);
volatile uint8* xptr = ptr; volatile uint8* xptr = ptr;
int32 xu = u; uint32 xu = uint32(u);
if (alignL) if (alignL)
{ {
uint8 indexB = xtile[xu >> 8]; ColorIndex indexB = xtile[xu >> 8];
if (indexB) {
*xptr = ft_lightmap[indexB];
}
xptr++;
xu += du; xu += du;
if (indexB) xptr[0] = ft_lightmap[indexB];
xptr++;
} }
for (int32 x = 0; x < w; x++) for (int32 x = 0; x < w; x++)
{ {
uint8 indexA = xtile[xu >> 8]; ColorIndex indexA = xtile[xu >> 8];
xu += du;
uint8 indexB = xtile[xu >> 8];
xu += du; xu += du;
if (indexA) xptr[0] = ft_lightmap[indexA];
if (indexA | indexB) ColorIndex indexB = xtile[xu >> 8];
{ xu += du;
indexA = (indexA) ? ft_lightmap[indexA] : xptr[0]; if (indexB) xptr[1] = ft_lightmap[indexB];
indexB = (indexB) ? ft_lightmap[indexB] : xptr[1];
#ifdef CPU_BIG_ENDIAN
*(uint16*)xptr = indexB | (indexA << 8);
#else
*(uint16*)xptr = indexA | (indexB << 8);
#endif
}
xptr += 2; xptr += 2;
} }
if (alignR) if (alignR)
{ {
uint8 indexA = xtile[xu >> 8]; ColorIndex indexA = xtile[xu >> 8];
if (indexA) { if (indexA) xptr[0] = ft_lightmap[indexA];
*xptr = ft_lightmap[indexA];
}
} }
v += dv; v += dv;

View File

@ -49,8 +49,8 @@ enum FaceType {
FACE_TYPE_MAX FACE_TYPE_MAX
}; };
#define FACE_TRIANGLE (1 << 19) #define FACE_TRIANGLE (1 << 31)
#define FACE_CLIPPED (1 << 18) #define FACE_CLIPPED (1 << 30)
#define FACE_TYPE_SHIFT 14 #define FACE_TYPE_SHIFT 14
#define FACE_TYPE_MASK 15 #define FACE_TYPE_MASK 15
#define FACE_GOURAUD (2 << FACE_TYPE_SHIFT) #define FACE_GOURAUD (2 << FACE_TYPE_SHIFT)
@ -60,7 +60,7 @@ enum FaceType {
extern Level level; extern Level level;
const uint8* gTile; const ColorIndex* gTile;
ViewportRel viewportRel; ViewportRel viewportRel;
Vertex* gVerticesBase; Vertex* gVerticesBase;
@ -72,13 +72,14 @@ EWRAM_DATA ALIGN16 Face gFaces[MAX_FACES]; // EWRAM 30k
Face* gOT[OT_SIZE]; // IWRAM 2.5k Face* gOT[OT_SIZE]; // IWRAM 2.5k
enum ClipFlags { enum ClipFlags {
CLIP_LEFT = 1 << 0, CLIP_FRAME = 1 << 0,
CLIP_RIGHT = 1 << 1, CLIP_LEFT = 1 << 1,
CLIP_TOP = 1 << 2, CLIP_RIGHT = 1 << 2,
CLIP_BOTTOM = 1 << 3, CLIP_TOP = 1 << 3,
CLIP_FAR = 1 << 4, CLIP_BOTTOM = 1 << 4,
CLIP_NEAR = 1 << 5, CLIP_FAR = 1 << 5,
CLIP_MASK_VP = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM), CLIP_NEAR = 1 << 6,
CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_FAR | CLIP_NEAR),
}; };
const MeshQuad gShadowQuads[] = { const MeshQuad gShadowQuads[] = {
@ -142,8 +143,6 @@ extern "C" {
#define faceAddMeshQuads faceAddMeshQuads_asm #define faceAddMeshQuads faceAddMeshQuads_asm
#define faceAddMeshTriangles faceAddMeshTriangles_asm #define faceAddMeshTriangles faceAddMeshTriangles_asm
#define rasterize rasterize_asm #define rasterize rasterize_asm
#else #else
#define transformRoom transformRoom_c #define transformRoom transformRoom_c
#define transformRoomUW transformRoomUW_c #define transformRoomUW transformRoomUW_c
@ -152,7 +151,7 @@ extern "C" {
#define faceAddRoomTriangles faceAddRoomTriangles_c #define faceAddRoomTriangles faceAddRoomTriangles_c
#define faceAddMeshQuads faceAddMeshQuads_c #define faceAddMeshQuads faceAddMeshQuads_c
#define faceAddMeshTriangles faceAddMeshTriangles_c #define faceAddMeshTriangles faceAddMeshTriangles_c
#define rasterize rasterize_asm #define rasterize rasterize_c
X_INLINE bool checkBackface(const Vertex *a, const Vertex *b, const Vertex *c) X_INLINE bool checkBackface(const Vertex *a, const Vertex *b, const Vertex *c)
{ {
@ -217,6 +216,10 @@ void transformRoom_c(const RoomVertex* vertices, int32 count)
x += (FRAME_WIDTH >> 1); x += (FRAME_WIDTH >> 1);
y += (FRAME_HEIGHT >> 1); y += (FRAME_HEIGHT >> 1);
if ((x < 0 || x > FRAME_WIDTH) || (y < 0 || y > FRAME_HEIGHT)) {
clip |= CLIP_FRAME;
}
if (x < viewport.x0) clip |= CLIP_LEFT; if (x < viewport.x0) clip |= CLIP_LEFT;
if (x > viewport.x1) clip |= CLIP_RIGHT; if (x > viewport.x1) clip |= CLIP_RIGHT;
if (y < viewport.y0) clip |= CLIP_TOP; if (y < viewport.y0) clip |= CLIP_TOP;
@ -287,6 +290,10 @@ void transformRoomUW_c(const RoomVertex* vertices, int32 count)
x += (FRAME_WIDTH >> 1); x += (FRAME_WIDTH >> 1);
y += (FRAME_HEIGHT >> 1); y += (FRAME_HEIGHT >> 1);
if ((x < 0 || x > FRAME_WIDTH) || (y < 0 || y > FRAME_HEIGHT)) {
clip |= CLIP_FRAME;
}
if (x < viewport.x0) clip |= CLIP_LEFT; if (x < viewport.x0) clip |= CLIP_LEFT;
if (x > viewport.x1) clip |= CLIP_RIGHT; if (x > viewport.x1) clip |= CLIP_RIGHT;
if (y < viewport.y0) clip |= CLIP_TOP; if (y < viewport.y0) clip |= CLIP_TOP;
@ -340,6 +347,10 @@ void transformMesh_c(const MeshVertex* vertices, int32 count, int32 intensity)
x += (FRAME_WIDTH >> 1); x += (FRAME_WIDTH >> 1);
y += (FRAME_HEIGHT >> 1); y += (FRAME_HEIGHT >> 1);
if ((x < 0 || x > FRAME_WIDTH) || (y < 0 || y > FRAME_HEIGHT)) {
clip |= CLIP_FRAME;
}
if (x < viewport.x0) clip |= CLIP_LEFT; if (x < viewport.x0) clip |= CLIP_LEFT;
if (x > viewport.x1) clip |= CLIP_RIGHT; if (x > viewport.x1) clip |= CLIP_RIGHT;
if (y < viewport.y0) clip |= CLIP_TOP; if (y < viewport.y0) clip |= CLIP_TOP;
@ -370,10 +381,10 @@ void faceAddRoomQuads_c(const RoomQuad* polys, int32 count)
uint32 c2 = v2->clip; uint32 c2 = v2->clip;
uint32 c3 = v3->clip; uint32 c3 = v3->clip;
if (c0 & c1 & c2 & c3) if (c0 & c1 & c2 & c3 & CLIP_DISCARD)
continue; continue;
if ((c0 | c1 | c2 | c3) & CLIP_MASK_VP) { if ((c0 | c1 | c2 | c3) & CLIP_FRAME) {
flags |= FACE_CLIPPED; flags |= FACE_CLIPPED;
} }
@ -415,10 +426,10 @@ void faceAddRoomTriangles_c(const RoomTriangle* polys, int32 count)
uint32 c1 = v1->clip; uint32 c1 = v1->clip;
uint32 c2 = v2->clip; uint32 c2 = v2->clip;
if (c0 & c1 & c2) if (c0 & c1 & c2 & CLIP_DISCARD)
continue; continue;
if ((c0 | c1 | c2) & CLIP_MASK_VP) { if ((c0 | c1 | c2) & CLIP_FRAME) {
flags |= FACE_CLIPPED; flags |= FACE_CLIPPED;
} }
@ -464,10 +475,10 @@ void faceAddMeshQuads_c(const MeshQuad* polys, int32 count)
uint32 c2 = v2->clip; uint32 c2 = v2->clip;
uint32 c3 = v3->clip; uint32 c3 = v3->clip;
if (c0 & c1 & c2 & c3) if (c0 & c1 & c2 & c3 & CLIP_DISCARD)
continue; continue;
if ((c0 | c1 | c2 | c3) & CLIP_MASK_VP) { if ((c0 | c1 | c2 | c3) & CLIP_FRAME) {
flags |= FACE_CLIPPED; flags |= FACE_CLIPPED;
} }
@ -500,10 +511,10 @@ void faceAddMeshTriangles_c(const MeshTriangle* polys, int32 count)
uint32 c1 = v1->clip; uint32 c1 = v1->clip;
uint32 c2 = v2->clip; uint32 c2 = v2->clip;
if (c0 & c1 & c2) if (c0 & c1 & c2 & CLIP_DISCARD)
continue; continue;
if ((c0 | c1 | c2) & CLIP_MASK_VP) { if ((c0 | c1 | c2) & CLIP_FRAME) {
flags |= FACE_CLIPPED; flags |= FACE_CLIPPED;
} }
flags |= FACE_TRIANGLE; flags |= FACE_TRIANGLE;
@ -696,7 +707,7 @@ void flush_c()
if (type > FACE_TYPE_F) if (type > FACE_TYPE_F)
{ {
const Texture &tex = level.textures[flags & FACE_TEXTURE]; const Texture &tex = level.textures[flags & FACE_TEXTURE];
gTile = (uint8*)tex.tile; gTile = (ColorIndex*)tex.tile;
v[0].t.t = 0xFF00FF00 & (tex.uv01); v[0].t.t = 0xFF00FF00 & (tex.uv01);
v[1].t.t = 0xFF00FF00 & (tex.uv01 << 8); v[1].t.t = 0xFF00FF00 & (tex.uv01 << 8);
@ -730,7 +741,7 @@ void flush_c()
if (type == FACE_TYPE_SPRITE) if (type == FACE_TYPE_SPRITE)
{ {
const Sprite &sprite = level.sprites[flags & FACE_TEXTURE]; const Sprite &sprite = level.sprites[flags & FACE_TEXTURE];
gTile = (uint8*)sprite.tile; gTile = (ColorIndex*)sprite.tile;
v[0].t.t = (sprite.uwvh) & (0xFF00FF00); v[0].t.t = (sprite.uwvh) & (0xFF00FF00);
v[1].t.t = (sprite.uwvh) & (0xFF00FF00 >> 8); v[1].t.t = (sprite.uwvh) & (0xFF00FF00 >> 8);
} }