mirror of https://github.com/XProger/OpenLara.git synced 2025-03-13 23:59:41 +01:00

#368 micro optimizations of rasterizer, remove per-pixel texturing (unused)

This commit is contained in:
XProger 2022-02-12 06:14:47 +03:00
parent 401c854209
commit b6df8a2348
16 changed files with 221 additions and 348 deletions

View File

@ -1,8 +1,6 @@
.section .iwram
.arm
#define TEX_2PX // lazy texturing, comment out for per-pixel
#define FRAME_WIDTH 240
#define FRAME_HEIGHT 160
@ -45,9 +43,11 @@
.equ CLIP_MASK_VP, (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM)
.equ CLIP_MASK, (CLIP_MASK_VP + CLIP_FAR + CLIP_NEAR)
.equ FACE_TEXTURE_BITS, 13
.equ FACE_TRIANGLE, (1 << FACE_TEXTURE_BITS)
.equ FACE_TEXTURE_BITS, 14
.equ FACE_TEXTURE, ((1 << FACE_TEXTURE_BITS) - 1)
.equ FACE_GOURAUD, (2 << FACE_TYPE_SHIFT)
.equ FACE_CLIPPED, (1 << 18)
.equ FACE_TRIANGLE, (1 << 19)
.equ FACE_FLAGS, 0
.equ FACE_NEXT, 4
@ -67,9 +67,6 @@
.equ FACE_TYPE_LINE_H, (8 << FACE_TYPE_SHIFT)
.equ FACE_TYPE_LINE_V, (9 << FACE_TYPE_SHIFT)
.equ FACE_GOURAUD, (2 << FACE_TYPE_SHIFT)
.equ FACE_CLIPPED, (1 << 18)
.equ FIXED_SHIFT, 14
.equ PROJ_SHIFT, 4
.equ OT_SHIFT, 4
@ -90,6 +87,7 @@
.equ MAX_CAUSTICS, 32
.equ MAX_RAND_TABLE, 32
.equ MAX_ANIM_TEX, 128
.equ MIN_INT32, 0x80000000
.equ MAX_INT32, 0x7FFFFFFF
@ -110,6 +108,17 @@
ble \skip
.endm
.macro scaleUV uv, u, v, f
asr \u, \uv, #16
mul \u, \f // u = f * int16(uv >> 16)
lsl \v, \uv, #16
asr \v, #16
mul \v, \f // v = f * int16(uv)
lsr \u, #16
lsl \u, #16
orr \uv, \u, \v, lsr #16 // uv = (u & 0xFFFF0000) | (v >> 16)
.endm
.macro tex index, uv
and \index, \uv, #0xFF00
orr \index, \uv, lsr #24 // index = v * 256 + u
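
For reference, a minimal C sketch of what the new scaleUV macro and the tex macro above compute; the function names and fixed-point types are illustrative, not from the source (packed uv keeps u in the high half-word and v in the low one):

#include <stdint.h>

// uv packs two 8.8 fixed-point coordinates: u in bits 16..31, v in bits 0..15.
static inline uint32_t scaleUV(uint32_t uv, int32_t f)
{
    int32_t u = (int16_t)(uv >> 16) * f;     // u = f * int16(uv >> 16)
    int32_t v = (int16_t)(uv & 0xFFFF) * f;  // v = f * int16(uv)
    return (u & 0xFFFF0000) | ((uint32_t)v >> 16);
}

// Texel index into a 256x256 tile: index = v * 256 + u.
static inline uint32_t tex(uint32_t uv)
{
    return (uv & 0xFF00) | (uv >> 24);
}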

View File

@ -89,6 +89,8 @@ faceAddMeshTriangles_asm:
orr vp1, vp0, vp1, lsl #(16 - 3)
mov vp2, vp2, lsr #3
orr flags, #FACE_TRIANGLE
ldr ot, =gOT
ldr next, [ot, depth, lsl #2]
str face, [ot, depth, lsl #2]
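
The two added lines above push the face onto its ordering-table bucket. A C sketch of the same linked-list push, assuming a hypothetical Face layout and bucket count (only the flags/next handling is shown; the vp stores sit outside this hunk):

#include <stdint.h>

typedef struct Face { struct Face* next; uint32_t flags; } Face; // assumed layout

#define FACE_TRIANGLE (1 << 19)
static Face* gOT[1024]; // one bucket per depth slot; size is illustrative

static void otPush(Face* face, uint32_t flags, int32_t depth)
{
    face->flags = flags | FACE_TRIANGLE; // orr flags, #FACE_TRIANGLE
    face->next  = gOT[depth];            // ldr next, [ot, depth, lsl #2]
    gOT[depth]  = face;                  // str face, [ot, depth, lsl #2]
}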

View File

@ -96,6 +96,8 @@ faceAddRoomTriangles_asm:
orr vp1, vp0, vp1, lsl #(16 - 3)
mov vp2, vp2, lsr #3
orr flags, #FACE_TRIANGLE
ldr ot, =gOT
ldr next, [ot, depth, lsl #2]
str face, [ot, depth, lsl #2]

View File

@ -25,6 +25,7 @@ verticesBase .req vZG
facesBase .req vZG
vertex .req vZG
texture .req tmp
texAnim .req vXY
texIndex .req tmp
texTile .req tmp
sprite .req tmp
@ -121,8 +122,12 @@ flush_asm:
.set_texture:
mov texIndex, flags, lsl #(32 - FACE_TEXTURE_BITS)
//cmp texIndex, #(MAX_ANIM_TEX << (32 - FACE_TEXTURE_BITS)) // TODO split into animated and static texture arrays
add texIndex, texIndex, texIndex, lsl #1
add texture, TEXTURES, texIndex, lsr #(32 - FACE_TEXTURE_BITS - 2)
//addge texture, TEXTURES, texIndex, lsr #(32 - FACE_TEXTURE_BITS - 2)
//ldrlt texAnim, =gAnimTextures
//addlt texture, texAnim, texIndex, lsr #(32 - FACE_TEXTURE_BITS - 2)
ldmia texture, {texTile, uv01, uv23}
str texTile, [TILE]
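
The shift dance above extracts the 14-bit texture index from flags and scales it to a 12-byte record offset (three words) without a multiply. A C sketch under the assumption that each record is {tile, uv01, uv23}:

#include <stdint.h>

#define FACE_TEXTURE_BITS 14

typedef struct { uint32_t tile, uv01, uv23; } TexRecord; // 12 bytes, assumed layout

static const TexRecord* getTexture(const uint8_t* textures, uint32_t flags)
{
    uint32_t i = flags << (32 - FACE_TEXTURE_BITS);       // low 14 bits -> top, drops other flags
    i += i << 1;                                          // i *= 3 (three words per record)
    uint32_t offset = i >> (32 - FACE_TEXTURE_BITS - 2);  // back down, *4 -> byte offset (index * 12)
    return (const TexRecord*)(textures + offset);
}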

View File

@ -1,11 +1,11 @@
#include "common_asm.inc"
flags .req r0
top .req r1
y .req r2
width .req r3
pixel .req flags
L .req r1
R .req r2
y .req r3
type .req r12
pixel .req flags
.extern rasterizeS_asm
.extern rasterizeF_asm
@ -22,22 +22,19 @@ type .req r12
.global rasterize_asm
rasterize_asm:
and type, flags, #FACE_TYPE_MASK
cmp type, #FACE_TYPE_F
streqb flags, [top, #VERTEX_CLIP]
andeq R, flags, #0xFF // R = face color for FACE_TYPE_F
movne R, L // R = L otherwise
ldr pixel, =fb
ldr pixel, [pixel]
ldrsh y, [top, #VERTEX_Y]
ldrsh y, [L, #VERTEX_Y]
#if (FRAME_WIDTH == 240) // pixel += (y * 16 - y) * 16
// pixel += y * 240 -> (y * 16 - y) * 16
rsb y, y, y, lsl #4
add pixel, pixel, y, lsl #4
#else
mov width, #FRAME_WIDTH
mla pixel, y, width, pixel
#endif
mov r2, top
add pc, type, lsr #(FACE_TYPE_SHIFT - 2)
nop
b rasterizeS_asm
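
Two tricks here are worth spelling out: the FRAME_WIDTH == 240 path computes the row offset without a multiply, and the final add pc jumps into the branch table that starts at rasterizeS_asm. A worked C equivalent of the row offset:

#include <stdint.h>

// 240 == 15 * 16, so y * 240 == ((y << 4) - y) << 4:
//   rsb y, y, y, lsl #4          -> y = y*16 - y
//   add pixel, pixel, y, lsl #4
static inline int32_t rowOffset240(int32_t y)
{
    return ((y << 4) - y) << 4;
}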

View File

@ -2,7 +2,7 @@
pixel .req r0
L .req r1
R .req r2
color .req r2
index .req r3
Lh .req r4
Rh .req r5
@ -15,6 +15,7 @@ tmp .req r11
DIVLUT .req r12
width .req lr
R .req color
h .req N
Rxy .req tmp
Ry2 .req Rh
@ -22,19 +23,16 @@ Lxy .req tmp
Ly2 .req Lh
LMAP .req Lx
pair .req DIVLUT
blocks .req DIVLUT
.global rasterizeF_asm
rasterizeF_asm:
stmfd sp!, {r4-r11, lr}
mov LMAP, #LMAP_ADDR
// TODO use ldrh, swap g and clip
add LMAP, color, #LMAP_ADDR
ldrb tmp, [L, #VERTEX_G]
ldrb index, [L, #VERTEX_CLIP]
orr tmp, index, tmp, lsl #8 // tmp = index | (L->v.g << 8)
ldrb index, [LMAP, tmp] // index = lightmap[tmp]
ldrb index, [LMAP, tmp, lsl #8] // index = lightmap[color + L->v.g * 256]
mov R, L
mov Lh, #0 // Lh = 0
mov Rh, #0 // Rh = 0
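
Folding the color into the lightmap base (add LMAP, color, #LMAP_ADDR) removes the orr that used to pack index | (g << 8) before the lookup. A C sketch, with the lightmap base passed in rather than hardwired to LMAP_ADDR:

#include <stdint.h>

static inline uint8_t litFlatColor(const uint8_t* lightmap, uint32_t color, uint32_t g)
{
    const uint8_t* lmap = lightmap + color;  // add LMAP, color, #LMAP_ADDR
    return lmap[g << 8];                     // == lightmap[(g << 8) + color]
}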

View File

@ -17,6 +17,8 @@ Lt .req r11
Rt .req r12
h .req lr
ptr .req tmp
Ldx .req h
Rdx .req h
@ -41,14 +43,14 @@ duv .req R
du .req L
dv .req R
Lduv .req h
Ldu .req N
Ldv .req h
Rduv .req h
Rdu .req N
Rdv .req h
Rti .req indexB
sLdx .req tmp
sLdt .req N
sRdx .req Lh
@ -58,32 +60,25 @@ SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12
SP_L = 16
SP_R = 20
SP_LH = 24
SP_RH = 28
SP_SIZE = 32
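
The four new slots give L, R, Lh, Rh fixed stack homes for the FT/FTA variants, replacing the stmfd/ldmfd pair the outer loop used to pay on every edge change. Viewed as a C struct (illustrative only; pointers are 32-bit on GBA):

#include <stdint.h>

typedef struct {
    int32_t  Ldx, Ldt, Rdx, Rdt; // SP_LDX..SP_RDT: per-edge steps
    uint32_t L, R;               // SP_L, SP_R: current edge vertex pointers
    int32_t  Lh, Rh;             // SP_LH, SP_RH: remaining edge heights
} Frame;                         // sizeof(Frame) == SP_SIZE == 32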
.macro PUT_PIXELS
tex indexA, t
lit indexA
#ifndef TEX_2PX
add t, dtdx
tex indexB, t
lit indexB
add t, dtdx
orr indexA, indexB, lsl #8
strh indexA, [tmp], #2
#else
add t, dtdx, lsl #1
//orr indexA, indexA, lsl #8
strb indexA, [tmp], #2 // writing a byte to GBA VRAM will write a half word for free
#endif
strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
.endm
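
With the per-pixel branch gone, PUT_PIXELS samples one texel per 2-pixel pair and leans on GBA VRAM semantics: a byte store is duplicated into both bytes of the addressed half-word. A C sketch of the surviving path (tile and lmap pointers are assumptions; the caller advances ptr by 2):

#include <stdint.h>

static inline void putPixels2(volatile uint8_t* ptr, uint32_t* t, uint32_t dtdx,
                              const uint8_t* tile, const uint8_t* lmap)
{
    uint32_t uv = *t;
    uint8_t index = lmap[tile[(uv & 0xFF00) | (uv >> 24)]]; // tex + lit
    *t += dtdx << 1;  // step two pixels at once (lazy texturing)
    *ptr = index;     // on GBA the byte write fills the whole half-word
}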
.global rasterizeFT_asm
rasterizeFT_asm:
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
mov LMAP, #LMAP_ADDR
ldrb tmp, [L, #VERTEX_G]
@ -125,16 +120,9 @@ rasterizeFT_asm:
mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
str Ldx, [sp, #SP_LDX] // store Ldx to stack
ldr Lduv, [L, #VERTEX_T]
sub Lduv, Lt // Lduv = N->v.t - Lt
asr Ldu, Lduv, #16
mul Ldu, tmp // Ldu = tmp * int16(Lduv >> 16)
lsl Ldv, Lduv, #16
asr Ldv, #16
mul Ldv, tmp // Ldv = tmp * int16(Lduv)
lsr Ldu, #16
lsl Ldu, #16
orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Ldu & 0xFFFF0000) | (Ldv >> 16)
ldr Ldt, [L, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack
.calc_left_end:
@ -166,16 +154,9 @@ rasterizeFT_asm:
mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
str Rdx, [sp, #SP_RDX] // store Rdx to stack
ldr Rduv, [R, #VERTEX_T]
sub Rduv, Rt // Rduv = N->v.t - Rt
asr Rdu, Rduv, #16
mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
lsl Rdv, Rduv, #16
asr Rdv, #16
mul Rdv, tmp // Rdv = tmp * int16(Rduv)
lsr Rdu, #16
lsl Rdu, #16
orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
ldr Rdt, [R, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack
.calc_right_end:
@ -185,44 +166,36 @@ rasterizeFT_asm:
sub Lh, h // Lh -= h
sub Rh, h // Rh -= h
stmfd sp!, {L,R,Lh,Rh} // sp-16
add tmp, sp, #SP_L
stmia tmp, {L, R, Lh, Rh}
.scanline_start:
asr tmp, Lx, #16 // x1 = (Lx >> 16)
rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add tmp, pixel, tmp // tmp = pixel + x1
add ptr, pixel, tmp // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
sub duv, Rt, Lt // duv = Rt - Lt
asr du, duv, #16
mul du, inv // du = inv * int16(duv >> 16)
lsl dv, duv, #16
asr dv, #16
mul dv, inv // dv = inv * int16(duv)
lsr du, #16
lsl du, #16
orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv
mov t, Lt // t = Lt
// 2-byte alignment (VRAM write requirement)
.align_left:
tst tmp, #1 // if (tmp & 1)
tst ptr, #1 // if (ptr & 1)
beq .align_right
ldrb indexB, [tmp, #-1]! // read pal index from VRAM (byte)
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
ldrb indexA, [TILE, indexA]
ldrb indexA, [LMAP, indexA]
tex indexA, t
lit indexA
ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
orr indexB, indexA, lsl #8
strh indexB, [tmp], #2
strh indexB, [ptr], #2
add t, dtdx
subs width, #1 // width--
@ -231,19 +204,15 @@ rasterizeFT_asm:
.align_right:
tst width, #1
beq .align_block_4px
ldrb indexB, [tmp, width]
sub Rti, Rt, dtdx
tex indexA, Rti
lit indexA
ldrb indexB, [ptr, width]
subs width, #1 // width--
sub Rt, dtdx
and indexA, Rt, #0xFF00
orr indexA, Rt, lsr #24 // res = (t & 0xFF00) | (t >> 24)
add Rt, dtdx
ldrb indexA, [TILE, indexA]
ldrb indexA, [LMAP, indexA]
orr indexB, indexA, indexB, lsl #8
strh indexB, [tmp, width]
strh indexB, [ptr, width]
beq .scanline_end // if (width == 0)
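
Both edge fixups are read-modify-write: VRAM only takes half-word stores intact, so the byte belonging to the neighboring pixel is read back and merged before the strh. A C sketch of the left-edge case (the right edge mirrors it at ptr + width):

#include <stdint.h>

// Write pixel `index` at an odd VRAM address without clobbering the
// even neighbor byte that shares its half-word.
static inline void writeOddPixel(volatile uint8_t* ptr /* odd address */, uint8_t index)
{
    uint8_t neighbor = ptr[-1];               // ldrb indexB, [ptr, #-1]!
    uint16_t pair = neighbor | (index << 8);  // orr indexB, indexA, lsl #8
    *(volatile uint16_t*)(ptr - 1) = pair;    // strh indexB, [ptr], #2
}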
@ -276,8 +245,7 @@ rasterizeFT_asm:
bne .scanline_block_8px
.scanline_end:
add tmp, sp, #16
ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
ldmia sp, {sLdx, sLdt, sRdx, sRdt}
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
@ -288,9 +256,10 @@ rasterizeFT_asm:
subs h, #1
bne .scanline_start
ldmfd sp!, {L,R,Lh,Rh} // sp+16
add tmp, sp, #SP_L
ldmia tmp, {L, R, Lh, Rh}
b .loop
.exit:
add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
ldmfd sp!, {r4-r11, pc}

View File

@ -17,6 +17,8 @@ Lt .req r11
Rt .req r12
h .req lr
ptr .req tmp
Ldx .req h
Rdx .req h
@ -41,14 +43,14 @@ duv .req R
du .req L
dv .req R
Lduv .req h
Ldu .req N
Ldv .req h
Rduv .req h
Rdu .req N
Rdv .req h
Rti .req indexB
sLdx .req tmp
sLdt .req N
sRdx .req Lh
@ -58,38 +60,25 @@ SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12
SP_L = 16
SP_R = 20
SP_LH = 24
SP_RH = 28
SP_SIZE = 32
.macro PUT_PIXELS
#ifndef TEX_2PX
tex indexA, t
add t, dtdx
tex indexB, t
add t, dtdx
// cheap, non-accurate alpha test: skip the pixel pair if one or both texels are transparent
ands indexA, #255
andnes indexB, #255
orrne indexB, indexA, indexB, lsl #8 // indexB = indexA | (indexB << 8)
ldrneb indexA, [LMAP, indexA]
ldrneb indexB, [LMAP, indexB, lsr #8]
orrne indexA, indexB, lsl #8
strneh indexA, [tmp]
add tmp, #2
#else
tex indexA, t
add t, dtdx, lsl #1
cmp indexA, #0
ldrneb indexA, [LMAP, indexA]
strneb indexA, [tmp]
add tmp, #2
#endif
strneb indexA, [ptr]
add ptr, #2
.endm
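
The alpha-tested variant keeps the same one-texel-per-pair scheme but skips the write when the texel is palette index 0 (transparent), so the background half-word survives. A C sketch of the kept TEX_2PX branch (caller advances ptr by 2):

#include <stdint.h>

static inline void putPixels2A(volatile uint8_t* ptr, uint32_t* t, uint32_t dtdx,
                               const uint8_t* tile, const uint8_t* lmap)
{
    uint32_t uv = *t;
    uint8_t index = tile[(uv & 0xFF00) | (uv >> 24)]; // tex
    *t += dtdx << 1;
    if (index)                // index 0 is transparent: leave VRAM untouched
        *ptr = lmap[index];   // lit, mirrored into both bytes by the bus
}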
.global rasterizeFTA_asm
rasterizeFTA_asm:
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
mov LMAP, #LMAP_ADDR
ldrb tmp, [L, #VERTEX_G]
@ -131,16 +120,9 @@ rasterizeFTA_asm:
mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
str Ldx, [sp, #SP_LDX] // store Ldx to stack
ldr Lduv, [L, #VERTEX_T]
sub Lduv, Lt // Lduv = N->v.t - Lt
asr Ldu, Lduv, #16
mul Ldu, tmp // Ldu = tmp * int16(Lduv >> 16)
lsl Ldv, Lduv, #16
asr Ldv, #16
mul Ldv, tmp // Ldv = tmp * int16(Lduv)
lsr Ldu, #16
lsl Ldu, #16
orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Ldu & 0xFFFF0000) | (Ldv >> 16)
ldr Ldt, [L, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack
.calc_left_end:
@ -172,16 +154,9 @@ rasterizeFTA_asm:
mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
str Rdx, [sp, #SP_RDX] // store Rdx to stack
ldr Rduv, [R, #VERTEX_T]
sub Rduv, Rt // Rduv = N->v.t - Rt
asr Rdu, Rduv, #16
mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
lsl Rdv, Rduv, #16
asr Rdv, #16
mul Rdv, tmp // Rdv = tmp * int16(Rduv)
lsr Rdu, #16
lsl Rdu, #16
orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
ldr Rdt, [R, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack
.calc_right_end:
@ -191,46 +166,38 @@ rasterizeFTA_asm:
sub Lh, h // Lh -= h
sub Rh, h // Rh -= h
stmfd sp!, {L,R,Lh,Rh} // sp-16
add tmp, sp, #SP_L
stmia tmp, {L, R, Lh, Rh}
.scanline_start:
asr tmp, Lx, #16 // x1 = (Lx >> 16)
rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add tmp, pixel, tmp // tmp = pixel + x1
add ptr, pixel, tmp // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
sub duv, Rt, Lt // duv = Rt - Lt
asr du, duv, #16
mul du, inv // du = inv * int16(duv >> 16)
lsl dv, duv, #16
asr dv, #16
mul dv, inv // dv = inv * int16(duv)
lsr du, #16
lsl du, #16
orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv
mov t, Lt // t = Lt
// 2-byte alignment (VRAM write requirement)
.align_left:
tst tmp, #1 // if (tmp & 1)
tst ptr, #1 // if (ptr & 1)
beq .align_right
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
ldrb indexA, [TILE, indexA]
tex indexA, t
cmp indexA, #0
ldrneb indexB, [tmp, #-1]! // read pal index from VRAM (byte)
ldrneb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
ldrneb indexA, [LMAP, indexA]
orrne indexB, indexA, lsl #8
strneh indexB, [tmp], #2
addeq tmp, #1
strneh indexB, [ptr], #2
addeq ptr, #1
add t, dtdx
subs width, #1 // width--
@ -240,17 +207,14 @@ rasterizeFTA_asm:
tst width, #1
beq .align_block_4px
sub Rt, dtdx
and indexA, Rt, #0xFF00
orr indexA, Rt, lsr #24 // res = (t & 0xFF00) | (t >> 24)
add Rt, dtdx
ldrb indexA, [TILE, indexA]
sub Rti, Rt, dtdx
tex indexA, Rti
cmp indexA, #0
ldrneb indexA, [LMAP, indexA]
ldrneb indexB, [tmp, width]
ldrneb indexB, [ptr, width]
orrne indexB, indexA, indexB, lsl #8
addne indexA, tmp, width
addne indexA, ptr, width
strneh indexB, [indexA, #-1]
subs width, #1 // width--
@ -285,8 +249,7 @@ rasterizeFTA_asm:
bne .scanline_block_8px
.scanline_end:
add tmp, sp, #16
ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
ldmia sp, {sLdx, sLdt, sRdx, sRdt}
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
@ -297,9 +260,10 @@ rasterizeFTA_asm:
subs h, #1
bne .scanline_start
ldmfd sp!, {L,R,Lh,Rh} // sp+16
add tmp, sp, #SP_L
ldmia tmp, {L, R, Lh, Rh}
b .loop
.exit:
add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
ldmfd sp!, {r4-r11, pc}

View File

@ -35,7 +35,7 @@ Ldt .req h
Rdt .req h
indexA .req Lh
indexB .req Rh
indexB .req tmp
Rxy .req tmp
Ry2 .req Rh
@ -47,23 +47,19 @@ DIVLUT .req N
DIVLUTi .req tmp
ptr .req Lx
width .req Rx
width .req Rh
g .req Lg
dgdx .req Rg
dgdx .req L
t .req Lt
dtdx .req Rt
duv .req R
dtdx .req R
du .req L
dv .req R
Lduv .req N
Ldu .req TILE
Ldv .req N
Rduv .req N
Rdu .req TILE
Rdv .req N
@ -75,7 +71,7 @@ sLdg .req R
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req N // not used in ldm due to h collision
sRdt .req tmp // not enough regs for one ldmia
SP_LDX = 0
SP_LDG = 4
@ -83,6 +79,12 @@ SP_LDT = 8
SP_RDX = 12
SP_RDG = 16
SP_RDT = 20
SP_L = 24
SP_R = 28
SP_LH = 32
SP_RH = 36
SP_SIZE = 40
SP_TILE = SP_SIZE
.macro PUT_PIXELS
bic LMAP, g, #255
@ -91,28 +93,18 @@ SP_RDT = 20
tex indexA, t
lit indexA
#ifndef TEX_2PX
add t, dtdx
tex indexB, t
lit indexB
add t, dtdx
orr indexA, indexB, lsl #8
strh indexA, [ptr], #2
#else
add t, dtdx, lsl #1
//orr indexA, indexA, lsl #8
strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
#endif
.endm
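
The gouraud variant re-banks the lightmap before each pair: g carries the lightmap address with 8 fractional light bits in the low byte, and bic strips them to select a 256-entry bank (the add g, dgdx step sits in the context lines outside this hunk). A C sketch with the base kept separate for readability:

#include <stdint.h>

static inline void putPixelsG2(volatile uint8_t* ptr, uint32_t* t, uint32_t dtdx,
                               uint32_t* g, int32_t dgdx,
                               const uint8_t* tile, const uint8_t* lightmap)
{
    const uint8_t* lmap = lightmap + (*g & ~255u); // bic LMAP, g, #255
    *g += dgdx;                                    // advance light across the pair
    uint32_t uv = *t;
    *t += dtdx << 1;
    *ptr = lmap[tile[(uv & 0xFF00) | (uv >> 24)]]; // tex + lit, mirrored byte write
}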
.global rasterizeGT_asm
rasterizeGT_asm:
stmfd sp!, {r4-r11, lr}
sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
ldr r3, =gTile
ldr r3, [r3]
stmfd sp!, {r3-r11, lr}
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
mov Lh, #0 // Lh = 0
mov Rh, #0 // Rh = 0
@ -155,16 +147,9 @@ rasterizeGT_asm:
asr Ldg, #8 // 8-bit for fractional part
str Ldg, [sp, #SP_LDG] // store Ldg to stack
ldr Lduv, [L, #VERTEX_T]
sub Lduv, Lt // Lduv = N->v.t - Lt
asr Ldu, Lduv, #16
mul Ldu, tmp // Ldu = tmp * int16(Lduv >> 16)
lsl Ldv, Lduv, #16
asr Ldv, #16
mul Ldv, tmp // Ldv = tmp * int16(Lduv)
lsr Ldu, #16
lsl Ldu, #16
orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Ldu & 0xFFFF0000) | (Ldv >> 16)
ldr Ldt, [L, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack
.calc_left_end:
@ -204,16 +189,9 @@ rasterizeGT_asm:
asr Rdg, #8 // 8-bit for fractional part
str Rdg, [sp, #SP_RDG] // store Rdg to stack
ldr Rduv, [R, #VERTEX_T]
sub Rduv, Rt // Rduv = N->v.t - Rt
asr Rdu, Rduv, #16
mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
lsl Rdv, Rduv, #16
asr Rdv, #16
mul Rdv, tmp // Rdv = tmp * int16(Rduv)
lsr Rdu, #16
lsl Rdu, #16
orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
ldr Rdt, [R, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack
.calc_right_end:
@ -226,54 +204,44 @@ rasterizeGT_asm:
sub Lh, h // Lh -= h
sub Rh, h // Rh -= h
ldr TILE, =gTile
ldr TILE, [TILE]
ldr TILE, [sp, #SP_TILE]
stmfd sp!, {L,R,Lh,Rh} // sp-16
add tmp, sp, #SP_L
stmia tmp, {L, R, Lh, Rh}
.scanline_start:
stmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp-24
stmfd sp!, {Lx, Lg, Lt}
asr tmp, Lx, #16 // x1 = (Lx >> 16)
rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
asr Lx, Lx, #16 // x1 = (Lx >> 16)
rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add ptr, pixel, tmp // ptr = pixel + x1
add ptr, pixel, Lx // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv
// t == Lt (alias)
sub dgdx, Rg, Lg // dgdx = Rg - Lg
mul dgdx, inv // dgdx *= FixedInvU(width)
asr dgdx, #15 // dgdx >>= 15
// g == Lg (alias)
sub duv, Rt, Lt // duv = Rt - Lt
asr du, duv, #16
mul du, inv // du = inv * int16(duv >> 16)
lsl dv, duv, #16
asr dv, #16
mul dv, inv // dv = inv * int16(duv)
lsr du, #16
lsl du, #16
orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
// t == Lt (alias)
// 2-byte alignment (VRAM write requirement)
.align_left:
tst ptr, #1 // if (ptr & 1)
beq .align_right
ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
bic LMAP, g, #255
add g, dgdx, asr #1
tex indexA, t
lit indexA
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
ldrb indexA, [TILE, indexA]
ldrb indexA, [LMAP, indexA]
ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
orr indexB, indexA, lsl #8
strh indexB, [ptr], #2
add t, dtdx
@ -284,21 +252,16 @@ rasterizeGT_asm:
.align_right:
tst width, #1
beq .align_block_4px
ldrb indexB, [ptr, width]
subs width, #1 // width--
sub Rti, Rt, dtdx
tex indexA, Rti
mla Rti, width, dtdx, t // Rti = width * dtdx + t
and indexA, Rti, #0xFF00
orr indexA, Rti, lsr #24 // res = (t & 0xFF00) | (t >> 24)
ldrb indexA, [TILE, indexA]
asr Rgi, dgdx, #1
mla Rgi, width, Rgi, g // Rgi = width * (dgdx / 2) + g
sub Rgi, Rg, dgdx, asr #1
bic LMAP, Rgi, #255
lit indexA
ldrb indexA, [LMAP, indexA]
ldrb indexB, [ptr, width]
subs width, #1 // width--
orr indexB, indexA, indexB, lsl #8
strh indexB, [ptr, width]
@ -333,10 +296,9 @@ rasterizeGT_asm:
bne .scanline_block_8px
.scanline_end:
ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24
ldmfd sp!, {Lx, Lg, Lt}
add tmp, sp, #16
ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}
ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg}
add Lx, sLdx
add Lg, sLdg
@ -344,17 +306,18 @@ rasterizeGT_asm:
add Rx, sRdx
add Rg, sRdg
ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
ldr sRdt, [sp, #SP_RDT]
add Rt, sRdt
add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)
subs h, #1
bne .scanline_start
ldmfd sp!, {L,R,Lh,Rh} // sp+16
add tmp, sp, #SP_L
ldmia tmp, {L, R, Lh, Rh}
b .loop
.exit:
add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE]
ldmfd sp!, {r4-r11, pc}

View File

@ -35,7 +35,7 @@ Ldt .req h
Rdt .req h
indexA .req Lh
indexB .req Rh
indexB .req tmp
Rxy .req tmp
Ry2 .req Rh
@ -47,23 +47,21 @@ DIVLUT .req N
DIVLUTi .req tmp
ptr .req Lx
width .req Rx
width .req Rh
g .req Lg
dgdx .req Rg
dgdx .req L
t .req Lt
dtdx .req Rt
dtdx .req R
duv .req R
du .req L
dv .req R
Lduv .req N
Ldu .req TILE
Ldv .req N
Rduv .req N
Rdu .req TILE
Rdv .req N
@ -75,7 +73,7 @@ sLdg .req R
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req N // not used in ldm due to h collision
sRdt .req tmp // not enough regs for one ldmia
SP_LDX = 0
SP_LDG = 4
@ -83,41 +81,32 @@ SP_LDT = 8
SP_RDX = 12
SP_RDG = 16
SP_RDT = 20
SP_L = 24
SP_R = 28
SP_LH = 32
SP_RH = 36
SP_SIZE = 40
SP_TILE = SP_SIZE
.macro PUT_PIXELS
bic LMAP, g, #255
add g, dgdx
#ifndef TEX_2PX
tex indexA, t
add t, dtdx
tex indexB, t
add t, dtdx
// cheap, non-accurate alpha test: skip the pixel pair if one or both texels are transparent
ands indexA, #255
andnes indexB, #255
orrne indexB, indexA, indexB, lsl #8 // indexB = indexA | (indexB << 8)
ldrneb indexA, [LMAP, indexA]
ldrneb indexB, [LMAP, indexB, lsr #8]
orrne indexA, indexB, lsl #8
strneh indexA, [ptr]
#else
tex indexA, t
add t, dtdx, lsl #1
cmp indexA, #0
ldrneb indexA, [LMAP, indexA]
strneb indexA, [ptr]
#endif
add ptr, #2
.endm
.global rasterizeGTA_asm
rasterizeGTA_asm:
stmfd sp!, {r4-r11, lr}
sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
ldr r3, =gTile
ldr r3, [r3]
stmfd sp!, {r3-r11, lr}
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
mov Lh, #0 // Lh = 0
mov Rh, #0 // Rh = 0
@ -160,16 +149,9 @@ rasterizeGTA_asm:
asr Ldg, #8 // 8-bit for fractional part
str Ldg, [sp, #SP_LDG] // store Ldg to stack
ldr Lduv, [L, #VERTEX_T]
sub Lduv, Lt // Lduv = N->v.t - Lt
asr Ldu, Lduv, #16
mul Ldu, tmp // Ldu = tmp * int16(Lduv >> 16)
lsl Ldv, Lduv, #16
asr Ldv, #16
mul Ldv, tmp // Ldv = tmp * int16(Lduv)
lsr Ldu, #16
lsl Ldu, #16
orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Ldu & 0xFFFF0000) | (Ldv >> 16)
ldr Ldt, [L, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack
.calc_left_end:
@ -209,16 +191,9 @@ rasterizeGTA_asm:
asr Rdg, #8 // 8-bit for fractional part
str Rdg, [sp, #SP_RDG] // store Rdg to stack
ldr Rduv, [R, #VERTEX_T]
sub Rduv, Rt // Rduv = N->v.t - Rt
asr Rdu, Rduv, #16
mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
lsl Rdv, Rduv, #16
asr Rdv, #16
mul Rdv, tmp // Rdv = tmp * int16(Rduv)
lsr Rdu, #16
lsl Rdu, #16
orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
ldr Rdt, [R, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack
.calc_right_end:
@ -231,48 +206,39 @@ rasterizeGTA_asm:
sub Lh, h // Lh -= h
sub Rh, h // Rh -= h
ldr TILE, =gTile
ldr TILE, [TILE]
ldr TILE, [sp, #SP_TILE]
stmfd sp!, {L,R,Lh,Rh} // sp-16
add tmp, sp, #SP_L
stmia tmp, {L, R, Lh, Rh}
.scanline_start:
stmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp-24
stmfd sp!, {Lx, Lg, Lt}
asr tmp, Lx, #16 // x1 = (Lx >> 16)
rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
asr Lx, Lx, #16 // x1 = (Lx >> 16)
rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add ptr, pixel, tmp // ptr = pixel + x1
add ptr, pixel, Lx // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv
// t == Lt (alias)
sub dgdx, Rg, Lg // dgdx = Rg - Lg
mul dgdx, inv // dgdx *= FixedInvU(width)
asr dgdx, #15 // dgdx >>= 15
// g == Lg (alias)
sub duv, Rt, Lt // duv = Rt - Lt
asr du, duv, #16
mul du, inv // du = inv * int16(duv >> 16)
lsl dv, duv, #16
asr dv, #16
mul dv, inv // dv = inv * int16(duv)
lsr du, #16
lsl du, #16
orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
// t == Lt (alias)
// 2-byte alignment (VRAM write requirement)
.align_left:
tst ptr, #1 // if (ptr & 1)
beq .align_right
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
ldrb indexA, [TILE, indexA]
tex indexA, t
cmp indexA, #0
beq .skip_left
@ -296,29 +262,24 @@ rasterizeGTA_asm:
tst width, #1
beq .align_block_4px
ldrb indexB, [ptr, width]
sub width, #1 // width--
mla Rti, width, dtdx, t // Rti = width * dtdx + t
and indexA, Rti, #0xFF00
orr indexA, Rti, lsr #24 // res = (t & 0xFF00) | (t >> 24)
ldrb indexA, [TILE, indexA]
sub Rti, Rt, dtdx
tex indexA, Rti
cmp indexA, #0
subeq width, #1
beq .skip_right
asr Rgi, dgdx, #1
mla Rgi, width, Rgi, g // Rgi = width * (dgdx / 2) + g
sub Rgi, Rg, dgdx, asr #1
bic LMAP, Rgi, #255
lit indexA
ldrb indexA, [LMAP, indexA]
ldrb indexB, [ptr, width]
sub width, #1 // width--
orr indexB, indexA, indexB, lsl #8
strh indexB, [ptr, width]
.skip_right:
cmp width, #0 // width--
cmp width, #0
beq .scanline_end // if (width == 0)
.align_block_4px:
@ -350,10 +311,9 @@ rasterizeGTA_asm:
bne .scanline_block_8px
.scanline_end:
ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24
ldmfd sp!, {Lx, Lg, Lt}
add tmp, sp, #16
ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}
ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg}
add Lx, sLdx
add Lg, sLdg
@ -361,17 +321,18 @@ rasterizeGTA_asm:
add Rx, sRdx
add Rg, sRdg
ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
ldr sRdt, [sp, #SP_RDT]
add Rt, sRdt
add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)
subs h, #1
bne .scanline_start
ldmfd sp!, {L,R,Lh,Rh} // sp+16
add tmp, sp, #SP_L
ldmia tmp, {L, R, Lh, Rh}
b .loop
.exit:
add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE]
ldmfd sp!, {r4-r11, pc}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -153,7 +153,8 @@ void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{
uint16 color = gLightmap[(L->v.g << 8) | L->v.clip];
uint32 color = (uint32)R;
color = gLightmap[(L->v.g << 8) | color];
color |= (color << 8);
int32 Lh = 0;
@ -163,6 +164,8 @@ void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
int32 Rx;
int32 Lx;
R = L;
while (1)
{
while (!Lh)

View File

@ -49,12 +49,12 @@ enum FaceType {
FACE_TYPE_MAX
};
#define FACE_TRIANGLE (1 << 13)
#define FACE_TRIANGLE (1 << 19)
#define FACE_CLIPPED (1 << 18)
#define FACE_TYPE_SHIFT 14
#define FACE_TYPE_MASK 15
#define FACE_GOURAUD (2 << FACE_TYPE_SHIFT)
#define FACE_TEXTURE 0x1FFF
#define FACE_TEXTURE 0x3FFF
#include "rasterizer.h"
@ -411,6 +411,7 @@ void faceAddRoomTriangles_c(const RoomTriangle* polys, int32 count)
if (g0 != g1 || g0 != g2) {
flags += FACE_GOURAUD;
}
flags |= FACE_TRIANGLE;
if (checkBackface(v0, v1, v2))
continue;
@ -487,6 +488,7 @@ void faceAddMeshTriangles_c(const MeshTriangle* polys, int32 count)
if ((c0 | c1 | c2) & CLIP_MASK_VP) {
flags |= FACE_CLIPPED;
}
flags |= FACE_TRIANGLE;
int32 depth = (v0->z + v1->z + v2->z + v2->z) >> (2 + OT_SHIFT);
@ -634,11 +636,9 @@ X_NOINLINE void rasterize_c(uint32 flags, VertexLink* top)
uint32 type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK;
if (type == FACE_TYPE_F) {
top->v.clip = flags; // use tex coord as color index for untextured polys
}
VertexLink* R = (type == FACE_TYPE_F) ? (VertexLink*)(flags & 0xFF) : top;
gRasterProc[type]((uint16*)pixel, top, top);
gRasterProc[type]((uint16*)pixel, top, R);
}
void flush_c()