1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-08-08 06:06:51 +02:00

#368 GBA rasterizer optimization

This commit is contained in:
XProger
2021-12-30 09:56:40 +03:00
parent e7cb40706e
commit 9bcc8468d0
10 changed files with 154 additions and 152 deletions

View File

@@ -72,3 +72,13 @@ MAX_INT32 = 0x7FFFFFFF
mlas tmp, vx1, vy2, tmp
bgt \skip
.endm
// tex index, uv
// Loads one byte from TILE at an offset built from uv and leaves it in index.
// The offset is (uv & 0xFF00) | (uv >> 24): bits 8..15 of uv become the high
// byte and the top byte of uv becomes the low byte.
// uv is only read. index is written before uv is read the second time, so the
// two operands must be different registers.
// TILE is defined outside this view (presumably the texture base register - confirm).
.macro tex index, uv
and \index, \uv, #0xFF00 // keep bits 8..15 of uv, i.e. v * 256
orr \index, \uv, lsr #24 // index = t.v * 256 + t.u
ldrb \index, [TILE, \index] // replace the offset with the byte stored at TILE + offset
.endm
// lit index
// Replaces index with the byte stored at LMAP + index (a one-byte table lookup).
// LMAP is defined outside this view; presumably it is the lighting/palette remap
// table base - confirm against the including file.
.macro lit index
ldrb \index, [LMAP, \index]
.endm

View File

@@ -11,11 +11,11 @@ N .req r6
Lh .req r7
Rh .req r8
Lx .req ip
Rx .req lr
Lt .req r9
Rt .req r10
h .req r11
Lx .req r9
Rx .req r10
Lt .req r11
Rt .req r12
h .req lr
Ldx .req h
Rdx .req h
@@ -49,20 +49,21 @@ Rduv .req h
Rdu .req N
Rdv .req h
sLdx .req tmp
sLdt .req N
sRdx .req Lh
sRdt .req Rh
SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12
.macro PUT_PIXELS
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
tex indexA, t
add t, dtdx
and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
tex indexB, t
add t, dtdx
// cheap non-accurate alpha test, skip pixels pair if one or both are transparent
@@ -78,7 +79,7 @@ SP_RDT = 12
.global rasterizeFTA_mode4_asm
rasterizeFTA_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
mov LMAP, #LMAP_ADDR
@@ -273,17 +274,12 @@ rasterizeFTA_mode4_asm:
bne .scanline_block_8px
.scanline_end:
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack
ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack
ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack
ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
add Rt, sRdt
add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240)
@@ -295,4 +291,4 @@ rasterizeFTA_mode4_asm:
.exit:
add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -11,11 +11,11 @@ N .req r6
Lh .req r7
Rh .req r8
Lx .req ip
Rx .req lr
Lt .req r9
Rt .req r10
h .req r11
Lx .req r9
Rx .req r10
Lt .req r11
Rt .req r12
h .req lr
Ldx .req h
Rdx .req h
@@ -49,39 +49,40 @@ Rduv .req h
Rdu .req N
Rdv .req h
sLdx .req tmp
sLdt .req N
sRdx .req Lh
sRdt .req Rh
SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12
// PUT_PIXELS: looks up texel(s) at t, remaps them through LMAP, advances t and
// stores the result to the frame buffer.
// NOTE(review): this listing appears to be a diff rendered without +/- markers.
// The open-coded and/orr/ldrb/ldrb sequences and the tex/lit calls that follow
// them look like the before/after forms of the same step, not two separate
// lookups - confirm against the real file before assembling.
.macro PUT_PIXELS
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
ldrb indexA, [LMAP, indexA]
tex indexA, t
lit indexA
#ifndef TEX_2PX
add t, dtdx
and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
ldrb indexB, [LMAP, indexB]
tex indexB, t
lit indexB
add t, dtdx
orr indexA, indexB, lsl #8 // pack the two bytes: indexB in bits 8..15, indexA below it
strh indexA, [ptr], #2 // store the packed pair as one halfword, then ptr += 2
#else
add t, dtdx, lsl #1 // advance t by two steps at once
//orr indexA, indexA, lsl #8
// NOTE(review): the store below goes through tmp, while the halfword store in
// the branch above goes through ptr - verify that tmp is really the intended
// destination register for the TEX_2PX build.
strb indexA, [tmp], #2 // writing a byte to GBA VRAM will write a half word for free
#endif
strb indexA, [tmp], #2
.endm
.global rasterizeFT_mode4_asm
rasterizeFT_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
mov LMAP, #LMAP_ADDR
@@ -273,17 +274,12 @@ rasterizeFT_mode4_asm:
bne .scanline_block_8px
.scanline_end:
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack
ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack
ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack
ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
add Rt, sRdt
add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240)
@@ -295,4 +291,4 @@ rasterizeFT_mode4_asm:
.exit:
add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -6,14 +6,15 @@ R .req r2
index .req r3
Lh .req r4
Rh .req r5
Lx .req ip
Rx .req lr
Ldx .req r6
Rdx .req r7
N .req r8
tmp .req r9
DIVLUT .req r10
width .req r11
Lx .req r6
Rx .req r7
Ldx .req r8
Rdx .req r9
N .req r10
tmp .req r11
DIVLUT .req r12
width .req lr
h .req N
Ry1 .req tmp
Ry2 .req Rh
@@ -25,7 +26,7 @@ blocks .req DIVLUT
.global rasterizeF_mode4_asm
rasterizeF_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
mov LMAP, #LMAP_ADDR
@@ -136,4 +137,4 @@ rasterizeF_mode4_asm:
b .loop
.exit:
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -70,6 +70,13 @@ Rdv .req N
Rti .req tmp
Rgi .req tmp
sLdx .req L
sLdg .req R
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req N // not used in ldm due h collision
SP_LDX = 0
SP_LDG = 4
SP_LDT = 8
@@ -81,14 +88,10 @@ SP_RDT = 20
bic LMAP, g, #255
add g, dgdx
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
tex indexA, t
add t, dtdx
and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
tex indexB, t
add t, dtdx
// cheap non-accurate alpha test, skip pixels pair if one or both are transparent
@@ -104,7 +107,7 @@ SP_RDT = 20
.global rasterizeGTA_mode4_asm
rasterizeGTA_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
mov Lh, #0 // Lh = 0
@@ -339,20 +342,14 @@ rasterizeGTA_mode4_asm:
.scanline_end:
ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}
ldr tmp, [sp, #(SP_LDG + 16)]
add Lg, tmp // Lg += Ldg from stack
ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack
ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack
ldr tmp, [sp, #(SP_RDG + 16)]
add Rg, tmp // Rg += Rdg from stack
add Lx, sLdx
add Lg, sLdg
add Lt, sLdt
add Rx, sRdx
add Rg, sRdg
ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
@@ -367,4 +364,4 @@ rasterizeGTA_mode4_asm:
.exit:
add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -70,6 +70,13 @@ Rdv .req N
Rti .req tmp
Rgi .req tmp
sLdx .req L
sLdg .req R
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req N // not used in ldm due h collision
SP_LDX = 0
SP_LDG = 4
SP_LDT = 8
@@ -81,18 +88,15 @@ SP_RDT = 20
bic LMAP, g, #255
add g, dgdx
and indexA, t, #0xFF00
orr indexA, t, lsr #24 // indexA = t.v * 256 + t.u
ldrb indexA, [TILE, indexA]
ldrb indexA, [LMAP, indexA]
tex indexA, t
lit indexA
#ifndef TEX_2PX
add t, dtdx
and indexB, t, #0xFF00
orr indexB, t, lsr #24 // indexB = t.v * 256 + t.u
ldrb indexB, [TILE, indexB]
ldrb indexB, [LMAP, indexB]
tex indexB, t
lit indexB
add t, dtdx
orr indexA, indexB, lsl #8
@@ -101,13 +105,13 @@ SP_RDT = 20
add t, dtdx, lsl #1
//orr indexA, indexA, lsl #8
strb indexA, [ptr], #2
strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
#endif
.endm
.global rasterizeGT_mode4_asm
rasterizeGT_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
mov Lh, #0 // Lh = 0
@@ -330,20 +334,14 @@ rasterizeGT_mode4_asm:
.scanline_end:
ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}
ldr tmp, [sp, #(SP_LDG + 16)]
add Lg, tmp // Lg += Ldg from stack
ldr tmp, [sp, #(SP_LDT + 16)]
add Lt, tmp // Lt += Ldt from stack
ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack
ldr tmp, [sp, #(SP_RDG + 16)]
add Rg, tmp // Rg += Rdg from stack
add Lx, sLdx
add Lg, sLdg
add Lt, sLdt
add Rx, sRdx
add Rg, sRdg
ldr tmp, [sp, #(SP_RDT + 16)]
add Rt, tmp // Rt += Rdt from stack
@@ -358,4 +356,4 @@ rasterizeGT_mode4_asm:
.exit:
add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -9,11 +9,11 @@ tmp .req r5
N .req r6
Lh .req r7
Rh .req r8
Lx .req ip
Rx .req lr
Lg .req r9
Rg .req r10
h .req r11
Lx .req r9
Rx .req r10
Lg .req r11
Rg .req r12
h .req lr
Ldx .req h
Rdx .req Ldx
Ldg .req Ldx
@@ -29,6 +29,11 @@ width .req Rh
g .req L
dgdx .req R
sLdx .req L
sLdg .req R
sRdx .req Lh
sRdg .req Rh
SP_LDX = 0
SP_LDG = 4
SP_RDX = 8
@@ -44,7 +49,7 @@ SP_RDG = 12
.global rasterizeG_mode4_asm
rasterizeG_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
sub sp, #16 // reserve stack space for [Ldx, Ldg, Rdx, Rdg]
mov tmp, #LMAP_ADDR
@@ -188,17 +193,12 @@ rasterizeG_mode4_asm:
bne .scanline_block_4px
.scanline_end:
ldr tmp, [sp, #(SP_LDX + 16)]
add Lx, tmp // Lx += Ldx from stack
ldr tmp, [sp, #(SP_LDG + 16)]
add Lg, tmp // Lg += Ldg from stack
ldr tmp, [sp, #(SP_RDX + 16)]
add Rx, tmp // Rx += Rdx from stack
ldr tmp, [sp, #(SP_RDG + 16)]
add Rg, tmp // Rg += Rdg from stack
add tmp, sp, #16
ldmia tmp, {sLdx, sLdg, sRdx, sRdg}
add Lx, sLdx
add Lg, sLdg
add Rx, sRdx
add Rg, sRdg
add pixel, #VRAM_STRIDE // pixel += FRAME_WIDTH (240)
@@ -210,4 +210,4 @@ rasterizeG_mode4_asm:
.exit:
add sp, #16 // revert reserved space for [Ldx, Ldg, Rdx, Rdg]
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -6,14 +6,14 @@ R .req r2
LMAP .req r3
Lh .req r4
Rh .req r5
Lx .req ip
Rx .req lr
Ldx .req r6
Rdx .req r7
N .req r8
tmp .req r9
DIVLUT .req r10
width .req r11
Lx .req r6
Rx .req r7
Ldx .req r8
Rdx .req r9
N .req r10
tmp .req r11
DIVLUT .req r12
width .req lr
h .req N
Ry1 .req tmp
Ry2 .req Rh
@@ -28,7 +28,7 @@ indexB .req DIVLUT
.global rasterizeS_mode4_asm
rasterizeS_mode4_asm:
stmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}
stmfd sp!, {r4-r11, lr}
ldr LMAP, .shadow_lightmap
@@ -149,4 +149,4 @@ rasterizeS_mode4_asm:
b .loop
.exit:
ldmfd sp!, {r4,r5,r6,r7,r8,r9,r10,r11,pc}
ldmfd sp!, {r4-r11, pc}

View File

@@ -71,7 +71,7 @@ transformMesh_asm:
mla y, mx, vx, y
mla y, my, vy, y
mla y, mz, vz, y
mov y, y, asr #(FIXED_SHIFT - PROJ_SHIFT)
mov y, y, asr #FIXED_SHIFT
// transform z
ldmia m!, {mx, my, mz, z}
@@ -99,21 +99,23 @@ transformMesh_asm:
mul x, dz, x
mul y, dz, y
mov x, x, asr #(16 - PROJ_SHIFT)
// keep y shifted by 16 for min/max cmp
mov y, y, asr #(16 - PROJ_SHIFT)
// viewport clipping
ldmia sp, {minXY, maxXY}
cmp x, minXY, asr #16
orrle vg, vg, #CLIP_LEFT
cmp y, minXY, lsl #16
orrle vg, vg, #CLIP_TOP
cmp x, maxXY, asr #16
orrge vg, vg, #CLIP_RIGHT
cmp y, maxXY, lsl #16
orrge vg, vg, #CLIP_BOTTOM
mov y, y, asr #16
mov minXY, minXY, lsl #16
mov maxXY, maxXY, lsl #16
cmp y, minXY, asr #16
orrle vg, vg, #CLIP_TOP
cmp y, maxXY, asr #16
orrge vg, vg, #CLIP_BOTTOM
add x, x, #(FRAME_WIDTH >> 1)
add y, y, #(FRAME_HEIGHT >> 1)

View File

@@ -85,7 +85,7 @@ transformRoom_asm:
mla y, mx, vx, y
mla y, my, vy, y
mla y, mz, vz, y
mov y, y, asr #(FIXED_SHIFT - PROJ_SHIFT)
mov y, y, asr #FIXED_SHIFT
// transform x
ldmdb m!, {mx, my, mz, x}
@@ -121,21 +121,23 @@ transformRoom_asm:
mul x, dz, x
mul y, dz, y
mov x, x, asr #(16 - PROJ_SHIFT)
// keep y shifted by 16 for min/max cmp
mov y, y, asr #(16 - PROJ_SHIFT)
// viewport clipping
ldmia sp, {m, minXY, maxXY} // preload matrix
cmp x, minXY, asr #16
orrle vg, vg, #CLIP_LEFT
cmp y, minXY, lsl #16
orrle vg, vg, #CLIP_TOP
cmp x, maxXY, asr #16
orrge vg, vg, #CLIP_RIGHT
cmp y, maxXY, lsl #16
orrge vg, vg, #CLIP_BOTTOM
mov y, y, asr #16
mov minXY, minXY, lsl #16
mov maxXY, maxXY, lsl #16
cmp y, minXY, asr #16
orrle vg, vg, #CLIP_TOP
cmp y, maxXY, asr #16
orrge vg, vg, #CLIP_BOTTOM
add x, x, #(FRAME_WIDTH >> 1)
add y, y, #(FRAME_HEIGHT >> 1)