1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-08-06 13:16:52 +02:00

#368 GBA optimization with FIQ registers

This commit is contained in:
XProger
2022-04-23 16:15:06 +03:00
parent 017446b9f7
commit e821f236a2
28 changed files with 670 additions and 686 deletions

View File

@@ -1,7 +1,7 @@
#include "common_asm.inc" #include "common_asm.inc"
v .req r0 v .req r0 // arg
q .req r1 q .req r1 // arg
min .req q min .req q
max .req r2 max .req r2

View File

@@ -1,19 +1,20 @@
#include "common_asm.inc" #include "common_asm.inc"
aabb .req r0 aabb .req r0 // arg
x .req r1 x .req r1 // arg
y .req r2 y .req r2 // arg
z .req r3 z .req r3 // arg
minX .req r4 // FIQ regs
maxX .req r5 minX .req r8
minY .req r6 maxX .req r9
maxY .req r7 minY .req r10
maxY .req r11
minZ .req r12 minZ .req r12
maxZ .req lr maxZ .req r13
.global boxTranslate_asm .global boxTranslate_asm
boxTranslate_asm: boxTranslate_asm:
stmfd sp!, {r4-r7, lr} fiq_on
ldmia aabb, {minX, maxX, minY, maxY, minZ, maxZ} ldmia aabb, {minX, maxX, minY, maxY, minZ, maxZ}
add minX, minX, x add minX, minX, x
@@ -24,5 +25,5 @@ boxTranslate_asm:
add maxZ, maxZ, z add maxZ, maxZ, z
stmia aabb, {minX, maxX, minY, maxY, minZ, maxZ} stmia aabb, {minX, maxX, minY, maxY, minZ, maxZ}
ldmfd sp!, {r4-r7, lr} fiq_off
bx lr bx lr

View File

@@ -120,15 +120,17 @@
ble \skip ble \skip
.endm .endm
.macro scaleUV uv, u, v, f .macro scaleUV uv, tmp, f
asr \u, \uv, #16 asr \tmp, \uv, #16
mul \u, \f // u = f * int16(uv >> 16) mul \tmp, \f // u = f * int16(uv >> 16)
lsl \v, \uv, #16
asr \v, #16 lsl \uv, \uv, #16
mul \v, \f // v = f * int16(uv) asr \uv, #16
lsr \u, #16 mul \uv, \f // v = f * int16(uv)
lsl \u, #16
orr \uv, \u, \v, lsr #16 // uv = (u & 0xFFFF0000) | (v >> 16) lsr \tmp, #16
lsl \tmp, #16
orr \uv, \tmp, \uv, lsr #16 // uv = (u & 0xFFFF0000) | (v >> 16)
.endm .endm
.macro tex index, uv .macro tex index, uv
@@ -140,3 +142,19 @@
.macro lit index .macro lit index
ldrb \index, [LMAP, \index] ldrb \index, [LMAP, \index]
.endm .endm
// fiq_on: write CPSR with mode bits 0x11 (FIQ mode) so that r8-r14 refer to
// the FIQ-banked copies and can be used as extra scratch registers.
// The written value has the interrupt-disable bits clear, so IRQs stay enabled.
// While active, r13/r14 are NOT the normal sp/lr: stack accesses and
// `bx lr` must wait until the mode is switched back.
// NOTE(review): the bare `cpsr` operand carries no field mask; if the
// assembler expands it to cpsr_fc this also overwrites the NZCV flags -
// confirm, `cpsr_c` may be the intended form.
.macro fiq_on
msr cpsr, #0x11 // switch r8-r14 to FIQ (IRQ enabled)
.endm
// fiq_off: write CPSR with mode bits 0x1F (System mode), switching r8-r14
// back to the non-FIQ register bank. Routines in this code base leave values
// in the FIQ bank and read them again after a later switch to FIQ mode.
// NOTE(review): the mode is set unconditionally instead of restoring a saved
// CPSR - this assumes every caller was running in System mode; confirm.
// NOTE(review): the bare `cpsr` operand carries no field mask, so the
// condition flags may be overwritten as well - confirm before relying on
// flags across this macro.
.macro fiq_off
msr cpsr, #0x1F // restore r8-r14
.endm
// fiq_on_ne: conditional form of the FIQ bank switch. The MSR executes only
// when the NE condition holds (Z flag clear); otherwise the CPU mode and
// flags are left untouched.
// NOTE(review): when it does execute, the unmasked `cpsr` write may also
// replace the condition flags - confirm against the assembler's default mask.
.macro fiq_on_ne
msrne cpsr, #0x11 // switch r8-r14 to FIQ (IRQ enabled)
.endm
// fiq_off_ne: conditional return to the normal register bank. The MSR
// executes only when the NE condition holds (Z flag clear). The rasterizers
// place it between `subs h, #1` and `bne .scanline_start`, so the bank is
// switched back only when another scanline follows; on the last scanline
// the code stays in FIQ mode and falls through to the edge setup.
// NOTE(review): the following `bne` is still taken only if Z remains clear
// after the unmasked `cpsr` write - confirm the flags written by the
// immediate keep NE true.
.macro fiq_off_ne
msrne cpsr, #0x1F // restore r8-r14
.endm

View File

@@ -1,19 +1,21 @@
#include "common_asm.inc" #include "common_asm.inc"
polys .req r0 polys .req r0 // arg
count .req r1 count .req r1 // arg
vp .req r2 vp .req r2
vg0 .req r3 vg0 .req r3
vg1 .req r4 vg1 .req r4
vg2 .req r5 vg2 .req r5
vg3 .req r6 vg3 .req r6
flags .req r7 flags .req r7
// FIQ regs
vp0 .req r8 vp0 .req r8
vp1 .req r9 vp1 .req r9
vp2 .req r10 vp2 .req r10
vp3 .req r11 vp3 .req r11
ot .req r12 ot .req r12
face .req lr face .req r13
vertices .req r14
vx0 .req vg0 vx0 .req vg0
vy0 .req vg1 vy0 .req vg1
@@ -29,21 +31,18 @@ vz3 .req vg3
depth .req vg0 depth .req vg0
tmp .req flags tmp .req flags
vertices .req vg2
next .req vp0 next .req vp0
SP_SIZE = 4
.global faceAddMeshQuads_asm .global faceAddMeshQuads_asm
faceAddMeshQuads_asm: faceAddMeshQuads_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r7}
fiq_on
ldr vp, =gVerticesBase ldr vp, =gVerticesBase
ldr vp, [vp] ldr vp, [vp]
ldr vertices, =gVertices ldr vertices, =gVertices
lsr vertices, #3 lsr vertices, #3
stmfd sp!, {vertices}
ldr face, =gFacesBase ldr face, =gFacesBase
ldr face, [face] ldr face, [face]
@@ -97,7 +96,6 @@ faceAddMeshQuads_asm:
lsr depth, #(2 + OT_SHIFT) lsr depth, #(2 + OT_SHIFT)
// faceAdd // faceAdd
ldr vertices, [sp]
rsb vp0, vertices, vp0, lsr #3 rsb vp0, vertices, vp0, lsr #3
rsb vp1, vertices, vp1, lsr #3 rsb vp1, vertices, vp1, lsr #3
rsb vp2, vertices, vp2, lsr #3 rsb vp2, vertices, vp2, lsr #3
@@ -116,5 +114,6 @@ faceAddMeshQuads_asm:
ldr tmp, =gFacesBase ldr tmp, =gFacesBase
str face, [tmp] str face, [tmp]
add sp, #SP_SIZE fiq_off
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r7}
bx lr

View File

@@ -1,19 +1,20 @@
#include "common_asm.inc" #include "common_asm.inc"
polys .req r0 polys .req r0 // arg
count .req r1 count .req r1 // arg
vp .req r2 vp .req r2
vg0 .req r3 vg0 .req r3
vg1 .req r4 vg1 .req r4
vg2 .req r5 vg2 .req r5
vg3 .req r6 vg3 .req r6
flags .req r7 // FIQ regs
vp0 .req r8 flags .req r8
vp1 .req r9 vp0 .req r9
vp2 .req r10 vp1 .req r10
vertices .req r11 vp2 .req r11
ot .req r12 vertices .req r12
face .req lr ot .req r13
face .req r14
vx0 .req vg0 vx0 .req vg0
vy0 .req vg1 vy0 .req vg1
@@ -32,7 +33,8 @@ next .req vp0
.global faceAddMeshTriangles_asm .global faceAddMeshTriangles_asm
faceAddMeshTriangles_asm: faceAddMeshTriangles_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r6}
fiq_on
ldr vp, =gVerticesBase ldr vp, =gVerticesBase
ldr vp, [vp] ldr vp, [vp]
@@ -102,4 +104,6 @@ faceAddMeshTriangles_asm:
ldr tmp, =gFacesBase ldr tmp, =gFacesBase
str face, [tmp] str face, [tmp]
ldmfd sp!, {r4-r11, pc} fiq_off
ldmfd sp!, {r4-r6}
bx lr

View File

@@ -1,19 +1,21 @@
#include "common_asm.inc" #include "common_asm.inc"
polys .req r0 polys .req r0 // arg
count .req r1 count .req r1 // arg
vp .req r2 vp .req r2
vg0 .req r3 vg0 .req r3
vg1 .req r4 vg1 .req r4
vg2 .req r5 vg2 .req r5
vg3 .req r6 vg3 .req r6
flags .req r7 flags .req r7
// FIQ regs
vp0 .req r8 vp0 .req r8
vp1 .req r9 vp1 .req r9
vp2 .req r10 vp2 .req r10
vp3 .req r11 vp3 .req r11
ot .req r12 ot .req r12
face .req lr face .req r13
vertices .req r14
vx0 .req vg0 vx0 .req vg0
vy0 .req vg1 vy0 .req vg1
@@ -29,21 +31,20 @@ vz3 .req vg3
depth .req vg0 depth .req vg0
tmp .req flags tmp .req flags
vertices .req vg2
next .req vp0 next .req vp0
SP_SIZE = 4 SP_SIZE = 4
.global faceAddRoomQuads_asm .global faceAddRoomQuads_asm
faceAddRoomQuads_asm: faceAddRoomQuads_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r7}
fiq_on
ldr vp, =gVerticesBase ldr vp, =gVerticesBase
ldr vp, [vp] ldr vp, [vp]
ldr vertices, =gVertices ldr vertices, =gVertices
lsr vertices, #3 lsr vertices, #3
stmfd sp!, {vertices}
ldr face, =gFacesBase ldr face, =gFacesBase
ldr face, [face] ldr face, [face]
@@ -107,7 +108,6 @@ faceAddRoomQuads_asm:
mov depth, vz0, lsr #OT_SHIFT mov depth, vz0, lsr #OT_SHIFT
// faceAdd // faceAdd
ldr vertices, [sp]
rsb vp0, vertices, vp0, lsr #3 rsb vp0, vertices, vp0, lsr #3
rsb vp1, vertices, vp1, lsr #3 rsb vp1, vertices, vp1, lsr #3
rsb vp2, vertices, vp2, lsr #3 rsb vp2, vertices, vp2, lsr #3
@@ -126,5 +126,6 @@ faceAddRoomQuads_asm:
ldr tmp, =gFacesBase ldr tmp, =gFacesBase
str face, [tmp] str face, [tmp]
add sp, #SP_SIZE fiq_off
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r7}
bx lr

View File

@@ -1,19 +1,20 @@
#include "common_asm.inc" #include "common_asm.inc"
polys .req r0 polys .req r0 // arg
count .req r1 count .req r1 // arg
vp .req r2 vp .req r2
vg0 .req r3 vg0 .req r3
vg1 .req r4 vg1 .req r4
vg2 .req r5 vg2 .req r5
vg3 .req r6 vg3 .req r6
flags .req r7 // FIQ regs
vp0 .req r8 flags .req r8
vp1 .req r9 vp0 .req r9
vp2 .req r10 vp1 .req r10
vertices .req r11 vp2 .req r11
ot .req r12 vertices .req r12
face .req lr ot .req r13
face .req r14
vx0 .req vg0 vx0 .req vg0
vy0 .req vg1 vy0 .req vg1
@@ -32,7 +33,8 @@ next .req vp0
.global faceAddRoomTriangles_asm .global faceAddRoomTriangles_asm
faceAddRoomTriangles_asm: faceAddRoomTriangles_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r6}
fiq_on
ldr vp, =gVerticesBase ldr vp, =gVerticesBase
ldr vp, [vp] ldr vp, [vp]
@@ -110,4 +112,6 @@ faceAddRoomTriangles_asm:
ldr tmp, =gFacesBase ldr tmp, =gFacesBase
str face, [tmp] str face, [tmp]
ldmfd sp!, {r4-r11, pc} fiq_off
ldmfd sp!, {r4-r6}
bx lr

View File

@@ -1,8 +1,8 @@
#include "common_asm.inc" #include "common_asm.inc"
this .req r0 this .req r0 // arg
x .req r1 x .req r1 // arg
z .req r2 z .req r2 // arg
info .req r3 info .req r3
roomX .req r12 roomX .req r12
roomZ .req roomX roomZ .req roomX

View File

@@ -1,15 +1,16 @@
#include "common_asm.inc" #include "common_asm.inc"
n .req r0 n .req r0 // arg
pmul .req r1 pmul .req r1 // arg
pdiv .req r2 pdiv .req r2 // arg
m0 .req r3 // FIQ regs
m1 .req r4 m0 .req r8
m2 .req r5 m1 .req r9
n0 .req r6 m2 .req r10
n1 .req r7 n0 .req r11
n2 .req r12 n1 .req r12
m .req lr n2 .req r13
m .req r14
tmp .req m0 tmp .req m0
.macro load .macro load
@@ -83,7 +84,7 @@ tmp .req m0
.global matrixLerp_asm .global matrixLerp_asm
matrixLerp_asm: matrixLerp_asm:
stmfd sp!, {r4-r7, lr} fiq_on
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr m, [m] ldr m, [m]
.check_2: .check_2:
@@ -111,5 +112,5 @@ matrixLerp_asm:
mov pmul, tmp, asr #8 mov pmul, tmp, asr #8
lerp _X_Y lerp _X_Y
.done: .done:
ldmfd sp!, {r4-r7, lr} fiq_off
bx lr bx lr

View File

@@ -5,25 +5,26 @@ e1 .req r1
e2 .req r2 e2 .req r2
e3 .req r3 e3 .req r3
m .req e0 m .req e0
src .req r12 // FIQ regs
dst .req lr src .req r8
dst .req r9
e4 .req r10
e5 .req r11
.global matrixPush_asm .global matrixPush_asm
matrixPush_asm: matrixPush_asm:
stmfd sp!, {lr} fiq_on
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr src, [m] ldr src, [m]
add dst, src, #(12 * 4) add dst, src, #(12 * 4)
str dst, [m] str dst, [m]
ldmia src!, {e0, e1, e2, e3} ldmia src!, {e0, e1, e2, e3, e4, e5}
stmia dst!, {e0, e1, e2, e3} stmia dst!, {e0, e1, e2, e3, e4, e5}
ldmia src!, {e0, e1, e2, e3} ldmia src!, {e0, e1, e2, e3, e4, e5}
stmia dst!, {e0, e1, e2, e3} stmia dst!, {e0, e1, e2, e3, e4, e5}
ldmia src!, {e0, e1, e2, e3} fiq_off
stmia dst!, {e0, e1, e2, e3}
ldmfd sp!, {lr}
bx lr bx lr

View File

@@ -18,17 +18,18 @@
mov \x, \x, asr #FIXED_SHIFT mov \x, \x, asr #FIXED_SHIFT
.endm .endm
angle .req r0 angle .req r0 // arg
e0 .req r1 s .req r1
e1 .req r2 c .req r2
s .req r3 v .req r3
c .req r12 // FIQ regs
v .req lr e0 .req r8
e1 .req r9
m .req angle m .req angle
.global matrixRotateX_asm .global matrixRotateX_asm
matrixRotateX_asm: matrixRotateX_asm:
stmfd sp!, {lr} fiq_on
mov angle, angle, lsl #16 mov angle, angle, lsl #16
mov angle, angle, lsr #20 mov angle, angle, lsr #20
@@ -53,12 +54,12 @@ matrixRotateX_asm:
rotxy e1, e0, s, c, v rotxy e1, e0, s, c, v
stmia m, {e0, e1} stmia m, {e0, e1}
ldmfd sp!, {lr} fiq_off
bx lr bx lr
.global matrixRotateY_asm .global matrixRotateY_asm
matrixRotateY_asm: matrixRotateY_asm:
stmfd sp!, {lr} fiq_on
mov angle, angle, lsl #16 mov angle, angle, lsl #16
mov angle, angle, lsr #20 mov angle, angle, lsr #20
@@ -86,12 +87,12 @@ matrixRotateY_asm:
str e0, [m], #8 str e0, [m], #8
str e1, [m], #8 str e1, [m], #8
ldmfd sp!, {lr} fiq_off
bx lr bx lr
.global matrixRotateZ_asm .global matrixRotateZ_asm
matrixRotateZ_asm: matrixRotateZ_asm:
stmfd sp!, {lr} fiq_on
mov angle, angle, lsl #16 mov angle, angle, lsl #16
mov angle, angle, lsr #20 mov angle, angle, lsr #20
@@ -115,23 +116,24 @@ matrixRotateZ_asm:
rotxy e1, e0, s, c, v rotxy e1, e0, s, c, v
stmia m, {e0, e1} stmia m, {e0, e1}
ldmfd sp!, {lr} fiq_off
bx lr bx lr
angleX .req r0 angleX .req r0 // arg
angleY .req r1 angleY .req r1 // arg
angleZ .req r2 angleZ .req r2 // arg
e00 .req r3 e00 .req r3
e01 .req r4 e01 .req r4
e02 .req r5 e02 .req r5
e10 .req r6 e10 .req r6
e11 .req r7 // FIQ regs
e12 .req r8 e11 .req r8
e20 .req r9 e12 .req r9
e21 .req r10 e20 .req r10
e22 .req r11 e21 .req r11
tmp .req r12 tmp .req r12
sinX .req lr e22 .req r13
sinX .req r14
sinY .req sinX sinY .req sinX
sinZ .req sinX sinZ .req sinX
cosX .req angleX cosX .req angleX
@@ -153,7 +155,8 @@ matrixRotateYXZ_asm:
orrs mask, mask, angleZ orrs mask, mask, angleZ
bxeq lr bxeq lr
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r6}
fiq_on
ldr mm, =gMatrixPtr ldr mm, =gMatrixPtr
ldr mm, [mm] ldr mm, [mm]
@@ -203,10 +206,11 @@ matrixRotateYXZ_asm:
add mm, #(4 * 4) add mm, #(4 * 4)
stmia mm, {e20, e21, e22} stmia mm, {e20, e21, e22}
ldmfd sp!, {r4-r11, lr} fiq_off
ldmfd sp!, {r4-r6}
bx lr bx lr
q .req r0 q .req r0 // arg
n .req r1 n .req r1
mx .req r3 mx .req r3
my .req q my .req q

View File

@@ -1,7 +1,7 @@
#include "common_asm.inc" #include "common_asm.inc"
dst .req r0 dst .req r0 // arg
src .req r1 src .req r1 // arg
e0 .req r2 e0 .req r2
e1 .req r3 e1 .req r3

View File

@@ -1,17 +1,18 @@
#include "common_asm.inc" #include "common_asm.inc"
x .req r0 x .req r0 // arg
y .req r1 y .req r1 // arg
z .req r2 z .req r2 // arg
e0 .req r3 m .req r3
e1 .req r4 // FIQ regs
e2 .req r5 e0 .req r8
v .req r12 e1 .req r9
m .req lr e2 .req r10
v .req r11
.global matrixTranslateRel_asm .global matrixTranslateRel_asm
matrixTranslateRel_asm: matrixTranslateRel_asm:
stmfd sp!, {r4-r5, lr} fiq_on
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr m, [m] ldr m, [m]
@@ -37,12 +38,12 @@ matrixTranslateRel_asm:
mla v, e2, z, v mla v, e2, z, v
stmdb m, {v} stmdb m, {v}
ldmfd sp!, {r4-r5, lr} fiq_off
bx lr bx lr
.global matrixTranslateAbs_asm .global matrixTranslateAbs_asm
matrixTranslateAbs_asm: matrixTranslateAbs_asm:
stmfd sp!, {r4-r5, lr} fiq_on
ldr v, =gCameraViewPos ldr v, =gCameraViewPos
ldmia v, {e0, e1, e2} ldmia v, {e0, e1, e2}
@@ -74,12 +75,12 @@ matrixTranslateAbs_asm:
mla v, e2, z, v mla v, e2, z, v
stmia m!, {v} stmia m!, {v}
ldmfd sp!, {r4-r5, lr} fiq_off
bx lr bx lr
.global matrixTranslateSet_asm .global matrixTranslateSet_asm
matrixTranslateSet_asm: matrixTranslateSet_asm:
stmfd sp!, {r4-r5, lr} fiq_on
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr m, [m] ldr m, [m]
@@ -105,5 +106,5 @@ matrixTranslateSet_asm:
mla v, e2, z, v mla v, e2, z, v
stmia m!, {v} stmia m!, {v}
ldmfd sp!, {r4-r5, lr} fiq_off
bx lr bx lr

View File

@@ -1,7 +1,7 @@
#include "common_asm.inc" #include "common_asm.inc"
flags .req r0 flags .req r0 // arg
L .req r1 L .req r1 // arg
R .req r2 R .req r2
y .req r3 y .req r3
type .req r12 type .req r12

View File

@@ -1,19 +1,20 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 pixel .req r0 // arg
L .req r1 L .req r1 // arg
color .req r2 color .req r2 // arg
index .req r3 index .req r3
Lh .req r4 Lh .req r4
Rh .req r5 Rh .req r5
Lx .req r6 Lx .req r6
Rx .req r7 // FIQ regs
Ldx .req r8 Rx .req r8
Rdx .req r9 Ldx .req r9
N .req r10 Rdx .req r10
tmp .req r11 N .req r11
pair .req r12 tmp .req r12
width .req lr pair .req r13
width .req r14
R .req color R .req color
h .req N h .req N
@@ -26,7 +27,8 @@ ptr .req tmp
.global rasterizeF_asm .global rasterizeF_asm
rasterizeF_asm: rasterizeF_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r6}
fiq_on
add LMAP, color, #LMAP_ADDR add LMAP, color, #LMAP_ADDR
ldrb tmp, [L, #VERTEX_G] ldrb tmp, [L, #VERTEX_G]
@@ -133,4 +135,6 @@ rasterizeF_asm:
b .loop b .loop
.exit: .exit:
ldmfd sp!, {r4-r11, pc} fiq_off
ldmfd sp!, {r4-r6}
bx lr

View File

@@ -1,94 +1,85 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 arg_pixel .req r0 // arg
L .req r1 arg_L .req r1 // arg
R .req r2 arg_R .req r2 // arg
LMAP .req r3
TILE .req r4 N .req r0
tmp .req r5 tmp .req r1
N .req r6 Lx .req r2
Lh .req r7 Rx .req r3
Rh .req r8 Lt .req r4
Rt .req r5
t .req r6
dtdx .req r7
Lx .req r9 indexA .req r8
Rx .req r10 indexB .req r9
Lt .req r11 LMAP .req r10
Rt .req r12 TILE .req r11
h .req lr pixel .req r12
width .req lr
// FIQ regs
Ldx .req r8
Rdx .req r9
Ldt .req r10
Rdt .req r11
LRh .req r12
L .req r13
R .req r14
Rh .req LRh
Lh .req t
h .req N
ptr .req tmp ptr .req tmp
Ldx .req h
Rdx .req h
Ldt .req h
Rdt .req h
indexA .req Lh
indexB .req Rh
Rxy .req tmp Rxy .req tmp
Ry2 .req Rh Ry2 .req Rh
Lxy .req tmp Lxy .req tmp
Ly2 .req Lh Ly2 .req Lh
inv .req Lh inv .req indexA
width .req N duv .req indexB
t .req L dtmp .req t
dtdx .req R
duv .req R Ltmp .req N
du .req L Rtmp .req N
dv .req R
Ldu .req N
Ldv .req h
Rdu .req N
Rdv .req h
Rti .req indexB Rti .req indexB
sLdx .req tmp
sLdt .req N
sRdx .req Lh
sRdt .req Rh
SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12
SP_L = 16
SP_R = 20
SP_LH = 24
SP_RH = 28
SP_SIZE = 32
.macro PUT_PIXELS .macro PUT_PIXELS
tex indexA, t tex indexA, t
lit indexA lit indexA
add t, dtdx, lsl #1 add t, dtdx, lsl #1
//orr indexA, indexA, lsl #8
strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
.endm .endm
.global rasterizeFT_asm .global rasterizeFT_asm
rasterizeFT_asm: rasterizeFT_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r11, lr}
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
mov pixel, arg_pixel
mov LMAP, #LMAP_ADDR mov LMAP, #LMAP_ADDR
ldrb tmp, [L, #VERTEX_G] ldrb t, [arg_L, #VERTEX_G]
add LMAP, tmp, lsl #8 // tmp = (L->v.g << 8) add LMAP, t, lsl #8 // LMAP = (L->v.g << 8)
ldr TILE, =gTile ldr TILE, =gTile
ldr TILE, [TILE] ldr TILE, [TILE]
mov Lh, #0 // Lh = 0 fiq_on
mov Rh, #0 // Rh = 0 mov L, arg_L
mov R, arg_R
mov LRh, #0 // Lh = 0
.loop: .loop:
lsr Lh, LRh, #16
lsl Rh, LRh, #16
lsr Rh, Rh, #16
cmp Lh, #0 cmp Lh, #0
bgt .calc_left_end // if (Lh != 0) end with left bgt .calc_left_end // if (Lh != 0) end with left
@@ -114,12 +105,10 @@ rasterizeFT_asm:
ldrsh Ldx, [L, #VERTEX_X] ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16 sub Ldx, Lx, asr #16
mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
str Ldx, [sp, #SP_LDX] // store Ldx to stack
ldr Ldt, [L, #VERTEX_T] ldr Ldt, [L, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp scaleUV Ldt, Ltmp, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack
.calc_left_end: .calc_left_end:
cmp Rh, #0 cmp Rh, #0
@@ -146,12 +135,10 @@ rasterizeFT_asm:
ldrsh Rdx, [R, #VERTEX_X] ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16 sub Rdx, Rx, asr #16
mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
str Rdx, [sp, #SP_RDX] // store Rdx to stack
ldr Rdt, [R, #VERTEX_T] ldr Rdt, [R, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp scaleUV Rdt, Rtmp, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack
.calc_right_end: .calc_right_end:
cmp Rh, Lh // if (Rh < Lh) cmp Rh, Lh // if (Rh < Lh)
@@ -160,8 +147,9 @@ rasterizeFT_asm:
sub Lh, h // Lh -= h sub Lh, h // Lh -= h
sub Rh, h // Rh -= h sub Rh, h // Rh -= h
add tmp, sp, #SP_L orr LRh, Rh, Lh, lsl #16
stmia tmp, {L, R, Lh, Rh}
fiq_off
.scanline_start: .scanline_start:
asr tmp, Lx, #16 // x1 = (Lx >> 16) asr tmp, Lx, #16 // x1 = (Lx >> 16)
@@ -173,7 +161,7 @@ rasterizeFT_asm:
divLUT inv, width // inv = FixedInvU(width) divLUT inv, width // inv = FixedInvU(width)
sub dtdx, Rt, Lt // duv = Rt - Lt sub dtdx, Rt, Lt // duv = Rt - Lt
scaleUV dtdx, du, dv, inv scaleUV dtdx, dtmp, inv
mov t, Lt // t = Lt mov t, Lt // t = Lt
@@ -237,21 +225,20 @@ rasterizeFT_asm:
bne .scanline_block_8px bne .scanline_block_8px
.scanline_end: .scanline_end:
ldmia sp, {sLdx, sLdt, sRdx, sRdt} add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
add Rt, sRdt
add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) fiq_on
add Lx, Ldx
add Rx, Rdx
add Lt, Ldt
add Rt, Rdt
subs h, #1 subs h, #1
fiq_off_ne
bne .scanline_start bne .scanline_start
add tmp, sp, #SP_L
ldmia tmp, {L, R, Lh, Rh}
b .loop b .loop
.exit: .exit:
add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt] fiq_off
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r11, pc}

View File

@@ -1,69 +1,55 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 arg_pixel .req r0 // arg
L .req r1 arg_L .req r1 // arg
R .req r2 arg_R .req r2 // arg
LMAP .req r3
TILE .req r4 N .req r0
tmp .req r5 tmp .req r1
N .req r6 Lx .req r2
Lh .req r7 Rx .req r3
Rh .req r8 Lt .req r4
Rt .req r5
t .req r6
dtdx .req r7
Lx .req r9 indexA .req r8
Rx .req r10 indexB .req r9
Lt .req r11 LMAP .req r10
Rt .req r12 TILE .req r11
h .req lr pixel .req r12
width .req lr
// FIQ regs
Ldx .req r8
Rdx .req r9
Ldt .req r10
Rdt .req r11
LRh .req r12
L .req r13
R .req r14
Rh .req LRh
Lh .req t
h .req N
ptr .req tmp ptr .req tmp
Ldx .req h
Rdx .req h
Ldt .req h
Rdt .req h
indexA .req Lh
indexB .req Rh
Rxy .req tmp Rxy .req tmp
Ry2 .req Rh Ry2 .req Rh
Lxy .req tmp Lxy .req tmp
Ly2 .req Lh Ly2 .req Lh
inv .req Lh inv .req indexA
width .req N duv .req indexB
t .req L dtmp .req t
dtdx .req R
duv .req R Ltmp .req N
du .req L Rtmp .req N
dv .req R
Ldu .req N
Ldv .req h
Rdu .req N
Rdv .req h
Rti .req indexB Rti .req indexB
sLdx .req tmp
sLdt .req N
sRdx .req Lh
sRdt .req Rh
SP_LDX = 0
SP_LDT = 4
SP_RDX = 8
SP_RDT = 12
SP_L = 16
SP_R = 20
SP_LH = 24
SP_RH = 28
SP_SIZE = 32
.macro PUT_PIXELS .macro PUT_PIXELS
tex indexA, t tex indexA, t
add t, dtdx, lsl #1 add t, dtdx, lsl #1
@@ -76,22 +62,28 @@ SP_SIZE = 32
.global rasterizeFTA_asm .global rasterizeFTA_asm
rasterizeFTA_asm: rasterizeFTA_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r11, lr}
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
mov pixel, arg_pixel
mov LMAP, #LMAP_ADDR mov LMAP, #LMAP_ADDR
ldrb tmp, [L, #VERTEX_G] ldrb t, [arg_L, #VERTEX_G]
add LMAP, tmp, lsl #8 // tmp = (L->v.g << 8) add LMAP, t, lsl #8 // LMAP = (L->v.g << 8)
ldr TILE, =gTile ldr TILE, =gTile
ldr TILE, [TILE] ldr TILE, [TILE]
mov Lh, #0 // Lh = 0 fiq_on
mov Rh, #0 // Rh = 0 mov L, arg_L
mov R, arg_R
mov LRh, #0 // Lh = 0
.loop: .loop:
lsr Lh, LRh, #16
lsl Rh, LRh, #16
lsr Rh, Rh, #16
cmp Lh, #0 cmp Lh, #0
bne .calc_left_end // if (Lh != 0) end with left bgt .calc_left_end // if (Lh != 0) end with left
.calc_left_start: .calc_left_start:
ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev
@@ -114,16 +106,14 @@ rasterizeFTA_asm:
ldrsh Ldx, [L, #VERTEX_X] ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16 sub Ldx, Lx, asr #16
mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
str Ldx, [sp, #SP_LDX] // store Ldx to stack
ldr Ldt, [L, #VERTEX_T] ldr Ldt, [L, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp scaleUV Ldt, Ltmp, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack
.calc_left_end: .calc_left_end:
cmp Rh, #0 cmp Rh, #0
bne .calc_right_end // if (Rh != 0) end with right bgt .calc_right_end // if (Rh != 0) end with right
.calc_right_start: .calc_right_start:
ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next
@@ -131,7 +121,7 @@ rasterizeFTA_asm:
ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x)
ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y
subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y subs Rh, Ry2, Rxy, asr #16 // Rh = Ry2 - Rxy
blt .exit // if (Rh < 0) return blt .exit // if (Rh < 0) return
ldrne Rt, [R, #VERTEX_T] // Rt = R->t ldrne Rt, [R, #VERTEX_T] // Rt = R->t
mov R, N // R = N mov R, N // R = N
@@ -146,12 +136,10 @@ rasterizeFTA_asm:
ldrsh Rdx, [R, #VERTEX_X] ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16 sub Rdx, Rx, asr #16
mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
str Rdx, [sp, #SP_RDX] // store Rdx to stack
ldr Rdt, [R, #VERTEX_T] ldr Rdt, [R, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp scaleUV Rdt, Rtmp, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack
.calc_right_end: .calc_right_end:
cmp Rh, Lh // if (Rh < Lh) cmp Rh, Lh // if (Rh < Lh)
@@ -160,8 +148,9 @@ rasterizeFTA_asm:
sub Lh, h // Lh -= h sub Lh, h // Lh -= h
sub Rh, h // Rh -= h sub Rh, h // Rh -= h
add tmp, sp, #SP_L orr LRh, Rh, Lh, lsl #16
stmia tmp, {L, R, Lh, Rh}
fiq_off
.scanline_start: .scanline_start:
asr tmp, Lx, #16 // x1 = (Lx >> 16) asr tmp, Lx, #16 // x1 = (Lx >> 16)
@@ -173,7 +162,7 @@ rasterizeFTA_asm:
divLUT inv, width // inv = FixedInvU(width) divLUT inv, width // inv = FixedInvU(width)
sub dtdx, Rt, Lt // duv = Rt - Lt sub dtdx, Rt, Lt // duv = Rt - Lt
scaleUV dtdx, du, dv, inv scaleUV dtdx, dtmp, inv
mov t, Lt // t = Lt mov t, Lt // t = Lt
@@ -241,21 +230,20 @@ rasterizeFTA_asm:
bne .scanline_block_8px bne .scanline_block_8px
.scanline_end: .scanline_end:
ldmia sp, {sLdx, sLdt, sRdx, sRdt} add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)
add Lx, sLdx
add Lt, sLdt
add Rx, sRdx
add Rt, sRdt
add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) fiq_on
add Lx, Ldx
add Rx, Rdx
add Lt, Ldt
add Rt, Rdt
subs h, #1 subs h, #1
fiq_off_ne
bne .scanline_start bne .scanline_start
add tmp, sp, #SP_L
ldmia tmp, {L, R, Lh, Rh}
b .loop b .loop
.exit: .exit:
add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt] fiq_off
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r11, pc}

View File

@@ -1,20 +1,22 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 pixel .req r0 // arg
L .req r1 L .req r1 // arg
R .req r2 R .req r2 // arg
p .req r4 p .req r3
w .req r5 // FIQ regs
indexA .req r6 w .req r8
indexB .req r12 indexA .req r9
shade .req lr indexB .req r10
shade .req r11
width .req L width .req L
height .req R height .req R
LMAP .req shade LMAP .req shade
.global rasterizeFillS_asm .global rasterizeFillS_asm
rasterizeFillS_asm: rasterizeFillS_asm:
stmfd sp!, {r4-r6, lr} fiq_on
add R, #VERTEX_SIZEOF add R, #VERTEX_SIZEOF
ldrsh p, [L, #VERTEX_X] ldrsh p, [L, #VERTEX_X]
@@ -68,4 +70,5 @@ rasterizeFillS_asm:
subs height, #1 subs height, #1
bne .loop bne .loop
ldmfd sp!, {r4-r6, pc} fiq_off
bx lr

View File

@@ -1,39 +1,37 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 arg_pixel .req r0 // arg
L .req r1 arg_L .req r1 // arg
R .req r2 arg_R .req r2 // arg
Lh .req r3 N .req r0
Rh .req r4 tmp .req r1
Lx .req r2
Lx .req r5 Rx .req r3
Rx .req r6 Lg .req r4
Rg .req r5
Lg .req r7 Lt .req r6
Rg .req r8 Rt .req r7
Lt .req r9
Rt .req r10
tmp .req r11
N .req r12
L .req r8
R .req r9
Lh .req r10
Rh .req r11
pixel .req r12
TILE .req lr TILE .req lr
// FIQ regs
Ldx .req r8
Rdx .req r9
Ldg .req r10
Rdg .req r11
Ldt .req r12
Rdt .req r13
h .req N h .req N
LMAP .req tmp LMAP .req tmp
Ldx .req h
Rdx .req h
Ldg .req h
Rdg .req h
Ldt .req h
Rdt .req h
indexA .req Lh indexA .req Lh
indexB .req tmp indexB .req tmp
@@ -52,57 +50,37 @@ dgdx .req L
t .req Lt t .req Lt
dtdx .req R dtdx .req R
du .req L dtmp .req L
dv .req R
Ldu .req TILE Ltmp .req N
Ldv .req N Rtmp .req N
Rdu .req TILE
Rdv .req N
Rti .req tmp Rti .req tmp
Rgi .req tmp Rgi .req tmp
sLdx .req L SP_TILE = 0
sLdg .req R SP_SIZE = 4
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req tmp // not enough regs for one ldmia
SP_LDX = 0
SP_LDG = 4
SP_LDT = 8
SP_RDX = 12
SP_RDG = 16
SP_RDT = 20
SP_L = 24
SP_R = 28
SP_LH = 32
SP_RH = 36
SP_SIZE = 40
SP_TILE = SP_SIZE
.macro PUT_PIXELS .macro PUT_PIXELS
bic LMAP, g, #255 bic LMAP, g, #255
add g, dgdx
tex indexA, t tex indexA, t
lit indexA lit indexA
add t, dtdx, lsl #1
//orr indexA, indexA, lsl #8
strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
add g, dgdx, lsl #1
add t, dtdx, lsl #1
.endm .endm
.global rasterizeGT_asm .global rasterizeGT_asm
rasterizeGT_asm: rasterizeGT_asm:
ldr r3, =gTile ldr r3, =gTile
ldr r3, [r3] ldr r3, [r3]
stmfd sp!, {r3-r11, lr} stmfd sp!, {r3-r11, lr}
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
mov pixel, arg_pixel
mov L, arg_L
mov R, arg_R
mov Lh, #0 // Lh = 0 mov Lh, #0 // Lh = 0
mov Rh, #0 // Rh = 0 mov Rh, #0 // Rh = 0
@@ -132,21 +110,20 @@ rasterizeGT_asm:
divLUT tmp, Lh // tmp = FixedInvU(Lh) divLUT tmp, Lh // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X] fiq_on
ldrsh Ldx, [N, #VERTEX_X]
sub Ldx, Lx, asr #16 sub Ldx, Lx, asr #16
mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
str Ldx, [sp, #SP_LDX] // store Ldx to stack
ldrb Ldg, [L, #VERTEX_G] ldrb Ldg, [N, #VERTEX_G]
sub Ldg, Lg, lsr #8 sub Ldg, Lg, lsr #8
mul Ldg, tmp // Ldg = tmp * (N->v.g - Lg) mul Ldg, tmp // Ldg = tmp * (N->v.g - Lg)
asr Ldg, #8 // 8-bit for fractional part asr Ldg, #8 // 8-bit for fractional part
str Ldg, [sp, #SP_LDG] // store Ldg to stack
ldr Ldt, [L, #VERTEX_T] ldr Ldt, [N, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp scaleUV Ldt, Ltmp, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack fiq_off
.calc_left_end: .calc_left_end:
cmp Rh, #0 cmp Rh, #0
@@ -172,21 +149,20 @@ rasterizeGT_asm:
divLUT tmp, Rh // tmp = FixedInvU(Rh) divLUT tmp, Rh // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X] fiq_on
ldrsh Rdx, [N, #VERTEX_X]
sub Rdx, Rx, asr #16 sub Rdx, Rx, asr #16
mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
str Rdx, [sp, #SP_RDX] // store Rdx to stack
ldrb Rdg, [R, #VERTEX_G] ldrb Rdg, [N, #VERTEX_G]
sub Rdg, Rg, lsr #8 sub Rdg, Rg, lsr #8
mul Rdg, tmp // Rdg = tmp * (N->v.g - Rg) mul Rdg, tmp // Rdg = tmp * (N->v.g - Rg)
asr Rdg, #8 // 8-bit for fractional part asr Rdg, #8 // 8-bit for fractional part
str Rdg, [sp, #SP_RDG] // store Ldg to stack
ldr Rdt, [R, #VERTEX_T] ldr Rdt, [N, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp scaleUV Rdt, Rtmp, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack fiq_off
.calc_right_end: .calc_right_end:
orr Lg, #LMAP_ADDR orr Lg, #LMAP_ADDR
@@ -200,27 +176,26 @@ rasterizeGT_asm:
ldr TILE, [sp, #SP_TILE] ldr TILE, [sp, #SP_TILE]
add tmp, sp, #SP_L stmfd sp!, {L, R, Lh, Rh}
stmia tmp, {L, R, Lh, Rh}
.scanline_start: .scanline_start:
asr Lh, Lx, #16 // x1 = (Lx >> 16)
rsbs width, Lh, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end_fast // if (width <= 0) go next scanline
stmfd sp!, {Lx, Lg, Lt} stmfd sp!, {Lx, Lg, Lt}
asr Lx, Lx, #16 // x1 = (Lx >> 16) add ptr, pixel, Lx, asr #16 // ptr = pixel + x1
rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add ptr, pixel, Lx // ptr = pixel + x1
divLUT inv, width // inv = FixedInvU(width) divLUT inv, width // inv = FixedInvU(width)
sub dtdx, Rt, Lt // dtdx = Rt - Lt sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv scaleUV dtdx, dtmp, inv
// t == Lt (alias) // t == Lt (alias)
sub dgdx, Rg, Lg // dgdx = Rg - Lg sub dgdx, Rg, Lg // dgdx = Rg - Lg
mul dgdx, inv // dgdx *= FixedInvU(width) mul dgdx, inv // dgdx *= FixedInvU(width)
asr dgdx, #15 // dgdx >>= 15 asr dgdx, #16 // dgdx >>= 16
// g == Lg (alias) // g == Lg (alias)
// 2 bytes alignment (VRAM write requirement) // 2 bytes alignment (VRAM write requirement)
@@ -229,18 +204,19 @@ rasterizeGT_asm:
beq .align_right beq .align_right
bic LMAP, g, #255 bic LMAP, g, #255
add g, dgdx, asr #1
tex indexA, t tex indexA, t
lit indexA lit indexA
ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte) ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
orr indexB, indexA, lsl #8 orr indexB, indexA, lsl #8
strh indexB, [ptr], #2 strh indexB, [ptr], #2
add t, dtdx
subs width, #1 // width-- subs width, #1 // width--
beq .scanline_end // if (width == 0) beq .scanline_end // if (width == 0)
add g, dgdx
add t, dtdx
.align_right: .align_right:
tst width, #1 tst width, #1
beq .align_block_4px beq .align_block_4px
@@ -248,7 +224,7 @@ rasterizeGT_asm:
sub Rti, Rt, dtdx sub Rti, Rt, dtdx
tex indexA, Rti tex indexA, Rti
sub Rgi, Rg, dgdx, asr #1 sub Rgi, Rg, dgdx
bic LMAP, Rgi, #255 bic LMAP, Rgi, #255
lit indexA lit indexA
@@ -289,34 +265,25 @@ rasterizeGT_asm:
.scanline_end: .scanline_end:
ldmfd sp!, {Lx, Lg, Lt} ldmfd sp!, {Lx, Lg, Lt}
/* TEST FIQ
mrs r1, cpsr // save current program status reg
msr cpsr, #0x11 // switch to FIQ mode with extra r8-r14 regs
mov r8, #0 // trash FIQ regs and
mov r10, #0 // it shouldn't affect normal mode regs
// mov r11, r11
msr cpsr, r1 // restore current program status reg
*/
ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg}
add Lx, sLdx .scanline_end_fast:
add Lg, sLdg fiq_on
add Lt, sLdt add Lx, Ldx
add Rx, sRdx add Rx, Rdx
add Rg, sRdg add Lg, Ldg
add Rg, Rdg
ldr sRdt, [sp, #SP_RDT] add Lt, Ldt
add Rt, sRdt add Rt, Rdt
fiq_off
add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)
subs h, #1 subs h, #1
bne .scanline_start bne .scanline_start
add tmp, sp, #SP_L ldmfd sp!, {L, R, Lh, Rh}
ldmia tmp, {L, R, Lh, Rh}
b .loop b .loop
.exit: .exit:
add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE] add sp, #SP_SIZE // revert reserved space for [TILE]
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r11, pc}

View File

@@ -1,39 +1,37 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 arg_pixel .req r0 // arg
L .req r1 arg_L .req r1 // arg
R .req r2 arg_R .req r2 // arg
Lh .req r3 N .req r0
Rh .req r4 tmp .req r1
Lx .req r2
Lx .req r5 Rx .req r3
Rx .req r6 Lg .req r4
Rg .req r5
Lg .req r7 Lt .req r6
Rg .req r8 Rt .req r7
Lt .req r9
Rt .req r10
tmp .req r11
N .req r12
L .req r8
R .req r9
Lh .req r10
Rh .req r11
pixel .req r12
TILE .req lr TILE .req lr
// FIQ regs
Ldx .req r8
Rdx .req r9
Ldg .req r10
Rdg .req r11
Ldt .req r12
Rdt .req r13
h .req N h .req N
LMAP .req tmp LMAP .req tmp
Ldx .req h
Rdx .req h
Ldg .req h
Rdg .req h
Ldt .req h
Rdt .req h
indexA .req Lh indexA .req Lh
indexB .req tmp indexB .req tmp
@@ -52,59 +50,39 @@ dgdx .req L
t .req Lt t .req Lt
dtdx .req R dtdx .req R
dtmp .req L
duv .req R Ltmp .req N
du .req L Rtmp .req N
dv .req R
Ldu .req TILE
Ldv .req N
Rdu .req TILE
Rdv .req N
Rti .req tmp Rti .req tmp
Rgi .req tmp Rgi .req tmp
sLdx .req L SP_TILE = 0
sLdg .req R SP_SIZE = 4
sLdt .req Lh
sRdx .req Rh
sRdg .req tmp
sRdt .req tmp // not enough regs for one ldmia
SP_LDX = 0
SP_LDG = 4
SP_LDT = 8
SP_RDX = 12
SP_RDG = 16
SP_RDT = 20
SP_L = 24
SP_R = 28
SP_LH = 32
SP_RH = 36
SP_SIZE = 40
SP_TILE = SP_SIZE
.macro PUT_PIXELS .macro PUT_PIXELS
bic LMAP, g, #255 bic LMAP, g, #255
add g, dgdx
tex indexA, t tex indexA, t
add t, dtdx, lsl #1
cmp indexA, #0 cmp indexA, #0
ldrneb indexA, [LMAP, indexA] ldrneb indexA, [LMAP, indexA]
strneb indexA, [ptr] strneb indexA, [ptr]
add ptr, #2 add ptr, #2
add g, dgdx, lsl #1
add t, dtdx, lsl #1
.endm .endm
.global rasterizeGTA_asm .global rasterizeGTA_asm
rasterizeGTA_asm: rasterizeGTA_asm:
ldr r3, =gTile ldr r3, =gTile
ldr r3, [r3] ldr r3, [r3]
stmfd sp!, {r3-r11, lr} stmfd sp!, {r3-r11, lr}
sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
mov pixel, arg_pixel
mov L, arg_L
mov R, arg_R
mov Lh, #0 // Lh = 0 mov Lh, #0 // Lh = 0
mov Rh, #0 // Rh = 0 mov Rh, #0 // Rh = 0
@@ -134,21 +112,20 @@ rasterizeGTA_asm:
divLUT tmp, Lh // tmp = FixedInvU(Lh) divLUT tmp, Lh // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X] fiq_on
ldrsh Ldx, [N, #VERTEX_X]
sub Ldx, Lx, asr #16 sub Ldx, Lx, asr #16
mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
str Ldx, [sp, #SP_LDX] // store Ldx to stack
ldrb Ldg, [L, #VERTEX_G] ldrb Ldg, [N, #VERTEX_G]
sub Ldg, Lg, lsr #8 sub Ldg, Lg, lsr #8
mul Ldg, tmp // Ldg = tmp * (N->v.g - Lg) mul Ldg, tmp // Ldg = tmp * (N->v.g - Lg)
asr Ldg, #8 // 8-bit for fractional part asr Ldg, #8 // 8-bit for fractional part
str Ldg, [sp, #SP_LDG] // store Ldg to stack
ldr Ldt, [L, #VERTEX_T] ldr Ldt, [N, #VERTEX_T]
sub Ldt, Lt // Ldt = N->v.t - Lt sub Ldt, Lt // Ldt = N->v.t - Lt
scaleUV Ldt, Ldu, Ldv, tmp scaleUV Ldt, Ltmp, tmp
str Ldt, [sp, #SP_LDT] // store Ldt to stack fiq_off
.calc_left_end: .calc_left_end:
cmp Rh, #0 cmp Rh, #0
@@ -174,21 +151,20 @@ rasterizeGTA_asm:
divLUT tmp, Rh // tmp = FixedInvU(Rh) divLUT tmp, Rh // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X] fiq_on
ldrsh Rdx, [N, #VERTEX_X]
sub Rdx, Rx, asr #16 sub Rdx, Rx, asr #16
mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
str Rdx, [sp, #SP_RDX] // store Rdx to stack
ldrb Rdg, [R, #VERTEX_G] ldrb Rdg, [N, #VERTEX_G]
sub Rdg, Rg, lsr #8 sub Rdg, Rg, lsr #8
mul Rdg, tmp // Rdg = tmp * (N->v.g - Rg) mul Rdg, tmp // Rdg = tmp * (N->v.g - Rg)
asr Rdg, #8 // 8-bit for fractional part asr Rdg, #8 // 8-bit for fractional part
str Rdg, [sp, #SP_RDG] // store Rdg to stack
ldr Rdt, [R, #VERTEX_T] ldr Rdt, [N, #VERTEX_T]
sub Rdt, Rt // Rdt = N->v.t - Rt sub Rdt, Rt // Rdt = N->v.t - Rt
scaleUV Rdt, Rdu, Rdv, tmp scaleUV Rdt, Rtmp, tmp
str Rdt, [sp, #SP_RDT] // store Rdt to stack fiq_off
.calc_right_end: .calc_right_end:
orr Lg, #LMAP_ADDR orr Lg, #LMAP_ADDR
@@ -202,27 +178,26 @@ rasterizeGTA_asm:
ldr TILE, [sp, #SP_TILE] ldr TILE, [sp, #SP_TILE]
add tmp, sp, #SP_L stmfd sp!, {L, R, Lh, Rh}
stmia tmp, {L, R, Lh, Rh}
.scanline_start: .scanline_start:
asr Lh, Lx, #16 // x1 = (Lx >> 16)
rsbs width, Lh, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end_fast // if (width <= 0) go next scanline
stmfd sp!, {Lx, Lg, Lt} stmfd sp!, {Lx, Lg, Lt}
asr Lx, Lx, #16 // x1 = (Lx >> 16) add ptr, pixel, Lx, asr #16 // ptr = pixel + x1
rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add ptr, pixel, Lx // ptr = pixel + x1
divLUT inv, width // inv = FixedInvU(width) divLUT inv, width // inv = FixedInvU(width)
sub dtdx, Rt, Lt // dtdx = Rt - Lt sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv scaleUV dtdx, dtmp, inv
// t == Lt (alias) // t == Lt (alias)
sub dgdx, Rg, Lg // dgdx = Rg - Lg sub dgdx, Rg, Lg // dgdx = Rg - Lg
mul dgdx, inv // dgdx *= FixedInvU(width) mul dgdx, inv // dgdx *= FixedInvU(width)
asr dgdx, #15 // dgdx >>= 15 asr dgdx, #16 // dgdx >>= 16
// g == Lg (alias) // g == Lg (alias)
// 2 bytes alignment (VRAM write requirement) // 2 bytes alignment (VRAM write requirement)
@@ -244,8 +219,8 @@ rasterizeGTA_asm:
.skip_left: .skip_left:
add ptr, #1 add ptr, #1
add g, dgdx
add t, dtdx add t, dtdx
add g, dgdx, asr #1
subs width, #1 // width-- subs width, #1 // width--
beq .scanline_end // if (width == 0) beq .scanline_end // if (width == 0)
@@ -305,26 +280,24 @@ rasterizeGTA_asm:
.scanline_end: .scanline_end:
ldmfd sp!, {Lx, Lg, Lt} ldmfd sp!, {Lx, Lg, Lt}
ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg} .scanline_end_fast:
fiq_on
add Lx, sLdx add Lx, Ldx
add Lg, sLdg add Rx, Rdx
add Lt, sLdt add Lg, Ldg
add Rx, sRdx add Rg, Rdg
add Rg, sRdg add Lt, Ldt
add Rt, Rdt
ldr sRdt, [sp, #SP_RDT] fiq_off
add Rt, sRdt
add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)
subs h, #1 subs h, #1
bne .scanline_start bne .scanline_start
add tmp, sp, #SP_L ldmfd sp!, {L, R, Lh, Rh}
ldmia tmp, {L, R, Lh, Rh}
b .loop b .loop
.exit: .exit:
add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE] add sp, #SP_SIZE // revert reserved space for [TILE]
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r11, pc}

View File

@@ -1,8 +1,8 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 pixel .req r0 // arg
L .req r1 L .req r1 // arg
R .req r2 R .req r2 // arg
tmp .req r12 tmp .req r12
index .req L index .req L
width .req R width .req R

View File

@@ -1,8 +1,8 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 pixel .req r0 // arg
L .req r1 L .req r1 // arg
R .req r2 R .req r2 // arg
tmp .req r12 tmp .req r12
index .req L index .req L
height .req R height .req R

View File

@@ -1,30 +1,33 @@
#include "common_asm.inc" #include "common_asm.inc"
pixel .req r0 pixel .req r0 // arg
L .req r1 L .req r1 // arg
R .req r2 R .req r2 // arg
LMAP .req r3 LMAP .req r3
Lh .req r4 Lh .req r4
Rh .req r5 Rh .req r5
Lx .req r6 Lx .req r6
Rx .req r7 Rx .req r7
// FIQ regs
Ldx .req r8 Ldx .req r8
Rdx .req r9 Rdx .req r9
N .req r10 N .req r10
tmp .req r11 tmp .req r11
pair .req r12 pair .req r12
width .req lr width .req r13
indexA .req r14
h .req N h .req N
Rxy .req tmp Rxy .req tmp
Ry2 .req Rh Ry2 .req Rh
Lxy .req tmp Lxy .req tmp
Ly2 .req Lh Ly2 .req Lh
indexA .req Lh
indexB .req pair indexB .req pair
.global rasterizeS_asm .global rasterizeS_asm
rasterizeS_asm: rasterizeS_asm:
stmfd sp!, {r4-r11, lr} stmfd sp!, {r4-r7}
fiq_on
mov LMAP, #LMAP_ADDR mov LMAP, #LMAP_ADDR
add LMAP, #0x1A00 add LMAP, #0x1A00
@@ -88,8 +91,6 @@ rasterizeS_asm:
sub Lh, h // Lh -= h sub Lh, h // Lh -= h
sub Rh, h // Rh -= h sub Rh, h // Rh -= h
stmfd sp!, {Lh}
.scanline_start: .scanline_start:
asr tmp, Lx, #16 // x1 = (Lx >> 16) asr tmp, Lx, #16 // x1 = (Lx >> 16)
rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1 rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
@@ -142,8 +143,9 @@ rasterizeS_asm:
subs h, #1 subs h, #1
bne .scanline_start bne .scanline_start
ldmfd sp!, {Lh}
b .loop b .loop
.exit: .exit:
ldmfd sp!, {r4-r11, pc} fiq_off
ldmfd sp!, {r4-r7}
bx lr

View File

@@ -2,4 +2,4 @@
.global rasterize_dummy .global rasterize_dummy
rasterize_dummy: rasterize_dummy:
mov pc, lr bx lr

View File

@@ -1,16 +1,17 @@
#include "common_asm.inc" #include "common_asm.inc"
x .req r0 x .req r0 // arg
y .req r1 y .req r1 // arg
z .req r2 z .req r2 // arg
r .req r3 r .req r3 // arg
mx .req r4 // FIQ regs
my .req r5 mx .req r8
mz .req r6 my .req r9
vx .req r7 mz .req r10
vy .req r8 vx .req r11
vz .req r12 vy .req r12
m .req lr vz .req r13
m .req r14
tmp .req m tmp .req m
vp .req m vp .req m
vMinXY .req z vMinXY .req z
@@ -23,7 +24,7 @@ rMaxY .req y
.global sphereIsVisible_asm .global sphereIsVisible_asm
sphereIsVisible_asm: sphereIsVisible_asm:
stmfd sp!, {r4-r8, lr} fiq_on
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr m, [m] ldr m, [m]
@@ -75,10 +76,10 @@ sphereIsVisible_asm:
bgt .fail bgt .fail
mov r0, #1 mov r0, #1
ldmfd sp!, {r4-r8, lr} fiq_off
bx lr bx lr
.fail: .fail:
mov r0, #0 mov r0, #0
ldmfd sp!, {r4-r8, lr} fiq_off
bx lr bx lr

View File

@@ -1,24 +1,36 @@
#include "common_asm.inc" #include "common_asm.inc"
vertices .req r0 vertices .req r0 // arg
count .req r1 count .req r1 // arg
intensity .req r2 intensity .req r2 // arg
m .req r3 vx .req intensity
vg .req intensity vy .req r3
vx .req r4 vz .req r4
vy .req r5 x .req r5
vz .req r6 y .req r6
mx .req r7 z .req vx
my .req r8 mx0 .req r7
mz .req r9
x .req r10
y .req r11
z .req r12
res .req lr
ambient .req vx mx2 .req r8
my2 .req r9
mz2 .req r10
mw2 .req r11
res .req r12
vg .req lr
// FIQ regs
my0 .req r8
mz0 .req r9
mw0 .req r10
mx1 .req r11
my1 .req r12
mz1 .req r13
mw1 .req r14
ambient .req vz
tmp .req vy tmp .req vy
dz .req vx dz .req vz
m .req vz
.global transformMesh_asm .global transformMesh_asm
transformMesh_asm: transformMesh_asm:
@@ -38,6 +50,9 @@ transformMesh_asm:
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr m, [m] ldr m, [m]
fiq_on
ldmia m!, {mx0, my0, mz0, mw0, mx1, my1, mz1, mw1}
ldmia m, {mx2, my2, mz2, mw2}^
.loop: .loop:
// unpack vertex // unpack vertex
@@ -45,30 +60,26 @@ transformMesh_asm:
ldrsh vy, [vertices], #2 ldrsh vy, [vertices], #2
ldrsh vz, [vertices], #2 ldrsh vz, [vertices], #2
bic vg, #CLIP_MASK // clear clipping flags
// transform x // transform x
ldmia m!, {mx, my, mz, x} mla x, mx0, vx, mw0
mla x, mx, vx, x mla x, my0, vy, x
mla x, my, vy, x mla x, mz0, vz, x
mla x, mz, vz, x
asr x, #FIXED_SHIFT asr x, #FIXED_SHIFT
// transform y // transform y
ldmia m!, {mx, my, mz, y} mla y, mx1, vx, mw1
mla y, mx, vx, y mla y, my1, vy, y
mla y, my, vy, y mla y, mz1, vz, y
mla y, mz, vz, y
asr y, #FIXED_SHIFT asr y, #FIXED_SHIFT
fiq_off
// transform z // transform z
ldmia m!, {mx, my, mz, z} mla z, mx2, vx, mw2
mla z, mx, vx, z mla z, my2, vy, z
mla z, my, vy, z mla z, mz2, vz, z
mla z, mz, vz, z
asr z, #FIXED_SHIFT asr z, #FIXED_SHIFT
sub m, #(12 * 4) // restore matrix ptr bic vg, #CLIP_MASK // clear clipping flags
// z clipping // z clipping
cmp z, #VIEW_MIN cmp z, #VIEW_MIN
@@ -102,6 +113,7 @@ transformMesh_asm:
strh vg, [res], #2 strh vg, [res], #2
subs count, #1 subs count, #1
fiq_on_ne
bne .loop bne .loop
ldmfd sp!, {r4-r11, pc} ldmfd sp!, {r4-r11, pc}

View File

@@ -1,33 +1,41 @@
#include "common_asm.inc" #include "common_asm.inc"
vertices .req r0 vertices .req r0 // arg
count .req r1 count .req r1 // arg
m .req r2 vx .req r2
v .req r3 vy .req r3
vx .req r4 vz .req r4
vy .req r5 x .req vx
vz .req r6 y .req r5
vg .req v z .req r6
mx .req r7 mx0 .req r7
my .req r8
mz .req r9
x .req r10
y .req r11
z .req r12
res .req lr
t .req y
spMinXY .req x mx2 .req r8
spMaxXY .req y my2 .req r9
mz2 .req r10
mw2 .req r11
res .req r12
vg .req lr
mask .req x // FIQ regs
vp .req vx my0 .req r8
minXY .req vx mz0 .req r9
maxXY .req vy mw0 .req r10
mx1 .req r11
my1 .req r12
mz1 .req r13
mw1 .req r14
tmp .req my m .req vx
dz .req mz v .req vg
fog .req mz mask .req y
minXY .req vy
maxXY .req vz
tmp .req vy
dz .req vz
fog .req vz
SP_MINXY = 0 SP_MINXY = 0
SP_MAXXY = 4 SP_MAXXY = 4
@@ -41,18 +49,18 @@ transformRoom_asm:
ldr res, [res] ldr res, [res]
add res, #VERTEX_G add res, #VERTEX_G
ldr tmp, =viewportRel
ldmia tmp, {minXY, maxXY}
stmfd sp!, {minXY, maxXY}
mov mask, #(0xFF << 10)
ldr m, =gMatrixPtr ldr m, =gMatrixPtr
ldr m, [m] ldr m, [m]
fiq_on
ldr vp, =viewportRel ldmia m!, {mx0, my0, mz0, mw0, mx1, my1, mz1, mw1}
ldmia vp, {spMinXY, spMaxXY} ldmia m, {mx2, my2, mz2, mw2}^
fiq_off
stmfd sp!, {spMinXY, spMaxXY}
// preload mask, matrix and z-row
mov mask, #(0xFF << 10)
add m, #(12 * 4)
ldmdb m!, {mx, my, mz, z}
.loop: .loop:
// unpack vertex // unpack vertex
@@ -63,33 +71,33 @@ transformRoom_asm:
and vx, mask, v, lsl #10 and vx, mask, v, lsl #10
// transform z // transform z
mla t, mx, vx, z mla z, mx2, vx, mw2
mla t, my, vy, t mla z, my2, vy, z
mla t, mz, vz, t mla z, mz2, vz, z
asr t, #FIXED_SHIFT asr z, #FIXED_SHIFT
// skip if vertex is out of z-range // skip if vertex is out of z-range
add t, #VIEW_OFF add z, #VIEW_OFF
cmp t, #(VIEW_OFF + VIEW_OFF + VIEW_MAX) cmp z, #(VIEW_OFF + VIEW_OFF + VIEW_MAX)
movhi vg, #(CLIP_NEAR + CLIP_FAR) movhi vg, #(CLIP_NEAR + CLIP_FAR)
bhi .skip bhi .skip
and vg, mask, v, lsr #14 and vg, mask, v, lsr #14
sub z, t, #VIEW_OFF sub z, #VIEW_OFF
fiq_on
// transform y // transform y
ldmdb m!, {mx, my, mz, y} mla y, mx1, vx, mw1
mla y, mx, vx, y mla y, my1, vy, y
mla y, my, vy, y mla y, mz1, vz, y
mla y, mz, vz, y
asr y, #FIXED_SHIFT asr y, #FIXED_SHIFT
// transform x // transform x
ldmdb m!, {mx, my, mz, x} mla x, mx0, vx, mw0
mla x, mx, vx, x mla x, my0, vy, x
mla x, my, vy, x mla x, mz0, vz, x
mla x, mz, vz, x
asr x, #FIXED_SHIFT asr x, #FIXED_SHIFT
fiq_off
// fog // fog
cmp z, #FOG_MIN cmp z, #FOG_MIN
@@ -145,11 +153,7 @@ transformRoom_asm:
strh y, [res, #-4] strh y, [res, #-4]
strh z, [res, #-2] strh z, [res, #-2]
// preload mask, matrix and z-row
mov mask, #(0xFF << 10) mov mask, #(0xFF << 10)
add m, #(12 * 4)
ldmdb m!, {mx, my, mz, z}
.skip: .skip:
strh vg, [res], #8 strh vg, [res], #8

View File

@@ -1,46 +1,57 @@
#include "common_asm.inc" #include "common_asm.inc"
vertices .req r0 vertices .req r0 // arg
count .req r1 count .req r1 // arg
m .req r2 vx .req r2
v .req r3 vy .req r3
vx .req r4 vz .req r4
vy .req r5 x .req vx
vz .req r6 y .req r5
vg .req v z .req r6
mx .req r7 mx0 .req r7
my .req r8
mz .req r9
x .req r10
y .req r11
z .req r12
res .req lr
t .req y
spMinXY .req mx mx2 .req r8
spMaxXY .req my my2 .req r9
spFrame .req mz mz2 .req r10
spCaustLUT .req x mw2 .req r11
spRandLUT .req y res .req r12
vg .req lr
mask .req x // FIQ regs
vp .req vx my0 .req r8
minXY .req vx mz0 .req r9
maxXY .req vy mw0 .req r10
mx1 .req r11
my1 .req r12
mz1 .req r13
mw1 .req r14
dz .req mz m .req vx
fog .req mz v .req vg
mask .req y
frame .req vx minXY .req vy
maxXY .req vz
tmp .req vy
dz .req vz
fog .req vz
frame .req vy
caust .req vy caust .req vy
rand .req vz rand .req vz
tmp .req mx
spMinXY .req vx
spMaxXY .req vy
spRandLUT .req vz
spFrame .req y
spCaustLUT .req z
SP_MINXY = 0 SP_MINXY = 0
SP_MAXXY = 4 SP_MAXXY = 4
SP_FRAME = 8 SP_RAND = 8
SP_CAUST = 12 SP_FRAME = 12
SP_RAND = 16 SP_CAUST = 16
SP_SIZE = 20 SP_SIZE = 20
.global transformRoomUW_asm .global transformRoomUW_asm
@@ -51,11 +62,8 @@ transformRoomUW_asm:
ldr res, [res] ldr res, [res]
add res, #VERTEX_G add res, #VERTEX_G
ldr m, =gMatrixPtr ldr tmp, =viewportRel
ldr m, [m] ldmia tmp, {spMinXY, spMaxXY}
ldr vp, =viewportRel
ldmia vp, {spMinXY, spMaxXY}
ldr spFrame, =gCausticsFrame ldr spFrame, =gCausticsFrame
ldr spFrame, [spFrame] ldr spFrame, [spFrame]
@@ -63,12 +71,16 @@ transformRoomUW_asm:
ldr spCaustLUT, =gCaustics ldr spCaustLUT, =gCaustics
ldr spRandLUT, =gRandTable ldr spRandLUT, =gRandTable
stmfd sp!, {spMinXY, spMaxXY, spFrame, spCaustLUT, spRandLUT} stmfd sp!, {spMinXY, spMaxXY, spRandLUT, spFrame, spCaustLUT}
// preload mask, matrix and z-row
mov mask, #(0xFF << 10) mov mask, #(0xFF << 10)
add m, #(12 * 4)
ldmdb m!, {mx, my, mz, z} ldr m, =gMatrixPtr
ldr m, [m]
fiq_on
ldmia m!, {mx0, my0, mz0, mw0, mx1, my1, mz1, mw1}
ldmia m, {mx2, my2, mz2, mw2}^
fiq_off
.loop: .loop:
// unpack vertex // unpack vertex
@@ -79,41 +91,42 @@ transformRoomUW_asm:
and vx, mask, v, lsl #10 and vx, mask, v, lsl #10
// transform z // transform z
mla t, mx, vx, z mla z, mx2, vx, mw2
mla t, my, vy, t mla z, my2, vy, z
mla t, mz, vz, t mla z, mz2, vz, z
asr t, #FIXED_SHIFT asr z, #FIXED_SHIFT
// skip if vertex is out of z-range // skip if vertex is out of z-range
add t, #VIEW_OFF add z, #VIEW_OFF
cmp t, #(VIEW_OFF + VIEW_OFF + VIEW_MAX) cmp z, #(VIEW_OFF + VIEW_OFF + VIEW_MAX)
movhi vg, #(CLIP_NEAR + CLIP_FAR) movhi vg, #(CLIP_NEAR + CLIP_FAR)
bhi .skip bhi .skip
and vg, mask, v, lsr #14 and vg, mask, v, lsr #14
sub z, t, #VIEW_OFF sub z, #VIEW_OFF
fiq_on
// transform y // transform y
ldmdb m!, {mx, my, mz, y} mla y, mx1, vx, mw1
mla y, mx, vx, y mla y, my1, vy, y
mla y, my, vy, y mla y, mz1, vz, y
mla y, mz, vz, y
asr y, #FIXED_SHIFT asr y, #FIXED_SHIFT
// transform x // transform x
ldmdb m!, {mx, my, mz, x} mla x, mx0, vx, mw0
mla x, mx, vx, x mla x, my0, vy, x
mla x, my, vy, x mla x, mz0, vz, x
mla x, mz, vz, x
asr x, #FIXED_SHIFT asr x, #FIXED_SHIFT
fiq_off
// caustics // caustics
add tmp, sp, #SP_FRAME ldr rand, [sp, #SP_RAND]
ldmia tmp, {frame, caust, rand}
and tmp, count, #(MAX_RAND_TABLE - 1) and tmp, count, #(MAX_RAND_TABLE - 1)
ldr rand, [rand, tmp, lsl #2] ldr rand, [rand, tmp, lsl #2]
ldr frame, [sp, #SP_FRAME]
add rand, frame add rand, frame
and rand, #(MAX_CAUSTICS - 1) and rand, #(MAX_CAUSTICS - 1)
ldr caust, [sp, #SP_CAUST]
ldr caust, [caust, rand, lsl #2] ldr caust, [caust, rand, lsl #2]
add vg, caust, lsl #5 add vg, caust, lsl #5
@@ -171,11 +184,7 @@ transformRoomUW_asm:
strh y, [res, #-4] strh y, [res, #-4]
strh z, [res, #-2] strh z, [res, #-2]
// preload mask, matrix and z-row
mov mask, #(0xFF << 10) mov mask, #(0xFF << 10)
add m, #(12 * 4)
ldmdb m!, {mx, my, mz, z}
.skip: .skip:
strh vg, [res], #8 strh vg, [res], #8