diff --git a/src/platform/gba/asm/boxRotateYQ.s b/src/platform/gba/asm/boxRotateYQ.s index ad01f14..b284b65 100644 --- a/src/platform/gba/asm/boxRotateYQ.s +++ b/src/platform/gba/asm/boxRotateYQ.s @@ -1,7 +1,7 @@ #include "common_asm.inc" -v .req r0 -q .req r1 +v .req r0 // arg +q .req r1 // arg min .req q max .req r2 diff --git a/src/platform/gba/asm/boxTranslate.s b/src/platform/gba/asm/boxTranslate.s index b68c256..4508bdd 100644 --- a/src/platform/gba/asm/boxTranslate.s +++ b/src/platform/gba/asm/boxTranslate.s @@ -1,19 +1,20 @@ #include "common_asm.inc" -aabb .req r0 -x .req r1 -y .req r2 -z .req r3 -minX .req r4 -maxX .req r5 -minY .req r6 -maxY .req r7 +aabb .req r0 // arg +x .req r1 // arg +y .req r2 // arg +z .req r3 // arg +// FIQ regs +minX .req r8 +maxX .req r9 +minY .req r10 +maxY .req r11 minZ .req r12 -maxZ .req lr +maxZ .req r13 .global boxTranslate_asm boxTranslate_asm: - stmfd sp!, {r4-r7, lr} + fiq_on ldmia aabb, {minX, maxX, minY, maxY, minZ, maxZ} add minX, minX, x @@ -24,5 +25,5 @@ boxTranslate_asm: add maxZ, maxZ, z stmia aabb, {minX, maxX, minY, maxY, minZ, maxZ} - ldmfd sp!, {r4-r7, lr} + fiq_off bx lr diff --git a/src/platform/gba/asm/common_asm.inc b/src/platform/gba/asm/common_asm.inc index d9c4766..6707718 100644 --- a/src/platform/gba/asm/common_asm.inc +++ b/src/platform/gba/asm/common_asm.inc @@ -120,15 +120,17 @@ ble \skip .endm -.macro scaleUV uv, u, v, f - asr \u, \uv, #16 - mul \u, \f // u = f * int16(uv >> 16) - lsl \v, \uv, #16 - asr \v, #16 - mul \v, \f // v = f * int16(uv) - lsr \u, #16 - lsl \u, #16 - orr \uv, \u, \v, lsr #16 // uv = (u & 0xFFFF0000) | (v >> 16) +.macro scaleUV uv, tmp, f + asr \tmp, \uv, #16 + mul \tmp, \f // u = f * int16(uv >> 16) + + lsl \uv, \uv, #16 + asr \uv, #16 + mul \uv, \f // v = f * int16(uv) + + lsr \tmp, #16 + lsl \tmp, #16 + orr \uv, \tmp, \uv, lsr #16 // uv = (u & 0xFFFF0000) | (v >> 16) .endm .macro tex index, uv @@ -140,3 +142,19 @@ .macro lit index ldrb \index, [LMAP, \index] .endm + 
+.macro fiq_on + msr cpsr_c, #0x11 // switch r8-r14 to FIQ bank (IRQ enabled); control field only, NZCV flags preserved +.endm + +.macro fiq_off + msr cpsr_c, #0x1F // back to System mode (restore r8-r14); control field only, NZCV flags preserved +.endm + +.macro fiq_on_ne + msrne cpsr_c, #0x11 // conditional (NE) switch r8-r14 to FIQ bank (IRQ enabled); flags preserved +.endm + +.macro fiq_off_ne + msrne cpsr_c, #0x1F // conditional (NE) return to System mode (restore r8-r14); flags preserved so a following bne still sees the original Z +.endm diff --git a/src/platform/gba/asm/faceAddMeshQuads.s b/src/platform/gba/asm/faceAddMeshQuads.s index 89ab6f6..c5e2311 100644 --- a/src/platform/gba/asm/faceAddMeshQuads.s +++ b/src/platform/gba/asm/faceAddMeshQuads.s @@ -1,19 +1,21 @@ #include "common_asm.inc" -polys .req r0 -count .req r1 +polys .req r0 // arg +count .req r1 // arg vp .req r2 vg0 .req r3 vg1 .req r4 vg2 .req r5 vg3 .req r6 flags .req r7 +// FIQ regs vp0 .req r8 vp1 .req r9 vp2 .req r10 vp3 .req r11 ot .req r12 -face .req lr +face .req r13 +vertices .req r14 vx0 .req vg0 vy0 .req vg1 @@ -29,21 +31,18 @@ vz3 .req vg3 depth .req vg0 tmp .req flags -vertices .req vg2 next .req vp0 -SP_SIZE = 4 - .global faceAddMeshQuads_asm faceAddMeshQuads_asm: - stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r7} + fiq_on ldr vp, =gVerticesBase ldr vp, [vp] ldr vertices, =gVertices lsr vertices, #3 - stmfd sp!, {vertices} ldr face, =gFacesBase ldr face, [face] @@ -97,7 +96,6 @@ faceAddMeshQuads_asm: lsr depth, #(2 + OT_SHIFT) // faceAdd - ldr vertices, [sp] rsb vp0, vertices, vp0, lsr #3 rsb vp1, vertices, vp1, lsr #3 rsb vp2, vertices, vp2, lsr #3 @@ -116,5 +114,6 @@ faceAddMeshQuads_asm: ldr tmp, =gFacesBase str face, [tmp] - add sp, #SP_SIZE - ldmfd sp!, {r4-r11, pc} + fiq_off + ldmfd sp!, {r4-r7} + bx lr diff --git a/src/platform/gba/asm/faceAddMeshTriangles.s b/src/platform/gba/asm/faceAddMeshTriangles.s index 97ea9e6..d4f4a85 100644 --- a/src/platform/gba/asm/faceAddMeshTriangles.s +++ b/src/platform/gba/asm/faceAddMeshTriangles.s @@ -1,19 +1,20 @@ #include "common_asm.inc" -polys .req r0 -count .req r1 +polys .req r0 // arg +count .req r1 // arg vp .req r2 vg0 .req r3 vg1 .req r4 vg2 .req r5 vg3 .req r6 -flags .req r7 -vp0 .req r8 
-vp1 .req r9 -vp2 .req r10 -vertices .req r11 -ot .req r12 -face .req lr +// FIQ regs +flags .req r8 +vp0 .req r9 +vp1 .req r10 +vp2 .req r11 +vertices .req r12 +ot .req r13 +face .req r14 vx0 .req vg0 vy0 .req vg1 @@ -32,7 +33,8 @@ next .req vp0 .global faceAddMeshTriangles_asm faceAddMeshTriangles_asm: - stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r6} + fiq_on ldr vp, =gVerticesBase ldr vp, [vp] @@ -102,4 +104,6 @@ faceAddMeshTriangles_asm: ldr tmp, =gFacesBase str face, [tmp] - ldmfd sp!, {r4-r11, pc} + fiq_off + ldmfd sp!, {r4-r6} + bx lr diff --git a/src/platform/gba/asm/faceAddRoomQuads.s b/src/platform/gba/asm/faceAddRoomQuads.s index 78abb89..6a8ef77 100644 --- a/src/platform/gba/asm/faceAddRoomQuads.s +++ b/src/platform/gba/asm/faceAddRoomQuads.s @@ -1,19 +1,21 @@ #include "common_asm.inc" -polys .req r0 -count .req r1 +polys .req r0 // arg +count .req r1 // arg vp .req r2 vg0 .req r3 vg1 .req r4 vg2 .req r5 vg3 .req r6 flags .req r7 +// FIQ regs vp0 .req r8 vp1 .req r9 vp2 .req r10 vp3 .req r11 ot .req r12 -face .req lr +face .req r13 +vertices .req r14 vx0 .req vg0 vy0 .req vg1 @@ -29,21 +31,20 @@ vz3 .req vg3 depth .req vg0 tmp .req flags -vertices .req vg2 next .req vp0 SP_SIZE = 4 .global faceAddRoomQuads_asm faceAddRoomQuads_asm: - stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r7} + fiq_on ldr vp, =gVerticesBase ldr vp, [vp] ldr vertices, =gVertices lsr vertices, #3 - stmfd sp!, {vertices} ldr face, =gFacesBase ldr face, [face] @@ -107,7 +108,6 @@ faceAddRoomQuads_asm: mov depth, vz0, lsr #OT_SHIFT // faceAdd - ldr vertices, [sp] rsb vp0, vertices, vp0, lsr #3 rsb vp1, vertices, vp1, lsr #3 rsb vp2, vertices, vp2, lsr #3 @@ -126,5 +126,6 @@ faceAddRoomQuads_asm: ldr tmp, =gFacesBase str face, [tmp] - add sp, #SP_SIZE - ldmfd sp!, {r4-r11, pc} + fiq_off + ldmfd sp!, {r4-r7} + bx lr diff --git a/src/platform/gba/asm/faceAddRoomTriangles.s b/src/platform/gba/asm/faceAddRoomTriangles.s index 644ee78..d9f278f 100644 --- 
a/src/platform/gba/asm/faceAddRoomTriangles.s +++ b/src/platform/gba/asm/faceAddRoomTriangles.s @@ -1,19 +1,20 @@ #include "common_asm.inc" -polys .req r0 -count .req r1 +polys .req r0 // arg +count .req r1 // arg vp .req r2 vg0 .req r3 vg1 .req r4 vg2 .req r5 vg3 .req r6 -flags .req r7 -vp0 .req r8 -vp1 .req r9 -vp2 .req r10 -vertices .req r11 -ot .req r12 -face .req lr +// FIQ regs +flags .req r8 +vp0 .req r9 +vp1 .req r10 +vp2 .req r11 +vertices .req r12 +ot .req r13 +face .req r14 vx0 .req vg0 vy0 .req vg1 @@ -32,7 +33,8 @@ next .req vp0 .global faceAddRoomTriangles_asm faceAddRoomTriangles_asm: - stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r6} + fiq_on ldr vp, =gVerticesBase ldr vp, [vp] @@ -110,4 +112,6 @@ faceAddRoomTriangles_asm: ldr tmp, =gFacesBase str face, [tmp] - ldmfd sp!, {r4-r11, pc} + fiq_off + ldmfd sp!, {r4-r6} + bx lr diff --git a/src/platform/gba/asm/getSector.s b/src/platform/gba/asm/getSector.s index 49d6147..098ac25 100644 --- a/src/platform/gba/asm/getSector.s +++ b/src/platform/gba/asm/getSector.s @@ -1,8 +1,8 @@ #include "common_asm.inc" -this .req r0 -x .req r1 -z .req r2 +this .req r0 // arg +x .req r1 // arg +z .req r2 // arg info .req r3 roomX .req r12 roomZ .req roomX diff --git a/src/platform/gba/asm/matrixLerp.s b/src/platform/gba/asm/matrixLerp.s index f309fd2..41d9fcb 100644 --- a/src/platform/gba/asm/matrixLerp.s +++ b/src/platform/gba/asm/matrixLerp.s @@ -1,15 +1,16 @@ #include "common_asm.inc" -n .req r0 -pmul .req r1 -pdiv .req r2 -m0 .req r3 -m1 .req r4 -m2 .req r5 -n0 .req r6 -n1 .req r7 -n2 .req r12 -m .req lr +n .req r0 // arg +pmul .req r1 // arg +pdiv .req r2 // arg +// FIQ regs +m0 .req r8 +m1 .req r9 +m2 .req r10 +n0 .req r11 +n1 .req r12 +n2 .req r13 +m .req r14 tmp .req m0 .macro load @@ -83,7 +84,7 @@ tmp .req m0 .global matrixLerp_asm matrixLerp_asm: - stmfd sp!, {r4-r7, lr} + fiq_on ldr m, =gMatrixPtr ldr m, [m] .check_2: @@ -111,5 +112,5 @@ matrixLerp_asm: mov pmul, tmp, asr #8 lerp _X_Y .done: - ldmfd sp!, {r4-r7, 
lr} + fiq_off bx lr diff --git a/src/platform/gba/asm/matrixPush.s b/src/platform/gba/asm/matrixPush.s index ae091cf..a4bdc3b 100644 --- a/src/platform/gba/asm/matrixPush.s +++ b/src/platform/gba/asm/matrixPush.s @@ -5,25 +5,26 @@ e1 .req r1 e2 .req r2 e3 .req r3 m .req e0 -src .req r12 -dst .req lr +// FIQ regs +src .req r8 +dst .req r9 +e4 .req r10 +e5 .req r11 .global matrixPush_asm matrixPush_asm: - stmfd sp!, {lr} + fiq_on + ldr m, =gMatrixPtr ldr src, [m] add dst, src, #(12 * 4) str dst, [m] - ldmia src!, {e0, e1, e2, e3} - stmia dst!, {e0, e1, e2, e3} + ldmia src!, {e0, e1, e2, e3, e4, e5} + stmia dst!, {e0, e1, e2, e3, e4, e5} - ldmia src!, {e0, e1, e2, e3} - stmia dst!, {e0, e1, e2, e3} + ldmia src!, {e0, e1, e2, e3, e4, e5} + stmia dst!, {e0, e1, e2, e3, e4, e5} - ldmia src!, {e0, e1, e2, e3} - stmia dst!, {e0, e1, e2, e3} - - ldmfd sp!, {lr} + fiq_off bx lr diff --git a/src/platform/gba/asm/matrixRotate.s b/src/platform/gba/asm/matrixRotate.s index f5d7079..32d1419 100644 --- a/src/platform/gba/asm/matrixRotate.s +++ b/src/platform/gba/asm/matrixRotate.s @@ -18,17 +18,18 @@ mov \x, \x, asr #FIXED_SHIFT .endm -angle .req r0 -e0 .req r1 -e1 .req r2 -s .req r3 -c .req r12 -v .req lr +angle .req r0 // arg +s .req r1 +c .req r2 +v .req r3 +// FIQ regs +e0 .req r8 +e1 .req r9 m .req angle .global matrixRotateX_asm matrixRotateX_asm: - stmfd sp!, {lr} + fiq_on mov angle, angle, lsl #16 mov angle, angle, lsr #20 @@ -53,12 +54,12 @@ matrixRotateX_asm: rotxy e1, e0, s, c, v stmia m, {e0, e1} - ldmfd sp!, {lr} + fiq_off bx lr .global matrixRotateY_asm matrixRotateY_asm: - stmfd sp!, {lr} + fiq_on mov angle, angle, lsl #16 mov angle, angle, lsr #20 @@ -86,12 +87,12 @@ matrixRotateY_asm: str e0, [m], #8 str e1, [m], #8 - ldmfd sp!, {lr} + fiq_off bx lr .global matrixRotateZ_asm matrixRotateZ_asm: - stmfd sp!, {lr} + fiq_on mov angle, angle, lsl #16 mov angle, angle, lsr #20 @@ -115,23 +116,24 @@ matrixRotateZ_asm: rotxy e1, e0, s, c, v stmia m, {e0, e1} - ldmfd sp!, 
{lr} + fiq_off bx lr -angleX .req r0 -angleY .req r1 -angleZ .req r2 +angleX .req r0 // arg +angleY .req r1 // arg +angleZ .req r2 // arg e00 .req r3 e01 .req r4 e02 .req r5 e10 .req r6 -e11 .req r7 -e12 .req r8 -e20 .req r9 -e21 .req r10 -e22 .req r11 +// FIQ regs +e11 .req r8 +e12 .req r9 +e20 .req r10 +e21 .req r11 tmp .req r12 -sinX .req lr +e22 .req r13 +sinX .req r14 sinY .req sinX sinZ .req sinX cosX .req angleX @@ -153,7 +155,8 @@ matrixRotateYXZ_asm: orrs mask, mask, angleZ bxeq lr - stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r6} + fiq_on ldr mm, =gMatrixPtr ldr mm, [mm] @@ -203,10 +206,11 @@ matrixRotateYXZ_asm: add mm, #(4 * 4) stmia mm, {e20, e21, e22} - ldmfd sp!, {r4-r11, lr} + fiq_off + ldmfd sp!, {r4-r6} bx lr -q .req r0 +q .req r0 // arg n .req r1 mx .req r3 my .req q diff --git a/src/platform/gba/asm/matrixSetBasis.s b/src/platform/gba/asm/matrixSetBasis.s index 4d71c68..4315943 100644 --- a/src/platform/gba/asm/matrixSetBasis.s +++ b/src/platform/gba/asm/matrixSetBasis.s @@ -1,7 +1,7 @@ #include "common_asm.inc" -dst .req r0 -src .req r1 +dst .req r0 // arg +src .req r1 // arg e0 .req r2 e1 .req r3 diff --git a/src/platform/gba/asm/matrixTranslate.s b/src/platform/gba/asm/matrixTranslate.s index ffd5888..2ae2e1e 100644 --- a/src/platform/gba/asm/matrixTranslate.s +++ b/src/platform/gba/asm/matrixTranslate.s @@ -1,17 +1,18 @@ #include "common_asm.inc" -x .req r0 -y .req r1 -z .req r2 -e0 .req r3 -e1 .req r4 -e2 .req r5 -v .req r12 -m .req lr +x .req r0 // arg +y .req r1 // arg +z .req r2 // arg +m .req r3 +// FIQ regs +e0 .req r8 +e1 .req r9 +e2 .req r10 +v .req r11 .global matrixTranslateRel_asm matrixTranslateRel_asm: - stmfd sp!, {r4-r5, lr} + fiq_on ldr m, =gMatrixPtr ldr m, [m] @@ -37,12 +38,12 @@ matrixTranslateRel_asm: mla v, e2, z, v stmdb m, {v} - ldmfd sp!, {r4-r5, lr} + fiq_off bx lr .global matrixTranslateAbs_asm matrixTranslateAbs_asm: - stmfd sp!, {r4-r5, lr} + fiq_on ldr v, =gCameraViewPos ldmia v, {e0, e1, e2} @@ -74,12 +75,12 @@ 
matrixTranslateAbs_asm: mla v, e2, z, v stmia m!, {v} - ldmfd sp!, {r4-r5, lr} + fiq_off bx lr .global matrixTranslateSet_asm matrixTranslateSet_asm: - stmfd sp!, {r4-r5, lr} + fiq_on ldr m, =gMatrixPtr ldr m, [m] @@ -105,5 +106,5 @@ matrixTranslateSet_asm: mla v, e2, z, v stmia m!, {v} - ldmfd sp!, {r4-r5, lr} + fiq_off bx lr diff --git a/src/platform/gba/asm/rasterize.s b/src/platform/gba/asm/rasterize.s index db543e1..08998d8 100644 --- a/src/platform/gba/asm/rasterize.s +++ b/src/platform/gba/asm/rasterize.s @@ -1,7 +1,7 @@ #include "common_asm.inc" -flags .req r0 -L .req r1 +flags .req r0 // arg +L .req r1 // arg R .req r2 y .req r3 type .req r12 diff --git a/src/platform/gba/asm/rasterizeF.s b/src/platform/gba/asm/rasterizeF.s index 28cb217..d2b9016 100644 --- a/src/platform/gba/asm/rasterizeF.s +++ b/src/platform/gba/asm/rasterizeF.s @@ -1,19 +1,20 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -color .req r2 +pixel .req r0 // arg +L .req r1 // arg +color .req r2 // arg index .req r3 Lh .req r4 Rh .req r5 Lx .req r6 -Rx .req r7 -Ldx .req r8 -Rdx .req r9 -N .req r10 -tmp .req r11 -pair .req r12 -width .req lr +// FIQ regs +Rx .req r8 +Ldx .req r9 +Rdx .req r10 +N .req r11 +tmp .req r12 +pair .req r13 +width .req r14 R .req color h .req N @@ -26,7 +27,8 @@ ptr .req tmp .global rasterizeF_asm rasterizeF_asm: - stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r6} + fiq_on add LMAP, color, #LMAP_ADDR ldrb tmp, [L, #VERTEX_G] @@ -133,4 +135,6 @@ rasterizeF_asm: b .loop .exit: - ldmfd sp!, {r4-r11, pc} \ No newline at end of file + fiq_off + ldmfd sp!, {r4-r6} + bx lr diff --git a/src/platform/gba/asm/rasterizeFT.s b/src/platform/gba/asm/rasterizeFT.s index 8cca5ae..d92837c 100644 --- a/src/platform/gba/asm/rasterizeFT.s +++ b/src/platform/gba/asm/rasterizeFT.s @@ -1,94 +1,85 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 -LMAP .req r3 +arg_pixel .req r0 // arg +arg_L .req r1 // arg +arg_R .req r2 // arg -TILE .req r4 -tmp .req r5 -N .req r6 -Lh 
.req r7 -Rh .req r8 +N .req r0 +tmp .req r1 +Lx .req r2 +Rx .req r3 +Lt .req r4 +Rt .req r5 +t .req r6 +dtdx .req r7 -Lx .req r9 -Rx .req r10 -Lt .req r11 -Rt .req r12 -h .req lr +indexA .req r8 +indexB .req r9 +LMAP .req r10 +TILE .req r11 +pixel .req r12 +width .req lr + +// FIQ regs +Ldx .req r8 +Rdx .req r9 +Ldt .req r10 +Rdt .req r11 +LRh .req r12 +L .req r13 +R .req r14 + +Rh .req LRh +Lh .req t + +h .req N ptr .req tmp -Ldx .req h -Rdx .req h - -Ldt .req h -Rdt .req h - -indexA .req Lh -indexB .req Rh Rxy .req tmp Ry2 .req Rh Lxy .req tmp Ly2 .req Lh -inv .req Lh -width .req N -t .req L -dtdx .req R +inv .req indexA +duv .req indexB +dtmp .req t -duv .req R -du .req L -dv .req R - -Ldu .req N -Ldv .req h - -Rdu .req N -Rdv .req h +Ltmp .req N +Rtmp .req N Rti .req indexB -sLdx .req tmp -sLdt .req N -sRdx .req Lh -sRdt .req Rh - -SP_LDX = 0 -SP_LDT = 4 -SP_RDX = 8 -SP_RDT = 12 -SP_L = 16 -SP_R = 20 -SP_LH = 24 -SP_RH = 28 -SP_SIZE = 32 - .macro PUT_PIXELS tex indexA, t lit indexA add t, dtdx, lsl #1 - //orr indexA, indexA, lsl #8 strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free .endm .global rasterizeFT_asm rasterizeFT_asm: stmfd sp!, {r4-r11, lr} - sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt] + + mov pixel, arg_pixel mov LMAP, #LMAP_ADDR - ldrb tmp, [L, #VERTEX_G] - add LMAP, tmp, lsl #8 // tmp = (L->v.g << 8) + ldrb t, [arg_L, #VERTEX_G] + add LMAP, t, lsl #8 // LMAP = (L->v.g << 8) ldr TILE, =gTile ldr TILE, [TILE] - mov Lh, #0 // Lh = 0 - mov Rh, #0 // Rh = 0 + fiq_on + mov L, arg_L + mov R, arg_R + mov LRh, #0 // Lh = 0 .loop: + lsr Lh, LRh, #16 + lsl Rh, LRh, #16 + lsr Rh, Rh, #16 cmp Lh, #0 bgt .calc_left_end // if (Lh != 0) end with left @@ -114,12 +105,10 @@ rasterizeFT_asm: ldrsh Ldx, [L, #VERTEX_X] sub Ldx, Lx, asr #16 mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) - str Ldx, [sp, #SP_LDX] // store Ldx to stack ldr Ldt, [L, #VERTEX_T] sub Ldt, Lt // Ldt = N->v.t - Lt - scaleUV Ldt, Ldu, Ldv, tmp - 
str Ldt, [sp, #SP_LDT] // store Ldt to stack + scaleUV Ldt, Ltmp, tmp .calc_left_end: cmp Rh, #0 @@ -146,12 +135,10 @@ rasterizeFT_asm: ldrsh Rdx, [R, #VERTEX_X] sub Rdx, Rx, asr #16 mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) - str Rdx, [sp, #SP_RDX] // store Rdx to stack ldr Rdt, [R, #VERTEX_T] sub Rdt, Rt // Rdt = N->v.t - Rt - scaleUV Rdt, Rdu, Rdv, tmp - str Rdt, [sp, #SP_RDT] // store Rdt to stack + scaleUV Rdt, Rtmp, tmp .calc_right_end: cmp Rh, Lh // if (Rh < Lh) @@ -160,8 +147,9 @@ rasterizeFT_asm: sub Lh, h // Lh -= h sub Rh, h // Rh -= h - add tmp, sp, #SP_L - stmia tmp, {L, R, Lh, Rh} + orr LRh, Rh, Lh, lsl #16 + + fiq_off .scanline_start: asr tmp, Lx, #16 // x1 = (Lx >> 16) @@ -173,7 +161,7 @@ rasterizeFT_asm: divLUT inv, width // inv = FixedInvU(width) sub dtdx, Rt, Lt // duv = Rt - Lt - scaleUV dtdx, du, dv, inv + scaleUV dtdx, dtmp, inv mov t, Lt // t = Lt @@ -237,21 +225,20 @@ rasterizeFT_asm: bne .scanline_block_8px .scanline_end: - ldmia sp, {sLdx, sLdt, sRdx, sRdt} - add Lx, sLdx - add Lt, sLdt - add Rx, sRdx - add Rt, sRdt + add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) - add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) + fiq_on + add Lx, Ldx + add Rx, Rdx + add Lt, Ldt + add Rt, Rdt subs h, #1 + fiq_off_ne bne .scanline_start - add tmp, sp, #SP_L - ldmia tmp, {L, R, Lh, Rh} b .loop .exit: - add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt] - ldmfd sp!, {r4-r11, pc} \ No newline at end of file + fiq_off + ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/rasterizeFTA.s b/src/platform/gba/asm/rasterizeFTA.s index 20fe35b..85c8a30 100644 --- a/src/platform/gba/asm/rasterizeFTA.s +++ b/src/platform/gba/asm/rasterizeFTA.s @@ -1,69 +1,55 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 -LMAP .req r3 +arg_pixel .req r0 // arg +arg_L .req r1 // arg +arg_R .req r2 // arg -TILE .req r4 -tmp .req r5 -N .req r6 -Lh .req r7 -Rh .req r8 +N .req r0 +tmp .req r1 +Lx .req r2 +Rx .req r3 +Lt .req r4 +Rt .req 
r5 +t .req r6 +dtdx .req r7 -Lx .req r9 -Rx .req r10 -Lt .req r11 -Rt .req r12 -h .req lr +indexA .req r8 +indexB .req r9 +LMAP .req r10 +TILE .req r11 +pixel .req r12 +width .req lr + +// FIQ regs +Ldx .req r8 +Rdx .req r9 +Ldt .req r10 +Rdt .req r11 +LRh .req r12 +L .req r13 +R .req r14 + +Rh .req LRh +Lh .req t + +h .req N ptr .req tmp -Ldx .req h -Rdx .req h - -Ldt .req h -Rdt .req h - -indexA .req Lh -indexB .req Rh Rxy .req tmp Ry2 .req Rh Lxy .req tmp Ly2 .req Lh -inv .req Lh -width .req N -t .req L -dtdx .req R +inv .req indexA +duv .req indexB +dtmp .req t -duv .req R -du .req L -dv .req R - -Ldu .req N -Ldv .req h - -Rdu .req N -Rdv .req h +Ltmp .req N +Rtmp .req N Rti .req indexB -sLdx .req tmp -sLdt .req N -sRdx .req Lh -sRdt .req Rh - -SP_LDX = 0 -SP_LDT = 4 -SP_RDX = 8 -SP_RDT = 12 -SP_L = 16 -SP_R = 20 -SP_LH = 24 -SP_RH = 28 -SP_SIZE = 32 - .macro PUT_PIXELS tex indexA, t add t, dtdx, lsl #1 @@ -76,22 +62,28 @@ SP_SIZE = 32 .global rasterizeFTA_asm rasterizeFTA_asm: stmfd sp!, {r4-r11, lr} - sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt] + + mov pixel, arg_pixel mov LMAP, #LMAP_ADDR - ldrb tmp, [L, #VERTEX_G] - add LMAP, tmp, lsl #8 // tmp = (L->v.g << 8) + ldrb t, [arg_L, #VERTEX_G] + add LMAP, t, lsl #8 // LMAP = (L->v.g << 8) ldr TILE, =gTile ldr TILE, [TILE] - mov Lh, #0 // Lh = 0 - mov Rh, #0 // Rh = 0 + fiq_on + mov L, arg_L + mov R, arg_R + mov LRh, #0 // Lh = 0 .loop: + lsr Lh, LRh, #16 + lsl Rh, LRh, #16 + lsr Rh, Rh, #16 cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left + bgt .calc_left_end // if (Lh != 0) end with left .calc_left_start: ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev @@ -114,16 +106,14 @@ rasterizeFTA_asm: ldrsh Ldx, [L, #VERTEX_X] sub Ldx, Lx, asr #16 mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) - str Ldx, [sp, #SP_LDX] // store Ldx to stack ldr Ldt, [L, #VERTEX_T] sub Ldt, Lt // Ldt = N->v.t - Lt - scaleUV Ldt, Ldu, Ldv, tmp - str Ldt, [sp, #SP_LDT] // store Ldt to stack + scaleUV Ldt, Ltmp, tmp 
.calc_left_end: cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right + bgt .calc_right_end // if (Rh != 0) end with right .calc_right_start: ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next @@ -131,7 +121,7 @@ rasterizeFTA_asm: ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y - subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y + subs Rh, Ry2, Rxy, asr #16 // Rh = Ry2 - Rxy blt .exit // if (Rh < 0) return ldrne Rt, [R, #VERTEX_T] // Rt = R->t mov R, N // R = N @@ -146,12 +136,10 @@ rasterizeFTA_asm: ldrsh Rdx, [R, #VERTEX_X] sub Rdx, Rx, asr #16 mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) - str Rdx, [sp, #SP_RDX] // store Rdx to stack ldr Rdt, [R, #VERTEX_T] sub Rdt, Rt // Rdt = N->v.t - Rt - scaleUV Rdt, Rdu, Rdv, tmp - str Rdt, [sp, #SP_RDT] // store Rdt to stack + scaleUV Rdt, Rtmp, tmp .calc_right_end: cmp Rh, Lh // if (Rh < Lh) @@ -160,8 +148,9 @@ rasterizeFTA_asm: sub Lh, h // Lh -= h sub Rh, h // Rh -= h - add tmp, sp, #SP_L - stmia tmp, {L, R, Lh, Rh} + orr LRh, Rh, Lh, lsl #16 + + fiq_off .scanline_start: asr tmp, Lx, #16 // x1 = (Lx >> 16) @@ -173,7 +162,7 @@ rasterizeFTA_asm: divLUT inv, width // inv = FixedInvU(width) sub dtdx, Rt, Lt // duv = Rt - Lt - scaleUV dtdx, du, dv, inv + scaleUV dtdx, dtmp, inv mov t, Lt // t = Lt @@ -241,21 +230,20 @@ rasterizeFTA_asm: bne .scanline_block_8px .scanline_end: - ldmia sp, {sLdx, sLdt, sRdx, sRdt} - add Lx, sLdx - add Lt, sLdt - add Rx, sRdx - add Rt, sRdt + add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) - add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) + fiq_on + add Lx, Ldx + add Rx, Rdx + add Lt, Ldt + add Rt, Rdt subs h, #1 + fiq_off_ne bne .scanline_start - add tmp, sp, #SP_L - ldmia tmp, {L, R, Lh, Rh} b .loop .exit: - add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt] - ldmfd sp!, {r4-r11, pc} \ No newline at end of file + fiq_off + ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/rasterizeFillS.s 
b/src/platform/gba/asm/rasterizeFillS.s index cb787ce..eaba805 100644 --- a/src/platform/gba/asm/rasterizeFillS.s +++ b/src/platform/gba/asm/rasterizeFillS.s @@ -1,20 +1,22 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 -p .req r4 -w .req r5 -indexA .req r6 -indexB .req r12 -shade .req lr +pixel .req r0 // arg +L .req r1 // arg +R .req r2 // arg +p .req r3 +// FIQ regs +w .req r8 +indexA .req r9 +indexB .req r10 +shade .req r11 + width .req L height .req R LMAP .req shade .global rasterizeFillS_asm rasterizeFillS_asm: - stmfd sp!, {r4-r6, lr} + fiq_on add R, #VERTEX_SIZEOF ldrsh p, [L, #VERTEX_X] @@ -68,4 +70,5 @@ rasterizeFillS_asm: subs height, #1 bne .loop - ldmfd sp!, {r4-r6, pc} + fiq_off + bx lr diff --git a/src/platform/gba/asm/rasterizeGT.s b/src/platform/gba/asm/rasterizeGT.s index 9a596cb..7d7c3e2 100644 --- a/src/platform/gba/asm/rasterizeGT.s +++ b/src/platform/gba/asm/rasterizeGT.s @@ -1,39 +1,37 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 +arg_pixel .req r0 // arg +arg_L .req r1 // arg +arg_R .req r2 // arg -Lh .req r3 -Rh .req r4 - -Lx .req r5 -Rx .req r6 - -Lg .req r7 -Rg .req r8 - -Lt .req r9 -Rt .req r10 - -tmp .req r11 -N .req r12 +N .req r0 +tmp .req r1 +Lx .req r2 +Rx .req r3 +Lg .req r4 +Rg .req r5 +Lt .req r6 +Rt .req r7 +L .req r8 +R .req r9 +Lh .req r10 +Rh .req r11 +pixel .req r12 TILE .req lr +// FIQ regs +Ldx .req r8 +Rdx .req r9 +Ldg .req r10 +Rdg .req r11 +Ldt .req r12 +Rdt .req r13 + h .req N LMAP .req tmp -Ldx .req h -Rdx .req h - -Ldg .req h -Rdg .req h - -Ldt .req h -Rdt .req h - indexA .req Lh indexB .req tmp @@ -52,57 +50,37 @@ dgdx .req L t .req Lt dtdx .req R -du .req L -dv .req R +dtmp .req L -Ldu .req TILE -Ldv .req N - -Rdu .req TILE -Rdv .req N +Ltmp .req N +Rtmp .req N Rti .req tmp Rgi .req tmp -sLdx .req L -sLdg .req R -sLdt .req Lh -sRdx .req Rh -sRdg .req tmp -sRdt .req tmp // not enough regs for one ldmia - -SP_LDX = 0 -SP_LDG = 4 -SP_LDT = 8 -SP_RDX = 12 -SP_RDG = 16 -SP_RDT = 
20 -SP_L = 24 -SP_R = 28 -SP_LH = 32 -SP_RH = 36 -SP_SIZE = 40 -SP_TILE = SP_SIZE +SP_TILE = 0 +SP_SIZE = 4 .macro PUT_PIXELS bic LMAP, g, #255 - add g, dgdx tex indexA, t lit indexA - - add t, dtdx, lsl #1 - //orr indexA, indexA, lsl #8 strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free + + add g, dgdx, lsl #1 + add t, dtdx, lsl #1 .endm .global rasterizeGT_asm rasterizeGT_asm: ldr r3, =gTile ldr r3, [r3] - stmfd sp!, {r3-r11, lr} - sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt] + + mov pixel, arg_pixel + mov L, arg_L + mov R, arg_R mov Lh, #0 // Lh = 0 mov Rh, #0 // Rh = 0 @@ -132,21 +110,20 @@ rasterizeGT_asm: divLUT tmp, Lh // tmp = FixedInvU(Lh) - ldrsh Ldx, [L, #VERTEX_X] + fiq_on + ldrsh Ldx, [N, #VERTEX_X] sub Ldx, Lx, asr #16 mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) - str Ldx, [sp, #SP_LDX] // store Ldx to stack - ldrb Ldg, [L, #VERTEX_G] + ldrb Ldg, [N, #VERTEX_G] sub Ldg, Lg, lsr #8 mul Ldg, tmp // Ldg = tmp * (N->v.g - Lg) asr Ldg, #8 // 8-bit for fractional part - str Ldg, [sp, #SP_LDG] // store Ldg to stack - ldr Ldt, [L, #VERTEX_T] + ldr Ldt, [N, #VERTEX_T] sub Ldt, Lt // Ldt = N->v.t - Lt - scaleUV Ldt, Ldu, Ldv, tmp - str Ldt, [sp, #SP_LDT] // store Ldt to stack + scaleUV Ldt, Ltmp, tmp + fiq_off .calc_left_end: cmp Rh, #0 @@ -172,21 +149,20 @@ rasterizeGT_asm: divLUT tmp, Rh // tmp = FixedInvU(Rh) - ldrsh Rdx, [R, #VERTEX_X] + fiq_on + ldrsh Rdx, [N, #VERTEX_X] sub Rdx, Rx, asr #16 mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) - str Rdx, [sp, #SP_RDX] // store Rdx to stack - ldrb Rdg, [R, #VERTEX_G] + ldrb Rdg, [N, #VERTEX_G] sub Rdg, Rg, lsr #8 mul Rdg, tmp // Rdg = tmp * (N->v.g - Rg) asr Rdg, #8 // 8-bit for fractional part - str Rdg, [sp, #SP_RDG] // store Ldg to stack - ldr Rdt, [R, #VERTEX_T] + ldr Rdt, [N, #VERTEX_T] sub Rdt, Rt // Rdt = N->v.t - Rt - scaleUV Rdt, Rdu, Rdv, tmp - str Rdt, [sp, #SP_RDT] // store Rdt to stack + scaleUV Rdt, Rtmp, tmp + fiq_off .calc_right_end: orr Lg, 
#LMAP_ADDR @@ -200,27 +176,26 @@ rasterizeGT_asm: ldr TILE, [sp, #SP_TILE] - add tmp, sp, #SP_L - stmia tmp, {L, R, Lh, Rh} + stmfd sp!, {L, R, Lh, Rh} .scanline_start: + asr Lh, Lx, #16 // x1 = (Lx >> 16) + rsbs width, Lh, Rx, asr #16 // width = (Rx >> 16) - x1 + ble .scanline_end_fast // if (width <= 0) go next scanline + stmfd sp!, {Lx, Lg, Lt} - asr Lx, Lx, #16 // x1 = (Lx >> 16) - rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1 - ble .scanline_end // if (width <= 0) go next scanline - - add ptr, pixel, Lx // ptr = pixel + x1 + add ptr, pixel, Lx, asr #16 // ptr = pixel + x1 divLUT inv, width // inv = FixedInvU(width) sub dtdx, Rt, Lt // dtdx = Rt - Lt - scaleUV dtdx, du, dv, inv + scaleUV dtdx, dtmp, inv // t == Lt (alias) sub dgdx, Rg, Lg // dgdx = Rg - Lg mul dgdx, inv // dgdx *= FixedInvU(width) - asr dgdx, #15 // dgdx >>= 15 + asr dgdx, #16 // dgdx >>= 16 // g == Lg (alias) // 2 bytes alignment (VRAM write requirement) @@ -229,18 +204,19 @@ rasterizeGT_asm: beq .align_right bic LMAP, g, #255 - add g, dgdx, asr #1 tex indexA, t lit indexA ldrb indexB, [ptr, #-1]! 
// read pal index from VRAM (byte) orr indexB, indexA, lsl #8 strh indexB, [ptr], #2 - add t, dtdx subs width, #1 // width-- beq .scanline_end // if (width == 0) + add g, dgdx + add t, dtdx + .align_right: tst width, #1 beq .align_block_4px @@ -248,7 +224,7 @@ rasterizeGT_asm: sub Rti, Rt, dtdx tex indexA, Rti - sub Rgi, Rg, dgdx, asr #1 + sub Rgi, Rg, dgdx bic LMAP, Rgi, #255 lit indexA @@ -289,34 +265,25 @@ rasterizeGT_asm: .scanline_end: ldmfd sp!, {Lx, Lg, Lt} -/* TEST FIQ - mrs r1, cpsr // save current program status reg - msr cpsr, #0x11 // switch to FIQ mode with extra r8-r14 regs - mov r8, #0 // trash FIQ regs and - mov r10, #0 // it shouldn't affect normal mode regs -// mov r11, r11 - msr cpsr, r1 // restore current program status reg -*/ - ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg} - add Lx, sLdx - add Lg, sLdg - add Lt, sLdt - add Rx, sRdx - add Rg, sRdg - - ldr sRdt, [sp, #SP_RDT] - add Rt, sRdt +.scanline_end_fast: + fiq_on + add Lx, Ldx + add Rx, Rdx + add Lg, Ldg + add Rg, Rdg + add Lt, Ldt + add Rt, Rdt + fiq_off add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) subs h, #1 bne .scanline_start - add tmp, sp, #SP_L - ldmia tmp, {L, R, Lh, Rh} + ldmfd sp!, {L, R, Lh, Rh} b .loop .exit: - add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE] - ldmfd sp!, {r4-r11, pc} \ No newline at end of file + add sp, #SP_SIZE // revert reserved space for [TILE] + ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/rasterizeGTA.s b/src/platform/gba/asm/rasterizeGTA.s index 5afd6be..ba438de 100644 --- a/src/platform/gba/asm/rasterizeGTA.s +++ b/src/platform/gba/asm/rasterizeGTA.s @@ -1,39 +1,37 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 +arg_pixel .req r0 // arg +arg_L .req r1 // arg +arg_R .req r2 // arg -Lh .req r3 -Rh .req r4 - -Lx .req r5 -Rx .req r6 - -Lg .req r7 -Rg .req r8 - -Lt .req r9 -Rt .req r10 - -tmp .req r11 -N .req r12 +N .req r0 +tmp .req r1 +Lx .req r2 +Rx .req r3 +Lg .req r4 +Rg .req 
r5 +Lt .req r6 +Rt .req r7 +L .req r8 +R .req r9 +Lh .req r10 +Rh .req r11 +pixel .req r12 TILE .req lr +// FIQ regs +Ldx .req r8 +Rdx .req r9 +Ldg .req r10 +Rdg .req r11 +Ldt .req r12 +Rdt .req r13 + h .req N LMAP .req tmp -Ldx .req h -Rdx .req h - -Ldg .req h -Rdg .req h - -Ldt .req h -Rdt .req h - indexA .req Lh indexB .req tmp @@ -52,59 +50,39 @@ dgdx .req L t .req Lt dtdx .req R +dtmp .req L -duv .req R -du .req L -dv .req R - -Ldu .req TILE -Ldv .req N - -Rdu .req TILE -Rdv .req N +Ltmp .req N +Rtmp .req N Rti .req tmp Rgi .req tmp -sLdx .req L -sLdg .req R -sLdt .req Lh -sRdx .req Rh -sRdg .req tmp -sRdt .req tmp // not enough regs for one ldmia - -SP_LDX = 0 -SP_LDG = 4 -SP_LDT = 8 -SP_RDX = 12 -SP_RDG = 16 -SP_RDT = 20 -SP_L = 24 -SP_R = 28 -SP_LH = 32 -SP_RH = 36 -SP_SIZE = 40 -SP_TILE = SP_SIZE +SP_TILE = 0 +SP_SIZE = 4 .macro PUT_PIXELS bic LMAP, g, #255 - add g, dgdx tex indexA, t - add t, dtdx, lsl #1 cmp indexA, #0 ldrneb indexA, [LMAP, indexA] strneb indexA, [ptr] add ptr, #2 + + add g, dgdx, lsl #1 + add t, dtdx, lsl #1 .endm .global rasterizeGTA_asm rasterizeGTA_asm: ldr r3, =gTile ldr r3, [r3] - stmfd sp!, {r3-r11, lr} - sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt] + + mov pixel, arg_pixel + mov L, arg_L + mov R, arg_R mov Lh, #0 // Lh = 0 mov Rh, #0 // Rh = 0 @@ -134,21 +112,20 @@ rasterizeGTA_asm: divLUT tmp, Lh // tmp = FixedInvU(Lh) - ldrsh Ldx, [L, #VERTEX_X] + fiq_on + ldrsh Ldx, [N, #VERTEX_X] sub Ldx, Lx, asr #16 mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) - str Ldx, [sp, #SP_LDX] // store Ldx to stack - ldrb Ldg, [L, #VERTEX_G] + ldrb Ldg, [N, #VERTEX_G] sub Ldg, Lg, lsr #8 mul Ldg, tmp // Ldg = tmp * (N->v.g - Lg) asr Ldg, #8 // 8-bit for fractional part - str Ldg, [sp, #SP_LDG] // store Ldg to stack - ldr Ldt, [L, #VERTEX_T] + ldr Ldt, [N, #VERTEX_T] sub Ldt, Lt // Ldt = N->v.t - Lt - scaleUV Ldt, Ldu, Ldv, tmp - str Ldt, [sp, #SP_LDT] // store Ldt to stack + scaleUV Ldt, Ltmp, tmp + fiq_off .calc_left_end: 
cmp Rh, #0 @@ -174,21 +151,20 @@ rasterizeGTA_asm: divLUT tmp, Rh // tmp = FixedInvU(Rh) - ldrsh Rdx, [R, #VERTEX_X] + fiq_on + ldrsh Rdx, [N, #VERTEX_X] sub Rdx, Rx, asr #16 mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx) - str Rdx, [sp, #SP_RDX] // store Rdx to stack - ldrb Rdg, [R, #VERTEX_G] + ldrb Rdg, [N, #VERTEX_G] sub Rdg, Rg, lsr #8 mul Rdg, tmp // Rdg = tmp * (N->v.g - Rg) asr Rdg, #8 // 8-bit for fractional part - str Rdg, [sp, #SP_RDG] // store Ldg to stack - ldr Rdt, [R, #VERTEX_T] + ldr Rdt, [N, #VERTEX_T] sub Rdt, Rt // Rdt = N->v.t - Rt - scaleUV Rdt, Rdu, Rdv, tmp - str Rdt, [sp, #SP_RDT] // store Rdt to stack + scaleUV Rdt, Rtmp, tmp + fiq_off .calc_right_end: orr Lg, #LMAP_ADDR @@ -202,27 +178,26 @@ rasterizeGTA_asm: ldr TILE, [sp, #SP_TILE] - add tmp, sp, #SP_L - stmia tmp, {L, R, Lh, Rh} + stmfd sp!, {L, R, Lh, Rh} .scanline_start: + asr Lh, Lx, #16 // x1 = (Lx >> 16) + rsbs width, Lh, Rx, asr #16 // width = (Rx >> 16) - x1 + ble .scanline_end_fast // if (width <= 0) go next scanline + stmfd sp!, {Lx, Lg, Lt} - asr Lx, Lx, #16 // x1 = (Lx >> 16) - rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1 - ble .scanline_end // if (width <= 0) go next scanline - - add ptr, pixel, Lx // ptr = pixel + x1 + add ptr, pixel, Lx, asr #16 // ptr = pixel + x1 divLUT inv, width // inv = FixedInvU(width) sub dtdx, Rt, Lt // dtdx = Rt - Lt - scaleUV dtdx, du, dv, inv + scaleUV dtdx, dtmp, inv // t == Lt (alias) sub dgdx, Rg, Lg // dgdx = Rg - Lg mul dgdx, inv // dgdx *= FixedInvU(width) - asr dgdx, #15 // dgdx >>= 15 + asr dgdx, #16 // dgdx >>= 16 // g == Lg (alias) // 2 bytes alignment (VRAM write requirement) @@ -244,8 +219,8 @@ rasterizeGTA_asm: .skip_left: add ptr, #1 + add g, dgdx add t, dtdx - add g, dgdx, asr #1 subs width, #1 // width-- beq .scanline_end // if (width == 0) @@ -305,26 +280,24 @@ rasterizeGTA_asm: .scanline_end: ldmfd sp!, {Lx, Lg, Lt} - ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg} - - add Lx, sLdx - add Lg, sLdg - add Lt, sLdt - add Rx, sRdx - 
add Rg, sRdg - - ldr sRdt, [sp, #SP_RDT] - add Rt, sRdt +.scanline_end_fast: + fiq_on + add Lx, Ldx + add Rx, Rdx + add Lg, Ldg + add Rg, Rdg + add Lt, Ldt + add Rt, Rdt + fiq_off add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240) subs h, #1 bne .scanline_start - add tmp, sp, #SP_L - ldmia tmp, {L, R, Lh, Rh} + ldmfd sp!, {L, R, Lh, Rh} b .loop .exit: - add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE] - ldmfd sp!, {r4-r11, pc} \ No newline at end of file + add sp, #SP_SIZE // revert reserved space for [TILE] + ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/rasterizeLineH.s b/src/platform/gba/asm/rasterizeLineH.s index fab7a0f..d246dee 100644 --- a/src/platform/gba/asm/rasterizeLineH.s +++ b/src/platform/gba/asm/rasterizeLineH.s @@ -1,8 +1,8 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 +pixel .req r0 // arg +L .req r1 // arg +R .req r2 // arg tmp .req r12 index .req L width .req R diff --git a/src/platform/gba/asm/rasterizeLineV.s b/src/platform/gba/asm/rasterizeLineV.s index b7bea1e..c2e6e0b 100644 --- a/src/platform/gba/asm/rasterizeLineV.s +++ b/src/platform/gba/asm/rasterizeLineV.s @@ -1,8 +1,8 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 +pixel .req r0 // arg +L .req r1 // arg +R .req r2 // arg tmp .req r12 index .req L height .req R diff --git a/src/platform/gba/asm/rasterizeS.s b/src/platform/gba/asm/rasterizeS.s index f232aad..9c8f999 100644 --- a/src/platform/gba/asm/rasterizeS.s +++ b/src/platform/gba/asm/rasterizeS.s @@ -1,30 +1,33 @@ #include "common_asm.inc" -pixel .req r0 -L .req r1 -R .req r2 +pixel .req r0 // arg +L .req r1 // arg +R .req r2 // arg LMAP .req r3 Lh .req r4 Rh .req r5 Lx .req r6 Rx .req r7 +// FIQ regs Ldx .req r8 Rdx .req r9 N .req r10 tmp .req r11 pair .req r12 -width .req lr +width .req r13 +indexA .req r14 + h .req N Rxy .req tmp Ry2 .req Rh Lxy .req tmp Ly2 .req Lh -indexA .req Lh indexB .req pair .global rasterizeS_asm rasterizeS_asm: 
- stmfd sp!, {r4-r11, lr} + stmfd sp!, {r4-r7} + fiq_on mov LMAP, #LMAP_ADDR add LMAP, #0x1A00 @@ -88,8 +91,6 @@ rasterizeS_asm: sub Lh, h // Lh -= h sub Rh, h // Rh -= h - stmfd sp!, {Lh} - .scanline_start: asr tmp, Lx, #16 // x1 = (Lx >> 16) rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1 @@ -142,8 +143,9 @@ rasterizeS_asm: subs h, #1 bne .scanline_start - ldmfd sp!, {Lh} b .loop .exit: - ldmfd sp!, {r4-r11, pc} \ No newline at end of file + fiq_off + ldmfd sp!, {r4-r7} + bx lr diff --git a/src/platform/gba/asm/rasterize_dummy.s b/src/platform/gba/asm/rasterize_dummy.s index 0a889b4..0222499 100644 --- a/src/platform/gba/asm/rasterize_dummy.s +++ b/src/platform/gba/asm/rasterize_dummy.s @@ -2,4 +2,4 @@ .global rasterize_dummy rasterize_dummy: - mov pc, lr \ No newline at end of file + bx lr \ No newline at end of file diff --git a/src/platform/gba/asm/sphereIsVisible.s b/src/platform/gba/asm/sphereIsVisible.s index 1a343ee..71589ac 100644 --- a/src/platform/gba/asm/sphereIsVisible.s +++ b/src/platform/gba/asm/sphereIsVisible.s @@ -1,16 +1,17 @@ #include "common_asm.inc" -x .req r0 -y .req r1 -z .req r2 -r .req r3 -mx .req r4 -my .req r5 -mz .req r6 -vx .req r7 -vy .req r8 -vz .req r12 -m .req lr +x .req r0 // arg +y .req r1 // arg +z .req r2 // arg +r .req r3 // arg +// FIQ regs +mx .req r8 +my .req r9 +mz .req r10 +vx .req r11 +vy .req r12 +vz .req r13 +m .req r14 tmp .req m vp .req m vMinXY .req z @@ -23,7 +24,7 @@ rMaxY .req y .global sphereIsVisible_asm sphereIsVisible_asm: - stmfd sp!, {r4-r8, lr} + fiq_on ldr m, =gMatrixPtr ldr m, [m] @@ -75,10 +76,10 @@ sphereIsVisible_asm: bgt .fail mov r0, #1 - ldmfd sp!, {r4-r8, lr} + fiq_off bx lr .fail: mov r0, #0 - ldmfd sp!, {r4-r8, lr} + fiq_off bx lr diff --git a/src/platform/gba/asm/transformMesh.s b/src/platform/gba/asm/transformMesh.s index 069f290..2df1083 100644 --- a/src/platform/gba/asm/transformMesh.s +++ b/src/platform/gba/asm/transformMesh.s @@ -1,24 +1,36 @@ #include "common_asm.inc" -vertices 
.req r0 -count .req r1 -intensity .req r2 -m .req r3 -vg .req intensity -vx .req r4 -vy .req r5 -vz .req r6 -mx .req r7 -my .req r8 -mz .req r9 -x .req r10 -y .req r11 -z .req r12 -res .req lr +vertices .req r0 // arg +count .req r1 // arg +intensity .req r2 // arg +vx .req intensity +vy .req r3 +vz .req r4 +x .req r5 +y .req r6 +z .req vx +mx0 .req r7 -ambient .req vx +mx2 .req r8 +my2 .req r9 +mz2 .req r10 +mw2 .req r11 +res .req r12 +vg .req lr + +// FIQ regs +my0 .req r8 +mz0 .req r9 +mw0 .req r10 +mx1 .req r11 +my1 .req r12 +mz1 .req r13 +mw1 .req r14 + +ambient .req vz tmp .req vy -dz .req vx +dz .req vz +m .req vz .global transformMesh_asm transformMesh_asm: @@ -38,6 +50,9 @@ transformMesh_asm: ldr m, =gMatrixPtr ldr m, [m] + fiq_on + ldmia m!, {mx0, my0, mz0, mw0, mx1, my1, mz1, mw1} + ldmia m, {mx2, my2, mz2, mw2}^ .loop: // unpack vertex @@ -45,30 +60,26 @@ transformMesh_asm: ldrsh vy, [vertices], #2 ldrsh vz, [vertices], #2 - bic vg, #CLIP_MASK // clear clipping flags - // transform x - ldmia m!, {mx, my, mz, x} - mla x, mx, vx, x - mla x, my, vy, x - mla x, mz, vz, x + mla x, mx0, vx, mw0 + mla x, my0, vy, x + mla x, mz0, vz, x asr x, #FIXED_SHIFT // transform y - ldmia m!, {mx, my, mz, y} - mla y, mx, vx, y - mla y, my, vy, y - mla y, mz, vz, y + mla y, mx1, vx, mw1 + mla y, my1, vy, y + mla y, mz1, vz, y asr y, #FIXED_SHIFT + fiq_off // transform z - ldmia m!, {mx, my, mz, z} - mla z, mx, vx, z - mla z, my, vy, z - mla z, mz, vz, z + mla z, mx2, vx, mw2 + mla z, my2, vy, z + mla z, mz2, vz, z asr z, #FIXED_SHIFT - sub m, #(12 * 4) // restore matrix ptr + bic vg, #CLIP_MASK // clear clipping flags // z clipping cmp z, #VIEW_MIN @@ -102,6 +113,7 @@ transformMesh_asm: strh vg, [res], #2 subs count, #1 + fiq_on_ne bne .loop ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/transformRoom.s b/src/platform/gba/asm/transformRoom.s index 7c2e8ce..2e8d9a5 100644 --- a/src/platform/gba/asm/transformRoom.s +++ b/src/platform/gba/asm/transformRoom.s @@ 
-1,33 +1,41 @@ #include "common_asm.inc" -vertices .req r0 -count .req r1 -m .req r2 -v .req r3 -vx .req r4 -vy .req r5 -vz .req r6 -vg .req v -mx .req r7 -my .req r8 -mz .req r9 -x .req r10 -y .req r11 -z .req r12 -res .req lr -t .req y +vertices .req r0 // arg +count .req r1 // arg +vx .req r2 +vy .req r3 +vz .req r4 +x .req vx +y .req r5 +z .req r6 +mx0 .req r7 -spMinXY .req x -spMaxXY .req y +mx2 .req r8 +my2 .req r9 +mz2 .req r10 +mw2 .req r11 +res .req r12 +vg .req lr -mask .req x -vp .req vx -minXY .req vx -maxXY .req vy +// FIQ regs +my0 .req r8 +mz0 .req r9 +mw0 .req r10 +mx1 .req r11 +my1 .req r12 +mz1 .req r13 +mw1 .req r14 -tmp .req my -dz .req mz -fog .req mz +m .req vx +v .req vg +mask .req y + +minXY .req vy +maxXY .req vz + +tmp .req vy +dz .req vz +fog .req vz SP_MINXY = 0 SP_MAXXY = 4 @@ -41,18 +49,18 @@ transformRoom_asm: ldr res, [res] add res, #VERTEX_G + ldr tmp, =viewportRel + ldmia tmp, {minXY, maxXY} + stmfd sp!, {minXY, maxXY} + + mov mask, #(0xFF << 10) + ldr m, =gMatrixPtr ldr m, [m] - - ldr vp, =viewportRel - ldmia vp, {spMinXY, spMaxXY} - - stmfd sp!, {spMinXY, spMaxXY} - - // preload mask, matrix and z-row - mov mask, #(0xFF << 10) - add m, #(12 * 4) - ldmdb m!, {mx, my, mz, z} + fiq_on + ldmia m!, {mx0, my0, mz0, mw0, mx1, my1, mz1, mw1} + ldmia m, {mx2, my2, mz2, mw2}^ + fiq_off .loop: // unpack vertex @@ -63,33 +71,33 @@ transformRoom_asm: and vx, mask, v, lsl #10 // transform z - mla t, mx, vx, z - mla t, my, vy, t - mla t, mz, vz, t - asr t, #FIXED_SHIFT + mla z, mx2, vx, mw2 + mla z, my2, vy, z + mla z, mz2, vz, z + asr z, #FIXED_SHIFT // skip if vertex is out of z-range - add t, #VIEW_OFF - cmp t, #(VIEW_OFF + VIEW_OFF + VIEW_MAX) + add z, #VIEW_OFF + cmp z, #(VIEW_OFF + VIEW_OFF + VIEW_MAX) movhi vg, #(CLIP_NEAR + CLIP_FAR) bhi .skip and vg, mask, v, lsr #14 - sub z, t, #VIEW_OFF + sub z, #VIEW_OFF + fiq_on // transform y - ldmdb m!, {mx, my, mz, y} - mla y, mx, vx, y - mla y, my, vy, y - mla y, mz, vz, y + mla y, mx1, vx, mw1 
+ mla y, my1, vy, y + mla y, mz1, vz, y asr y, #FIXED_SHIFT // transform x - ldmdb m!, {mx, my, mz, x} - mla x, mx, vx, x - mla x, my, vy, x - mla x, mz, vz, x + mla x, mx0, vx, mw0 + mla x, my0, vy, x + mla x, mz0, vz, x asr x, #FIXED_SHIFT + fiq_off // fog cmp z, #FOG_MIN @@ -145,11 +153,7 @@ transformRoom_asm: strh y, [res, #-4] strh z, [res, #-2] - // preload mask, matrix and z-row mov mask, #(0xFF << 10) - add m, #(12 * 4) - ldmdb m!, {mx, my, mz, z} - .skip: strh vg, [res], #8 diff --git a/src/platform/gba/asm/transformRoomUW.s b/src/platform/gba/asm/transformRoomUW.s index b148533..4ec8c31 100644 --- a/src/platform/gba/asm/transformRoomUW.s +++ b/src/platform/gba/asm/transformRoomUW.s @@ -1,46 +1,57 @@ #include "common_asm.inc" -vertices .req r0 -count .req r1 -m .req r2 -v .req r3 -vx .req r4 -vy .req r5 -vz .req r6 -vg .req v -mx .req r7 -my .req r8 -mz .req r9 -x .req r10 -y .req r11 -z .req r12 -res .req lr -t .req y +vertices .req r0 // arg +count .req r1 // arg +vx .req r2 +vy .req r3 +vz .req r4 +x .req vx +y .req r5 +z .req r6 +mx0 .req r7 -spMinXY .req mx -spMaxXY .req my -spFrame .req mz -spCaustLUT .req x -spRandLUT .req y +mx2 .req r8 +my2 .req r9 +mz2 .req r10 +mw2 .req r11 +res .req r12 +vg .req lr -mask .req x -vp .req vx -minXY .req vx -maxXY .req vy +// FIQ regs +my0 .req r8 +mz0 .req r9 +mw0 .req r10 +mx1 .req r11 +my1 .req r12 +mz1 .req r13 +mw1 .req r14 -dz .req mz -fog .req mz +m .req vx +v .req vg +mask .req y -frame .req vx +minXY .req vy +maxXY .req vz + +tmp .req vy +dz .req vz +fog .req vz + +frame .req vy caust .req vy rand .req vz -tmp .req mx + +spMinXY .req vx +spMaxXY .req vy +spRandLUT .req vz +spFrame .req y +spCaustLUT .req z SP_MINXY = 0 SP_MAXXY = 4 -SP_FRAME = 8 -SP_CAUST = 12 -SP_RAND = 16 +SP_RAND = 8 +SP_FRAME = 12 +SP_CAUST = 16 SP_SIZE = 20 .global transformRoomUW_asm @@ -51,11 +62,8 @@ transformRoomUW_asm: ldr res, [res] add res, #VERTEX_G - ldr m, =gMatrixPtr - ldr m, [m] - - ldr vp, =viewportRel - ldmia vp, 
{spMinXY, spMaxXY} + ldr tmp, =viewportRel + ldmia tmp, {spMinXY, spMaxXY} ldr spFrame, =gCausticsFrame ldr spFrame, [spFrame] @@ -63,12 +71,16 @@ transformRoomUW_asm: ldr spCaustLUT, =gCaustics ldr spRandLUT, =gRandTable - stmfd sp!, {spMinXY, spMaxXY, spFrame, spCaustLUT, spRandLUT} + stmfd sp!, {spMinXY, spMaxXY, spRandLUT, spFrame, spCaustLUT} - // preload mask, matrix and z-row mov mask, #(0xFF << 10) - add m, #(12 * 4) - ldmdb m!, {mx, my, mz, z} + + ldr m, =gMatrixPtr + ldr m, [m] + fiq_on + ldmia m!, {mx0, my0, mz0, mw0, mx1, my1, mz1, mw1} + ldmia m, {mx2, my2, mz2, mw2}^ + fiq_off .loop: // unpack vertex @@ -79,41 +91,42 @@ transformRoomUW_asm: and vx, mask, v, lsl #10 // transform z - mla t, mx, vx, z - mla t, my, vy, t - mla t, mz, vz, t - asr t, #FIXED_SHIFT + mla z, mx2, vx, mw2 + mla z, my2, vy, z + mla z, mz2, vz, z + asr z, #FIXED_SHIFT // skip if vertex is out of z-range - add t, #VIEW_OFF - cmp t, #(VIEW_OFF + VIEW_OFF + VIEW_MAX) + add z, #VIEW_OFF + cmp z, #(VIEW_OFF + VIEW_OFF + VIEW_MAX) movhi vg, #(CLIP_NEAR + CLIP_FAR) bhi .skip and vg, mask, v, lsr #14 - sub z, t, #VIEW_OFF + sub z, #VIEW_OFF + fiq_on // transform y - ldmdb m!, {mx, my, mz, y} - mla y, mx, vx, y - mla y, my, vy, y - mla y, mz, vz, y + mla y, mx1, vx, mw1 + mla y, my1, vy, y + mla y, mz1, vz, y asr y, #FIXED_SHIFT // transform x - ldmdb m!, {mx, my, mz, x} - mla x, mx, vx, x - mla x, my, vy, x - mla x, mz, vz, x + mla x, mx0, vx, mw0 + mla x, my0, vy, x + mla x, mz0, vz, x asr x, #FIXED_SHIFT + fiq_off // caustics - add tmp, sp, #SP_FRAME - ldmia tmp, {frame, caust, rand} + ldr rand, [sp, #SP_RAND] and tmp, count, #(MAX_RAND_TABLE - 1) ldr rand, [rand, tmp, lsl #2] + ldr frame, [sp, #SP_FRAME] add rand, frame and rand, #(MAX_CAUSTICS - 1) + ldr caust, [sp, #SP_CAUST] ldr caust, [caust, rand, lsl #2] add vg, caust, lsl #5 @@ -171,11 +184,7 @@ transformRoomUW_asm: strh y, [res, #-4] strh z, [res, #-2] - // preload mask, matrix and z-row mov mask, #(0xFF << 10) - add m, #(12 * 
4) - ldmdb m!, {mx, my, mz, z} - .skip: strh vg, [res], #8