From ae63f1c090197fd842ff0af5d62fb68114052ecb Mon Sep 17 00:00:00 2001 From: XProger Date: Sun, 13 Feb 2022 08:46:25 +0300 Subject: [PATCH] #368 micro optimization reduce register pressure and LDRs in faceAdd* routines --- src/platform/gba/asm/common_asm.inc | 16 +++++++++++----- src/platform/gba/asm/faceAddMeshQuads.s | 21 ++++++++------------- src/platform/gba/asm/faceAddMeshTriangles.s | 19 ++++++++++--------- src/platform/gba/asm/faceAddRoomQuads.s | 21 ++++++++------------- src/platform/gba/asm/faceAddRoomTriangles.s | 14 ++++++++------ 5 files changed, 45 insertions(+), 46 deletions(-) diff --git a/src/platform/gba/asm/common_asm.inc b/src/platform/gba/asm/common_asm.inc index 9adc050..4da4c9d 100644 --- a/src/platform/gba/asm/common_asm.inc +++ b/src/platform/gba/asm/common_asm.inc @@ -97,19 +97,25 @@ ldrh \res, [\res, \x] .endm +// vx0 - vg0 +// vy0 - vg1 +// vx1 - vg2 +// vy1 - vg3 +// vx2 - vg2 +// vy2 - vg2 .macro CCW skip ldrsh vx0, [vp0, #VERTEX_X] ldrsh vy0, [vp0, #VERTEX_Y] ldrsh vx2, [vp2, #VERTEX_X] ldrsh vy1, [vp1, #VERTEX_Y] - rsb vx2, vx2, vx0 // reverse order for mla + rsb vx2, vx2, vx0 // reverse order for mla sub vy1, vy1, vy0 - mul tmp, vx2, vy1 + mul vy1, vx2, vy1 ldrsh vx1, [vp1, #VERTEX_X] + sub vx0, vx1, vx0 ldrsh vy2, [vp2, #VERTEX_Y] - sub vx1, vx1, vx0 - sub vy2, vy2, vy0 - mlas tmp, vx1, vy2, tmp + sub vy0, vy2, vy0 + mlas vy1, vx0, vy0, vy1 ble \skip .endm diff --git a/src/platform/gba/asm/faceAddMeshQuads.s b/src/platform/gba/asm/faceAddMeshQuads.s index 69fbeaf..4efa708 100644 --- a/src/platform/gba/asm/faceAddMeshQuads.s +++ b/src/platform/gba/asm/faceAddMeshQuads.s @@ -12,7 +12,7 @@ vp0 .req r8 vp1 .req r9 vp2 .req r10 vp3 .req r11 -tmp .req r12 +ot .req r12 face .req lr vx0 .req vg0 @@ -20,7 +20,7 @@ vy0 .req vg1 vx1 .req vg2 vy1 .req vg3 vx2 .req vg2 -vy2 .req vg3 +vy2 .req vg2 vz0 .req vg0 vz1 .req vg1 @@ -28,14 +28,10 @@ vz2 .req vg2 vz3 .req vg3 depth .req vg0 -ot .req vg1 +tmp .req flags vertices .req vg2 next .req vp0 
-SP_OT = 0 -SP_VERTICES = 4 -SP_SIZE = 8 - .global faceAddMeshQuads_asm faceAddMeshQuads_asm: stmfd sp!, {r4-r11, lr} @@ -47,15 +43,14 @@ faceAddMeshQuads_asm: ldr face, [face] ldr ot, =gOT - ldr vertices, =gVertices - stmfd sp!, {ot, vertices} + + add polys, #2 // skip flags .loop: - ldrh flags, [polys], #2 ldrb vp0, [polys], #1 ldrb vp1, [polys], #1 ldrb vp2, [polys], #1 - ldrb vp3, [polys], #1 + ldrb vp3, [polys], #3 // + flags add vp0, vp, vp0, lsl #3 add vp1, vp, vp1, lsl #3 @@ -82,6 +77,7 @@ faceAddMeshQuads_asm: orr tmp, tmp, vg2 orr tmp, tmp, vg3 tst tmp, #CLIP_MASK_VP + ldrh flags, [polys, #-8] orrne flags, flags, #FACE_CLIPPED // vz0 = AVG_Z4 (depth) @@ -95,7 +91,7 @@ faceAddMeshQuads_asm: mov depth, depth, lsr #(2 + OT_SHIFT) // faceAdd - ldmia sp, {ot, vertices} + ldr vertices, =gVertices sub vp0, vp0, vertices sub vp1, vp1, vertices @@ -117,5 +113,4 @@ faceAddMeshQuads_asm: ldr tmp, =gFacesBase str face, [tmp] - add sp, sp, #SP_SIZE ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/faceAddMeshTriangles.s b/src/platform/gba/asm/faceAddMeshTriangles.s index 919ef41..5997c08 100644 --- a/src/platform/gba/asm/faceAddMeshTriangles.s +++ b/src/platform/gba/asm/faceAddMeshTriangles.s @@ -12,7 +12,7 @@ vp0 .req r8 vp1 .req r9 vp2 .req r10 vertices .req r11 -tmp .req r12 +ot .req r12 face .req lr vx0 .req vg0 @@ -20,14 +20,14 @@ vy0 .req vg1 vx1 .req vg2 vy1 .req vg3 vx2 .req vg2 -vy2 .req vg3 +vy2 .req vg2 vz0 .req vg0 vz1 .req vg1 vz2 .req vg2 depth .req vg0 -ot .req vg1 +tmp .req flags next .req vp0 .global faceAddMeshTriangles_asm @@ -40,14 +40,15 @@ faceAddMeshTriangles_asm: ldr face, =gFacesBase ldr face, [face] + ldr ot, =gOT ldr vertices, =gVertices + add polys, #2 // skip flags + .loop: - ldrh flags, [polys, #0] - ldrb vp0, [polys, #2] - ldrb vp1, [polys, #3] - ldrb vp2, [polys, #4] - add polys, polys, #6 + ldrb vp0, [polys], #1 + ldrb vp1, [polys], #1 + ldrb vp2, [polys], #4 // + padding + flags add vp0, vp, vp0, lsl #3 add vp1, vp, vp1, lsl 
#3 @@ -70,6 +71,7 @@ faceAddMeshTriangles_asm: orr tmp, vg0, vg1 orr tmp, tmp, vg2 tst tmp, #CLIP_MASK_VP + ldrh flags, [polys, #-8] orrne flags, flags, #FACE_CLIPPED // vz0 = AVG_Z3 (depth) @@ -91,7 +93,6 @@ faceAddMeshTriangles_asm: orr flags, #FACE_TRIANGLE - ldr ot, =gOT ldr next, [ot, depth, lsl #2] str face, [ot, depth, lsl #2] stmia face!, {flags, next, vp1, vp2} diff --git a/src/platform/gba/asm/faceAddRoomQuads.s b/src/platform/gba/asm/faceAddRoomQuads.s index 7cff5d2..f80bb1a 100644 --- a/src/platform/gba/asm/faceAddRoomQuads.s +++ b/src/platform/gba/asm/faceAddRoomQuads.s @@ -12,7 +12,7 @@ vp0 .req r8 vp1 .req r9 vp2 .req r10 vp3 .req r11 -tmp .req r12 +ot .req r12 face .req lr vx0 .req vg0 @@ -20,7 +20,7 @@ vy0 .req vg1 vx1 .req vg2 vy1 .req vg3 vx2 .req vg2 -vy2 .req vg3 +vy2 .req vg2 vz0 .req vg0 vz1 .req vg1 @@ -28,14 +28,10 @@ vz2 .req vg2 vz3 .req vg3 depth .req vg0 -ot .req vg1 +tmp .req flags vertices .req vg2 next .req vp0 -SP_OT = 0 -SP_VERTICES = 4 -SP_SIZE = 8 - .global faceAddRoomQuads_asm faceAddRoomQuads_asm: stmfd sp!, {r4-r11, lr} @@ -47,15 +43,14 @@ faceAddRoomQuads_asm: ldr face, [face] ldr ot, =gOT - ldr vertices, =gVertices - stmfd sp!, {ot, vertices} + + add polys, #2 // skip flags .loop: - ldrh flags, [polys], #2 ldrh vp0, [polys], #2 ldrh vp1, [polys], #2 ldrh vp2, [polys], #2 - ldrh vp3, [polys], #2 + ldrh vp3, [polys], #4 // + flags add vp0, vp, vp0, lsl #3 add vp1, vp, vp1, lsl #3 @@ -80,6 +75,7 @@ faceAddRoomQuads_asm: orr tmp, tmp, vg2 orr tmp, tmp, vg3 tst tmp, #CLIP_MASK_VP + ldrh flags, [polys, #-12] orrne flags, flags, #FACE_CLIPPED // shift and compare VERTEX_G for flat rasterization @@ -105,7 +101,7 @@ faceAddRoomQuads_asm: mov depth, vz0, lsr #OT_SHIFT // faceAdd - ldmia sp, {ot, vertices} + ldr vertices, =gVertices sub vp0, vp0, vertices sub vp1, vp1, vertices @@ -127,5 +123,4 @@ faceAddRoomQuads_asm: ldr tmp, =gFacesBase str face, [tmp] - add sp, sp, #SP_SIZE ldmfd sp!, {r4-r11, pc} diff --git 
a/src/platform/gba/asm/faceAddRoomTriangles.s b/src/platform/gba/asm/faceAddRoomTriangles.s index dcb1cb8..ea0df95 100644 --- a/src/platform/gba/asm/faceAddRoomTriangles.s +++ b/src/platform/gba/asm/faceAddRoomTriangles.s @@ -12,7 +12,7 @@ vp0 .req r8 vp1 .req r9 vp2 .req r10 vertices .req r11 -tmp .req r12 +ot .req r12 face .req lr vx0 .req vg0 @@ -20,14 +20,14 @@ vy0 .req vg1 vx1 .req vg2 vy1 .req vg3 vx2 .req vg2 -vy2 .req vg3 +vy2 .req vg2 vz0 .req vg0 vz1 .req vg1 vz2 .req vg2 depth .req vg0 -ot .req vg1 +tmp .req flags next .req vp0 .global faceAddRoomTriangles_asm @@ -40,13 +40,15 @@ faceAddRoomTriangles_asm: ldr face, =gFacesBase ldr face, [face] + ldr ot, =gOT ldr vertices, =gVertices + add polys, #2 // skip flags + .loop: - ldrh flags, [polys], #2 ldrh vp0, [polys], #2 ldrh vp1, [polys], #2 - ldrh vp2, [polys], #2 + ldrh vp2, [polys], #4 // + flags add vp0, vp, vp0, lsl #3 add vp1, vp, vp1, lsl #3 @@ -67,6 +69,7 @@ faceAddRoomTriangles_asm: orr tmp, vg0, vg1 orr tmp, tmp, vg2 tst tmp, #CLIP_MASK_VP + ldrh flags, [polys, #-10] orrne flags, flags, #FACE_CLIPPED // shift and compare VERTEX_G for flat rasterization @@ -98,7 +101,6 @@ faceAddRoomTriangles_asm: orr flags, #FACE_TRIANGLE - ldr ot, =gOT ldr next, [ot, depth, lsl #2] str face, [ot, depth, lsl #2] stmia face!, {flags, next, vp1, vp2}