From f434d453591c91822dbbb21cbec1c7d1f2182e11 Mon Sep 17 00:00:00 2001 From: XProger Date: Sun, 13 Feb 2022 18:21:19 +0300 Subject: [PATCH] #368 micro optimizations --- src/fixed/common.h | 4 ++++ src/platform/gba/asm/faceAddMeshQuads.s | 23 +++++++++++++-------- src/platform/gba/asm/faceAddMeshTriangles.s | 11 +++++----- src/platform/gba/asm/faceAddRoomQuads.s | 23 +++++++++++++-------- src/platform/gba/asm/faceAddRoomTriangles.s | 11 +++++----- src/platform/gba/render.iwram.cpp | 8 +++---- 6 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/fixed/common.h b/src/fixed/common.h index 1245848..d2c86e3 100644 --- a/src/fixed/common.h +++ b/src/fixed/common.h @@ -157,16 +157,19 @@ #define X_INLINE inline #define X_NOINLINE __declspec(noinline) #define ALIGN4 __declspec(align(4)) + #define ALIGN8 __declspec(align(8)) #define ALIGN16 __declspec(align(16)) #elif defined(__WATCOMC__) || defined(__3DO__) #define X_INLINE inline #define X_NOINLINE #define ALIGN4 + #define ALIGN8 #define ALIGN16 #else #define X_INLINE __attribute__((always_inline)) inline #define X_NOINLINE __attribute__((noinline)) #define ALIGN4 __attribute__((aligned(4))) + #define ALIGN8 __attribute__((aligned(8))) #define ALIGN16 __attribute__((aligned(16))) #endif @@ -689,6 +692,7 @@ struct Face int32 ccb_HDDX; int32 ccb_HDDY; uint32 ccb_PIXC; + // TODO use 1x1 textures instead of colored faces to remove preamble words (8 bytes per face - 15k) uint32 ccb_PRE0; uint32 ccb_PRE1; //int32 ccb_Width; diff --git a/src/platform/gba/asm/faceAddMeshQuads.s b/src/platform/gba/asm/faceAddMeshQuads.s index 009b366..87923b9 100644 --- a/src/platform/gba/asm/faceAddMeshQuads.s +++ b/src/platform/gba/asm/faceAddMeshQuads.s @@ -32,6 +32,8 @@ tmp .req flags vertices .req vg2 next .req vp0 +SP_SIZE = 4 + .global faceAddMeshQuads_asm faceAddMeshQuads_asm: stmfd sp!, {r4-r11, lr} @@ -39,6 +41,10 @@ faceAddMeshQuads_asm: ldr vp, =gVerticesBase ldr vp, [vp] + ldr vertices, =gVertices + lsr vertices, #3 + stmfd sp!, {vertices} + ldr face, =gFacesBase ldr face, [face] @@ -90,16 +96,14 @@ faceAddMeshQuads_asm: lsr depth, #(2 + OT_SHIFT) // faceAdd - ldr vertices, =gVertices - sub vp0, vertices - sub vp1, vertices - sub vp2, vertices - sub vp3, vertices + ldr vertices, [sp] + rsb vp0, vertices, vp0, lsr #3 + rsb vp1, vertices, vp1, lsr #3 + rsb vp2, vertices, vp2, lsr #3 + rsb vp3, vertices, vp3, lsr #3 - lsr vp0, #3 - orr vp1, vp0, vp1, lsl #(16 - 3) - lsr vp2, #3 - orr vp3, vp2, vp3, lsl #(16 - 3) + orr vp1, vp0, vp1, lsl #16 + orr vp3, vp2, vp3, lsl #16 ldr next, [ot, depth, lsl #2] str face, [ot, depth, lsl #2] @@ -111,4 +115,5 @@ faceAddMeshQuads_asm: ldr tmp, =gFacesBase str face, [tmp] + add sp, #SP_SIZE ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/faceAddMeshTriangles.s b/src/platform/gba/asm/faceAddMeshTriangles.s index f71635b..b04b541 100644 --- a/src/platform/gba/asm/faceAddMeshTriangles.s +++ b/src/platform/gba/asm/faceAddMeshTriangles.s @@ -42,6 +42,7 @@ faceAddMeshTriangles_asm: ldr ot, =gOT ldr vertices, =gVertices + lsr vertices, #3 add polys, #2 // skip flags @@ -82,13 +83,11 @@ faceAddMeshTriangles_asm: lsr depth, #(2 + OT_SHIFT) // faceAdd - sub vp0, vertices - sub vp1, vertices - sub vp2, vertices + rsb vp0, vertices, vp0, lsr #3 + rsb vp1, vertices, vp1, lsr #3 + rsb vp2, vertices, vp2, lsr #3 - lsr vp0, #3 - orr vp1, vp0, vp1, lsl #(16 - 3) - lsr vp2, #3 + orr vp1, vp0, vp1, lsl #16 orr flags, #FACE_TRIANGLE diff --git a/src/platform/gba/asm/faceAddRoomQuads.s b/src/platform/gba/asm/faceAddRoomQuads.s index 6cd7906..5966999 100644 --- a/src/platform/gba/asm/faceAddRoomQuads.s +++ b/src/platform/gba/asm/faceAddRoomQuads.s @@ -32,6 +32,8 @@ tmp .req flags vertices .req vg2 next .req vp0 +SP_SIZE = 4 + .global faceAddRoomQuads_asm faceAddRoomQuads_asm: stmfd sp!, {r4-r11, lr} @@ -39,6 +41,10 @@ faceAddRoomQuads_asm: ldr vp, =gVerticesBase ldr vp, [vp] + ldr vertices, =gVertices + lsr vertices, #3 + stmfd sp!, {vertices} + ldr face, =gFacesBase ldr face, [face] @@ -101,16 +107,14 @@ faceAddRoomQuads_asm: mov depth, vz0, lsr #OT_SHIFT // faceAdd - ldr vertices, =gVertices - sub vp0, vertices - sub vp1, vertices - sub vp2, vertices - sub vp3, vertices + ldr vertices, [sp] + rsb vp0, vertices, vp0, lsr #3 + rsb vp1, vertices, vp1, lsr #3 + rsb vp2, vertices, vp2, lsr #3 + rsb vp3, vertices, vp3, lsr #3 - lsr vp0, #3 - orr vp1, vp0, vp1, lsl #(16 - 3) - lsr vp2, #3 - orr vp3, vp2, vp3, lsl #(16 - 3) + orr vp1, vp0, vp1, lsl #16 + orr vp3, vp2, vp3, lsl #16 ldr next, [ot, depth, lsl #2] str face, [ot, depth, lsl #2] @@ -122,4 +126,5 @@ faceAddRoomQuads_asm: ldr tmp, =gFacesBase str face, [tmp] + add sp, #SP_SIZE ldmfd sp!, {r4-r11, pc} diff --git a/src/platform/gba/asm/faceAddRoomTriangles.s b/src/platform/gba/asm/faceAddRoomTriangles.s index 39aa2a2..3f62b7d 100644 --- a/src/platform/gba/asm/faceAddRoomTriangles.s +++ b/src/platform/gba/asm/faceAddRoomTriangles.s @@ -42,6 +42,7 @@ faceAddRoomTriangles_asm: ldr ot, =gOT ldr vertices, =gVertices + lsr vertices, #3 add polys, #2 // skip flags @@ -91,13 +92,11 @@ faceAddRoomTriangles_asm: mov depth, vz0, lsr #OT_SHIFT // faceAdd - sub vp0, vertices - sub vp1, vertices - sub vp2, vertices + rsb vp0, vertices, vp0, lsr #3 + rsb vp1, vertices, vp1, lsr #3 + rsb vp2, vertices, vp2, lsr #3 - lsr vp0, #3 - orr vp1, vp0, vp1, lsl #(16 - 3) - lsr vp2, #3 + orr vp1, vp0, vp1, lsl #16 orr flags, #FACE_TRIANGLE diff --git a/src/platform/gba/render.iwram.cpp b/src/platform/gba/render.iwram.cpp index 811bbf9..ae31db7 100644 --- a/src/platform/gba/render.iwram.cpp +++ b/src/platform/gba/render.iwram.cpp @@ -65,10 +65,10 @@ const uint8* gTile; Vertex* gVerticesBase; Face* gFacesBase; -EWRAM_DATA uint8 gBackgroundCopy[FRAME_WIDTH * FRAME_HEIGHT]; // EWRAM 37.5k -EWRAM_DATA Vertex gVertices[MAX_VERTICES]; // EWRAM 16k -EWRAM_DATA Face gFaces[MAX_FACES]; // EWRAM 5k -Face* gOT[OT_SIZE]; // IWRAM 2.5k +EWRAM_DATA uint8 gBackgroundCopy[FRAME_WIDTH * FRAME_HEIGHT]; // EWRAM 37.5k +EWRAM_DATA ALIGN8 Vertex gVertices[MAX_VERTICES]; // EWRAM 16k +EWRAM_DATA Face gFaces[MAX_FACES]; // EWRAM 30k +Face* gOT[OT_SIZE]; // IWRAM 2.5k enum ClipFlags { CLIP_LEFT = 1 << 0,