From a69667375db62cf464820caf62a2217f244b79b8 Mon Sep 17 00:00:00 2001 From: XProger Date: Sat, 5 Feb 2022 08:33:13 +0300 Subject: [PATCH] #368 micro optimization (rasterizer L/R branch) --- src/platform/gba/asm/rasterizeF.s | 40 +++++++++++++++++------------ src/platform/gba/asm/rasterizeFT.s | 28 ++++++++++++-------- src/platform/gba/asm/rasterizeFTA.s | 28 ++++++++++++-------- src/platform/gba/asm/rasterizeGT.s | 32 ++++++++++++++--------- src/platform/gba/asm/rasterizeGTA.s | 32 ++++++++++++++--------- src/platform/gba/asm/rasterizeS.s | 40 +++++++++++++++++------------ 6 files changed, 124 insertions(+), 76 deletions(-) diff --git a/src/platform/gba/asm/rasterizeF.s b/src/platform/gba/asm/rasterizeF.s index d05e362..435b2ee 100644 --- a/src/platform/gba/asm/rasterizeF.s +++ b/src/platform/gba/asm/rasterizeF.s @@ -42,19 +42,23 @@ rasterizeF_asm: .loop: mov DIVLUT, #DIVLUT_ADDR + cmp Lh, #0 + bne .calc_left_end // if (Lh != 0) end with left + .calc_left_start: - cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left - ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev - add N, L, N, lsl #VERTEX_SIZEOF_SHIFT ldr Lxy, [L, #VERTEX_X] // Lxy = (L->v.y << 16) | (L->v.x) - ldrsh Ly2, [N, #VERTEX_Y] // Ly2 = N->v.y + ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev + add L, L, N, lsl #VERTEX_SIZEOF_SHIFT + ldrsh Ly2, [L, #VERTEX_Y] // Ly2 = N->v.y + subs Lh, Ly2, Lxy, asr #16 // Lh = N->v.y - L->v.y blt .exit // if (Lh < 0) return + beq .calc_left_start + lsl Lx, Lxy, #16 // Lx = L->v.x << 16 - mov L, N // L = N - cmp Lh, #1 // if (Lh <= 1) skip Ldx calc - ble .calc_left_start + cmp Lh, #1 // if (Lh == 1) skip Ldx calc + beq .calc_left_end + lsl tmp, Lh, #1 ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh) @@ -63,19 +67,23 @@ rasterizeF_asm: mul Ldx, tmp // Ldx = tmp * (N->v.x - L->v.x) .calc_left_end: + cmp Rh, #0 + bne .calc_right_end // if (Rh != 0) end with right + .calc_right_start: - cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right - ldrsb N, [R, 
#VERTEX_NEXT] // N = R + R->next - add N, R, N, lsl #VERTEX_SIZEOF_SHIFT ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) - ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y + ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next + add R, R, N, lsl #VERTEX_SIZEOF_SHIFT + ldrsh Ry2, [R, #VERTEX_Y] // Ry2 = N->v.y + subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y blt .exit // if (Rh < 0) return + beq .calc_right_start + lsl Rx, Rxy, #16 // Rx = R->v.x << 16 - mov R, N // R = N - cmp Rh, #1 // if (Rh <= 1) skip Rdx calc - ble .calc_right_start + cmp Rh, #1 // if (Rh == 1) skip Rdx calc + beq .calc_right_end + lsl tmp, Rh, #1 ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh) diff --git a/src/platform/gba/asm/rasterizeFT.s b/src/platform/gba/asm/rasterizeFT.s index 1c52d86..e845e78 100644 --- a/src/platform/gba/asm/rasterizeFT.s +++ b/src/platform/gba/asm/rasterizeFT.s @@ -97,20 +97,24 @@ rasterizeFT_asm: .loop: + cmp Lh, #0 + bgt .calc_left_end // if (Lh > 0) end with left + .calc_left_start: - cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev add N, L, N, lsl #VERTEX_SIZEOF_SHIFT ldr Lxy, [L, #VERTEX_X] // Lxy = (L->v.y << 16) | (L->v.x) ldrsh Ly2, [N, #VERTEX_Y] // Ly2 = N->v.y + subs Lh, Ly2, Lxy, asr #16 // Lh = N->v.y - L->v.y blt .exit // if (Lh < 0) return + ldrne Lt, [L, #VERTEX_T] // Lt = L->t + mov L, N // L = N + beq .calc_left_start + lsl Lx, Lxy, #16 // Lx = L->v.x << 16 - ldr Lt, [L, #VERTEX_T] // Lt = L->t - mov L, N // L = N cmp Lh, #1 // if (Lh <= 1) skip Ldx calc - ble .calc_left_start + beq .calc_left_end lsl tmp, Lh, #1 mov DIVLUT, #DIVLUT_ADDR @@ -134,20 +138,24 @@ rasterizeFT_asm: str Ldt, [sp, #SP_LDT] // store Ldt to stack .calc_left_end: + cmp Rh, #0 + bgt .calc_right_end // if (Rh > 0) end with right + .calc_right_start: - cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next add N, R, N, lsl #VERTEX_SIZEOF_SHIFT ldr Rxy, [R, 
#VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y + subs Rh, Ry2, Rxy, asr #16 // Rh = Ry2 - Rxy blt .exit // if (Rh < 0) return + ldrne Rt, [R, #VERTEX_T] // Rt = R->t + mov R, N // R = N + beq .calc_right_start + lsl Rx, Rxy, #16 // Rx = R->v.x << 16 - ldr Rt, [R, #VERTEX_T] // Rt = R->t - mov R, N // R = N cmp Rh, #1 // if (Rh <= 1) skip Rdx calc - ble .calc_right_start + beq .calc_right_end lsl tmp, Rh, #1 mov DIVLUT, #DIVLUT_ADDR diff --git a/src/platform/gba/asm/rasterizeFTA.s b/src/platform/gba/asm/rasterizeFTA.s index dab837a..e300f77 100644 --- a/src/platform/gba/asm/rasterizeFTA.s +++ b/src/platform/gba/asm/rasterizeFTA.s @@ -103,20 +103,24 @@ rasterizeFTA_asm: .loop: + cmp Lh, #0 + bne .calc_left_end // if (Lh != 0) end with left + .calc_left_start: - cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev add N, L, N, lsl #VERTEX_SIZEOF_SHIFT ldr Lxy, [L, #VERTEX_X] // Lxy = (L->v.y << 16) | (L->v.x) ldrsh Ly2, [N, #VERTEX_Y] // Ly2 = N->v.y + subs Lh, Ly2, Lxy, asr #16 // Lh = N->v.y - L->v.y blt .exit // if (Lh < 0) return + ldrne Lt, [L, #VERTEX_T] // Lt = L->t + mov L, N // L = N + beq .calc_left_start + lsl Lx, Lxy, #16 // Lx = L->v.x << 16 - ldr Lt, [L, #VERTEX_T] // Lt = L->t - mov L, N // L = N cmp Lh, #1 // if (Lh <= 1) skip Ldx calc - ble .calc_left_start + beq .calc_left_end lsl tmp, Lh, #1 mov DIVLUT, #DIVLUT_ADDR @@ -140,20 +144,24 @@ rasterizeFTA_asm: str Ldt, [sp, #SP_LDT] // store Ldt to stack .calc_left_end: + cmp Rh, #0 + bne .calc_right_end // if (Rh != 0) end with right + .calc_right_start: - cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next add N, R, N, lsl #VERTEX_SIZEOF_SHIFT ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y + subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y blt .exit // if (Rh < 0) return + ldrne Rt, [R, 
#VERTEX_T] // Rt = R->t + mov R, N // R = N + beq .calc_right_start + lsl Rx, Rxy, #16 // Rx = R->v.x << 16 - ldr Rt, [R, #VERTEX_T] // Rt = R->t - mov R, N // R = N cmp Rh, #1 // if (Rh <= 1) skip Rdx calc - ble .calc_right_start + beq .calc_right_end lsl tmp, Rh, #1 mov DIVLUT, #DIVLUT_ADDR diff --git a/src/platform/gba/asm/rasterizeGT.s b/src/platform/gba/asm/rasterizeGT.s index ece5ea5..3c4a422 100644 --- a/src/platform/gba/asm/rasterizeGT.s +++ b/src/platform/gba/asm/rasterizeGT.s @@ -119,22 +119,26 @@ rasterizeGT_asm: .loop: + cmp Lh, #0 + bne .calc_left_end // if (Lh != 0) end with left + .calc_left_start: - cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev add N, L, N, lsl #VERTEX_SIZEOF_SHIFT ldr Lxy, [L, #VERTEX_X] // Lxy = (L->v.y << 16) | (L->v.x) ldrsh Ly2, [N, #VERTEX_Y] // Ly2 = N->v.y + subs Lh, Ly2, Lxy, asr #16 // Lh = N->v.y - L->v.y blt .exit // if (Lh < 0) return + ldrneb Lg, [L, #VERTEX_G] // Lg = L->v.g + ldrne Lt, [L, #VERTEX_T] // Lt = L->t + mov L, N // L = N + beq .calc_left_start + lsl Lx, Lxy, #16 // Lx = L->v.x << 16 - ldrb Lg, [L, #VERTEX_G] // Lg = L->v.g - ldr Lt, [L, #VERTEX_T] // Lt = L->t - mov L, N // L = N lsl Lg, #8 // Lg <<= 8 cmp Lh, #1 // if (Lh <= 1) skip Ldx calc - ble .calc_left_start + beq .calc_left_end lsl tmp, Lh, #1 mov DIVLUT, #DIVLUT_ADDR @@ -164,22 +168,26 @@ rasterizeGT_asm: str Ldt, [sp, #SP_LDT] // store Ldt to stack .calc_left_end: + cmp Rh, #0 + bne .calc_right_end // if (Rh != 0) end with right + .calc_right_start: - cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next add N, R, N, lsl #VERTEX_SIZEOF_SHIFT ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y + subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y blt .exit // if (Rh < 0) return + ldrneb Rg, [R, #VERTEX_G] // Rg = R->v.g + ldrne Rt, [R, #VERTEX_T] // Rt = R->t + mov R, N // R = N + 
beq .calc_right_start + lsl Rx, Rxy, #16 // Rx = R->v.x << 16 - ldrb Rg, [R, #VERTEX_G] // Rg = R->v.g - ldr Rt, [R, #VERTEX_T] // Rt = R->t - mov R, N // R = N lsl Rg, #8 // Rg <<= 8 cmp Rh, #1 // if (Rh <= 1) skip Rdx calc - ble .calc_right_start + beq .calc_right_end lsl tmp, Rh, #1 mov DIVLUT, #DIVLUT_ADDR diff --git a/src/platform/gba/asm/rasterizeGTA.s b/src/platform/gba/asm/rasterizeGTA.s index b018409..bc5e2f6 100644 --- a/src/platform/gba/asm/rasterizeGTA.s +++ b/src/platform/gba/asm/rasterizeGTA.s @@ -124,22 +124,26 @@ rasterizeGTA_asm: .loop: + cmp Lh, #0 + bne .calc_left_end // if (Lh != 0) end with left + .calc_left_start: - cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev add N, L, N, lsl #VERTEX_SIZEOF_SHIFT ldr Lxy, [L, #VERTEX_X] // Lxy = (L->v.y << 16) | (L->v.x) ldrsh Ly2, [N, #VERTEX_Y] // Ly2 = N->v.y + subs Lh, Ly2, Lxy, asr #16 // Lh = N->v.y - L->v.y blt .exit // if (Lh < 0) return + ldrneb Lg, [L, #VERTEX_G] // Lg = L->v.g + ldrne Lt, [L, #VERTEX_T] // Lt = L->t + mov L, N // L = N + beq .calc_left_start + lsl Lx, Lxy, #16 // Lx = L->v.x << 16 - ldrb Lg, [L, #VERTEX_G] // Lg = L->v.g - ldr Lt, [L, #VERTEX_T] // Lt = L->t - mov L, N // L = N lsl Lg, #8 // Lg <<= 8 cmp Lh, #1 // if (Lh <= 1) skip Ldx calc - ble .calc_left_start + beq .calc_left_end lsl tmp, Lh, #1 mov DIVLUT, #DIVLUT_ADDR @@ -169,22 +173,26 @@ rasterizeGTA_asm: str Ldt, [sp, #SP_LDT] // store Ldt to stack .calc_left_end: + cmp Rh, #0 + bne .calc_right_end // if (Rh != 0) end with right + .calc_right_start: - cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next add N, R, N, lsl #VERTEX_SIZEOF_SHIFT ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y + subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y blt .exit // if (Rh < 0) return + ldrneb Rg, [R, #VERTEX_G] // Rg = R->v.g + ldrne Rt, [R, #VERTEX_T] // Rt = R->t + 
mov R, N // R = N + beq .calc_right_start + lsl Rx, Rxy, #16 // Rx = R->v.x << 16 - ldrb Rg, [R, #VERTEX_G] // Rg = R->v.g - ldr Rt, [R, #VERTEX_T] // Rt = R->t - mov R, N // R = N lsl Rg, #8 // Rg <<= 8 cmp Rh, #1 // if (Rh <= 1) skip Rdx calc - ble .calc_right_start + beq .calc_right_end lsl tmp, Rh, #1 mov DIVLUT, #DIVLUT_ADDR diff --git a/src/platform/gba/asm/rasterizeS.s b/src/platform/gba/asm/rasterizeS.s index 0b12d52..d0a9747 100644 --- a/src/platform/gba/asm/rasterizeS.s +++ b/src/platform/gba/asm/rasterizeS.s @@ -36,19 +36,23 @@ rasterizeS_asm: .loop: mov DIVLUT, #DIVLUT_ADDR + cmp Lh, #0 + bne .calc_left_end // if (Lh != 0) end with left + .calc_left_start: - cmp Lh, #0 - bne .calc_left_end // if (Lh != 0) end with left - ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev - add N, L, N, lsl #VERTEX_SIZEOF_SHIFT ldr Lxy, [L, #VERTEX_X] // Lxy = (L->v.y << 16) | (L->v.x) - ldrsh Ly2, [N, #VERTEX_Y] // Ly2 = N->v.y + ldrsb N, [L, #VERTEX_PREV] // N = L + L->prev + add L, L, N, lsl #VERTEX_SIZEOF_SHIFT + ldrsh Ly2, [L, #VERTEX_Y] // Ly2 = N->v.y + subs Lh, Ly2, Lxy, asr #16 // Lh = N->v.y - L->v.y blt .exit // if (Lh < 0) return + beq .calc_left_start + lsl Lx, Lxy, #16 // Lx = L->v.x << 16 - mov L, N // L = N - cmp Lh, #1 // if (Lh <= 1) skip Ldx calc - ble .calc_left_start + cmp Lh, #1 // if (Lh == 1) skip Ldx calc + beq .calc_left_end + lsl tmp, Lh, #1 ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh) @@ -57,19 +61,23 @@ rasterizeS_asm: mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx) .calc_left_end: + cmp Rh, #0 + bne .calc_right_end // if (Rh != 0) end with right + .calc_right_start: - cmp Rh, #0 - bne .calc_right_end // if (Rh != 0) end with right - ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next - add N, R, N, lsl #VERTEX_SIZEOF_SHIFT ldr Rxy, [R, #VERTEX_X] // Rxy = (R->v.y << 16) | (R->v.x) - ldrsh Ry2, [N, #VERTEX_Y] // Ry2 = N->v.y + ldrsb N, [R, #VERTEX_NEXT] // N = R + R->next + add R, R, N, lsl #VERTEX_SIZEOF_SHIFT + ldrsh Ry2, [R, #VERTEX_Y] // Ry2 = N->v.y + 
subs Rh, Ry2, Rxy, asr #16 // Rh = N->v.y - R->v.y blt .exit // if (Rh < 0) return + beq .calc_right_start + lsl Rx, Rxy, #16 // Rx = R->v.x << 16 - mov R, N // R = N - cmp Rh, #1 // if (Rh <= 1) skip Rdx calc - ble .calc_right_start + cmp Rh, #1 // if (Rh == 1) skip Rdx calc + beq .calc_right_end + lsl tmp, Rh, #1 ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)