From 889d3af918f6f5d04e778c958d3528aca3b402fa Mon Sep 17 00:00:00 2001
From: XProger
Date: Mon, 31 Jan 2022 04:12:51 +0300
Subject: [PATCH] #368 minor asm optimizations

---
 src/platform/gba/asm/rasterizeF.s   | 42 +++++++++++---------------
 src/platform/gba/asm/rasterizeFT.s  | 32 ++++++++------------
 src/platform/gba/asm/rasterizeFTA.s | 32 ++++++++------------
 src/platform/gba/asm/rasterizeGT.s  | 47 ++++++++++++-----------------
 src/platform/gba/asm/rasterizeGTA.s | 47 ++++++++++++-----------------
 src/platform/gba/asm/rasterizeS.s   | 40 ++++++++++--------------
 6 files changed, 95 insertions(+), 145 deletions(-)

diff --git a/src/platform/gba/asm/rasterizeF.s b/src/platform/gba/asm/rasterizeF.s
index 338d6fb..fa9ec0c 100644
--- a/src/platform/gba/asm/rasterizeF.s
+++ b/src/platform/gba/asm/rasterizeF.s
@@ -16,9 +16,9 @@ DIVLUT  .req r12
 width   .req lr
 h       .req N
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 LMAP    .req Lx
 pair    .req DIVLUT
@@ -45,48 +45,40 @@ rasterizeF_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
+    mov L, N                    // L = N
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
-    ldrsh Ldx, [N, #VERTEX_X]
-    sub Ldx, Lx
-    mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    mov L, N                    // L = N
-    b .calc_left_start
+    ldrsh Ldx, [L, #VERTEX_X]
+    sub Ldx, Lx, asr #16
+    mul Ldx, tmp                // Ldx = tmp * (N->v.x - L->v.x)
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
+    mov R, N                    // R = N
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
-    ldrsh Rdx, [N, #VERTEX_X]
-    sub Rdx, Rx
+    ldrsh Rdx, [R, #VERTEX_X]
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    mov R, N                    // R = N
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)
diff --git a/src/platform/gba/asm/rasterizeFT.s b/src/platform/gba/asm/rasterizeFT.s
index b688928..d04e355 100644
--- a/src/platform/gba/asm/rasterizeFT.s
+++ b/src/platform/gba/asm/rasterizeFT.s
@@ -25,9 +25,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -101,22 +101,22 @@ rasterizeFT_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
@@ -131,32 +131,28 @@ rasterizeFT_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = Ry2 - Rxy
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
@@ -171,10 +167,6 @@ rasterizeFT_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)
diff --git a/src/platform/gba/asm/rasterizeFTA.s b/src/platform/gba/asm/rasterizeFTA.s
index 23d3713..ce9bef0 100644
--- a/src/platform/gba/asm/rasterizeFTA.s
+++ b/src/platform/gba/asm/rasterizeFTA.s
@@ -25,9 +25,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -107,22 +107,22 @@ rasterizeFTA_asm:
    cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
@@ -137,32 +137,28 @@ rasterizeFTA_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
@@ -177,10 +173,6 @@ rasterizeFTA_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)
diff --git a/src/platform/gba/asm/rasterizeGT.s b/src/platform/gba/asm/rasterizeGT.s
index 9b0ef72..40671dd 100644
--- a/src/platform/gba/asm/rasterizeGT.s
+++ b/src/platform/gba/asm/rasterizeGT.s
@@ -37,9 +37,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -123,28 +123,29 @@ rasterizeGT_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldrb Lg, [L, #VERTEX_G]     // Lg = L->v.g
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
+    lsl Lg, #8                  // Lg <<= 8
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
     ldrb Ldg, [L, #VERTEX_G]
-    sub Ldg, Lg
+    sub Ldg, Lg, lsr #8
     mul Ldg, tmp                // Ldg = tmp * (N->v.g - Lg)
     asr Ldg, #8                 // 8-bit for fractional part
     str Ldg, [sp, #SP_LDG]      // store Ldg to stack
@@ -160,41 +161,35 @@ rasterizeGT_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    lsl Lg, #8                  // Lg <<= 8
-    add Lg, #LMAP_ADDR          // Lg += lightmap
-
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldrb Rg, [R, #VERTEX_G]     // Rg = R->v.g
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
+    lsl Rg, #8                  // Rg <<= 8
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
     ldrb Rdg, [R, #VERTEX_G]
-    sub Rdg, Rg
+    sub Rdg, Rg, lsr #8
     mul Rdg, tmp                // Rdg = tmp * (N->v.g - Rg)
     asr Rdg, #8                 // 8-bit for fractional part
     str Rdg, [sp, #SP_RDG]      // store Ldg to stack
@@ -210,15 +205,11 @@ rasterizeGT_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    lsl Rg, #8                  // Rg <<= 8
-    add Rg, #LMAP_ADDR          // Rg += lightmap
-
-    b .calc_right_start
 
 .calc_right_end:
+    orr Lg, #LMAP_ADDR
+    orr Rg, #LMAP_ADDR
+
     cmp Rh, Lh                  // if (Rh < Lh)
     movlt h, Rh                 // h = Rh
     movge h, Lh                 // else h = Lh
diff --git a/src/platform/gba/asm/rasterizeGTA.s b/src/platform/gba/asm/rasterizeGTA.s
index 17b4012..7724e61 100644
--- a/src/platform/gba/asm/rasterizeGTA.s
+++ b/src/platform/gba/asm/rasterizeGTA.s
@@ -37,9 +37,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -128,28 +128,29 @@ rasterizeGTA_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldrb Lg, [L, #VERTEX_G]     // Lg = L->v.g
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
+    lsl Lg, #8                  // Lg <<= 8
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
     ldrb Ldg, [L, #VERTEX_G]
-    sub Ldg, Lg
+    sub Ldg, Lg, lsr #8
     mul Ldg, tmp                // Ldg = tmp * (N->v.g - Lg)
     asr Ldg, #8                 // 8-bit for fractional part
     str Ldg, [sp, #SP_LDG]      // store Ldg to stack
@@ -165,41 +166,35 @@ rasterizeGTA_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    lsl Lg, #8                  // Lg <<= 8
-    add Lg, #LMAP_ADDR          // Lg += lightmap
-
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldrb Rg, [R, #VERTEX_G]     // Rg = R->v.g
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
+    lsl Rg, #8                  // Rg <<= 8
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
     ldrb Rdg, [R, #VERTEX_G]
-    sub Rdg, Rg
+    sub Rdg, Rg, lsr #8
     mul Rdg, tmp                // Rdg = tmp * (N->v.g - Rg)
     asr Rdg, #8                 // 8-bit for fractional part
     str Rdg, [sp, #SP_RDG]      // store Ldg to stack
@@ -215,15 +210,11 @@ rasterizeGTA_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    lsl Rg, #8                  // Rg <<= 8
-    add Rg, #LMAP_ADDR          // Rg += lightmap
-
-    b .calc_right_start
 
 .calc_right_end:
+    orr Lg, #LMAP_ADDR
+    orr Rg, #LMAP_ADDR
+
     cmp Rh, Lh                  // if (Rh < Lh)
     movlt h, Rh                 // h = Rh
     movge h, Lh                 // else h = Lh
diff --git a/src/platform/gba/asm/rasterizeS.s b/src/platform/gba/asm/rasterizeS.s
index 74b079a..fcbdd1b 100644
--- a/src/platform/gba/asm/rasterizeS.s
+++ b/src/platform/gba/asm/rasterizeS.s
@@ -15,9 +15,9 @@ tmp     .req r11
 DIVLUT  .req r12
 width   .req lr
 h       .req N
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 pair    .req DIVLUT
 indexA  .req Lh
@@ -42,48 +42,40 @@ rasterizeS_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
+    mov L, N                    // L = N
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
-    ldrsh Ldx, [N, #VERTEX_X]
-    sub Ldx, Lx
+    ldrsh Ldx, [L, #VERTEX_X]
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    mov L, N                    // L = N
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
+    mov R, N                    // R = N
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
-    ldrsh Rdx, [N, #VERTEX_X]
-    sub Rdx, Rx
+    ldrsh Rdx, [R, #VERTEX_X]
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    mov R, N                    // R = N
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)