From 889d3af918f6f5d04e778c958d3528aca3b402fa Mon Sep 17 00:00:00 2001
From: XProger
Date: Mon, 31 Jan 2022 04:12:51 +0300
Subject: [PATCH] #368 minor asm optimizations

---
 src/platform/gba/asm/rasterizeF.s   | 42 +++++++++++---------------
 src/platform/gba/asm/rasterizeFT.s  | 32 ++++++++------------
 src/platform/gba/asm/rasterizeFTA.s | 32 ++++++++------------
 src/platform/gba/asm/rasterizeGT.s  | 47 ++++++++++++-----------------
 src/platform/gba/asm/rasterizeGTA.s | 47 ++++++++++++-----------------
 src/platform/gba/asm/rasterizeS.s   | 40 ++++++++++--------------
 6 files changed, 95 insertions(+), 145 deletions(-)

diff --git a/src/platform/gba/asm/rasterizeF.s b/src/platform/gba/asm/rasterizeF.s
index 338d6fb..fa9ec0c 100644
--- a/src/platform/gba/asm/rasterizeF.s
+++ b/src/platform/gba/asm/rasterizeF.s
@@ -16,9 +16,9 @@ DIVLUT  .req r12
 width   .req lr
 h       .req N
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 LMAP    .req Lx
 pair    .req DIVLUT
@@ -45,48 +45,40 @@ rasterizeF_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
+    mov L, N                    // L = N
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
-    ldrsh Ldx, [N, #VERTEX_X]
-    sub Ldx, Lx
-    mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    mov L, N                    // L = N
-    b .calc_left_start
+    ldrsh Ldx, [L, #VERTEX_X]
+    sub Ldx, Lx, asr #16
+    mul Ldx, tmp                // Ldx = tmp * (N->v.x - L->v.x)
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
+    mov R, N                    // R = N
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
-    ldrsh Rdx, [N, #VERTEX_X]
-    sub Rdx, Rx
+    ldrsh Rdx, [R, #VERTEX_X]
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    mov R, N                    // R = N
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)
diff --git a/src/platform/gba/asm/rasterizeFT.s b/src/platform/gba/asm/rasterizeFT.s
index b688928..d04e355 100644
--- a/src/platform/gba/asm/rasterizeFT.s
+++ b/src/platform/gba/asm/rasterizeFT.s
@@ -25,9 +25,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -101,22 +101,22 @@ rasterizeFT_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
@@ -131,32 +131,28 @@ rasterizeFT_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = Ry2 - Rxy
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
@@ -171,10 +167,6 @@ rasterizeFT_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)
diff --git a/src/platform/gba/asm/rasterizeFTA.s b/src/platform/gba/asm/rasterizeFTA.s
index 23d3713..ce9bef0 100644
--- a/src/platform/gba/asm/rasterizeFTA.s
+++ b/src/platform/gba/asm/rasterizeFTA.s
@@ -25,9 +25,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -107,22 +107,22 @@ rasterizeFTA_asm:
    cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
@@ -137,32 +137,28 @@ rasterizeFTA_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
@@ -177,10 +173,6 @@ rasterizeFTA_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)
diff --git a/src/platform/gba/asm/rasterizeGT.s b/src/platform/gba/asm/rasterizeGT.s
index 9b0ef72..40671dd 100644
--- a/src/platform/gba/asm/rasterizeGT.s
+++ b/src/platform/gba/asm/rasterizeGT.s
@@ -37,9 +37,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -123,28 +123,29 @@ rasterizeGT_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldrb Lg, [L, #VERTEX_G]     // Lg = L->v.g
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
+    lsl Lg, #8                  // Lg <<= 8
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
     ldrb Ldg, [L, #VERTEX_G]
-    sub Ldg, Lg
+    sub Ldg, Lg, lsr #8
     mul Ldg, tmp                // Ldg = tmp * (N->v.g - Lg)
     asr Ldg, #8                 // 8-bit for fractional part
     str Ldg, [sp, #SP_LDG]      // store Ldg to stack
@@ -160,41 +161,35 @@ rasterizeGT_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    lsl Lg, #8                  // Lg <<= 8
-    add Lg, #LMAP_ADDR          // Lg += lightmap
-
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldrb Rg, [R, #VERTEX_G]     // Rg = R->v.g
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
+    lsl Rg, #8                  // Rg <<= 8
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
     ldrb Rdg, [R, #VERTEX_G]
-    sub Rdg, Rg
+    sub Rdg, Rg, lsr #8
     mul Rdg, tmp                // Rdg = tmp * (N->v.g - Rg)
     asr Rdg, #8                 // 8-bit for fractional part
     str Rdg, [sp, #SP_RDG]      // store Ldg to stack
@@ -210,15 +205,11 @@ rasterizeGT_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    lsl Rg, #8                  // Rg <<= 8
-    add Rg, #LMAP_ADDR          // Rg += lightmap
-
-    b .calc_right_start
 
 .calc_right_end:
+    orr Lg, #LMAP_ADDR
+    orr Rg, #LMAP_ADDR
+
     cmp Rh, Lh                  // if (Rh < Lh)
     movlt h, Rh                 // h = Rh
     movge h, Lh                 // else h = Lh
diff --git a/src/platform/gba/asm/rasterizeGTA.s b/src/platform/gba/asm/rasterizeGTA.s
index 17b4012..7724e61 100644
--- a/src/platform/gba/asm/rasterizeGTA.s
+++ b/src/platform/gba/asm/rasterizeGTA.s
@@ -37,9 +37,9 @@ Rdt     .req h
 indexA  .req Lh
 indexB  .req Rh
 
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 
 inv     .req Lh
@@ -128,28 +128,29 @@ rasterizeGTA_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
     ldrb Lg, [L, #VERTEX_G]     // Lg = L->v.g
     ldr Lt, [L, #VERTEX_T]      // Lt = L->t
     mov L, N                    // L = N
+    lsl Lg, #8                  // Lg <<= 8
 
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
 
     ldrsh Ldx, [L, #VERTEX_X]
-    sub Ldx, Lx
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX]      // store Ldx to stack
     ldrb Ldg, [L, #VERTEX_G]
-    sub Ldg, Lg
+    sub Ldg, Lg, lsr #8
     mul Ldg, tmp                // Ldg = tmp * (N->v.g - Lg)
     asr Ldg, #8                 // 8-bit for fractional part
     str Ldg, [sp, #SP_LDG]      // store Ldg to stack
@@ -165,41 +166,35 @@ rasterizeGTA_asm:
     lsl Ldu, #16
     orr Ldt, Ldu, Ldv, lsr #16  // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Ldt, [sp, #SP_LDT]      // store Ldt to stack
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    lsl Lg, #8                  // Lg <<= 8
-    add Lg, #LMAP_ADDR          // Lg += lightmap
-
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
     ldrb Rg, [R, #VERTEX_G]     // Rg = R->v.g
     ldr Rt, [R, #VERTEX_T]      // Rt = R->t
     mov R, N                    // R = N
+    lsl Rg, #8                  // Rg <<= 8
 
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     mov DIVLUT, #DIVLUT_ADDR
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
 
     ldrsh Rdx, [R, #VERTEX_X]
-    sub Rdx, Rx
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX]      // store Rdx to stack
     ldrb Rdg, [R, #VERTEX_G]
-    sub Rdg, Rg
+    sub Rdg, Rg, lsr #8
     mul Rdg, tmp                // Rdg = tmp * (N->v.g - Rg)
     asr Rdg, #8                 // 8-bit for fractional part
     str Rdg, [sp, #SP_RDG]      // store Ldg to stack
@@ -215,15 +210,11 @@ rasterizeGTA_asm:
     lsl Rdu, #16
     orr Rdt, Rdu, Rdv, lsr #16  // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
     str Rdt, [sp, #SP_RDT]      // store Rdt to stack
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    lsl Rg, #8                  // Rg <<= 8
-    add Rg, #LMAP_ADDR          // Rg += lightmap
-
-    b .calc_right_start
 
 .calc_right_end:
+    orr Lg, #LMAP_ADDR
+    orr Rg, #LMAP_ADDR
+
     cmp Rh, Lh                  // if (Rh < Lh)
     movlt h, Rh                 // h = Rh
     movge h, Lh                 // else h = Lh
diff --git a/src/platform/gba/asm/rasterizeS.s b/src/platform/gba/asm/rasterizeS.s
index 74b079a..fcbdd1b 100644
--- a/src/platform/gba/asm/rasterizeS.s
+++ b/src/platform/gba/asm/rasterizeS.s
@@ -15,9 +15,9 @@ tmp     .req r11
 DIVLUT  .req r12
 width   .req lr
 h       .req N
-Ry1     .req tmp
+Rxy     .req tmp
 Ry2     .req Rh
-Ly1     .req tmp
+Lxy     .req tmp
 Ly2     .req Lh
 pair    .req DIVLUT
 indexA  .req Lh
@@ -42,48 +42,40 @@ rasterizeS_asm:
     cmp Lh, #0
     bne .calc_left_end          // if (Lh != 0) end with left
     ldr N, [L, #VERTEX_PREV]    // N = L->prev
-    ldrsh Ly1, [L, #VERTEX_Y]   // Ly1 = L->v.y
+    ldr Lxy, [L, #VERTEX_X]     // Lxy = (L->v.y << 16) | (L->v.x)
     ldrsh Ly2, [N, #VERTEX_Y]   // Ly2 = N->v.y
-    subs Lh, Ly2, Ly1           // Lh = Ly2 - Ly1
+    subs Lh, Ly2, Lxy, asr #16  // Lh = N->v.y - L->v.y
     blt .exit                   // if (Lh < 0) return
-    ldrsh Lx, [L, #VERTEX_X]    // Lx = L->v.x
+    lsl Lx, Lxy, #16            // Lx = L->v.x << 16
+    mov L, N                    // L = N
     cmp Lh, #1                  // if (Lh <= 1) skip Ldx calc
-    ble .skip_left_dx
+    ble .calc_left_start
 
     lsl tmp, Lh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Lh)
-    ldrsh Ldx, [N, #VERTEX_X]
-    sub Ldx, Lx
+    ldrsh Ldx, [L, #VERTEX_X]
+    sub Ldx, Lx, asr #16
     mul Ldx, tmp                // Ldx = tmp * (N->v.x - Lx)
-
-.skip_left_dx:
-    lsl Lx, #16                 // Lx <<= 16
-    mov L, N                    // L = N
-    b .calc_left_start
 
 .calc_left_end:
 .calc_right_start:
     cmp Rh, #0
     bne .calc_right_end         // if (Rh != 0) end with right
     ldr N, [R, #VERTEX_NEXT]    // N = R->next
-    ldrsh Ry1, [R, #VERTEX_Y]   // Ry1 = R->v.y
+    ldr Rxy, [R, #VERTEX_X]     // Rxy = (R->v.y << 16) | (R->v.x)
     ldrsh Ry2, [N, #VERTEX_Y]   // Ry2 = N->v.y
-    subs Rh, Ry2, Ry1           // Rh = Ry2 - Ry1
+    subs Rh, Ry2, Rxy, asr #16  // Rh = N->v.y - R->v.y
     blt .exit                   // if (Rh < 0) return
-    ldrsh Rx, [R, #VERTEX_X]    // Rx = R->v.x
+    lsl Rx, Rxy, #16            // Rx = R->v.x << 16
+    mov R, N                    // R = N
     cmp Rh, #1                  // if (Rh <= 1) skip Rdx calc
-    ble .skip_right_dx
+    ble .calc_right_start
 
     lsl tmp, Rh, #1
     ldrh tmp, [DIVLUT, tmp]     // tmp = FixedInvU(Rh)
-    ldrsh Rdx, [N, #VERTEX_X]
-    sub Rdx, Rx
+    ldrsh Rdx, [R, #VERTEX_X]
+    sub Rdx, Rx, asr #16
     mul Rdx, tmp                // Rdx = tmp * (N->v.x - Rx)
-
-.skip_right_dx:
-    lsl Rx, #16                 // Rx <<= 16
-    mov R, N                    // R = N
-    b .calc_right_start
 
 .calc_right_end:
     cmp Rh, Lh                  // if (Rh < Lh)