From 4e9b92e5a46afcc4a573edb0e1b51c2935a77fc4 Mon Sep 17 00:00:00 2001
From: XProger <xproger@list.ru>
Date: Sat, 24 Dec 2022 11:23:42 +0300
Subject: [PATCH] #407 32X optimizations, increase fog distance (2 blocks)

---
 src/fixed/common.h                          |  22 ++-
 src/platform/32x/asm/common.i               |  18 +--
 src/platform/32x/asm/faceAddMeshQuads.i     |  68 ++++----
 src/platform/32x/asm/faceAddMeshTriangles.i |  70 ++++----
 src/platform/32x/asm/faceAddRoomQuads.i     |  94 ++++++-----
 src/platform/32x/asm/faceAddRoomTriangles.i |  78 ++++-----
 src/platform/32x/asm/rasterize.i            |   4 +-
 src/platform/32x/asm/rasterizeF.i           |  83 ++++------
 src/platform/32x/asm/rasterizeFT.i          |  19 +--
 src/platform/32x/asm/rasterizeGT.i          |  19 +--
 src/platform/32x/asm/rasterizeS.i           |  77 ++++-----
 src/platform/32x/asm/transformMesh.i        |  22 +--
 src/platform/32x/asm/transformRoom.i        | 170 ++++++++------------
 src/platform/32x/rasterizer.h               |  10 +-
 src/platform/32x/render.cpp                 | 103 +++++++-----
 15 files changed, 392 insertions(+), 465 deletions(-)

diff --git a/src/fixed/common.h b/src/fixed/common.h
index 9b298b0..1cdeb75 100644
--- a/src/fixed/common.h
+++ b/src/fixed/common.h
@@ -637,9 +637,12 @@ struct Matrix
 
 struct RoomQuad
 {
-#ifdef __3DO__
+#if defined(__3DO__)
     uint32 flags;
     uint16 indices[4];
+#elif defined(__32X__)
+    uint32 flags;
+    int8 indices[4];
 #else
     int8 indices[4];
     uint16 flags;
@@ -649,9 +652,12 @@ struct RoomQuad
 
 struct RoomTriangle
 {
-#ifdef __3DO__
+#if defined(__3DO__)
     uint32 flags;
     uint16 indices[4];
+#elif defined(__32X__)
+    uint16 flags;
+    uint16 indices[3];
 #else
     uint16 indices[3];
     uint16 flags;
@@ -660,9 +666,12 @@ struct RoomTriangle
 
 struct MeshQuad
 {
-#ifdef __3DO__
+#if defined(__3DO__)
     uint32 flags;
     uint32 indices;
+#elif defined(__32X__)
+    uint16 flags;
+    uint8  indices[4];
 #else
     int8 indices[4];
     uint16 flags;
@@ -672,9 +681,12 @@ struct MeshQuad
 
 struct MeshTriangle
 {
-#ifdef __3DO__
+#if defined(__3DO__)
     uint32 flags;
     uint32 indices;
+#elif defined(__32X__)
+    uint16 flags;
+    uint8  indices[4];
 #else
     int8 indices[4];
     uint16 flags;
@@ -743,7 +755,7 @@ struct Face
 {
     uint32 flags;
     Face* next;
-    uint16 indices[4];
+    int16 indices[4];
 };
 #endif
 
diff --git a/src/platform/32x/asm/common.i b/src/platform/32x/asm/common.i
index 85e1c5f..e9b85b5 100644
--- a/src/platform/32x/asm/common.i
+++ b/src/platform/32x/asm/common.i
@@ -54,22 +54,20 @@
 
 #define FACE_SIZEOF             16
 
-#define VIEW_DIST       (1024 * 10)   // max = DIV_TABLE_END << PROJ_SHIFT
-#define FOG_SHIFT       1
-#define FOG_MAX         VIEW_DIST
-#define FOG_MIN         (FOG_MAX - (8192 >> FOG_SHIFT))
-#define VIEW_MIN        (64)
-#define VIEW_MAX        (VIEW_DIST)
-#define VIEW_OFF        4096
+#define VIEW_MIN        64
+#define VIEW_MAX        (10 << 10)
+#define FOG_SHIFT       4
+#define FOG_MIN         (VIEW_MAX - 2048)
+
+#define OT_SHIFT        4
 
 #define CLIP_FRAME      (1 << 0)
 #define CLIP_LEFT       (1 << 1)
 #define CLIP_RIGHT      (1 << 2)
 #define CLIP_TOP        (1 << 3)
 #define CLIP_BOTTOM     (1 << 4)
-#define CLIP_FAR        (1 << 5)
-#define CLIP_NEAR       (1 << 6)
-#define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_FAR + CLIP_NEAR)
+#define CLIP_PLANE      (1 << 5)
+#define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_PLANE)
 
 #define VP_MINX         0
 #define VP_MINY         4
diff --git a/src/platform/32x/asm/faceAddMeshQuads.i b/src/platform/32x/asm/faceAddMeshQuads.i
index 6b0bfd7..ca67756 100644
--- a/src/platform/32x/asm/faceAddMeshQuads.i
+++ b/src/platform/32x/asm/faceAddMeshQuads.i
@@ -26,9 +26,9 @@
 #define vz2         vg2
 #define vz3         vg3
 
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
-#define ot          tmp
+#define ot          vg0
 
 .align 4
 .global _faceAddMeshQuads_asm
@@ -43,26 +43,30 @@ _faceAddMeshQuads_asm:
         mov.l   r14, @-sp
 
         mov.l   var_gVertices_fam, vertices
+        add     #VERTEX_Z, vertices
 
         mov.l   var_gVerticesBase_fam, vp
         mov.l   @vp, vp
 
         mov.l   var_gFacesBase_fam, face
         mov.l   @face, face
+        nop
 
 .loop_famq:
         // read flags and indices
         mov.w   @polys+, flags
-        mov.b   @polys+, vp0
-        mov.b   @polys+, vp1
-        mov.b   @polys+, vp2
-        mov.b   @polys+, vp3
+        mov.w   @polys+, vp0
+        mov.w   @polys+, vp2
 
-        extu.w  flags, flags
+        extu.w  flags, flags // TODO packer free high bit
+
+        extu.b  vp0, vp1
+        shlr8   vp0
         extu.b  vp0, vp0
-        extu.b  vp1, vp1
+
+        extu.b  vp2, vp3
+        shlr8   vp2
         extu.b  vp2, vp2
-        extu.b  vp3, vp3
 
         // p = gVerticesBase + index * VERTEX_SIZEOF
         shll2   vp0
@@ -111,50 +115,40 @@ _faceAddMeshQuads_asm:
         or      tmp, flags
 
 .avg_z4_famq:
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
         mov.w   @vp1, vz1
         mov.w   @vp2, vz2
         mov.w   @vp3, vz3
-        add     vz1, vz0
-        add     vz2, vz0
-        add     vz3, vz0
-        shlr2   vz0             // div by 4
+        add     vz1, depth
+        add     vz2, depth
+        add     vz3, depth
+        shlr2   depth           // depth /= 4
 
         mov.l   var_gOT_fam, ot
 
  .face_add_famq:
-        // index = (p - vertices) / VERTEX_SIZEOF
+        // offset = (p - vertices)
         sub     vertices, vp0
         sub     vertices, vp1
         sub     vertices, vp2
         sub     vertices, vp3
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr2   vp3
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-        shlr    vp3
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth
 
         shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)
 
+        shll16  vp3
+        xtrct   vp2, vp3
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp3, @(12, face)
         add     #FACE_SIZEOF, face
-        mov     face, tmp
+        nop
 
-        mov.w   vp3, @-tmp
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_famq:
         dt      count
         bf      .loop_famq
diff --git a/src/platform/32x/asm/faceAddMeshTriangles.i b/src/platform/32x/asm/faceAddMeshTriangles.i
index 383fc48..fe4d869 100644
--- a/src/platform/32x/asm/faceAddMeshTriangles.i
+++ b/src/platform/32x/asm/faceAddMeshTriangles.i
@@ -25,7 +25,7 @@
 #define vz1         vg1
 #define vz2         vg2
 
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
 
 .align 4
@@ -41,6 +41,7 @@ _faceAddMeshTriangles_asm:
         mov.l   r14, @-sp
 
         mov.l   var_gVertices_fam, vertices
+        add     #VERTEX_Z, vertices
 
         mov.l   var_gVerticesBase_fam, vp
         mov.l   @vp, vp
@@ -49,19 +50,20 @@ _faceAddMeshTriangles_asm:
         mov.l   @face, face
 
         mov.l   var_gOT_fam, ot
-        nop
 
 .loop_famt:
         // read flags and indices
         mov.w   @polys+, flags
-        mov.b   @polys+, vp0
-        mov.b   @polys+, vp1
-        mov.b   @polys+, vp2
-        add     #1, polys       // skup 4th index
+        mov.w   @polys+, vp0
+        mov.w   @polys+, vp2
 
-        extu.w  flags, flags
+        extu.w  flags, flags // TODO packer free high bit
+
+        extu.b  vp0, vp1
+        shlr8   vp0
         extu.b  vp0, vp0
-        extu.b  vp1, vp1
+
+        shlr8   vp2
         extu.b  vp2, vp2
 
         // p = gVerticesBase + index * VERTEX_SIZEOF
@@ -80,11 +82,9 @@ _faceAddMeshTriangles_asm:
         // check_backface
         ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
         bt/s    .skip_famt
-        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
-        or      tmp, flags
 
         // fetch clip masks
-        mov     #(VERTEX_CLIP - 4), tmp
+        mov     #(VERTEX_CLIP - 4), tmp         // [delay slot]
         mov.b   @(tmp, vp0), vg0
         mov.b   @(tmp, vp1), vg1
         mov.b   @(tmp, vp2), vg2
@@ -95,8 +95,11 @@ _faceAddMeshTriangles_asm:
         tst     #CLIP_DISCARD, tmp
         bf/s    .skip_famt
 
+        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
+        or      tmp, flags
+
         // mark if should be clipped by frame
-        mov     vg0, tmp        // [delay slot]
+        mov     vg0, tmp
         or      vg1, tmp
         or      vg2, tmp
         tst     #CLIP_FRAME, tmp
@@ -105,44 +108,35 @@ _faceAddMeshTriangles_asm:
         or      tmp, flags
 
 .avg_z3_famt:
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
         mov.w   @vp1, vz1
         mov.w   @vp2, vz2
-        add     vz1, vz0
-        add     vz2, vz0
-        add     vz2, vz0        // approx.
-        shlr2   vz0             // div by 4
+        add     vz1, depth
+        add     vz2, depth
+        add     vz2, depth      // approx.
+        shlr2   depth           // depth /= 4
 
 .face_add_famt:
-        // index = (p - vertices) / VERTEX_SIZEOF
+        // offset = (p - vertices)
         sub     vertices, vp0
         sub     vertices, vp1
         sub     vertices, vp2
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth
 
         shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)
 
+        shll16  vp2
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp2, @(12, face)
         add     #FACE_SIZEOF, face
-        mov     face, tmp
-        add     #-2, tmp        // skip 4th index
+        nop
 
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_famt:
         dt      count
         bf      .loop_famt
diff --git a/src/platform/32x/asm/faceAddRoomQuads.i b/src/platform/32x/asm/faceAddRoomQuads.i
index bb9b533..5897a7a 100644
--- a/src/platform/32x/asm/faceAddRoomQuads.i
+++ b/src/platform/32x/asm/faceAddRoomQuads.i
@@ -26,9 +26,9 @@
 #define vz2         vg2
 #define vz3         vg3
 
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
-#define ot          tmp
+#define ot          vg0
 
 .align 4
 .global _faceAddRoomQuads_asm
@@ -43,34 +43,44 @@ _faceAddRoomQuads_asm:
         mov.l   r14, @-sp
 
         mov.l   var_gVertices_far, vertices
+        add     #VERTEX_Z, vertices
 
         mov.l   var_gVerticesBase_far, vp
         mov.l   @vp, vp
 
         mov.l   var_gFacesBase_far, face
         mov.l   @face, face
+        nop
 
 .loop_farq:
         // read flags and indices
-        mov.w   @polys+, flags
-        mov.w   @polys+, vp0
-        mov.w   @polys+, vp1
-        mov.w   @polys+, vp2
-        mov.w   @polys+, vp3
-        extu.w  flags, flags
-        // indices never exceed 32k, no need for extu.w
+        mov.l   @polys+, flags
+        mov.l   @polys+, vp0
 
-        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
+        exts.b  vp0, vp3
+        shlr8   vp0
+        exts.b  vp0, vp2
+        shlr8   vp0
+        exts.b  vp0, vp1
+        shlr8   vp0
+        exts.b  vp0, vp0
+
+        // index *= 8 (VERTEX_SIZEOF)
         shll2   vp0
         shll2   vp1
         shll2   vp2
         shll2   vp3
+        shll    vp0
+        shll    vp1
+        shll    vp2
+        shll    vp3
 
         // get vertex address
         add     vp, vp0
-        add     vp, vp1
-        add     vp, vp2
-        add     vp, vp3
+        add     vp0, vp1
+        add     vp1, vp2
+        add     vp2, vp3
+        mov     vp3, vp
 
         // fetch ((g << 8) | clip)
         mov     #VERTEX_G, tmp
@@ -116,59 +126,45 @@ _faceAddRoomQuads_asm:
         add     #VERTEX_Z, vp3  // [delay slot] ccw shifts p[0..2] address to VERTEX_Z, shift p3 too
 
         // max_z4
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
         mov.w   @vp1, vz1 
         // check_z1
-        cmp/gt  vz0, vz1
+        cmp/gt  depth, vz1
         bf/s    3f
         mov.w   @vp2, vz2       // [delay slot]
-        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+        mov     vz1, depth      // if (z1 > z0) z0 = z1
 3:      // check_z2
-        cmp/gt  vz0, vz2
+        cmp/gt  depth, vz2
         bf/s    4f
         mov.w   @vp3, vz3       // [delay slot]
-        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+        mov     vz2, depth      // if (z2 > z0) z0 = z2
 4:      // check_z3
-        cmp/gt  vz0, vz3
-        bf      .face_add_farq  // TODO use delay slot but not for OT! )
-        mov     vz3, vz0        // if (z3 > z0) z0 = z3
+        cmp/gt  depth, vz3
+        bf/s    .face_add_farq
+        sub     vertices, vp0   // [delay slot] get the first offset
+        mov     vz3, depth      // if (z3 > z0) z0 = z3
 
 .face_add_farq:
-        mov.l   var_gOT_far, ot // [delay slot]
-        // get absolute indices
-        // p address is 4 bytes ahead but it's fine for shlr3
-        // index = (p - vertices) / VERTEX_SIZEOF
-        sub     vertices, vp0
+        mov.l   var_gOT_far, ot
+        // offset = (p - vertices)
         sub     vertices, vp1
         sub     vertices, vp2
         sub     vertices, vp3
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr2   vp3
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-        shlr    vp3
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth
 
         shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)
 
+        shll16  vp3
+        xtrct   vp2, vp3
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp3, @(12, face)
         add     #FACE_SIZEOF, face
-        mov     face, tmp
-
-        mov.w   vp3, @-tmp
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_farq:
         dt      count
         bf      .loop_farq
diff --git a/src/platform/32x/asm/faceAddRoomTriangles.i b/src/platform/32x/asm/faceAddRoomTriangles.i
index 56580c6..6ad484b 100644
--- a/src/platform/32x/asm/faceAddRoomTriangles.i
+++ b/src/platform/32x/asm/faceAddRoomTriangles.i
@@ -25,7 +25,7 @@
 #define vz1         vg1
 #define vz2         vg2
 
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
 
 .align 4
@@ -41,6 +41,7 @@ _faceAddRoomTriangles_asm:
         mov.l   r14, @-sp
 
         mov.l   var_gVertices_far, vertices
+        add     #VERTEX_Z, vertices
 
         mov.l   var_gVerticesBase_far, vp
         mov.l   @vp, vp
@@ -49,21 +50,19 @@ _faceAddRoomTriangles_asm:
         mov.l   @face, face
 
         mov.l   var_gOT_far, ot
-        nop
 
 .loop_fart:
         // read flags and indices
-        mov.w   @polys+, flags
-        mov.w   @polys+, vp0
-        mov.w   @polys+, vp1
-        mov.w   @polys+, vp2
-        extu.w  flags, flags
-        // indices never exceed 32k, no need for extu.w
+        mov.l   @polys+, flags
+        mov.l   @polys+, vp1
 
-        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
-        shll2   vp0
-        shll2   vp1
-        shll2   vp2
+        extu.w  flags, vp0
+        shlr16  flags
+
+        extu.w  vp1, vp2
+        shlr16  vp1
+
+        // vp[0..2] already multiplied by VERTEX_SIZEOF
 
         // get vertex address
         add     vp, vp0
@@ -90,7 +89,7 @@ _faceAddRoomTriangles_asm:
         or      vg2, tmp
         tst     #CLIP_FRAME, tmp
         bt/s    1f
-        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot]
+        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot] mov #1, tmp; rotr x2
         or      tmp, flags
 
 1:      // compare VERTEX_G for gouraud rasterization
@@ -100,60 +99,47 @@ _faceAddRoomTriangles_asm:
         shlr8   vg1             // shift down for g only
         tst     vg1, vg1
         bt/s    2f
-        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot]
+        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot] mov #128, tmp; shll8
         add     tmp, flags
 
 2:      // check_backface
         ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
         bt/s    .skip_fart
-        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot]
+        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot] mov #1, tmp; rotr
         or      tmp, flags
 
         // max_z3
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth     // depth = vz0
         mov.w   @vp1, vz1
         // check_z1
-        cmp/gt  vz0, vz1
+        cmp/gt  depth, vz1
         bf/s    3f
         mov.w   @vp2, vz2       // [delay slot]
-        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+        mov     vz1, depth      // if (z1 > depth) depth = z1
 3:      // check_z2
-        cmp/gt  vz0, vz2
-        bf      .face_add_fart  // TODO use delay slot but not for OT! )
-        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+        cmp/gt  depth, vz2
+        bf/s    .face_add_fart  // keep depth if !(z2 > depth)
+        sub     vertices, vp0   // [delay slot] get the first offset
+        mov     vz2, depth      // if (z2 > depth) depth = z2
 
 .face_add_fart:
-        // get absolute indices
-        // p address is 4 bytes ahead but it's fine for shlr3
-        // index = (p - vertices) / VERTEX_SIZEOF
-        sub     vertices, vp0
+        // offset = (p - vertices)
         sub     vertices, vp1
         sub     vertices, vp2
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth
 
         shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)
 
+        shll16  vp2
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp2, @(12, face)
         add     #FACE_SIZEOF, face
-        mov     face, tmp
-        add     #-2, tmp        // skip 4th index
-
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_fart:
         dt      count
         bf      .loop_fart
diff --git a/src/platform/32x/asm/rasterize.i b/src/platform/32x/asm/rasterize.i
index c92a552..c3e64f2 100644
--- a/src/platform/32x/asm/rasterize.i
+++ b/src/platform/32x/asm/rasterize.i
@@ -40,8 +40,8 @@ _rasterize_asm:
 
 .align 2
 var_fb:
-        // overwrite image frame buffer address has the same
-        // write per but allow transparent write for byte & word
+        // overwrite image frame buffer address, it has the same
+        // write latency but allows transparent writes for byte & word
         .long 0x24020200
 var_table:
 #ifdef ON_CHIP_RENDER
diff --git a/src/platform/32x/asm/rasterizeF.i b/src/platform/32x/asm/rasterizeF.i
index 3c023fc..a1e5b22 100644
--- a/src/platform/32x/asm/rasterizeF.i
+++ b/src/platform/32x/asm/rasterizeF.i
@@ -5,25 +5,22 @@
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define index   r6      // arg
-#define gtile   r7      // arg (unused)
-#define N       gtile
+#define h       r7
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
 #define Rdx     r11
 #define dup     r12     // const
 #define inv     r13
-#define divLUT  r14
+#define R       r14
 
-#define R       index
-#define h       N
+#define divLUT  inv
 
 #define Ry      inv
 #define Ly      inv
 
-#define Rptr    R
+#define Rptr    index
 
-#define iw      inv
 #define ih      inv
 #define LMAP    inv
 
@@ -38,7 +35,6 @@
         mov.l   @sp+, r9
         rts
         mov.l   @sp+, r8
-        nop
 
 .global _rasterizeF_asm
 _rasterizeF_asm:
@@ -63,37 +59,30 @@ _rasterizeF_asm:
 
         mov     L, R
 
-        mov.l   var_divTable_fs, divLUT
-
         mov     #0, Rh
-        mov     #0, Lh
-.loop_f:
-        tst     Lh, Lh
-        bf/s    .calc_left_end_f
 
 .calc_left_start_f:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
+        add     L, tmp          // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT)
 
-        mov.w   @L+, Lx
-        mov.w   @L+, Ly
+        mov.l   @L, Lx
+        extu.w  Lx, Ly
+        shlr16  Lx
 
-        mov     N, tmp
-        mov.w   @tmp+, Ldx
-        mov.w   @tmp+, Lh
+        mov.l   @tmp, Ldx
+        extu.w  Ldx, Lh
+        shlr16  Ldx
 
         cmp/ge  Ly, Lh
         bf/s    .exit_f
         cmp/eq  Ly, Lh          // [delay slot]
         bt/s    .calc_left_start_f      // if (L->v.y == N->v.y) check next vertex
-        mov     N, L            // [delay slot]
+        mov     tmp, L          // [delay slot]
 
         sub     Lx, Ldx
         sub     Ly, Lh
 
+        mov.l   var_divTable_fs, divLUT
         mov     Lh, tmp
         shll    tmp
         mov.w   @(tmp, divLUT), ih
@@ -104,31 +93,30 @@ _rasterizeF_asm:
 .calc_left_end_f:
 
         tst     Rh, Rh
-        bf/s    .calc_right_end_f
+        bf      .calc_right_end_f
 
 .calc_right_start_f:
-        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
+        mov.b   @(VERTEX_NEXT, R), tmp
+        add     R, tmp          // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT)
 
-        mov.w   @R+, Rx
-        mov.w   @R+, Ry
+        mov.l   @R, Rx
+        extu.w  Rx, Ry
+        shlr16  Rx
 
-        mov     N, tmp
-        mov.w   @tmp+, Rdx
-        mov.w   @tmp+, Rh
+        mov.l   @tmp, Rdx
+        extu.w  Rdx, Rh
+        shlr16  Rdx
 
         cmp/ge  Ry, Rh
         bf/s    .exit_f
         cmp/eq  Ry, Rh          // [delay slot]
         bt/s    .calc_right_start_f     // if (R->v.y == N->v.y) check next vertex
-        mov     N, R            // [delay slot]
+        mov     tmp, R          // [delay slot]
 
         sub     Rx, Rdx
         sub     Ry, Rh
 
+        mov.l   var_divTable_fs, divLUT
         mov     Rh, tmp
         shll    tmp
         mov.w   @(tmp, divLUT), ih
@@ -148,8 +136,6 @@ _rasterizeF_asm:
         sub     h, Lh
         sub     h, Rh
 
-        mov.l   R, @-sp
-        
 .scanline_start_f:
         mov     Lx, Lptr
         mov     Rx, Rptr
@@ -160,12 +146,6 @@ _rasterizeF_asm:
         cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
         bf/s    .scanline_end_f
 
-        // iw = divTable[Rptr - Lptr]
-        mov     Rptr, tmp       // [delay slot]
-        sub     Lptr, tmp
-        shll    tmp
-        mov.w   @(tmp, divLUT), iw
-
         add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
         add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
 
@@ -178,10 +158,10 @@ _rasterizeF_asm:
         mov.b   dup, @Lptr
         add     #1, Lptr
 
-        mov     #1, tmp         // tmp = 1 (for align_right)
         cmp/gt  Lptr, Rptr
         bf/s    .scanline_end_f
         tst     tmp, Rptr
+        nop
 
 .align_right_f:
         bt      .block_2px_f
@@ -192,17 +172,20 @@ _rasterizeF_asm:
 .block_2px_f:
         mov.w   dup, @-Rptr
         cmp/gt  Lptr, Rptr
-        bt      .block_2px_f
+        bt/s    .block_2px_f
+        nop
 
 .scanline_end_f:
         dt      h
 
         mov.w   var_frameWidth_fs, tmp
         bf/s    .scanline_start_f
-        add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
+        add     tmp, pixel      // [delay slot] pixel += FRAME_WIDTH
 
-        bra     .loop_f
-        mov.l   @sp+, R
+        tst     Lh, Lh
+        bf      .calc_right_start_f
+        bra     .calc_left_start_f
+        nop
 
 #undef tmp
 #undef Lh
@@ -211,7 +194,6 @@ _rasterizeF_asm:
 #undef pixel
 #undef L
 #undef index
-#undef N
 #undef Lx
 #undef Rx
 #undef Ldx
@@ -224,6 +206,5 @@ _rasterizeF_asm:
 #undef Ry
 #undef Ly
 #undef Rptr
-#undef iw
 #undef ih
 #undef LMAP
diff --git a/src/platform/32x/asm/rasterizeFT.i b/src/platform/32x/asm/rasterizeFT.i
index f617727..48d5e05 100644
--- a/src/platform/32x/asm/rasterizeFT.i
+++ b/src/platform/32x/asm/rasterizeFT.i
@@ -66,6 +66,7 @@
         mov.l   @sp+, r9
         rts
         mov.l   @sp+, r8
+        nop
 
 .global _rasterizeFT_asm
 _rasterizeFT_asm:
@@ -95,14 +96,13 @@ _rasterizeFT_asm:
 
         tst     Lh, Lh
         bf/s    .calc_left_end_ft
+        nop
 
 .calc_left_start_ft:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
         mov     tmp, N
 
         mov.w   @(VERTEX_Y, L), tmp
-        shll2   N
-        shll2   N
         add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
         mov     tmp, Ly
         mov.w   @(VERTEX_Y, N), tmp
@@ -144,14 +144,13 @@ _rasterizeFT_asm:
         shlr16  Rh              // Rh = (Rh >> 16)
         tst     Rh, Rh
         bf/s    .calc_right_end_ft
+        nop
 
 .calc_right_start_ft:
         mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
         mov     tmp, N
 
         mov.w   @(VERTEX_Y, R), tmp
-        shll2   N
-        shll2   N
         add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
         mov     tmp, Ry
         mov.w   @(VERTEX_Y, N), tmp
@@ -206,7 +205,8 @@ _rasterizeFT_asm:
         mov.l   tmp, @(SP_H, sp)
         mov.l   L, @(SP_L, sp)
         mov.l   R, @(SP_R, sp)
-        
+        nop
+
 .scanline_start_ft:
         mov     Lx, Lptr
         mov     Rx, Rptr
@@ -263,15 +263,15 @@ _rasterizeFT_asm:
 
         cmp/gt  Lptr, Rptr
         bf/s    .scanline_end_ft
+        nop
 
 .block_prepare_ft:
         shll    dtdx            // [delay slot] optional
+        nop
 
 .block_2px_ft:
-        swap.b  t, index        // UUuuvvVV
-        swap.w  index, index    // vvVVUUuu
-        shll8   index           // VVUUuu00
-        shlr16  index           // 0000VVUU
+        getUV   t, index
+
         mov.b   @(index, TILE), index
         mov.b   @(index, LMAP), index
 
@@ -283,6 +283,7 @@ _rasterizeFT_asm:
         cmp/gt  Lptr, Rptr
         bt/s    .block_2px_ft
         sub     dtdx, t         // [delay slot] t -= dtdx
+        nop
 
 .scanline_end_ft:
         mov.l   @(SP_LDX, sp), sLdx
diff --git a/src/platform/32x/asm/rasterizeGT.i b/src/platform/32x/asm/rasterizeGT.i
index 2f23cef..f149a33 100644
--- a/src/platform/32x/asm/rasterizeGT.i
+++ b/src/platform/32x/asm/rasterizeGT.i
@@ -93,8 +93,6 @@ _rasterizeGT_asm:
         add     #-SP_SIZE, sp
 
         mov     gtile, TILE
-        nop
-
         mov     #0, Rh
 
 .loop_gt:
@@ -102,14 +100,13 @@ _rasterizeGT_asm:
 
         tst     Lh, Lh
         bf/s    .calc_left_end_gt
+        shlr16  Rh              // [delay slot] Rh = (Rh >> 16)
 
 .calc_left_start_gt:
-        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
+        mov.b   @(VERTEX_PREV, L), tmp
         mov     tmp, N
 
         mov.w   @(VERTEX_Y, L), tmp
-        shll2   N
-        shll2   N
         add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
         mov     tmp, Ly
         mov.w   @(VERTEX_Y, N), tmp
@@ -159,9 +156,9 @@ _rasterizeGT_asm:
         // calc Ldt
         scaleUV Ldt, tmp, ih
         mov.l   tmp, @(SP_LDT, sp)
+        nop
 .calc_left_end_gt:
 
-        shlr16  Rh              // Rh = (Rh >> 16)
         tst     Rh, Rh
         bf/s    .calc_right_end_gt
 
@@ -170,8 +167,6 @@ _rasterizeGT_asm:
         mov     tmp, N
 
         mov.w   @(VERTEX_Y, R), tmp
-        shll2   N
-        shll2   N
         add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
         mov     tmp, Ry
         mov.w   @(VERTEX_Y, N), tmp
@@ -221,6 +216,7 @@ _rasterizeGT_asm:
         // calc Rdt
         scaleUV Rdt, tmp, ih
         mov.l   tmp, @(SP_RDT, sp)
+        nop
 .calc_right_end_gt:
 
         // bake gLightmap address into g value
@@ -233,6 +229,7 @@ _rasterizeGT_asm:
         bf/s    .scanline_prepare_gt
         mov     Lh, h           // [delay slot]
         mov     Rh, h
+        nop
 
 .scanline_prepare_gt:
         sub     h, Lh
@@ -330,10 +327,8 @@ _rasterizeGT_asm:
         shll    dgdx
 
 .block_2px_gt:
-        swap.b  t, index        // UUuuvvVV
-        swap.w  index, index    // vvVVUUuu
-        shll8   index           // VVUUuu00
-        shlr16  index           // 0000VVUU
+        getUV   t, index
+
         mov.b   @(index, TILE), index
 
         mov     g, LMAP
diff --git a/src/platform/32x/asm/rasterizeS.i b/src/platform/32x/asm/rasterizeS.i
index a1ef8bc..985b41c 100644
--- a/src/platform/32x/asm/rasterizeS.i
+++ b/src/platform/32x/asm/rasterizeS.i
@@ -5,8 +5,7 @@
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define R       r6      // arg
-#define gtile   r7      // arg (unused)
-#define N       gtile
+#define h       r7
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
@@ -16,14 +15,12 @@
 #define divLUT  r14
 
 #define index   tmp
-#define h       N
 
 #define Ry      inv
 #define Ly      inv
 
-#define Rptr    R
+#define Rptr    inv
 
-#define iw      inv
 #define ih      inv
 
 .align 4
@@ -37,7 +34,6 @@
         mov.l   @sp+, r9
         rts
         mov.l   @sp+, r8
-        nop
 
 .global _rasterizeS_asm
 _rasterizeS_asm:
@@ -58,30 +54,25 @@ _rasterizeS_asm:
         mov.l   var_divTable_fs, divLUT
 
         mov     #0, Rh
-        mov     #0, Lh
-.loop_s:
-        tst     Lh, Lh
-        bf/s    .calc_left_end_s
+        nop
 
 .calc_left_start_s:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
+        add     L, tmp          // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT)
 
-        mov.w   @L+, Lx
-        mov.w   @L+, Ly
+        mov.l   @L, Lx
+        extu.w  Lx, Ly
+        shlr16  Lx
 
-        mov     N, tmp
-        mov.w   @tmp+, Ldx
-        mov.w   @tmp+, Lh
+        mov.l   @tmp, Ldx
+        extu.w  Ldx, Lh
+        shlr16  Ldx
 
         cmp/ge  Ly, Lh
         bf/s    .exit_s
         cmp/eq  Ly, Lh          // [delay slot]
         bt/s    .calc_left_start_s      // if (L->v.y == N->v.y) check next vertex
-        mov     N, L            // [delay slot]
+        mov     tmp, L          // [delay slot]
 
         sub     Lx, Ldx
         sub     Ly, Lh
@@ -96,27 +87,26 @@ _rasterizeS_asm:
 .calc_left_end_s:
 
         tst     Rh, Rh
-        bf/s    .calc_right_end_s
+        bf      .calc_right_end_s
+        nop
 
 .calc_right_start_s:
-        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
+        mov.b   @(VERTEX_NEXT, R), tmp
+        add     R, tmp          // tmp = R + R->next (next is stored pre-shifted by VERTEX_SIZEOF_SHIFT)
 
-        mov.w   @R+, Rx
-        mov.w   @R+, Ry
+        mov.l   @R, Rx
+        extu.w  Rx, Ry
+        shlr16  Rx
 
-        mov     N, tmp
-        mov.w   @tmp+, Rdx
-        mov.w   @tmp+, Rh
+        mov.l   @tmp, Rdx
+        extu.w  Rdx, Rh
+        shlr16  Rdx
 
         cmp/ge  Ry, Rh
         bf/s    .exit_s
         cmp/eq  Ry, Rh          // [delay slot]
         bt/s    .calc_right_start_s     // if (R->v.y == N->v.y) check next vertex
-        mov     N, R            // [delay slot]
+        mov     tmp, R          // [delay slot]
 
         sub     Rx, Rdx
         sub     Ry, Rh
@@ -135,13 +125,12 @@ _rasterizeS_asm:
         bf/s    .scanline_prepare_s
         mov     Lh, h           // [delay slot]
         mov     Rh, h
+        nop
 
 .scanline_prepare_s:
         sub     h, Lh
         sub     h, Rh
 
-        mov.l   R, @-sp
-        
 .scanline_start_s:
         mov     Lx, Lptr
         mov     Rx, Rptr
@@ -152,14 +141,8 @@ _rasterizeS_asm:
         cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
         bf/s    .scanline_end_s
 
-        // iw = divTable[Rptr - Lptr]
-        mov     Rptr, tmp       // [delay slot]
-        sub     Lptr, tmp
-        shll    tmp
-        mov.w   @(tmp, divLUT), iw
-
-        add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
-        add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
+        add     pixel, Lptr     // Lptr = pixel + (Lx >> 16)
+        add     pixel, Rptr     // Rptr = pixel + (Rx >> 16)
 
 .shade_pixel_s:
         mov.b   @Lptr, index
@@ -174,10 +157,12 @@ _rasterizeS_asm:
 
         mov.w   var_frameWidth_fs, tmp
         bf/s    .scanline_start_s
-        add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
+        add     tmp, pixel      // [delay slot] pixel += FRAME_WIDTH
 
-        bra     .loop_s
-        mov.l   @sp+, R
+        tst     Lh, Lh
+        bf      .calc_right_start_s
+        bra     .calc_left_start_s
+        nop
 
 #undef tmp
 #undef Lh
@@ -186,7 +171,6 @@ _rasterizeS_asm:
 #undef pixel
 #undef L
 #undef R
-#undef N
 #undef Lx
 #undef Rx
 #undef Ldx
@@ -199,5 +183,4 @@ _rasterizeS_asm:
 #undef Ry
 #undef Ly
 #undef Rptr
-#undef iw
 #undef ih
diff --git a/src/platform/32x/asm/transformMesh.i b/src/platform/32x/asm/transformMesh.i
index 75f2095..d0da8dc 100644
--- a/src/platform/32x/asm/transformMesh.i
+++ b/src/platform/32x/asm/transformMesh.i
@@ -78,10 +78,10 @@ _transformMesh_asm:
         // pre-transform the matrix offset
         add     #M03, m
         mov.w   @m+, mx
-        shll16  mx
         mov.w   @m+, my
-        shll16  my
         mov.w   @m+, mz
+        shll16  mx
+        shll16  my
         shll16  mz
         add     #-MATRIX_SIZEOF, m
 
@@ -99,22 +99,24 @@ _transformMesh_asm:
 
         // z clipping
 .clip_z_near_m:
-        mov     #VIEW_MIN, minZ // 64
+        mov     #VIEW_MIN, minZ
         cmp/gt  z, minZ
         bf/s    .clip_z_far_m
         cmp/ge  maxZ, z         // [delay slot]
         mov     minZ, z
-        add     #CLIP_NEAR, vg
+        add     #CLIP_PLANE, vg
 .clip_z_far_m:
-        bf/s    .project_m
-        mov     z, dz           // [delay slot] dz = z
+        bf      .project_m
         mov     maxZ, z
-        add     #CLIP_FAR, vg
+        add     #CLIP_PLANE, vg
 
 .project_m:
-        // dz = divTable[z >> (PROJ_SHIFT = 4)]
-        shlr2   dz
-        shlr2   dz
+        // z >>= OT_SHIFT
+        shlr2   z
+        shlr2   z
+
+        // dz = divTable[z]
+        mov     z, dz
         shll    dz
         mov.w   @(dz, divLUT), dz
 
diff --git a/src/platform/32x/asm/transformRoom.i b/src/platform/32x/asm/transformRoom.i
index 4254a74..f737976 100644
--- a/src/platform/32x/asm/transformRoom.i
+++ b/src/platform/32x/asm/transformRoom.i
@@ -4,9 +4,9 @@
 #define res             r3
 #define vertices        r4      // arg
 #define count           r5      // arg
-#define stackVtx        r6
-#define stackMtx        r7
-#define vp              r8
+#define vp              r6
+#define m               r7
+#define vg              r8
 #define x               r9
 #define y               r10
 #define z               r11
@@ -18,13 +18,14 @@
 #define minY            tmp
 #define maxX            tmp
 #define maxY            tmp
-#define minZ            tmp
+#define minZ            x
 #define dz              tmp
-#define vg              stackVtx
-#define fog             stackMtx
-#define cnt             stackVtx
+#define stackVtx        tmp
+#define fog             x
+#define minFog          y
+#define maxG            y
 
-#define SP_SIZE         (18 + 6)        // mat3x3 + vec3
+#define SP_SIZE         (8)        // vec3s + padding
 
 .align 4
 .global _transformRoom_asm
@@ -37,7 +38,6 @@ _transformRoom_asm:
         mov.l   r12, @-sp
         mov.l   r13, @-sp
         mov.l   r14, @-sp
-        mov     sp, stackMtx 
         add     #-SP_SIZE, sp
 
         mov.l   var_viewportRel, vp
@@ -49,139 +49,111 @@ _transformRoom_asm:
 
         // store matrix into stack (in reverse order)
         mov.l   var_gMatrixPtr, tmp
-        mov.l   @tmp, tmp
+        mov.l   @tmp, m
 
-        // copy 3x3 matrix rotation part
-        mov     #9, cnt
-.copyMtx_r:
-        mov.w   @tmp+, mx
-        dt      cnt
-        bf/s    .copyMtx_r
-        mov.w   mx, @-stackMtx  // [delay slot]
-
-        // prepare offsets (const)
-        mov.w   @tmp+, mx
-        mov.w   @tmp+, my
-        mov.w   @tmp+, mz
+        // pre-transform the matrix offset
+        add     #M03, m
+        mov.w   @m+, mx
+        mov.w   @m+, my
+        mov.w   @m+, mz
         shll8   mx
         shll8   my
         shll8   mz
+        add     #-12, m         // offset to z-row
+
+        // maxZ = VIEW_MAX >> OT_SHIFT = (1024 * 10) >> OT_SHIFT = 40 << 4 (hard-codes OT_SHIFT == 4)
+        mov     #40, maxZ
+        shll2   maxZ
+        shll2   maxZ
 
         add     #8, res         // extra offset for @-Rn
-        nop
 
 .loop_r:
         // unpack vertex
         mov.b   @vertices+, x
         mov.b   @vertices+, y
         mov.b   @vertices+, z
-
         shll2   x
         shll2   y
         shll2   z
 
-        // upload vertex coords into stack (in reverse order)
+        // upload vertex coords into stack
         mov     sp, stackVtx
         add     #6, stackVtx
-        mov     stackVtx, stackMtx
 
-        //shll16  x
-        //xtrct   y, x
-        mov.w   x, @-stackVtx
-        mov.w   y, @-stackVtx
         mov.w   z, @-stackVtx
+        mov.w   y, @-stackVtx
+        mov.w   x, @-stackVtx
 
-        //transform z
+.transform_z:
         lds     mz, MACL
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
         sts     MACL, z
           add     #-6, stackVtx
+          add     #-18, m       // offset to x-row
         shlr8   z
+
+        // z >>= OT_SHIFT
+        shlr2   z
+        shlr2   z
+
         exts.w  z, z
 
-
-        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
-        // tmp = z + VIEW_OFF = z + 4096
-        mov     #16, tmp
-        shll8   tmp
-        add     z, tmp
-        // maxZ = VIEW_OFF + VIEW_MAX + VIEW_OFF = 18432
-        mov     #72, maxZ
-        shll8   maxZ
-        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
-        cmp/hi  maxZ, tmp
-        bf/s    .visible_r
-        mov     #40, maxZ       // [delay slot] maxZ = 40
-        mov     #(CLIP_NEAR + CLIP_FAR), vg
-        mov.w   vg, @-res
-        add     #1, vertices
-        dt      count
-        bf/s    .loop_r
-        add     #10, res        // [delay slot]
-        bra     .done_r
-        nop
-
-.visible_r:
-        //transform y
-        lds     my, MACL
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        sts     MACL, y
-          add     #-6, stackVtx
-        shlr8   y
-        exts.w  y, y
-
-        //transform x
-        lds     mx, MACL
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        sts     MACL, x
-          shll8   maxZ  // maxZ = VIEW_MAX = (1024 * 10) = (40 << 8)
-        shlr8   x
-        exts.w  x, x
-
-        mov.b   @vertices+, vg
-
-        // tmp = FOG_MIN = 6144 = (24 << 8)
-        mov     #24, tmp
-        shll8   tmp
+.calc_fog:
         // if z <= FOG_MIN -> skip fog calc
-        cmp/gt  tmp, z
-        bf/s    .clip_z_near_r
-        mov     z, fog          // [delay slot]
-        sub     tmp, fog        // fog = z - FOG_MIN
-        shll    fog             // FOG_SHIFT
-        shlr8   fog             // shift down to 0..31 range
+        mov     #(32 >> OT_SHIFT), minFog // minFog = FOG_MIN >> OT_SHIFT
+        shll8   minFog
+        mov     z, fog
+        subc    minFog, fog     // fog = z - minFog - T, T = borrow; TODO nothing since .loop_r sets T, clrt needed?
+        bt/s    .clip_z_near_r
+        mov.b   @vertices+, vg  // [delay slot]
+        shlr2   fog
+        shlr    fog             // shift down to 0..31 range
         add     fog, vg
         // vg = min(vg, 31)
-        mov     #31, tmp
-        cmp/gt  tmp, vg
+        mov     #31, maxG
+        cmp/gt  maxG, vg
         bf      .clip_z_near_r
         mov     #31, vg
 
         // z clipping
 .clip_z_near_r:
         add     #1, vg          // +1 for signed lightmap fetch
-        mov     #VIEW_MIN, minZ // minZ = VIEW_MIN = 64
+        mov     #(VIEW_MIN >> OT_SHIFT), minZ
         cmp/gt  z, minZ
         bf/s    .clip_z_far_r
         shll8   vg              // [delay slot] clear lower 8-bits of vg for clipping flags
         mov     minZ, z
-        add     #CLIP_NEAR, vg
+        add     #CLIP_PLANE, vg
 .clip_z_far_r:
         cmp/ge  maxZ, z
-        bf/s    .project_r
-        mov     z, dz           // [delay slot]
+        bf      .transform_x
         mov     maxZ, z
-        add     #CLIP_FAR, vg
+        add     #CLIP_PLANE, vg
 
-.project_r: // dz = divTable[z >> (PROJ_SHIFT = 4)]
-        shlr2   dz
-        shlr2   dz
+.transform_x:
+        lds     mx, MACL
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        sts     MACL, x
+          add     #-6, stackVtx
+        shlr8   x
+        exts.w  x, x
+
+.transform_y:
+        lds     my, MACL
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        sts     MACL, y
+          mov     z, dz         // dz = z (input to the divTable lookup in .project_r)
+        shlr8   y
+        exts.w  y, y
+
+.project_r: // dz = divTable[z]
         shll    dz
         mov.w   @(dz, divLUT), dz
 
@@ -266,7 +238,6 @@ _transformRoom_asm:
 #undef vertices
 #undef count
 #undef stackVtx
-#undef stackMtx
 #undef vp
 #undef x
 #undef y
@@ -282,5 +253,4 @@ _transformRoom_asm:
 #undef dz
 #undef vg
 #undef fog
-#undef cnt
 #undef SP_SIZE
\ No newline at end of file
diff --git a/src/platform/32x/rasterizer.h b/src/platform/32x/rasterizer.h
index ac93a66..4a89334 100644
--- a/src/platform/32x/rasterizer.h
+++ b/src/platform/32x/rasterizer.h
@@ -132,7 +132,7 @@ extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLin
                 }
             }
 
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
 
             Lx += Ldx;
             Rx += Rdx;
@@ -239,7 +239,7 @@ extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLin
                 }
             }
 
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
 
             Lx += Ldx;
             Rx += Rdx;
@@ -380,7 +380,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
             #endif
             }
 
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
 
             Lx += Ldx;
             Rx += Rdx;
@@ -570,7 +570,7 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
             #endif
             }
 
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
 
             Lx += Ldx;
             Rx += Rdx;
@@ -604,7 +604,7 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert
 
     if (L->v.y < 0)
     {
-        pixel -= L->v.y * VRAM_WIDTH;
+        pixel -= L->v.y * (FRAME_WIDTH >> 1);
         v -= L->v.y * dv;
         h += L->v.y;
     }
diff --git a/src/platform/32x/render.cpp b/src/platform/32x/render.cpp
index 254f008..a0c5c88 100644
--- a/src/platform/32x/render.cpp
+++ b/src/platform/32x/render.cpp
@@ -65,9 +65,8 @@ enum ClipFlags {
     CLIP_RIGHT   = 1 << 2,
     CLIP_TOP     = 1 << 3,
     CLIP_BOTTOM  = 1 << 4,
-    CLIP_FAR     = 1 << 5,
-    CLIP_NEAR    = 1 << 6,
-    CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_FAR | CLIP_NEAR),
+    CLIP_PLANE   = 1 << 5,
+    CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_PLANE)
 };
 
 const MeshQuad gShadowQuads[] = {
@@ -183,12 +182,12 @@ void transformRoom_c(const RoomVertex* vertices, int32 count)
         uint32 clip = 0;
 
         if (z <= VIEW_MIN_F) {
-            clip = CLIP_NEAR;
+            clip = CLIP_PLANE;
             z = VIEW_MIN_F;
         }
 
         if (z >= VIEW_MAX_F) {
-            clip = CLIP_FAR;
+            clip = CLIP_PLANE;
             z = VIEW_MAX_F;
         }
 
@@ -330,12 +329,12 @@ void transformMesh_c(const MeshVertex* vertices, int32 count, int32 intensity)
         uint32 clip = 0;
 
         if (z <= (VIEW_MIN_F >> FIXED_SHIFT)) {
-            clip = CLIP_NEAR;
+            clip = CLIP_PLANE;
             z = VIEW_MIN_F >> FIXED_SHIFT;
         }
 
         if (z >= (VIEW_MAX_F >> FIXED_SHIFT)) {
-            clip = CLIP_FAR;
+            clip = CLIP_PLANE;
             z = VIEW_MAX_F >> FIXED_SHIFT;
         }
 
@@ -598,25 +597,25 @@ int32 sphereIsVisible_c(int32 sx, int32 sy, int32 sz, int32 r)
 
 void flush_ot(int32 bit)
 {
-    VertexLink v[4 + 3];
+    VertexLink v[4 + 4];
     VertexLink* q = v;
     VertexLink* t = v + 4;
     // quad
-    q[0].prev = 3;
-    q[0].next = 1;
-    q[1].prev = -1;
-    q[1].next = 1;
-    q[2].prev = -1;
-    q[2].next = 1;
-    q[3].prev = -1;
-    q[3].next = -3;
+    q[0].prev = (3 << 4);
+    q[0].next = (1 << 4);
+    q[1].prev = -(1 << 4);
+    q[1].next = (1 << 4);
+    q[2].prev = -(1 << 4);
+    q[2].next = (1 << 4);
+    q[3].prev = -(1 << 4);
+    q[3].next = -(3 << 4);
     // triangle
-    t[0].prev = 2;
-    t[0].next = 1;
-    t[1].prev = -1;
-    t[1].next = 1;
-    t[2].prev = -1;
-    t[2].next = -2;
+    t[0].prev = (2 << 4);
+    t[0].next = (1 << 4);
+    t[1].prev = -(1 << 4);
+    t[1].next = (1 << 4);
+    t[2].prev = -(1 << 4);
+    t[2].next = -(2 << 4);
 
     int32 index = 0;
     const ColorIndex* tile = NULL;
@@ -654,12 +653,29 @@ void flush_ot(int32 bit)
                     ptr[3].t.t = 0xFF00FF00 & (tex.uv23 << 8);
                 }
 
-                ptr[0].v = gVertices[face->indices[0]];
-                ptr[1].v = gVertices[face->indices[1]];
-                ptr[2].v = gVertices[face->indices[2]];
+            #if 1
+                uint8* vPtr = (uint8*)gVertices;
+                ((uint32*)&ptr[0].v)[0] = ((uint32*)(vPtr + face->indices[0]))[0];
+                ((uint32*)&ptr[0].v)[1] = ((uint32*)(vPtr + face->indices[0]))[1];
+
+                ((uint32*)&ptr[1].v)[0] = ((uint32*)(vPtr + face->indices[1]))[0];
+                ((uint32*)&ptr[1].v)[1] = ((uint32*)(vPtr + face->indices[1]))[1];
+
+                ((uint32*)&ptr[2].v)[0] = ((uint32*)(vPtr + face->indices[2]))[0];
+                ((uint32*)&ptr[2].v)[1] = ((uint32*)(vPtr + face->indices[2]))[1];
+
                 if (!(flags & FACE_TRIANGLE)) {
-                    ptr[3].v = gVertices[face->indices[3]];
+                    ((uint32*)&ptr[3].v)[0] = ((uint32*)(vPtr + face->indices[3]))[0];
+                    ((uint32*)&ptr[3].v)[1] = ((uint32*)(vPtr + face->indices[3]))[1];
                 }
+            #else
+                ptr[0].v = gVertices[face->indices[0] >> 3];
+                ptr[1].v = gVertices[face->indices[1] >> 3];
+                ptr[2].v = gVertices[face->indices[2] >> 3];
+                if (!(flags & FACE_TRIANGLE)) {
+                    ptr[3].v = gVertices[face->indices[3] >> 3];
+                }
+            #endif
 
                 if (flags & FACE_CLIPPED) {
                     drawPoly(flags, ptr, tile);
@@ -855,10 +871,10 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde
     bool skip = (first->v.y == last->v.y);
 
     VertexLink* top = (first->v.y < last->v.y) ? first : last;
-    first->prev = count - 1;
-    first->next = 1;
-    last->prev = -1;
-    last->next = 1 - count;
+    first->prev = (count - 1) << 4;
+    first->next = (1 << 4);
+    last->prev = -(1 << 4);
+    last->next = (1 - count) << 4;
 
     for (int32 i = 1; i < count - 1; i++)
     {
@@ -873,8 +889,8 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde
             skip = false;
         }
 
-        p->prev = -1;
-        p->next = 1;
+        p->prev = -(1 << 4);
+        p->next = (1 << 4);
     }
 
     if (skip)
@@ -910,7 +926,7 @@ void clear()
     MARS_SYS_COMM4 = MARS_CMD_CLEAR;
 }
 
-void renderRoom(const Room* room)
+void renderRoom(Room* room)
 {
     int32 vCount = room->info->verticesCount;
     if (vCount <= 0)
@@ -1225,14 +1241,8 @@ const int32 BAR_COLORS[BAR_MAX][5] = {
     { 43, 44, 43, 42, 41 },
 };
 
-X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 shade, int32 color1, int32 color2, int32 z)
+X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 color1, int32 color2, int32 z)
 {
-    // background
-    if (shade >= 0) {
-        renderFill(x + 1, y + 1, width - 2, height - 2, shade, z);
-    }
-
-    // frame
     renderLine(x + 1, y, width - 2, 1, color1, z);
     renderLine(x + 1, y + height - 1, width - 2, 1, color2, z);
     renderLine(x, y, 1, height, color1, z);
@@ -1242,9 +1252,9 @@ X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32
 void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type)
 {
     // colored bar
-    int32 ix = x + 2;
-    int32 iy = y + 2;
-    int32 w = value * width >> 8;
+    int32 ix = x + 1;
+    int32 iy = y + 1;
+    int32 w = value* width >> 8;
 
     if (w > 0)
     {
@@ -1254,7 +1264,12 @@ void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type)
         }
     }
 
-    renderBorder(x, y, width + 4, BAR_HEIGHT + 4, 27, 19, 17, 0);
+    if (w < width)
+    {
+        renderFill(x + 1 + w, y + 1, width - w, BAR_HEIGHT, 27, 0);
+    }
+
+    renderBorder(x, y, width + 2, BAR_HEIGHT + 2, 19, 17, 0);
 }
 
 void renderBackground(const void* background)