#407 32X optimizations, increase fog distance (2 blocks)

2025-01-17 04:48:57 +01:00 · 2022-12-24 11:23:42 +03:00 · 2022-12-24 11:23:42 +03:00 · 4e9b92e5a4
commit 4e9b92e5a4
parent d268754786
15 changed files with 392 additions and 465 deletions
--- a/src/fixed/common.h
+++ b/src/fixed/common.h
@@ -637,9 +637,12 @@ struct Matrix

 struct RoomQuad
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint16 indices[4];
+#elif defined(__32X__)
+    uint32 flags;
+    int8 indices[4];
 #else
    int8 indices[4];
    uint16 flags;
@@ -649,9 +652,12 @@ struct RoomQuad

 struct RoomTriangle
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint16 indices[4];
+#elif defined(__32X__)
+    uint16 flags;
+    uint16 indices[3];
 #else
    uint16 indices[3];
    uint16 flags;
@@ -660,9 +666,12 @@ struct RoomTriangle

 struct MeshQuad
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint32 indices;
+#elif defined(__32X__)
+    uint16 flags;
+    uint8  indices[4];
 #else
    int8 indices[4];
    uint16 flags;
@@ -672,9 +681,12 @@ struct MeshQuad

 struct MeshTriangle
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint32 indices;
+#elif defined(__32X__)
+    uint16 flags;
+    uint8  indices[4];
 #else
    int8 indices[4];
    uint16 flags;
@@ -743,7 +755,7 @@ struct Face
 {
    uint32 flags;
    Face* next;
-    uint16 indices[4];
+    int16 indices[4];
 };
 #endif

--- a/src/platform/32x/asm/common.i
+++ b/src/platform/32x/asm/common.i
@@ -54,22 +54,20 @@

 #define FACE_SIZEOF             16

-#define VIEW_DIST       (1024 * 10)   // max = DIV_TABLE_END << PROJ_SHIFT
-#define FOG_SHIFT       1
-#define FOG_MAX         VIEW_DIST
-#define FOG_MIN         (FOG_MAX - (8192 >> FOG_SHIFT))
-#define VIEW_MIN        (64)
-#define VIEW_MAX        (VIEW_DIST)
-#define VIEW_OFF        4096
+#define VIEW_MIN        64
+#define VIEW_MAX        (10 << 10)
+#define FOG_SHIFT       4
+#define FOG_MIN         (VIEW_MAX - 2048)
+
+#define OT_SHIFT        4

 #define CLIP_FRAME      (1 << 0)
 #define CLIP_LEFT       (1 << 1)
 #define CLIP_RIGHT      (1 << 2)
 #define CLIP_TOP        (1 << 3)
 #define CLIP_BOTTOM     (1 << 4)
-#define CLIP_FAR        (1 << 5)
-#define CLIP_NEAR       (1 << 6)
-#define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_FAR + CLIP_NEAR)
+#define CLIP_PLANE      (1 << 5)
+#define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_PLANE)

 #define VP_MINX         0
 #define VP_MINY         4
--- a/src/platform/32x/asm/faceAddMeshQuads.i
+++ b/src/platform/32x/asm/faceAddMeshQuads.i
@@ -26,9 +26,9 @@
 #define vz2         vg2
 #define vz3         vg3

-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
-#define ot          tmp
+#define ot          vg0

 .align 4
 .global _faceAddMeshQuads_asm
@@ -43,26 +43,30 @@ _faceAddMeshQuads_asm:
        mov.l   r14, @-sp

        mov.l   var_gVertices_fam, vertices
+        add     #VERTEX_Z, vertices

        mov.l   var_gVerticesBase_fam, vp
        mov.l   @vp, vp

        mov.l   var_gFacesBase_fam, face
        mov.l   @face, face
+        nop

 .loop_famq:
        // read flags and indices
        mov.w   @polys+, flags
-        mov.b   @polys+, vp0
-        mov.b   @polys+, vp1
-        mov.b   @polys+, vp2
-        mov.b   @polys+, vp3
+        mov.w   @polys+, vp0
+        mov.w   @polys+, vp2

-        extu.w  flags, flags
+        extu.w  flags, flags // TODO packer free high bit
+
+        extu.b  vp0, vp1
+        shlr8   vp0
        extu.b  vp0, vp0
-        extu.b  vp1, vp1
+
+        extu.b  vp2, vp3
+        shlr8   vp2
        extu.b  vp2, vp2
-        extu.b  vp3, vp3

        // p = gVerticesBase + index * VERTEX_SIZEOF
        shll2   vp0
@@ -111,50 +115,40 @@ _faceAddMeshQuads_asm:
        or      tmp, flags

 .avg_z4_famq:
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
        mov.w   @vp1, vz1
        mov.w   @vp2, vz2
        mov.w   @vp3, vz3
-        add     vz1, vz0
-        add     vz2, vz0
-        add     vz3, vz0
-        shlr2   vz0             // div by 4
+        add     vz1, depth
+        add     vz2, depth
+        add     vz3, depth
+        shlr2   depth           // depth /= 4

        mov.l   var_gOT_fam, ot

 .face_add_famq:
-        // index = (p - vertices) / VERTEX_SIZEOF
+        // offset = (p - vertices)
        sub     vertices, vp0
        sub     vertices, vp1
        sub     vertices, vp2
        sub     vertices, vp3
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr2   vp3
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-        shlr    vp3
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth

        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)

+        shll16  vp3
+        xtrct   vp2, vp3
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp3, @(12, face)
        add     #FACE_SIZEOF, face
-        mov     face, tmp
+        nop

-        mov.w   vp3, @-tmp
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_famq:
        dt      count
        bf      .loop_famq
--- a/src/platform/32x/asm/faceAddMeshTriangles.i
+++ b/src/platform/32x/asm/faceAddMeshTriangles.i
@@ -25,7 +25,7 @@
 #define vz1         vg1
 #define vz2         vg2

-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1

 .align 4
@@ -41,6 +41,7 @@ _faceAddMeshTriangles_asm:
        mov.l   r14, @-sp

        mov.l   var_gVertices_fam, vertices
+        add     #VERTEX_Z, vertices

        mov.l   var_gVerticesBase_fam, vp
        mov.l   @vp, vp
@@ -49,19 +50,20 @@ _faceAddMeshTriangles_asm:
        mov.l   @face, face

        mov.l   var_gOT_fam, ot
-        nop

 .loop_famt:
        // read flags and indices
        mov.w   @polys+, flags
-        mov.b   @polys+, vp0
-        mov.b   @polys+, vp1
-        mov.b   @polys+, vp2
-        add     #1, polys       // skup 4th index
+        mov.w   @polys+, vp0
+        mov.w   @polys+, vp2

-        extu.w  flags, flags
+        extu.w  flags, flags // TODO packer free high bit
+
+        extu.b  vp0, vp1
+        shlr8   vp0
        extu.b  vp0, vp0
-        extu.b  vp1, vp1
+
+        shlr8   vp2
        extu.b  vp2, vp2

        // p = gVerticesBase + index * VERTEX_SIZEOF
@@ -80,11 +82,9 @@ _faceAddMeshTriangles_asm:
        // check_backface
        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
        bt/s    .skip_famt
-        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
-        or      tmp, flags

        // fetch clip masks
-        mov     #(VERTEX_CLIP - 4), tmp
+        mov     #(VERTEX_CLIP - 4), tmp         // [delay slot]
        mov.b   @(tmp, vp0), vg0
        mov.b   @(tmp, vp1), vg1
        mov.b   @(tmp, vp2), vg2
@@ -95,8 +95,11 @@ _faceAddMeshTriangles_asm:
        tst     #CLIP_DISCARD, tmp
        bf/s    .skip_famt

+        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
+        or      tmp, flags
+
        // mark if should be clipped by frame
-        mov     vg0, tmp        // [delay slot]
+        mov     vg0, tmp
        or      vg1, tmp
        or      vg2, tmp
        tst     #CLIP_FRAME, tmp
@@ -105,44 +108,35 @@ _faceAddMeshTriangles_asm:
        or      tmp, flags

 .avg_z3_famt:
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
        mov.w   @vp1, vz1
        mov.w   @vp2, vz2
-        add     vz1, vz0
-        add     vz2, vz0
-        add     vz2, vz0        // approx.
-        shlr2   vz0             // div by 4
+        add     vz1, depth
+        add     vz2, depth
+        add     vz2, depth      // approx.
+        shlr2   depth           // depth /= 4

 .face_add_famt:
-        // index = (p - vertices) / VERTEX_SIZEOF
+        // offset = (p - vertices)
        sub     vertices, vp0
        sub     vertices, vp1
        sub     vertices, vp2
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth

        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)

+        shll16  vp2
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp2, @(12, face)
        add     #FACE_SIZEOF, face
-        mov     face, tmp
-        add     #-2, tmp        // skip 4th index
+        nop

-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_famt:
        dt      count
        bf      .loop_famt
--- a/src/platform/32x/asm/faceAddRoomQuads.i
+++ b/src/platform/32x/asm/faceAddRoomQuads.i
@@ -26,9 +26,9 @@
 #define vz2         vg2
 #define vz3         vg3

-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
-#define ot          tmp
+#define ot          vg0

 .align 4
 .global _faceAddRoomQuads_asm
@@ -43,34 +43,44 @@ _faceAddRoomQuads_asm:
        mov.l   r14, @-sp

        mov.l   var_gVertices_far, vertices
+        add     #VERTEX_Z, vertices

        mov.l   var_gVerticesBase_far, vp
        mov.l   @vp, vp

        mov.l   var_gFacesBase_far, face
        mov.l   @face, face
+        nop

 .loop_farq:
        // read flags and indices
-        mov.w   @polys+, flags
-        mov.w   @polys+, vp0
-        mov.w   @polys+, vp1
-        mov.w   @polys+, vp2
-        mov.w   @polys+, vp3
-        extu.w  flags, flags
-        // indices never exceed 32k, no need for extu.w
+        mov.l   @polys+, flags
+        mov.l   @polys+, vp0

-        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
+        exts.b  vp0, vp3
+        shlr8   vp0
+        exts.b  vp0, vp2
+        shlr8   vp0
+        exts.b  vp0, vp1
+        shlr8   vp0
+        exts.b  vp0, vp0
+
+        // index *= 8 (VERTEX_SIZEOF)
        shll2   vp0
        shll2   vp1
        shll2   vp2
        shll2   vp3
+        shll    vp0
+        shll    vp1
+        shll    vp2
+        shll    vp3

        // get vertex address
        add     vp, vp0
-        add     vp, vp1
-        add     vp, vp2
-        add     vp, vp3
+        add     vp0, vp1
+        add     vp1, vp2
+        add     vp2, vp3
+        mov     vp3, vp

        // fetch ((g << 8) | clip)
        mov     #VERTEX_G, tmp
@@ -116,59 +126,45 @@ _faceAddRoomQuads_asm:
        add     #VERTEX_Z, vp3  // [delay slot] ccw shifts p[0..2] address to VERTEX_Z, shift p3 too

        // max_z4
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
        mov.w   @vp1, vz1 
        // check_z1
-        cmp/gt  vz0, vz1
+        cmp/gt  depth, vz1
        bf/s    3f
        mov.w   @vp2, vz2       // [delay slot]
-        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+        mov     vz1, depth      // if (z1 > z0) z0 = z1
 3:      // check_z2
-        cmp/gt  vz0, vz2
+        cmp/gt  depth, vz2
        bf/s    4f
        mov.w   @vp3, vz3       // [delay slot]
-        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+        mov     vz2, depth      // if (z2 > z0) z0 = z2
 4:      // check_z3
-        cmp/gt  vz0, vz3
-        bf      .face_add_farq  // TODO use delay slot but not for OT! )
-        mov     vz3, vz0        // if (z3 > z0) z0 = z3
+        cmp/gt  depth, vz3
+        bf/s    .face_add_farq
+        sub     vertices, vp0   // [delay slot] get the first offset
+        mov     vz3, depth      // if (z3 > z0) z0 = z3

 .face_add_farq:
-        mov.l   var_gOT_far, ot // [delay slot]
-        // get absolute indices
-        // p address is 4 bytes ahead but it's fine for shlr3
-        // index = (p - vertices) / VERTEX_SIZEOF
-        sub     vertices, vp0
+        mov.l   var_gOT_far, ot
+        // offset = (p - vertices)
        sub     vertices, vp1
        sub     vertices, vp2
        sub     vertices, vp3
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr2   vp3
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-        shlr    vp3
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth

        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)

+        shll16  vp3
+        xtrct   vp2, vp3
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp3, @(12, face)
        add     #FACE_SIZEOF, face
-        mov     face, tmp
-
-        mov.w   vp3, @-tmp
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_farq:
        dt      count
        bf      .loop_farq
--- a/src/platform/32x/asm/faceAddRoomTriangles.i
+++ b/src/platform/32x/asm/faceAddRoomTriangles.i
@@ -25,7 +25,7 @@
 #define vz1         vg1
 #define vz2         vg2

-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1

 .align 4
@@ -41,6 +41,7 @@ _faceAddRoomTriangles_asm:
        mov.l   r14, @-sp

        mov.l   var_gVertices_far, vertices
+        add     #VERTEX_Z, vertices

        mov.l   var_gVerticesBase_far, vp
        mov.l   @vp, vp
@@ -49,21 +50,19 @@ _faceAddRoomTriangles_asm:
        mov.l   @face, face

        mov.l   var_gOT_far, ot
-        nop

 .loop_fart:
        // read flags and indices
-        mov.w   @polys+, flags
-        mov.w   @polys+, vp0
-        mov.w   @polys+, vp1
-        mov.w   @polys+, vp2
-        extu.w  flags, flags
-        // indices never exceed 32k, no need for extu.w
+        mov.l   @polys+, flags
+        mov.l   @polys+, vp1

-        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
-        shll2   vp0
-        shll2   vp1
-        shll2   vp2
+        extu.w  flags, vp0
+        shlr16  flags
+
+        extu.w  vp1, vp2
+        shlr16  vp1
+
+        // vp[0..2] already multiplied by VERTEX_SIZEOF

        // get vertex address
        add     vp, vp0
@@ -90,7 +89,7 @@ _faceAddRoomTriangles_asm:
        or      vg2, tmp
        tst     #CLIP_FRAME, tmp
        bt/s    1f
-        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot]
+        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot] mov #1, tmp; rotr x2
        or      tmp, flags

 1:      // compare VERTEX_G for gouraud rasterization
@@ -100,60 +99,47 @@ _faceAddRoomTriangles_asm:
        shlr8   vg1             // shift down for g only
        tst     vg1, vg1
        bt/s    2f
-        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot]
+        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot] mov #128, tmp; shll8
        add     tmp, flags

 2:      // check_backface
        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
        bt/s    .skip_fart
-        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot]
+        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot] mov #1, tmp; rotr
        or      tmp, flags

        // max_z3
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth     // depth = vz0
        mov.w   @vp1, vz1
        // check_z1
-        cmp/gt  vz0, vz1
+        cmp/gt  depth, vz1
        bf/s    3f
        mov.w   @vp2, vz2       // [delay slot]
-        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+        mov     vz1, depth      // if (z1 > depth) depth = z1
 3:      // check_z2
-        cmp/gt  vz0, vz2
-        bf      .face_add_fart  // TODO use delay slot but not for OT! )
-        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+        cmp/gt  depth, vz2
+        bf/s    .face_add_fart  // z2 <= depth: keep the current depth
+        sub     vertices, vp0   // [delay slot] get the first offset
+        mov     vz2, depth      // if (z2 > depth) depth = z2

 .face_add_fart:
-        // get absolute indices
-        // p address is 4 bytes ahead but it's fine for shlr3
-        // index = (p - vertices) / VERTEX_SIZEOF
-        sub     vertices, vp0
+        // offset = (p - vertices)
        sub     vertices, vp1
        sub     vertices, vp2
-        shlr2   vp0
-        shlr2   vp1
-        shlr2   vp2
-        shlr    vp0
-        shlr    vp1
-        shlr    vp2
-
-        // depth (vz0) >>= OT_SHIFT (4)
-        shlr2   depth
-        shlr2   depth

        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
-        mov.l   @depth, next
-        mov.l   face, @depth
+        mov.l   @(depth, ot), next
+        mov.l   face, @(depth, ot)

+        shll16  vp2
+        shll16  vp1
+        xtrct   vp0, vp1
+
+        mov.l   flags, @(0, face)
+        mov.l   next, @(4, face)
+        mov.l   vp1, @(8, face)
+        mov.l   vp2, @(12, face)
        add     #FACE_SIZEOF, face
-        mov     face, tmp
-        add     #-2, tmp        // skip 4th index
-
-        mov.w   vp2, @-tmp
-        mov.w   vp1, @-tmp
-        mov.w   vp0, @-tmp
-        mov.l   next, @-tmp
-        mov.l   flags, @-tmp
 .skip_fart:
        dt      count
        bf      .loop_fart
--- a/src/platform/32x/asm/rasterize.i
+++ b/src/platform/32x/asm/rasterize.i
@@ -40,8 +40,8 @@ _rasterize_asm:

 .align 2
 var_fb:
-        // overwrite image frame buffer address has the same
-        // write per but allow transparent write for byte & word
+        // overwrite image frame buffer address, it has the same
+        // write latency but allows transparent writes for byte & word
        .long 0x24020200
 var_table:
 #ifdef ON_CHIP_RENDER
--- a/src/platform/32x/asm/rasterizeF.i
+++ b/src/platform/32x/asm/rasterizeF.i
@@ -5,25 +5,22 @@
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define index   r6      // arg
-#define gtile   r7      // arg (unused)
-#define N       gtile
+#define h       r7
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
 #define Rdx     r11
 #define dup     r12     // const
 #define inv     r13
-#define divLUT  r14
+#define R       r14

-#define R       index
-#define h       N
+#define divLUT  inv

 #define Ry      inv
 #define Ly      inv

-#define Rptr    R
+#define Rptr    index

-#define iw      inv
 #define ih      inv
 #define LMAP    inv

@@ -38,7 +35,6 @@
        mov.l   @sp+, r9
        rts
        mov.l   @sp+, r8
-        nop

 .global _rasterizeF_asm
 _rasterizeF_asm:
@@ -63,37 +59,30 @@ _rasterizeF_asm:

        mov     L, R

-        mov.l   var_divTable_fs, divLUT
-
        mov     #0, Rh
-        mov     #0, Lh
-.loop_f:
-        tst     Lh, Lh
-        bf/s    .calc_left_end_f

 .calc_left_start_f:
        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
+        add     L, tmp          // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT)

-        mov.w   @L+, Lx
-        mov.w   @L+, Ly
+        mov.l   @L, Lx
+        extu.w  Lx, Ly
+        shlr16  Lx

-        mov     N, tmp
-        mov.w   @tmp+, Ldx
-        mov.w   @tmp+, Lh
+        mov.l   @tmp, Ldx
+        extu.w  Ldx, Lh
+        shlr16  Ldx

        cmp/ge  Ly, Lh
        bf/s    .exit_f
        cmp/eq  Ly, Lh          // [delay slot]
        bt/s    .calc_left_start_f      // if (L->v.y == N->v.y) check next vertex
-        mov     N, L            // [delay slot]
+        mov     tmp, L          // [delay slot]

        sub     Lx, Ldx
        sub     Ly, Lh

+        mov.l   var_divTable_fs, divLUT
        mov     Lh, tmp
        shll    tmp
        mov.w   @(tmp, divLUT), ih
@@ -104,31 +93,30 @@ _rasterizeF_asm:
 .calc_left_end_f:

        tst     Rh, Rh
-        bf/s    .calc_right_end_f
+        bf      .calc_right_end_f

 .calc_right_start_f:
-        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
+        mov.b   @(VERTEX_NEXT, R), tmp
+        add     R, tmp          // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT)

-        mov.w   @R+, Rx
-        mov.w   @R+, Ry
+        mov.l   @R, Rx
+        extu.w  Rx, Ry
+        shlr16  Rx

-        mov     N, tmp
-        mov.w   @tmp+, Rdx
-        mov.w   @tmp+, Rh
+        mov.l   @tmp, Rdx
+        extu.w  Rdx, Rh
+        shlr16  Rdx

        cmp/ge  Ry, Rh
        bf/s    .exit_f
        cmp/eq  Ry, Rh          // [delay slot]
        bt/s    .calc_right_start_f     // if (R->v.y == N->v.y) check next vertex
-        mov     N, R            // [delay slot]
+        mov     tmp, R          // [delay slot]

        sub     Rx, Rdx
        sub     Ry, Rh

+        mov.l   var_divTable_fs, divLUT
        mov     Rh, tmp
        shll    tmp
        mov.w   @(tmp, divLUT), ih
@@ -148,8 +136,6 @@ _rasterizeF_asm:
        sub     h, Lh
        sub     h, Rh

-        mov.l   R, @-sp
-        
 .scanline_start_f:
        mov     Lx, Lptr
        mov     Rx, Rptr
@@ -160,12 +146,6 @@ _rasterizeF_asm:
        cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
        bf/s    .scanline_end_f

-        // iw = divTable[Rptr - Lptr]
-        mov     Rptr, tmp       // [delay slot]
-        sub     Lptr, tmp
-        shll    tmp
-        mov.w   @(tmp, divLUT), iw
-
        add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
        add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)

@@ -178,10 +158,10 @@ _rasterizeF_asm:
        mov.b   dup, @Lptr
        add     #1, Lptr

-        mov     #1, tmp         // tmp = 1 (for align_right)
        cmp/gt  Lptr, Rptr
        bf/s    .scanline_end_f
        tst     tmp, Rptr
+        nop

 .align_right_f:
        bt      .block_2px_f
@@ -192,17 +172,20 @@ _rasterizeF_asm:
 .block_2px_f:
        mov.w   dup, @-Rptr
        cmp/gt  Lptr, Rptr
-        bt      .block_2px_f
+        bt/s    .block_2px_f
+        nop

 .scanline_end_f:
        dt      h

        mov.w   var_frameWidth_fs, tmp
        bf/s    .scanline_start_f
-        add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
+        add     tmp, pixel      // [delay slot] pixel += FRAME_WIDTH

-        bra     .loop_f
-        mov.l   @sp+, R
+        tst     Lh, Lh
+        bf      .calc_right_start_f
+        bra     .calc_left_start_f
+        nop

 #undef tmp
 #undef Lh
@@ -211,7 +194,6 @@ _rasterizeF_asm:
 #undef pixel
 #undef L
 #undef index
-#undef N
 #undef Lx
 #undef Rx
 #undef Ldx
@@ -224,6 +206,5 @@ _rasterizeF_asm:
 #undef Ry
 #undef Ly
 #undef Rptr
-#undef iw
 #undef ih
 #undef LMAP
--- a/src/platform/32x/asm/rasterizeFT.i
+++ b/src/platform/32x/asm/rasterizeFT.i
@@ -66,6 +66,7 @@
        mov.l   @sp+, r9
        rts
        mov.l   @sp+, r8
+        nop

 .global _rasterizeFT_asm
 _rasterizeFT_asm:
@@ -95,14 +96,13 @@ _rasterizeFT_asm:

        tst     Lh, Lh
        bf/s    .calc_left_end_ft
+        nop

 .calc_left_start_ft:
        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
        mov     tmp, N

        mov.w   @(VERTEX_Y, L), tmp
-        shll2   N
-        shll2   N
        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ly
        mov.w   @(VERTEX_Y, N), tmp
@@ -144,14 +144,13 @@ _rasterizeFT_asm:
        shlr16  Rh              // Rh = (Rh >> 16)
        tst     Rh, Rh
        bf/s    .calc_right_end_ft
+        nop

 .calc_right_start_ft:
        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
        mov     tmp, N

        mov.w   @(VERTEX_Y, R), tmp
-        shll2   N
-        shll2   N
        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ry
        mov.w   @(VERTEX_Y, N), tmp
@@ -206,7 +205,8 @@ _rasterizeFT_asm:
        mov.l   tmp, @(SP_H, sp)
        mov.l   L, @(SP_L, sp)
        mov.l   R, @(SP_R, sp)
-        
+        nop
+
 .scanline_start_ft:
        mov     Lx, Lptr
        mov     Rx, Rptr
@@ -263,15 +263,15 @@ _rasterizeFT_asm:

        cmp/gt  Lptr, Rptr
        bf/s    .scanline_end_ft
+        nop

 .block_prepare_ft:
        shll    dtdx            // [delay slot] optional
+        nop

 .block_2px_ft:
-        swap.b  t, index        // UUuuvvVV
-        swap.w  index, index    // vvVVUUuu
-        shll8   index           // VVUUuu00
-        shlr16  index           // 0000VVUU
+        getUV   t, index
+
        mov.b   @(index, TILE), index
        mov.b   @(index, LMAP), index

@@ -283,6 +283,7 @@ _rasterizeFT_asm:
        cmp/gt  Lptr, Rptr
        bt/s    .block_2px_ft
        sub     dtdx, t         // [delay slot] t -= dtdx
+        nop

 .scanline_end_ft:
        mov.l   @(SP_LDX, sp), sLdx
--- a/src/platform/32x/asm/rasterizeGT.i
+++ b/src/platform/32x/asm/rasterizeGT.i
@@ -93,8 +93,6 @@ _rasterizeGT_asm:
        add     #-SP_SIZE, sp

        mov     gtile, TILE
-        nop
-
        mov     #0, Rh

 .loop_gt:
@@ -102,14 +100,13 @@ _rasterizeGT_asm:

        tst     Lh, Lh
        bf/s    .calc_left_end_gt
+        shlr16  Rh              // [delay slot] Rh = (Rh >> 16)

 .calc_left_start_gt:
-        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
+        mov.b   @(VERTEX_PREV, L), tmp
        mov     tmp, N

        mov.w   @(VERTEX_Y, L), tmp
-        shll2   N
-        shll2   N
        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ly
        mov.w   @(VERTEX_Y, N), tmp
@@ -159,9 +156,9 @@ _rasterizeGT_asm:
        // calc Ldt
        scaleUV Ldt, tmp, ih
        mov.l   tmp, @(SP_LDT, sp)
+        nop
 .calc_left_end_gt:

-        shlr16  Rh              // Rh = (Rh >> 16)
        tst     Rh, Rh
        bf/s    .calc_right_end_gt

@@ -170,8 +167,6 @@ _rasterizeGT_asm:
        mov     tmp, N

        mov.w   @(VERTEX_Y, R), tmp
-        shll2   N
-        shll2   N
        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ry
        mov.w   @(VERTEX_Y, N), tmp
@@ -221,6 +216,7 @@ _rasterizeGT_asm:
        // calc Rdt
        scaleUV Rdt, tmp, ih
        mov.l   tmp, @(SP_RDT, sp)
+        nop
 .calc_right_end_gt:

        // bake gLightmap address into g value
@@ -233,6 +229,7 @@ _rasterizeGT_asm:
        bf/s    .scanline_prepare_gt
        mov     Lh, h           // [delay slot]
        mov     Rh, h
+        nop

 .scanline_prepare_gt:
        sub     h, Lh
@@ -330,10 +327,8 @@ _rasterizeGT_asm:
        shll    dgdx

 .block_2px_gt:
-        swap.b  t, index        // UUuuvvVV
-        swap.w  index, index    // vvVVUUuu
-        shll8   index           // VVUUuu00
-        shlr16  index           // 0000VVUU
+        getUV   t, index
+
        mov.b   @(index, TILE), index

        mov     g, LMAP
--- a/src/platform/32x/asm/rasterizeS.i
+++ b/src/platform/32x/asm/rasterizeS.i
@ -5,8 +5,7 @@
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define R       r6      // arg
-#define gtile   r7      // arg (unused)
-#define N       gtile
+#define h       r7
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
@@ -16,14 +15,12 @@
 #define divLUT  r14

 #define index   tmp
-#define h       N

 #define Ry      inv
 #define Ly      inv

-#define Rptr    R
+#define Rptr    inv

-#define iw      inv
 #define ih      inv

 .align 4
@@ -37,7 +34,6 @@
        mov.l   @sp+, r9
        rts
        mov.l   @sp+, r8
-        nop

 .global _rasterizeS_asm
 _rasterizeS_asm:
@@ -58,30 +54,25 @@ _rasterizeS_asm:
        mov.l   var_divTable_fs, divLUT

        mov     #0, Rh
-        mov     #0, Lh
-.loop_s:
-        tst     Lh, Lh
-        bf/s    .calc_left_end_s
+        nop

 .calc_left_start_s:
        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
+        add     L, tmp          // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT)

-        mov.w   @L+, Lx
-        mov.w   @L+, Ly
+        mov.l   @L, Lx
+        extu.w  Lx, Ly
+        shlr16  Lx

-        mov     N, tmp
-        mov.w   @tmp+, Ldx
-        mov.w   @tmp+, Lh
+        mov.l   @tmp, Ldx
+        extu.w  Ldx, Lh
+        shlr16  Ldx

        cmp/ge  Ly, Lh
        bf/s    .exit_s
        cmp/eq  Ly, Lh          // [delay slot]
        bt/s    .calc_left_start_s      // if (L->v.y == N->v.y) check next vertex
-        mov     N, L            // [delay slot]
+        mov     tmp, L          // [delay slot]

        sub     Lx, Ldx
        sub     Ly, Lh
@@ -96,27 +87,26 @@ _rasterizeS_asm:
 .calc_left_end_s:

        tst     Rh, Rh
-        bf/s    .calc_right_end_s
+        bf      .calc_right_end_s
+        nop

 .calc_right_start_s:
-        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
-        mov     tmp, N
-        shll2   N
-        shll2   N
-        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
+        mov.b   @(VERTEX_NEXT, R), tmp
+        add     R, tmp          // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT)

-        mov.w   @R+, Rx
-        mov.w   @R+, Ry
+        mov.l   @R, Rx
+        extu.w  Rx, Ry
+        shlr16  Rx

-        mov     N, tmp
-        mov.w   @tmp+, Rdx
-        mov.w   @tmp+, Rh
+        mov.l   @tmp, Rdx
+        extu.w  Rdx, Rh
+        shlr16  Rdx

        cmp/ge  Ry, Rh
        bf/s    .exit_s
        cmp/eq  Ry, Rh          // [delay slot]
        bt/s    .calc_right_start_s     // if (R->v.y == N->v.y) check next vertex
-        mov     N, R            // [delay slot]
+        mov     tmp, R          // [delay slot]

        sub     Rx, Rdx
        sub     Ry, Rh
@@ -135,13 +125,12 @@ _rasterizeS_asm:
        bf/s    .scanline_prepare_s
        mov     Lh, h           // [delay slot]
        mov     Rh, h
+        nop

 .scanline_prepare_s:
        sub     h, Lh
        sub     h, Rh

-        mov.l   R, @-sp
-        
 .scanline_start_s:
        mov     Lx, Lptr
        mov     Rx, Rptr
@@ -152,14 +141,8 @@ _rasterizeS_asm:
        cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
        bf/s    .scanline_end_s

-        // iw = divTable[Rptr - Lptr]
-        mov     Rptr, tmp       // [delay slot]
-        sub     Lptr, tmp
-        shll    tmp
-        mov.w   @(tmp, divLUT), iw
-
-        add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
-        add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
+        add     pixel, Lptr     // Lptr = pixel + (Lx >> 16)
+        add     pixel, Rptr     // Rptr = pixel + (Rx >> 16)

 .shade_pixel_s:
        mov.b   @Lptr, index
@@ -174,10 +157,12 @@ _rasterizeS_asm:

        mov.w   var_frameWidth_fs, tmp
        bf/s    .scanline_start_s
-        add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
+        add     tmp, pixel      // [delay slot] pixel += FRAME_WIDTH

-        bra     .loop_s
-        mov.l   @sp+, R
+        tst     Lh, Lh
+        bf      .calc_right_start_s
+        bra     .calc_left_start_s
+        nop

 #undef tmp
 #undef Lh
@@ -186,7 +171,6 @@ _rasterizeS_asm:
 #undef pixel
 #undef L
 #undef R
-#undef N
 #undef Lx
 #undef Rx
 #undef Ldx
@@ -199,5 +183,4 @@ _rasterizeS_asm:
 #undef Ry
 #undef Ly
 #undef Rptr
-#undef iw
 #undef ih
--- a/src/platform/32x/asm/transformMesh.i
+++ b/src/platform/32x/asm/transformMesh.i
@ -78,10 +78,10 @@ _transformMesh_asm:
        // pre-transform the matrix offset
        add     #M03, m
        mov.w   @m+, mx
-        shll16  mx
        mov.w   @m+, my
-        shll16  my
        mov.w   @m+, mz
+        shll16  mx
+        shll16  my
        shll16  mz
        add     #-MATRIX_SIZEOF, m

@ -99,22 +99,24 @@ _transformMesh_asm:

        // z clipping
 .clip_z_near_m:
-        mov     #VIEW_MIN, minZ // 64
+        mov     #VIEW_MIN, minZ
        cmp/gt  z, minZ
        bf/s    .clip_z_far_m
        cmp/ge  maxZ, z         // [delay slot]
        mov     minZ, z
-        add     #CLIP_NEAR, vg
+        add     #CLIP_PLANE, vg
 .clip_z_far_m:
-        bf/s    .project_m
-        mov     z, dz           // [delay slot] dz = z
+        bf      .project_m
        mov     maxZ, z
-        add     #CLIP_FAR, vg
+        add     #CLIP_PLANE, vg

 .project_m:
-        // dz = divTable[z >> (PROJ_SHIFT = 4)]
-        shlr2   dz
-        shlr2   dz
+        // z >>= OT_SHIFT
+        shlr2   z
+        shlr2   z
+
+        // dz = divTable[z]
+        mov     z, dz
        shll    dz
        mov.w   @(dz, divLUT), dz

--- a/src/platform/32x/asm/transformRoom.i
+++ b/src/platform/32x/asm/transformRoom.i
@ -4,9 +4,9 @@
 #define res             r3
 #define vertices        r4      // arg
 #define count           r5      // arg
-#define stackVtx        r6
-#define stackMtx        r7
-#define vp              r8
+#define vp              r6
+#define m               r7
+#define vg              r8
 #define x               r9
 #define y               r10
 #define z               r11
@ -18,13 +18,14 @@
 #define minY            tmp
 #define maxX            tmp
 #define maxY            tmp
-#define minZ            tmp
+#define minZ            x
 #define dz              tmp
-#define vg              stackVtx
-#define fog             stackMtx
-#define cnt             stackVtx
+#define stackVtx        tmp
+#define fog             x
+#define minFog          y
+#define maxG            y

-#define SP_SIZE         (18 + 6)        // mat3x3 + vec3
+#define SP_SIZE         (8)        // vec3s + padding

 .align 4
 .global _transformRoom_asm
@ -37,7 +38,6 @@ _transformRoom_asm:
        mov.l   r12, @-sp
        mov.l   r13, @-sp
        mov.l   r14, @-sp
-        mov     sp, stackMtx 
        add     #-SP_SIZE, sp

        mov.l   var_viewportRel, vp
@ -49,139 +49,111 @@ _transformRoom_asm:

        // store matrix into stack (in reverse order)
        mov.l   var_gMatrixPtr, tmp
-        mov.l   @tmp, tmp
+        mov.l   @tmp, m

-        // copy 3x3 matrix rotation part
-        mov     #9, cnt
-.copyMtx_r:
-        mov.w   @tmp+, mx
-        dt      cnt
-        bf/s    .copyMtx_r
-        mov.w   mx, @-stackMtx  // [delay slot]
-
-        // prepare offsets (const)
-        mov.w   @tmp+, mx
-        mov.w   @tmp+, my
-        mov.w   @tmp+, mz
+        // pre-transform the matrix offset
+        add     #M03, m
+        mov.w   @m+, mx
+        mov.w   @m+, my
+        mov.w   @m+, mz
        shll8   mx
        shll8   my
        shll8   mz
+        add     #-12, m         // offset to z-row
+
+        // maxZ = VIEW_MAX = (1024 * 10) >> OT_SHIFT = (40 << 8) >> OT_SHIFT
+        mov     #40, maxZ
+        shll2   maxZ
+        shll2   maxZ

        add     #8, res         // extra offset for @-Rn
-        nop

 .loop_r:
        // unpack vertex
        mov.b   @vertices+, x
        mov.b   @vertices+, y
        mov.b   @vertices+, z
-
        shll2   x
        shll2   y
        shll2   z

-        // upload vertex coords into stack (in reverse order)
+        // upload vertex coords into stack
        mov     sp, stackVtx
        add     #6, stackVtx
-        mov     stackVtx, stackMtx

-        //shll16  x
-        //xtrct   y, x
-        mov.w   x, @-stackVtx
-        mov.w   y, @-stackVtx
        mov.w   z, @-stackVtx
+        mov.w   y, @-stackVtx
+        mov.w   x, @-stackVtx

-        //transform z
+.transform_z:
        lds     mz, MACL
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
        sts     MACL, z
          add     #-6, stackVtx
+          add     #-18, m       // offset to x-row
        shlr8   z
+
+        // z >>= OT_SHIFT
+        shlr2   z
+        shlr2   z
+
        exts.w  z, z

-
-        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
-        // tmp = z + VIEW_OFF = z + 4096
-        mov     #16, tmp
-        shll8   tmp
-        add     z, tmp
-        // maxZ = VIEW_OFF + VIEW_MAX + VIEW_OFF = 18432
-        mov     #72, maxZ
-        shll8   maxZ
-        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
-        cmp/hi  maxZ, tmp
-        bf/s    .visible_r
-        mov     #40, maxZ       // [delay slot] maxZ = 40
-        mov     #(CLIP_NEAR + CLIP_FAR), vg
-        mov.w   vg, @-res
-        add     #1, vertices
-        dt      count
-        bf/s    .loop_r
-        add     #10, res        // [delay slot]
-        bra     .done_r
-        nop
-
-.visible_r:
-        //transform y
-        lds     my, MACL
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        sts     MACL, y
-          add     #-6, stackVtx
-        shlr8   y
-        exts.w  y, y
-
-        //transform x
-        lds     mx, MACL
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        mac.w   @stackVtx+, @stackMtx+
-        sts     MACL, x
-          shll8   maxZ  // maxZ = VIEW_MAX = (1024 * 10) = (40 << 8)
-        shlr8   x
-        exts.w  x, x
-
-        mov.b   @vertices+, vg
-
-        // tmp = FOG_MIN = 6144 = (24 << 8)
-        mov     #24, tmp
-        shll8   tmp
+.calc_fog:
        // if z <= FOG_MIN -> skip fog calc
-        cmp/gt  tmp, z
-        bf/s    .clip_z_near_r
-        mov     z, fog          // [delay slot]
-        sub     tmp, fog        // fog = z - FOG_MIN
-        shll    fog             // FOG_SHIFT
-        shlr8   fog             // shift down to 0..31 range
+        mov     #(32 >> OT_SHIFT), minFog // minFog = FOG_MIN >> OT_SHIFT
+        shll8   minFog          // (32 << 8) >> OT_SHIFT
+        mov     z, fog
+        subc    minFog, fog     // fog = z - minFog - T, T = borrow; TODO T is not cleared here and nothing between .loop_r and this point writes it - confirm it is 0 on loop entry
+        bt/s    .clip_z_near_r  // borrow (z below minFog, unsigned compare): skip the fog calc
+        mov.b   @vertices+, vg  // [delay slot]
+        shlr2   fog
+        shlr    fog             // shift down to 0..31 range
        add     fog, vg
        // vg = min(vg, 31)
-        mov     #31, tmp
-        cmp/gt  tmp, vg
+        mov     #31, maxG
+        cmp/gt  maxG, vg
        bf      .clip_z_near_r
        mov     #31, vg

        // z clipping
 .clip_z_near_r:
        add     #1, vg          // +1 for signed lightmap fetch
-        mov     #VIEW_MIN, minZ // minZ = VIEW_MIN = 64
+        mov     #(VIEW_MIN >> OT_SHIFT), minZ
        cmp/gt  z, minZ
        bf/s    .clip_z_far_r
        shll8   vg              // [delay slot] clear lower 8-bits of vg for clipping flags
        mov     minZ, z
-        add     #CLIP_NEAR, vg
+        add     #CLIP_PLANE, vg
 .clip_z_far_r:
        cmp/ge  maxZ, z
-        bf/s    .project_r
-        mov     z, dz           // [delay slot]
+        bf      .transform_x
        mov     maxZ, z
-        add     #CLIP_FAR, vg
+        add     #CLIP_PLANE, vg

-.project_r: // dz = divTable[z >> (PROJ_SHIFT = 4)]
-        shlr2   dz
-        shlr2   dz
+.transform_x:
+        lds     mx, MACL
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        sts     MACL, x
+          add     #-6, stackVtx
+        shlr8   x
+        exts.w  x, x
+
+.transform_y:
+        lds     my, MACL        // seed the accumulator with the pre-shifted y offset
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        mac.w   @stackVtx+, @m+
+        sts     MACL, y
+          mov     z, dz         // dz = z; not a branch delay slot (dz and stackVtx both alias tmp, stackVtx is no longer needed)
+        shlr8   y
+        exts.w  y, y
+
+.project_r: // dz = divTable[z]
        shll    dz
        mov.w   @(dz, divLUT), dz

@ -266,7 +238,6 @@ _transformRoom_asm:
 #undef vertices
 #undef count
 #undef stackVtx
-#undef stackMtx
 #undef vp
 #undef x
 #undef y
@ -282,5 +253,4 @@ _transformRoom_asm:
 #undef dz
 #undef vg
 #undef fog
-#undef cnt
 #undef SP_SIZE
--- a/src/platform/32x/rasterizer.h
+++ b/src/platform/32x/rasterizer.h
@ -132,7 +132,7 @@ extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLin
                }
            }

-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);

            Lx += Ldx;
            Rx += Rdx;
@ -239,7 +239,7 @@ extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLin
                }
            }

-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);

            Lx += Ldx;
            Rx += Rdx;
@ -380,7 +380,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
            #endif
            }

-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);

            Lx += Ldx;
            Rx += Rdx;
@ -570,7 +570,7 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
            #endif
            }

-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);

            Lx += Ldx;
            Rx += Rdx;
@ -604,7 +604,7 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert

    if (L->v.y < 0)
    {
-        pixel -= L->v.y * VRAM_WIDTH;
+        pixel -= L->v.y * (FRAME_WIDTH >> 1);
        v -= L->v.y * dv;
        h += L->v.y;
    }
--- a/src/platform/32x/render.cpp
+++ b/src/platform/32x/render.cpp
@ -65,9 +65,8 @@ enum ClipFlags {
    CLIP_RIGHT   = 1 << 2,
    CLIP_TOP     = 1 << 3,
    CLIP_BOTTOM  = 1 << 4,
-    CLIP_FAR     = 1 << 5,
-    CLIP_NEAR    = 1 << 6,
-    CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_FAR | CLIP_NEAR),
+    CLIP_PLANE   = 1 << 5,
+    CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_PLANE)
 };

 const MeshQuad gShadowQuads[] = {
@ -183,12 +182,12 @@ void transformRoom_c(const RoomVertex* vertices, int32 count)
        uint32 clip = 0;

        if (z <= VIEW_MIN_F) {
-            clip = CLIP_NEAR;
+            clip = CLIP_PLANE;
            z = VIEW_MIN_F;
        }

        if (z >= VIEW_MAX_F) {
-            clip = CLIP_FAR;
+            clip = CLIP_PLANE;
            z = VIEW_MAX_F;
        }

@ -330,12 +329,12 @@ void transformMesh_c(const MeshVertex* vertices, int32 count, int32 intensity)
        uint32 clip = 0;

        if (z <= (VIEW_MIN_F >> FIXED_SHIFT)) {
-            clip = CLIP_NEAR;
+            clip = CLIP_PLANE;
            z = VIEW_MIN_F >> FIXED_SHIFT;
        }

        if (z >= (VIEW_MAX_F >> FIXED_SHIFT)) {
-            clip = CLIP_FAR;
+            clip = CLIP_PLANE;
            z = VIEW_MAX_F >> FIXED_SHIFT;
        }

@ -598,25 +597,25 @@ int32 sphereIsVisible_c(int32 sx, int32 sy, int32 sz, int32 r)

 void flush_ot(int32 bit)
 {
-    VertexLink v[4 + 3];
+    VertexLink v[4 + 4];
    VertexLink* q = v;
    VertexLink* t = v + 4;
    // quad
-    q[0].prev = 3;
-    q[0].next = 1;
-    q[1].prev = -1;
-    q[1].next = 1;
-    q[2].prev = -1;
-    q[2].next = 1;
-    q[3].prev = -1;
-    q[3].next = -3;
+    q[0].prev = (3 << 4);
+    q[0].next = (1 << 4);
+    q[1].prev = -(1 << 4);
+    q[1].next = (1 << 4);
+    q[2].prev = -(1 << 4);
+    q[2].next = (1 << 4);
+    q[3].prev = -(1 << 4);
+    q[3].next = -(3 << 4);
    // triangle
-    t[0].prev = 2;
-    t[0].next = 1;
-    t[1].prev = -1;
-    t[1].next = 1;
-    t[2].prev = -1;
-    t[2].next = -2;
+    t[0].prev = (2 << 4);
+    t[0].next = (1 << 4);
+    t[1].prev = -(1 << 4);
+    t[1].next = (1 << 4);
+    t[2].prev = -(1 << 4);
+    t[2].next = -(2 << 4);

    int32 index = 0;
    const ColorIndex* tile = NULL;
@ -654,12 +653,29 @@ void flush_ot(int32 bit)
                    ptr[3].t.t = 0xFF00FF00 & (tex.uv23 << 8);
                }

-                ptr[0].v = gVertices[face->indices[0]];
-                ptr[1].v = gVertices[face->indices[1]];
-                ptr[2].v = gVertices[face->indices[2]];
+            #if 1 // fast path: face->indices[] are used as byte offsets into gVertices
+                uint8* vPtr = (uint8*)gVertices; // byte-addressed base; NOTE(review): the uint32 copies below assume 4-byte aligned source and destination - confirm
+                ((uint32*)&ptr[0].v)[0] = ((uint32*)(vPtr + face->indices[0]))[0];
+                ((uint32*)&ptr[0].v)[1] = ((uint32*)(vPtr + face->indices[0]))[1];
+
+                ((uint32*)&ptr[1].v)[0] = ((uint32*)(vPtr + face->indices[1]))[0];
+                ((uint32*)&ptr[1].v)[1] = ((uint32*)(vPtr + face->indices[1]))[1];
+
+                ((uint32*)&ptr[2].v)[0] = ((uint32*)(vPtr + face->indices[2]))[0];
+                ((uint32*)&ptr[2].v)[1] = ((uint32*)(vPtr + face->indices[2]))[1];
+
                if (!(flags & FACE_TRIANGLE)) {
-                    ptr[3].v = gVertices[face->indices[3]];
+                    ((uint32*)&ptr[3].v)[0] = ((uint32*)(vPtr + face->indices[3]))[0];
+                    ((uint32*)&ptr[3].v)[1] = ((uint32*)(vPtr + face->indices[3]))[1];
                }
+            #else // reference path: the same copy via array indexing (>> 3 turns the byte offset back into an index, presumably 8 bytes per vertex)
+                ptr[0].v = gVertices[face->indices[0] >> 3];
+                ptr[1].v = gVertices[face->indices[1] >> 3];
+                ptr[2].v = gVertices[face->indices[2] >> 3];
+                if (!(flags & FACE_TRIANGLE)) {
+                    ptr[3].v = gVertices[face->indices[3] >> 3];
+                }
+            #endif

                if (flags & FACE_CLIPPED) {
                    drawPoly(flags, ptr, tile);
@ -855,10 +871,10 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde
    bool skip = (first->v.y == last->v.y);

    VertexLink* top = (first->v.y < last->v.y) ? first : last;
-    first->prev = count - 1;
-    first->next = 1;
-    last->prev = -1;
-    last->next = 1 - count;
+    first->prev = (count - 1) << 4;     // links are relative offsets scaled by 16 (<< 4)
+    first->next = (1 << 4);
+    last->prev = -(1 << 4);
+    last->next = -((count - 1) << 4);   // negate after the shift: left-shifting a negative value is undefined before C++20

    for (int32 i = 1; i < count - 1; i++)
    {
@ -873,8 +889,8 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde
            skip = false;
        }

-        p->prev = -1;
-        p->next = 1;
+        p->prev = -(1 << 4);
+        p->next = (1 << 4);
    }

    if (skip)
@ -910,7 +926,7 @@ void clear()
    MARS_SYS_COMM4 = MARS_CMD_CLEAR;
 }

-void renderRoom(const Room* room)
+void renderRoom(Room* room)
 {
    int32 vCount = room->info->verticesCount;
    if (vCount <= 0)
@ -1225,14 +1241,8 @@ const int32 BAR_COLORS[BAR_MAX][5] = {
    { 43, 44, 43, 42, 41 },
 };

-X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 shade, int32 color1, int32 color2, int32 z)
+X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 color1, int32 color2, int32 z)
 {
-    // background
-    if (shade >= 0) {
-        renderFill(x + 1, y + 1, width - 2, height - 2, shade, z);
-    }
-
-    // frame
    renderLine(x + 1, y, width - 2, 1, color1, z);
    renderLine(x + 1, y + height - 1, width - 2, 1, color2, z);
    renderLine(x, y, 1, height, color1, z);
@ -1242,9 +1252,9 @@ X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32
 void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type)
 {
    // colored bar
-    int32 ix = x + 2;
-    int32 iy = y + 2;
-    int32 w = value * width >> 8;
+    int32 ix = x + 1;
+    int32 iy = y + 1;
+    int32 w = value* width >> 8;

    if (w > 0)
    {
@ -1254,7 +1264,12 @@ void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type)
        }
    }

-    renderBorder(x, y, width + 4, BAR_HEIGHT + 4, 27, 19, 17, 0);
+    if (w < width)
+    {
+        renderFill(x + 1 + w, y + 1, width - w, BAR_HEIGHT, 27, 0);
+    }
+
+    renderBorder(x, y, width + 2, BAR_HEIGHT + 2, 19, 17, 0);
 }

 void renderBackground(const void* background)