#407 32X optimizations, increase fog distance (2 blocks)

2025-07-12 09:16:18 +02:00 · 2022-12-24 11:23:42 +03:00
parent d268754786
commit 4e9b92e5a4
15 changed files with 392 additions and 465 deletions
--- a/src/fixed/common.h
+++ b/src/fixed/common.h
@ -637,9 +637,12 @@ struct Matrix
 struct RoomQuad
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint16 indices[4];
 #elif defined(__32X__)
    uint32 flags;
    int8 indices[4];
 #else
    int8 indices[4];
    uint16 flags;
@ -649,9 +652,12 @@ struct RoomQuad
 struct RoomTriangle
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint16 indices[4];
 #elif defined(__32X__)
    uint16 flags;
    uint16 indices[3];
 #else
    uint16 indices[3];
    uint16 flags;
@ -660,9 +666,12 @@ struct RoomTriangle
 struct MeshQuad
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint32 indices;
 #elif defined(__32X__)
    uint16 flags;
    uint8  indices[4];
 #else
    int8 indices[4];
    uint16 flags;
@ -672,9 +681,12 @@ struct MeshQuad
 struct MeshTriangle
 {
-#ifdef __3DO__
+#if defined(__3DO__)
    uint32 flags;
    uint32 indices;
 #elif defined(__32X__)
    uint16 flags;
    uint8  indices[4];
 #else
    int8 indices[4];
    uint16 flags;
@ -743,7 +755,7 @@ struct Face
 {
    uint32 flags;
    Face* next;
-    uint16 indices[4];
+    int16 indices[4];
 };
 #endif
--- a/src/platform/32x/asm/common.i
+++ b/src/platform/32x/asm/common.i
@ -54,22 +54,20 @@
 #define FACE_SIZEOF             16
-#define VIEW_DIST       (1024 * 10)   // max = DIV_TABLE_END << PROJ_SHIFT
+#define VIEW_MIN        64
-#define FOG_SHIFT       1
+#define VIEW_MAX        (10 << 10)
-#define FOG_MAX         VIEW_DIST
+#define FOG_SHIFT       4
-#define FOG_MIN         (FOG_MAX - (8192 >> FOG_SHIFT))
+#define FOG_MIN         (VIEW_MAX - 2048)
-#define VIEW_MIN        (64)
+
-#define VIEW_MAX        (VIEW_DIST)
+#define OT_SHIFT        4
 #define VIEW_OFF        4096
 #define CLIP_FRAME      (1 << 0)
 #define CLIP_LEFT       (1 << 1)
 #define CLIP_RIGHT      (1 << 2)
 #define CLIP_TOP        (1 << 3)
 #define CLIP_BOTTOM     (1 << 4)
-#define CLIP_FAR        (1 << 5)
+#define CLIP_PLANE      (1 << 5)
-#define CLIP_NEAR       (1 << 6)
+#define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_PLANE)
 #define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_FAR + CLIP_NEAR)
 #define VP_MINX         0
 #define VP_MINY         4
--- a/src/platform/32x/asm/faceAddMeshQuads.i
+++ b/src/platform/32x/asm/faceAddMeshQuads.i
@ -26,9 +26,9 @@
 #define vz2         vg2
 #define vz3         vg3
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
-#define ot          tmp
+#define ot          vg0
 .align 4
 .global _faceAddMeshQuads_asm
@ -43,26 +43,30 @@ _faceAddMeshQuads_asm:
        mov.l   r14, @-sp
        mov.l   var_gVertices_fam, vertices
        add     #VERTEX_Z, vertices
        mov.l   var_gVerticesBase_fam, vp
        mov.l   @vp, vp
        mov.l   var_gFacesBase_fam, face
        mov.l   @face, face
        nop
 .loop_famq:
        // read flags and indices
        mov.w   @polys+, flags
-        mov.b   @polys+, vp0
+        mov.w   @polys+, vp0
-        mov.b   @polys+, vp1
+        mov.w   @polys+, vp2
        mov.b   @polys+, vp2
        mov.b   @polys+, vp3
-        extu.w  flags, flags
+        extu.w  flags, flags // TODO packer free high bit
        extu.b  vp0, vp1
        shlr8   vp0
        extu.b  vp0, vp0
-        extu.b  vp1, vp1
+
        extu.b  vp2, vp3
        shlr8   vp2
        extu.b  vp2, vp2
        extu.b  vp3, vp3
        // p = gVerticesBase + index * VERTEX_SIZEOF
        shll2   vp0
@ -111,50 +115,40 @@ _faceAddMeshQuads_asm:
        or      tmp, flags
 .avg_z4_famq:
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
        mov.w   @vp1, vz1
        mov.w   @vp2, vz2
        mov.w   @vp3, vz3
-        add     vz1, vz0
+        add     vz1, depth
-        add     vz2, vz0
+        add     vz2, depth
-        add     vz3, vz0
+        add     vz3, depth
-        shlr2   vz0             // div by 4
+        shlr2   depth           // depth /= 4
        mov.l   var_gOT_fam, ot
 .face_add_famq:
-        // index = (p - vertices) / VERTEX_SIZEOF
+        // offset = (p - vertices)
        sub     vertices, vp0
        sub     vertices, vp1
        sub     vertices, vp2
        sub     vertices, vp3
        shlr2   vp0
        shlr2   vp1
        shlr2   vp2
        shlr2   vp3
        shlr    vp0
        shlr    vp1
        shlr    vp2
        shlr    vp3
        // depth (vz0) >>= OT_SHIFT (4)
        shlr2   depth
        shlr2   depth
        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
+        mov.l   @(depth, ot), next
-        mov.l   @depth, next
+        mov.l   face, @(depth, ot)
        mov.l   face, @depth
        shll16  vp3
        xtrct   vp2, vp3
        shll16  vp1
        xtrct   vp0, vp1
        mov.l   flags, @(0, face)
        mov.l   next, @(4, face)
        mov.l   vp1, @(8, face)
        mov.l   vp3, @(12, face)
        add     #FACE_SIZEOF, face
-        mov     face, tmp
+        nop
        mov.w   vp3, @-tmp
        mov.w   vp2, @-tmp
        mov.w   vp1, @-tmp
        mov.w   vp0, @-tmp
        mov.l   next, @-tmp
        mov.l   flags, @-tmp
 .skip_famq:
        dt      count
        bf      .loop_famq
--- a/src/platform/32x/asm/faceAddMeshTriangles.i
+++ b/src/platform/32x/asm/faceAddMeshTriangles.i
@ -25,7 +25,7 @@
 #define vz1         vg1
 #define vz2         vg2
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
 .align 4
@ -41,6 +41,7 @@ _faceAddMeshTriangles_asm:
        mov.l   r14, @-sp
        mov.l   var_gVertices_fam, vertices
        add     #VERTEX_Z, vertices
        mov.l   var_gVerticesBase_fam, vp
        mov.l   @vp, vp
@ -49,19 +50,20 @@ _faceAddMeshTriangles_asm:
        mov.l   @face, face
        mov.l   var_gOT_fam, ot
        nop
 .loop_famt:
        // read flags and indices
        mov.w   @polys+, flags
-        mov.b   @polys+, vp0
+        mov.w   @polys+, vp0
-        mov.b   @polys+, vp1
+        mov.w   @polys+, vp2
        mov.b   @polys+, vp2
        add     #1, polys       // skip 4th index
-        extu.w  flags, flags
+        extu.w  flags, flags // TODO packer free high bit
        extu.b  vp0, vp1
        shlr8   vp0
        extu.b  vp0, vp0
-        extu.b  vp1, vp1
+
        shlr8   vp2
        extu.b  vp2, vp2
        // p = gVerticesBase + index * VERTEX_SIZEOF
@ -80,11 +82,9 @@ _faceAddMeshTriangles_asm:
        // check_backface
        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
        bt/s    .skip_famt
        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
        or      tmp, flags
        // fetch clip masks
-        mov     #(VERTEX_CLIP - 4), tmp
+        mov     #(VERTEX_CLIP - 4), tmp         // [delay slot]
        mov.b   @(tmp, vp0), vg0
        mov.b   @(tmp, vp1), vg1
        mov.b   @(tmp, vp2), vg2
@ -95,8 +95,11 @@ _faceAddMeshTriangles_asm:
        tst     #CLIP_DISCARD, tmp
        bf/s    .skip_famt
        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
        or      tmp, flags
        // mark if should be clipped by frame
-        mov     vg0, tmp        // [delay slot]
+        mov     vg0, tmp
        or      vg1, tmp
        or      vg2, tmp
        tst     #CLIP_FRAME, tmp
@ -105,44 +108,35 @@ _faceAddMeshTriangles_asm:
        or      tmp, flags
 .avg_z3_famt:
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
        mov.w   @vp1, vz1
        mov.w   @vp2, vz2
-        add     vz1, vz0
+        add     vz1, depth
-        add     vz2, vz0
+        add     vz2, depth
-        add     vz2, vz0        // approx.
+        add     vz2, depth      // approx.
-        shlr2   vz0             // div by 4
+        shlr2   depth           // depth /= 4
 .face_add_famt:
-        // index = (p - vertices) / VERTEX_SIZEOF
+        // offset = (p - vertices)
        sub     vertices, vp0
        sub     vertices, vp1
        sub     vertices, vp2
        shlr2   vp0
        shlr2   vp1
        shlr2   vp2
        shlr    vp0
        shlr    vp1
        shlr    vp2
        // depth (vz0) >>= OT_SHIFT (4)
        shlr2   depth
        shlr2   depth
        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
+        mov.l   @(depth, ot), next
-        mov.l   @depth, next
+        mov.l   face, @(depth, ot)
        mov.l   face, @depth
        shll16  vp2
        shll16  vp1
        xtrct   vp0, vp1
        mov.l   flags, @(0, face)
        mov.l   next, @(4, face)
        mov.l   vp1, @(8, face)
        mov.l   vp2, @(12, face)
        add     #FACE_SIZEOF, face
-        mov     face, tmp
+        nop
        add     #-2, tmp        // skip 4th index
        mov.w   vp2, @-tmp
        mov.w   vp1, @-tmp
        mov.w   vp0, @-tmp
        mov.l   next, @-tmp
        mov.l   flags, @-tmp
 .skip_famt:
        dt      count
        bf      .loop_famt
--- a/src/platform/32x/asm/faceAddRoomQuads.i
+++ b/src/platform/32x/asm/faceAddRoomQuads.i
@ -26,9 +26,9 @@
 #define vz2         vg2
 #define vz3         vg3
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
-#define ot          tmp
+#define ot          vg0
 .align 4
 .global _faceAddRoomQuads_asm
@ -43,34 +43,44 @@ _faceAddRoomQuads_asm:
        mov.l   r14, @-sp
        mov.l   var_gVertices_far, vertices
        add     #VERTEX_Z, vertices
        mov.l   var_gVerticesBase_far, vp
        mov.l   @vp, vp
        mov.l   var_gFacesBase_far, face
        mov.l   @face, face
        nop
 .loop_farq:
        // read flags and indices
-        mov.w   @polys+, flags
+        mov.l   @polys+, flags
-        mov.w   @polys+, vp0
+        mov.l   @polys+, vp0
        mov.w   @polys+, vp1
        mov.w   @polys+, vp2
        mov.w   @polys+, vp3
        extu.w  flags, flags
        // indices never exceed 32k, no need for extu.w
-        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
+        exts.b  vp0, vp3
        shlr8   vp0
        exts.b  vp0, vp2
        shlr8   vp0
        exts.b  vp0, vp1
        shlr8   vp0
        exts.b  vp0, vp0
        // index *= 8 (VERTEX_SIZEOF)
        shll2   vp0
        shll2   vp1
        shll2   vp2
        shll2   vp3
        shll    vp0
        shll    vp1
        shll    vp2
        shll    vp3
        // get vertex address
        add     vp, vp0
-        add     vp, vp1
+        add     vp0, vp1
-        add     vp, vp2
+        add     vp1, vp2
-        add     vp, vp3
+        add     vp2, vp3
        mov     vp3, vp
        // fetch ((g << 8) | clip)
        mov     #VERTEX_G, tmp
@ -116,59 +126,45 @@ _faceAddRoomQuads_asm:
        add     #VERTEX_Z, vp3  // [delay slot] ccw shifts p[0..2] address to VERTEX_Z, shift p3 too
        // max_z4
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth
        mov.w   @vp1, vz1 
        // check_z1
-        cmp/gt  vz0, vz1
+        cmp/gt  depth, vz1
        bf/s    3f
        mov.w   @vp2, vz2       // [delay slot]
-        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+        mov     vz1, depth      // if (z1 > z0) z0 = z1
 3:      // check_z2
-        cmp/gt  vz0, vz2
+        cmp/gt  depth, vz2
        bf/s    4f
        mov.w   @vp3, vz3       // [delay slot]
-        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+        mov     vz2, depth      // if (z2 > z0) z0 = z2
 4:      // check_z3
-        cmp/gt  vz0, vz3
+        cmp/gt  depth, vz3
-        bf      .face_add_farq  // TODO use delay slot but not for OT! )
+        bf/s    .face_add_farq
-        mov     vz3, vz0        // if (z3 > z0) z0 = z3
+        sub     vertices, vp0   // [delay slot] get the first offset
        mov     vz3, depth      // if (z3 > z0) z0 = z3
 .face_add_farq:
-        mov.l   var_gOT_far, ot // [delay slot]
+        mov.l   var_gOT_far, ot
-        // get absolute indices
+        // offset = (p - vertices)
        // p address is 4 bytes ahead but it's fine for shlr3
        // index = (p - vertices) / VERTEX_SIZEOF
        sub     vertices, vp0
        sub     vertices, vp1
        sub     vertices, vp2
        sub     vertices, vp3
        shlr2   vp0
        shlr2   vp1
        shlr2   vp2
        shlr2   vp3
        shlr    vp0
        shlr    vp1
        shlr    vp2
        shlr    vp3
        // depth (vz0) >>= OT_SHIFT (4)
        shlr2   depth
        shlr2   depth
        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
+        mov.l   @(depth, ot), next
-        mov.l   @depth, next
+        mov.l   face, @(depth, ot)
        mov.l   face, @depth
        shll16  vp3
        xtrct   vp2, vp3
        shll16  vp1
        xtrct   vp0, vp1
        mov.l   flags, @(0, face)
        mov.l   next, @(4, face)
        mov.l   vp1, @(8, face)
        mov.l   vp3, @(12, face)
        add     #FACE_SIZEOF, face
        mov     face, tmp
        mov.w   vp3, @-tmp
        mov.w   vp2, @-tmp
        mov.w   vp1, @-tmp
        mov.w   vp0, @-tmp
        mov.l   next, @-tmp
        mov.l   flags, @-tmp
 .skip_farq:
        dt      count
        bf      .loop_farq
--- a/src/platform/32x/asm/faceAddRoomTriangles.i
+++ b/src/platform/32x/asm/faceAddRoomTriangles.i
@ -25,7 +25,7 @@
 #define vz1         vg1
 #define vz2         vg2
-#define depth       vg0     // == vz0
+#define depth       tmp
 #define next        vg1
 .align 4
@ -41,6 +41,7 @@ _faceAddRoomTriangles_asm:
        mov.l   r14, @-sp
        mov.l   var_gVertices_far, vertices
        add     #VERTEX_Z, vertices
        mov.l   var_gVerticesBase_far, vp
        mov.l   @vp, vp
@ -49,21 +50,19 @@ _faceAddRoomTriangles_asm:
        mov.l   @face, face
        mov.l   var_gOT_far, ot
        nop
 .loop_fart:
        // read flags and indices
-        mov.w   @polys+, flags
+        mov.l   @polys+, flags
-        mov.w   @polys+, vp0
+        mov.l   @polys+, vp1
        mov.w   @polys+, vp1
        mov.w   @polys+, vp2
        extu.w  flags, flags
        // indices never exceed 32k, no need for extu.w
-        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
+        extu.w  flags, vp0
-        shll2   vp0
+        shlr16  flags
-        shll2   vp1
+
-        shll2   vp2
+        extu.w  vp1, vp2
        shlr16  vp1
        // vp[0..2] already multiplied by VERTEX_SIZEOF
        // get vertex address
        add     vp, vp0
@ -90,7 +89,7 @@ _faceAddRoomTriangles_asm:
        or      vg2, tmp
        tst     #CLIP_FRAME, tmp
        bt/s    1f
-        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot]
+        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot] mov #1, tmp; rotr x2
        or      tmp, flags
 1:      // compare VERTEX_G for gouraud rasterization
@ -100,60 +99,47 @@ _faceAddRoomTriangles_asm:
        shlr8   vg1             // shift down for g only
        tst     vg1, vg1
        bt/s    2f
-        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot]
+        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot] mov #128, tmp; shll8
        add     tmp, flags
 2:      // check_backface
        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
        bt/s    .skip_fart
-        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot]
+        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot] mov #1, tmp; rotr
        or      tmp, flags
        // max_z3
-        mov.w   @vp0, vz0
+        mov.w   @vp0, depth     // depth = vz0
        mov.w   @vp1, vz1
        // check_z1
-        cmp/gt  vz0, vz1
+        cmp/gt  depth, vz1
        bf/s    3f
        mov.w   @vp2, vz2       // [delay slot]
-        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+        mov     vz1, depth      // if (z1 > depth) depth = z1
 3:      // check_z2
-        cmp/gt  vz0, vz2
+        cmp/gt  depth, vz2
-        bf      .face_add_fart  // TODO use delay slot but not for OT! )
+        bf/s    .face_add_fart  // TODO use delay slot but not for OT! )
-        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+        sub     vertices, vp0   // [delay slot] get the first offset
        mov     vz2, depth      // if (z2 > depth) depth = z2
 .face_add_fart:
-        // get absolute indices
+        // offset = (p - vertices)
        // p address is 4 bytes ahead but it's fine for shlr3
        // index = (p - vertices) / VERTEX_SIZEOF
        sub     vertices, vp0
        sub     vertices, vp1
        sub     vertices, vp2
        shlr2   vp0
        shlr2   vp1
        shlr2   vp2
        shlr    vp0
        shlr    vp1
        shlr    vp2
        // depth (vz0) >>= OT_SHIFT (4)
        shlr2   depth
        shlr2   depth
        shll2   depth
-        add     ot, depth   // depth = gOT[depth]
+        mov.l   @(depth, ot), next
-        mov.l   @depth, next
+        mov.l   face, @(depth, ot)
        mov.l   face, @depth
        shll16  vp2
        shll16  vp1
        xtrct   vp0, vp1
        mov.l   flags, @(0, face)
        mov.l   next, @(4, face)
        mov.l   vp1, @(8, face)
        mov.l   vp2, @(12, face)
        add     #FACE_SIZEOF, face
        mov     face, tmp
        add     #-2, tmp        // skip 4th index
        mov.w   vp2, @-tmp
        mov.w   vp1, @-tmp
        mov.w   vp0, @-tmp
        mov.l   next, @-tmp
        mov.l   flags, @-tmp
 .skip_fart:
        dt      count
        bf      .loop_fart
--- a/src/platform/32x/asm/rasterize.i
+++ b/src/platform/32x/asm/rasterize.i
@ -40,8 +40,8 @@ _rasterize_asm:
 .align 2
 var_fb:
-        // overwrite image frame buffer address has the same
+        // overwrite image frame buffer address, it has the same
-        // write per but allow transparent write for byte & word
+        // write latency but allows transparent writes for byte & word
        .long 0x24020200
 var_table:
 #ifdef ON_CHIP_RENDER
--- a/src/platform/32x/asm/rasterizeF.i
+++ b/src/platform/32x/asm/rasterizeF.i
@ -5,25 +5,22 @@
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define index   r6      // arg
-#define gtile   r7      // arg (unused)
+#define h       r7
 #define N       gtile
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
 #define Rdx     r11
 #define dup     r12     // const
 #define inv     r13
-#define divLUT  r14
+#define R       r14
-#define R       index
+#define divLUT  inv
 #define h       N
 #define Ry      inv
 #define Ly      inv
-#define Rptr    R
+#define Rptr    index
 #define iw      inv
 #define ih      inv
 #define LMAP    inv
@ -38,7 +35,6 @@
        mov.l   @sp+, r9
        rts
        mov.l   @sp+, r8
        nop
 .global _rasterizeF_asm
 _rasterizeF_asm:
@ -63,37 +59,30 @@ _rasterizeF_asm:
        mov     L, R
        mov.l   var_divTable_fs, divLUT
        mov     #0, Rh
        mov     #0, Lh
 .loop_f:
        tst     Lh, Lh
        bf/s    .calc_left_end_f
 .calc_left_start_f:
        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
-        mov     tmp, N
+        add     L, tmp          // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT)
        shll2   N
        shll2   N
        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
-        mov.w   @L+, Lx
+        mov.l   @L, Lx
-        mov.w   @L+, Ly
+        extu.w  Lx, Ly
        shlr16  Lx
-        mov     N, tmp
+        mov.l   @tmp, Ldx
-        mov.w   @tmp+, Ldx
+        extu.w  Ldx, Lh
-        mov.w   @tmp+, Lh
+        shlr16  Ldx
        cmp/ge  Ly, Lh
        bf/s    .exit_f
        cmp/eq  Ly, Lh          // [delay slot]
        bt/s    .calc_left_start_f      // if (L->v.y == N->v.y) check next vertex
-        mov     N, L            // [delay slot]
+        mov     tmp, L          // [delay slot]
        sub     Lx, Ldx
        sub     Ly, Lh
        mov.l   var_divTable_fs, divLUT
        mov     Lh, tmp
        shll    tmp
        mov.w   @(tmp, divLUT), ih
@ -104,31 +93,30 @@ _rasterizeF_asm:
 .calc_left_end_f:
        tst     Rh, Rh
-        bf/s    .calc_right_end_f
+        bf      .calc_right_end_f
 .calc_right_start_f:
-        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
+        mov.b   @(VERTEX_NEXT, R), tmp
-        mov     tmp, N
+        add     R, tmp          // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT)
        shll2   N
        shll2   N
        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
-        mov.w   @R+, Rx
+        mov.l   @R, Rx
-        mov.w   @R+, Ry
+        extu.w  Rx, Ry
        shlr16  Rx
-        mov     N, tmp
+        mov.l   @tmp, Rdx
-        mov.w   @tmp+, Rdx
+        extu.w  Rdx, Rh
-        mov.w   @tmp+, Rh
+        shlr16  Rdx
        cmp/ge  Ry, Rh
        bf/s    .exit_f
        cmp/eq  Ry, Rh          // [delay slot]
        bt/s    .calc_right_start_f     // if (R->v.y == N->v.y) check next vertex
-        mov     N, R            // [delay slot]
+        mov     tmp, R          // [delay slot]
        sub     Rx, Rdx
        sub     Ry, Rh
        mov.l   var_divTable_fs, divLUT
        mov     Rh, tmp
        shll    tmp
        mov.w   @(tmp, divLUT), ih
@ -148,8 +136,6 @@ _rasterizeF_asm:
        sub     h, Lh
        sub     h, Rh
        mov.l   R, @-sp
 .scanline_start_f:
        mov     Lx, Lptr
        mov     Rx, Rptr
@ -160,12 +146,6 @@ _rasterizeF_asm:
        cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
        bf/s    .scanline_end_f
        // iw = divTable[Rptr - Lptr]
        mov     Rptr, tmp       // [delay slot]
        sub     Lptr, tmp
        shll    tmp
        mov.w   @(tmp, divLUT), iw
        add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
        add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
@ -178,10 +158,10 @@ _rasterizeF_asm:
        mov.b   dup, @Lptr
        add     #1, Lptr
        mov     #1, tmp         // tmp = 1 (for align_right)
        cmp/gt  Lptr, Rptr
        bf/s    .scanline_end_f
        tst     tmp, Rptr
        nop
 .align_right_f:
        bt      .block_2px_f
@ -192,17 +172,20 @@ _rasterizeF_asm:
 .block_2px_f:
        mov.w   dup, @-Rptr
        cmp/gt  Lptr, Rptr
-        bt      .block_2px_f
+        bt/s    .block_2px_f
        nop
 .scanline_end_f:
        dt      h
        mov.w   var_frameWidth_fs, tmp
        bf/s    .scanline_start_f
-        add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
+        add     tmp, pixel      // [delay slot] pixel += FRAME_WIDTH
-        bra     .loop_f
+        tst     Lh, Lh
-        mov.l   @sp+, R
+        bf      .calc_right_start_f
        bra     .calc_left_start_f
        nop
 #undef tmp
 #undef Lh
@ -211,7 +194,6 @@ _rasterizeF_asm:
 #undef pixel
 #undef L
 #undef index
 #undef N
 #undef Lx
 #undef Rx
 #undef Ldx
@ -224,6 +206,5 @@ _rasterizeF_asm:
 #undef Ry
 #undef Ly
 #undef Rptr
 #undef iw
 #undef ih
 #undef LMAP
--- a/src/platform/32x/asm/rasterizeFT.i
+++ b/src/platform/32x/asm/rasterizeFT.i
@ -66,6 +66,7 @@
        mov.l   @sp+, r9
        rts
        mov.l   @sp+, r8
        nop
 .global _rasterizeFT_asm
 _rasterizeFT_asm:
@ -95,14 +96,13 @@ _rasterizeFT_asm:
        tst     Lh, Lh
        bf/s    .calc_left_end_ft
        nop
 .calc_left_start_ft:
        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
        mov     tmp, N
        mov.w   @(VERTEX_Y, L), tmp
        shll2   N
        shll2   N
        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ly
        mov.w   @(VERTEX_Y, N), tmp
@ -144,14 +144,13 @@ _rasterizeFT_asm:
        shlr16  Rh              // Rh = (Rh >> 16)
        tst     Rh, Rh
        bf/s    .calc_right_end_ft
        nop
 .calc_right_start_ft:
        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
        mov     tmp, N
        mov.w   @(VERTEX_Y, R), tmp
        shll2   N
        shll2   N
        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ry
        mov.w   @(VERTEX_Y, N), tmp
@ -206,7 +205,8 @@ _rasterizeFT_asm:
        mov.l   tmp, @(SP_H, sp)
        mov.l   L, @(SP_L, sp)
        mov.l   R, @(SP_R, sp)
-        
+        nop
 .scanline_start_ft:
        mov     Lx, Lptr
        mov     Rx, Rptr
@ -263,15 +263,15 @@ _rasterizeFT_asm:
        cmp/gt  Lptr, Rptr
        bf/s    .scanline_end_ft
        nop
 .block_prepare_ft:
        shll    dtdx            // [delay slot] optional
        nop
 .block_2px_ft:
-        swap.b  t, index        // UUuuvvVV
+        getUV   t, index
-        swap.w  index, index    // vvVVUUuu
+
        shll8   index           // VVUUuu00
        shlr16  index           // 0000VVUU
        mov.b   @(index, TILE), index
        mov.b   @(index, LMAP), index
@ -283,6 +283,7 @@ _rasterizeFT_asm:
        cmp/gt  Lptr, Rptr
        bt/s    .block_2px_ft
        sub     dtdx, t         // [delay slot] t -= dtdx
        nop
 .scanline_end_ft:
        mov.l   @(SP_LDX, sp), sLdx
--- a/src/platform/32x/asm/rasterizeGT.i
+++ b/src/platform/32x/asm/rasterizeGT.i
@ -93,8 +93,6 @@ _rasterizeGT_asm:
        add     #-SP_SIZE, sp
        mov     gtile, TILE
        nop
        mov     #0, Rh
 .loop_gt:
@ -102,14 +100,13 @@ _rasterizeGT_asm:
        tst     Lh, Lh
        bf/s    .calc_left_end_gt
        shlr16  Rh              // [delay slot] Rh = (Rh >> 16)
 .calc_left_start_gt:
-        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
+        mov.b   @(VERTEX_PREV, L), tmp
        mov     tmp, N
        mov.w   @(VERTEX_Y, L), tmp
        shll2   N
        shll2   N
        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ly
        mov.w   @(VERTEX_Y, N), tmp
@ -159,9 +156,9 @@ _rasterizeGT_asm:
        // calc Ldt
        scaleUV Ldt, tmp, ih
        mov.l   tmp, @(SP_LDT, sp)
        nop
 .calc_left_end_gt:
        shlr16  Rh              // Rh = (Rh >> 16)
        tst     Rh, Rh
        bf/s    .calc_right_end_gt
@ -170,8 +167,6 @@ _rasterizeGT_asm:
        mov     tmp, N
        mov.w   @(VERTEX_Y, R), tmp
        shll2   N
        shll2   N
        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
        mov     tmp, Ry
        mov.w   @(VERTEX_Y, N), tmp
@ -221,6 +216,7 @@ _rasterizeGT_asm:
        // calc Rdt
        scaleUV Rdt, tmp, ih
        mov.l   tmp, @(SP_RDT, sp)
        nop
 .calc_right_end_gt:
        // bake gLightmap address into g value
@ -233,6 +229,7 @@ _rasterizeGT_asm:
        bf/s    .scanline_prepare_gt
        mov     Lh, h           // [delay slot]
        mov     Rh, h
        nop
 .scanline_prepare_gt:
        sub     h, Lh
@ -330,10 +327,8 @@ _rasterizeGT_asm:
        shll    dgdx
 .block_2px_gt:
-        swap.b  t, index        // UUuuvvVV
+        getUV   t, index
-        swap.w  index, index    // vvVVUUuu
+
        shll8   index           // VVUUuu00
        shlr16  index           // 0000VVUU
        mov.b   @(index, TILE), index
        mov     g, LMAP
--- a/src/platform/32x/asm/rasterizeS.i
+++ b/src/platform/32x/asm/rasterizeS.i
@ -5,8 +5,7 @@
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define R       r6      // arg
-#define gtile   r7      // arg (unused)
+#define h       r7
 #define N       gtile
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
@ -16,14 +15,12 @@
 #define divLUT  r14
 #define index   tmp
 #define h       N
 #define Ry      inv
 #define Ly      inv
-#define Rptr    R
+#define Rptr    inv
 #define iw      inv
 #define ih      inv
 .align 4
@ -37,7 +34,6 @@
        mov.l   @sp+, r9
        rts
        mov.l   @sp+, r8
        nop
 .global _rasterizeS_asm
 _rasterizeS_asm:
@ -58,30 +54,25 @@ _rasterizeS_asm:
        mov.l   var_divTable_fs, divLUT
        mov     #0, Rh
-        mov     #0, Lh
+        nop
 .loop_s:
        tst     Lh, Lh
        bf/s    .calc_left_end_s
 .calc_left_start_s:
        mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
-        mov     tmp, N
+        add     L, tmp          // tmp = L + (L->prev << VERTEX_SIZEOF_SHIFT)
        shll2   N
        shll2   N
        add     L, N            // N = L + (L->prev << VERTEX_SIZEOF_SHIFT)
-        mov.w   @L+, Lx
+        mov.l   @L, Lx
-        mov.w   @L+, Ly
+        extu.w  Lx, Ly
        shlr16  Lx
-        mov     N, tmp
+        mov.l   @tmp, Ldx
-        mov.w   @tmp+, Ldx
+        extu.w  Ldx, Lh
-        mov.w   @tmp+, Lh
+        shlr16  Ldx
        cmp/ge  Ly, Lh
        bf/s    .exit_s
        cmp/eq  Ly, Lh          // [delay slot]
        bt/s    .calc_left_start_s      // if (L->v.y == N->v.y) check next vertex
-        mov     N, L            // [delay slot]
+        mov     tmp, L          // [delay slot]
        sub     Lx, Ldx
        sub     Ly, Lh
@ -96,27 +87,26 @@ _rasterizeS_asm:
 .calc_left_end_s:
        tst     Rh, Rh
-        bf/s    .calc_right_end_s
+        bf      .calc_right_end_s
        nop
 .calc_right_start_s:
-        mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
+        mov.b   @(VERTEX_NEXT, R), tmp
-        mov     tmp, N
+        add     R, tmp          // tmp = R + (R->next << VERTEX_SIZEOF_SHIFT)
        shll2   N
        shll2   N
        add     R, N            // N = R + (R->next << VERTEX_SIZEOF_SHIFT)
-        mov.w   @R+, Rx
+        mov.l   @R, Rx
-        mov.w   @R+, Ry
+        extu.w  Rx, Ry
        shlr16  Rx
-        mov     N, tmp
+        mov.l   @tmp, Rdx
-        mov.w   @tmp+, Rdx
+        extu.w  Rdx, Rh
-        mov.w   @tmp+, Rh
+        shlr16  Rdx
        cmp/ge  Ry, Rh
        bf/s    .exit_s
        cmp/eq  Ry, Rh          // [delay slot]
        bt/s    .calc_right_start_s     // if (R->v.y == N->v.y) check next vertex
-        mov     N, R            // [delay slot]
+        mov     tmp, R          // [delay slot]
        sub     Rx, Rdx
        sub     Ry, Rh
@ -135,13 +125,12 @@ _rasterizeS_asm:
        bf/s    .scanline_prepare_s
        mov     Lh, h           // [delay slot]
        mov     Rh, h
        nop
 .scanline_prepare_s:
        sub     h, Lh
        sub     h, Rh
        mov.l   R, @-sp
 .scanline_start_s:
        mov     Lx, Lptr
        mov     Rx, Rptr
@ -152,14 +141,8 @@ _rasterizeS_asm:
        cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
        bf/s    .scanline_end_s
-        // iw = divTable[Rptr - Lptr]
+        add     pixel, Lptr     // Lptr = pixel + (Lx >> 16)
-        mov     Rptr, tmp       // [delay slot]
+        add     pixel, Rptr     // Rptr = pixel + (Rx >> 16)
        sub     Lptr, tmp
        shll    tmp
        mov.w   @(tmp, divLUT), iw
        add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
        add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
 .shade_pixel_s:
        mov.b   @Lptr, index
@ -174,10 +157,12 @@ _rasterizeS_asm:
        mov.w   var_frameWidth_fs, tmp
        bf/s    .scanline_start_s
-        add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
+        add     tmp, pixel      // [delay slot] pixel += FRAME_WIDTH
-        bra     .loop_s
+        tst     Lh, Lh
-        mov.l   @sp+, R
+        bf      .calc_right_start_s
        bra     .calc_left_start_s
        nop
 #undef tmp
 #undef Lh
@ -186,7 +171,6 @@ _rasterizeS_asm:
 #undef pixel
 #undef L
 #undef R
 #undef N
 #undef Lx
 #undef Rx
 #undef Ldx
@ -199,5 +183,4 @@ _rasterizeS_asm:
 #undef Ry
 #undef Ly
 #undef Rptr
 #undef iw
 #undef ih
--- a/src/platform/32x/asm/transformMesh.i
+++ b/src/platform/32x/asm/transformMesh.i
@ -78,10 +78,10 @@ _transformMesh_asm:
        // pre-transform the matrix offset
        add     #M03, m
        mov.w   @m+, mx
        shll16  mx
        mov.w   @m+, my
        shll16  my
        mov.w   @m+, mz
        shll16  mx
        shll16  my
        shll16  mz
        add     #-MATRIX_SIZEOF, m
@ -99,22 +99,24 @@ _transformMesh_asm:
        // z clipping
 .clip_z_near_m:
-        mov     #VIEW_MIN, minZ // 64
+        mov     #VIEW_MIN, minZ
        cmp/gt  z, minZ
        bf/s    .clip_z_far_m
        cmp/ge  maxZ, z         // [delay slot]
        mov     minZ, z
-        add     #CLIP_NEAR, vg
+        add     #CLIP_PLANE, vg
 .clip_z_far_m:
-        bf/s    .project_m
+        bf      .project_m
        mov     z, dz           // [delay slot] dz = z
        mov     maxZ, z
-        add     #CLIP_FAR, vg
+        add     #CLIP_PLANE, vg
 .project_m:
-        // dz = divTable[z >> (PROJ_SHIFT = 4)]
+        // z >>= OT_SHIFT
-        shlr2   dz
+        shlr2   z
-        shlr2   dz
+        shlr2   z
        // dz = divTable[z]
        mov     z, dz
        shll    dz
        mov.w   @(dz, divLUT), dz
--- a/src/platform/32x/asm/transformRoom.i
+++ b/src/platform/32x/asm/transformRoom.i
@ -4,9 +4,9 @@
 #define res             r3
 #define vertices        r4      // arg
 #define count           r5      // arg
-#define stackVtx        r6
+#define vp              r6
-#define stackMtx        r7
+#define m               r7
-#define vp              r8
+#define vg              r8
 #define x               r9
 #define y               r10
 #define z               r11
@ -18,13 +18,14 @@
 #define minY            tmp
 #define maxX            tmp
 #define maxY            tmp
-#define minZ            tmp
+#define minZ            x
 #define dz              tmp
-#define vg              stackVtx
+#define stackVtx        tmp
-#define fog             stackMtx
+#define fog             x
-#define cnt             stackVtx
+#define minFog          y
 #define maxG            y
-#define SP_SIZE         (18 + 6)        // mat3x3 + vec3
+#define SP_SIZE         (8)        // vec3s + padding
 .align 4
 .global _transformRoom_asm
@ -37,7 +38,6 @@ _transformRoom_asm:
        mov.l   r12, @-sp
        mov.l   r13, @-sp
        mov.l   r14, @-sp
        mov     sp, stackMtx 
        add     #-SP_SIZE, sp
        mov.l   var_viewportRel, vp
@ -49,139 +49,111 @@ _transformRoom_asm:
        // store matrix into stack (in reverse order)
        mov.l   var_gMatrixPtr, tmp
-        mov.l   @tmp, tmp
+        mov.l   @tmp, m
-        // copy 3x3 matrix rotation part
+        // pre-transform the matrix offset
-        mov     #9, cnt
+        add     #M03, m
-.copyMtx_r:
+        mov.w   @m+, mx
-        mov.w   @tmp+, mx
+        mov.w   @m+, my
-        dt      cnt
+        mov.w   @m+, mz
        bf/s    .copyMtx_r
        mov.w   mx, @-stackMtx  // [delay slot]
        // prepare offsets (const)
        mov.w   @tmp+, mx
        mov.w   @tmp+, my
        mov.w   @tmp+, mz
        shll8   mx
        shll8   my
        shll8   mz
        add     #-12, m         // offset to z-row
        // maxZ = VIEW_MAX = (1024 * 10) >> OT_SHIFT = (40 << 8) >> OT_SHIFT
        mov     #40, maxZ
        shll2   maxZ
        shll2   maxZ
        add     #8, res         // extra offset for @-Rn
        nop
 .loop_r:
        // unpack vertex
        mov.b   @vertices+, x
        mov.b   @vertices+, y
        mov.b   @vertices+, z
        shll2   x
        shll2   y
        shll2   z
-        // upload vertex coords into stack (in reverse order)
+        // upload vertex coords into stack
        mov     sp, stackVtx
        add     #6, stackVtx
        mov     stackVtx, stackMtx
        //shll16  x
        //xtrct   y, x
        mov.w   x, @-stackVtx
        mov.w   y, @-stackVtx
        mov.w   z, @-stackVtx
        mov.w   y, @-stackVtx
        mov.w   x, @-stackVtx
-        //transform z
+.transform_z:
        lds     mz, MACL
-        mac.w   @stackVtx+, @stackMtx+
+        mac.w   @stackVtx+, @m+
-        mac.w   @stackVtx+, @stackMtx+
+        mac.w   @stackVtx+, @m+
-        mac.w   @stackVtx+, @stackMtx+
+        mac.w   @stackVtx+, @m+
        sts     MACL, z
          add     #-6, stackVtx
          add     #-18, m       // offset to x-row
        shlr8   z
        // z >>= OT_SHIFT
        shlr2   z
        shlr2   z
        exts.w  z, z
-
+.calc_fog:
        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
        // tmp = z + VIEW_OFF = z + 4096
        mov     #16, tmp
        shll8   tmp
        add     z, tmp
        // maxZ = VIEW_OFF + VIEW_MAX + VIEW_OFF = 18432
        mov     #72, maxZ
        shll8   maxZ
        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
        cmp/hi  maxZ, tmp
        bf/s    .visible_r
        mov     #40, maxZ       // [delay slot] maxZ = 40
        mov     #(CLIP_NEAR + CLIP_FAR), vg
        mov.w   vg, @-res
        add     #1, vertices
        dt      count
        bf/s    .loop_r
        add     #10, res        // [delay slot]
        bra     .done_r
        nop
 .visible_r:
        //transform y
        lds     my, MACL
        mac.w   @stackVtx+, @stackMtx+
        mac.w   @stackVtx+, @stackMtx+
        mac.w   @stackVtx+, @stackMtx+
        sts     MACL, y
          add     #-6, stackVtx
        shlr8   y
        exts.w  y, y
        //transform x
        lds     mx, MACL
        mac.w   @stackVtx+, @stackMtx+
        mac.w   @stackVtx+, @stackMtx+
        mac.w   @stackVtx+, @stackMtx+
        sts     MACL, x
          shll8   maxZ  // maxZ = VIEW_MAX = (1024 * 10) = (40 << 8)
        shlr8   x
        exts.w  x, x
        mov.b   @vertices+, vg
        // tmp = FOG_MIN = 6144 = (24 << 8)
        mov     #24, tmp
        shll8   tmp
        // if z <= FOG_MIN -> skip fog calc
-        cmp/gt  tmp, z
+        mov     #(32 >> OT_SHIFT), minFog // minFog = FOG_MIN >> OT_SHIFT
-        bf/s    .clip_z_near_r
+        shll8   minFog
-        mov     z, fog          // [delay slot]
+        mov     z, fog
-        sub     tmp, fog        // fog = z - FOG_MIN
+        subc    minFog, fog     // TODO need to clear T before?
-        shll    fog             // FOG_SHIFT
+        bt/s    .clip_z_near_r
-        shlr8   fog             // shift down to 0..31 range
+        mov.b   @vertices+, vg  // [delay slot]
        shlr2   fog
        shlr    fog             // shift down to 0..31 range
        add     fog, vg
        // vg = min(vg, 31)
-        mov     #31, tmp
+        mov     #31, maxG
-        cmp/gt  tmp, vg
+        cmp/gt  maxG, vg
        bf      .clip_z_near_r
        mov     #31, vg
        // z clipping
 .clip_z_near_r:
        add     #1, vg          // +1 for signed lightmap fetch
-        mov     #VIEW_MIN, minZ // minZ = VIEW_MIN = 64
+        mov     #(VIEW_MIN >> OT_SHIFT), minZ
        cmp/gt  z, minZ
        bf/s    .clip_z_far_r
        shll8   vg              // [delay slot] clear lower 8-bits of vg for clipping flags
        mov     minZ, z
-        add     #CLIP_NEAR, vg
+        add     #CLIP_PLANE, vg
 .clip_z_far_r:
        cmp/ge  maxZ, z
-        bf/s    .project_r
+        bf      .transform_x
        mov     z, dz           // [delay slot]
        mov     maxZ, z
-        add     #CLIP_FAR, vg
+        add     #CLIP_PLANE, vg
-.project_r: // dz = divTable[z >> (PROJ_SHIFT = 4)]
+.transform_x:
-        shlr2   dz
+        lds     mx, MACL
-        shlr2   dz
+        mac.w   @stackVtx+, @m+
        mac.w   @stackVtx+, @m+
        mac.w   @stackVtx+, @m+
        sts     MACL, x
          add     #-6, stackVtx
        shlr8   x
        exts.w  x, x
 .transform_y:
        lds     my, MACL
        mac.w   @stackVtx+, @m+
        mac.w   @stackVtx+, @m+
        mac.w   @stackVtx+, @m+
        sts     MACL, y
          mov     z, dz         // [delay slot]
        shlr8   y
        exts.w  y, y
 .project_r: // dz = divTable[z]
        shll    dz
        mov.w   @(dz, divLUT), dz
@ -266,7 +238,6 @@ _transformRoom_asm:
 #undef vertices
 #undef count
 #undef stackVtx
 #undef stackMtx
 #undef vp
 #undef x
 #undef y
@ -282,5 +253,4 @@ _transformRoom_asm:
 #undef dz
 #undef vg
 #undef fog
 #undef cnt
 #undef SP_SIZE
--- a/src/platform/32x/rasterizer.h
+++ b/src/platform/32x/rasterizer.h
@ -132,7 +132,7 @@ extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLin
                }
            }
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
            Lx += Ldx;
            Rx += Rdx;
@ -239,7 +239,7 @@ extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLin
                }
            }
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
            Lx += Ldx;
            Rx += Rdx;
@ -380,7 +380,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
            #endif
            }
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
            Lx += Ldx;
            Rx += Rdx;
@ -570,7 +570,7 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
            #endif
            }
-            pixel += VRAM_WIDTH;
+            pixel += (FRAME_WIDTH >> 1);
            Lx += Ldx;
            Rx += Rdx;
@ -604,7 +604,7 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert
    if (L->v.y < 0)
    {
-        pixel -= L->v.y * VRAM_WIDTH;
+        pixel -= L->v.y * (FRAME_WIDTH >> 1);
        v -= L->v.y * dv;
        h += L->v.y;
    }
--- a/src/platform/32x/render.cpp
+++ b/src/platform/32x/render.cpp
@ -65,9 +65,8 @@ enum ClipFlags {
    CLIP_RIGHT   = 1 << 2,
    CLIP_TOP     = 1 << 3,
    CLIP_BOTTOM  = 1 << 4,
-    CLIP_FAR     = 1 << 5,
+    CLIP_PLANE   = 1 << 5,
-    CLIP_NEAR    = 1 << 6,
+    CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_PLANE)
-    CLIP_DISCARD = (CLIP_LEFT | CLIP_RIGHT | CLIP_TOP | CLIP_BOTTOM | CLIP_FAR | CLIP_NEAR),
 };
 const MeshQuad gShadowQuads[] = {
@ -183,12 +182,12 @@ void transformRoom_c(const RoomVertex* vertices, int32 count)
        uint32 clip = 0;
        if (z <= VIEW_MIN_F) {
-            clip = CLIP_NEAR;
+            clip = CLIP_PLANE;
            z = VIEW_MIN_F;
        }
        if (z >= VIEW_MAX_F) {
-            clip = CLIP_FAR;
+            clip = CLIP_PLANE;
            z = VIEW_MAX_F;
        }
@ -330,12 +329,12 @@ void transformMesh_c(const MeshVertex* vertices, int32 count, int32 intensity)
        uint32 clip = 0;
        if (z <= (VIEW_MIN_F >> FIXED_SHIFT)) {
-            clip = CLIP_NEAR;
+            clip = CLIP_PLANE;
            z = VIEW_MIN_F >> FIXED_SHIFT;
        }
        if (z >= (VIEW_MAX_F >> FIXED_SHIFT)) {
-            clip = CLIP_FAR;
+            clip = CLIP_PLANE;
            z = VIEW_MAX_F >> FIXED_SHIFT;
        }
@ -598,25 +597,25 @@ int32 sphereIsVisible_c(int32 sx, int32 sy, int32 sz, int32 r)
 void flush_ot(int32 bit)
 {
-    VertexLink v[4 + 3];
+    VertexLink v[4 + 4];
    VertexLink* q = v;
    VertexLink* t = v + 4;
    // quad
-    q[0].prev = 3;
+    q[0].prev = (3 << 4);
-    q[0].next = 1;
+    q[0].next = (1 << 4);
-    q[1].prev = -1;
+    q[1].prev = -(1 << 4);
-    q[1].next = 1;
+    q[1].next = (1 << 4);
-    q[2].prev = -1;
+    q[2].prev = -(1 << 4);
-    q[2].next = 1;
+    q[2].next = (1 << 4);
-    q[3].prev = -1;
+    q[3].prev = -(1 << 4);
-    q[3].next = -3;
+    q[3].next = -(3 << 4);
    // triangle
-    t[0].prev = 2;
+    t[0].prev = (2 << 4);
-    t[0].next = 1;
+    t[0].next = (1 << 4);
-    t[1].prev = -1;
+    t[1].prev = -(1 << 4);
-    t[1].next = 1;
+    t[1].next = (1 << 4);
-    t[2].prev = -1;
+    t[2].prev = -(1 << 4);
-    t[2].next = -2;
+    t[2].next = -(2 << 4);
    int32 index = 0;
    const ColorIndex* tile = NULL;
@ -654,12 +653,29 @@ void flush_ot(int32 bit)
                    ptr[3].t.t = 0xFF00FF00 & (tex.uv23 << 8);
                }
-                ptr[0].v = gVertices[face->indices[0]];
+            #if 1
-                ptr[1].v = gVertices[face->indices[1]];
+                uint8* vPtr = (uint8*)gVertices;
-                ptr[2].v = gVertices[face->indices[2]];
+                ((uint32*)&ptr[0].v)[0] = ((uint32*)(vPtr + face->indices[0]))[0];
+                ((uint32*)&ptr[0].v)[1] = ((uint32*)(vPtr + face->indices[0]))[1];
+                ((uint32*)&ptr[1].v)[0] = ((uint32*)(vPtr + face->indices[1]))[0];
+                ((uint32*)&ptr[1].v)[1] = ((uint32*)(vPtr + face->indices[1]))[1];
+                ((uint32*)&ptr[2].v)[0] = ((uint32*)(vPtr + face->indices[2]))[0];
+                ((uint32*)&ptr[2].v)[1] = ((uint32*)(vPtr + face->indices[2]))[1];
                if (!(flags & FACE_TRIANGLE)) {
-                    ptr[3].v = gVertices[face->indices[3]];
+                    ((uint32*)&ptr[3].v)[0] = ((uint32*)(vPtr + face->indices[3]))[0];
+                    ((uint32*)&ptr[3].v)[1] = ((uint32*)(vPtr + face->indices[3]))[1];
                }
+            #else
+                ptr[0].v = gVertices[face->indices[0] >> 3];
+                ptr[1].v = gVertices[face->indices[1] >> 3];
+                ptr[2].v = gVertices[face->indices[2] >> 3];
+                if (!(flags & FACE_TRIANGLE)) {
+                    ptr[3].v = gVertices[face->indices[3] >> 3];
+                }
+            #endif
                if (flags & FACE_CLIPPED) {
                    drawPoly(flags, ptr, tile);
@ -855,10 +871,10 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde
    bool skip = (first->v.y == last->v.y);
    VertexLink* top = (first->v.y < last->v.y) ? first : last;
-    first->prev = count - 1;
+    first->prev = (count - 1) << 4;
-    first->next = 1;
+    first->next = (1 << 4);
-    last->prev = -1;
+    last->prev = -(1 << 4);
-    last->next = 1 - count;
+    last->next = (1 - count) << 4;
    for (int32 i = 1; i < count - 1; i++)
    {
@ -873,8 +889,8 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorInde
            skip = false;
        }
-        p->prev = -1;
+        p->prev = -(1 << 4);
-        p->next = 1;
+        p->next = (1 << 4);
    }
    if (skip)
@ -910,7 +926,7 @@ void clear()
    MARS_SYS_COMM4 = MARS_CMD_CLEAR;
 }
-void renderRoom(const Room* room)
+void renderRoom(Room* room)
 {
    int32 vCount = room->info->verticesCount;
    if (vCount <= 0)
@ -1225,14 +1241,8 @@ const int32 BAR_COLORS[BAR_MAX][5] = {
    { 43, 44, 43, 42, 41 },
 };
-X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 shade, int32 color1, int32 color2, int32 z)
+X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32 color1, int32 color2, int32 z)
 {
-    // background
-    if (shade >= 0) {
-        renderFill(x + 1, y + 1, width - 2, height - 2, shade, z);
-    }
-    // frame
    renderLine(x + 1, y, width - 2, 1, color1, z);
    renderLine(x + 1, y + height - 1, width - 2, 1, color2, z);
    renderLine(x, y, 1, height, color1, z);
@ -1242,9 +1252,9 @@ X_NOINLINE void renderBorder(int32 x, int32 y, int32 width, int32 height, int32
 void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type)
 {
    // colored bar
-    int32 ix = x + 2;
+    int32 ix = x + 1;
-    int32 iy = y + 2;
+    int32 iy = y + 1;
-    int32 w = value * width >> 8;
+    int32 w = value* width >> 8;
    if (w > 0)
    {
@ -1254,7 +1264,12 @@ void renderBar(int32 x, int32 y, int32 width, int32 value, BarType type)
        }
    }
-    renderBorder(x, y, width + 4, BAR_HEIGHT + 4, 27, 19, 17, 0);
+    if (w < width)
+    {
+        renderFill(x + 1 + w, y + 1, width - w, BAR_HEIGHT, 27, 0);
+    }
+    renderBorder(x, y, width + 2, BAR_HEIGHT + 2, 19, 17, 0);
 }
 void renderBackground(const void* background)