diff --git a/src/fixed/common.h b/src/fixed/common.h
index 84338b4..5d415a4 100644
--- a/src/fixed/common.h
+++ b/src/fixed/common.h
@@ -149,6 +149,12 @@
     #define USE_FMT     (LVL_FMT_PKD)
 
     #include "32x.h"
+
+    enum MarsCmd {
+        MARS_CMD_NONE = 0,
+        MARS_CMD_CLEAR,
+        MARS_CMD_FLUSH
+    };
 #else
     #error unsupported platform
 #endif
@@ -2913,7 +2919,7 @@ void drawLevelInit();
 void drawLevelFree();
 void drawText(int32 x, int32 y, const char* text, TextAlign align);
 void drawModel(const ItemObj* item);
-void drawItem(const ItemObj* item);
+void drawSprite(const ItemObj* item);
 void drawRooms(Camera* camera);
 void drawCinematicRooms();
 void drawHUD(Lara* lara);
diff --git a/src/fixed/draw.h b/src/fixed/draw.h
index bf665d2..45a1254 100644
--- a/src/fixed/draw.h
+++ b/src/fixed/draw.h
@@ -701,15 +701,6 @@ void drawModel(const ItemObj* item)
     }
 }
 
-void drawItem(const ItemObj* item)
-{
-    if (level.models[item->type].count > 0) {
-        drawModel(item);
-    } else {
-        drawSprite(item);
-    }
-}
-
 void drawRoom(const Room* room)
 {
     setViewport(room->clip);
@@ -811,7 +802,7 @@ void drawRooms(Camera* camera)
 
     Room** visRoom = camera->view.room->getVisibleRooms();
 
-    // draw Lara first
+#ifdef DRAW_LARA_FIRST
     for (int32 i = 0; i < MAX_PLAYERS; i++)
     {
         Lara* lara = players[i];
@@ -823,7 +814,7 @@ void drawRooms(Camera* camera)
             lara->flags |= ITEM_FLAG_STATUS_INVISIBLE; // skip drawing in the general pass
         }
     }
-
+#endif
     // draw rooms and objects
     while (*visRoom)
     {
@@ -832,6 +823,7 @@ void drawRooms(Camera* camera)
         room->reset();
     }
 
+#ifdef DRAW_LARA_FIRST
     // reset visibility flags for Lara
     for (int32 i = 0; i < MAX_PLAYERS; i++)
     {
@@ -841,6 +833,7 @@ void drawRooms(Camera* camera)
             lara->flags &= ~ITEM_FLAG_STATUS;
         }
     }
+#endif
 
     setPaletteIndex(0);
     setViewport(vp);
diff --git a/src/fixed/enemy.h b/src/fixed/enemy.h
index 4425b7a..a0e7a7f 100644
--- a/src/fixed/enemy.h
+++ b/src/fixed/enemy.h
@@ -787,7 +787,7 @@ struct Wolf : Enemy
             case STATE_STOP:
             {
                 if (nextState)
-                    nextState;
+                    return nextState;
                 return STATE_WALK;
             }
 
diff --git a/src/fixed/item.h b/src/fixed/item.h
index db5cfce..a4d7b6d 100644
--- a/src/fixed/item.h
+++ b/src/fixed/item.h
@@ -1013,7 +1013,11 @@ void ItemObj::update()
 
 void ItemObj::draw()
 {
-    drawItem(this);
+    if (level.models[type].count > 0) {
+        drawModel(this);
+    } else {
+        drawSprite(this);
+    }
 }
 
 struct ItemSave {
diff --git a/src/platform/32x/32x.h b/src/platform/32x/32x.h
index 3ae7739..c06d4b3 100644
--- a/src/platform/32x/32x.h
+++ b/src/platform/32x/32x.h
@@ -155,4 +155,6 @@ extern "C"
     CacheControl(0);\
     CacheControl(SH2_CCTL_CP | SH2_CCTL_CE);
 
+#define MARS_WAIT() {while (MARS_SYS_COMM4);} // busy-wait (spin) until MARS_SYS_COMM4 reads as zero
+
 #endif
diff --git a/src/platform/32x/asm/block_prepare.s b/src/platform/32x/asm/block_prepare.s
new file mode 100644
index 0000000..84c2abc
--- /dev/null
+++ b/src/platform/32x/asm/block_prepare.s
@@ -0,0 +1,61 @@
+#include "common.i"
+.data
+
+.global _block_prepare_start
+.global _block_prepare_end
+
+.align 4
+_block_prepare_start:
+
+#include "transformMesh.i"
+#include "transformRoom.i"
+
+.align 2
+var_gVerticesBase:
+        .long   _gVerticesBase
+var_gMatrixPtr:
+        .long   _gMatrixPtr
+var_gLightAmbient:
+        .long   _gLightAmbient
+var_divTable:
+        .long   _divTable
+var_viewportRel:
+        .long   _viewportRel
+
+#include "faceAddMeshQuads.i"
+#include "faceAddMeshTriangles.i"
+
+.align 2
+var_gVertices_fam:
+        .long   _gVertices
+var_gFacesBase_fam:
+        .long   _gFacesBase
+var_gVerticesBase_fam:
+        .long   _gVerticesBase
+const_FACE_CLIPPED_fam:
+        .long   FACE_CLIPPED
+const_FACE_TRIANGLE_fam:
+        .long   FACE_TRIANGLE
+var_gOT_fam:
+        .long   _gOT
+
+#include "faceAddRoomQuads.i"
+#include "faceAddRoomTriangles.i"
+
+.align 2
+var_gVertices_far:
+        .long   _gVertices
+var_gFacesBase_far:
+        .long   _gFacesBase
+var_gVerticesBase_far:
+        .long   _gVerticesBase
+const_FACE_CLIPPED_far:
+        .long   FACE_CLIPPED
+const_FACE_GOURAUD_far:
+        .long   FACE_GOURAUD
+const_FACE_TRIANGLE_far:
+        .long   FACE_TRIANGLE
+var_gOT_far:
+        .long   _gOT
+
+_block_prepare_end:
diff --git a/src/platform/32x/asm/block_render.s b/src/platform/32x/asm/block_render.s
new file mode 100644
index 0000000..e8d5c09
--- /dev/null
+++ b/src/platform/32x/asm/block_render.s
@@ -0,0 +1,36 @@
+#include "common.i"
+.data
+
+.global _block_render_start
+.global _block_render_end
+
+.align 4
+_block_render_start:
+
+#include "rasterize.i"
+//#include "rasterize_dummy.i"
+#include "rasterizeS.i"
+#include "rasterizeF.i"
+
+.align 2
+var_LMAP_ADDR_fs:
+        .long   _gLightmap_base
+var_divTable_fs:
+        .long   _divTable
+var_frameWidth_fs:
+        .word   FRAME_WIDTH
+
+#include "rasterizeFT.i"
+#include "rasterizeGT.i"
+
+.align 2
+var_LMAP_ADDR:
+        .long   _gLightmap_base
+var_divTable:
+        .long   _divTable
+var_mask:
+        .word   0xFF00
+var_frameWidth:
+        .word   FRAME_WIDTH
+
+_block_render_end:
diff --git a/src/platform/32x/asm/common.i b/src/platform/32x/asm/common.i
index 57c9698..85e1c5f 100644
--- a/src/platform/32x/asm/common.i
+++ b/src/platform/32x/asm/common.i
@@ -1,9 +1,11 @@
-#define SEG_MATH    .text
-#define SEG_TRANS   .data
-#define SEG_FACE    .data
-#define SEG_RASTER  .data
+#ifndef H_COMMON_ASM
+#define H_COMMON_ASM
+
+#define SEG_MATH    .data
 #define SEG_PHYSICS .data
 
+//#define ON_CHIP_RENDER
+
 // Matrix:
 // int16 e00, e01, e02  // rotation
 // int16 e10, e11, e12  // rotation
@@ -32,6 +34,10 @@
 #define FIXED_SHIFT     14
 
 #define FACE_TYPE_F     1
+#define FACE_TYPE_SHIFT 14
+#define FACE_CLIPPED    (1 << 30)
+#define FACE_TRIANGLE   (1 << 31)
+#define FACE_GOURAUD    (2 << FACE_TYPE_SHIFT)
 
 #define VERTEX_X                0
 #define VERTEX_Y                2
@@ -46,6 +52,8 @@
 #define VERTEX_SIZEOF_SHIFT     4
 #define VERTEX_SIZEOF           (1 << VERTEX_SIZEOF_SHIFT)
 
+#define FACE_SIZEOF             16
+
 #define VIEW_DIST       (1024 * 10)   // max = DIV_TABLE_END << PROJ_SHIFT
 #define FOG_SHIFT       1
 #define FOG_MAX         VIEW_DIST
@@ -61,6 +69,7 @@
 #define CLIP_BOTTOM     (1 << 4)
 #define CLIP_FAR        (1 << 5)
 #define CLIP_NEAR       (1 << 6)
+#define CLIP_DISCARD    (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM + CLIP_FAR + CLIP_NEAR)
 
 #define VP_MINX         0
 #define VP_MINY         4
@@ -121,3 +130,26 @@
 .macro lit lightmap, index
         mov.b   @(\index, \lightmap), \index
 .endm
+
+// ccw: back-face test for the faceAdd* routines, T = ((vy1 - vy0) * (vx0 - vx2) <= (vx1 - vx0) * (vy0 - vy2))
+.macro ccw vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
+        mov.w   @\vp0+, \vx0    // vx0 = p0.x
+        mov.w   @\vp0+, \vy0    // vy0 = p0.y (post-increments leave vp0 4 bytes ahead)
+        mov.w   @\vp1+, \vx1    // vx1 = p1.x
+        mov.w   @\vp1+, \vy1    // vy1 = p1.y (vp1 is 4 bytes ahead as well)
+        sub     \vx0, \vx1      // vx1 -= vx0
+        sub     \vy0, \vy1      // vy1 -= vy0
+        mov.w   @\vp2+, \vx2    // vx2 = p2.x
+        sub     \vx2, \vx0      // vx0 -= vx2
+        mov.w   @\vp2+, \vy2    // vy2 = p2.y (callers alias vx2/vy2 to one register; vx2 is already consumed)
+        sub     \vy2, \vy0      // vy0 -= vy2
+
+        muls.w  \vy1, \vx0      // signed 16-bit multiply, product lands in MACL
+        sts     MACL, \vx0      // vx0 *= vy1
+        muls.w  \vx1, \vy0      // second product overwrites MACL
+        sts     MACL, \vy0      // vy0 *= vx1
+
+        cmp/ge  \vx0, \vy0      // T = (vy0 >= vx0)
+.endm
+
+#endif // H_COMMON_ASM
diff --git a/src/platform/32x/asm/faceAddMeshQuads.i b/src/platform/32x/asm/faceAddMeshQuads.i
new file mode 100644
index 0000000..6b0bfd7
--- /dev/null
+++ b/src/platform/32x/asm/faceAddMeshQuads.i
@@ -0,0 +1,202 @@
+#define tmp         r0
+#define face        r1
+#define vp          r2
+#define flags       r3
+#define polys       r4     // arg
+#define count       r5     // arg
+#define vp0         r6
+#define vp1         r7
+#define vp2         r8
+#define vp3         r9
+#define vg0         r10
+#define vg1         r11
+#define vg2         r12
+#define vg3         r13
+#define vertices    r14
+
+#define vx0         vg0
+#define vy0         vg1
+#define vx1         vg2
+#define vy1         vg3
+#define vx2         tmp
+#define vy2         tmp
+
+#define vz0         vg0
+#define vz1         vg1
+#define vz2         vg2
+#define vz3         vg3
+
+#define depth       vg0     // == vz0
+#define next        vg1
+#define ot          tmp
+
+.align 4
+.global _faceAddMeshQuads_asm
+_faceAddMeshQuads_asm:
+        // push
+        mov.l   r8, @-sp
+        mov.l   r9, @-sp
+        mov.l   r10, @-sp
+        mov.l   r11, @-sp
+        mov.l   r12, @-sp
+        mov.l   r13, @-sp
+        mov.l   r14, @-sp
+
+        mov.l   var_gVertices_fam, vertices
+
+        mov.l   var_gVerticesBase_fam, vp
+        mov.l   @vp, vp
+
+        mov.l   var_gFacesBase_fam, face
+        mov.l   @face, face
+
+.loop_famq:
+        // read flags and indices
+        mov.w   @polys+, flags
+        mov.b   @polys+, vp0
+        mov.b   @polys+, vp1
+        mov.b   @polys+, vp2
+        mov.b   @polys+, vp3
+
+        extu.w  flags, flags
+        extu.b  vp0, vp0
+        extu.b  vp1, vp1
+        extu.b  vp2, vp2
+        extu.b  vp3, vp3
+
+        // p = gVerticesBase + index * VERTEX_SIZEOF
+        shll2   vp0
+        shll2   vp1
+        shll2   vp2
+        shll2   vp3
+        shll    vp0
+        shll    vp1
+        shll    vp2
+        shll    vp3
+
+        // get vertex address
+        add     vp, vp0
+        add     vp, vp1
+        add     vp, vp2
+        add     vp, vp3
+
+        // check_backface
+        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
+        bt/s    .skip_famq
+        add     #VERTEX_Z, vp3  // [delay slot] ccw shifts p[0..2] address to VERTEX_Z, shift p3 too
+
+        // fetch clip masks
+        mov     #(VERTEX_CLIP - 4), tmp
+        mov.b   @(tmp, vp0), vg0
+        mov.b   @(tmp, vp1), vg1
+        mov.b   @(tmp, vp2), vg2
+        mov.b   @(tmp, vp3), vg3
+
+        // check clipping
+        mov     vg0, tmp
+        and     vg1, tmp
+        and     vg2, tmp 
+        and     vg3, tmp
+        tst     #CLIP_DISCARD, tmp
+        bf/s    .skip_famq
+
+        // mark if should be clipped by frame
+        mov     vg0, tmp        // [delay slot]
+        or      vg1, tmp
+        or      vg2, tmp
+        or      vg3, tmp
+        tst     #CLIP_FRAME, tmp
+        bt/s    .avg_z4_famq
+        mov.l   const_FACE_CLIPPED_fam, tmp     // [delay slot]
+        or      tmp, flags
+
+.avg_z4_famq:
+        mov.w   @vp0, vz0
+        mov.w   @vp1, vz1
+        mov.w   @vp2, vz2
+        mov.w   @vp3, vz3
+        add     vz1, vz0
+        add     vz2, vz0
+        add     vz3, vz0
+        shlr2   vz0             // div by 4
+
+        mov.l   var_gOT_fam, ot
+
+ .face_add_famq:
+        // index = (p - vertices) / VERTEX_SIZEOF
+        sub     vertices, vp0
+        sub     vertices, vp1
+        sub     vertices, vp2
+        sub     vertices, vp3
+        shlr2   vp0
+        shlr2   vp1
+        shlr2   vp2
+        shlr2   vp3
+        shlr    vp0
+        shlr    vp1
+        shlr    vp2
+        shlr    vp3
+
+        // depth (vz0) >>= OT_SHIFT (4)
+        shlr2   depth
+        shlr2   depth
+
+        shll2   depth
+        add     ot, depth   // depth = gOT[depth]
+        mov.l   @depth, next
+        mov.l   face, @depth
+
+        add     #FACE_SIZEOF, face
+        mov     face, tmp
+
+        mov.w   vp3, @-tmp
+        mov.w   vp2, @-tmp
+        mov.w   vp1, @-tmp
+        mov.w   vp0, @-tmp
+        mov.l   next, @-tmp
+        mov.l   flags, @-tmp
+.skip_famq:
+        dt      count
+        bf      .loop_famq
+
+        mov.l   var_gFacesBase_fam, tmp
+        mov.l   face, @tmp
+
+        // pop
+        mov.l   @sp+, r14
+        mov.l   @sp+, r13
+        mov.l   @sp+, r12
+        mov.l   @sp+, r11
+        mov.l   @sp+, r10
+        mov.l   @sp+, r9
+        rts
+        mov.l   @sp+, r8
+
+#undef tmp
+#undef face
+#undef vp
+#undef flags
+#undef polys
+#undef count
+#undef vp0
+#undef vp1
+#undef vp2
+#undef vp3
+#undef vg0
+#undef vg1
+#undef vg2
+#undef vg3
+#undef vertices
+#undef vx0
+#undef vy0
+#undef vx1
+#undef vy1
+#undef vx2
+#undef vy2
+#undef vz0
+#undef vz1
+#undef vz2
+#undef vz3
+#undef depth
+#undef next
+#undef ot
diff --git a/src/platform/32x/asm/faceAddMeshTriangles.i b/src/platform/32x/asm/faceAddMeshTriangles.i
new file mode 100644
index 0000000..383fc48
--- /dev/null
+++ b/src/platform/32x/asm/faceAddMeshTriangles.i
@@ -0,0 +1,188 @@
+#define tmp         r0
+#define face        r1
+#define vp          r2
+#define flags       r3
+#define polys       r4     // arg
+#define count       r5     // arg
+#define vp0         r6
+#define vp1         r7
+#define vp2         r8
+#define ot          r9
+#define vg0         r10
+#define vg1         r11
+#define vg2         r12
+#define vg3         r13
+#define vertices    r14
+
+#define vx0         vg0
+#define vy0         vg1
+#define vx1         vg2
+#define vy1         vg3
+#define vx2         tmp
+#define vy2         tmp
+
+#define vz0         vg0
+#define vz1         vg1
+#define vz2         vg2
+
+#define depth       vg0     // == vz0
+#define next        vg1
+
+.align 4
+.global _faceAddMeshTriangles_asm
+_faceAddMeshTriangles_asm:
+        // push
+        mov.l   r8, @-sp
+        mov.l   r9, @-sp
+        mov.l   r10, @-sp
+        mov.l   r11, @-sp
+        mov.l   r12, @-sp
+        mov.l   r13, @-sp
+        mov.l   r14, @-sp
+
+        mov.l   var_gVertices_fam, vertices
+
+        mov.l   var_gVerticesBase_fam, vp
+        mov.l   @vp, vp
+
+        mov.l   var_gFacesBase_fam, face
+        mov.l   @face, face
+
+        mov.l   var_gOT_fam, ot
+        nop
+
+.loop_famt:
+        // read flags and indices
+        mov.w   @polys+, flags
+        mov.b   @polys+, vp0
+        mov.b   @polys+, vp1
+        mov.b   @polys+, vp2
+        add     #1, polys       // skip 4th index
+
+        extu.w  flags, flags
+        extu.b  vp0, vp0
+        extu.b  vp1, vp1
+        extu.b  vp2, vp2
+
+        // p = gVerticesBase + index * VERTEX_SIZEOF
+        shll2   vp0
+        shll2   vp1
+        shll2   vp2
+        shll    vp0
+        shll    vp1
+        shll    vp2
+
+        // get vertex address
+        add     vp, vp0
+        add     vp, vp1
+        add     vp, vp2
+
+        // check_backface
+        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
+        bt/s    .skip_famt
+        mov.l   const_FACE_TRIANGLE_fam, tmp    // [delay slot]
+        or      tmp, flags
+
+        // fetch clip masks
+        mov     #(VERTEX_CLIP - 4), tmp
+        mov.b   @(tmp, vp0), vg0
+        mov.b   @(tmp, vp1), vg1
+        mov.b   @(tmp, vp2), vg2
+
+        mov     vg0, tmp
+        and     vg1, tmp
+        and     vg2, tmp 
+        tst     #CLIP_DISCARD, tmp
+        bf/s    .skip_famt
+
+        // mark if should be clipped by frame
+        mov     vg0, tmp        // [delay slot]
+        or      vg1, tmp
+        or      vg2, tmp
+        tst     #CLIP_FRAME, tmp
+        bt/s    .avg_z3_famt
+        mov.l   const_FACE_CLIPPED_fam, tmp     // [delay slot]
+        or      tmp, flags
+
+.avg_z3_famt:
+        mov.w   @vp0, vz0
+        mov.w   @vp1, vz1
+        mov.w   @vp2, vz2
+        add     vz1, vz0
+        add     vz2, vz0
+        add     vz2, vz0        // approx.
+        shlr2   vz0             // div by 4
+
+.face_add_famt:
+        // index = (p - vertices) / VERTEX_SIZEOF
+        sub     vertices, vp0
+        sub     vertices, vp1
+        sub     vertices, vp2
+        shlr2   vp0
+        shlr2   vp1
+        shlr2   vp2
+        shlr    vp0
+        shlr    vp1
+        shlr    vp2
+
+        // depth (vz0) >>= OT_SHIFT (4)
+        shlr2   depth
+        shlr2   depth
+
+        shll2   depth
+        add     ot, depth   // depth = gOT[depth]
+        mov.l   @depth, next
+        mov.l   face, @depth
+
+        add     #FACE_SIZEOF, face
+        mov     face, tmp
+        add     #-2, tmp        // skip 4th index
+
+        mov.w   vp2, @-tmp
+        mov.w   vp1, @-tmp
+        mov.w   vp0, @-tmp
+        mov.l   next, @-tmp
+        mov.l   flags, @-tmp
+.skip_famt:
+        dt      count
+        bf      .loop_famt
+
+        mov.l   var_gFacesBase_fam, tmp
+        mov.l   face, @tmp
+
+        // pop
+        mov.l   @sp+, r14
+        mov.l   @sp+, r13
+        mov.l   @sp+, r12
+        mov.l   @sp+, r11
+        mov.l   @sp+, r10
+        mov.l   @sp+, r9
+        rts
+        mov.l   @sp+, r8
+
+#undef tmp
+#undef face
+#undef vp
+#undef flags
+#undef polys
+#undef count
+#undef vp0
+#undef vp1
+#undef vp2
+#undef ot
+#undef vg0
+#undef vg1
+#undef vg2
+#undef vg3
+#undef vertices
+#undef vx0
+#undef vy0
+#undef vx1
+#undef vy1
+#undef vx2
+#undef vy2
+#undef vz0
+#undef vz1
+#undef vz2
+#undef depth
+#undef next
diff --git a/src/platform/32x/asm/faceAddRoomQuads.i b/src/platform/32x/asm/faceAddRoomQuads.i
new file mode 100644
index 0000000..bb9b533
--- /dev/null
+++ b/src/platform/32x/asm/faceAddRoomQuads.i
@@ -0,0 +1,216 @@
+#define tmp         r0
+#define face        r1
+#define vp          r2
+#define flags       r3
+#define polys       r4     // arg
+#define count       r5     // arg
+#define vp0         r6
+#define vp1         r7
+#define vp2         r8
+#define vp3         r9
+#define vg0         r10
+#define vg1         r11
+#define vg2         r12
+#define vg3         r13
+#define vertices    r14
+
+#define vx0         vg0
+#define vy0         vg1
+#define vx1         vg2
+#define vy1         vg3
+#define vx2         tmp
+#define vy2         tmp
+
+#define vz0         vg0
+#define vz1         vg1
+#define vz2         vg2
+#define vz3         vg3
+
+#define depth       vg0     // == vz0
+#define next        vg1
+#define ot          tmp
+
+.align 4
+.global _faceAddRoomQuads_asm
+_faceAddRoomQuads_asm:
+        // push
+        mov.l   r8, @-sp
+        mov.l   r9, @-sp
+        mov.l   r10, @-sp
+        mov.l   r11, @-sp
+        mov.l   r12, @-sp
+        mov.l   r13, @-sp
+        mov.l   r14, @-sp
+
+        mov.l   var_gVertices_far, vertices
+
+        mov.l   var_gVerticesBase_far, vp
+        mov.l   @vp, vp
+
+        mov.l   var_gFacesBase_far, face
+        mov.l   @face, face
+
+.loop_farq:
+        // read flags and indices
+        mov.w   @polys+, flags
+        mov.w   @polys+, vp0
+        mov.w   @polys+, vp1
+        mov.w   @polys+, vp2
+        mov.w   @polys+, vp3
+        extu.w  flags, flags
+        // indices never exceed 32k, no need for extu.w
+
+        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
+        shll2   vp0
+        shll2   vp1
+        shll2   vp2
+        shll2   vp3
+
+        // get vertex address
+        add     vp, vp0
+        add     vp, vp1
+        add     vp, vp2
+        add     vp, vp3
+
+        // fetch ((g << 8) | clip)
+        mov     #VERTEX_G, tmp
+        mov.w   @(tmp, vp0), vg0
+        mov.w   @(tmp, vp1), vg1
+        mov.w   @(tmp, vp2), vg2
+        mov.w   @(tmp, vp3), vg3
+        // g on high-byte is 5 bits long, no need for extu.w
+
+        // check_clipping
+        mov     vg0, tmp
+        and     vg1, tmp
+        and     vg2, tmp
+        and     vg3, tmp
+        tst     #CLIP_DISCARD, tmp
+        bf/s    .skip_farq
+
+        // mark if should be clipped by frame
+        mov     vg0, tmp        // [delay slot]
+        or      vg1, tmp
+        or      vg2, tmp
+        or      vg3, tmp
+        tst     #CLIP_FRAME, tmp
+        bt/s    1f
+        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot]
+        or      tmp, flags
+
+1:      // compare VERTEX_G for gouraud rasterization
+        xor     vg0, vg1
+        xor     vg0, vg2
+        xor     vg0, vg3
+        or      vg2, vg1
+        or      vg3, vg1
+        shlr8   vg1             // shift down for g only
+        tst     vg1, vg1
+        bt/s    2f
+        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot]
+        add     tmp, flags
+
+2:      // check_backface
+        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
+        bt/s    .skip_farq
+        add     #VERTEX_Z, vp3  // [delay slot] ccw shifts p[0..2] address to VERTEX_Z, shift p3 too
+
+        // max_z4
+        mov.w   @vp0, vz0
+        mov.w   @vp1, vz1 
+        // check_z1
+        cmp/gt  vz0, vz1
+        bf/s    3f
+        mov.w   @vp2, vz2       // [delay slot]
+        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+3:      // check_z2
+        cmp/gt  vz0, vz2
+        bf/s    4f
+        mov.w   @vp3, vz3       // [delay slot]
+        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+4:      // check_z3
+        cmp/gt  vz0, vz3
+        bf      .face_add_farq  // TODO use delay slot but not for OT! )
+        mov     vz3, vz0        // if (z3 > z0) z0 = z3
+
+.face_add_farq:
+        mov.l   var_gOT_far, ot // not a delay slot (bf above is non-delayed); runs on both paths
+        // get absolute indices
+        // p address is 4 bytes ahead but it's fine for shlr3
+        // index = (p - vertices) / VERTEX_SIZEOF
+        sub     vertices, vp0
+        sub     vertices, vp1
+        sub     vertices, vp2
+        sub     vertices, vp3
+        shlr2   vp0
+        shlr2   vp1
+        shlr2   vp2
+        shlr2   vp3
+        shlr    vp0
+        shlr    vp1
+        shlr    vp2
+        shlr    vp3
+
+        // depth (vz0) >>= OT_SHIFT (4)
+        shlr2   depth
+        shlr2   depth
+
+        shll2   depth
+        add     ot, depth   // depth = gOT[depth]
+        mov.l   @depth, next
+        mov.l   face, @depth
+
+        add     #FACE_SIZEOF, face
+        mov     face, tmp
+
+        mov.w   vp3, @-tmp
+        mov.w   vp2, @-tmp
+        mov.w   vp1, @-tmp
+        mov.w   vp0, @-tmp
+        mov.l   next, @-tmp
+        mov.l   flags, @-tmp
+.skip_farq:
+        dt      count
+        bf      .loop_farq
+
+        mov.l   var_gFacesBase_far, tmp
+        mov.l   face, @tmp
+
+        // pop
+        mov.l   @sp+, r14
+        mov.l   @sp+, r13
+        mov.l   @sp+, r12
+        mov.l   @sp+, r11
+        mov.l   @sp+, r10
+        mov.l   @sp+, r9
+        rts
+        mov.l   @sp+, r8
+
+#undef tmp
+#undef face
+#undef vp
+#undef flags
+#undef polys
+#undef count
+#undef vp0
+#undef vp1
+#undef vp2
+#undef vp3
+#undef vg0
+#undef vg1
+#undef vg2
+#undef vg3
+#undef vertices
+#undef vx0
+#undef vy0
+#undef vx1
+#undef vy1
+#undef vx2
+#undef vy2
+#undef vz0
+#undef vz1
+#undef vz2
+#undef vz3
+#undef depth
+#undef next
+#undef ot
\ No newline at end of file
diff --git a/src/platform/32x/asm/faceAddRoomTriangles.i b/src/platform/32x/asm/faceAddRoomTriangles.i
new file mode 100644
index 0000000..56580c6
--- /dev/null
+++ b/src/platform/32x/asm/faceAddRoomTriangles.i
@@ -0,0 +1,199 @@
+#define tmp         r0
+#define face        r1
+#define vp          r2
+#define flags       r3
+#define polys       r4     // arg
+#define count       r5     // arg
+#define vp0         r6
+#define vp1         r7
+#define vp2         r8
+#define ot          r9
+#define vg0         r10
+#define vg1         r11
+#define vg2         r12
+#define vg3         r13
+#define vertices    r14
+
+#define vx0         vg0
+#define vy0         vg1
+#define vx1         vg2
+#define vy1         vg3
+#define vx2         tmp
+#define vy2         tmp
+
+#define vz0         vg0
+#define vz1         vg1
+#define vz2         vg2
+
+#define depth       vg0     // == vz0
+#define next        vg1
+
+.align 4
+.global _faceAddRoomTriangles_asm
+_faceAddRoomTriangles_asm:
+        // push
+        mov.l   r8, @-sp
+        mov.l   r9, @-sp
+        mov.l   r10, @-sp
+        mov.l   r11, @-sp
+        mov.l   r12, @-sp
+        mov.l   r13, @-sp
+        mov.l   r14, @-sp
+
+        mov.l   var_gVertices_far, vertices
+
+        mov.l   var_gVerticesBase_far, vp
+        mov.l   @vp, vp
+
+        mov.l   var_gFacesBase_far, face
+        mov.l   @face, face
+
+        mov.l   var_gOT_far, ot
+        nop
+
+.loop_fart:
+        // read flags and indices
+        mov.w   @polys+, flags
+        mov.w   @polys+, vp0
+        mov.w   @polys+, vp1
+        mov.w   @polys+, vp2
+        extu.w  flags, flags
+        // indices never exceed 32k, no need for extu.w
+
+        // p = gVerticesBase + index * VERTEX_SIZEOF (index is already multiplied by 2)
+        shll2   vp0
+        shll2   vp1
+        shll2   vp2
+
+        // get vertex address
+        add     vp, vp0
+        add     vp, vp1
+        add     vp, vp2
+
+        // fetch ((g << 8) | clip)
+        mov     #VERTEX_G, tmp
+        mov.w   @(tmp, vp0), vg0
+        mov.w   @(tmp, vp1), vg1
+        mov.w   @(tmp, vp2), vg2
+        // g on high-byte is 5 bits long, no need for extu.w
+
+        // check_clipping
+        mov     vg0, tmp
+        and     vg1, tmp
+        and     vg2, tmp 
+        tst     #CLIP_DISCARD, tmp
+        bf/s    .skip_fart
+
+        // mark if should be clipped by frame
+        mov     vg0, tmp        // [delay slot]
+        or      vg1, tmp
+        or      vg2, tmp
+        tst     #CLIP_FRAME, tmp
+        bt/s    1f
+        mov.l   const_FACE_CLIPPED_far, tmp     // [delay slot]
+        or      tmp, flags
+
+1:      // compare VERTEX_G for gouraud rasterization
+        xor     vg0, vg1
+        xor     vg0, vg2
+        or      vg2, vg1
+        shlr8   vg1             // shift down for g only
+        tst     vg1, vg1
+        bt/s    2f
+        mov.l   const_FACE_GOURAUD_far, tmp     // [delay slot]
+        add     tmp, flags
+
+2:      // check_backface
+        ccw     vp0, vp1, vp2, vx0, vy0, vx1, vy1, vx2, vy2
+        bt/s    .skip_fart
+        mov.l   const_FACE_TRIANGLE_far, tmp    // [delay slot]
+        or      tmp, flags
+
+        // max_z3
+        mov.w   @vp0, vz0
+        mov.w   @vp1, vz1
+        // check_z1
+        cmp/gt  vz0, vz1
+        bf/s    3f
+        mov.w   @vp2, vz2       // [delay slot]
+        mov     vz1, vz0        // if (z1 > z0) z0 = z1
+3:      // check_z2
+        cmp/gt  vz0, vz2
+        bf      .face_add_fart  // TODO use delay slot but not for OT! )
+        mov     vz2, vz0        // if (z2 > z0) z0 = z2
+
+.face_add_fart:
+        // get absolute indices
+        // p address is 4 bytes ahead but it's fine for shlr3
+        // index = (p - vertices) / VERTEX_SIZEOF
+        sub     vertices, vp0
+        sub     vertices, vp1
+        sub     vertices, vp2
+        shlr2   vp0
+        shlr2   vp1
+        shlr2   vp2
+        shlr    vp0
+        shlr    vp1
+        shlr    vp2
+
+        // depth (vz0) >>= OT_SHIFT (4)
+        shlr2   depth
+        shlr2   depth
+
+        shll2   depth
+        add     ot, depth   // depth = gOT[depth]
+        mov.l   @depth, next
+        mov.l   face, @depth
+
+        add     #FACE_SIZEOF, face
+        mov     face, tmp
+        add     #-2, tmp        // skip 4th index
+
+        mov.w   vp2, @-tmp
+        mov.w   vp1, @-tmp
+        mov.w   vp0, @-tmp
+        mov.l   next, @-tmp
+        mov.l   flags, @-tmp
+.skip_fart:
+        dt      count
+        bf      .loop_fart
+
+        mov.l   var_gFacesBase_far, tmp
+        mov.l   face, @tmp
+
+        // pop
+        mov.l   @sp+, r14
+        mov.l   @sp+, r13
+        mov.l   @sp+, r12
+        mov.l   @sp+, r11
+        mov.l   @sp+, r10
+        mov.l   @sp+, r9
+        rts
+        mov.l   @sp+, r8
+
+#undef tmp
+#undef face
+#undef vp
+#undef flags
+#undef polys
+#undef count
+#undef vp0
+#undef vp1
+#undef vp2
+#undef ot
+#undef vg0
+#undef vg1
+#undef vg2
+#undef vg3
+#undef vertices
+#undef vx0
+#undef vy0
+#undef vx1
+#undef vy1
+#undef vx2
+#undef vy2
+#undef vz0
+#undef vz1
+#undef vz2
+#undef depth
+#undef next
\ No newline at end of file
diff --git a/src/platform/32x/asm/rasterize.s b/src/platform/32x/asm/rasterize.i
similarity index 69%
rename from src/platform/32x/asm/rasterize.s
rename to src/platform/32x/asm/rasterize.i
index c8a2bf6..f9adc3f 100644
--- a/src/platform/32x/asm/rasterize.s
+++ b/src/platform/32x/asm/rasterize.i
@@ -1,18 +1,17 @@
-#include "common.i"
-SEG_RASTER
-
 #define type    r0
 #define proc    r1
 
 #define flags   r4      // arg
 #define L       r5      // arg
-#define R       r6
+#define tile    r6      // arg
+#define R       tile
 #define pixel   flags
 #define y       type
 
 .align 4
 .global _rasterize_asm
 _rasterize_asm:
+        mov     tile, r7
         mov     flags, type
         shll2   type
         shlr16  type
@@ -44,21 +43,32 @@ var_fb:
         // write per but allow transparent write for byte & word
         .long 0x24020200
 var_table:
-/* 2k on-chip test
-        .long 0xC0000000 + 516 + 416 + 256 + 18 //_rasterizeS_asm
-        .long 0xC0000000 + 516 + 416 + 18 //_rasterizeF_asm
-        .long 0xC0000000 + 516 + 18 //_rasterizeFT_asm
-        .long 0xC0000000 + 516 + 18 //_rasterizeFT_asm
-        .long 0xC0000000 + 20 //_rasterizeGT_asm
-        .long 0xC0000000 + 20 //_rasterizeGT_asm
-*/
+#ifdef ON_CHIP_RENDER
+        .long 0xC0000000 + _rasterizeS_asm - _block_render_start
+        .long 0xC0000000 + _rasterizeF_asm - _block_render_start
+        .long 0xC0000000 + _rasterizeFT_asm - _block_render_start
+        .long 0xC0000000 + _rasterizeFT_asm - _block_render_start
+        .long 0xC0000000 + _rasterizeGT_asm - _block_render_start
+        .long 0xC0000000 + _rasterizeGT_asm - _block_render_start
+#else
         .long _rasterizeS_asm
         .long _rasterizeF_asm
         .long _rasterizeFT_asm
         .long _rasterizeFT_asm
         .long _rasterizeGT_asm
         .long _rasterizeGT_asm
+#endif
         .long _rasterizeSprite_c
         .long _rasterizeFillS_c
         .long _rasterizeLineH_c
         .long _rasterizeLineV_c
+
+// release every register alias defined above so later includes start clean
+#undef type
+#undef proc
+#undef flags
+#undef L
+#undef tile
+#undef R
+#undef pixel
+#undef y
\ No newline at end of file
diff --git a/src/platform/32x/asm/rasterizeF.s b/src/platform/32x/asm/rasterizeF.i
similarity index 77%
rename from src/platform/32x/asm/rasterizeF.s
rename to src/platform/32x/asm/rasterizeF.i
index bc36e85..3c023fc 100644
--- a/src/platform/32x/asm/rasterizeF.s
+++ b/src/platform/32x/asm/rasterizeF.i
@@ -1,6 +1,3 @@
-#include "common.i"
-SEG_RASTER
-
 #define tmp     r0
 #define Lh      r1
 #define Rh      r2
@@ -8,7 +5,8 @@ SEG_RASTER
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define index   r6      // arg
-#define N       r7
+#define gtile   r7      // arg (unused)
+#define N       gtile
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
@@ -30,10 +28,7 @@ SEG_RASTER
 #define LMAP    inv
 
 .align 4
-.global _rasterizeF_asm_start
-_rasterizeF_asm_start:
-
-.exit:
+.exit_f:
         // pop
         mov.l   @sp+, r14
         mov.l   @sp+, r13
@@ -56,7 +51,7 @@ _rasterizeF_asm:
         mov.l   r13, @-sp
         mov.l   r14, @-sp
 
-        mov.l   var_LMAP_ADDR, LMAP
+        mov.l   var_LMAP_ADDR_fs, LMAP
         mov.b   @(VERTEX_G, L), tmp
         shll8   tmp
         add     index, tmp
@@ -68,15 +63,15 @@ _rasterizeF_asm:
 
         mov     L, R
 
-        mov.l   var_divTable, divLUT
+        mov.l   var_divTable_fs, divLUT
 
         mov     #0, Rh
         mov     #0, Lh
-.loop:
+.loop_f:
         tst     Lh, Lh
-        bf/s    .calc_left_end
+        bf/s    .calc_left_end_f
 
-.calc_left_start:
+.calc_left_start_f:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
         mov     tmp, N
         shll2   N
@@ -91,9 +86,9 @@ _rasterizeF_asm:
         mov.w   @tmp+, Lh
 
         cmp/ge  Ly, Lh
-        bf/s    .exit
+        bf/s    .exit_f
         cmp/eq  Ly, Lh          // [delay slot]
-        bt/s    .calc_left_start        // if (L->v.y == N->v.y) check next vertex
+        bt/s    .calc_left_start_f      // if (L->v.y == N->v.y) check next vertex
         mov     N, L            // [delay slot]
 
         sub     Lx, Ldx
@@ -106,12 +101,12 @@ _rasterizeF_asm:
         muls.w  ih, Ldx
           shll16  Lx            // [delay slot]
         sts     MACL, Ldx
-.calc_left_end:
+.calc_left_end_f:
 
         tst     Rh, Rh
-        bf/s    .calc_right_end
+        bf/s    .calc_right_end_f
 
-.calc_right_start:
+.calc_right_start_f:
         mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
         mov     tmp, N
         shll2   N
@@ -126,9 +121,9 @@ _rasterizeF_asm:
         mov.w   @tmp+, Rh
 
         cmp/ge  Ry, Rh
-        bf/s    .exit
+        bf/s    .exit_f
         cmp/eq  Ry, Rh          // [delay slot]
-        bt/s    .calc_right_start       // if (R->v.y == N->v.y) check next vertex
+        bt/s    .calc_right_start_f     // if (R->v.y == N->v.y) check next vertex
         mov     N, R            // [delay slot]
 
         sub     Rx, Rdx
@@ -141,21 +136,21 @@ _rasterizeF_asm:
         muls.w  ih, Rdx
           shll16  Rx            // [delay slot]
         sts     MACL, Rdx
-.calc_right_end:
+.calc_right_end_f:
 
         // h = min(Lh, Rh)
         cmp/gt  Rh, Lh
-        bf/s    .scanline_prepare
+        bf/s    .scanline_prepare_f
         mov     Lh, h           // [delay slot]
         mov     Rh, h
 
-.scanline_prepare:
+.scanline_prepare_f:
         sub     h, Lh
         sub     h, Rh
 
         mov.l   R, @-sp
         
-.scanline_start:
+.scanline_start_f:
         mov     Lx, Lptr
         mov     Rx, Rptr
         add     Ldx, Lx
@@ -163,7 +158,7 @@ _rasterizeF_asm:
         shlr16  Lptr            // Lptr = (Lx >> 16)
         shlr16  Rptr            // Rptr = (Rx >> 16)
         cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
-        bf/s    .scanline_end
+        bf/s    .scanline_end_f
 
         // iw = divTable[Rptr - Lptr]
         mov     Rptr, tmp       // [delay slot]
@@ -174,10 +169,10 @@ _rasterizeF_asm:
         add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
         add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
 
-.align_left:
+.align_left_f:
         mov     #1, tmp
         tst     tmp, Lptr
-        bt/s    .align_right
+        bt/s    .align_right_f
         tst     tmp, Rptr       // [delay slot]
 
         mov.b   dup, @Lptr
@@ -185,38 +180,50 @@ _rasterizeF_asm:
 
         mov     #1, tmp         // tmp = 1 (for align_right)
         cmp/gt  Lptr, Rptr
-        bf/s    .scanline_end
+        bf/s    .scanline_end_f
         tst     tmp, Rptr
 
-.align_right:
-        bt      .block_2px
+.align_right_f:
+        bt      .block_2px_f
         mov.b   dup, @-Rptr
         cmp/gt  Lptr, Rptr
-        bf      .scanline_end
+        bf      .scanline_end_f
 
-.block_2px:
+.block_2px_f:
         mov.w   dup, @-Rptr
         cmp/gt  Lptr, Rptr
-        bt      .block_2px
+        bt      .block_2px_f
 
-.scanline_end:
+.scanline_end_f:
         dt      h
 
-        mov.w   var_frameWidth, tmp
-        bf/s    .scanline_start
+        mov.w   var_frameWidth_fs, tmp
+        bf/s    .scanline_start_f
         add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
 
-        bra     .loop
+        bra     .loop_f
         mov.l   @sp+, R
 
-var_frameWidth:
-        .word   FRAME_WIDTH
-.align 2
-var_LMAP_ADDR:
-        .long   _gLightmap_base
-var_divTable:
-        .long   _divTable
-
-.align 2
-.global _rasterizeF_asm_end
-_rasterizeF_asm_end:
\ No newline at end of file
+#undef tmp
+#undef Lh
+#undef Rh
+#undef Lptr
+#undef pixel
+#undef L
+#undef index
+#undef N
+#undef Lx
+#undef Rx
+#undef Ldx
+#undef Rdx
+#undef dup
+#undef inv
+#undef divLUT
+#undef R
+#undef h
+#undef Ry
+#undef Ly
+#undef Rptr
+#undef iw
+#undef ih
+#undef LMAP
diff --git a/src/platform/32x/asm/rasterizeFT.s b/src/platform/32x/asm/rasterizeFT.i
similarity index 79%
rename from src/platform/32x/asm/rasterizeFT.s
rename to src/platform/32x/asm/rasterizeFT.i
index a7cc524..f617727 100644
--- a/src/platform/32x/asm/rasterizeFT.s
+++ b/src/platform/32x/asm/rasterizeFT.i
@@ -1,6 +1,3 @@
-#include "common.i"
-SEG_RASTER
-
 #define tmp     r0
 #define Lh      r1
 #define Rh      r2
@@ -8,7 +5,8 @@ SEG_RASTER
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define R       r6      // arg
-#define N       r7
+#define gtile   r7      // arg
+#define N       gtile
 #define Lx      r8
 #define Rx      r9
 #define Lt      r10
@@ -47,20 +45,17 @@ SEG_RASTER
 #define sLdt    Lh
 #define sRdt    Rh
 
-SP_LDX = 0
-SP_RDX = 4
-SP_LDT = 8
-SP_RDT = 12
-SP_H   = 16
-SP_L   = 20
-SP_R   = 24
-SP_SIZE = 28
+#define SP_LDX  0
+#define SP_RDX  4
+#define SP_LDT  8
+#define SP_RDT  12
+#define SP_H    16
+#define SP_L    20
+#define SP_R    24
+#define SP_SIZE 28
 
 .align 4
-.global _rasterizeFT_asm_start
-_rasterizeFT_asm_start:
-
-.exit:
+.exit_ft:
         // pop
         add     #SP_SIZE, sp
         mov.l   @sp+, r14
@@ -91,17 +86,17 @@ _rasterizeFT_asm:
 
         mov.l   var_divTable, divLUT
 
-        mov.l   var_gTile, TILE
-        mov.l   @TILE, TILE
+        mov     gtile, TILE
+        nop
 
         mov     #0, Rh
-.loop:
+.loop_ft:
         extu.w  Rh, Lh  // Lh = int16(Rh)
 
         tst     Lh, Lh
-        bf/s    .calc_left_end
+        bf/s    .calc_left_end_ft
 
-.calc_left_start:
+.calc_left_start_ft:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
         mov     tmp, N
 
@@ -113,10 +108,10 @@ _rasterizeFT_asm:
         mov.w   @(VERTEX_Y, N), tmp
         sub     Ly, tmp
         cmp/pz  tmp
-        bf/s    .exit
+        bf/s    .exit_ft
         tst     tmp, tmp
         mov     L, Lv           // Lv = L
-        bt/s    .calc_left_start        // if (Lh == 0) check next vertex
+        bt/s    .calc_left_start_ft      // if (Lh == 0) check next vertex
         mov     N, L            // [delay slot]
 
         mov     tmp, Lh
@@ -126,7 +121,7 @@ _rasterizeFT_asm:
 
         mov     Lh, tmp
         cmp/eq  #1, tmp
-        bt/s    .calc_left_end
+        bt/s    .calc_left_end_ft
         shll    tmp             // [delay slot]
 
         mov.w   @(tmp, divLUT), ih
@@ -144,13 +139,13 @@ _rasterizeFT_asm:
         // calc Ldt
         scaleUV Ldt, tmp, ih
         mov.l   tmp, @(SP_LDT, sp)
-.calc_left_end:
+.calc_left_end_ft:
 
         shlr16  Rh              // Rh = (Rh >> 16)
         tst     Rh, Rh
-        bf/s    .calc_right_end
+        bf/s    .calc_right_end_ft
 
-.calc_right_start:
+.calc_right_start_ft:
         mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
         mov     tmp, N
 
@@ -162,10 +157,10 @@ _rasterizeFT_asm:
         mov.w   @(VERTEX_Y, N), tmp
         sub     Ry, tmp
         cmp/pz  tmp
-        bf/s    .exit
+        bf/s    .exit_ft
         tst     tmp, tmp
         mov     R, Rv           // Rv = R
-        bt/s    .calc_right_start       // if (Rh == 0) check next vertex
+        bt/s    .calc_right_start_ft     // if (Rh == 0) check next vertex
         mov     N, R            // [delay slot]
 
         mov     tmp, Rh
@@ -175,7 +170,7 @@ _rasterizeFT_asm:
 
         mov     Rh, tmp
         cmp/eq  #1, tmp
-        bt/s    .calc_right_end
+        bt/s    .calc_right_end_ft
         shll    tmp             // [delay slot]
 
         mov.w   @(tmp, divLUT), ih
@@ -193,15 +188,15 @@ _rasterizeFT_asm:
         // calc Rdt
         scaleUV Rdt, tmp, ih
         mov.l   tmp, @(SP_RDT, sp)
-.calc_right_end:
+.calc_right_end_ft:
 
         // h = min(Lh, Rh)
         cmp/gt  Rh, Lh
-        bf/s    .scanline_prepare
+        bf/s    .scanline_prepare_ft
         mov     Lh, h           // [delay slot]
         mov     Rh, h
 
-.scanline_prepare:
+.scanline_prepare_ft:
         sub     h, Lh
         sub     h, Rh
 
@@ -212,13 +207,13 @@ _rasterizeFT_asm:
         mov.l   L, @(SP_L, sp)
         mov.l   R, @(SP_R, sp)
         
-.scanline_start:
+.scanline_start_ft:
         mov     Lx, Lptr
         mov     Rx, Rptr
         shlr16  Lptr            // Lptr = (Lx >> 16)
         shlr16  Rptr            // Rptr = (Rx >> 16)
         cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
-        bf/s    .scanline_end
+        bf/s    .scanline_end_ft
 
         // iw = divTable[Rptr - Lptr]
         mov     Rptr, tmp       // [delay slot]
@@ -240,10 +235,10 @@ _rasterizeFT_asm:
         shlr16  tmp
         xtrct   tmp, dtdx       // out = uint16(v >> 16) | (u & 0xFFFF0000)
 
-.align_left:
+.align_left_ft:
         mov     #1, tmp
         tst     tmp, Lptr
-        bt/s    .align_right
+        bt/s    .align_right_ft
         tst     tmp, Rptr       // [delay slot]
 
         getUV   Lt, index
@@ -254,11 +249,11 @@ _rasterizeFT_asm:
 
         mov     #1, tmp         // tmp = 1 (for align_right)
         cmp/gt  Lptr, Rptr
-        bf/s    .scanline_end
+        bf/s    .scanline_end_ft
         tst     tmp, Rptr
 
-.align_right:
-        bt/s    .block_prepare
+.align_right_ft:
+        bt/s    .block_prepare_ft
 
         getUV   t, index
         mov.b   @(index, TILE), index
@@ -267,12 +262,12 @@ _rasterizeFT_asm:
         mov.b   index, @-Rptr
 
         cmp/gt  Lptr, Rptr
-        bf/s    .scanline_end
+        bf/s    .scanline_end_ft
 
-.block_prepare:
+.block_prepare_ft:
         shll    dtdx            // [delay slot] optional
 
-.block_2px:
+.block_2px_ft:
         swap.b  t, index        // UUuuvvVV
         swap.w  index, index    // vvVVUUuu
         shll8   index           // VVUUuu00
@@ -286,10 +281,10 @@ _rasterizeFT_asm:
         mov.w   dup, @-Rptr
 
         cmp/gt  Lptr, Rptr
-        bt/s    .block_2px
+        bt/s    .block_2px_ft
         sub     dtdx, t         // [delay slot] t -= dtdx
 
-.scanline_end:
+.scanline_end_ft:
         mov.l   @(SP_LDX, sp), sLdx
         mov.l   @(SP_RDX, sp), sRdx
         mov.l   @(SP_LDT, sp), sLdt
@@ -302,25 +297,58 @@ _rasterizeFT_asm:
 
         dt      h
 
-        mov.w   var_frameWidth, tmp
-        bf/s    .scanline_start
+        mov.w   var_frameWidth_ft, tmp
+        bf/s    .scanline_start_ft
         add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
 
         mov.l   @(SP_L, sp), L
         mov.l   @(SP_R, sp), R
-        bra     .loop
+        bra     .loop_ft
         mov.l   @(SP_H, sp), Rh
 
-var_frameWidth:
+var_frameWidth_ft:
         .word   FRAME_WIDTH
-.align 2
-var_LMAP_ADDR:
-        .long   _gLightmap_base
-var_divTable:
-        .long   _divTable
-var_gTile:
-        .long   _gTile
 
-.align 2
-.global _rasterizeFT_asm_end
-_rasterizeFT_asm_end:
+#undef tmp
+#undef Lh
+#undef Rh
+#undef LMAP
+#undef pixel
+#undef L
+#undef R
+#undef N
+#undef Lx
+#undef Rx
+#undef Lt
+#undef Rt
+#undef dup
+#undef TILE
+#undef divLUT
+#undef h
+#undef Ldx
+#undef Rdx
+#undef Ldt
+#undef Rdt
+#undef Ry
+#undef Ly
+#undef Rv
+#undef Lv
+#undef Lptr
+#undef Rptr
+#undef t
+#undef dtdx
+#undef index
+#undef iw
+#undef ih
+#undef sLdx
+#undef sRdx
+#undef sLdt
+#undef sRdt
+#undef SP_LDX
+#undef SP_RDX
+#undef SP_LDT
+#undef SP_RDT
+#undef SP_H
+#undef SP_L
+#undef SP_R
+#undef SP_SIZE
diff --git a/src/platform/32x/asm/rasterizeGT.s b/src/platform/32x/asm/rasterizeGT.i
similarity index 81%
rename from src/platform/32x/asm/rasterizeGT.s
rename to src/platform/32x/asm/rasterizeGT.i
index aa34f8b..2f23cef 100644
--- a/src/platform/32x/asm/rasterizeGT.s
+++ b/src/platform/32x/asm/rasterizeGT.i
@@ -1,6 +1,3 @@
-#include "common.i"
-SEG_RASTER
-
 #define tmp     r0
 #define Lh      r1
 #define Rh      r2
@@ -8,7 +5,8 @@ SEG_RASTER
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define R       r6      // arg
-#define N       r7
+#define gtile   r7      // arg
+#define N       gtile
 #define Lx      r8
 #define Rx      r9
 #define Lg      r10
@@ -57,23 +55,19 @@ SEG_RASTER
 #define sLdg    L
 #define sRdg    R
 
-SP_LDX = 0
-SP_RDX = 4
-SP_LDT = 8
-SP_RDT = 12
-SP_LDG = 16
-SP_RDG = 18
-SP_H   = 20
-SP_L   = 24
-SP_R   = 28
-SP_SIZE = 32
+#define SP_LDX  0
+#define SP_RDX  4
+#define SP_LDT  8
+#define SP_RDT  12
+#define SP_LDG  16
+#define SP_RDG  18
+#define SP_H    20
+#define SP_L    24
+#define SP_R    28
+#define SP_SIZE 32
 
 .align 4
-
-.global _rasterizeGT_asm_start
-_rasterizeGT_asm_start:
-
-.exit:
+.exit_gt:
         // pop
         add     #SP_SIZE, sp
         mov.l   @sp+, r14
@@ -98,18 +92,18 @@ _rasterizeGT_asm:
         mov.l   r14, @-sp
         add     #-SP_SIZE, sp
 
-        mov.l   var_gTile, TILE
-        mov.l   @TILE, TILE
+        mov     gtile, TILE
+        nop
 
         mov     #0, Rh
 
-.loop:
+.loop_gt:
         extu.w  Rh, Lh  // Lh = int16(Rh)
 
         tst     Lh, Lh
-        bf/s    .calc_left_end
+        bf/s    .calc_left_end_gt
 
-.calc_left_start:
+.calc_left_start_gt:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
         mov     tmp, N
 
@@ -121,10 +115,10 @@ _rasterizeGT_asm:
         mov.w   @(VERTEX_Y, N), tmp
         sub     Ly, tmp
         cmp/pz  tmp
-        bf/s    .exit
+        bf/s    .exit_gt
         tst     tmp, tmp
         mov     L, Lv           // Lv = L
-        bt/s    .calc_left_start        // if (Lh == 0) check next vertex
+        bt/s    .calc_left_start_gt      // if (Lh == 0) check next vertex
         mov     N, L            // [delay slot]
 
         mov     tmp, Lh
@@ -137,7 +131,7 @@ _rasterizeGT_asm:
 
         mov     Lh, tmp
         cmp/eq  #1, tmp
-        bt/s    .calc_left_end
+        bt/s    .calc_left_end_gt
         shll    tmp             // [delay slot]
 
         mov.l   var_divTable, divLUT
@@ -165,13 +159,13 @@ _rasterizeGT_asm:
         // calc Ldt
         scaleUV Ldt, tmp, ih
         mov.l   tmp, @(SP_LDT, sp)
-.calc_left_end:
+.calc_left_end_gt:
 
         shlr16  Rh              // Rh = (Rh >> 16)
         tst     Rh, Rh
-        bf/s    .calc_right_end
+        bf/s    .calc_right_end_gt
 
-.calc_right_start:
+.calc_right_start_gt:
         mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
         mov     tmp, N
 
@@ -183,10 +177,10 @@ _rasterizeGT_asm:
         mov.w   @(VERTEX_Y, N), tmp
         sub     Ry, tmp
         cmp/pz  tmp
-        bf/s    .exit
+        bf/s    .exit_gt
         tst     tmp, tmp
         mov     R, Rv           // Rv = R
-        bt/s    .calc_right_start       // if (Rh == 0) check next vertex
+        bt/s    .calc_right_start_gt     // if (Rh == 0) check next vertex
         mov     N, R            // [delay slot]
 
         mov     tmp, Rh
@@ -199,7 +193,7 @@ _rasterizeGT_asm:
 
         mov     Rh, tmp
         cmp/eq  #1, tmp
-        bt/s    .calc_right_end
+        bt/s    .calc_right_end_gt
         shll    tmp             // [delay slot]
 
         mov.l   var_divTable, divLUT
@@ -227,7 +221,7 @@ _rasterizeGT_asm:
         // calc Rdt
         scaleUV Rdt, tmp, ih
         mov.l   tmp, @(SP_RDT, sp)
-.calc_right_end:
+.calc_right_end_gt:
 
         // bake gLightmap address into g value
         mov.l   var_LMAP_ADDR, tmp
@@ -236,11 +230,11 @@ _rasterizeGT_asm:
 
         // h = min(Lh, Rh)
         cmp/gt  Rh, Lh
-        bf/s    .scanline_prepare
+        bf/s    .scanline_prepare_gt
         mov     Lh, h           // [delay slot]
         mov     Rh, h
 
-.scanline_prepare:
+.scanline_prepare_gt:
         sub     h, Lh
         sub     h, Rh
 
@@ -251,16 +245,16 @@ _rasterizeGT_asm:
         mov.l   L, @(SP_L, sp)
         mov.l   R, @(SP_R, sp)
 
-        mov.l   var_mask, mask
+        mov.w   var_mask, mask
         
-.scanline_start:
+.scanline_start_gt:
         mov.l   Rx, @-sp        // alias Rptr
 
         mov     Lx, Lptr
         shlr16  Lptr            // Lptr = (Lx >> 16)
         shlr16  Rptr            // Rptr = (Rx >> 16)
         cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
-        bf/s    .scanline_end_fast
+        bf/s    .scanline_end_fast_gt
 
         // iw = divTable[Rptr - Lptr]
         mov     Rptr, tmp       // [delay slot]
@@ -296,8 +290,8 @@ _rasterizeGT_asm:
         shlr16  dgdx
         exts.w  dgdx, dgdx
 
-.align_left:
-        bt/s    .align_right
+.align_left_gt:
+        bt/s    .align_right_gt
         tst     tmp, Rptr       // [delay slot]
 
         getUV   Lt, index
@@ -311,11 +305,11 @@ _rasterizeGT_asm:
 
         mov     #1, tmp         // tmp = 1 (for align_right)
         cmp/gt  Lptr, Rptr
-        bf/s    .scanline_end
+        bf/s    .scanline_end_gt
         tst     tmp, Rptr
 
-.align_right:
-        bt/s    .block_prepare
+.align_right_gt:
+        bt/s    .block_prepare_gt
         mov     g, LMAP
 
         getUV   t, index
@@ -329,13 +323,13 @@ _rasterizeGT_asm:
         mov.b   index, @-Rptr
 
         cmp/gt  Lptr, Rptr
-        bf/s    .scanline_end
+        bf/s    .scanline_end_gt
 
-.block_prepare:
+.block_prepare_gt:
         shll    dtdx            // [delay slot] optional
         shll    dgdx
 
-.block_2px:
+.block_2px_gt:
         swap.b  t, index        // UUuuvvVV
         swap.w  index, index    // vvVVUUuu
         shll8   index           // VVUUuu00
@@ -353,13 +347,13 @@ _rasterizeGT_asm:
         mov.w   dup, @-Rptr
 
         cmp/gt  Lptr, Rptr
-        bt/s    .block_2px
+        bt/s    .block_2px_gt
         sub     dtdx, t         // [delay slot] t -= dtdx
 
-.scanline_end:
+.scanline_end_gt:
         mov.l   @sp+, Rg
         mov.l   @sp+, Rt
-.scanline_end_fast:
+.scanline_end_fast_gt:
         mov.l   @sp+, Rx
 
         mov     sp, tmp
@@ -385,26 +379,64 @@ _rasterizeGT_asm:
         dt      h
 
         mov.w   var_frameWidth, tmp
-        bf/s    .scanline_start
+        bf/s    .scanline_start_gt
         add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
 
         mov.l   @(SP_L, sp), L
         mov.l   @(SP_R, sp), R
-        bra     .loop
+        bra     .loop_gt
         mov.l   @(SP_H, sp), Rh
 
-var_frameWidth:
-        .word   FRAME_WIDTH
-.align 2
-var_LMAP_ADDR:
-        .long   _gLightmap_base
-var_mask:
-        .long    0xFFFFFF00
-var_divTable:
-        .long   _divTable
-var_gTile:
-        .long   _gTile
-
-.align 2
-.global _rasterizeGT_asm_end
-_rasterizeGT_asm_end:
+#undef tmp
+#undef Lh
+#undef Rh
+#undef dup
+#undef pixel
+#undef L
+#undef R
+#undef N
+#undef Lx
+#undef Rx
+#undef Lg
+#undef Rg
+#undef Lt
+#undef Rt
+#undef TILE
+#undef h
+#undef Ldx
+#undef Rdx
+#undef Ldt
+#undef Rdt
+#undef Ry
+#undef Ly
+#undef Rv
+#undef Lv
+#undef Lptr
+#undef Rptr
+#undef g
+#undef dgdx
+#undef t
+#undef dtdx
+#undef index
+#undef LMAP
+#undef divLUT
+#undef iw
+#undef ih
+#undef dx
+#undef mask
+#undef sLdx
+#undef sRdx
+#undef sLdt
+#undef sRdt
+#undef sLdg
+#undef sRdg
+#undef SP_LDX
+#undef SP_RDX
+#undef SP_LDT
+#undef SP_RDT
+#undef SP_LDG
+#undef SP_RDG
+#undef SP_H
+#undef SP_L
+#undef SP_R
+#undef SP_SIZE
diff --git a/src/platform/32x/asm/rasterizeS.s b/src/platform/32x/asm/rasterizeS.i
similarity index 78%
rename from src/platform/32x/asm/rasterizeS.s
rename to src/platform/32x/asm/rasterizeS.i
index 5ee61ba..a1ef8bc 100644
--- a/src/platform/32x/asm/rasterizeS.s
+++ b/src/platform/32x/asm/rasterizeS.i
@@ -1,6 +1,3 @@
-#include "common.i"
-SEG_RASTER
-
 #define tmp     r0
 #define Lh      r1
 #define Rh      r2
@@ -8,7 +5,8 @@ SEG_RASTER
 #define pixel   r4      // arg
 #define L       r5      // arg
 #define R       r6      // arg
-#define N       r7
+#define gtile   r7      // arg (unused)
+#define N       gtile
 #define Lx      r8
 #define Rx      r9
 #define Ldx     r10
@@ -29,10 +27,7 @@ SEG_RASTER
 #define ih      inv
 
 .align 4
-.global _rasterizeS_asm_start
-_rasterizeS_asm_start:
-
-.exit:
+.exit_s:
         // pop
         mov.l   @sp+, r14
         mov.l   @sp+, r13
@@ -55,20 +50,20 @@ _rasterizeS_asm:
         mov.l   r13, @-sp
         mov.l   r14, @-sp
 
-        mov.l   var_LMAP_ADDR, LMAP
+        mov.l   var_LMAP_ADDR_fs, LMAP
         mov     #27, tmp
         shll8   tmp
         or      tmp, LMAP
 
-        mov.l   var_divTable, divLUT
+        mov.l   var_divTable_fs, divLUT
 
         mov     #0, Rh
         mov     #0, Lh
-.loop:
+.loop_s:
         tst     Lh, Lh
-        bf/s    .calc_left_end
+        bf/s    .calc_left_end_s
 
-.calc_left_start:
+.calc_left_start_s:
         mov.b   @(VERTEX_PREV, L), tmp  // [delay slot]
         mov     tmp, N
         shll2   N
@@ -83,9 +78,9 @@ _rasterizeS_asm:
         mov.w   @tmp+, Lh
 
         cmp/ge  Ly, Lh
-        bf/s    .exit
+        bf/s    .exit_s
         cmp/eq  Ly, Lh          // [delay slot]
-        bt/s    .calc_left_start        // if (L->v.y == N->v.y) check next vertex
+        bt/s    .calc_left_start_s      // if (L->v.y == N->v.y) check next vertex
         mov     N, L            // [delay slot]
 
         sub     Lx, Ldx
@@ -98,12 +93,12 @@ _rasterizeS_asm:
         muls.w  ih, Ldx
           shll16  Lx            // [delay slot]
         sts     MACL, Ldx
-.calc_left_end:
+.calc_left_end_s:
 
         tst     Rh, Rh
-        bf/s    .calc_right_end
+        bf/s    .calc_right_end_s
 
-.calc_right_start:
+.calc_right_start_s:
         mov.b   @(VERTEX_NEXT, R), tmp  // [delay slot]
         mov     tmp, N
         shll2   N
@@ -118,9 +113,9 @@ _rasterizeS_asm:
         mov.w   @tmp+, Rh
 
         cmp/ge  Ry, Rh
-        bf/s    .exit
+        bf/s    .exit_s
         cmp/eq  Ry, Rh          // [delay slot]
-        bt/s    .calc_right_start       // if (R->v.y == N->v.y) check next vertex
+        bt/s    .calc_right_start_s     // if (R->v.y == N->v.y) check next vertex
         mov     N, R            // [delay slot]
 
         sub     Rx, Rdx
@@ -133,21 +128,21 @@ _rasterizeS_asm:
         muls.w  ih, Rdx
           shll16  Rx            // [delay slot]
         sts     MACL, Rdx
-.calc_right_end:
+.calc_right_end_s:
 
         // h = min(Lh, Rh)
         cmp/gt  Rh, Lh
-        bf/s    .scanline_prepare
+        bf/s    .scanline_prepare_s
         mov     Lh, h           // [delay slot]
         mov     Rh, h
 
-.scanline_prepare:
+.scanline_prepare_s:
         sub     h, Lh
         sub     h, Rh
 
         mov.l   R, @-sp
         
-.scanline_start:
+.scanline_start_s:
         mov     Lx, Lptr
         mov     Rx, Rptr
         add     Ldx, Lx
@@ -155,7 +150,7 @@ _rasterizeS_asm:
         shlr16  Lptr            // Lptr = (Lx >> 16)
         shlr16  Rptr            // Rptr = (Rx >> 16)
         cmp/gt  Lptr, Rptr      // if (!(Rptr > Lptr)) skip zero length scanline
-        bf/s    .scanline_end
+        bf/s    .scanline_end_s
 
         // iw = divTable[Rptr - Lptr]
         mov     Rptr, tmp       // [delay slot]
@@ -166,32 +161,43 @@ _rasterizeS_asm:
         add     pixel, Lptr   // Lptr = pixel + (Lx >> 16)
         add     pixel, Rptr   // Rptr = pixel + (Rx >> 16)
 
-.shade_pixel:
+.shade_pixel_s:
         mov.b   @Lptr, index
         mov.b   @(index, LMAP), index
         mov.b   index, @Lptr
         add     #1, Lptr
         cmp/gt  Lptr, Rptr
-        bt      .shade_pixel
+        bt      .shade_pixel_s
 
-.scanline_end:
+.scanline_end_s:
         dt      h
 
-        mov.w   var_frameWidth, tmp
-        bf/s    .scanline_start
+        mov.w   var_frameWidth_fs, tmp
+        bf/s    .scanline_start_s
         add     tmp, pixel      // [delay slot] pixel += 120 + 120 + 80
 
-        bra     .loop
+        bra     .loop_s
         mov.l   @sp+, R
 
-var_frameWidth:
-        .word   FRAME_WIDTH
-.align 2
-var_LMAP_ADDR:
-        .long   _gLightmap_base
-var_divTable:
-        .long   _divTable
-
-.align 2
-.global _rasterizeS_asm_end
-_rasterizeS_asm_end:
\ No newline at end of file
+#undef tmp
+#undef Lh
+#undef Rh
+#undef Lptr
+#undef pixel
+#undef L
+#undef R
+#undef N
+#undef Lx
+#undef Rx
+#undef Ldx
+#undef Rdx
+#undef LMAP
+#undef inv
+#undef divLUT
+#undef index
+#undef h
+#undef Ry
+#undef Ly
+#undef Rptr
+#undef iw
+#undef ih
diff --git a/src/platform/32x/asm/rasterize_dummy.s b/src/platform/32x/asm/rasterize_dummy.i
similarity index 73%
rename from src/platform/32x/asm/rasterize_dummy.s
rename to src/platform/32x/asm/rasterize_dummy.i
index 1946774..a80b2ff 100644
--- a/src/platform/32x/asm/rasterize_dummy.s
+++ b/src/platform/32x/asm/rasterize_dummy.i
@@ -1,6 +1,3 @@
-#include "common.i"
-
-.text
 .align 4
 .global _rasterize_dummy
 _rasterize_dummy:
diff --git a/src/platform/32x/asm/transformMesh.s b/src/platform/32x/asm/transformMesh.i
similarity index 84%
rename from src/platform/32x/asm/transformMesh.s
rename to src/platform/32x/asm/transformMesh.i
index b5f45d4..75f2095 100644
--- a/src/platform/32x/asm/transformMesh.s
+++ b/src/platform/32x/asm/transformMesh.i
@@ -1,6 +1,3 @@
-#include "common.i"
-SEG_TRANS
-
 #define tmp             r0
 #define maxZ            r1
 #define divLUT          r2
@@ -63,13 +60,13 @@ _transformMesh_asm:
         exts.b  ambient, vg
 
         // vg = clamp(vg, 0, 31) + 1
-.vg_max:
+.vg_max_m:
         mov     #31, tmp
         cmp/gt  tmp, vg
-        bf/s    .vg_min
-        cmp/pz  vg              // T = vg >= 0
+        bf/s    .vg_min_m
+        cmp/pz  vg              // [delay slot] T = vg >= 0
         mov     tmp, vg
-.vg_min:
+.vg_min_m:
         subc    tmp, tmp        // tmp = -T
         and     tmp, vg
 
@@ -88,7 +85,7 @@ _transformMesh_asm:
         shll16  mz
         add     #-MATRIX_SIZEOF, m
 
-.loop:
+.loop_m:
         // clear clipping flags
         shlr8   vg
         shll8   vg
@@ -101,20 +98,20 @@ _transformMesh_asm:
         transform z, mz
 
         // z clipping
-.clip_z_near:
+.clip_z_near_m:
         mov     #VIEW_MIN, minZ // 64
         cmp/gt  z, minZ
-        bf/s    .clip_z_far
-        cmp/ge  maxZ, z
+        bf/s    .clip_z_far_m
+        cmp/ge  maxZ, z         // [delay slot]
         mov     minZ, z
         add     #CLIP_NEAR, vg
-.clip_z_far:
-        bf/s    .project
+.clip_z_far_m:
+        bf/s    .project_m
         mov     z, dz           // [delay slot] dz = z
         mov     maxZ, z
         add     #CLIP_FAR, vg
 
-.project:
+.project_m:
         // dz = divTable[z >> (PROJ_SHIFT = 4)]
         shlr2   dz
         shlr2   dz
@@ -137,34 +134,34 @@ _transformMesh_asm:
         shlr16  y
         exts.w  y, y
 
-.apply_offset:
+        // apply_offset: shift x/y by half the frame size (160, 112)
         // x += FRAME_WIDTH / 2 (160)
         add     #100, x         // x += 100
         add     #60, x          // x += 60
         // y += FRAME_HEIGHT / 2 (112)
         add     #112, y         // y += 112
 
-.clip_frame_x:  // 0 < x > FRAME_WIDTH
+        // clip_frame_x: x < 0 || x > FRAME_WIDTH (single unsigned compare)
         mov     #80, tmp
         shll2   tmp             // tmp = 80 * 4 = 320 = FRAME_WIDTH
         cmp/hi  tmp, x
-        bt/s    .clip_frame
+        bt/s    .clip_frame_m
         add     #-96, tmp       // [delay slot] tmp = 320 - 96 = 224 = FRAME_HEIGHT
-.clip_frame_y:  // 0 < y > FRAME_HEIGHT
+        // clip_frame_y: y < 0 || y > FRAME_HEIGHT (single unsigned compare)
         cmp/hi  tmp, y
-.clip_frame:
+.clip_frame_m:
         movt    tmp
         or      tmp, vg         // vg |= CLIP_FRAME
 
-.store_vertex:
+        // store_vertex: pre-decrement writes of vg, z, y, x (memory order: x, y, z, vg)
         mov.w   vg, @-res
         mov.w   z, @-res
         mov.w   y, @-res
         mov.w   x, @-res
 
         dt      count
-        bf/s    .loop
-        add     #16, res
+        bf/s    .loop_m
+        add     #16, res        // [delay slot]
 
         // pop
         mov.l   @sp+, r13
@@ -175,12 +172,21 @@ _transformMesh_asm:
         rts
         mov.l   @sp+, r8
 
-.align 2
-var_gVerticesBase:
-        .long   _gVerticesBase
-var_gMatrixPtr:
-        .long   _gMatrixPtr
-var_gLightAmbient:
-        .long   _gLightAmbient
-var_divTable:
-        .long   _divTable
+#undef tmp
+#undef maxZ
+#undef divLUT
+#undef res
+#undef vertices
+#undef count
+#undef intensity
+#undef m
+#undef x
+#undef y
+#undef z
+#undef mx
+#undef my
+#undef mz
+#undef vg
+#undef ambient
+#undef dz
+#undef minZ
\ No newline at end of file
diff --git a/src/platform/32x/asm/transformRoom.s b/src/platform/32x/asm/transformRoom.i
similarity index 79%
rename from src/platform/32x/asm/transformRoom.s
rename to src/platform/32x/asm/transformRoom.i
index 8bde783..4254a74 100644
--- a/src/platform/32x/asm/transformRoom.s
+++ b/src/platform/32x/asm/transformRoom.i
@@ -1,6 +1,3 @@
-#include "common.i"
-SEG_TRANS
-
 #define tmp             r0
 #define maxZ            r1
 #define divLUT          r2
@@ -56,11 +53,11 @@ _transformRoom_asm:
 
         // copy 3x3 matrix rotation part
         mov     #9, cnt
-.copyMtx:
+.copyMtx_r:
         mov.w   @tmp+, mx
         dt      cnt
-        bf/s    .copyMtx
-        mov.w   mx, @-stackMtx
+        bf/s    .copyMtx_r
+        mov.w   mx, @-stackMtx  // [delay slot]
 
         // prepare offsets (const)
         mov.w   @tmp+, mx
@@ -73,7 +70,7 @@ _transformRoom_asm:
         add     #8, res         // extra offset for @-Rn
         nop
 
-.loop:
+.loop_r:
         // unpack vertex
         mov.b   @vertices+, x
         mov.b   @vertices+, y
@@ -105,7 +102,7 @@ _transformRoom_asm:
         exts.w  z, z
 
 
-.z_range_check: // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
+        // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
         // tmp = z + VIEW_OFF = z + 4096
         mov     #16, tmp
         shll8   tmp
@@ -115,18 +112,18 @@ _transformRoom_asm:
         shll8   maxZ
         // check if z in [-VIEW_OFF..VIEW_MAX + VIEW_OFF]
         cmp/hi  maxZ, tmp
-        bf/s    .visible
+        bf/s    .visible_r
         mov     #40, maxZ       // [delay slot] maxZ = 40
         mov     #(CLIP_NEAR + CLIP_FAR), vg
         mov.w   vg, @-res
         add     #1, vertices
         dt      count
-        bf/s    .loop
-        add     #10, res
-        bra     .done
+        bf/s    .loop_r
+        add     #10, res        // [delay slot]
+        bra     .done_r
         nop
 
-.visible:
+.visible_r:
         //transform y
         lds     my, MACL
         mac.w   @stackVtx+, @stackMtx+
@@ -154,8 +151,8 @@ _transformRoom_asm:
         shll8   tmp
         // if z <= FOG_MIN -> skip fog calc
         cmp/gt  tmp, z
-        bf/s    .clip_z_near
-        mov     z, fog
+        bf/s    .clip_z_near_r
+        mov     z, fog          // [delay slot]
         sub     tmp, fog        // fog = z - FOG_MIN
         shll    fog             // FOG_SHIFT
         shlr8   fog             // shift down to 0..31 range
@@ -163,36 +160,36 @@ _transformRoom_asm:
         // vg = min(vg, 31)
         mov     #31, tmp
         cmp/gt  tmp, vg
-        bf      .clip_z_near
+        bf      .clip_z_near_r
         mov     #31, vg
 
         // z clipping
-.clip_z_near:
+.clip_z_near_r:
         add     #1, vg          // +1 for signed lightmap fetch
         mov     #VIEW_MIN, minZ // minZ = VIEW_MIN = 64
         cmp/gt  z, minZ
-        bf/s    .clip_z_far
+        bf/s    .clip_z_far_r
         shll8   vg              // [delay slot] clear lower 8-bits of vg for clipping flags
         mov     minZ, z
         add     #CLIP_NEAR, vg
-.clip_z_far:
+.clip_z_far_r:
         cmp/ge  maxZ, z
-        bf/s    .project
-        mov     z, dz
+        bf/s    .project_r
+        mov     z, dz           // [delay slot]
         mov     maxZ, z
         add     #CLIP_FAR, vg
 
-.project: // dz = divTable[z >> (PROJ_SHIFT = 4)]
+.project_r: // dz = divTable[z >> (PROJ_SHIFT = 4)]
         shlr2   dz
         shlr2   dz
         shll    dz
         mov.w   @(dz, divLUT), dz
 
-.proj_x: // x = x * dz >> 12
+        // x = x * dz >> 12
         muls.w  dz, x
         sts     MACL, x
 
-.proj_y: // y = y * dz >> 12
+        // y = y * dz >> 12
         muls.w  dz, y
         sts     MACL, y
 
@@ -200,29 +197,29 @@ _transformRoom_asm:
         shar12  y, tmp
 
         // portal rect clipping
-.clip_vp_minX:
+.clip_vp_minX_r:
         mov.w   @(0, vp), minX
         cmp/gt  x, minX
-        bf/s    .clip_vp_minY
-        mov.w   @(2, vp), minY
+        bf/s    .clip_vp_minY_r
+        mov.w   @(2, vp), minY  // [delay slot]
         add     #CLIP_LEFT, vg
-.clip_vp_minY:
+.clip_vp_minY_r:
         cmp/ge  y, minY
-        bf/s    .clip_vp_maxX
-        mov.w   @(4, vp), maxX
+        bf/s    .clip_vp_maxX_r
+        mov.w   @(4, vp), maxX  // [delay slot]
         add     #CLIP_TOP, vg
-.clip_vp_maxX:
+.clip_vp_maxX_r:
         cmp/gt  maxX, x
-        bf/s    .clip_vp_maxY
-        mov.w   @(6, vp), maxY
+        bf/s    .clip_vp_maxY_r
+        mov.w   @(6, vp), maxY  // [delay slot]
         add     #CLIP_RIGHT, vg
-.clip_vp_maxY:
+.clip_vp_maxY_r:
         cmp/ge  maxY, y
-        bf/s    .apply_offset
+        bf/s    .apply_offset_r
         mov     #80, tmp        // [delay slot] tmp = 80
         add     #CLIP_BOTTOM, vg
 
-.apply_offset:
+.apply_offset_r:
         // x += FRAME_WIDTH / 2 (160)
         add     #100, x         // x += 100
         add     #60, x          // x += 60
@@ -230,27 +227,27 @@ _transformRoom_asm:
         add     #112, y         // y += 112
 
         // frame rect clipping
-.clip_frame_x:  // 0 < x > FRAME_WIDTH
+        // x < 0 || x > FRAME_WIDTH (caught by one unsigned compare)
         shll2   tmp             // tmp = 80 * 4 = 320 = FRAME_WIDTH
         cmp/hi  tmp, x
-        bt/s    .clip_frame
+        bt/s    .clip_frame_r
         add     #-96, tmp       // [delay slot] tmp = 320 - 96 = 224 = FRAME_HEIGHT
-.clip_frame_y:  // 0 < y > FRAME_HEIGHT
+        // y < 0 || y > FRAME_HEIGHT (caught by one unsigned compare)
         cmp/hi  tmp, y
-.clip_frame:
+.clip_frame_r:
         movt    tmp
         or      tmp, vg         // vg |= CLIP_FRAME
 
-.store_vertex:
+        // store_vertex
         mov.w   vg, @-res
         mov.w   z, @-res
         mov.w   y, @-res
         mov.w   x, @-res
 
         dt      count
-        bf/s    .loop
-        add     #16, res
-.done:
+        bf/s    .loop_r
+        add     #16, res        // [delay slot]
+.done_r:
         // pop
         add     #SP_SIZE, sp
         mov.l   @sp+, r14
@@ -262,12 +259,28 @@ _transformRoom_asm:
         rts
         mov.l   @sp+, r8
 
-.align 2
-var_viewportRel:
-        .long   _viewportRel
-var_gVerticesBase:
-        .long   _gVerticesBase
-var_divTable:
-        .long   _divTable
-var_gMatrixPtr:
-        .long   _gMatrixPtr
+#undef tmp
+#undef maxZ
+#undef divLUT
+#undef res
+#undef vertices
+#undef count
+#undef stackVtx
+#undef stackMtx
+#undef vp
+#undef x
+#undef y
+#undef z
+#undef mx
+#undef my
+#undef mz
+#undef minX
+#undef minY
+#undef maxX
+#undef maxY
+#undef minZ
+#undef dz
+#undef vg
+#undef fog
+#undef cnt
+#undef SP_SIZE
\ No newline at end of file
diff --git a/src/platform/32x/main.cpp b/src/platform/32x/main.cpp
index a366ce8..47b3d2d 100644
--- a/src/platform/32x/main.cpp
+++ b/src/platform/32x/main.cpp
@@ -102,11 +102,18 @@ void pageFlip()
     MARS_VDP_FBCTL = pageIndex;
 }
 
+void pageClear()
+{
+    dmaFill((uint8*)&MARS_FRAMEBUFFER + 0x200, 0, FRAME_WIDTH * FRAME_HEIGHT); // fill value 0 over FRAME_WIDTH * FRAME_HEIGHT bytes; the 0x200 byte offset presumably skips the line table - TODO confirm
+}
+
 extern "C" void pri_vbi_handler()
 {
     gFrameIndex++;
 }
 
+extern void flush_ot(int32 bit);
+
 extern "C" void secondary()
 {
     // init DMA
@@ -130,7 +137,15 @@ extern "C" void secondary()
         int cmd;
         while ((cmd = MARS_SYS_COMM4) == 0);
 
-        // TODO
+        switch (cmd)
+        {            
+            case MARS_CMD_CLEAR:
+                pageClear();
+                break;
+            case MARS_CMD_FLUSH:
+                flush_ot(1);
+                break;
+        }
 
         MARS_SYS_COMM4 = 0;
     }
@@ -164,7 +179,7 @@ int main()
             }
         }
 
-        clear();
+        pageClear();
     }
 
     SH2_WDT_VCR = (65<<8) | (SH2_WDT_VCR & 0x00FF); // set exception vector for WDT
diff --git a/src/platform/32x/rasterizer.h b/src/platform/32x/rasterizer.h
index 9a5061a..ac93a66 100644
--- a/src/platform/32x/rasterizer.h
+++ b/src/platform/32x/rasterizer.h
@@ -15,24 +15,21 @@
 #define CACHE_OFF(ptr) ptr = &ptr[0x20000000 / sizeof(ptr[0])];
 
 extern uint8 gLightmap[256 * 32];
-extern const ColorIndex* gTile;
 
-    extern "C" {
-        void rasterize_dummy_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeS_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeF_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeFT_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeGT_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeFTA_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeGTA_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeLineH_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeLineV_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-        void rasterizeFillS_asm(uint16* pixel, const VertexLink* L, const VertexLink* R);
-    }
-
-    #define rasterize_dummy rasterize_dummy_asm
-//    #define rasterizeF rasterizeF_asm
+extern "C" {
+    void rasterize_dummy_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeS_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeF_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeFT_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeGT_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeFTA_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeGTA_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeLineH_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeLineV_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+    void rasterizeFillS_asm(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+}
 
+#define rasterize_dummy rasterize_dummy_asm
 #define rasterizeS rasterizeS_c
 #define rasterizeF rasterizeF_c
 #define rasterizeFT rasterizeFT_c
@@ -44,7 +41,7 @@ extern const ColorIndex* gTile;
 #define rasterizeLineV rasterizeLineV_c
 #define rasterizeFillS rasterizeFillS_c
 
-extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
     const uint8* ft_lightmap = &gLightmap[0x1A00];
 
@@ -143,10 +140,9 @@ extern "C" void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLin
     }
 }
 
-extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
-    uint32 color = (uint32)R;
-    color = gLightmap[(L->v.g << 8) | color];
+    uint32 color = gLightmap[(L->v.g << 8) | (uint32)R];
     color |= (color << 8);
 
     int32 Lh = 0;
@@ -251,7 +247,7 @@ extern "C" void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLin
     }
 }
 
-extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
     const uint8* ft_lightmap = &gLightmap[L->v.g << 8];
 
@@ -339,7 +335,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
 
                 if (intptr_t(ptr) & 1)
                 {
-                    *ptr++ = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
+                    *ptr++ = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
                     width--;
                 }
@@ -347,7 +343,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
                 if (width & 1)
                 {
                     uint32 tmp = Rt - dtdx;
-                    ptr[width - 1] = ft_lightmap[gTile[(tmp & 0xFF00) | (tmp >> 24)]];
+                    ptr[width - 1] = ft_lightmap[tile[(tmp & 0xFF00) | (tmp >> 24)]];
                 }
 
                 width >>= 1;
@@ -357,7 +353,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
 
                 while (width--)
                 {
-                    uint8 indexA = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
 
                     *(uint16*)ptr = indexA | (indexA << 8);
@@ -368,9 +364,9 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
                 width >>= 1;
                 while (width--)
                 {
-                    uint8 indexA = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
-                    uint8 indexB = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexB = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
                     
                 #ifdef CPU_BIG_ENDIAN
@@ -394,7 +390,7 @@ extern "C" void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLi
     }
 }
 
-extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
 #ifdef ALIGNED_LIGHTMAP
     ASSERT((intptr_t(gLightmap) & 0xFFFF) == 0); // lightmap should be 64k aligned
@@ -504,9 +500,9 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
                 {
                 #ifdef ALIGNED_LIGHTMAP
                     const uint8* LMAP = (uint8*)(g >> 8 << 8); 
-                    uint8 indexA = LMAP[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = LMAP[tile[(t & 0xFF00) | (t >> 24)]];
                 #else
-                    uint8 indexA = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = gLightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
                 #endif
                     *ptr++ = indexA;
                     t += dtdx;
@@ -519,9 +515,9 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
                     uint32 tmp = Rt - dtdx;
                 #ifdef ALIGNED_LIGHTMAP
                     const uint8* LMAP = (uint8*)(Rg >> 8 << 8); 
-                    uint8 indexA = LMAP[gTile[(tmp & 0xFF00) | (tmp >> 24)]];
+                    uint8 indexA = LMAP[tile[(tmp & 0xFF00) | (tmp >> 24)]];
                 #else
-                    uint8 indexA = gLightmap[(Rg >> 8 << 8) | gTile[(tmp & 0xFF00) | (tmp >> 24)]];
+                    uint8 indexA = gLightmap[(Rg >> 8 << 8) | tile[(tmp & 0xFF00) | (tmp >> 24)]];
                 #endif
                     ptr[width - 1] = indexA;
                 }
@@ -535,9 +531,9 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
                 {
                 #ifdef ALIGNED_LIGHTMAP
                     const uint8* LMAP = (uint8*)(g >> 8 << 8); 
-                    uint8 indexA = LMAP[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = LMAP[tile[(t & 0xFF00) | (t >> 24)]];
                 #else
-                    uint8 indexA = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = gLightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
                 #endif
                     *(uint16*)ptr = indexA | (indexA << 8);
                     ptr += 2;
@@ -550,15 +546,15 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
                 #ifdef ALIGNED_LIGHTMAP
                     const uint8* LMAP = (uint8*)(g >> 8 << 8); 
 
-                    uint8 indexA = LMAP[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = LMAP[tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
-                    uint8 indexB = LMAP[gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexB = LMAP[tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
                     g += dgdx;
                 #else
-                    uint8 indexA = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexA = gLightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
-                    uint8 indexB = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
+                    uint8 indexB = gLightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
                     t += dtdx;
                     g += dgdx;
                 #endif
@@ -586,177 +582,7 @@ extern "C" void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLi
     }
 }
 
-extern "C" void rasterizeFTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
-{
-    const uint8* ft_lightmap = &gLightmap[L->v.g << 8];
-
-    int32 Lh = 0, Rh = 0;
-    int32 Lx, Rx, Ldx = 0, Rdx = 0;
-    uint32 Lt, Rt, Ldt, Rdt;
-    Ldt = 0;
-    Rdt = 0;
-
-    while (1)
-    {
-        while (!Lh)
-        {
-            const VertexLink* N = L + L->prev;
-
-            if (N->v.y < L->v.y) return;
-
-            Lh = N->v.y - L->v.y;
-            Lx = L->v.x;
-            Lt = L->t.t;
-
-            if (Lh > 1)
-            {
-                int32 tmp = FixedInvU(Lh);
-                Ldx = tmp * (N->v.x - Lx);
-
-                uint32 duv = N->t.t - Lt;
-                uint32 du = tmp * int16(duv >> 16);
-                uint32 dv = tmp * int16(duv);
-                Ldt = (du & 0xFFFF0000) | (dv >> 16);
-            }
-
-            Lx <<= 16;
-            L = N;
-        }
-
-        while (!Rh) 
-        {
-            const VertexLink* N = R + R->next;
-
-            if (N->v.y < R->v.y) return;
-
-            Rh = N->v.y - R->v.y;
-            Rx = R->v.x;
-            Rt = R->t.t;
-
-            if (Rh > 1)
-            {
-                int32 tmp = FixedInvU(Rh);
-                Rdx = tmp * (N->v.x - Rx);
-
-                uint32 duv = N->t.t - Rt;
-                uint32 du = tmp * int16(duv >> 16);
-                uint32 dv = tmp * int16(duv);
-                Rdt = (du & 0xFFFF0000) | (dv >> 16);
-            }
-
-            Rx <<= 16;
-            R = N;
-        }
-
-        int32 h = X_MIN(Lh, Rh);
-        Lh -= h;
-        Rh -= h;
-
-        while (h--)
-        {
-            int32 x1 = Lx >> 16;
-            int32 x2 = Rx >> 16;
-
-            int32 width = x2 - x1;
-
-            if (width > 0)
-            {
-                uint32 tmp = FixedInvU(width);
-
-                uint32 duv = Rt - Lt;
-                uint32 du = tmp * int16(duv >> 16);
-                uint32 dv = tmp * int16(duv);
-                uint32 dtdx = (du & 0xFFFF0000) | (dv >> 16);
-
-                uint32 t = Lt;
-
-                volatile uint8* ptr = (uint8*)pixel + x1;
-
-                if (intptr_t(ptr) & 1)
-                {
-                    uint8 p = gTile[(t & 0xFF00) | (t >> 24)];
-                    if (p) {
-                        *ptr = ft_lightmap[p];
-                    }
-                    ptr++;
-                    t += dtdx;
-                    width--;
-                }
-
-                if (width & 1)
-                {
-                    uint32 tmp = Rt - dtdx;
-                    uint8 p = gTile[(tmp & 0xFF00) | (tmp >> 24)];
-                    if (p) {
-                        ptr[width - 1] = ft_lightmap[p];
-                    }
-                }
-
-                width >>= 1;
-
-            #ifdef TEX_2PX
-                dtdx <<= 1;
-
-                while (width--)
-                {
-                    uint8 indexA = gTile[(t & 0xFF00) | (t >> 24)];
-                    t += dtdx;
-
-                    if (indexA)
-                    {
-                        indexA = ft_lightmap[indexA];
-                        *(uint16*)ptr = indexA | (indexA << 8);
-                    }
-
-                    ptr += 2;
-                }
-            #else
-                while (width--)
-                {
-                    uint8 indexA = gTile[(t & 0xFF00) | (t >> 24)];
-                    t += dtdx;
-                    uint8 indexB = gTile[(t & 0xFF00) | (t >> 24)];
-                    t += dtdx;
-
-
-                    if (indexA && indexB)
-                    {
-                        indexA = ft_lightmap[indexA];
-                        indexB = ft_lightmap[indexB];
-
-                        #ifdef CPU_BIG_ENDIAN
-                            *(uint16*)ptr = indexB | (indexA << 8);
-                        #else
-                            *(uint16*)ptr = indexA | (indexB << 8);
-                        #endif
-
-                    }/* else if (indexA) {
-                        *(uint16*)ptr = (*(uint16*)ptr & 0xFF00) | ft_lightmap[indexA];
-                    } else if (indexB) {
-                        *(uint16*)ptr = (*(uint16*)ptr & 0x00FF) | (ft_lightmap[indexB] << 8);
-                    }*/
-
-                    ptr += 2;
-                }
-            #endif
-            }
-
-            pixel += VRAM_WIDTH;
-
-            Lx += Ldx;
-            Rx += Rdx;
-            Lt += Ldt;
-            Rt += Rdt;
-        }
-    }
-}
-
-extern "C" void rasterizeGTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
-{
-    rasterizeFTA(pixel, L, R);
-}
-
-extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
     R++;
     const uint8* ft_lightmap = &gLightmap[L->v.g << 8] + 128;
@@ -820,7 +646,7 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert
 
     for (int32 y = 0; y < h; y++)
     {
-        const ColorIndex* xtile = (ColorIndex*)gTile + (v & 0xFF00);
+        const ColorIndex* xtile = tile + (v & 0xFF00);
 
         volatile uint8* xptr = ptr;
 
@@ -859,7 +685,7 @@ extern "C" void rasterizeSprite_c(uint16* pixel, const VertexLink* L, const Vert
     }
 }
 
-extern "C" void rasterizeLineH_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeLineH_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
     R++;
     int32 x = L->v.x;
@@ -889,7 +715,7 @@ extern "C" void rasterizeLineH_c(uint16* pixel, const VertexLink* L, const Verte
     }
 }
 
-extern "C" void rasterizeLineV_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeLineV_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
     R++;
     int32 x = L->v.x;
@@ -905,7 +731,7 @@ extern "C" void rasterizeLineV_c(uint16* pixel, const VertexLink* L, const Verte
     }
 }
 
-extern "C" void rasterizeFillS_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
+extern "C" void rasterizeFillS_c(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile)
 {
     R++;
     int32 x = L->v.x;
diff --git a/src/platform/32x/render.cpp b/src/platform/32x/render.cpp
index d22c39c..186f4bd 100644
--- a/src/platform/32x/render.cpp
+++ b/src/platform/32x/render.cpp
@@ -23,17 +23,7 @@ struct ViewportRel {
     int32 maxXY;
 };
 
-#if defined(_WIN32)
-    uint16 fb[VRAM_WIDTH * FRAME_HEIGHT];
-#elif defined(__GBA__)
-    uint32 fb = MEM_VRAM;
-#elif defined(__TNS__)
-    uint16 fb[VRAM_WIDTH * FRAME_HEIGHT];
-#elif defined(__DOS__)
-    uint16 fb[VRAM_WIDTH * FRAME_HEIGHT];
-#elif defined(__32X__)
-    #define fb ((uint8*)&MARS_FRAMEBUFFER + 0x200)
-#endif
+#define fb ((uint8*)&MARS_FRAMEBUFFER + 0x200)
 
 enum FaceType {
     FACE_TYPE_SHADOW,
@@ -60,8 +50,6 @@ enum FaceType {
 
 extern Level level;
 
-const ColorIndex* gTile;
-
 ViewportRel viewportRel;
 Vertex* gVerticesBase;
 Face* gFacesBase;
@@ -88,6 +76,15 @@ const MeshQuad gShadowQuads[] = {
     { (FACE_TYPE_SHADOW << FACE_TYPE_SHIFT), {6, 3, 4, 5} }
 };
 
+
+// TODO: remove
+// Dummy function: it exists only to shift the alignment of the functions below.
+uint16 test(uint16 g0, uint16 g1, uint16 g2, uint16 g3)
+{
+    return X_MAX(g0, X_MAX(g1, X_MAX(g2, g3))); // nested X_MAX over all four arguments
+}
+
+
 void setViewport(const RectMinMax &vp)
 {
     viewport = vp;
@@ -118,9 +115,9 @@ X_INLINE Face* faceAdd(int32 depth)
 }
 
 extern "C" {
-    X_NOINLINE void drawPoly(uint32 flags, VertexLink* v);
-    X_NOINLINE void drawTriangle(uint32 flags, VertexLink* v);
-    X_NOINLINE void drawQuad(uint32 flags, VertexLink* v);
+    X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorIndex* tile);
+    X_NOINLINE void drawTriangle(uint32 flags, VertexLink* v, const ColorIndex* tile);
+    X_NOINLINE void drawQuad(uint32 flags, VertexLink* v, const ColorIndex* tile);
 }
 
 extern "C" {
@@ -131,12 +128,12 @@ extern "C" {
     void faceAddRoomTriangles_asm(const RoomTriangle* polys, int32 count);
     void faceAddMeshQuads_asm(const MeshQuad* polys, int32 count);
     void faceAddMeshTriangles_asm(const MeshTriangle* polys, int32 count);
-    void rasterize_asm(uint32 flags, VertexLink* top);
+    void rasterize_asm(uint32 flags, VertexLink* top, const ColorIndex* tile);
 }
 
-#ifdef USE_ASM
+#if 1 //USE_ASM
     #define transformRoom           transformRoom_asm
-    #define transformRoomUW         transformRoomUW_asm
+    #define transformRoomUW         transformRoom_asm
     #define transformMesh           transformMesh_asm
     #define faceAddRoomQuads        faceAddRoomQuads_asm
     #define faceAddRoomTriangles    faceAddRoomTriangles_asm
@@ -366,15 +363,15 @@ void transformMesh_c(const MeshVertex* vertices, int32 count, int32 intensity)
 
 void faceAddRoomQuads_c(const RoomQuad* polys, int32 count)
 {
-    const Vertex* v = gVerticesBase;
+    const uint8* v = (uint8*)gVerticesBase;
 
     for (int32 i = 0; i < count; i++, polys++)
     {
         uint32 flags = polys->flags;
-        const Vertex* v0 = v + polys->indices[0];
-        const Vertex* v1 = v + polys->indices[1];
-        const Vertex* v2 = v + polys->indices[2];
-        const Vertex* v3 = v + polys->indices[3];
+        const Vertex* v0 = (Vertex*)(v + (polys->indices[0] << 2));
+        const Vertex* v1 = (Vertex*)(v + (polys->indices[1] << 2));
+        const Vertex* v2 = (Vertex*)(v + (polys->indices[2] << 2));
+        const Vertex* v3 = (Vertex*)(v + (polys->indices[3] << 2));
 
         uint32 c0 = v0->clip;
         uint32 c1 = v1->clip;
@@ -413,14 +410,14 @@ void faceAddRoomQuads_c(const RoomQuad* polys, int32 count)
 
 void faceAddRoomTriangles_c(const RoomTriangle* polys, int32 count)
 {
-    const Vertex* v = gVerticesBase;
+    const uint8* v = (uint8*)gVerticesBase;
 
     for (int32 i = 0; i < count; i++, polys++)
     {
         uint32 flags = polys->flags;
-        const Vertex* v0 = v + polys->indices[0];
-        const Vertex* v1 = v + polys->indices[1];
-        const Vertex* v2 = v + polys->indices[2];
+        const Vertex* v0 = (Vertex*)(v + (polys->indices[0] << 2));
+        const Vertex* v1 = (Vertex*)(v + (polys->indices[1] << 2));
+        const Vertex* v2 = (Vertex*)(v + (polys->indices[2] << 2));
 
         uint32 c0 = v0->clip;
         uint32 c1 = v1->clip;
@@ -440,11 +437,12 @@ void faceAddRoomTriangles_c(const RoomTriangle* polys, int32 count)
         if (g0 != g1 || g0 != g2) {
             flags += FACE_GOURAUD;
         }
-        flags |= FACE_TRIANGLE;
 
         if (checkBackface(v0, v1, v2))
             continue;
 
+        flags |= FACE_TRIANGLE;
+
         int32 depth = X_MAX(v0->z, X_MAX(v1->z, v2->z)) >> OT_SHIFT;
 
         Face* f = faceAdd(depth);
@@ -529,6 +527,33 @@ void faceAddMeshTriangles_c(const MeshTriangle* polys, int32 count)
     }
 }
 
+typedef void (*RasterProc)(uint16* pixel, const VertexLink* L, const VertexLink* R, const ColorIndex* tile);
+
+extern "C" const RasterProc gRasterProc[FACE_TYPE_MAX] = {
+    rasterizeS,
+    rasterizeF,
+    rasterizeFT,
+    rasterizeFT,
+    rasterizeGT,
+    rasterizeGT,
+    rasterizeSprite,
+    rasterizeFillS,
+    rasterizeLineH,
+    rasterizeLineV
+};
+
+X_NOINLINE void rasterize_c(uint32 flags, VertexLink* top, const ColorIndex* tile)
+{
+    uint8* pixel = (uint8*)fb + top->v.y * FRAME_WIDTH; // byte offset of row top->v.y from fb
+
+    uint32 type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK; // face type, used as the gRasterProc index
+
+    VertexLink* R = (type == FACE_TYPE_F) ? (VertexLink*)(flags & 0xFF) : top; // flat faces smuggle the low 8 flag bits through R; rasterizeF_c reads it as a lightmap column, not a pointer
+
+    gRasterProc[type]((uint16*)pixel, top, R, tile);
+}
+#endif
+
 int32 sphereIsVisible_c(int32 sx, int32 sy, int32 sz, int32 r)
 {
     Matrix &m = matrixGet();
@@ -571,30 +596,93 @@ int32 sphereIsVisible_c(int32 sx, int32 sy, int32 sz, int32 r)
     return 1;
 }
 
-typedef void (*RasterProc)(uint16* pixel, const VertexLink* L, const VertexLink* R);
-
-extern "C" const RasterProc gRasterProc[FACE_TYPE_MAX] = { // IWRAM
-    rasterizeS,
-    rasterizeF,
-    rasterizeFT,
-    rasterizeFTA,
-    rasterizeGT,
-    rasterizeGTA,
-    rasterizeSprite,
-    rasterizeFillS,
-    rasterizeLineH,
-    rasterizeLineV
-};
-
-X_NOINLINE void rasterize_c(uint32 flags, VertexLink* top)
+void flush_ot(int32 bit)
 {
-    uint8* pixel = (uint8*)fb + top->v.y * FRAME_WIDTH;
+    int32 index = 0;
+    const ColorIndex* tile = NULL;
 
-    uint32 type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK;
+    for (int32 i = OT_SIZE - 1; i >= 0; i--)
+    {
+        if (!gOT[i]) continue;
 
-    VertexLink* R = (type == FACE_TYPE_F) ? (VertexLink*)(flags & 0xFF) : top;
+        Face *face = gOT[i];
 
-    gRasterProc[type]((uint16*)pixel, top, R);
+        do {
+            index++;
+
+            if ((index & 1) != bit) {
+                face = face->next;
+                continue;
+            }
+
+            uint32 flags = face->flags;
+
+            VertexLink v[16];
+
+            uint32 type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK;
+
+            if (type <= FACE_TYPE_GTA)
+            {
+                if (type > FACE_TYPE_F)
+                {
+                    const Texture &tex = level.textures[flags & FACE_TEXTURE];
+                    tile = (ColorIndex*)tex.tile;
+
+                    v[0].t.t = 0xFF00FF00 & (tex.uv01);
+                    v[1].t.t = 0xFF00FF00 & (tex.uv01 << 8);
+                    v[2].t.t = 0xFF00FF00 & (tex.uv23);
+                    v[3].t.t = 0xFF00FF00 & (tex.uv23 << 8);
+                }
+
+                v[0].v = gVertices[face->indices[0]];
+                v[1].v = gVertices[face->indices[1]];
+                v[2].v = gVertices[face->indices[2]];
+                if (!(flags & FACE_TRIANGLE)) {
+                    v[3].v = gVertices[face->indices[3]];
+                }
+
+                if (flags & FACE_CLIPPED) {
+                    drawPoly(flags, v, tile);
+                } else {
+                    if (flags & FACE_TRIANGLE) {
+                        drawTriangle(flags, v, tile);
+                    } else {
+                        drawQuad(flags, v, tile);
+                    }
+                }
+            }
+            else
+            {
+                const Vertex *vert = gVertices + face->indices[0];
+                v[0].v = vert[0];
+                v[1].v = vert[1];
+
+                if (type == FACE_TYPE_SPRITE)
+                {
+                    const Sprite &sprite = level.sprites[flags & FACE_TEXTURE];
+                    tile = (ColorIndex*)sprite.tile;
+                    v[0].t.t = (sprite.uwvh) & (0xFF00FF00);
+                    v[1].t.t = (sprite.uwvh) & (0xFF00FF00 >> 8);
+                }
+
+                rasterize(flags, v, tile);
+            }
+
+            face = face->next;
+
+        } while (face);
+#if 1
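+        // sync: publish the OT bucket index this CPU has finished, then spin until the other CPU has reached it too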
+        if (bit) {
+            MARS_SYS_COMM6 = i;
+            while (MARS_SYS_COMM2 > i);
+        } else {
+            MARS_SYS_COMM2 = i;
+            while (MARS_SYS_COMM6 > i);
+        }
+#endif
+    }
+    CacheClear();
 }
 
 void flush_c()
@@ -612,131 +700,39 @@ void flush_c()
         return;
 
     gFacesBase = gFaces;
-/*
+
+//#define ON_CHIP_RENDER
+
+#ifdef ON_CHIP_RENDER
     CacheControl(0);
     CacheControl(SH2_CCTL_CP | SH2_CCTL_CE | SH2_CCTL_TW);
 
-    extern int32 rasterizeGT_asm_start;
-    extern int32 rasterizeGT_asm_end;
-
-    int32 size = intptr_t(&rasterizeGT_asm_end) - intptr_t(&rasterizeGT_asm_start);
-    fast_memcpy((void*)(0xC0000000 + 0), &rasterizeGT_asm_start, size >> 2); // 516
-
-    extern int32 rasterizeFT_asm_start;
-    extern int32 rasterizeFT_asm_end;
-
-    size = intptr_t(&rasterizeFT_asm_end) - intptr_t(&rasterizeFT_asm_start);
-    fast_memcpy((void*)(0xC0000000 + 516), &rasterizeFT_asm_start, size >> 2); // 416
-
-    extern int32 rasterizeF_asm_start;
-    extern int32 rasterizeF_asm_end;
-
-    size = intptr_t(&rasterizeF_asm_end) - intptr_t(&rasterizeF_asm_start);
-    fast_memcpy((void*)(0xC0000000 + 516 + 416), &rasterizeF_asm_start, size >> 2); // 256
-
-    extern int32 rasterizeS_asm_start;
-    extern int32 rasterizeS_asm_end;
-
-    size = intptr_t(&rasterizeS_asm_end) - intptr_t(&rasterizeS_asm_start);
-    fast_memcpy((void*)(0xC0000000 + 516 + 416 + 256), &rasterizeS_asm_start, size >> 2); // 224
-
-    //extern int32 fps;
-    //fps = size;
-*/
+    extern int32 block_render_start;
+    extern int32 block_render_end;
 
+    int32 size = intptr_t(&block_render_end) - intptr_t(&block_render_start);
+    fast_memcpy((void*)0xC0000000, &block_render_start, size >> 2);
+#endif
     PROFILE(CNT_FLUSH);
 
-    for (int32 i = OT_SIZE - 1; i >= 0; i--)
-    {
-        if (!gOT[i]) continue;
+    MARS_WAIT();
+    CacheClear();    
 
-        Face *face = gOT[i];
-        gOT[i] = NULL;
+    MARS_SYS_COMM2 = OT_SIZE;
+    MARS_SYS_COMM6 = OT_SIZE;
+    MARS_SYS_COMM4 = MARS_CMD_FLUSH;
 
-        do {
-            uint32 flags = face->flags;
+    flush_ot(0);
 
-            VertexLink v[16];
+    MARS_WAIT();
 
-            uint32 type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK;
+    dmaFill(gOT, 0, OT_SIZE * sizeof(gOT[0]));
 
-            if (type <= FACE_TYPE_GTA)
-            {
-                if (type > FACE_TYPE_F)
-                {
-                    const Texture &tex = level.textures[flags & FACE_TEXTURE];
-                    gTile = (ColorIndex*)tex.tile;
-
-                    v[0].t.t = 0xFF00FF00 & (tex.uv01);
-                    v[1].t.t = 0xFF00FF00 & (tex.uv01 << 8);
-                    v[2].t.t = 0xFF00FF00 & (tex.uv23);
-                    v[3].t.t = 0xFF00FF00 & (tex.uv23 << 8);
-                }
-
-                v[0].v = gVertices[face->indices[0]];
-                v[1].v = gVertices[face->indices[1]];
-                v[2].v = gVertices[face->indices[2]];
-                if (!(flags & FACE_TRIANGLE)) {
-                    v[3].v = gVertices[face->indices[3]];
-                }
-
-                if (flags & FACE_CLIPPED) {
-                    drawPoly(flags, v);
-                } else {
-                    if (flags & FACE_TRIANGLE) {
-                        drawTriangle(flags, v);
-                    } else {
-                        drawQuad(flags, v);
-                    }
-                }
-            }
-            else
-            {
-                const Vertex *vert = gVertices + face->indices[0];
-                v[0].v = vert[0];
-                v[1].v = vert[1];
-
-                if (type == FACE_TYPE_SPRITE)
-                {
-                    const Sprite &sprite = level.sprites[flags & FACE_TEXTURE];
-                    gTile = (ColorIndex*)sprite.tile;
-                    v[0].t.t = (sprite.uwvh) & (0xFF00FF00);
-                    v[1].t.t = (sprite.uwvh) & (0xFF00FF00 >> 8);
-                }
-
-                rasterize(flags, v);
-            }
-
-            face = face->next;
-
-        } while (face);
-    }
-/*
+#ifdef ON_CHIP_RENDER
     CacheControl(0);
     CacheControl(SH2_CCTL_CP | SH2_CCTL_CE);
-*/
+#endif
 }
-#endif
-
-#if defined(__32X__)
-    #undef transformRoom
-    //#undef transformRoomUW
-    #undef transformMesh
-    //#undef faceAddRoomQuads
-    //#undef faceAddRoomTriangles
-    //#undef faceAddMeshQuads
-    //#undef faceAddMeshTriangles
-    #undef rasterize
-
-    #define transformRoom           transformRoom_asm
-    //#define transformRoomUW         transformRoomUW_asm
-    #define transformMesh           transformMesh_asm
-    //#define faceAddRoomQuads        faceAddRoomQuads_asm
-    //#define faceAddRoomTriangles    faceAddRoomTriangles_asm
-    //#define faceAddMeshQuads        faceAddMeshQuads_asm
-    //#define faceAddMeshTriangles    faceAddMeshTriangles_asm
-    #define rasterize               rasterize_asm
-#endif
 
 VertexLink* clipPoly(VertexLink* poly, VertexLink* tmp, int32 &pCount)
 {
@@ -817,7 +813,7 @@ void renderLevelFree()
 {
 }
 
-extern "C" X_NOINLINE void drawTriangle(uint32 flags, VertexLink* v)
+extern "C" X_NOINLINE void drawTriangle(uint32 flags, VertexLink* v, const ColorIndex* tile)
 {
     VertexLink* v0 = v + 0;
     VertexLink* v1 = v + 1;
@@ -846,10 +842,10 @@ extern "C" X_NOINLINE void drawTriangle(uint32 flags, VertexLink* v)
         }
     }
 
-    rasterize(flags, top);
+    rasterize(flags, top, tile);
 }
 
-extern "C" X_NOINLINE void drawQuad(uint32 flags, VertexLink* v)
+extern "C" X_NOINLINE void drawQuad(uint32 flags, VertexLink* v, const ColorIndex* tile)
 {
     VertexLink* v0 = v + 0;
     VertexLink* v1 = v + 1;
@@ -881,10 +877,10 @@ extern "C" X_NOINLINE void drawQuad(uint32 flags, VertexLink* v)
         }
     }
 
-    rasterize(flags, top);
+    rasterize(flags, top, tile);
 }
 
-extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v)
+extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v, const ColorIndex* tile)
 {
     VertexLink tmp[16];
 
@@ -902,7 +898,7 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v)
                 v[0].v.y == v[2].v.y)
                 return;
 
-            drawTriangle(flags, v);
+            drawTriangle(flags, v, tile);
         } else {
 
             if (v[0].v.y == v[1].v.y &&
@@ -910,7 +906,7 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v)
                 v[0].v.y == v[3].v.y)
                 return;
 
-            drawQuad(flags, v);
+            drawQuad(flags, v, tile);
         }
         return;
     }
@@ -954,7 +950,7 @@ extern "C" X_NOINLINE void drawPoly(uint32 flags, VertexLink* v)
         return; // zero height poly
     }
 
-    rasterize(flags, top);
+    rasterize(flags, top, tile);
 }
 
 void faceAddRoom(const Room* room)
@@ -981,18 +977,7 @@ void faceAddMesh(const MeshQuad* quads, const MeshTriangle* triangles, int32 qCo
 
 void clear()
 {
-#if 1
-    MARS_VDP_FILLEN = 0xFF;
-    MARS_VDP_FILADR = 0x100; // skip line table
-    for(int32 i = 0; i < (FRAME_WIDTH * FRAME_HEIGHT) >> 9; i++)
-    {
-        MARS_VDP_FILDAT = 0x0000;
-        while (MARS_VDP_FBCTL & MARS_VDP_FEN);
-        MARS_VDP_FILADR += 0x100;
-    }
-#else
-    dmaFill((void*)fb, 0, FRAME_WIDTH * FRAME_HEIGHT);
-#endif
+    MARS_SYS_COMM4 = MARS_CMD_CLEAR; // post a clear request through COMM4 instead of filling the framebuffer inline; returns without waiting. NOTE(review): the consumer of this command is not visible here - confirm the clear finishes before drawing starts
 }
 
 void renderRoom(const Room* room)