mirror of https://github.com/XProger/OpenLara.git synced 2025-08-12 08:04:09 +02:00

#368 micro optimizations of rasterizer, remove per-pixel texturing (unused)

XProger committed 2022-02-12 06:14:47 +03:00
parent 401c854209
commit b6df8a2348

16 changed files with 221 additions and 348 deletions
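
The rasterizers below previously carried two inner-loop variants behind the TEX_2PX define: an accurate per-pixel path that sampled the texture for every pixel, and a "lazy" path that samples once per 2-pixel pair. Since TEX_2PX was always enabled, this commit deletes the per-pixel variant outright. A simplified C model of the two loops (illustrative only, not the project's actual code; tile is a 256x256 texture tile, lmap a lightmap row, t a packed 8.8 u:v coordinate):

#include <stdint.h>

// Per-pixel (removed): two texture + lightmap lookups per halfword written.
void span_per_pixel(const uint8_t* tile, const uint8_t* lmap, uint16_t* dst,
                    uint32_t t, uint32_t dtdx, int32_t count)
{
    for (int32_t i = 0; i < count; i += 2) {
        uint8_t a = lmap[tile[(t & 0xFF00) | (t >> 24)]]; t += dtdx;
        uint8_t b = lmap[tile[(t & 0xFF00) | (t >> 24)]]; t += dtdx;
        dst[i >> 1] = (uint16_t)(a | (b << 8)); // two distinct texels per pair
    }
}

// Lazy 2px (kept): one lookup per pair; on GBA a byte store to VRAM is
// replicated into both bytes of the halfword, so both pixels get the texel.
void span_lazy_2px(const uint8_t* tile, const uint8_t* lmap,
                   volatile uint8_t* dst, uint32_t t, uint32_t dtdx, int32_t count)
{
    for (int32_t i = 0; i < count; i += 2) {
        dst[i] = lmap[tile[(t & 0xFF00) | (t >> 24)]];
        t += dtdx << 1; // advance two pixels at once
    }
}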

View File

@@ -1,8 +1,6 @@
 .section .iwram
 .arm

-#define TEX_2PX // lazy texturing, comment out for per-pixel
-
 #define FRAME_WIDTH 240
 #define FRAME_HEIGHT 160
@@ -45,9 +43,11 @@
 .equ CLIP_MASK_VP, (CLIP_LEFT + CLIP_RIGHT + CLIP_TOP + CLIP_BOTTOM)
 .equ CLIP_MASK, (CLIP_MASK_VP + CLIP_FAR + CLIP_NEAR)

-.equ FACE_TEXTURE_BITS, 13
-.equ FACE_TRIANGLE, (1 << FACE_TEXTURE_BITS)
+.equ FACE_TEXTURE_BITS, 14
 .equ FACE_TEXTURE, ((1 << FACE_TEXTURE_BITS) - 1)
+.equ FACE_GOURAUD, (2 << FACE_TYPE_SHIFT)
+.equ FACE_CLIPPED, (1 << 18)
+.equ FACE_TRIANGLE, (1 << 19)

 .equ FACE_FLAGS, 0
 .equ FACE_NEXT, 4
@@ -67,9 +67,6 @@
 .equ FACE_TYPE_LINE_H, (8 << FACE_TYPE_SHIFT)
 .equ FACE_TYPE_LINE_V, (9 << FACE_TYPE_SHIFT)

-.equ FACE_GOURAUD, (2 << FACE_TYPE_SHIFT)
-.equ FACE_CLIPPED, (1 << 18)
-
 .equ FIXED_SHIFT, 14
 .equ PROJ_SHIFT, 4
 .equ OT_SHIFT, 4
@@ -90,6 +87,7 @@
 .equ MAX_CAUSTICS, 32
 .equ MAX_RAND_TABLE, 32
+.equ MAX_ANIM_TEX, 128

 .equ MIN_INT32, 0x80000000
 .equ MAX_INT32, 0x7FFFFFFF
@@ -110,6 +108,17 @@
     ble \skip
 .endm

+.macro scaleUV uv, u, v, f
+    asr \u, \uv, #16
+    mul \u, \f               // u = f * int16(uv >> 16)
+    lsl \v, \uv, #16
+    asr \v, #16
+    mul \v, \f               // v = f * int16(uv)
+    lsr \u, #16
+    lsl \u, #16
+    orr \uv, \u, \v, lsr #16 // uv = (u & 0xFFFF0000) | (v >> 16)
+.endm
+
 .macro tex index, uv
     and \index, \uv, #0xFF00
     orr \index, \uv, lsr #24 // index = v * 256 + u
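
The new scaleUV macro deduplicates an 8-instruction sequence that previously appeared verbatim in every textured rasterizer: it scales two signed 16-bit deltas packed into one word (u in the high half, v in the low half) by a factor and repacks the result. A C model of exactly what the macro computes (a sketch; the asm works in registers, of course):

#include <stdint.h>

static uint32_t scale_uv(uint32_t uv, int32_t f)
{
    int32_t u = (int32_t)uv >> 16;          // signed high half
    int32_t v = (int32_t)(uv << 16) >> 16;  // signed low half
    u *= f;                                 // 32-bit products, as in mul
    v *= f;
    return ((uint32_t)u & 0xFFFF0000u) | ((uint32_t)v >> 16);
}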

View File

@@ -89,6 +89,8 @@ faceAddMeshTriangles_asm:
     orr vp1, vp0, vp1, lsl #(16 - 3)
     mov vp2, vp2, lsr #3

+    orr flags, #FACE_TRIANGLE
+
     ldr ot, =gOT
     ldr next, [ot, depth, lsl #2]
     str face, [ot, depth, lsl #2]

View File

@@ -96,6 +96,8 @@ faceAddRoomTriangles_asm:
     orr vp1, vp0, vp1, lsl #(16 - 3)
     mov vp2, vp2, lsr #3

+    orr flags, #FACE_TRIANGLE
+
     ldr ot, =gOT
     ldr next, [ot, depth, lsl #2]
     str face, [ot, depth, lsl #2]

View File

@@ -25,6 +25,7 @@ verticesBase .req vZG
 facesBase .req vZG
 vertex .req vZG
 texture .req tmp
+texAnim .req vXY
 texIndex .req tmp
 texTile .req tmp
 sprite .req tmp
@@ -121,8 +122,12 @@ flush_asm:
 .set_texture:
     mov texIndex, flags, lsl #(32 - FACE_TEXTURE_BITS)
+    //cmp texIndex, #(MAX_ANIM_TEX << (32 - FACE_TEXTURE_BITS)) // TODO split to animated and static textures arrays
     add texIndex, texIndex, texIndex, lsl #1
     add texture, TEXTURES, texIndex, lsr #(32 - FACE_TEXTURE_BITS - 2)
+    //addge texture, TEXTURES, texIndex, lsr #(32 - FACE_TEXTURE_BITS - 2)
+    //ldrlt texAnim, =gAnimTextures
+    //addlt texture, texAnim, texIndex, lsr #(32 - FACE_TEXTURE_BITS - 2)
     ldmia texture, {texTile, uv01, uv23}
     str texTile, [TILE]

View File

@@ -1,11 +1,11 @@
 #include "common_asm.inc"

 flags .req r0
-top .req r1
-y .req r2
-width .req r3
-pixel .req flags
+L .req r1
+R .req r2
+y .req r3
 type .req r12
+pixel .req flags

 .extern rasterizeS_asm
 .extern rasterizeF_asm
@@ -22,22 +22,19 @@ type .req r12
 .global rasterize_asm
 rasterize_asm:
     and type, flags, #FACE_TYPE_MASK

     cmp type, #FACE_TYPE_F
-    streqb flags, [top, #VERTEX_CLIP]
+    andeq R, flags, #0xFF // R = face color for FACE_TYPE_F
+    movne R, L            // R = L otherwise

     ldr pixel, =fb
     ldr pixel, [pixel]

-    ldrsh y, [top, #VERTEX_Y]
+    ldrsh y, [L, #VERTEX_Y]

-#if (FRAME_WIDTH == 240)
-    // pixel += (y * 16 - y) * 16
+    // pixel += y * 240 -> (y * 16 - y) * 16
     rsb y, y, y, lsl #4
     add pixel, pixel, y, lsl #4
-#else
-    mov width, #FRAME_WIDTH
-    mla pixel, y, width, pixel
-#endif
-
-    mov r2, top

     add pc, type, lsr #(FACE_TYPE_SHIFT - 2)
     nop
     b rasterizeS_asm
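
Instead of writing the face color into the vertex (the old streqb to VERTEX_CLIP), the dispatcher now materializes the rasterizer's third argument itself: for FACE_TYPE_F faces the low byte of flags is passed in place of the right-edge vertex pointer. A C model of the convention (a sketch; it mirrors the rasterize_c change at the bottom of this commit, and the FACE_TYPE_F value is assumed here):

#include <stdint.h>

typedef struct VertexLink VertexLink; // opaque in this sketch
typedef void (*RasterProc)(uint16_t* pixel, const VertexLink* L, const VertexLink* R);

enum { FACE_TYPE_SHIFT = 14, FACE_TYPE_MASK = 15, FACE_TYPE_F = 0 /* assumed */ };

void dispatch(const RasterProc* procs, uint32_t flags, VertexLink* top, uint16_t* pixel)
{
    uint32_t type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK;
    // for flat faces the "R" pointer is never dereferenced; it carries the color
    const VertexLink* R = (type == FACE_TYPE_F)
        ? (const VertexLink*)(uintptr_t)(flags & 0xFF)
        : top;
    procs[type](pixel, top, R);
}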

View File

@@ -2,7 +2,7 @@
 pixel .req r0
 L .req r1
-R .req r2
+color .req r2
 index .req r3
 Lh .req r4
 Rh .req r5
@@ -15,6 +15,7 @@ tmp .req r11
 DIVLUT .req r12
 width .req lr

+R .req color
 h .req N
 Rxy .req tmp
 Ry2 .req Rh
@@ -22,19 +23,16 @@ Lxy .req tmp
 Ly2 .req Lh
 LMAP .req Lx

 pair .req DIVLUT
+blocks .req DIVLUT

 .global rasterizeF_asm
 rasterizeF_asm:
     stmfd sp!, {r4-r11, lr}

-    mov LMAP, #LMAP_ADDR
-    // TODO use ldrh, swap g and clip
+    add LMAP, color, #LMAP_ADDR
     ldrb tmp, [L, #VERTEX_G]
-    ldrb index, [L, #VERTEX_CLIP]
-    orr tmp, index, tmp, lsl #8 // tmp = index | (L->v.g << 8)
-    ldrb index, [LMAP, tmp] // tmp = lightmap[tmp]
+    ldrb index, [LMAP, tmp, lsl #8] // index = lightmap[color + L->v.g * 256]
+
+    mov R, L

     mov Lh, #0 // Lh = 0
     mov Rh, #0 // Rh = 0

View File

@@ -17,6 +17,8 @@ Lt .req r11
 Rt .req r12
 h .req lr

+ptr .req tmp
+
 Ldx .req h
 Rdx .req h
@@ -41,14 +43,14 @@ duv .req R
 du .req L
 dv .req R

-Lduv .req h
 Ldu .req N
 Ldv .req h

-Rduv .req h
 Rdu .req N
 Rdv .req h

+Rti .req indexB
+
 sLdx .req tmp
 sLdt .req N
 sRdx .req Lh
@@ -58,32 +60,25 @@ SP_LDX = 0
 SP_LDT = 4
 SP_RDX = 8
 SP_RDT = 12
+SP_L = 16
+SP_R = 20
+SP_LH = 24
+SP_RH = 28
+SP_SIZE = 32

 .macro PUT_PIXELS
     tex indexA, t
     lit indexA
-#ifndef TEX_2PX
-    add t, dtdx
-    tex indexB, t
-    lit indexB
-    add t, dtdx
-    orr indexA, indexB, lsl #8
-    strh indexA, [tmp], #2
-#else
     add t, dtdx, lsl #1
     //orr indexA, indexA, lsl #8
-    strb indexA, [tmp], #2 // writing a byte to GBA VRAM will write a half word for free
-#endif
+    strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
 .endm

 .global rasterizeFT_asm
 rasterizeFT_asm:
     stmfd sp!, {r4-r11, lr}
-    sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
+    sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt]

     mov LMAP, #LMAP_ADDR

     ldrb tmp, [L, #VERTEX_G]
@@ -125,16 +120,9 @@ rasterizeFT_asm:
     mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX] // store Ldx to stack

-    ldr Lduv, [L, #VERTEX_T]
-    sub Lduv, Lt // Lduv = N->v.t - Lt
-    asr Ldu, Lduv, #16
-    mul Ldu, tmp // Rdu = tmp * int16(Lduv >> 16)
-    lsl Ldv, Lduv, #16
-    asr Ldv, #16
-    mul Ldv, tmp // Rdv = tmp * int16(Lduv)
-    lsr Ldu, #16
-    lsl Ldu, #16
-    orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Ldt, [L, #VERTEX_T]
+    sub Ldt, Lt // Ldt = N->v.t - Lt
+    scaleUV Ldt, Ldu, Ldv, tmp
     str Ldt, [sp, #SP_LDT] // store Ldt to stack

 .calc_left_end:
@@ -166,16 +154,9 @@ rasterizeFT_asm:
     mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX] // store Rdx to stack

-    ldr Rduv, [R, #VERTEX_T]
-    sub Rduv, Rt // Rduv = N->v.t - Rt
-    asr Rdu, Rduv, #16
-    mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
-    lsl Rdv, Rduv, #16
-    asr Rdv, #16
-    mul Rdv, tmp // Rdv = tmp * int16(Rduv)
-    lsr Rdu, #16
-    lsl Rdu, #16
-    orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Rdt, [R, #VERTEX_T]
+    sub Rdt, Rt // Rdt = N->v.t - Rt
+    scaleUV Rdt, Rdu, Rdv, tmp
     str Rdt, [sp, #SP_RDT] // store Rdt to stack

 .calc_right_end:
@@ -185,44 +166,36 @@ rasterizeFT_asm:
     sub Lh, h // Lh -= h
     sub Rh, h // Rh -= h

-    stmfd sp!, {L,R,Lh,Rh} // sp-16
+    add tmp, sp, #SP_L
+    stmia tmp, {L, R, Lh, Rh}

 .scanline_start:
     asr tmp, Lx, #16 // x1 = (Lx >> 16)
     rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
     ble .scanline_end // if (width <= 0) go next scanline

-    add tmp, pixel, tmp // tmp = pixel + x1
+    add ptr, pixel, tmp // ptr = pixel + x1

     mov DIVLUTi, #DIVLUT_ADDR
     lsl inv, width, #1
     ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)

-    sub duv, Rt, Lt // duv = Rt - Lt
-    asr du, duv, #16
-    mul du, inv // du = inv * int16(duv >> 16)
-    lsl dv, duv, #16
-    asr dv, #16
-    mul dv, inv // dv = inv * int16(duv)
-    lsr du, #16
-    lsl du, #16
-    orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
+    sub dtdx, Rt, Lt // duv = Rt - Lt
+    scaleUV dtdx, du, dv, inv

     mov t, Lt // t = Lt

     // 2 bytes alignment (VRAM write requirement)
 .align_left:
-    tst tmp, #1 // if (tmp & 1)
+    tst ptr, #1 // if (ptr & 1)
     beq .align_right

-    ldrb indexB, [tmp, #-1]! // read pal index from VRAM (byte)
-    and indexA, t, #0xFF00
-    orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    ldrb indexA, [TILE, indexA]
-    ldrb indexA, [LMAP, indexA]
+    tex indexA, t
+    lit indexA
+
+    ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
     orr indexB, indexA, lsl #8
-    strh indexB, [tmp], #2
+    strh indexB, [ptr], #2

     add t, dtdx
     subs width, #1 // width--
@@ -231,19 +204,15 @@ rasterizeFT_asm:
 .align_right:
     tst width, #1
     beq .align_block_4px

-    ldrb indexB, [tmp, width]
+    sub Rti, Rt, dtdx
+    tex indexA, Rti
+    lit indexA
+
+    ldrb indexB, [ptr, width]
     subs width, #1 // width--
-    sub Rt, dtdx
-    and indexA, Rt, #0xFF00
-    orr indexA, Rt, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    add Rt, dtdx
-    ldrb indexA, [TILE, indexA]
-    ldrb indexA, [LMAP, indexA]
     orr indexB, indexA, indexB, lsl #8
-    strh indexB, [tmp, width]
+    strh indexB, [ptr, width]

     beq .scanline_end // if (width == 0)
@@ -276,8 +245,7 @@ rasterizeFT_asm:
     bne .scanline_block_8px

 .scanline_end:
-    add tmp, sp, #16
-    ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
+    ldmia sp, {sLdx, sLdt, sRdx, sRdt}
     add Lx, sLdx
     add Lt, sLdt
     add Rx, sRdx
@@ -288,9 +256,10 @@ rasterizeFT_asm:
     subs h, #1
     bne .scanline_start

-    ldmfd sp!, {L,R,Lh,Rh} // sp+16
+    add tmp, sp, #SP_L
+    ldmia tmp, {L, R, Lh, Rh}
     b .loop

 .exit:
-    add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
+    add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
     ldmfd sp!, {r4-r11, pc}
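
The other structural change here: the per-polygon state {L, R, Lh, Rh} used to be pushed and popped around the scanline loop with stmfd/ldmfd, which moves sp and pays write-back on every polygon iteration. It now lives in a fixed frame reserved once in the prologue, so the hot .scanline_end path can ldmia the per-scanline steps straight from sp. The frame, restated as a C struct for reference (offsets assume 32-bit pointers, as on GBA):

struct FTFrame {               // sub sp, #SP_SIZE
    int32_t Ldx;               // SP_LDX = 0
    int32_t Ldt;               // SP_LDT = 4   (packed u:v step)
    int32_t Rdx;               // SP_RDX = 8
    int32_t Rdt;               // SP_RDT = 12
    void*   L;                 // SP_L   = 16
    void*   R;                 // SP_R   = 20
    int32_t Lh;                // SP_LH  = 24
    int32_t Rh;                // SP_RH  = 28
};                             // SP_SIZE = 32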

View File

@@ -17,6 +17,8 @@ Lt .req r11
 Rt .req r12
 h .req lr

+ptr .req tmp
+
 Ldx .req h
 Rdx .req h
@@ -41,14 +43,14 @@ duv .req R
 du .req L
 dv .req R

-Lduv .req h
 Ldu .req N
 Ldv .req h

-Rduv .req h
 Rdu .req N
 Rdv .req h

+Rti .req indexB
+
 sLdx .req tmp
 sLdt .req N
 sRdx .req Lh
@@ -58,38 +60,25 @@ SP_LDX = 0
 SP_LDT = 4
 SP_RDX = 8
 SP_RDT = 12
+SP_L = 16
+SP_R = 20
+SP_LH = 24
+SP_RH = 28
+SP_SIZE = 32

 .macro PUT_PIXELS
-#ifndef TEX_2PX
-    tex indexA, t
-    add t, dtdx
-    tex indexB, t
-    add t, dtdx
-    // cheap non-accurate alpha test, skip pixels pair if one or both are transparent
-    ands indexA, #255
-    andnes indexB, #255
-    orrne indexB, indexA, indexB, lsl #8 // indexB = indexA | (indexB << 8)
-    ldrneb indexA, [LMAP, indexA]
-    ldrneb indexB, [LMAP, indexB, lsr #8]
-    orrne indexA, indexB, lsl #8
-    strneh indexA, [tmp]
-    add tmp, #2
-#else
     tex indexA, t
     add t, dtdx, lsl #1
     cmp indexA, #0
     ldrneb indexA, [LMAP, indexA]
-    strneb indexA, [tmp]
-    add tmp, #2
-#endif
+    strneb indexA, [ptr]
+    add ptr, #2
 .endm

 .global rasterizeFTA_asm
 rasterizeFTA_asm:
     stmfd sp!, {r4-r11, lr}
-    sub sp, #16 // reserve stack space for [Ldx, Ldt, Rdx, Rdt]
+    sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldt, Rdx, Rdt]

     mov LMAP, #LMAP_ADDR

     ldrb tmp, [L, #VERTEX_G]
@@ -131,16 +120,9 @@ rasterizeFTA_asm:
     mul Ldx, tmp // Ldx = tmp * (N->v.x - Lx)
     str Ldx, [sp, #SP_LDX] // store Ldx to stack

-    ldr Lduv, [L, #VERTEX_T]
-    sub Lduv, Lt // Lduv = N->v.t - Lt
-    asr Ldu, Lduv, #16
-    mul Ldu, tmp // Rdu = tmp * int16(Lduv >> 16)
-    lsl Ldv, Lduv, #16
-    asr Ldv, #16
-    mul Ldv, tmp // Rdv = tmp * int16(Lduv)
-    lsr Ldu, #16
-    lsl Ldu, #16
-    orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Ldt, [L, #VERTEX_T]
+    sub Ldt, Lt // Ldt = N->v.t - Lt
+    scaleUV Ldt, Ldu, Ldv, tmp
     str Ldt, [sp, #SP_LDT] // store Ldt to stack

 .calc_left_end:
@@ -172,16 +154,9 @@ rasterizeFTA_asm:
     mul Rdx, tmp // Rdx = tmp * (N->v.x - Rx)
     str Rdx, [sp, #SP_RDX] // store Rdx to stack

-    ldr Rduv, [R, #VERTEX_T]
-    sub Rduv, Rt // Rduv = N->v.t - Rt
-    asr Rdu, Rduv, #16
-    mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
-    lsl Rdv, Rduv, #16
-    asr Rdv, #16
-    mul Rdv, tmp // Rdv = tmp * int16(Rduv)
-    lsr Rdu, #16
-    lsl Rdu, #16
-    orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Rdt, [R, #VERTEX_T]
+    sub Rdt, Rt // Rdt = N->v.t - Rt
+    scaleUV Rdt, Rdu, Rdv, tmp
     str Rdt, [sp, #SP_RDT] // store Rdt to stack

 .calc_right_end:
@@ -191,46 +166,38 @@ rasterizeFTA_asm:
     sub Lh, h // Lh -= h
     sub Rh, h // Rh -= h

-    stmfd sp!, {L,R,Lh,Rh} // sp-16
+    add tmp, sp, #SP_L
+    stmia tmp, {L, R, Lh, Rh}

 .scanline_start:
     asr tmp, Lx, #16 // x1 = (Lx >> 16)
     rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
     ble .scanline_end // if (width <= 0) go next scanline

-    add tmp, pixel, tmp // tmp = pixel + x1
+    add ptr, pixel, tmp // ptr = pixel + x1

     mov DIVLUTi, #DIVLUT_ADDR
     lsl inv, width, #1
     ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)

-    sub duv, Rt, Lt // duv = Rt - Lt
-    asr du, duv, #16
-    mul du, inv // du = inv * int16(duv >> 16)
-    lsl dv, duv, #16
-    asr dv, #16
-    mul dv, inv // dv = inv * int16(duv)
-    lsr du, #16
-    lsl du, #16
-    orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
+    sub dtdx, Rt, Lt // duv = Rt - Lt
+    scaleUV dtdx, du, dv, inv

     mov t, Lt // t = Lt

     // 2 bytes alignment (VRAM write requirement)
 .align_left:
-    tst tmp, #1 // if (tmp & 1)
+    tst ptr, #1 // if (ptr & 1)
     beq .align_right

-    and indexA, t, #0xFF00
-    orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    ldrb indexA, [TILE, indexA]
+    tex indexA, t

     cmp indexA, #0
-    ldrneb indexB, [tmp, #-1]! // read pal index from VRAM (byte)
+    ldrneb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
     ldrneb indexA, [LMAP, indexA]
     orrne indexB, indexA, lsl #8
-    strneh indexB, [tmp], #2
-    addeq tmp, #1
+    strneh indexB, [ptr], #2
+    addeq ptr, #1

     add t, dtdx
     subs width, #1 // width--
@@ -240,17 +207,14 @@ rasterizeFTA_asm:
 .align_right:
     tst width, #1
     beq .align_block_4px

-    sub Rt, dtdx
-    and indexA, Rt, #0xFF00
-    orr indexA, Rt, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    add Rt, dtdx
-    ldrb indexA, [TILE, indexA]
+    sub Rti, Rt, dtdx
+    tex indexA, Rti

     cmp indexA, #0
     ldrneb indexA, [LMAP, indexA]
-    ldrneb indexB, [tmp, width]
+    ldrneb indexB, [ptr, width]
     orrne indexB, indexA, indexB, lsl #8
-    addne indexA, tmp, width
+    addne indexA, ptr, width
     strneh indexB, [indexA, #-1]

     subs width, #1 // width--
@@ -285,8 +249,7 @@ rasterizeFTA_asm:
     bne .scanline_block_8px

 .scanline_end:
-    add tmp, sp, #16
-    ldmia tmp, {sLdx, sLdt, sRdx, sRdt}
+    ldmia sp, {sLdx, sLdt, sRdx, sRdt}
     add Lx, sLdx
     add Lt, sLdt
     add Rx, sRdx
@@ -297,9 +260,10 @@ rasterizeFTA_asm:
     subs h, #1
     bne .scanline_start

-    ldmfd sp!, {L,R,Lh,Rh} // sp+16
+    add tmp, sp, #SP_L
+    ldmia tmp, {L, R, Lh, Rh}
     b .loop

 .exit:
-    add sp, #16 // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
+    add sp, #SP_SIZE // revert reserved space for [Ldx, Ldt, Rdx, Rdt]
     ldmfd sp!, {r4-r11, pc}
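
The deleted per-pixel branch tested both texels of a pair (the "cheap non-accurate alpha test"); the surviving path is cheaper still: it fetches only the first texel and treats palette index 0 as transparent for both pixels. A C model of the retained loop (a sketch; byte writes to GBA VRAM fill the whole halfword):

#include <stdint.h>

void span_alpha_2px(const uint8_t* tile, const uint8_t* lmap,
                    volatile uint8_t* dst, uint32_t t, uint32_t dtdx,
                    int32_t pairs)
{
    while (pairs--) {
        uint8_t a = tile[(t & 0xFF00) | (t >> 24)];
        t += dtdx << 1;
        if (a) {
            dst[0] = lmap[a]; // one byte covers both pixels of the pair
        }
        dst += 2;             // skip the pair either way
    }
}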

View File

@@ -35,7 +35,7 @@ Ldt .req h
 Rdt .req h

 indexA .req Lh
-indexB .req Rh
+indexB .req tmp

 Rxy .req tmp
 Ry2 .req Rh
@@ -47,23 +47,19 @@ DIVLUT .req N
 DIVLUTi .req tmp

 ptr .req Lx
-width .req Rx
+width .req Rh
 g .req Lg
-dgdx .req Rg
+dgdx .req L
 t .req Lt
-dtdx .req Rt
+dtdx .req R

-duv .req R
 du .req L
 dv .req R

-Lduv .req N
 Ldu .req TILE
 Ldv .req N

-Rduv .req N
 Rdu .req TILE
 Rdv .req N
@@ -75,7 +71,7 @@ sLdg .req R
 sLdt .req Lh

 sRdx .req Rh
 sRdg .req tmp
-sRdt .req N // not used in ldm due h collision
+sRdt .req tmp // not enough regs for one ldmia

 SP_LDX = 0
 SP_LDG = 4
@@ -83,6 +79,12 @@ SP_LDT = 8
 SP_RDX = 12
 SP_RDG = 16
 SP_RDT = 20
+SP_L = 24
+SP_R = 28
+SP_LH = 32
+SP_RH = 36
+SP_SIZE = 40
+SP_TILE = SP_SIZE
@@ -91,28 +93,18 @@ SP_RDT = 20
 .macro PUT_PIXELS
     bic LMAP, g, #255

     tex indexA, t
     lit indexA
-#ifndef TEX_2PX
-    add t, dtdx
-    tex indexB, t
-    lit indexB
-    add t, dtdx
-    orr indexA, indexB, lsl #8
-    strh indexA, [ptr], #2
-#else
     add t, dtdx, lsl #1
     //orr indexA, indexA, lsl #8
     strb indexA, [ptr], #2 // writing a byte to GBA VRAM will write a half word for free
-#endif
 .endm

 .global rasterizeGT_asm
 rasterizeGT_asm:
-    stmfd sp!, {r4-r11, lr}
-    sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
+    ldr r3, =gTile
+    ldr r3, [r3]
+    stmfd sp!, {r3-r11, lr}
+    sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]

     mov Lh, #0 // Lh = 0
     mov Rh, #0 // Rh = 0
@@ -155,16 +147,9 @@ rasterizeGT_asm:
     asr Ldg, #8 // 8-bit for fractional part
     str Ldg, [sp, #SP_LDG] // store Ldg to stack

-    ldr Lduv, [L, #VERTEX_T]
-    sub Lduv, Lt // Lduv = N->v.t - Lt
-    asr Ldu, Lduv, #16
-    mul Ldu, tmp // Rdu = tmp * int16(Lduv >> 16)
-    lsl Ldv, Lduv, #16
-    asr Ldv, #16
-    mul Ldv, tmp // Rdv = tmp * int16(Lduv)
-    lsr Ldu, #16
-    lsl Ldu, #16
-    orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Ldt, [L, #VERTEX_T]
+    sub Ldt, Lt // Ldt = N->v.t - Lt
+    scaleUV Ldt, Ldu, Ldv, tmp
     str Ldt, [sp, #SP_LDT] // store Ldt to stack

 .calc_left_end:
@@ -204,16 +189,9 @@ rasterizeGT_asm:
     asr Rdg, #8 // 8-bit for fractional part
     str Rdg, [sp, #SP_RDG] // store Ldg to stack

-    ldr Rduv, [R, #VERTEX_T]
-    sub Rduv, Rt // Rduv = N->v.t - Rt
-    asr Rdu, Rduv, #16
-    mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
-    lsl Rdv, Rduv, #16
-    asr Rdv, #16
-    mul Rdv, tmp // Rdv = tmp * int16(Rduv)
-    lsr Rdu, #16
-    lsl Rdu, #16
-    orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Rdt, [R, #VERTEX_T]
+    sub Rdt, Rt // Rdt = N->v.t - Rt
+    scaleUV Rdt, Rdu, Rdv, tmp
     str Rdt, [sp, #SP_RDT] // store Rdt to stack

 .calc_right_end:
@@ -226,54 +204,44 @@ rasterizeGT_asm:
     sub Lh, h // Lh -= h
     sub Rh, h // Rh -= h

-    ldr TILE, =gTile
-    ldr TILE, [TILE]
+    ldr TILE, [sp, #SP_TILE]

-    stmfd sp!, {L,R,Lh,Rh} // sp-16
+    add tmp, sp, #SP_L
+    stmia tmp, {L, R, Lh, Rh}

 .scanline_start:
-    stmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp-24
+    stmfd sp!, {Lx, Lg, Lt}

-    asr tmp, Lx, #16 // x1 = (Lx >> 16)
-    rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
+    asr Lx, Lx, #16 // x1 = (Lx >> 16)
+    rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1
     ble .scanline_end // if (width <= 0) go next scanline

-    add ptr, pixel, tmp // ptr = pixel + x1
+    add ptr, pixel, Lx // ptr = pixel + x1

     mov DIVLUTi, #DIVLUT_ADDR
     lsl inv, width, #1
     ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)

+    sub dtdx, Rt, Lt // dtdx = Rt - Lt
+    scaleUV dtdx, du, dv, inv
+    // t == Lt (alias)
+
     sub dgdx, Rg, Lg // dgdx = Rg - Lg
     mul dgdx, inv // dgdx *= FixedInvU(width)
     asr dgdx, #15 // dgdx >>= 15
     // g == Lg (alias)

-    sub duv, Rt, Lt // duv = Rt - Lt
-    asr du, duv, #16
-    mul du, inv // du = inv * int16(duv >> 16)
-    lsl dv, duv, #16
-    asr dv, #16
-    mul dv, inv // dv = inv * int16(duv)
-    lsr du, #16
-    lsl du, #16
-    orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
-    // t == Lt (alias)
-
     // 2 bytes alignment (VRAM write requirement)
 .align_left:
     tst ptr, #1 // if (ptr & 1)
     beq .align_right

-    ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
     bic LMAP, g, #255
     add g, dgdx, asr #1

-    and indexA, t, #0xFF00
-    orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    ldrb indexA, [TILE, indexA]
-    ldrb indexA, [LMAP, indexA]
+    tex indexA, t
+    lit indexA
+
+    ldrb indexB, [ptr, #-1]! // read pal index from VRAM (byte)
     orr indexB, indexA, lsl #8
     strh indexB, [ptr], #2

     add t, dtdx
@@ -284,21 +252,16 @@ rasterizeGT_asm:
 .align_right:
     tst width, #1
     beq .align_block_4px

-    ldrb indexB, [ptr, width]
-    subs width, #1 // width--
-
-    mla Rti, width, dtdx, t // Rti = width * dtdx + t
-    and indexA, Rti, #0xFF00
-    orr indexA, Rti, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    ldrb indexA, [TILE, indexA]
-
-    asr Rgi, dgdx, #1
-    mla Rgi, width, Rgi, g // Rgi = width * (dgdx / 2) + g
+    sub Rti, Rt, dtdx
+    tex indexA, Rti
+    sub Rgi, Rg, dgdx, asr #1
     bic LMAP, Rgi, #255
-    ldrb indexA, [LMAP, indexA]
+    lit indexA
+
+    ldrb indexB, [ptr, width]
+    subs width, #1 // width--

     orr indexB, indexA, indexB, lsl #8
     strh indexB, [ptr, width]
@@ -333,10 +296,9 @@ rasterizeGT_asm:
     bne .scanline_block_8px

 .scanline_end:
-    ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24
-    add tmp, sp, #16
-    ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}
+    ldmfd sp!, {Lx, Lg, Lt}
+    ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg}

     add Lx, sLdx
     add Lg, sLdg
@@ -344,17 +306,18 @@ rasterizeGT_asm:
     add Rx, sRdx
     add Rg, sRdg

-    ldr tmp, [sp, #(SP_RDT + 16)]
-    add Rt, tmp // Rt += Rdt from stack
+    ldr sRdt, [sp, #SP_RDT]
+    add Rt, sRdt

     add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)

     subs h, #1
     bne .scanline_start

-    ldmfd sp!, {L,R,Lh,Rh} // sp+16
+    add tmp, sp, #SP_L
+    ldmia tmp, {L, R, Lh, Rh}
     b .loop

 .exit:
-    add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
+    add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE]
     ldmfd sp!, {r4-r11, pc}
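
Two ideas land here at once. First, gTile is now loaded once in the prologue and pushed along with the callee-saved registers (stmfd sp!, {r3-r11, lr}), so the polygon loop refetches it with a cheap ldr from the frame (SP_TILE) instead of a literal-pool load plus a dereference; the epilogue then drops SP_SIZE + 4 bytes to account for the extra word. Second, the Gouraud loop lights one texel per 2-pixel pair: the light value g keeps an 8-bit fraction, its upper bits select a 256-entry lightmap row, and dgdx (computed with asr #15 rather than #16) is the per-pair step. A C model of the pair write (a sketch; in the asm the lightmap base address is folded into g itself, here it is explicit):

#include <stdint.h>

void span_gouraud_2px(const uint8_t* tile, const uint8_t* lightmap,
                      volatile uint8_t* dst, uint32_t t, uint32_t dtdx,
                      uint32_t g, int32_t dgdx, int32_t pairs)
{
    while (pairs--) {
        const uint8_t* lmap = lightmap + (g & ~255u); // row = light level
        g += (uint32_t)dgdx;                          // per-pair step (asr #15)
        dst[0] = lmap[tile[(t & 0xFF00) | (t >> 24)]];
        t += dtdx << 1;
        dst += 2;
    }
}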

View File

@@ -35,7 +35,7 @@ Ldt .req h
 Rdt .req h

 indexA .req Lh
-indexB .req Rh
+indexB .req tmp

 Rxy .req tmp
 Ry2 .req Rh
@@ -47,23 +47,21 @@ DIVLUT .req N
 DIVLUTi .req tmp

 ptr .req Lx
-width .req Rx
+width .req Rh
 g .req Lg
-dgdx .req Rg
+dgdx .req L
 t .req Lt
-dtdx .req Rt
+dtdx .req R

 duv .req R
 du .req L
 dv .req R

-Lduv .req N
 Ldu .req TILE
 Ldv .req N

-Rduv .req N
 Rdu .req TILE
 Rdv .req N
@@ -75,7 +73,7 @@ sLdg .req R
 sLdt .req Lh

 sRdx .req Rh
 sRdg .req tmp
-sRdt .req N // not used in ldm due h collision
+sRdt .req tmp // not enough regs for one ldmia

 SP_LDX = 0
 SP_LDG = 4
@@ -83,41 +81,32 @@ SP_LDT = 8
 SP_RDX = 12
 SP_RDG = 16
 SP_RDT = 20
+SP_L = 24
+SP_R = 28
+SP_LH = 32
+SP_RH = 36
+SP_SIZE = 40
+SP_TILE = SP_SIZE

 .macro PUT_PIXELS
     bic LMAP, g, #255
     add g, dgdx
-#ifndef TEX_2PX
-    tex indexA, t
-    add t, dtdx
-    tex indexB, t
-    add t, dtdx
-    // cheap non-accurate alpha test, skip pixels pair if one or both are transparent
-    ands indexA, #255
-    andnes indexB, #255
-    orrne indexB, indexA, indexB, lsl #8 // indexB = indexA | (indexB << 8)
-    ldrneb indexA, [LMAP, indexA]
-    ldrneb indexB, [LMAP, indexB, lsr #8]
-    orrne indexA, indexB, lsl #8
-    strneh indexA, [ptr]
-#else
     tex indexA, t
     add t, dtdx, lsl #1
     cmp indexA, #0
     ldrneb indexA, [LMAP, indexA]
     strneb indexA, [ptr]
-#endif
     add ptr, #2
 .endm

 .global rasterizeGTA_asm
 rasterizeGTA_asm:
-    stmfd sp!, {r4-r11, lr}
-    sub sp, #24 // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
+    ldr r3, =gTile
+    ldr r3, [r3]
+    stmfd sp!, {r3-r11, lr}
+    sub sp, #SP_SIZE // reserve stack space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]

     mov Lh, #0 // Lh = 0
     mov Rh, #0 // Rh = 0
@@ -160,16 +149,9 @@ rasterizeGTA_asm:
     asr Ldg, #8 // 8-bit for fractional part
     str Ldg, [sp, #SP_LDG] // store Ldg to stack

-    ldr Lduv, [L, #VERTEX_T]
-    sub Lduv, Lt // Lduv = N->v.t - Lt
-    asr Ldu, Lduv, #16
-    mul Ldu, tmp // Rdu = tmp * int16(Lduv >> 16)
-    lsl Ldv, Lduv, #16
-    asr Ldv, #16
-    mul Ldv, tmp // Rdv = tmp * int16(Lduv)
-    lsr Ldu, #16
-    lsl Ldu, #16
-    orr Ldt, Ldu, Ldv, lsr #16 // Ldt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Ldt, [L, #VERTEX_T]
+    sub Ldt, Lt // Ldt = N->v.t - Lt
+    scaleUV Ldt, Ldu, Ldv, tmp
     str Ldt, [sp, #SP_LDT] // store Ldt to stack

 .calc_left_end:
@@ -209,16 +191,9 @@ rasterizeGTA_asm:
     asr Rdg, #8 // 8-bit for fractional part
     str Rdg, [sp, #SP_RDG] // store Ldg to stack

-    ldr Rduv, [R, #VERTEX_T]
-    sub Rduv, Rt // Rduv = N->v.t - Rt
-    asr Rdu, Rduv, #16
-    mul Rdu, tmp // Rdu = tmp * int16(Rduv >> 16)
-    lsl Rdv, Rduv, #16
-    asr Rdv, #16
-    mul Rdv, tmp // Rdv = tmp * int16(Rduv)
-    lsr Rdu, #16
-    lsl Rdu, #16
-    orr Rdt, Rdu, Rdv, lsr #16 // Rdt = (Rdu & 0xFFFF0000) | (Rdv >> 16)
+    ldr Rdt, [R, #VERTEX_T]
+    sub Rdt, Rt // Rdt = N->v.t - Rt
+    scaleUV Rdt, Rdu, Rdv, tmp
     str Rdt, [sp, #SP_RDT] // store Rdt to stack

 .calc_right_end:
@@ -231,48 +206,39 @@ rasterizeGTA_asm:
     sub Lh, h // Lh -= h
     sub Rh, h // Rh -= h

-    ldr TILE, =gTile
-    ldr TILE, [TILE]
+    ldr TILE, [sp, #SP_TILE]

-    stmfd sp!, {L,R,Lh,Rh} // sp-16
+    add tmp, sp, #SP_L
+    stmia tmp, {L, R, Lh, Rh}

 .scanline_start:
-    stmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp-24
+    stmfd sp!, {Lx, Lg, Lt}

-    asr tmp, Lx, #16 // x1 = (Lx >> 16)
-    rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
+    asr Lx, Lx, #16 // x1 = (Lx >> 16)
+    rsbs width, Lx, Rx, asr #16 // width = (Rx >> 16) - x1
     ble .scanline_end // if (width <= 0) go next scanline

-    add ptr, pixel, tmp // ptr = pixel + x1
+    add ptr, pixel, Lx // ptr = pixel + x1

     mov DIVLUTi, #DIVLUT_ADDR
     lsl inv, width, #1
     ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)

+    sub dtdx, Rt, Lt // dtdx = Rt - Lt
+    scaleUV dtdx, du, dv, inv
+    // t == Lt (alias)
+
     sub dgdx, Rg, Lg // dgdx = Rg - Lg
     mul dgdx, inv // dgdx *= FixedInvU(width)
     asr dgdx, #15 // dgdx >>= 15
     // g == Lg (alias)

-    sub duv, Rt, Lt // duv = Rt - Lt
-    asr du, duv, #16
-    mul du, inv // du = inv * int16(duv >> 16)
-    lsl dv, duv, #16
-    asr dv, #16
-    mul dv, inv // dv = inv * int16(duv)
-    lsr du, #16
-    lsl du, #16
-    orr dtdx, du, dv, lsr #16 // dtdx = (du & 0xFFFF0000) | (dv >> 16)
-    // t == Lt (alias)
-
     // 2 bytes alignment (VRAM write requirement)
 .align_left:
     tst ptr, #1 // if (ptr & 1)
     beq .align_right

-    and indexA, t, #0xFF00
-    orr indexA, t, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    ldrb indexA, [TILE, indexA]
+    tex indexA, t

     cmp indexA, #0
     beq .skip_left
@@ -296,29 +262,24 @@ rasterizeGTA_asm:
     tst width, #1
     beq .align_block_4px

-    ldrb indexB, [ptr, width]
-
-    sub width, #1 // width--
-
-    mla Rti, width, dtdx, t // Rti = width * dtdx + t
-    and indexA, Rti, #0xFF00
-    orr indexA, Rti, lsr #24 // res = (t & 0xFF00) | (t >> 24)
-    ldrb indexA, [TILE, indexA]
+    sub Rti, Rt, dtdx
+    tex indexA, Rti

     cmp indexA, #0
+    subeq width, #1
     beq .skip_right

-    asr Rgi, dgdx, #1
-    mla Rgi, width, Rgi, g // Rgi = width * (dgdx / 2) + g
+    sub Rgi, Rg, dgdx, asr #1
     bic LMAP, Rgi, #255
-    ldrb indexA, [LMAP, indexA]
+    lit indexA
+
+    ldrb indexB, [ptr, width]
+    sub width, #1 // width--

     orr indexB, indexA, indexB, lsl #8
     strh indexB, [ptr, width]

 .skip_right:
-    cmp width, #0 // width--
+    cmp width, #0
     beq .scanline_end // if (width == 0)
@@ -350,10 +311,9 @@ rasterizeGTA_asm:
     bne .scanline_block_8px

 .scanline_end:
-    ldmfd sp!, {Lx,Rx,Lg,Rg,Lt,Rt} // sp+24
-    add tmp, sp, #16
-    ldmia tmp, {sLdx, sLdg, sLdt, sRdx, sRdg}
+    ldmfd sp!, {Lx, Lg, Lt}
+    ldmia sp, {sLdx, sLdg, sLdt, sRdx, sRdg}

     add Lx, sLdx
     add Lg, sLdg
@@ -361,17 +321,18 @@ rasterizeGTA_asm:
     add Rx, sRdx
     add Rg, sRdg

-    ldr tmp, [sp, #(SP_RDT + 16)]
-    add Rt, tmp // Rt += Rdt from stack
+    ldr sRdt, [sp, #SP_RDT]
+    add Rt, sRdt

     add pixel, #FRAME_WIDTH // pixel += FRAME_WIDTH (240)

     subs h, #1
     bne .scanline_start

-    ldmfd sp!, {L,R,Lh,Rh} // sp+16
+    add tmp, sp, #SP_L
+    ldmia tmp, {L, R, Lh, Rh}
     b .loop

 .exit:
-    add sp, #24 // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt]
+    add sp, #(SP_SIZE + 4) // revert reserved space for [Ldx, Ldg, Ldt, Rdx, Rdg, Rdt, TILE]
     ldmfd sp!, {r4-r11, pc}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -153,7 +153,8 @@ void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
 void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
 {
-    uint16 color = gLightmap[(L->v.g << 8) | L->v.clip];
+    uint32 color = (uint32)R;
+    color = gLightmap[(L->v.g << 8) | color];
     color |= (color << 8);

     int32 Lh = 0;
@@ -163,6 +164,8 @@ void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
     int32 Rx;
     int32 Lx;

+    R = L;
+
     while (1)
     {
         while (!Lh)

View File

@@ -49,12 +49,12 @@ enum FaceType {
     FACE_TYPE_MAX
 };

-#define FACE_TRIANGLE (1 << 13)
+#define FACE_TRIANGLE (1 << 19)
 #define FACE_CLIPPED (1 << 18)
 #define FACE_TYPE_SHIFT 14
 #define FACE_TYPE_MASK 15
 #define FACE_GOURAUD (2 << FACE_TYPE_SHIFT)
-#define FACE_TEXTURE 0x1FFF
+#define FACE_TEXTURE 0x3FFF

 #include "rasterizer.h"
@@ -411,6 +411,7 @@ void faceAddRoomTriangles_c(const RoomTriangle* polys, int32 count)
     if (g0 != g1 || g0 != g2) {
         flags += FACE_GOURAUD;
     }
+    flags |= FACE_TRIANGLE;

     if (checkBackface(v0, v1, v2))
         continue;
@@ -487,6 +488,7 @@ void faceAddMeshTriangles_c(const MeshTriangle* polys, int32 count)
     if ((c0 | c1 | c2) & CLIP_MASK_VP) {
         flags |= FACE_CLIPPED;
     }
+    flags |= FACE_TRIANGLE;

     int32 depth = (v0->z + v1->z + v2->z + v2->z) >> (2 + OT_SHIFT);
@@ -634,11 +636,9 @@ X_NOINLINE void rasterize_c(uint32 flags, VertexLink* top)
     uint32 type = (flags >> FACE_TYPE_SHIFT) & FACE_TYPE_MASK;

-    if (type == FACE_TYPE_F) {
-        top->v.clip = flags; // use tex coord as color index for untextured polys
-    }
+    VertexLink* R = (type == FACE_TYPE_F) ? (VertexLink*)(flags & 0xFF) : top;

-    gRasterProc[type]((uint16*)pixel, top, top);
+    gRasterProc[type]((uint16*)pixel, top, R);
 }

 void flush_c()
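
Taken together, the header change redraws the face-flags word. FACE_TRIANGLE moves from bit 13 to bit 19, which frees a 14th bit for the texture index (FACE_TEXTURE grows from 0x1FFF to 0x3FFF), and the faceAdd*Triangles paths now set the triangle bit explicitly. The resulting layout, restated as C for reference:

// bit 19        | bit 18       | bits 17..14 | bits 13..0
// FACE_TRIANGLE | FACE_CLIPPED | face type   | texture index
#define FACE_TEXTURE    0x3FFF              // 14 bits of texture index
#define FACE_TYPE_SHIFT 14
#define FACE_TYPE_MASK  15
#define FACE_CLIPPED    (1 << 18)
#define FACE_TRIANGLE   (1 << 19)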