1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-08-12 08:04:09 +02:00

#368 micro optimization divTable fetch

This commit is contained in:
XProger
2022-02-12 15:04:12 +03:00
parent fe75116d5b
commit 594541d6f4
13 changed files with 84 additions and 116 deletions

View File

@@ -3,8 +3,8 @@
#include "common.h"
extern uint8 lightmap[256 * 32];
extern const uint8* tile;
extern uint8 gLightmap[256 * 32];
extern const uint8* gTile;
#define rasterizeS rasterizeS_c
#define rasterizeF rasterizeF_c
@@ -20,7 +20,7 @@ extern const uint8* tile;
void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{
const uint8* ft_lightmap = &lightmap[0x1A00];
const uint8* ft_lightmap = &gLightmap[0x1A00];
int32 Lh = 0;
int32 Rh = 0;
@@ -128,7 +128,7 @@ void rasterizeS_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
void rasterizeF_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{
uint16 color = lightmap[(L->v.g << 8) | L->t.t];
uint16 color = gLightmap[(L->v.g << 8) | L->t.t];
color |= (color << 8);
int32 Lh = 0;
@@ -237,7 +237,7 @@ void rasterizeG_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
int32 Lx, Rx, Ldx = 0, Rdx = 0;
int32 Lg, Rg, Ldg = 0, Rdg = 0;
const uint8* ft_lightmap = lightmap + L->t.t;
const uint8* ft_lightmap = gLightmap + L->t.t;
while (1)
{
@@ -355,7 +355,7 @@ void rasterizeG_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{
const uint8* ft_lightmap = &lightmap[L->v.g << 8];
const uint8* ft_lightmap = &gLightmap[L->v.g << 8];
int32 Lh = 0, Rh = 0;
int32 Lx, Rx, Ldx = 0, Rdx = 0;
@@ -441,7 +441,7 @@ void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
if (intptr_t(ptr) & 1)
{
*ptr++ = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
*ptr++ = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
width--;
}
@@ -449,15 +449,15 @@ void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
if (width & 1)
{
uint32 tmp = Rt - dtdx;
ptr[width - 1] = ft_lightmap[tile[(tmp & 0xFF00) | (tmp >> 24)]];
ptr[width - 1] = ft_lightmap[gTile[(tmp & 0xFF00) | (tmp >> 24)]];
}
width >>= 1;
while (width--)
{
uint8 indexA = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
uint8 indexA = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
uint8 indexB = ft_lightmap[tile[(t & 0xFF00) | (t >> 24)]];
uint8 indexB = ft_lightmap[gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
#ifdef CPU_BIG_ENDIAN
@@ -483,7 +483,7 @@ void rasterizeFT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{
#ifdef ALIGNED_LIGHTMAP
ASSERT((intptr_t(lightmap) & 0xFFFF) == 0); // lightmap should be 64k aligned
ASSERT((intptr_t(gLightmap) & 0xFFFF) == 0); // lightmap should be 64k aligned
#endif
int32 Lh = 0, Rh = 0;
@@ -583,7 +583,7 @@ void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
if (intptr_t(ptr) & 1)
{
*ptr++ = lightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
*ptr++ = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
g += dgdx >> 1;
width--;
@@ -592,11 +592,11 @@ void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
if (width & 1)
{
uint32 tmp = Rt - dtdx;
ptr[width - 1] = lightmap[(Rg >> 8 << 8) | tile[(tmp & 0xFF00) | (tmp >> 24)]];
ptr[width - 1] = gLightmap[(Rg >> 8 << 8) | gTile[(tmp & 0xFF00) | (tmp >> 24)]];
}
#ifdef ALIGNED_LIGHTMAP
g += intptr_t(lightmap);
g += intptr_t(gLightmap);
#endif
width >>= 1;
@@ -606,15 +606,15 @@ void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
#ifdef ALIGNED_LIGHTMAP
const uint8* LMAP = (uint8*)(g >> 8 << 8);
uint8 indexA = LMAP[tile[(t & 0xFF00) | (t >> 24)]];
uint8 indexA = LMAP[gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
uint8 indexB = LMAP[tile[(t & 0xFF00) | (t >> 24)]];
uint8 indexB = LMAP[gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
g += dgdx;
#else
uint8 indexA = lightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
uint8 indexA = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
uint8 indexB = lightmap[(g >> 8 << 8) | tile[(t & 0xFF00) | (t >> 24)]];
uint8 indexB = gLightmap[(g >> 8 << 8) | gTile[(t & 0xFF00) | (t >> 24)]];
t += dtdx;
g += dgdx;
#endif
@@ -643,7 +643,7 @@ void rasterizeGT_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
void rasterizeFTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
{
const uint8* ft_lightmap = &lightmap[L->v.g << 8];
const uint8* ft_lightmap = &gLightmap[L->v.g << 8];
int32 Lh = 0, Rh = 0;
int32 Lx, Rx, Ldx = 0, Rdx = 0;
@@ -729,7 +729,7 @@ void rasterizeFTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
if (intptr_t(ptr) & 1)
{
uint8 p = tile[(t & 0xFF00) | (t >> 24)];
uint8 p = gTile[(t & 0xFF00) | (t >> 24)];
if (p) {
*ptr = ft_lightmap[p];
}
@@ -741,7 +741,7 @@ void rasterizeFTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
if (width & 1)
{
uint32 tmp = Rt - dtdx;
uint8 p = tile[(tmp & 0xFF00) | (tmp >> 24)];
uint8 p = gTile[(tmp & 0xFF00) | (tmp >> 24)];
if (p) {
ptr[width - 1] = ft_lightmap[p];
}
@@ -750,9 +750,9 @@ void rasterizeFTA_c(uint16* pixel, const VertexLink* L, const VertexLink* R)
width >>= 1;
while (width--)
{
uint8 indexA = tile[(t & 0xFF00) | (t >> 24)];
uint8 indexA = gTile[(t & 0xFF00) | (t >> 24)];
t += dtdx;
uint8 indexB = tile[(t & 0xFF00) | (t >> 24)];
uint8 indexB = gTile[(t & 0xFF00) | (t >> 24)];
t += dtdx;

View File

@@ -16,9 +16,9 @@ rMaxX .req r12
rMaxY .req lr
boxArg .req mx
divLUT .req mz
tmp .req mz
bz .req divLUT
bz .req mz
offset .req m
dz .req offset
xx .req rMinX
@@ -69,8 +69,8 @@ SIZE = (6 * 3 * 4)
mov dz, z, lsr #(FIXED_SHIFT + 6)
add dz, dz, z, lsr #(FIXED_SHIFT + 4)
mov dz, dz, lsl #1
ldrh dz, [divLUT, dz]
add tmp, dz, #DIVLUT_ADDR
ldrh dz, [tmp, dz]
mul x, dz, x
mul y, dz, y
@@ -165,7 +165,6 @@ boxIsVisible_asm:
mov maxY, maxY, asr #FIXED_SHIFT
stmdb sp!, {maxX, maxY, maxZ, minX, minY, minZ}
mov divLUT, #DIVLUT_ADDR
mov rMinX, #MAX_INT32
mov rMinY, #MAX_INT32
mov rMaxX, #MIN_INT32

View File

@@ -11,7 +11,6 @@ n1 .req r7
n2 .req r12
m .req lr
tmp .req m0
divLUT .req m0
.macro load
ldmia m, {m0, m1, m2}
@@ -107,9 +106,8 @@ matrixLerp_asm:
lerp _1_2
b .done
.mX_dY:
mov divLUT, #DIVLUT_ADDR
mov pdiv, pdiv, lsl #1
ldrh tmp, [divLUT, pdiv]
add tmp, pdiv, #DIVLUT_ADDR
ldrh tmp, [tmp, pdiv]
mul tmp, pmul, tmp
mov pmul, tmp, asr #8
lerp _X_Y

View File

@@ -12,7 +12,7 @@ Ldx .req r8
Rdx .req r9
N .req r10
tmp .req r11
DIVLUT .req r12
pair .req r12
width .req lr
R .req color
@@ -22,7 +22,7 @@ Ry2 .req Rh
Lxy .req tmp
Ly2 .req Lh
LMAP .req Lx
pair .req DIVLUT
ptr .req tmp
.global rasterizeF_asm
rasterizeF_asm:
@@ -38,8 +38,6 @@ rasterizeF_asm:
mov Rh, #0 // Rh = 0
.loop:
mov DIVLUT, #DIVLUT_ADDR
cmp Lh, #0
bne .calc_left_end // if (Lh != 0) end with left
@@ -57,8 +55,8 @@ rasterizeF_asm:
cmp Lh, #1 // if (Lh == 1) skip Ldx calc
beq .calc_left_end
lsl tmp, Lh, #1
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh)
add tmp, Lh, #DIVLUT_ADDR
ldrh tmp, [tmp, Lh] // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16
@@ -82,8 +80,8 @@ rasterizeF_asm:
cmp Rh, #1 // if (Rh == 1) skip Rdx calc
beq .calc_right_end
lsl tmp, Rh, #1
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)
add tmp, Rh, #DIVLUT_ADDR
ldrh tmp, [tmp, Rh] // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16
@@ -101,29 +99,29 @@ rasterizeF_asm:
rsbs width, tmp, Rx, asr #16 // width = (Rx >> 16) - x1
ble .scanline_end // if (width <= 0) go next scanline
add tmp, pixel, tmp // tmp = pixel + x1
add ptr, pixel, tmp // ptr = pixel + x1
// 2 bytes alignment (VRAM write requirement)
.align_left:
tst tmp, #1 // if (tmp & 1)
tst ptr, #1 // if (ptr & 1)
beq .align_right
ldrb pair, [tmp, #-1]! // *tmp++ = (*tmp & 0x00FF) | (index << 8)
ldrb pair, [ptr, #-1]! // *ptr++ = (*ptr & 0x00FF) | (index << 8)
orr pair, index, lsl #8
strh pair, [tmp], #2
strh pair, [ptr], #2
subs width, #1 // width--
beq .scanline_end // if (width == 0)
.align_right:
tst width, #1
beq .scanline_block_2px
ldrb pair, [tmp, width]
ldrb pair, [ptr, width]
subs width, #1 // width--
orr pair, index, pair, lsl #8
strh pair, [tmp, width]
strh pair, [ptr, width]
beq .scanline_end // if (width == 0)
.scanline_block_2px:
strb index, [tmp], #2 // VRAM one as two bytes write hack
strb index, [ptr], #2 // VRAM one as two bytes write hack
subs width, #2
bne .scanline_block_2px

View File

@@ -33,8 +33,6 @@ Lxy .req tmp
Ly2 .req Lh
inv .req Lh
DIVLUT .req N
DIVLUTi .req L
width .req N
t .req L
dtdx .req R
@@ -111,9 +109,8 @@ rasterizeFT_asm:
cmp Lh, #1 // if (Lh <= 1) skip Ldx calc
beq .calc_left_end
lsl tmp, Lh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh)
add tmp, Lh, #DIVLUT_ADDR
ldrh tmp, [tmp, Lh] // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16
@@ -145,9 +142,8 @@ rasterizeFT_asm:
cmp Rh, #1 // if (Rh <= 1) skip Rdx calc
beq .calc_right_end
lsl tmp, Rh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)
add tmp, Rh, #DIVLUT_ADDR
ldrh tmp, [tmp, Rh] // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16
@@ -176,9 +172,8 @@ rasterizeFT_asm:
add ptr, pixel, tmp // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
add inv, width, #DIVLUT_ADDR
ldrh inv, [inv, width] // inv = FixedInvU(width)
sub dtdx, Rt, Lt // duv = Rt - Lt
scaleUV dtdx, du, dv, inv

View File

@@ -33,8 +33,6 @@ Lxy .req tmp
Ly2 .req Lh
inv .req Lh
DIVLUT .req N
DIVLUTi .req L
width .req N
t .req L
dtdx .req R
@@ -111,9 +109,8 @@ rasterizeFTA_asm:
cmp Lh, #1 // if (Lh <= 1) skip Ldx calc
beq .calc_left_end
lsl tmp, Lh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh)
add tmp, Lh, #DIVLUT_ADDR
ldrh tmp, [tmp, Lh] // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16
@@ -145,9 +142,8 @@ rasterizeFTA_asm:
cmp Rh, #1 // if (Rh <= 1) skip Rdx calc
beq .calc_right_end
lsl tmp, Rh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)
add tmp, Rh, #DIVLUT_ADDR
ldrh tmp, [tmp, Rh] // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16
@@ -176,9 +172,8 @@ rasterizeFTA_asm:
add ptr, pixel, tmp // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
add inv, width, #DIVLUT_ADDR
ldrh inv, [inv, width] // inv = FixedInvU(width)
sub dtdx, Rt, Lt // duv = Rt - Lt
scaleUV dtdx, du, dv, inv

View File

@@ -43,8 +43,6 @@ Lxy .req tmp
Ly2 .req Lh
inv .req Lh
DIVLUT .req N
DIVLUTi .req tmp
ptr .req Lx
width .req Rh
@@ -132,9 +130,8 @@ rasterizeGT_asm:
cmp Lh, #1 // if (Lh <= 1) skip Ldx calc
beq .calc_left_end
lsl tmp, Lh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh)
add tmp, Lh, #DIVLUT_ADDR
ldrh tmp, [tmp, Lh] // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16
@@ -174,9 +171,8 @@ rasterizeGT_asm:
cmp Rh, #1 // if (Rh <= 1) skip Rdx calc
beq .calc_right_end
lsl tmp, Rh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)
add tmp, Rh, #DIVLUT_ADDR
ldrh tmp, [tmp, Rh] // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16
@@ -218,9 +214,8 @@ rasterizeGT_asm:
add ptr, pixel, Lx // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
add inv, width, #DIVLUT_ADDR
ldrh inv, [inv, width] // inv = FixedInvU(width)
sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv

View File

@@ -43,8 +43,6 @@ Lxy .req tmp
Ly2 .req Lh
inv .req Lh
DIVLUT .req N
DIVLUTi .req tmp
ptr .req Lx
width .req Rh
@@ -134,9 +132,8 @@ rasterizeGTA_asm:
cmp Lh, #1 // if (Lh <= 1) skip Ldx calc
beq .calc_left_end
lsl tmp, Lh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh)
add tmp, Lh, #DIVLUT_ADDR
ldrh tmp, [tmp, Lh] // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16
@@ -176,9 +173,8 @@ rasterizeGTA_asm:
cmp Rh, #1 // if (Rh <= 1) skip Rdx calc
beq .calc_right_end
lsl tmp, Rh, #1
mov DIVLUT, #DIVLUT_ADDR
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)
add tmp, Rh, #DIVLUT_ADDR
ldrh tmp, [tmp, Rh] // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16
@@ -220,9 +216,8 @@ rasterizeGTA_asm:
add ptr, pixel, Lx // ptr = pixel + x1
mov DIVLUTi, #DIVLUT_ADDR
lsl inv, width, #1
ldrh inv, [DIVLUTi, inv] // inv = FixedInvU(width)
add inv, width, #DIVLUT_ADDR
ldrh inv, [inv, width] // inv = FixedInvU(width)
sub dtdx, Rt, Lt // dtdx = Rt - Lt
scaleUV dtdx, du, dv, inv

View File

@@ -12,16 +12,15 @@ Ldx .req r8
Rdx .req r9
N .req r10
tmp .req r11
DIVLUT .req r12
pair .req r12
width .req lr
h .req N
Rxy .req tmp
Ry2 .req Rh
Lxy .req tmp
Ly2 .req Lh
pair .req DIVLUT
indexA .req Lh
indexB .req DIVLUT
indexB .req pair
.global rasterizeS_asm
rasterizeS_asm:
@@ -34,7 +33,6 @@ rasterizeS_asm:
mov Rh, #0 // Rh = 0
.loop:
mov DIVLUT, #DIVLUT_ADDR
cmp Lh, #0
bne .calc_left_end // if (Lh != 0) end with left
@@ -53,8 +51,8 @@ rasterizeS_asm:
cmp Lh, #1 // if (Lh == 1) skip Ldx calc
beq .calc_left_end
lsl tmp, Lh, #1
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Lh)
add tmp, Lh, #DIVLUT_ADDR
ldrh tmp, [tmp, Lh] // tmp = FixedInvU(Lh)
ldrsh Ldx, [L, #VERTEX_X]
sub Ldx, Lx, asr #16
@@ -78,8 +76,8 @@ rasterizeS_asm:
cmp Rh, #1 // if (Rh == 1) skip Rdx calc
beq .calc_right_end
lsl tmp, Rh, #1
ldrh tmp, [DIVLUT, tmp] // tmp = FixedInvU(Rh)
add tmp, Rh, #DIVLUT_ADDR
ldrh tmp, [tmp, Rh] // tmp = FixedInvU(Rh)
ldrsh Rdx, [R, #VERTEX_X]
sub Rdx, Rx, asr #16
@@ -112,7 +110,7 @@ rasterizeS_asm:
orr pair, indexA, lsl #8
strh pair, [tmp], #2
subs width, #1 // width--
subs width, #1 // width--
beq .scanline_end
.align_right:

View File

@@ -11,7 +11,7 @@ vx .req r7
vy .req r8
vz .req r12
m .req lr
divLUT .req m
tmp .req m
vp .req m
vMinXY .req z
vMaxXY .req r
@@ -49,9 +49,8 @@ sphereIsVisible_asm:
mov z, vz, lsr #(FIXED_SHIFT + 6)
add z, z, vz, lsr #(FIXED_SHIFT + 4)
mov z, z, lsl #1
mov divLUT, #DIVLUT_ADDR
ldrh z, [divLUT, z]
add tmp, z, #DIVLUT_ADDR
ldrh z, [tmp, z]
mul x, z, x
mul y, z, y
mul r, z, r

View File

@@ -20,7 +20,7 @@ ambient .req vx
vp .req vx
minXY .req vx
maxXY .req vy
DIVLUT .req vy
tmp .req vy
dz .req vx
SP_MINXY = 0
@@ -93,9 +93,8 @@ transformMesh_asm:
// project
mov dz, z, lsr #4
add dz, dz, z, lsr #6
mov dz, dz, lsl #1
mov DIVLUT, #DIVLUT_ADDR
ldrh dz, [DIVLUT, dz]
add tmp, dz, #DIVLUT_ADDR
ldrh dz, [tmp, dz]
mul x, dz, x
mul y, dz, y
mov x, x, asr #(16 - PROJ_SHIFT)

View File

@@ -25,7 +25,7 @@ vp .req vx
minXY .req vx
maxXY .req vy
DIVLUT .req my
tmp .req my
dz .req mz
fog .req mz
@@ -110,9 +110,8 @@ transformRoom_asm:
// project
mov dz, z, lsr #6
add dz, dz, z, lsr #4
mov dz, dz, lsl #1
mov DIVLUT, #DIVLUT_ADDR
ldrh dz, [DIVLUT, dz]
add tmp, dz, #DIVLUT_ADDR
ldrh dz, [tmp, dz]
mul x, dz, x
mul y, dz, y
mov x, x, asr #(16 - PROJ_SHIFT)

View File

@@ -28,7 +28,6 @@ vp .req vx
minXY .req vx
maxXY .req vy
DIVLUT .req my
dz .req mz
fog .req mz
@@ -137,9 +136,8 @@ transformRoomUW_asm:
// project
mov dz, z, lsr #6
add dz, dz, z, lsr #4
mov dz, dz, lsl #1
mov DIVLUT, #DIVLUT_ADDR
ldrh dz, [DIVLUT, dz]
add tmp, dz, #DIVLUT_ADDR
ldrh dz, [tmp, dz]
mul x, dz, x
mul y, dz, y
mov x, x, asr #(16 - PROJ_SHIFT)