1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-08-07 13:46:45 +02:00

3DS optimize i-cache for STR playback

This commit is contained in:
XProger
2020-02-21 04:27:43 +03:00
parent 9e904600a6
commit dc9badd801
2 changed files with 42 additions and 47 deletions

View File

@@ -48,10 +48,7 @@ APP_DESCRIPTION := Classic Tomb Raider open-source engine
#---------------------------------------------------------------------------------
ARCH := -march=armv6k -mtune=mpcore -mfloat-abi=hard -mtp=soft
CFLAGS := -g0 -w -Ofast -ffast-math -mword-relocations \
-fomit-frame-pointer -ffunction-sections \
$(ARCH)
CFLAGS := $(ARCH) -g0 -w -Ofast -ffast-math -mword-relocations -fomit-frame-pointer -ffunction-sections
CFLAGS += $(INCLUDE) -DARM11 -D_3DS
CXXFLAGS := $(CFLAGS) -fno-rtti -fno-exceptions -std=gnu++11

View File

@@ -970,48 +970,44 @@ struct Video {
ac = (code & (1 << (8 + shift - e.length))) ? -e.ac : e.ac;
return true;
}
void IDCT_PASS(int16 *src, int16 *dst, int32 x) { \
int32 a0 = src[0 * x] * STR_IDCT_A;
int32 b1 = src[1 * x] * STR_IDCT_B;
int32 c1 = src[1 * x] * STR_IDCT_C;
int32 d1 = src[1 * x] * STR_IDCT_D;
int32 e1 = src[1 * x] * STR_IDCT_E;
int32 f2 = src[2 * x] * STR_IDCT_F;
int32 g2 = src[2 * x] * STR_IDCT_G;
int32 b3 = src[3 * x] * STR_IDCT_B;
int32 c3 = src[3 * x] * STR_IDCT_C;
int32 d3 = src[3 * x] * STR_IDCT_D;
int32 e3 = src[3 * x] * STR_IDCT_E;
int32 a4 = src[4 * x] * STR_IDCT_A;
int32 b5 = src[5 * x] * STR_IDCT_B;
int32 c5 = src[5 * x] * STR_IDCT_C;
int32 d5 = src[5 * x] * STR_IDCT_D;
int32 e5 = src[5 * x] * STR_IDCT_E;
int32 f6 = src[6 * x] * STR_IDCT_F;
int32 g6 = src[6 * x] * STR_IDCT_G;
int32 b7 = src[7 * x] * STR_IDCT_B;
int32 c7 = src[7 * x] * STR_IDCT_C;
int32 d7 = src[7 * x] * STR_IDCT_D;
int32 e7 = src[7 * x] * STR_IDCT_E;
dst[0 * x] = ( a0 + b1 + f2 + c3 + a4 + d5 + g6 + e7 ) >> 16;
dst[1 * x] = ( a0 + c1 + g2 - e3 - a4 - b5 - f6 - d7 ) >> 16;
dst[2 * x] = ( a0 + d1 - g2 - b3 - a4 + e5 + f6 + c7 ) >> 16;
dst[3 * x] = ( a0 + e1 - f2 - d3 + a4 + c5 - g6 - b7 ) >> 16;
dst[4 * x] = ( a0 - e1 - f2 + d3 + a4 - c5 - g6 + b7 ) >> 16;
dst[5 * x] = ( a0 - d1 - g2 + b3 - a4 - e5 + f6 - c7 ) >> 16;
dst[6 * x] = ( a0 - c1 + g2 + e3 - a4 + b5 - f6 + d7 ) >> 16;
dst[7 * x] = ( a0 - b1 + f2 - c3 + a4 - d5 + g6 - e7 ) >> 16;
}
// In-place 2-D inverse DCT of one 64-element coefficient block.
//
//   b - 64 int16 coefficients on entry, overwritten with the result.
//
// The transform is separable and is done as two sets of eight 8-point
// passes through the scratch buffer t:
//   1. b -> t, stride 8, starting at offsets 0..7
//      (the columns, assuming a row-major 8x8 layout)
//   2. t -> b, stride 1, starting at offsets 0, 8, ... 56
//      (the rows, under the same assumption)
// Each pass is a call to the IDCT_PASS member function; the transform must
// run exactly once per block.
void IDCT(int16 *b) {
    int16 t[64];

    for (int i = 0; i < 8 * 1; i += 1) {
        IDCT_PASS(b + i, t + i, 8);
    }

    for (int i = 0; i < 8 * 8; i += 8) {
        IDCT_PASS(t + i, b + i, 1);
    }
}
virtual bool decodeVideo(Color32 *pixels) {
@@ -1027,7 +1023,7 @@ struct Video {
BitStream bs(chunk->data + 8, chunk->size - 8); // make bitstream without frame header
int16 block[6][64]; // Cr, Cb, YTL, YTR, YBL, YBR
for (int bX = 0; bX < width / 16; bX++)
for (int bX = 0; bX < width / 16; bX++) {
for (int bY = 0; bY < height / 16; bY++) {
memset(block, 0, sizeof(block));
@@ -1037,8 +1033,9 @@ struct Video {
int16 *channel = block[i];
channel[0] = bs.readU(10);
if (channel[0]) {
if (channel[0] & 0x200)
if (channel[0] & 0x200) {
channel[0] -= 0x400;
}
channel[0] = channel[0] * STR_QUANTIZATION[0]; // DC
nonZero = true;
}
@@ -1072,6 +1069,7 @@ struct Video {
c[0], c[1], c[width], c[width + 1]);
}
}
}
chunk->size = 0;
@@ -1351,7 +1349,7 @@ struct Video {
#endif
}
void render() { // just update GPU texture if it's necessary
void render() { // update GPU texture
if (!needUpdate) return;
frameTex[0]->update(frameData);
swap(frameTex[0], frameTex[1]);