1
0
mirror of https://github.com/XProger/OpenLara.git synced 2025-03-17 17:49:43 +01:00

GBA: Micro-optimization, and new ADPCM format for tracks (#452)

* GBA: Slightly faster clamping in sndPCM_mix

* GBA: Change IMA-ADPCM to new format...

... and fix a race condition when writing to music.data.

* GBA: Properly clear buffer on end-of-track

* GBA: Update packer for new ADPCM format

* GBA: Make music output clamp optional...

... and disable it by default, as the music has been built to avoid overflows.

* GBA: Add reference ADPCM encoder

* GBA: Re-encode music

* GBA: Improve ADPCM encoder

Minimize RMSE of post-filtered output rather than pre-filtered

* GBA: Fix ADPCM issues with silence
This commit is contained in:
Ruben 2023-03-01 11:04:52 +11:00 committed by GitHub
parent 2763743038
commit 4876cc8f0e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 289 additions and 146 deletions

View File

@ -30,7 +30,7 @@ EWRAM_DATA const char* const* STR = STR_EN;
EWRAM_DATA ExtraInfoLara playersExtra[MAX_PLAYERS];
#if defined(__GBA__)
#include "TRACKS_IMA.h"
#include "TRACKS_AD4.h"
#include "TITLE_SCR.h"
#include "TITLE_PKD.h"
#include "GYM_PKD.h"

View File

@ -2142,6 +2142,14 @@ struct IMA_STATE
int32 idx;
};
// Decoder state for the 4-bit ADPCM music codec.
// Currently only used for GBA
// The field order is significant: the assembly decoder loads and stores these
// four words with a single ldmia/stmia {zM1, zM2, tap, quant}.
struct ADPCM4_STATE
{
int32 zM1, zM2; // two most recent reconstructed values (the prediction is zM1 - zM2)
int32 tap; // post-filter accumulator, the 8-bit output sample is derived from tap >> 8
int32 quant; // current quantizer step size, re-adapted after every decoded sample
};
#if defined(GAPI_GL1)
#define PERSPECTIVE_DZ(z) z

View File

@ -144,7 +144,7 @@ $(OFILES_SOURCES) : $(HFILES)
@echo $(notdir $<)
@$(bin2o)
%.IMA.o %_IMA.h : %.IMA
%.AD4.o %_AD4.h : %.AD4
@echo $(notdir $<)
@$(bin2o)

View File

@ -0,0 +1,70 @@
#include "common_asm.inc"
// Clamping is only required if the encoder gives overflow warnings.
// To improve on speed, the music volume has been reduced to avoid this.
// When set to 1, adpcm4_decode saturates each output sample to -128..+127.
#define CLAMP_OUTPUT 0
// Unrolling saves 3.75 cycles per sample, but uses a lot more RAM.
// When set to 1, the 8 samples of each input word are decoded by straight-line
// code (.rept) instead of the counted inner loop.
#define UNROLL 0
// Arguments of sndADPCM4_fill_asm
state .req r0 // pointer to the {zM1, zM2, tap, quant} state words
buffer .req r1 // output pointer, post-incremented by one byte per sample
data .req r2 // input pointer, post-incremented by one 32-bit word per 8 samples
size .req r3 // remaining input bytes (its top bits also carry the inner loop counter)
// Decoder state, loaded from [state] on entry and written back on exit
zM1 .req r4
zM2 .req r5
tap .req r6
quant .req r7
// Working registers
n .req r8 // current input word, rotated by 4 bits per decoded sample
mask .req r9 // constant 127 (rounding term and clamp constant)
adapt .req r10 // not referenced by the code visible in this file
stepLUT .req r11 // ADPCM4_ADAPT+8, so a signed nibble indexes the table directly
temp .req r12 // scratch
// Decode one 4-bit sample from n and store one output byte at [buffer].
//   \zM1 : register holding the most recent reconstructed value
//   \zM2 : register holding the value before that; it receives the new value
// Because the new value overwrites \zM2, the two registers trade roles after
// each expansion, so consecutive invocations must alternate the argument order.
// Reads and updates tap, quant, n and buffer; clobbers temp (and the flags when
// CLAMP_OUTPUT is enabled). Expects mask = 127 and stepLUT = ADPCM4_ADAPT+8.
.macro adpcm4_decode zM1, zM2
sub tap, tap, tap, asr #3 // tap -= tap >> 3
add tap, tap, \zM2 // tap += zM2
mov temp, tap, asr #8 // output sample = tap >> 8
#if CLAMP_OUTPUT
teq temp, temp, lsl #32-8 // N is set when bit 7 disagrees with the real sign
eormi temp, mask, temp, asr #31 // then saturate: 127 for positive, -128 for negative
#endif
strb temp, [buffer], #1
mov n, n, ror #4 // rotate the next nibble into bits 28..31
mov temp, n, asr #32-4 // temp = sign-extended nibble (-8..+7)
sub \zM2, \zM1, \zM2 // prediction = zM1 - zM2
mla \zM2, quant, temp, \zM2 // new value = quant * nibble + prediction
// zM1 and zM2 now swapped
ldrb temp, [stepLUT, temp] // adaptation factor selected by the signed nibble
mla temp, quant, temp, mask // quant * factor + 127
mov quant, temp, lsr #7 // quant = (quant * factor + 127) >> 7
.endm
// void sndADPCM4_fill_asm(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size)
// Decodes size bytes of 4-bit ADPCM from data (two samples per byte, consumed
// one 32-bit word at a time) into 8-bit samples at buffer, then writes the
// updated decoder state back to [state].
// In:    r0 = state, r1 = buffer, r2 = data, r3 = size (in bytes)
// Saves: r4-r11 on the stack; r12 is used as scratch and not preserved.
// NOTE(review): the outer loop only exits when size reaches exactly zero after
// subtracting 4 per word, so size presumably has to be a non-zero multiple of 4.
// The non-unrolled inner counter lives in the top two bits of size, so size
// also has to stay below 1<<30. Confirm both against the callers.
.global sndADPCM4_fill_asm
sndADPCM4_fill_asm:
stmfd sp!, {r4-r11}
ldmia state, {zM1,zM2,tap,quant}
mov mask, #127
ldr stepLUT, =ADPCM4_ADAPT+8 // bias by 8 so nibble values -8..+7 index the table directly
.loop:
ldr n, [data], #4 // fetch the next 8 packed nibbles
#if UNROLL
.rept 8/2
#endif
1: adpcm4_decode zM1, zM2 // zM1 and zM2 get swapped...
adpcm4_decode zM2, zM1 // ... and swapped back
#if UNROLL
.endr
#else
// Adding 1<<30 carries out on the fourth pass (4 passes x 2 samples) and leaves
// size with its original value again.
adds size, size, #1<<(32-3+1) // Count up the 8 samples
bcc 1b
#endif
// 8 samples were decoded from 4 input bytes
subs size, #8/2 // size is provided as number of bytes
bne .loop
stmia state, {zM1,zM2,tap,quant}
ldmfd sp!, {r4-r11}
bx lr

View File

@ -1,67 +0,0 @@
#include "common_asm.inc"
// Register roles for sndIMA_fill_asm and the ima_decode macro.
// Arguments
state .req r0 // pointer to the {smp, idx} state words
buffer .req r1 // output pointer, post-incremented by one byte per sample
data .req r2 // input pointer, post-incremented by one byte per two samples
size .req r3 // remaining input bytes
// Decoder state and working registers
smp .req r4 // current sample accumulator
idx .req r5 // current index into the step table
stepLUT .req r6 // address of the step table
step .req r7
n .req r8 // current input byte
index .req r9 // low three bits of the nibble being decoded
mask .req r10 // constant 7 used to extract those bits
out .req r12
tmp .req out // shares r12 with out
diff .req step // shares r7 with step
// Upper clamp bound for idx (the last valid step table index)
IMA_STEP_SIZE = 88
// Decode one IMA-ADPCM nibble and store one output byte at [buffer].
// On entry index holds the low three bits of the nibble, and the Z flag holds
// the result of testing the nibble's bit 3 (Z clear means subtract).
// Updates smp, idx and buffer; clobbers step/diff, index, tmp/out and the flags.
.macro ima_decode
ldr step, [stepLUT, idx, lsl #2] // step = stepLUT[idx]
mul tmp, step, index // tmp = step * index (flags untouched)
add diff, tmp, lsl #1 // diff = step + 2 * step * index (diff aliases step)
subne smp, diff, lsr #3 // bit 3 set: smp -= diff >> 3
addeq smp, diff, lsr #3 // bit 3 clear: smp += diff >> 3
subs index, #3 // index - 3 decides how idx is adjusted
suble idx, #1 // index was 0..3: idx -= 1
addgt idx, index, lsl #1 // index was 4..7: idx += (index - 3) * 2
// clamp 0..88
bic idx, idx, asr #31 // a negative idx becomes 0
cmp idx, #IMA_STEP_SIZE
movgt idx, #IMA_STEP_SIZE
mov out, smp, asr #(2 + SND_VOL_SHIFT) // scale smp down for output (no saturation is applied here)
strb out, [buffer], #1
.endm
// void sndIMA_fill_asm(IMA_STATE &state, int8* buffer, const uint8* data, int32 size)
// Decodes size bytes of IMA-ADPCM from data (two samples per byte, low nibble
// first) into 8-bit samples at buffer, then writes {smp, idx} back to [state].
// In:    r0 = state, r1 = buffer, r2 = data, r3 = size (bytes, must be non-zero
//        because the loop tests size only after decoding a byte)
// Saves: r4-r10. mask lives in r10, which is callee-saved just like r4-r9, so
//        it has to be part of the save/restore list. r12 is scratch.
.global sndIMA_fill_asm
sndIMA_fill_asm:
    stmfd   sp!, {r4-r10}           // r10 (mask) included: it is written below
    ldmia   state, {smp, idx}
    ldr     stepLUT, =IMA_STEP
    mov     mask, #7
.loop:
    ldrb    n, [data], #1
    // low nibble: magnitude in index, sign (bit 3) in the Z flag
    and     index, mask, n
    tst     n, #8
    ima_decode
    // high nibble, same convention
    and     index, mask, n, lsr #4
    tst     n, #(8 << 4)
    ima_decode
    subs    size, #1
    bne     .loop
    stmia   state, {smp, idx}
    ldmfd   sp!, {r4-r10}
    bx      lr

View File

@ -8,15 +8,16 @@ volume .req r3
data .req r4
buffer .req r5
tmp .req r6
mask .req r7
last .req r12
tmpSP .req last
out .req size
.macro clamp
// Vanadium's clamp trick (-128..127)
mov tmp, out, asr #31 // tmp <- 0xffffffff
cmp tmp, out, asr #7 // not equal
eorne out, tmp, #0x7F // out <- 0xffffff80
// Aikku93's quick-and-dirty clamp (-128..+127)
// This only works for inputs of -256..+255
TEQ out, out, lsl #32-8 // If the sign of 8bit value does not match...
EORMI out, mask, out, asr #31 // ... then clip using the real sign
.endm
.macro calc_last
@ -72,8 +73,9 @@ sndPCM_fill_asm:
.global sndPCM_mix_asm
sndPCM_mix_asm:
mov tmpSP, sp
stmfd sp!, {r4-r6} // tmp reg required
stmfd sp!, {r4-r7} // tmp reg required
// mask is r7, a callee-saved register: load the clamp constant only after the
// caller's value has been pushed, otherwise the epilogue restores 127 instead.
mov mask, #127
ldmia tmpSP, {data, buffer}
@ -89,7 +91,7 @@ sndPCM_mix_asm:
cmp pos, last
blt .loop_mix
ldmfd sp!, {r4-r6}
ldmfd sp!, {r4-r7}
bx lr
.global sndClear_asm

Binary file not shown.

Binary file not shown.

View File

@ -6,7 +6,7 @@ EWRAM_DATA int32 fpsCounter = 0;
EWRAM_DATA uint32 curSoundBuffer = 0;
#ifdef __GBA_WIN__
const void* TRACKS_IMA;
const void* TRACKS_AD4;
const void* TITLE_SCR;
const void* levelData;
@ -233,9 +233,9 @@ const void* osLoadLevel(LevelID id)
}
// tracks
if (!TRACKS_IMA)
if (!TRACKS_AD4)
{
FILE *f = fopen("data/TRACKS.IMA", "rb");
FILE *f = fopen("data/TRACKS.AD4", "rb");
if (!f)
return NULL;
@ -246,7 +246,7 @@ const void* osLoadLevel(LevelID id)
fread(data, 1, size, f);
fclose(f);
TRACKS_IMA = data;
TRACKS_AD4 = data;
}
if (!TITLE_SCR)

View File

@ -0,0 +1,81 @@
#pragma once
#include <stdint.h>
//! Encoder state for the 4-bit ADPCM ("AD4") music format.
struct AD4State_t {
	int32_t zM1, zM2;        //! Last two reconstructed values (zM1 is the most recent)
	int32_t Tap;             //! Post-filter tap tracked by the encoder (updated with rounding)
	int32_t Quant;           //! Current quantizer step size
	int32_t Output;          //! Post-filtered output, used for level metering
	uint32_t MaxOutputLevel; //! Largest |Output| seen since AD4_Init()
};

//! Reset the encoder to its start-of-stream state.
//! The initial quantizer step (0x0800) must match the value the decoder starts with.
void AD4_Init(struct AD4State_t *State) {
	State->zM1    = 0;
	State->zM2    = 0;
	State->Tap    = 0;
	State->Quant  = 0x0800;
	State->Output = 0;
	State->MaxOutputLevel = 0;
}

//! Encode one frame of 8 samples.
//!  State: Encoder state, updated in place.
//!  Data:  8 signed 16-bit input samples.
//! Returns the 8 residues packed as 4-bit two's complement values, with
//! sample n stored in bits n*4 .. n*4+3.
uint32_t AD4_EncodeFrame(struct AD4State_t *State, const int16_t *Data) {
	static const uint8_t AdaptTable[] = {
		192,192,136,136,128,128,128,128, // -8..-1
		112,128,128,128,128,136,136,192, //  0..+7
	};

	int n;
	int32_t  zM1    = State->zM1;
	int32_t  zM2    = State->zM2;
	int32_t  Tap    = State->Tap;
	int32_t  Quant  = State->Quant;
	int32_t  Output = State->Output;
	uint32_t MaxOutputLevel = State->MaxOutputLevel;
	uint32_t FrameData = 0;
	for(n=0;n<8;n++) {
		//! Get input, compute prediction, and quantize residue
		//! Note that we minimize error of Output rather than Y, which implies
		//! applying the post-filter in the analysis equation to get the residue.
		int32_t X = Data[n];
		int32_t P = zM1 - zM2;
		int32_t R = X - (P + (Tap - (Tap >> 3)));
#if 0 //! Lower RMSE, but sounds noisier
		R = (2*R + ((R < 0) ? (-Quant) : (+Quant))) / (2*Quant); //! (R + Sign[R]*(Quant/2)) / Quant
#else
		R /= Quant;
#endif
		if(R < -8) R = -8;
		if(R > +7) R = +7;
		int32_t Y = P + R*Quant;

		//! Calculate output value and update maximum level
		//! Post-filter: Hpost(z) = 1 / Hpre(z) = 1 / (1 - (7/8)z^-1)
		//! The magnitude is formed in unsigned arithmetic so that even the
		//! most negative value cannot overflow on negation.
		Output = Y + Output - (Output >> 3);
		uint32_t Level = (Output < 0) ? (0u - (uint32_t)Output) : (uint32_t)Output;
		if(Level > MaxOutputLevel) MaxOutputLevel = Level;

		//! Do the same, but for the encoding tap. This is needed to
		//! avoid a limit oscillation on silence from round-off error.
		//! Technically, it does mean a different output, but it should
		//! be close enough to what we want that it shouldn't matter.
		Tap = Y + Tap - ((Tap + 4 - (Tap < 0)) >> 3); //! Y + Round[Tap*7/8]

		//! Update taps and push residue to frame
		//! The nibble is widened to unsigned before shifting: for n == 7 a
		//! negative residue would otherwise shift a 1 into the sign bit of int.
		zM2 = zM1;
		zM1 = Y;
		FrameData |= (uint32_t)(R & 0xF) << (n*4);

		//! Adapt quantizer
		//! Rounding up means that Quant can never collapse to 0
		Quant = (Quant * AdaptTable[R+8] + 127) >> 7;
	}
	State->zM1    = zM1;
	State->zM2    = zM2;
	State->Tap    = Tap;
	State->Quant  = Quant;
	State->Output = Output;
	State->MaxOutputLevel = MaxOutputLevel;
	return FrameData;
}

View File

@ -0,0 +1,60 @@
#ifndef __GNUC__
# warning "Compile with GCC-compatible compiler for endianness checking."
#endif
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "AD4.h"
//! Command-line encoder: ad4 Input.raw Output.ad4 [dBGain]
//! Reads mono signed PCM16, applies the optional gain (in dB), encodes it in
//! frames of 8 samples (the last frame is zero-padded) and writes one 32-bit
//! little-endian word per frame.
//! Returns 0 on success and 1 on a usage or I/O error.
int main(int argc, const char *argv[]) {
	if(argc < 3 || argc > 4) {
		printf(
			"Usage: ad4 Input.raw Output.ad4 [dBGain]\n"
			"Input.raw must be mono signed PCM16.\n"
			"Output will be aligned to 4 bytes.\n"
		);
		return 1;
	}

	FILE *InFile = fopen(argv[1], "rb");
	if(!InFile) {
		printf("Couldn't open input file.\n");
		return 1;
	}
	FILE *OutFile = fopen(argv[2], "wb");
	if(!OutFile) {
		printf("Couldn't open output file.\n");
		fclose(InFile);
		return 1;
	}

	double Volume = 1.0;
	if(argc >= 4) Volume = pow(10.0, atof(argv[3]) / 20.0);

	//! Determine the input length; ftell() reports failure as a negative
	//! value, which must not be turned into a huge sample count.
	long InSize = -1;
	if(fseek(InFile, 0, SEEK_END) == 0) InSize = ftell(InFile);
	rewind(InFile);
	if(InSize < 0) {
		printf("Couldn't determine input file size.\n");
		fclose(OutFile);
		fclose(InFile);
		return 1;
	}
	size_t nSamples = (size_t)InSize / sizeof(int16_t);

	int ExitCode = 0;
	struct AD4State_t AD4State; AD4_Init(&AD4State);
	size_t Frame, nFrames = (nSamples + 7) / 8;
	for(Frame=0;Frame<nFrames;Frame++) {
		int16_t Buffer[8]; //! 1 frame = 8 samples
		size_t n, nRead = fread(Buffer, sizeof(int16_t), 8, InFile);
		for(n=0;n<nRead;n++) {
			//! Saturate after applying the gain: converting an out-of-range
			//! double to int16_t is undefined behaviour.
			double Scaled = Buffer[n] * Volume;
			if(Scaled < -32768.0) Scaled = -32768.0;
			if(Scaled > +32767.0) Scaled = +32767.0;
			Buffer[n] = (int16_t)Scaled;
		}
		while(nRead < 8) Buffer[nRead++] = 0;

		uint32_t FrameData = AD4_EncodeFrame(&AD4State, Buffer);
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
		//! Target is actually intended to be bytes, so swap the
		//! endianness of the frame data on big-endian systems.
		//! The defined() checks keep compilers that lack these macros from
		//! evaluating the comparison as 0 == 0 and swapping unconditionally.
		FrameData = ((FrameData & 0x000000FF) << 24) |
		            ((FrameData & 0x0000FF00) <<  8) |
		            ((FrameData & 0x00FF0000) >>  8) |
		            ((FrameData & 0xFF000000) >> 24) ;
#endif
		if(fwrite(&FrameData, 1, sizeof(FrameData), OutFile) != sizeof(FrameData)) {
			printf("Couldn't write output file.\n");
			ExitCode = 1;
			break;
		}
	}

	if(ExitCode == 0) {
		printf("Maximum output level: %u", AD4State.MaxOutputLevel);
		if(AD4State.MaxOutputLevel < 32768) putchar('\n');
		else printf(" (overflow by %u)\n", AD4State.MaxOutputLevel - 32767);
	}

	//! fclose() flushes buffered data, so a failure here is a write error too
	if(fclose(OutFile) != 0 && ExitCode == 0) {
		printf("Couldn't write output file.\n");
		ExitCode = 1;
	}
	fclose(InFile);
	return ExitCode;
}

View File

@ -2189,7 +2189,7 @@ struct out_GBA
void convertTracks(FileStream &f, const char* from)
{
char buf[256];
sprintf(buf, "%s/*.ima", from);
sprintf(buf, "%s/*.ad4", from);
WIN32_FIND_DATA fd;
HANDLE h = FindFirstFile(buf, &fd);
@ -2209,9 +2209,10 @@ struct out_GBA
if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
{
const char* src = fd.cFileName;
const char* srcEnd = strrchr(src, '.');
char* dst = buf;
while (*src)
while (src < srcEnd)
{
if (*src >= '0' && *src <= '9')
{
@ -2237,12 +2238,11 @@ struct out_GBA
fseek(f, 0, SEEK_END);
int32 size = ftell(f);
fseek(f, 0, SEEK_SET);
tracks[index].data = new char[size + 4];
tracks[index].data = new char[size];
fread(tracks[index].data, 1, size, f);
fclose(f);
tracks[index].size = ALIGN(*((int32*)tracks[index].data + 2), 4) - 4;
tracks[index].size = size; // ad4 tool encodes 32-bit chunks, so no need to align
ASSERT(tracks[index].size % 4 == 0);
}
}
@ -2268,7 +2268,7 @@ struct out_GBA
{
if (tracks[i].size == 0)
continue;
f.write((uint8*)tracks[i].data + 16, tracks[i].size);
f.write(tracks[i].data, tracks[i].size);
delete[] tracks[i].data;
}
}
@ -2752,7 +2752,7 @@ struct out_GBA
// audio tracks
{
sprintf(buf, "%s/TRACKS.IMA", dir);
sprintf(buf, "%s/TRACKS.AD4", dir);
FileStream f(buf, true);
convertTracks(f, "tracks/conv_demo");
}

View File

@ -1,81 +1,65 @@
#include "common.h"
int32 IMA_STEP[] = { // IWRAM !
7, 8, 9, 10, 11, 12, 13, 14,
16, 17, 19, 21, 23, 25, 28, 31,
34, 37, 41, 45, 50, 55, 60, 66,
73, 80, 88, 97, 107, 118, 130, 143,
157, 173, 190, 209, 230, 253, 279, 307,
337, 371, 408, 449, 494, 544, 598, 658,
724, 796, 876, 963, 1060, 1166, 1282, 1411,
1552, 1707, 1878, 2066, 2272, 2499, 2749, 3024,
3327, 3660, 4026, 4428, 4871, 5358, 5894, 6484,
7132, 7845, 8630, 9493, 10442, 11487, 12635, 13899,
15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794,
32767
uint8_t ADPCM4_ADAPT[] = { // IWRAM !
192,192,136,136,128,128,128,128, // -8..-1
112,128,128,128,128,136,136,192, // 0..+7
};
#if defined(__GBA__) && defined(USE_ASM)
extern const uint8_t TRACKS_IMA[];
extern const uint8_t TRACKS_AD4[];
#else
extern const void* TRACKS_IMA;
extern const void* TRACKS_AD4;
#endif
int8 soundBuffer[2 * SND_SAMPLES + 32]; // 32 bytes of silence for DMA overrun while interrupt
#ifdef USE_ASM
#define sndIMA_fill sndIMA_fill_asm
#define sndPCM_fill sndPCM_fill_asm
#define sndPCM_mix sndPCM_mix_asm
#define sndClear sndClear_asm
#define sndADPCM4_fill sndADPCM4_fill_asm
#define sndPCM_fill sndPCM_fill_asm
#define sndPCM_mix sndPCM_mix_asm
#define sndClear sndClear_asm
extern "C" {
void sndClear_asm(int8* buffer);
void sndIMA_fill_asm(IMA_STATE &state, int8* buffer, const uint8* data, int32 size);
void sndADPCM4_fill_asm(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size);
int32 sndPCM_fill_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer);
int32 sndPCM_mix_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer);
}
#else
#define sndIMA_fill sndIMA_c
#define sndPCM_fill sndPCM_c
#define sndPCM_mix sndPCM_c
#define sndClear(b) dmaFill(b, SND_ENCODE(0), SND_SAMPLES * sizeof(b[0]))
#define sndADPCM4_fill sndADPCM4_c
#define sndPCM_fill sndPCM_c
#define sndPCM_mix sndPCM_c
#define sndClear(b) dmaFill(b, SND_ENCODE(0), SND_SAMPLES * sizeof(b[0]))
#define DECODE_IMA_4(n)\
step = IMA_STEP[idx];\
index = n & 7;\
step += index * step << 1;\
if (index < 4) {\
idx = X_MAX(idx - 1, 0);\
} else {\
idx = X_MIN(idx + ((index - 3) << 1), X_COUNT(IMA_STEP) - 1);\
}\
if (n & 8) {\
smp -= step >> 3;\
} else {\
smp += step >> 3;\
}\
amp = smp >> 8;\
*buffer++ = SND_ENCODE(X_CLAMP(amp, SND_MIN, SND_MAX));
// Decode one 4-bit ADPCM sample from the low nibble of n and append it to buffer.
// Reads and updates the enclosing function's locals: buffer, zM1, zM2, tap,
// quant, res and out.
// The body is wrapped in do/while(0) so that "DECODE_ADPCM4(n);" is a single
// statement, and the last line carries no line continuation so the source line
// that follows the macro is not absorbed into it.
#define DECODE_ADPCM4(n) do {\
    tap = zM2 + tap - (tap >> 3);\
    *buffer++ = SND_ENCODE(X_CLAMP(tap >> 8, SND_MIN, SND_MAX));\
    res = (((n) & 0xF) ^ 8) - 8;\
    out = res*quant + (zM1 - zM2);\
    zM2 = zM1;\
    zM1 = out;\
    quant = (quant*(int32)ADPCM4_ADAPT[res+8] + 127) >> 7;\
} while (0)
void sndIMA_c(IMA_STATE &state, int8* buffer, const uint8* data, int32 size)
void sndADPCM4_c(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size)
{
uint32 step, index;
int32 smp = state.smp;
int32 idx = state.idx;
int32 amp;
for (int32 i = 0; i < size; i++)
int32 zM1 = state.zM1;
int32 zM2 = state.zM2;
int32 tap = state.tap;
int32 quant = state.quant;
int32 res, out;
for (int32 i=0; i < size; i++)
{
uint32 n = *data++;
DECODE_IMA_4(n);
DECODE_ADPCM4(n);
n >>= 4;
DECODE_IMA_4(n);
DECODE_ADPCM4(n);
}
state.smp = smp;
state.idx = idx;
state.zM1 = zM1;
state.zM2 = zM2;
state.tap = tap;
state.quant = quant;
}
int32 sndPCM_c(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer)
@ -101,20 +85,20 @@ struct Music
const uint8* data;
int32 size;
int32 pos;
IMA_STATE state;
ADPCM4_STATE state;
void fill(int8* buffer)
{
int32 len = X_MIN(size - pos, SND_SAMPLES >> 1);
sndIMA_fill(state, buffer, data + pos, len);
sndADPCM4_fill(state, buffer, data + pos, len);
pos += len;
if (pos >= size)
{
data = NULL;
memset(buffer, 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0]));
memset(buffer + (len << 1), 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0]));
}
}
};
@ -232,17 +216,22 @@ void sndPlayTrack(int32 track)
int32 size;
};
const TrackInfo* info = (const TrackInfo*)TRACKS_IMA + track;
const TrackInfo* info = (const TrackInfo*)TRACKS_AD4 + track;
if (!info->size)
return;
music.data = (uint8*)TRACKS_IMA + info->offset;
// Clear music.data before setup, and write it after to ensure
// music.fill() has a consistent state at any point in time
music.data = NULL;
music.size = info->size;
music.pos = 0;
//music.volume = (1 << SND_VOL_SHIFT);
music.state.smp = 0;
music.state.idx = 0;
music.state.zM1 = 0;
music.state.zM2 = 0;
music.state.tap = 0;
music.state.quant = 0x0800;
music.data = (const uint8*)TRACKS_AD4 + info->offset;
}
void sndStopTrack()