GBA: Micro-optimization, and new ADPCM format for tracks (#452)

* GBA: Slightly faster clamping in sndPCM_mix
* GBA: Change IMA-ADPCM to new format, and fix a race condition when writing to music.data.
* GBA: Properly clear buffer on end-of-track
* GBA: Update packer for new ADPCM format
* GBA: Make music output clamp optional, and disable it by default, as the music has been built to avoid overflows.
* GBA: Add reference ADPCM encoder
* GBA: Re-encode music
* GBA: Improve ADPCM encoder: minimize RMSE of post-filtered output rather than pre-filtered
* GBA: Fix ADPCM issues with silence
2025-03-17 17:49:43 +01:00 · 2023-03-01 11:04:52 +11:00 · 2023-03-01 11:04:52 +11:00 · 4876cc8f0e
commit 4876cc8f0e
parent 2763743038
13 changed files with 289 additions and 146 deletions
--- a/src/fixed/common.cpp
+++ b/src/fixed/common.cpp
@ -30,7 +30,7 @@ EWRAM_DATA const char* const* STR = STR_EN;
 EWRAM_DATA ExtraInfoLara playersExtra[MAX_PLAYERS];

 #if defined(__GBA__)
-    #include "TRACKS_IMA.h"
+    #include "TRACKS_AD4.h"
    #include "TITLE_SCR.h"
    #include "TITLE_PKD.h"
    #include "GYM_PKD.h"
--- a/src/fixed/common.h
+++ b/src/fixed/common.h
@ -2142,6 +2142,14 @@ struct IMA_STATE
    int32 idx;
 };

+// Decoder state for the 4-bit ADPCM music format (currently only used for GBA).
+// Keep the field order: the GBA assembly decoder loads and stores the four
+// words in one go as {zM1, zM2, tap, quant}.
+struct ADPCM4_STATE
+{
+    int32 zM1;   // most recent reconstructed sample
+    int32 zM2;   // reconstructed sample before zM1
+    int32 tap;   // output accumulator: tap = zM2 + tap - (tap >> 3), emitted as tap >> 8
+    int32 quant; // current quantizer step, starts at 0x0800
+};
+
 #if defined(GAPI_GL1)
    #define PERSPECTIVE_DZ(z) z

--- a/src/platform/gba/Makefile
+++ b/src/platform/gba/Makefile
@ -144,7 +144,7 @@ $(OFILES_SOURCES) : $(HFILES)
 	@echo $(notdir $<)
 	@$(bin2o)

-%.IMA.o	%_IMA.h :	%.IMA
+%.AD4.o	%_AD4.h :	%.AD4
 	@echo $(notdir $<)
 	@$(bin2o)

--- a/src/platform/gba/asm/sndAD4.s
+++ b/src/platform/gba/asm/sndAD4.s
@ -0,0 +1,70 @@
+#include "common_asm.inc"
+
+// Clamping is only required if the encoder gives overflow warnings.
+// To improve on speed, the music volume has been reduced to avoid this.
+#define CLAMP_OUTPUT 0
+
+// Unrolling saves 3.75 cycles per sample, but uses a lot more RAM.
+#define UNROLL 0
+
+state   .req r0
+buffer  .req r1
+data    .req r2
+size    .req r3
+zM1     .req r4
+zM2     .req r5
+tap     .req r6
+quant   .req r7
+n       .req r8
+mask    .req r9
+adapt   .req r10
+stepLUT .req r11
+temp    .req r12
+
+// Decode one 4-bit residue from n and emit one 8-bit sample.
+// On entry the first argument holds the newest reconstructed sample and the
+// second the one before it. The new sample is written into the second
+// register, so the two roles are swapped on exit and callers must alternate
+// the argument order.
+.macro adpcm4_decode zM1, zM2
+    sub   tap, tap, tap, asr #3 // tap -= tap >> 3
+    add   tap, tap, \zM2 // tap += older reconstructed sample
+    mov   temp, tap, asr #8 // output sample = tap >> 8
+#if CLAMP_OUTPUT
+    teq   temp, temp, lsl #32-8 // N set if bit 7 disagrees with the sign bit
+    eormi temp, mask, temp, asr #31 // saturate: mask ^ 0 = 127, mask ^ -1 = -128
+#endif
+    strb  temp, [buffer], #1 // store the sample and advance the output pointer
+    mov   n, n, ror #4 // rotate the next nibble into bits 28..31
+    mov   temp, n, asr #32-4 // sign-extend it: residue in -8..+7
+    sub   \zM2, \zM1, \zM2 // prediction = newest - older
+    mla   \zM2, quant, temp, \zM2 // new sample = quant * residue + prediction
+    // zM1 and zM2 now swapped
+    ldrb  temp, [stepLUT, temp] // adaptation factor for this residue
+    mla   temp, quant, temp, mask // quant * factor + 127
+    mov   quant, temp, lsr #7 // quant = (quant * factor + 127) >> 7
+.endm
+
+// void sndADPCM4_fill_asm(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size)
+// Decodes size bytes of 4-bit ADPCM from data into 2 * size 8-bit samples in
+// buffer, reading the decoder state on entry and writing it back on exit.
+// The input is consumed one 32-bit word (8 samples) at a time and the outer
+// loop only ends when size reaches exactly zero, so size must be a non-zero
+// multiple of 4. With UNROLL disabled the top two bits of size double as the
+// inner loop counter, so size must also stay below 1 << 30.
+.global sndADPCM4_fill_asm
+sndADPCM4_fill_asm:
+    stmfd sp!, {r4-r11}
+
+    ldmia state, {zM1,zM2,tap,quant}
+    mov mask, #127 // rounding bias for the quant update, also the clamp constant
+    ldr stepLUT, =ADPCM4_ADAPT+8 // entry for residue 0, so a signed -8..+7 offset indexes directly
+
+.loop:
+    ldr   n, [data], #4 // fetch 8 nibbles; the lowest nibble is decoded first
+#if UNROLL
+.rept 8/2
+#endif
+1:  adpcm4_decode zM1, zM2 // zM1 and zM2 get swapped...
+    adpcm4_decode zM2, zM1 // ... and swapped back
+#if UNROLL
+.endr
+#else
+    adds  size, size, #1<<(32-3+1) // Count up the 8 samples
+    bcc   1b // carry out of bit 31 on the 4th pass (two samples per pass)
+#endif
+    subs  size, #8/2 // size is provided as number of bytes
+    bne   .loop
+
+    stmia state, {zM1,zM2,tap,quant}
+
+    ldmfd sp!, {r4-r11}
+    bx lr
--- a/src/platform/gba/asm/sndIMA.s
+++ b/src/platform/gba/asm/sndIMA.s
@ -1,67 +0,0 @@
-#include "common_asm.inc"
-
-state   .req r0
-buffer  .req r1
-data    .req r2
-size    .req r3
-smp     .req r4
-idx     .req r5
-stepLUT .req r6
-step    .req r7
-n       .req r8
-index   .req r9
-mask    .req r10
-out     .req r12
-tmp     .req out
-diff    .req step
-
-IMA_STEP_SIZE = 88
-
-.macro ima_decode
-    ldr step, [stepLUT, idx, lsl #2]
-
-    mul tmp, step, index
-    add diff, tmp, lsl #1
-
-    subne smp, diff, lsr #3
-    addeq smp, diff, lsr #3
-
-    subs index, #3
-    suble idx, #1
-    addgt idx, index, lsl #1
-
-    // clamp 0..88
-    bic idx, idx, asr #31
-    cmp idx, #IMA_STEP_SIZE
-    movgt idx, #IMA_STEP_SIZE
-
-    mov out, smp, asr #(2 + SND_VOL_SHIFT)
-    strb out, [buffer], #1
-.endm
-
-.global sndIMA_fill_asm
-sndIMA_fill_asm:
-    stmfd sp!, {r4-r9}
-
-    ldmia state, {smp, idx}
-    ldr stepLUT, =IMA_STEP
-
-    mov mask, #7
-.loop:
-    ldrb n, [data], #1
-
-    and index, mask, n
-    tst n, #8
-    ima_decode
-
-    and index, mask, n, lsr #4
-    tst n, #(8 << 4)
-    ima_decode
-
-    subs size, #1
-    bne .loop
-
-    stmia state, {smp, idx}
-
-    ldmfd sp!, {r4-r9}
-    bx lr
--- a/src/platform/gba/asm/sndPCM.s
+++ b/src/platform/gba/asm/sndPCM.s
@ -8,15 +8,16 @@ volume  .req r3
 data    .req r4
 buffer  .req r5
 tmp     .req r6
+mask    .req r7
 last    .req r12
 tmpSP   .req last
 out     .req size

 .macro clamp
-    // Vanadium's clamp trick (-128..127)
-    mov tmp, out, asr #31  // tmp <- 0xffffffff
-    cmp tmp, out, asr #7   // not equal
-    eorne out, tmp, #0x7F  // out <- 0xffffff80
+    // Aikku93's quick-and-dirty clamp (-128..+127)
+    // This only works for inputs of -256..+255
+    // Requires mask to hold 127: mask ^ 0 = 127, mask ^ -1 = -128
+    TEQ out, out, lsl #32-8       // If the sign of 8bit value does not match...
+    EORMI out, mask, out, asr #31 // ... then clip using the real sign
 .endm

 .macro calc_last
@ -72,8 +73,9 @@ sndPCM_fill_asm:

 .global sndPCM_mix_asm
 sndPCM_mix_asm:
    mov tmpSP, sp
-    stmfd sp!, {r4-r6} // tmp reg required
+    stmfd sp!, {r4-r7} // tmp reg required
+    // mask is r7, a callee-saved register: load the clamp constant only
+    // after the caller's value has been pushed, otherwise 127 is what
+    // gets saved and handed back to the caller on return.
+    mov mask, #127

    ldmia tmpSP, {data, buffer}

@ -89,7 +91,7 @@ sndPCM_mix_asm:
    cmp pos, last
    blt .loop_mix

-    ldmfd sp!, {r4-r6}
+    ldmfd sp!, {r4-r7}
    bx lr

 .global sndClear_asm
--- a/src/platform/gba/data/TRACKS.AD4
+++ b/src/platform/gba/data/TRACKS.AD4
--- a/src/platform/gba/data/TRACKS.IMA
+++ b/src/platform/gba/data/TRACKS.IMA
--- a/src/platform/gba/main.cpp
+++ b/src/platform/gba/main.cpp
@ -6,7 +6,7 @@ EWRAM_DATA int32 fpsCounter = 0;
 EWRAM_DATA uint32 curSoundBuffer = 0;

 #ifdef __GBA_WIN__
-const void* TRACKS_IMA;
+const void* TRACKS_AD4;
 const void* TITLE_SCR;
 const void* levelData;

@ -233,9 +233,9 @@ const void* osLoadLevel(LevelID id)
    }

 // tracks
-    if (!TRACKS_IMA)
+    if (!TRACKS_AD4)
    {
-        FILE *f = fopen("data/TRACKS.IMA", "rb");
+        FILE *f = fopen("data/TRACKS.AD4", "rb");
        if (!f)
            return NULL;

@ -246,7 +246,7 @@ const void* osLoadLevel(LevelID id)
        fread(data, 1, size, f);
        fclose(f);

-        TRACKS_IMA = data;
+        TRACKS_AD4 = data;
    }

    if (!TITLE_SCR)
--- a/src/platform/gba/packer/ad4/AD4.h
+++ b/src/platform/gba/packer/ad4/AD4.h
@ -0,0 +1,81 @@
+#pragma once
+
+#include <stdint.h>
+
+//! Encoder state carried from one frame to the next.
+struct AD4State_t {
+	int32_t  zM1;            //! Last reconstructed sample, Y[n-1]
+	int32_t  zM2;            //! Reconstructed sample before that, Y[n-2]
+	int32_t  Tap;            //! Encoder-side filter tap
+	int32_t  Quant;          //! Current quantizer step size
+	int32_t  Output;         //! Post-filtered output, tracked for level metering
+	uint32_t MaxOutputLevel; //! Largest absolute Output seen so far
+};
+
+//! Reset the encoder to its start-of-stream state.
+//! Quant starts at 0x0800, the same initial step the decoder uses.
+//! Declared static inline because this definition lives in a header; with
+//! external linkage, including it from two source files would fail to link.
+static inline void AD4_Init(struct AD4State_t *State) {
+	State->zM1    = 0;
+	State->zM2    = 0;
+	State->Tap    = 0;
+	State->Quant  = 0x0800;
+	State->Output = 0;
+	State->MaxOutputLevel = 0;
+}
+
+//! Encode one frame of 8 PCM16 samples into eight 4-bit residues.
+//! State is read on entry and updated on return; Data must point at 8 samples.
+//! Returns the residues packed into 32 bits, first sample in bits 0..3.
+//! Declared static inline because this definition lives in a header; with
+//! external linkage, including it from two source files would fail to link.
+static inline uint32_t AD4_EncodeFrame(struct AD4State_t *State, const int16_t *Data) {
+	//! Quantizer scale factors in 1/128 units, indexed by residue+8
+	static const uint8_t AdaptTable[] = {
+	    192,192,136,136,128,128,128,128, // -8..-1
+	    112,128,128,128,128,136,136,192, //  0..+7
+	};
+
+	uint8_t  n;
+	 int32_t zM1    = State->zM1;
+	 int32_t zM2    = State->zM2;
+	 int32_t Tap    = State->Tap;
+	 int32_t Quant  = State->Quant;
+	 int32_t Output = State->Output;
+	uint32_t MaxOutputLevel = State->MaxOutputLevel;
+	uint32_t FrameData = 0;
+	for(n=0;n<8;n++) {
+		//! Get input, compute prediction, and quantize residue
+		//! Note that we minimize error of Output rather than Y, which implies
+		//! applying the post-filter in the analysis equation to get the residue.
+		int32_t X = Data[n];
+		int32_t P = zM1 - zM2;
+		int32_t R = X - (P + (Tap - (Tap >> 3))); {
+#if 0 //! Lower RMSE, but sounds noisier
+			R = (2*R + ((R < 0) ? (-Quant) : (+Quant))) / (2*Quant); //! (R + Sign[R]*(Quant/2)) / Quant
+#else
+			R /= Quant;
+#endif
+			if(R < -8) R = -8;
+			if(R > +7) R = +7;
+		}
+		int32_t Y = P + R*Quant;
+
+		//! Calculate output value and update maximum level
+		//! Post-filter: Hpost(z) = 1 / Hpre(z) = 1 / (1 - (7/8)z^-1)
+		Output = Y + Output - (Output >> 3);
+		uint32_t Level = (uint32_t)((Output < 0) ? (-Output) : (+Output));
+		if(Level > MaxOutputLevel) MaxOutputLevel = Level;
+
+		//! Do the same, but for the encoding tap. This is needed to
+		//! avoid a limit oscillation on silence from round-off error.
+		//! Technically, it does mean a different output, but it should
+		//! be close enough to what we want that it shouldn't matter.
+		Tap = Y + Tap - ((Tap + 4 - (Tap < 0)) >> 3); //! Y + Round[Tap*7/8]
+
+		//! Update taps and push residue to frame
+		//! The nibble is widened to unsigned before shifting: for n == 7 a
+		//! signed shift of up to 15 << 28 would overflow int.
+		zM2 = zM1;
+		zM1 = Y;
+		FrameData |= (uint32_t)(R&0xF) << (n*4);
+
+		//! Adapt quantizer
+		//! Rounding up means that Quant can never collapse to 0
+		Quant = (Quant * AdaptTable[R+8] + 127) >> 7;
+	}
+	State->zM1    = zM1;
+	State->zM2    = zM2;
+	State->Tap    = Tap;
+	State->Quant  = Quant;
+	State->Output = Output;
+	State->MaxOutputLevel = MaxOutputLevel;
+	return FrameData;
+}
--- a/src/platform/gba/packer/ad4/ad4.c
+++ b/src/platform/gba/packer/ad4/ad4.c
@ -0,0 +1,60 @@
+#ifndef __GNUC__
+# warning "Compile with GCC-compatible compiler for endianness checking."
+#endif
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "AD4.h"
+
+//! Command-line front end: encodes a mono signed PCM16 file into AD4 frames.
+//! Usage: ad4 Input.raw Output.ad4 [dBGain]
+//! Returns 0 on success, 1 on bad usage or any I/O failure.
+int main(int argc, const char *argv[]) {
+	if(argc < 3 || argc > 4) {
+		printf(
+			"Usage: ad4 Input.raw Output.ad4 [dBGain]\n"
+			"Input.raw must be mono signed PCM16.\n"
+			"Output will be aligned to 4 bytes.\n"
+		);
+		return 1;
+	}
+
+	//! Everything is declared before the first goto so no jump skips an initializer
+	int    Result  = 1;
+	FILE  *InFile  = NULL;
+	FILE  *OutFile = NULL;
+	double Volume  = 1.0;
+	long   FileSize;
+	size_t nSamples, Frame, nFrames;
+	struct AD4State_t AD4State; AD4_Init(&AD4State);
+
+	InFile = fopen(argv[1], "rb");
+	if(!InFile) { printf("Couldn't open input file.\n"); goto Cleanup; }
+	OutFile = fopen(argv[2], "wb");
+	if(!OutFile) { printf("Couldn't open output file.\n"); goto Cleanup; }
+	if(argc >= 4) Volume = pow(10.0, atof(argv[3]) / 20.0);
+
+	if(fseek(InFile, 0, SEEK_END) != 0 || (FileSize = ftell(InFile)) < 0) {
+		printf("Couldn't get input file size.\n");
+		goto Cleanup;
+	}
+	nSamples = (size_t)FileSize / sizeof(int16_t);
+	rewind(InFile);
+
+	nFrames = (nSamples + 7) / 8;
+	for(Frame=0;Frame<nFrames;Frame++) {
+		int16_t Buffer[8]; //! 1 frame = 8 samples
+		size_t n, nRead = fread(Buffer, sizeof(int16_t), 8, InFile);
+		for(n=0;n<nRead;n++) {
+			//! Saturate before narrowing: converting an out-of-range
+			//! double to int16_t is undefined, and a positive gain
+			//! can push samples past the 16-bit range.
+			double Scaled = Buffer[n] * Volume;
+			if(Scaled < -32768.0) Scaled = -32768.0;
+			if(Scaled > +32767.0) Scaled = +32767.0;
+			Buffer[n] = (int16_t)Scaled;
+		}
+		while(nRead < 8) Buffer[nRead++] = 0; //! Pad the last frame with silence
+		uint32_t FrameData = AD4_EncodeFrame(&AD4State, Buffer);
+		//! Only swap when the compiler positively reports big-endian. A bare
+		//! comparison is also true when both macros are undefined (0 == 0),
+		//! which would byte-swap on every compiler lacking them.
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+		//! Target is actually intended to be bytes, so swap the
+		//! endianness of the frame data on big-endian systems
+		FrameData = ((FrameData & 0x000000FF) << 24) |
+			    ((FrameData & 0x0000FF00) <<  8) |
+			    ((FrameData & 0x00FF0000) >>  8) |
+			    ((FrameData & 0xFF000000) >> 24) ;
+#endif
+		if(fwrite(&FrameData, 1, sizeof(FrameData), OutFile) != sizeof(FrameData)) {
+			printf("Couldn't write output file.\n");
+			goto Cleanup;
+		}
+	}
+	printf("Maximum output level: %u", (unsigned)AD4State.MaxOutputLevel);
+	if(AD4State.MaxOutputLevel < 32768) putchar('\n');
+	else printf(" (overflow by %u)\n", (unsigned)(AD4State.MaxOutputLevel - 32767));
+	Result = 0;
+
+Cleanup:
+	//! fclose flushes buffered data, so a failure here means a truncated output
+	if(OutFile && fclose(OutFile) != 0) {
+		printf("Couldn't finish writing output file.\n");
+		Result = 1;
+	}
+	if(InFile) fclose(InFile);
+	return Result;
+}
--- a/src/platform/gba/packer/out_GBA.h
+++ b/src/platform/gba/packer/out_GBA.h
@ -2189,7 +2189,7 @@ struct out_GBA
    void convertTracks(FileStream &f, const char* from)
    {
        char buf[256];
-        sprintf(buf, "%s/*.ima", from);
+        sprintf(buf, "%s/*.ad4", from);

        WIN32_FIND_DATA fd;
        HANDLE h = FindFirstFile(buf, &fd);
@ -2209,9 +2209,10 @@ struct out_GBA
            if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
            {
                const char* src = fd.cFileName;
+                const char* srcEnd = strrchr(src, '.');
                char* dst = buf;

-                while (*src)
+                while (src < srcEnd)
                {
                    if (*src >= '0' && *src <= '9')
                    {
@ -2237,12 +2238,11 @@ struct out_GBA
                    fseek(f, 0, SEEK_END);
                    int32 size = ftell(f);
                    fseek(f, 0, SEEK_SET);
-                    tracks[index].data = new char[size + 4];
+                    tracks[index].data = new char[size];
                    fread(tracks[index].data, 1, size, f);
                    fclose(f);

-                    tracks[index].size = ALIGN(*((int32*)tracks[index].data + 2), 4) - 4;
-
+                    tracks[index].size = size; // ad4 tool encodes 32-bit chunks, so no need to align
                    ASSERT(tracks[index].size % 4 == 0);
                }
            }
@ -2268,7 +2268,7 @@ struct out_GBA
        {
            if (tracks[i].size == 0)
                continue;
-            f.write((uint8*)tracks[i].data + 16, tracks[i].size);
+            f.write(tracks[i].data, tracks[i].size);
            delete[] tracks[i].data;
        }
    }
@ -2752,7 +2752,7 @@ struct out_GBA

        // audio tracks
        {
-            sprintf(buf, "%s/TRACKS.IMA", dir);
+            sprintf(buf, "%s/TRACKS.AD4", dir);
            FileStream f(buf, true);
            convertTracks(f, "tracks/conv_demo");
        }
--- a/src/platform/gba/sound.cpp
+++ b/src/platform/gba/sound.cpp
@ -1,81 +1,65 @@
 #include "common.h"

-int32 IMA_STEP[] = { // IWRAM !
-    7,     8,     9,     10,    11,    12,    13,    14,
-    16,    17,    19,    21,    23,    25,    28,    31,
-    34,    37,    41,    45,    50,    55,    60,    66,
-    73,    80,    88,    97,    107,   118,   130,   143,
-    157,   173,   190,   209,   230,   253,   279,   307,
-    337,   371,   408,   449,   494,   544,   598,   658,
-    724,   796,   876,   963,   1060,  1166,  1282,  1411,
-    1552,  1707,  1878,  2066,  2272,  2499,  2749,  3024,
-    3327,  3660,  4026,  4428,  4871,  5358,  5894,  6484,
-    7132,  7845,  8630,  9493,  10442, 11487, 12635, 13899,
-    15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794,
-    32767
+// Quantizer adaptation factors in 1/128 units (128 leaves the step unchanged),
+// indexed by residue + 8. The asm decoder addresses the table as
+// ADPCM4_ADAPT+8 with a signed residue. The values mirror AdaptTable in
+// packer/ad4/AD4.h; encoder and decoder must use the same table.
+uint8_t ADPCM4_ADAPT[] = { // IWRAM !
+    192,192,136,136,128,128,128,128, // -8..-1
+    112,128,128,128,128,136,136,192, //  0..+7
 };

 #if defined(__GBA__) && defined(USE_ASM)
-    extern const uint8_t TRACKS_IMA[];
+    extern const uint8_t TRACKS_AD4[];
 #else
-    extern const void* TRACKS_IMA;
+    extern const void* TRACKS_AD4;
 #endif

 int8 soundBuffer[2 * SND_SAMPLES + 32]; // 32 bytes of silence for DMA overrun while interrupt

 #ifdef USE_ASM
-    #define sndIMA_fill sndIMA_fill_asm
-    #define sndPCM_fill sndPCM_fill_asm
-    #define sndPCM_mix  sndPCM_mix_asm
-    #define sndClear    sndClear_asm
+    #define sndADPCM4_fill sndADPCM4_fill_asm
+    #define sndPCM_fill    sndPCM_fill_asm
+    #define sndPCM_mix     sndPCM_mix_asm
+    #define sndClear       sndClear_asm

    extern "C" {
        void sndClear_asm(int8* buffer);
-        void sndIMA_fill_asm(IMA_STATE &state, int8* buffer, const uint8* data, int32 size);
+        void sndADPCM4_fill_asm(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size);
        int32 sndPCM_fill_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer);
        int32 sndPCM_mix_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer);
    }
 #else
-    #define sndIMA_fill sndIMA_c
-    #define sndPCM_fill sndPCM_c
-    #define sndPCM_mix  sndPCM_c
-    #define sndClear(b) dmaFill(b, SND_ENCODE(0), SND_SAMPLES * sizeof(b[0]))
+    #define sndADPCM4_fill sndADPCM4_c
+    #define sndPCM_fill    sndPCM_c
+    #define sndPCM_mix     sndPCM_c
+    #define sndClear(b)    dmaFill(b, SND_ENCODE(0), SND_SAMPLES * sizeof(b[0]))

-#define DECODE_IMA_4(n)\
-    step = IMA_STEP[idx];\
-    index = n & 7;\
-    step += index * step << 1;\
-    if (index < 4) {\
-        idx = X_MAX(idx - 1, 0);\
-    } else {\
-        idx = X_MIN(idx + ((index - 3) << 1), X_COUNT(IMA_STEP) - 1);\
-    }\
-    if (n & 8) {\
-        smp -= step >> 3;\
-    } else {\
-        smp += step >> 3;\
-    }\
-    amp = smp >> 8;\
-    *buffer++ = SND_ENCODE(X_CLAMP(amp, SND_MIN, SND_MAX));
+// Decode one sample from the low nibble of n and append it to buffer.
+// Expects the locals zM1, zM2, tap, quant, res, out and the output pointer
+// buffer to be in scope. Wrapped in do/while so it expands to one statement,
+// and the last line carries no continuation so the macro cannot absorb
+// whatever line follows it.
+#define DECODE_ADPCM4(n) do {\
+    tap = zM2 + tap - (tap >> 3);\
+    *buffer++ = SND_ENCODE(X_CLAMP(tap >> 8, SND_MIN, SND_MAX));\
+    res = (int32)(((n) & 0xF) ^ 8) - 8; /* sign-extend the 4-bit residue to -8..+7 */\
+    out = res*quant + (zM1 - zM2);\
+    zM2 = zM1;\
+    zM1 = out;\
+    quant = (quant*(int32)ADPCM4_ADAPT[res+8] + 127) >> 7;\
+} while (0)

-void sndIMA_c(IMA_STATE &state, int8* buffer, const uint8* data, int32 size)
+// Portable C decoder: turns size bytes of 4-bit ADPCM from data into
+// 2 * size samples in buffer, then writes the updated state back.
+void sndADPCM4_c(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size)
 {
-    uint32 step, index;
-
-    int32 smp = state.smp;
-    int32 idx = state.idx;
-    int32 amp;
-
-    for (int32 i = 0; i < size; i++)
+    int32 zM1   = state.zM1;
+    int32 zM2   = state.zM2;
+    int32 tap   = state.tap;
+    int32 quant = state.quant;
+    int32 res, out; // scratch variables written by DECODE_ADPCM4
+    
+    for (int32 i=0; i < size; i++)
    {
        uint32 n = *data++;
-        DECODE_IMA_4(n);
+        DECODE_ADPCM4(n); // low nibble first
        n >>= 4;
-        DECODE_IMA_4(n);
+        DECODE_ADPCM4(n); // then the high nibble
    }
-
-    state.smp = smp;
-    state.idx = idx;
+    
+    state.zM1   = zM1;
+    state.zM2   = zM2;
+    state.tap   = tap;
+    state.quant = quant;
 }

 int32 sndPCM_c(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer)
@ -101,20 +85,20 @@ struct Music
    const uint8*  data;
    int32         size;
    int32         pos;
-    IMA_STATE     state;
+    ADPCM4_STATE  state;

    void fill(int8* buffer)
    {
        int32 len = X_MIN(size - pos, SND_SAMPLES >> 1);

-        sndIMA_fill(state, buffer, data + pos, len);
+        sndADPCM4_fill(state, buffer, data + pos, len); // len counts input bytes; each byte yields two samples

        pos += len;

        if (pos >= size)
        {
            data = NULL;
-            memset(buffer, 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0]));
+            memset(buffer + (len << 1), 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0])); // zero only the tail after the 2 * len decoded samples
        }
    }
 };
@ -232,17 +216,22 @@ void sndPlayTrack(int32 track)
        int32 size;
    };
    
-    const TrackInfo* info = (const TrackInfo*)TRACKS_IMA + track;
+    const TrackInfo* info = (const TrackInfo*)TRACKS_AD4 + track;

    if (!info->size)
        return;

-    music.data = (uint8*)TRACKS_IMA + info->offset;
+    // Clear music.data before setup, and write it after to ensure
+    // music.fill() has a consistent state at any point in time
+    music.data = NULL;
    music.size = info->size;
    music.pos = 0;
    //music.volume = (1 << SND_VOL_SHIFT);
-    music.state.smp = 0;
-    music.state.idx = 0;
+    music.state.zM1   = 0;
+    music.state.zM2   = 0;
+    music.state.tap   = 0;
+    music.state.quant = 0x0800;
+    music.data = (const uint8*)TRACKS_AD4 + info->offset;
 }

 void sndStopTrack()