diff --git a/src/fixed/common.cpp b/src/fixed/common.cpp
index bf474ad..e8f5106 100644
--- a/src/fixed/common.cpp
+++ b/src/fixed/common.cpp
@@ -30,7 +30,7 @@ EWRAM_DATA const char* const* STR = STR_EN;
 EWRAM_DATA ExtraInfoLara playersExtra[MAX_PLAYERS];
 #if defined(__GBA__)
-    #include "TRACKS_IMA.h"
+    #include "TRACKS_AD4.h"
     #include "TITLE_SCR.h"
     #include "TITLE_PKD.h"
     #include "GYM_PKD.h"
diff --git a/src/fixed/common.h b/src/fixed/common.h
index 1cdeb75..a7d39f1 100644
--- a/src/fixed/common.h
+++ b/src/fixed/common.h
@@ -2142,6 +2142,14 @@ struct IMA_STATE
     int32 idx;
+// Decoder state for the 4-bit ADPCM music stream. Currently only used for GBA
+struct ADPCM4_STATE
+    int32 zM1, zM2; // the two most recent reconstructed values (zM1 is the newer one)
+    int32 tap; // running post-filter accumulator; the emitted sample is tap >> 8
+    int32 quant; // adaptive quantizer step, rescaled after every decoded nibble
 #if defined(GAPI_GL1)
     #define PERSPECTIVE_DZ(z) z
diff --git a/src/platform/gba/Makefile b/src/platform/gba/Makefile
index 01bc272..6e3f089 100644
--- a/src/platform/gba/Makefile
+++ b/src/platform/gba/Makefile
@@ -144,7 +144,7 @@ $(OFILES_SOURCES) : $(HFILES)
 	@echo $(notdir $<)
-%.IMA.o	%_IMA.h :	%.IMA
+%.AD4.o	%_AD4.h :	%.AD4
 	@echo $(notdir $<)
diff --git a/src/platform/gba/asm/sndAD4.s b/src/platform/gba/asm/sndAD4.s
new file mode 100644
index 0000000..cef5d54
--- /dev/null
+++ b/src/platform/gba/asm/sndAD4.s
@@ -0,0 +1,70 @@
+#include "common_asm.inc"
+// Clamping is only required if the encoder gives overflow warnings.
+// To improve on speed, the music volume has been reduced to avoid this.
+#define CLAMP_OUTPUT 0
+// Unrolling saves 3.75 cycles per sample, but uses a lot more RAM.
+#define UNROLL 0
+state   .req r0 // arg 0: address of the four state words {zM1, zM2, tap, quant}
+buffer  .req r1 // arg 1: output cursor, one byte is stored per decoded sample
+data    .req r2 // arg 2: input cursor, consumed one 32-bit word at a time
+size    .req r3 // arg 3: remaining input bytes; its top bits double as the inner loop counter
+zM1     .req r4 // newest reconstructed value
+zM2     .req r5 // second newest reconstructed value
+tap     .req r6 // post-filter accumulator
+quant   .req r7 // adaptive quantizer step
+n       .req r8 // current word of eight packed nibbles, rotated as they are consumed
+mask    .req r9 // constant 127: clamp constant and rounding bias
+adapt   .req r10 // NOTE(review): no instruction in the visible code references this alias
+stepLUT .req r11 // address of ADPCM4_ADAPT + 8
+temp    .req r12 // scratch
+.macro adpcm4_decode zM1, zM2 // emit one sample, then decode the next nibble of n; args are the newer/older history registers
+    sub   tap, tap, tap, asr #3 // tap -= tap >> 3
+    add   tap, tap, \zM2 // tap += older history value
+    mov   temp, tap, asr #8 // output sample = tap >> 8
+    teq   temp, temp, lsl #32-8 // N is set when bit 7 of temp disagrees with its sign bit
+    eormi temp, mask, temp, asr #31 // on mismatch: temp = 127 ^ (temp >> 31), i.e. +127 or -128
+    strb  temp, [buffer], #1 // store the low byte and advance the output cursor
+    mov   n, n, ror #4 // rotate the lowest nibble into the top four bits
+    mov   temp, n, asr #32-4 // sign-extend that nibble: temp = -8..+7
+    sub   \zM2, \zM1, \zM2 // prediction = newer - older
+    mla   \zM2, quant, temp, \zM2 // new value = quant * nibble + prediction, overwriting the older history register
+    // zM1 and zM2 now swapped
+    ldrb  temp, [stepLUT, temp] // adaptation factor for this nibble (stepLUT is biased by 8 entries)
+    mla   temp, quant, temp, mask // quant * factor + 127
+    mov   quant, temp, lsr #7 // quant = (quant * factor + 127) >> 7
+.global sndADPCM4_fill_asm // decodes size input bytes (two nibbles each) into output bytes at buffer, updating *state
+    stmfd sp!, {r4-r11} // save the callee-saved registers used below
+    ldmia state, {zM1,zM2,tap,quant} // load the four state words
+    mov mask, #127 // clamp constant and rounding bias
+    ldr stepLUT, =ADPCM4_ADAPT+8 // bias the table base by 8 so signed nibbles -8..+7 index it directly
+    ldr   n, [data], #4 // fetch one word (8 nibbles) and advance data; assumes data is word-aligned - TODO confirm
+.rept 8/2 // NOTE(review): the matching .endr is not visible in this view
+1:  adpcm4_decode zM1, zM2 // zM1 and zM2 get swapped...
+    adpcm4_decode zM2, zM1 // ... and swapped back
+    adds  size, size, #1<<(32-3+1) // Count up the 8 samples: adds 1<<30, so carry sets on the 4th pass of 2 samples
+    bcc   1b // repeat until the counter in the top two bits of size carries out
+    subs  size, #8/2 // size is provided as number of bytes; one word (4 bytes) was consumed
+    bne   .loop // continue until size is exactly zero (NOTE(review): the .loop label is not visible in this view)
+    stmia state, {zM1,zM2,tap,quant} // write the updated state back
+    ldmfd sp!, {r4-r11} // restore callee-saved registers
+    bx lr
diff --git a/src/platform/gba/asm/sndIMA.s b/src/platform/gba/asm/sndIMA.s
deleted file mode 100644
index e7f4348..0000000
--- a/src/platform/gba/asm/sndIMA.s
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "common_asm.inc"
-state   .req r0
-buffer  .req r1
-data    .req r2
-size    .req r3
-smp     .req r4
-idx     .req r5
-stepLUT .req r6
-step    .req r7
-n       .req r8
-index   .req r9
-mask    .req r10
-out     .req r12
-tmp     .req out
-diff    .req step
-.macro ima_decode
-    ldr step, [stepLUT, idx, lsl #2]
-    mul tmp, step, index
-    add diff, tmp, lsl #1
-    subne smp, diff, lsr #3
-    addeq smp, diff, lsr #3
-    subs index, #3
-    suble idx, #1
-    addgt idx, index, lsl #1
-    // clamp 0..88
-    bic idx, idx, asr #31
-    cmp idx, #IMA_STEP_SIZE
-    movgt idx, #IMA_STEP_SIZE
-    mov out, smp, asr #(2 + SND_VOL_SHIFT)
-    strb out, [buffer], #1
-.global sndIMA_fill_asm
-    stmfd sp!, {r4-r9}
-    ldmia state, {smp, idx}
-    ldr stepLUT, =IMA_STEP
-    mov mask, #7
-    ldrb n, [data], #1
-    and index, mask, n
-    tst n, #8
-    ima_decode
-    and index, mask, n, lsr #4
-    tst n, #(8 << 4)
-    ima_decode
-    subs size, #1
-    bne .loop
-    stmia state, {smp, idx}
-    ldmfd sp!, {r4-r9}
-    bx lr
diff --git a/src/platform/gba/asm/sndPCM.s b/src/platform/gba/asm/sndPCM.s
index 2244105..247e5ae 100644
--- a/src/platform/gba/asm/sndPCM.s
+++ b/src/platform/gba/asm/sndPCM.s
@@ -8,15 +8,16 @@ volume  .req r3
 data    .req r4
 buffer  .req r5
 tmp     .req r6
+mask    .req r7
 last    .req r12
 tmpSP   .req last
 out     .req size
 .macro clamp
-    // Vanadium's clamp trick (-128..127)
-    mov tmp, out, asr #31  // tmp <- 0xffffffff
-    cmp tmp, out, asr #7   // not equal
-    eorne out, tmp, #0x7F  // out <- 0xffffff80
+    // Aikku93's quick-and-dirty clamp (-128..+127)
+    // This only works for inputs of -256..+255, and needs mask == 127 (NOTE(review): every user of this macro must load mask first)
+    TEQ out, out, lsl #32-8       // If the sign of 8bit value does not match...
+    EORMI out, mask, out, asr #31 // ... then clip using the real sign: out = mask ^ (out >> 31), i.e. +127 or -128
 .macro calc_last
@@ -72,8 +73,9 @@ sndPCM_fill_asm:
 .global sndPCM_mix_asm
     mov tmpSP, sp
-    stmfd sp!, {r4-r6} // tmp reg required
+    stmfd sp!, {r4-r7} // tmp reg required, plus r7 for the clamp mask
+    mov mask, #127 // load only after r7 has been saved, so the epilogue restores the caller's r7
     ldmia tmpSP, {data, buffer}
@@ -89,7 +91,7 @@ sndPCM_mix_asm:
     cmp pos, last
     blt .loop_mix
-    ldmfd sp!, {r4-r6}
+    ldmfd sp!, {r4-r7}
     bx lr
 .global sndClear_asm
diff --git a/src/platform/gba/data/TRACKS.AD4 b/src/platform/gba/data/TRACKS.AD4
new file mode 100644
index 0000000..cf1d896
Binary files /dev/null and b/src/platform/gba/data/TRACKS.AD4 differ
diff --git a/src/platform/gba/data/TRACKS.IMA b/src/platform/gba/data/TRACKS.IMA
deleted file mode 100644
index cafca2d..0000000
Binary files a/src/platform/gba/data/TRACKS.IMA and /dev/null differ
diff --git a/src/platform/gba/main.cpp b/src/platform/gba/main.cpp
index 301948c..e5b55d8 100644
--- a/src/platform/gba/main.cpp
+++ b/src/platform/gba/main.cpp
@@ -6,7 +6,7 @@ EWRAM_DATA int32 fpsCounter = 0;
 EWRAM_DATA uint32 curSoundBuffer = 0;
 #ifdef __GBA_WIN__
-const void* TRACKS_IMA;
+const void* TRACKS_AD4;
 const void* TITLE_SCR;
 const void* levelData;
@@ -233,9 +233,9 @@ const void* osLoadLevel(LevelID id)
 // tracks
-    if (!TRACKS_IMA)
+    if (!TRACKS_AD4)
-        FILE *f = fopen("data/TRACKS.IMA", "rb");
+        FILE *f = fopen("data/TRACKS.AD4", "rb");
         if (!f)
             return NULL;
@@ -246,7 +246,7 @@ const void* osLoadLevel(LevelID id)
         fread(data, 1, size, f);
-        TRACKS_IMA = data;
+        TRACKS_AD4 = data;
     if (!TITLE_SCR)
diff --git a/src/platform/gba/packer/ad4/AD4.h b/src/platform/gba/packer/ad4/AD4.h
new file mode 100644
index 0000000..45a5d52
--- /dev/null
+++ b/src/platform/gba/packer/ad4/AD4.h
@@ -0,0 +1,81 @@
+#pragma once
+#include <stdint.h>
+struct AD4State_t { //! Encoder state carried from one 8-sample frame to the next
+	 int32_t zM1, zM2; //! Previous two reconstructed values Y (zM1 is the newer one)
+	 int32_t Tap; //! Encoder-side post-filter tap, updated with rounding
+	 int32_t Quant; //! Current quantizer step
+	 int32_t Output; //! Post-filtered output, tracked to measure the peak level
+	uint32_t MaxOutputLevel; //! Largest |Output| seen so far
+void AD4_Init(struct AD4State_t *State) { //! Reset the encoder state before the first frame
+	State->zM1    = 0;
+	State->zM2    = 0;
+	State->Tap    = 0;
+	State->Quant  = 0x0800; //! Initial quantizer step; the GBA player resets its decoder quant to the same value
+	State->Output = 0;
+	State->MaxOutputLevel = 0;
+uint32_t AD4_EncodeFrame(struct AD4State_t *State, const int16_t *Data) { //! Encodes Data[0..7] into one 32-bit frame (sample n's 4-bit residue in bits 4n..4n+3) and updates *State
+	static const uint8_t AdaptTable[] = { //! Quantizer scale factors in 1/128 units, indexed by residue+8
+	    192,192,136,136,128,128,128,128, // -8..-1
+	    112,128,128,128,128,136,136,192, //  0..+7
+	};
+	uint8_t  n;
+	 int32_t zM1    = State->zM1;
+	 int32_t zM2    = State->zM2;
+	 int32_t Tap    = State->Tap;
+	 int32_t Quant  = State->Quant;
+	 int32_t Output = State->Output;
+	uint32_t MaxOutputLevel = State->MaxOutputLevel;
+	uint32_t FrameData = 0;
+	for(n=0;n<8;n++) {
+		//! Get input, compute prediction, and quantize residue
+		//! Note that we minimize error of Output rather than Y, which implies
+		//! applying the post-filter in the analysis equation to get the residue.
+		int32_t X = Data[n];
+		int32_t P = zM1 - zM2;
+		int32_t R = X - (P + (Tap - (Tap >> 3))); {
+#if 0 //! Lower RMSE, but sounds noisier
+			R = (2*R + ((R < 0) ? (-Quant) : (+Quant))) / (2*Quant); //! (R + Sign[R]*(Quant/2)) / Quant
+			R /= Quant;
+			if(R < -8) R = -8;
+			if(R > +7) R = +7;
+		}
+		int32_t Y = P + R*Quant;
+		//! Calculate output value and update maximum level
+		//! Post-filter: Hpost(z) = 1 / Hpre(z) = 1 / (1 - (7/8)z^-1)
+		Output = Y + Output - (Output >> 3);
+		uint32_t Level = (uint32_t)((Output < 0) ? (-Output) : (+Output));
+		if(Level > MaxOutputLevel) MaxOutputLevel = Level;
+		//! Do the same, but for the encoding tap. This is needed to
+		//! avoid a limit oscillation on silence from round-off error.
+		//! Technically, it does mean a different output, but it should
+		//! be close enough to what we want that it shouldn't matter.
+		Tap = Y + Tap - ((Tap + 4 - (Tap < 0)) >> 3); //! Y + Round[Tap*7/8]
+		//! Update taps and push residue to frame
+		zM2 = zM1;
+		zM1 = Y;
+		FrameData |= (uint32_t)(R&0xF) << (n*4); //! Shift as unsigned: a nibble >= 8 moved into bits 28..31 does not fit in int
+		//! Adapt quantizer
+		//! Rounding up means that Quant can never collapse to 0
+		Quant = (Quant * AdaptTable[R+8] + 127) >> 7;
+	}
+	State->zM1    = zM1;
+	State->zM2    = zM2;
+	State->Tap    = Tap;
+	State->Quant  = Quant;
+	State->Output = Output;
+	State->MaxOutputLevel = MaxOutputLevel;
+	return FrameData;
diff --git a/src/platform/gba/packer/ad4/ad4.c b/src/platform/gba/packer/ad4/ad4.c
new file mode 100644
index 0000000..1516ec2
--- /dev/null
+++ b/src/platform/gba/packer/ad4/ad4.c
@@ -0,0 +1,61 @@
+#ifndef __GNUC__
+# warning "Compile with GCC-compatible compiler for endianness checking."
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "AD4.h"
+int main(int argc, const char *argv[]) { //! Encodes a mono PCM16 .raw file into AD4 frames (8 samples -> 32 bits), with an optional gain in dB
+	if(argc < 3 || argc > 4) {
+		printf(
+			"Usage: ad4 Input.raw Output.ad4 [dBGain]\n"
+			"Input.raw must be mono signed PCM16.\n"
+			"Output will be aligned to 4 bytes.\n"
+		);
+		return 1;
+	}
+	FILE *InFile = fopen(argv[1], "rb");
+	if(!InFile) { printf("Couldn't open input file.\n"); goto Error_InFile; }
+	FILE *OutFile = fopen(argv[2], "wb");
+	if(!OutFile) { printf("Couldn't open output file.\n"); goto Error_OutFile; }
+	double Volume = 1.0;
+	if(argc >= 4) Volume = pow(10.0, atof(argv[3]) / 20.0); //! dB -> linear amplitude factor
+	size_t nSamples; {
+		fseek(InFile, 0, SEEK_END);
+		long nBytes = ftell(InFile); nSamples = (nBytes > 0) ? ((size_t)nBytes / sizeof(int16_t)) : 0; //! ftell() returns -1 on failure; treat that as an empty input
+		rewind(InFile);
+	}
+	struct AD4State_t AD4State; AD4_Init(&AD4State);
+	size_t Frame, nFrames = (nSamples + 7) / 8; //! Round up so a trailing partial frame is still emitted
+	for(Frame=0;Frame<nFrames;Frame++) {
+		int16_t Buffer[8]; //! 1 frame = 8 samples
+		size_t n, nRead = fread(Buffer, sizeof(int16_t), 8, InFile);
+		for(n=0;n<nRead;n++) { double v = Buffer[n] * Volume; Buffer[n] = (int16_t)((v > 32767.0) ? 32767.0 : (v < -32768.0) ? -32768.0 : v); } //! Saturate: converting an out-of-range double to int16_t is undefined
+		while(nRead < 8) Buffer[nRead++] = 0; //! Zero-pad a short final frame
+		uint32_t FrameData = AD4_EncodeFrame(&AD4State, Buffer);
+		//! Target is actually intended to be bytes, so swap the
+		//! endianness of the frame data on big-endian systems
+		FrameData = ((FrameData & 0x000000FF) << 24) |
+			    ((FrameData & 0x0000FF00) <<  8) |
+			    ((FrameData & 0x00FF0000) >>  8) |
+			    ((FrameData & 0xFF000000) >> 24) ;
+		fwrite(&FrameData, 1, sizeof(FrameData), OutFile);
+	}
+	printf("Maximum output level: %u", AD4State.MaxOutputLevel);
+	if(AD4State.MaxOutputLevel < 32768) putchar('\n');
+	else printf(" (overflow by %u)\n", AD4State.MaxOutputLevel - 32767);
+	fclose(OutFile);
+	fclose(InFile);
+	return 0;
diff --git a/src/platform/gba/packer/out_GBA.h b/src/platform/gba/packer/out_GBA.h
index d5e8f19..cc7b649 100644
--- a/src/platform/gba/packer/out_GBA.h
+++ b/src/platform/gba/packer/out_GBA.h
@@ -2189,7 +2189,7 @@ struct out_GBA
     void convertTracks(FileStream &f, const char* from)
         char buf[256];
-        sprintf(buf, "%s/*.ima", from);
+        sprintf(buf, "%s/*.ad4", from);
         WIN32_FIND_DATA fd;
         HANDLE h = FindFirstFile(buf, &fd);
@@ -2209,9 +2209,10 @@ struct out_GBA
             if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY))
                 const char* src = fd.cFileName;
+                const char* srcEnd = strrchr(src, '.');
                 char* dst = buf;
-                while (*src)
+                while (src < srcEnd)
                     if (*src >= '0' && *src <= '9')
@@ -2237,12 +2238,11 @@ struct out_GBA
                     fseek(f, 0, SEEK_END);
                     int32 size = ftell(f);
                     fseek(f, 0, SEEK_SET);
-                    tracks[index].data = new char[size + 4];
+                    tracks[index].data = new char[size];
                     fread(tracks[index].data, 1, size, f);
-                    tracks[index].size = ALIGN(*((int32*)tracks[index].data + 2), 4) - 4;
+                    tracks[index].size = size; // ad4 tool encodes 32-bit chunks, so no need to align
                     ASSERT(tracks[index].size % 4 == 0);
@@ -2268,7 +2268,7 @@ struct out_GBA
             if (tracks[i].size == 0)
-            f.write((uint8*)tracks[i].data + 16, tracks[i].size);
+            f.write(tracks[i].data, tracks[i].size);
             delete[] tracks[i].data;
@@ -2752,7 +2752,7 @@ struct out_GBA
         // audio tracks
-            sprintf(buf, "%s/TRACKS.IMA", dir);
+            sprintf(buf, "%s/TRACKS.AD4", dir);
             FileStream f(buf, true);
             convertTracks(f, "tracks/conv_demo");
diff --git a/src/platform/gba/sound.cpp b/src/platform/gba/sound.cpp
index 277c349..802ed24 100644
--- a/src/platform/gba/sound.cpp
+++ b/src/platform/gba/sound.cpp
@@ -1,81 +1,65 @@
 #include "common.h"
-int32 IMA_STEP[] = { // IWRAM !
-    7,     8,     9,     10,    11,    12,    13,    14,
-    16,    17,    19,    21,    23,    25,    28,    31,
-    34,    37,    41,    45,    50,    55,    60,    66,
-    73,    80,    88,    97,    107,   118,   130,   143,
-    157,   173,   190,   209,   230,   253,   279,   307,
-    337,   371,   408,   449,   494,   544,   598,   658,
-    724,   796,   876,   963,   1060,  1166,  1282,  1411,
-    1552,  1707,  1878,  2066,  2272,  2499,  2749,  3024,
-    3327,  3660,  4026,  4428,  4871,  5358,  5894,  6484,
-    7132,  7845,  8630,  9493,  10442, 11487, 12635, 13899,
-    15289, 16818, 18500, 20350, 22385, 24623, 27086, 29794,
-    32767
+uint8_t ADPCM4_ADAPT[] = { // IWRAM ! Quantizer scale factors in 1/128 units, indexed by residue + 8
+    192,192,136,136,128,128,128,128, // -8..-1
+    112,128,128,128,128,136,136,192, //  0..+7
 #if defined(__GBA__) && defined(USE_ASM)
-    extern const uint8_t TRACKS_IMA[];
+    extern const uint8_t TRACKS_AD4[];
-    extern const void* TRACKS_IMA;
+    extern const void* TRACKS_AD4;
 int8 soundBuffer[2 * SND_SAMPLES + 32]; // 32 bytes of silence for DMA overrun while interrupt
 #ifdef USE_ASM
-    #define sndIMA_fill sndIMA_fill_asm
-    #define sndPCM_fill sndPCM_fill_asm
-    #define sndPCM_mix  sndPCM_mix_asm
-    #define sndClear    sndClear_asm
+    #define sndADPCM4_fill sndADPCM4_fill_asm
+    #define sndPCM_fill    sndPCM_fill_asm
+    #define sndPCM_mix     sndPCM_mix_asm
+    #define sndClear       sndClear_asm
     extern "C" {
         void sndClear_asm(int8* buffer);
-        void sndIMA_fill_asm(IMA_STATE &state, int8* buffer, const uint8* data, int32 size);
+        void sndADPCM4_fill_asm(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size);
         int32 sndPCM_fill_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer);
         int32 sndPCM_mix_asm(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer);
-    #define sndIMA_fill sndIMA_c
-    #define sndPCM_fill sndPCM_c
-    #define sndPCM_mix  sndPCM_c
-    #define sndClear(b) dmaFill(b, SND_ENCODE(0), SND_SAMPLES * sizeof(b[0]))
+    #define sndADPCM4_fill sndADPCM4_c
+    #define sndPCM_fill    sndPCM_c
+    #define sndPCM_mix     sndPCM_c
+    #define sndClear(b)    dmaFill(b, SND_ENCODE(0), SND_SAMPLES * sizeof(b[0]))
-#define DECODE_IMA_4(n)\
-    step = IMA_STEP[idx];\
-    index = n & 7;\
-    step += index * step << 1;\
-    if (index < 4) {\
-        idx = X_MAX(idx - 1, 0);\
-    } else {\
-        idx = X_MIN(idx + ((index - 3) << 1), X_COUNT(IMA_STEP) - 1);\
-    }\
-    if (n & 8) {\
-        smp -= step >> 3;\
-    } else {\
-        smp += step >> 3;\
-    }\
-    amp = smp >> 8;\
-    *buffer++ = SND_ENCODE(X_CLAMP(amp, SND_MIN, SND_MAX));
+#define DECODE_ADPCM4(n) /* emits one sample, then decodes the low nibble of n into the state */\
+    tap = zM2 + tap - (tap >> 3); /* post-filter: add older history value, decay tap by 1/8 */\
+    *buffer++ = SND_ENCODE(X_CLAMP(tap >> 8, SND_MIN, SND_MAX)); /* output sample is tap >> 8, clamped */\
+    res = ((n&0xF) ^ 8) - 8; /* sign-extend the 4-bit residue to -8..+7 */\
+    out = res*quant + (zM1 - zM2); /* dequantized residue plus prediction */\
+    zM2 = zM1;\
+    zM1 = out;\
+    quant = (quant*(int32)ADPCM4_ADAPT[res+8] + 127) >> 7; /* rescale the step, rounding up */\
-void sndIMA_c(IMA_STATE &state, int8* buffer, const uint8* data, int32 size)
+void sndADPCM4_c(ADPCM4_STATE &state, int8* buffer, const uint8* data, int32 size) // C decoder: size input bytes -> 2 * size output samples
-    uint32 step, index;
-    int32 smp = state.smp;
-    int32 idx = state.idx;
-    int32 amp;
-    for (int32 i = 0; i < size; i++)
+    int32 zM1   = state.zM1; // work on local copies of the state
+    int32 zM2   = state.zM2;
+    int32 tap   = state.tap;
+    int32 quant = state.quant;
+    int32 res, out; // scratch variables used by DECODE_ADPCM4
+    for (int32 i=0; i < size; i++) // one input byte per iteration
         uint32 n = *data++;
-        DECODE_IMA_4(n);
+        DECODE_ADPCM4(n); // low nibble first
         n >>= 4;
-        DECODE_IMA_4(n);
+        DECODE_ADPCM4(n); // then the high nibble
-    state.smp = smp;
-    state.idx = idx;
+    state.zM1   = zM1; // write the updated state back for the next call
+    state.zM2   = zM2;
+    state.tap   = tap;
+    state.quant = quant;
 int32 sndPCM_c(int32 pos, int32 inc, int32 size, int32 volume, const uint8* data, int8* buffer)
@@ -101,20 +85,20 @@ struct Music
     const uint8*  data;
     int32         size;
     int32         pos;
-    IMA_STATE     state;
+    ADPCM4_STATE  state;
     void fill(int8* buffer)
         int32 len = X_MIN(size - pos, SND_SAMPLES >> 1);
-        sndIMA_fill(state, buffer, data + pos, len);
+        sndADPCM4_fill(state, buffer, data + pos, len);
         pos += len;
         if (pos >= size)
             data = NULL;
-            memset(buffer, 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0]));
+            memset(buffer + (len << 1), 0, (SND_SAMPLES - (len << 1)) * sizeof(buffer[0]));
@@ -232,17 +216,22 @@ void sndPlayTrack(int32 track)
         int32 size;
-    const TrackInfo* info = (const TrackInfo*)TRACKS_IMA + track;
+    const TrackInfo* info = (const TrackInfo*)TRACKS_AD4 + track;
     if (!info->size)
-    music.data = (uint8*)TRACKS_IMA + info->offset;
+    // Clear music.data before setup, and write it after to ensure
+    // music.fill() has a consistent state at any point in time
+    music.data = NULL;
     music.size = info->size;
     music.pos = 0;
     //music.volume = (1 << SND_VOL_SHIFT);
-    music.state.smp = 0;
-    music.state.idx = 0;
+    music.state.zM1   = 0;
+    music.state.zM2   = 0;
+    music.state.tap   = 0;
+    music.state.quant = 0x0800;
+    music.data = (const uint8*)TRACKS_AD4 + info->offset;
 void sndStopTrack()