/**
 * Bruteforcing decoder for converting ADPCM-encoded AIFC into AIFF, in a way
 * that roundtrips with vadpcm_enc.
 */
#include <unistd.h>
#include <assert.h>
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>

typedef signed char s8;
typedef short s16;
typedef int s32;
typedef unsigned char u8;
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long long u64;
typedef float f32;

#define bswap16(x) __builtin_bswap16(x)
#define bswap32(x) __builtin_bswap32(x)
#define BSWAP16(x) x = __builtin_bswap16(x)
#define BSWAP32(x) x = __builtin_bswap32(x)
#define BSWAP16_MANY(x, n) for (s32 _i = 0; _i < n; _i++) BSWAP16((x)[_i])

#define NORETURN __attribute__((noreturn))
#define UNUSED __attribute__((unused))

typedef struct {
    u32 ckID;
    u32 ckSize;
} ChunkHeader;

typedef struct {
    u32 ckID;
    u32 ckSize;
    u32 formType;
} Chunk;

typedef struct {
    s16 numChannels;
    u16 numFramesH;
    u16 numFramesL;
    s16 sampleSize;
    s16 sampleRate[5]; // 80-bit float
    u16 compressionTypeH;
    u16 compressionTypeL;
} CommonChunk;

typedef struct {
    s16 MarkerID;
    u16 positionH;
    u16 positionL;
} Marker;

typedef struct {
    s16 playMode;
    s16 beginLoop;
    s16 endLoop;
} Loop;

typedef struct {
    s8 baseNote;
    s8 detune;
    s8 lowNote;
    s8 highNote;
    s8 lowVelocity;
    s8 highVelocity;
    s16 gain;
    Loop sustainLoop;
    Loop releaseLoop;
} InstrumentChunk;

typedef struct {
    s32 offset;
    s32 blockSize;
} SoundDataChunk;

typedef struct {
    s16 version;
    s16 order;
    s16 nEntries;
} CodeChunk;

typedef struct
{
    u32 start;
    u32 end;
    u32 count;
    s16 state[16];
} ALADPCMloop;


static char usage[] = "input.aifc output.aiff";
static const char *progname, *infilename;

#define checked_fread(a, b, c, d) if (fread(a, b, c, d) != c) fail_parse("error parsing file")

NORETURN
void fail_parse(const char *fmt, ...)
{
    char *formatted = NULL;
    va_list ap;
    va_start(ap, fmt);
    int size = vsnprintf(NULL, 0, fmt, ap);
    va_end(ap);
    if (size >= 0) {
        size++;
        formatted = malloc(size);
        if (formatted != NULL) {
            va_start(ap, fmt);
            size = vsnprintf(formatted, size, fmt, ap);
            va_end(ap);
            if (size < 0) {
                free(formatted);
                formatted = NULL;
            }
        }
    }

    if (formatted != NULL) {
        fprintf(stderr, "%s: %s [%s]\n", progname, formatted, infilename);
        free(formatted);
    }
    exit(1);
}

s32 myrand()
{
    static u64 state = 1619236481962341ULL;
    state *= 3123692312231ULL;
    state++;
    return state >> 33;
}

s16 qsample(s32 x, s32 scale)
{
    // Compute x / 2^scale rounded to the nearest integer, breaking ties towards zero.
    if (scale == 0) return x;
    return (x + (1 << (scale - 1)) - (x > 0)) >> scale;
}

s16 clamp_to_s16(s32 x)
{
    if (x < -0x8000) return -0x8000;
    if (x > 0x7fff) return 0x7fff;
    return (s16) x;
}

s32 toi4(s32 x)
{
    if (x >= 8) return x - 16;
    return x;
}

s32 readaifccodebook(FILE *fhandle, s32 ****table, s16 *order, s16 *npredictors)
{
    checked_fread(order, sizeof(s16), 1, fhandle);
    BSWAP16(*order);
    checked_fread(npredictors, sizeof(s16), 1, fhandle);
    BSWAP16(*npredictors);
    *table = malloc(*npredictors * sizeof(s32 **));
    for (s32 i = 0; i < *npredictors; i++) {
        (*table)[i] = malloc(8 * sizeof(s32 *));
        for (s32 j = 0; j < 8; j++) {
            (*table)[i][j] = malloc((*order + 8) * sizeof(s32));
        }
    }

    for (s32 i = 0; i < *npredictors; i++) {
        s32 **table_entry = (*table)[i];
        for (s32 j = 0; j < *order; j++) {
            for (s32 k = 0; k < 8; k++) {
                s16 ts;
                checked_fread(&ts, sizeof(s16), 1, fhandle);
                BSWAP16(ts);
                table_entry[k][j] = ts;
            }
        }

        for (s32 k = 1; k < 8; k++) {
            table_entry[k][*order] = table_entry[k - 1][*order - 1];
        }

        table_entry[0][*order] = 1 << 11;

        for (s32 k = 1; k < 8; k++) {
            s32 j = 0;
            for (; j < k; j++) {
                table_entry[j][k + *order] = 0;
            }

            for (; j < 8; j++) {
                table_entry[j][k + *order] = table_entry[j - k][*order];
            }
        }
    }
    return 0;
}

ALADPCMloop *readlooppoints(FILE *ifile, s16 *nloops)
{
    checked_fread(nloops, sizeof(s16), 1, ifile);
    BSWAP16(*nloops);
    ALADPCMloop *al = malloc(*nloops * sizeof(ALADPCMloop));
    for (s32 i = 0; i < *nloops; i++) {
        checked_fread(&al[i], sizeof(ALADPCMloop), 1, ifile);
        BSWAP32(al[i].start);
        BSWAP32(al[i].end);
        BSWAP32(al[i].count);
        BSWAP16_MANY(al[i].state, 16);
    }
    return al;
}

s32 inner_product(s32 length, s32 *v1, s32 *v2)
{
    s32 out = 0;
    for (s32 i = 0; i < length; i++) {
        out += v1[i] * v2[i];
    }

    // Compute "out / 2^11", rounded down.
    s32 dout = out / (1 << 11);
    s32 fiout = dout * (1 << 11);
    return dout - (out - fiout < 0);
}

void my_decodeframe(u8 *frame, s32 *state, s32 order, s32 ***coefTable)
{
    s32 ix[16];

    u8 header = frame[0];
    s32 scale = 1 << (header >> 4);
    s32 optimalp = header & 0xf;

    for (s32 i = 0; i < 16; i += 2) {
        u8 c = frame[1 + i/2];
        ix[i] = c >> 4;
        ix[i + 1] = c & 0xf;
    }

    for (s32 i = 0; i < 16; i++) {
        if (ix[i] >= 8) ix[i] -= 16;
        ix[i] *= scale;
    }

    for (s32 j = 0; j < 2; j++) {
        s32 in_vec[16];
        if (j == 0) {
            for (s32 i = 0; i < order; i++) {
                in_vec[i] = state[16 - order + i];
            }
        } else {
            for (s32 i = 0; i < order; i++) {
                in_vec[i] = state[8 - order + i];
            }
        }

        for (s32 i = 0; i < 8; i++) {
            s32 ind = j * 8 + i;
            in_vec[order + i] = ix[ind];
            state[ind] = inner_product(order + i, coefTable[optimalp][i], in_vec) + ix[ind];
        }
    }
}

void my_encodeframe(u8 *out, s16 *inBuffer, s32 *state, s32 ***coefTable, s32 order, s32 npredictors)
{
    s16 ix[16];
    s32 prediction[16];
    s32 inVector[16];
    s32 saveState[16];
    s32 optimalp = 0;
    s32 scale;
    s32 ie[16];
    s32 e[16];
    f32 min = 1e30;

    for (s32 k = 0; k < npredictors; k++) {
        for (s32 j = 0; j < 2; j++) {
            for (s32 i = 0; i < order; i++) {
                inVector[i] = (j == 0 ? state[16 - order + i] : inBuffer[8 - order + i]);
            }

            for (s32 i = 0; i < 8; i++) {
                prediction[j * 8 + i] = inner_product(order + i, coefTable[k][i], inVector);
                e[j * 8 + i] = inVector[i + order] = inBuffer[j * 8 + i] - prediction[j * 8 + i];
            }
        }

        f32 se = 0.0f;
        for (s32 j = 0; j < 16; j++) {
            se += (f32) e[j] * (f32) e[j];
        }

        if (se < min) {
            min = se;
            optimalp = k;
        }
    }

    for (s32 j = 0; j < 2; j++) {
        for (s32 i = 0; i < order; i++) {
            inVector[i] = (j == 0 ? state[16 - order + i] : inBuffer[8 - order + i]);
        }

        for (s32 i = 0; i < 8; i++) {
            prediction[j * 8 + i] = inner_product(order + i, coefTable[optimalp][i], inVector);
            e[j * 8 + i] = inVector[i + order] = inBuffer[j * 8 + i] - prediction[j * 8 + i];
        }
    }

    for (s32 i = 0; i < 16; i++) {
        ie[i] = clamp_to_s16(e[i]);
    }

    s32 max = 0;
    for (s32 i = 0; i < 16; i++) {
        if (abs(ie[i]) > abs(max)) {
            max = ie[i];
        }
    }

    for (scale = 0; scale <= 12; scale++) {
        if (max <= 7 && max >= -8) break;
        max /= 2;
    }

    for (s32 i = 0; i < 16; i++) {
        saveState[i] = state[i];
    }

    for (s32 nIter = 0, again = 1; nIter < 2 && again; nIter++) {
        again = 0;
        if (nIter == 1) scale++;
        if (scale > 12) {
            scale = 12;
        }

        for (s32 j = 0; j < 2; j++) {
            s32 base = j * 8;
            for (s32 i = 0; i < order; i++) {
                inVector[i] = (j == 0 ?
                        saveState[16 - order + i] : state[8 - order + i]);
            }

            for (s32 i = 0; i < 8; i++) {
                prediction[base + i] = inner_product(order + i, coefTable[optimalp][i], inVector);
                s32 se = inBuffer[base + i] - prediction[base + i];
                ix[base + i] = qsample(se, scale);
                s32 cV = clamp_to_s16(ix[base + i]) - ix[base + i];
                if (cV > 1 || cV < -1) again = 1;
                ix[base + i] += cV;
                inVector[i + order] = ix[base + i] * (1 << scale);
                state[base + i] = prediction[base + i] + inVector[i + order];
            }
        }
    }

    u8 header = (scale << 4) | (optimalp & 0xf);
    out[0] = header;
    for (s32 i = 0; i < 16; i += 2) {
        u8 c = ((ix[i] & 0xf) << 4) | (ix[i + 1] & 0xf);
        out[1 + i/2] = c;
    }
}

void permute(s16 *out, s32 *in, s32 scale)
{
    for (s32 i = 0; i < 16; i++) {
        out[i] = clamp_to_s16(in[i] - scale / 2 + myrand() % (scale + 1));
    }
}

void write_header(FILE *ofile, const char *id, s32 size)
{
    fwrite(id, 4, 1, ofile);
    BSWAP32(size);
    fwrite(&size, sizeof(s32), 1, ofile);
}

int main(int argc, char **argv)
{
    s16 order = -1;
    s16 nloops = 0;
    ALADPCMloop *aloops = NULL;
    s16 npredictors = -1;
    s32 ***coefTable = NULL;
    s32 state[16];
    s32 soundPointer = -1;
    s32 currPos = 0;
    s32 nSamples = 0;
    Chunk FormChunk;
    ChunkHeader Header;
    CommonChunk CommChunk;
    InstrumentChunk InstChunk;
    SoundDataChunk SndDChunk;
    FILE *ifile;
    FILE *ofile;
    progname = argv[0];

    if (argc < 3) {
        fprintf(stderr, "%s %s\n", progname, usage);
        exit(1);
    }

    infilename = argv[1];

    if ((ifile = fopen(infilename, "rb")) == NULL) {
        fail_parse("AIFF-C file could not be opened");
        exit(1);
    }

    if ((ofile = fopen(argv[2], "wb")) == NULL) {
        fprintf(stderr, "%s: output file could not be opened [%s]\n", progname, argv[2]);
        exit(1);
    }

    memset(&InstChunk, 0, sizeof(InstChunk));

    checked_fread(&FormChunk, sizeof(FormChunk), 1, ifile);
    BSWAP32(FormChunk.ckID);
    BSWAP32(FormChunk.formType);
    if ((FormChunk.ckID != 0x464f524d) || (FormChunk.formType != 0x41494643)) { // FORM, AIFC
        fail_parse("not an AIFF-C file");
    }

    for (;;) {
        s32 num = fread(&Header, sizeof(Header), 1, ifile);
        u32 ts;
        if (num <= 0) break;
        BSWAP32(Header.ckID);
        BSWAP32(Header.ckSize);

        Header.ckSize++;
        Header.ckSize &= ~1;
        s32 offset = ftell(ifile);

        switch (Header.ckID) {
        case 0x434f4d4d: // COMM
            checked_fread(&CommChunk, sizeof(CommChunk), 1, ifile);
            BSWAP16(CommChunk.numChannels);
            BSWAP16(CommChunk.numFramesH);
            BSWAP16(CommChunk.numFramesL);
            BSWAP16(CommChunk.sampleSize);
            BSWAP16(CommChunk.compressionTypeH);
            BSWAP16(CommChunk.compressionTypeL);
            s32 cType = (CommChunk.compressionTypeH << 16) + CommChunk.compressionTypeL;
            if (cType != 0x56415043) { // VAPC
                fail_parse("file is of the wrong compression type");
            }
            if (CommChunk.numChannels != 1) {
                fail_parse("file contains %d channels, only 1 channel supported", CommChunk.numChannels);
            }
            if (CommChunk.sampleSize != 16) {
                fail_parse("file contains %d bit samples, only 16 bit samples supported", CommChunk.sampleSize);
            }

            nSamples = (CommChunk.numFramesH << 16) + CommChunk.numFramesL;

            // Allow broken input lengths
            if (nSamples % 16) {
                nSamples--;
            }

            if (nSamples % 16 != 0) {
                fail_parse("number of chunks must be a multiple of 16, found %d", nSamples);
            }
            break;

        case 0x53534e44: // SSND
            checked_fread(&SndDChunk, sizeof(SndDChunk), 1, ifile);
            BSWAP32(SndDChunk.offset);
            BSWAP32(SndDChunk.blockSize);
            assert(SndDChunk.offset == 0);
            assert(SndDChunk.blockSize == 0);
            soundPointer = ftell(ifile);
            break;

        case 0x4150504c: // APPL
            checked_fread(&ts, sizeof(u32), 1, ifile);
            BSWAP32(ts);
            if (ts == 0x73746f63) { // stoc
                u8 len;
                checked_fread(&len, 1, 1, ifile);
                if (len == 11) {
                    char ChunkName[12];
                    s16 version;
                    checked_fread(ChunkName, 11, 1, ifile);
                    ChunkName[11] = '\0';
                    if (strcmp("VADPCMCODES", ChunkName) == 0) {
                        checked_fread(&version, sizeof(s16), 1, ifile);
                        BSWAP16(version);
                        if (version != 1) {
                            fail_parse("Unknown codebook chunk version");
                        }
                        readaifccodebook(ifile, &coefTable, &order, &npredictors);
                    }
                    else if (strcmp("VADPCMLOOPS", ChunkName) == 0) {
                        checked_fread(&version, sizeof(s16), 1, ifile);
                        BSWAP16(version);
                        if (version != 1) {
                            fail_parse("Unknown loop chunk version");
                        }
                        aloops = readlooppoints(ifile, &nloops);
                        if (nloops != 1) {
                            fail_parse("Only a single loop supported");
                        }
                    }
                }
            }
            break;
        }

        fseek(ifile, offset + Header.ckSize, SEEK_SET);
    }

    if (coefTable == NULL) {
        fail_parse("Codebook missing from bitstream");
    }

    for (s32 i = 0; i < order; i++) {
        state[15 - i] = 0;
    }

    u32 outputBytes = nSamples * sizeof(s16);
    u8 *outputBuf = malloc(outputBytes);

    fseek(ifile, soundPointer, SEEK_SET);
    while (currPos < nSamples) {
        u8 input[9];
        u8 encoded[9];
        s32 lastState[16];
        s32 decoded[16];
        s16 guess[16];
        s16 origGuess[16];

        memcpy(lastState, state, sizeof(lastState));
        checked_fread(input, 9, 1, ifile);

        // Decode for real
        my_decodeframe(input, state, order, coefTable);
        memcpy(decoded, state, sizeof(lastState));

        // Create a guess from that, by clamping to 16 bits
        for (s32 i = 0; i < 16; i++) {
            origGuess[i] = clamp_to_s16(state[i]);
        }

        // Encode the guess
        memcpy(state, lastState, sizeof(lastState));
        memcpy(guess, origGuess, sizeof(guess));
        my_encodeframe(encoded, guess, state, coefTable, order, npredictors);

        // If it doesn't match, randomly round numbers until it does.
        if (memcmp(input, encoded, 9) != 0) {
            s32 scale = 1 << (input[0] >> 4);
            do {
                permute(guess, decoded, scale);
                memcpy(state, lastState, sizeof(lastState));
                my_encodeframe(encoded, guess, state, coefTable, order, npredictors);
            } while (memcmp(input, encoded, 9) != 0);

            // Bring the matching closer to the original decode (not strictly
            // necessary, but it will move us closer to the target on average).
            for (s32 failures = 0; failures < 50; failures++) {
                s32 ind = myrand() % 16;
                s32 old = guess[ind];
                if (old == origGuess[ind]) continue;
                guess[ind] = origGuess[ind];
                if (myrand() % 2) guess[ind] += (old - origGuess[ind]) / 2;
                memcpy(state, lastState, sizeof(lastState));
                my_encodeframe(encoded, guess, state, coefTable, order, npredictors);
                if (memcmp(input, encoded, 9) == 0) {
                    failures = -1;
                }
                else {
                    guess[ind] = old;
                }
            }
        }

        memcpy(state, decoded, sizeof(lastState));
        BSWAP16_MANY(guess, 16);
        memcpy(outputBuf + currPos * 2, guess, sizeof(guess));
        currPos += 16;
    }

    // Write an incomplete file header. We'll fill in the size later.
    fwrite("FORM\0\0\0\0AIFF", 12, 1, ofile);

    // Subtract 4 from the COMM size to skip the compression field.
    write_header(ofile, "COMM", sizeof(CommonChunk) - 4);
    CommChunk.numFramesH = nSamples >> 16;
    CommChunk.numFramesL = nSamples & 0xffff;
    BSWAP16(CommChunk.numChannels);
    BSWAP16(CommChunk.numFramesH);
    BSWAP16(CommChunk.numFramesL);
    BSWAP16(CommChunk.sampleSize);
    fwrite(&CommChunk, sizeof(CommonChunk) - 4, 1, ofile);

    if (nloops > 0) {
        s32 startPos = aloops[0].start, endPos = aloops[0].end;
        const char *markerNames[2] = {"start", "end"};
        Marker markers[2] = {
            {1, startPos >> 16, startPos & 0xffff},
            {2, endPos >> 16, endPos & 0xffff}
        };
        write_header(ofile, "MARK", 2 + 2 * sizeof(Marker) + 1 + 5 + 1 + 3);
        s16 numMarkers = bswap16(2);
        fwrite(&numMarkers, sizeof(s16), 1, ofile);
        for (s32 i = 0; i < 2; i++) {
            u8 len = (u8) strlen(markerNames[i]);
            BSWAP16(markers[i].MarkerID);
            BSWAP16(markers[i].positionH);
            BSWAP16(markers[i].positionL);
            fwrite(&markers[i], sizeof(Marker), 1, ofile);
            fwrite(&len, 1, 1, ofile);
            fwrite(markerNames[i], len, 1, ofile);
        }

        write_header(ofile, "INST", sizeof(InstrumentChunk));
        InstChunk.sustainLoop.playMode = bswap16(1);
        InstChunk.sustainLoop.beginLoop = bswap16(1);
        InstChunk.sustainLoop.endLoop = bswap16(2);
        InstChunk.releaseLoop.playMode = 0;
        InstChunk.releaseLoop.beginLoop = 0;
        InstChunk.releaseLoop.endLoop = 0;
        fwrite(&InstChunk, sizeof(InstrumentChunk), 1, ofile);
    }

    // Save the coefficient table for use when encoding. Ideally this wouldn't
    // be needed and "tabledesign -s 1" would generate the right table, but in
    // practice it's difficult to adjust samples to make that happen.
    write_header(ofile, "APPL", 4 + 12 + sizeof(CodeChunk) + npredictors * order * 8 * 2);
    fwrite("stoc", 4, 1, ofile);
    CodeChunk cChunk;
    cChunk.version = bswap16(1);
    cChunk.order = bswap16(order);
    cChunk.nEntries = bswap16(npredictors);
    fwrite("\x0bVADPCMCODES", 12, 1, ofile);
    fwrite(&cChunk, sizeof(CodeChunk), 1, ofile);
    for (s32 i = 0; i < npredictors; i++) {
        for (s32 j = 0; j < order; j++) {
            for (s32 k = 0; k < 8; k++) {
                s16 ts = bswap16(coefTable[i][k][j]);
                fwrite(&ts, sizeof(s16), 1, ofile);
            }
        }
    }

    write_header(ofile, "SSND", outputBytes + 8);
    SndDChunk.offset = 0;
    SndDChunk.blockSize = 0;
    fwrite(&SndDChunk, sizeof(SoundDataChunk), 1, ofile);
    fwrite(outputBuf, outputBytes, 1, ofile);

    // Fix the size in the header
    s32 fileSize = bswap32(ftell(ofile) - 8);
    fseek(ofile, 4, SEEK_SET);
    fwrite(&fileSize, 4, 1, ofile);

    fclose(ifile);
    fclose(ofile);
    return 0;
}