/**************************************************************************** * * * Azimer's HLE Audio Plugin for Project64 Compatible N64 Emulators * * http://www.apollo64.com/ * * Copyright (C) 2000-2019 Azimer. All rights reserved. * * * * License: * * GNU/GPLv2 http://www.gnu.org/licenses/gpl-2.0.html * * * ****************************************************************************/ /* memset() and memcpy() */ #include #include "audiohle.h" u16 adpcmtable[0x88]; void InitInput(s16* inp, int index, u8 icode, u8 mask, u8 shifter, int vscale) { inp[index] = (s16)((icode & mask) << shifter); inp[index] = (s16)((s32)(inp[index] * vscale) >> 16); } void ADPCM_madd(s32* a, s16* book1, s16* book2, s16 l1, s16 l2, s16* inp) { #if defined(SSE2_SUPPORT) __m128i xmm_source, xmm_target; __m128i prod_m, prod_n; /* [0] 0xMMMMNNNN, [1] 0xMMMMNNNN, ... [7] */ __m128i prod_hi, prod_lo; /* (s32)[0, 1, 2, 3], (s32)[4, 5, 6, 7] */ #endif s32 accumulators[4]; s16 b[8]; register int i; #if defined(SSE2_SUPPORT) xmm_source = _mm_set1_epi16(l1); xmm_target = _mm_loadu_si128((__m128i *)book1); prod_m = _mm_mulhi_epi16(xmm_target, xmm_source); prod_n = _mm_mullo_epi16(xmm_target, xmm_source); prod_hi = _mm_unpacklo_epi16(prod_n, prod_m); prod_lo = _mm_unpackhi_epi16(prod_n, prod_m); xmm_source = _mm_set1_epi16(l2); xmm_target = _mm_loadu_si128((__m128i *)book2); prod_m = _mm_mulhi_epi16(xmm_target, xmm_source); prod_n = _mm_mullo_epi16(xmm_target, xmm_source); xmm_target = _mm_unpacklo_epi16(prod_n, prod_m); xmm_source = _mm_unpackhi_epi16(prod_n, prod_m); /* * for (i = 0; i < 8; i++) * products[i] = (l1[i] * book1[i]) + (l2[i] * book2[i]); */ prod_hi = _mm_add_epi32(prod_hi, xmm_target); prod_lo = _mm_add_epi32(prod_lo, xmm_source); /* * for (i = 0; i < 8; i++) * a[i] += inp[i] << 11; */ xmm_source = _mm_loadu_si128((__m128i *)inp); prod_m = _mm_unpacklo_epi16(xmm_source, xmm_source); /* (xmm_source, any) */ prod_n = _mm_unpackhi_epi16(xmm_source, xmm_source); /* Ignore upper 16b. */ prod_m = _mm_slli_epi32(prod_m, 16); /* ready to sign-extend s16 to s32 */ prod_n = _mm_slli_epi32(prod_n, 16); prod_m = _mm_srai_epi32(prod_m, 16 - 11); /* inp[i] << 11 = 2048 * inp[i] */ prod_n = _mm_srai_epi32(prod_n, 16 - 11); prod_hi = _mm_add_epi32(prod_hi, prod_m); prod_lo = _mm_add_epi32(prod_lo, prod_n); _mm_storeu_si128((__m128i *)&a[0], prod_hi); _mm_storeu_si128((__m128i *)&a[4], prod_lo); #else for (i = 0; i < 8; i++) a[i] = (s32)l1; for (i = 0; i < 8; i++) a[i] *= (s32)book1[i]; for (i = 0; i < 8; i++) b[i] = l2; for (i = 0; i < 8; i++) a[i] += (s32)b[i] * (s32)book2[i]; for (i = 0; i < 8; i++) a[i] += 2048 * inp[i]; #endif #if defined(SSE2_SUPPORT) _mm_storeu_si128((__m128i *)&b[0], _mm_setzero_si128()); xmm_source = _mm_loadu_si128((__m128i *)inp); #endif /* * for (j = 0; j < 8; j++) * for (i = 0; i < j; i++) * a[j] += (s32)book2[j - i - 1] * inp[i]; */ for (i = 0; i < 1; i++) b[i] = book2[0 - i]; accumulators[0] = (s32)b[0] * (s32)inp[0]; a[1] += accumulators[0]; for (i = 0; i < 2; i++) b[i] = book2[1 - i]; #if defined(SSE2_SUPPORT) xmm_target = _mm_loadu_si128((__m128i *)&b[0]); xmm_target = _mm_madd_epi16(xmm_target, xmm_source); _mm_storeu_si128((__m128i *)&accumulators[0], xmm_target); #else accumulators[0] = (s32)b[0] * (s32)inp[0] + (s32)b[1] * (s32)inp[1]; #endif a[2] += accumulators[0]; for (i = 0; i < 3; i++) b[i] = book2[2 - i]; #if defined(SSE2_SUPPORT) xmm_target = _mm_loadu_si128((__m128i *)&b[0]); xmm_target = _mm_madd_epi16(xmm_target, xmm_source); _mm_storeu_si128((__m128i *)&accumulators[0], xmm_target); #else accumulators[0] = (s32)b[0] * (s32)inp[0] + (s32)b[1] * (s32)inp[1]; accumulators[1] = (s32)b[2] * (s32)inp[2]; #endif a[3] += accumulators[0] + accumulators[1]; for (i = 0; i < 4; i++) b[i] = book2[3 - i]; #if defined(SSE2_SUPPORT) xmm_target = _mm_loadu_si128((__m128i *)&b[0]); xmm_target = _mm_madd_epi16(xmm_target, xmm_source); _mm_storeu_si128((__m128i *)&accumulators[0], xmm_target); #else accumulators[0] = (s32)b[0] * (s32)inp[0] + (s32)b[1] * (s32)inp[1]; accumulators[1] = (s32)b[2] * (s32)inp[2] + (s32)b[3] * (s32)inp[3]; #endif a[4] += accumulators[0] + accumulators[1]; for (i = 0; i < 5; i++) b[i] = book2[4 - i]; #if defined(SSE2_SUPPORT) xmm_target = _mm_loadu_si128((__m128i *)&b[0]); xmm_target = _mm_madd_epi16(xmm_target, xmm_source); _mm_storeu_si128((__m128i *)&accumulators[0], xmm_target); #else accumulators[0] = (s32)b[0] * (s32)inp[0] + (s32)b[1] * (s32)inp[1]; accumulators[1] = (s32)b[2] * (s32)inp[2] + (s32)b[3] * (s32)inp[3]; accumulators[2] = (s32)b[4] * (s32)inp[4]; #endif a[5] += accumulators[0] + accumulators[1] + accumulators[2]; for (i = 0; i < 6; i++) b[i] = book2[5 - i]; #if defined(SSE2_SUPPORT) xmm_target = _mm_loadu_si128((__m128i *)&b[0]); xmm_target = _mm_madd_epi16(xmm_target, xmm_source); _mm_storeu_si128((__m128i *)&accumulators[0], xmm_target); #else accumulators[0] = (s32)b[0] * (s32)inp[0] + (s32)b[1] * (s32)inp[1]; accumulators[1] = (s32)b[2] * (s32)inp[2] + (s32)b[3] * (s32)inp[3]; accumulators[2] = (s32)b[4] * (s32)inp[4] + (s32)b[5] * (s32)inp[5]; #endif a[6] += accumulators[0] + accumulators[1] + accumulators[2]; for (i = 0; i < 7; i++) b[i] = book2[6 - i]; #if defined(SSE2_SUPPORT) xmm_target = _mm_loadu_si128((__m128i *)&b[0]); xmm_target = _mm_madd_epi16(xmm_target, xmm_source); _mm_storeu_si128((__m128i *)&accumulators[0], xmm_target); #else accumulators[0] = (s32)b[0] * (s32)inp[0] + (s32)b[1] * (s32)inp[1]; accumulators[1] = (s32)b[2] * (s32)inp[2] + (s32)b[3] * (s32)inp[3]; accumulators[2] = (s32)b[4] * (s32)inp[4] + (s32)b[5] * (s32)inp[5]; accumulators[3] = (s32)b[6] * (s32)inp[6]; #endif a[7] += accumulators[0] + accumulators[1] + accumulators[2] + accumulators[3]; } void ADPCM() { // Work in progress! :) u8 Flags = (u8)((k0 >> 16) & 0xff); //u16 Gain = (u16)(k0 & 0xffff); u32 Address = (t9 & 0xffffff);// + SEGMENTS[(t9>>24)&0xf]; u16 inPtr = 0; //s16 *out=(s16 *)(testbuff+(AudioOutBuffer>>2)); s16 *out = (s16 *)(BufferSpace + AudioOutBuffer); //u8 *in = (u8 *)(BufferSpace + AudioInBuffer); s16 count = (s16)AudioCount; int vscale; u16 index; s32 a[8]; s16 b[8]; s16* book1; s16* book2; /* if (Address > (1024*1024*8)) Address = (t9 & 0xffffff); */ memset(out, 0, 32); if (!(Flags & 0x1)) { if (Flags & 0x2) { memcpy(out, &DRAM[loopval], 32); } else { memcpy(out, &DRAM[Address], 32); } } s16 l1 = out[15]; s16 l2 = out[14]; s16 inp1[8]; s16 inp2[8]; out += 16; while (count>0) { // the first interation through, these values are // either 0 in the case of A_INIT, from a special // area of memory in the case of A_LOOP or just // the values we calculated the last time u8 code = BufferSpace[BES(AudioInBuffer + inPtr)]; index = code & 0xf; index <<= 4; // index into the adpcm code table book1 = (s16 *)&adpcmtable[index]; book2 = book1 + 8; code >>= 4; // upper nibble is scale #if 0 assert((12 - code) - 1 >= 0); #endif vscale = 0x8000u >> ((12 - code) - 1); // very strange. 0x8000 would be .5 in 16:16 format // so this appears to be a fractional scale based // on the 12 based inverse of the scale value. note // that this could be negative, in which case we do // not use the calculated vscale value... if ((12 - code) - 1 < 0) vscale = 0x10000; /* null operation: << 16 then >> 16 */ inPtr++; // coded adpcm data lies next for (int i = 0; i < 8; i += 2) // loop of 8, for 8 coded nibbles from 4 bytes // which yields 8 short pcm values { u8 icode = BufferSpace[BES(AudioInBuffer + inPtr)]; inPtr++; InitInput(inp1, i + 0, icode, 0xF0, 8, vscale); // this will in effect be signed InitInput(inp1, i + 1, icode, 0x0F, 12, vscale); } for (int i = 0; i < 8; i += 2) { u8 icode = BufferSpace[BES(AudioInBuffer + inPtr)]; inPtr++; InitInput(inp2, i + 0, icode, 0xF0, 8, vscale); // this will in effect be signed InitInput(inp2, i + 1, icode, 0x0F, 12, vscale); } ADPCM_madd(a, book1, book2, l1, l2, inp1); for (int i = 0; i < 8; i++) a[i] = a[i] >> 11; vsats128(&b[0], &a[0]); swap_elements(out, &b[0]); out += 8; l1 = b[6]; l2 = b[7]; ADPCM_madd(a, book1, book2, l1, l2, inp2); for (int i = 0; i < 8; i++) a[i] = a[i] >> 11; vsats128(&b[0], &a[0]); swap_elements(out, &b[0]); out += 8; l1 = b[6]; l2 = b[7]; count -= 32; } out -= 16; memcpy(&DRAM[Address], out, 32); } void ADPCM2() { // Verified to be 100% Accurate... u8 Flags = (u8)((k0 >> 16) & 0xff); // u16 Gain = (u16)(k0 & 0xffff); u32 Address = (t9 & 0xffffff);// + SEGMENTS[(t9>>24)&0xf]; u16 inPtr = 0; //s16 *out=(s16 *)(testbuff+(AudioOutBuffer>>2)); s16 *out = (s16 *)(BufferSpace + AudioOutBuffer); // u8 *in = (u8 *)(BufferSpace + AudioInBuffer); s16 count = (s16)AudioCount; int vscale; u16 index; s32 a[8]; s16 b[8]; s16* book1; s16* book2; u8 srange; //u8 inpinc; u8 mask1; u8 mask2; u8 shifter; memset(out, 0, 32); if (!(Flags & 0x1)) { if (Flags & 0x2) memcpy(out, &DRAM[loopval], 32); else memcpy(out, &DRAM[Address], 32); } if (Flags & 0x4) { // Needed for Zelda MM srange = 0xE; //inpinc = 0x5; mask1 = 0xC0; mask2 = 0x30; shifter = 10; } else { srange = 0xC; //inpinc = 0x9; mask1 = 0xF0; mask2 = 0x0F; shifter = 12; } s16 l1 = out[15]; s16 l2 = out[14]; s16 inp1[8]; s16 inp2[8]; out += 16; while (count>0) { u8 code = BufferSpace[BES(AudioInBuffer + inPtr)]; index = code & 0xf; index <<= 4; book1 = (s16 *)&adpcmtable[index]; book2 = book1 + 8; code >>= 4; #if 0 assert((srange - code) - 1 >= 0); #endif vscale = 0x8000u >> ((srange - code) - 1); if ((srange - code) - 1 < 0) vscale = 0x10000; /* null operation: << 16 then >> 16 */ inPtr++; for (int i = 0; i < 8; ) { u8 icode = BufferSpace[BES(AudioInBuffer + inPtr)]; inPtr++; InitInput(inp1, i + 0, icode, mask1, 8, vscale); // this will in effect be signed InitInput(inp1, i + 1, icode, mask2, shifter, vscale); i += 2; if (Flags & 4) { InitInput(inp1, i + 0, icode, 0xC, 12, vscale); // this will in effect be signed InitInput(inp1, i + 1, icode, 0x3, 14, vscale); i += 2; } // end flags } // end while for (int i = 0; i < 8;) { u8 icode = BufferSpace[BES(AudioInBuffer + inPtr)]; inPtr++; InitInput(inp2, i + 0, icode, mask1, 8, vscale); InitInput(inp2, i + 1, icode, mask2, shifter, vscale); i += 2; if (Flags & 4) { InitInput(inp2, i + 0, icode, 0xC, 12, vscale); InitInput(inp2, i + 1, icode, 0x3, 14, vscale); i += 2; } // end flags } ADPCM_madd(a, book1, book2, l1, l2, inp1); for (int i = 0; i < 8; i++) a[i] = a[i] >> 11; vsats128(&b[0], &a[0]); swap_elements(out, &b[0]); out += 8; l1 = b[6]; l2 = b[7]; ADPCM_madd(a, book1, book2, l1, l2, inp2); for (int i = 0; i < 8; i++) a[i] = a[i] >> 11; vsats128(&b[0], &a[0]); swap_elements(out, &b[0]); out += 8; l1 = b[6]; l2 = b[7]; count -= 32; } out -= 16; memcpy(&DRAM[Address], out, 32); } void ADPCM3() { // Verified to be 100% Accurate... u8 Flags = (u8)((t9 >> 0x1c) & 0xff); //u16 Gain=(u16)(k0&0xffff); u32 Address = (k0 & 0xffffff);// + SEGMENTS[(t9>>24)&0xf]; u16 inPtr = (t9 >> 12) & 0xf; //s16 *out=(s16 *)(testbuff+(AudioOutBuffer>>2)); s16 *out = (s16 *)(BufferSpace + (t9 & 0xfff) + 0x4f0); // u8 *in = (u8 *)(BufferSpace + ((t9 >> 12) & 0xf) + 0x4f0); s16 count = (s16)((t9 >> 16) & 0xfff); int vscale; u16 index; s32 a[8]; s16 b[8]; s16* book1; s16* book2; memset(out, 0, 32); if (!(Flags & 0x1)) { if (Flags & 0x2) memcpy(out, &DRAM[loopval], 32); else memcpy(out, &DRAM[Address], 32); } s16 l1 = out[15]; s16 l2 = out[14]; s16 inp1[8]; s16 inp2[8]; out += 16; while (count>0) { // the first interation through, these values are // either 0 in the case of A_INIT, from a special // area of memory in the case of A_LOOP or just // the values we calculated the last time u8 code = BufferSpace[BES(0x4f0 + inPtr)]; index = code & 0xf; index <<= 4; // index into the adpcm code table book1 = (s16 *)&adpcmtable[index]; book2 = book1 + 8; code >>= 4; // upper nibble is scale vscale = 0x8000u >> ((12 - code) - 1); // very strange. 0x8000 would be .5 in 16:16 format // so this appears to be a fractional scale based // on the 12 based inverse of the scale value. note // that this could be negative, in which case we do // not use the calculated vscale value... if ((12 - code) - 1 < 0) vscale = 0x10000; /* null operation: << 16 then >> 16 */ inPtr++; // coded adpcm data lies next for (int i = 0; i < 8; i += 2) // loop of 8, for 8 coded nibbles from 4 bytes // which yields 8 short pcm values { u8 icode = BufferSpace[BES(0x4f0 + inPtr)]; inPtr++; InitInput(inp1, i + 0, icode, 0xF0, 8, vscale); // this will in effect be signed InitInput(inp1, i + 1, icode, 0x0F, 12, vscale); } for (int i = 0; i < 8; i += 2) { u8 icode = BufferSpace[BES(0x4F0 + inPtr)]; inPtr++; InitInput(inp2, i + 0, icode, 0xF0, 8, vscale); // this will in effect be signed InitInput(inp2, i + 1, icode, 0x0F, 12, vscale); } ADPCM_madd(a, book1, book2, l1, l2, inp1); for (int i = 0; i < 8; i++) a[i] = a[i] >> 11; vsats128(&b[0], &a[0]); swap_elements(out, &b[0]); out += 8; l1 = b[6]; l2 = b[7]; ADPCM_madd(a, book1, book2, l1, l2, inp2); for (int i = 0; i < 8; i++) a[i] = a[i] >> 11; vsats128(&b[0], &a[0]); swap_elements(out, &b[0]); // *(out + i + 0x1F8) = b[i ^ 1]; out += 8; l1 = b[6]; l2 = b[7]; count -= 32; } out -= 16; memcpy(&DRAM[Address], out, 32); } void LOADADPCM() { // Loads an ADPCM table - Works 100% Now 03-13-01 u32 v0; size_t i, limit; v0 = (t9 & 0xffffff);// + SEGMENTS[(t9>>24)&0xf]; // if (v0 > (1024*1024*8)) // v0 = (t9 & 0xffffff); // memcpy (dmem+0x4c0, rdram+v0, k0&0xffff); // Could prolly get away with not putting this in dmem // assert ((k0&0xffff) <= 0x80); u16 *table = (u16 *)(DRAM + v0); limit = (k0 & 0x0000FFFF) >> 4; for (i = 0; i < limit; i++) swap_elements(&adpcmtable[8*i], &table[8*i]); } void LOADADPCM2() { // Loads an ADPCM table - Works 100% Now 03-13-01 u32 v0; size_t i, limit; v0 = (t9 & 0xffffff);// + SEGMENTS[(t9>>24)&0xf]; u16 *table = (u16 *)(DRAM + v0); // Zelda2 Specific... limit = (k0 & 0x0000FFFF) >> 4; for (i = 0; i < limit; i++) swap_elements(&adpcmtable[8*i], &table[8*i]); } void LOADADPCM3() { // Loads an ADPCM table - Works 100% Now 03-13-01 u32 v0; size_t i, limit; v0 = (t9 & 0xffffff); //memcpy (dmem+0x3f0, rdram+v0, k0&0xffff); //assert ((k0&0xffff) <= 0x80); u16 *table = (u16 *)(DRAM + v0); limit = (k0 & 0x0000FFFF) >> 4; for (i = 0; i < limit; i++) swap_elements(&adpcmtable[8*i], &table[8*i]); }