/******************************************************************************\
* Project:  MSP Simulation Layer for Vector Unit Computational Adds           *
* Authors:  Iconoclast                                                        *
* Release:  2018.03.18                                                        *
* License:  CC0 Public Domain Dedication                                      *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all copyright*
* and related and neighboring rights to this software to the public domain    *
* worldwide.  This software is distributed without any warranty.              *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along   *
* with this software.                                                          *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.            *
\******************************************************************************/

#include <string.h>

#include "add.h"

#ifdef ARCH_MIN_SSE2
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 max, min;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    /*
     * Due to premature clamping in between adds, sometimes we need to add
     * the LESSER of two integers, either VS or VT, to the carry-in flag
     * matching the current vector register slice, BEFORE finally adding the
     * greater integer.
     */
    max = _mm_max_epi16(dst, src);
    min = _mm_min_epi16(dst, src);

    min = _mm_adds_epi16(min, vco);
    max = _mm_adds_epi16(max, min);
    _mm_store_si128((v16 *)VD, max);
    return;
}
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 dif, res, xmm;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    res = _mm_subs_epi16(src, dst);

    /*
     * Due to premature clamps in-between subtracting two of the three
     * operands, we must be careful not to offset the result accidentally
     * when subtracting the corresponding VCO flag AFTER the saturation from
     * doing (VS - VT).
     */
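    /*
     * In effect (one reading of the mask logic below, stated cautiously):
     * the borrow may only be subtracted from lanes where (VS - VT) did not
     * already overflow past +32767.  A lane overflows exactly when VS >= 0,
     * VT < 0 and the wrapped 16-bit difference comes out negative; in such
     * a lane the saturated result is already +32767, and subtracting the
     * borrow again would wrongly give +32766.
     */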
    dif = _mm_add_epi16(res, vco);
    dif = _mm_xor_si128(dif, res); /* Adding one suddenly inverts the sign? */
    dif = _mm_and_si128(dif, dst); /* Sign change due to subtracting a neg. */
    xmm = _mm_sub_epi16(src, dst);
    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
    xmm = _mm_and_si128(xmm, src); /* VS - VT != INT16_MIN; VS - VT >= +32768 */
    xmm = _mm_srli_epi16(xmm, 15); /* src = (INT16_MAX + 1 === INT16_MIN) ? */

    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */
    res = _mm_subs_epi16(res, xmm);
    _mm_store_si128((v16 *)VD, res);
    return;
}
#else
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    i32 sum[N];
    i16 hi[N], lo[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        sum[i] = VS[i] + VT[i] + cf_co[i];
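    /*
     * Sign-mask clamping (a note on the loops below):  lo[i] becomes 0xFFFF
     * exactly when sum[i] < -32768 (sum[i] + 0x8000 goes negative), and
     * hi[i] becomes 0xFFFF exactly when sum[i] > +32767, via the arithmetic
     * right shift of the 32-bit sign bit.  The masks then force VD[i] to
     * -32768 or +32767 without any branches.
     */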
    for (i = 0; i < N; i++)
        lo[i] = (sum[i] + 0x8000) >> 31;
    for (i = 0; i < N; i++)
        hi[i] = (0x7FFF - sum[i]) >> 31;
    vector_copy(VD, VACC_L);
    for (i = 0; i < N; i++)
        VD[i] &= ~lo[i];
    for (i = 0; i < N; i++)
        VD[i] |= hi[i];
    for (i = 0; i < N; i++)
        VD[i] ^= 0x8000 & (hi[i] | lo[i]);
    return;
}
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    i32 dif[N];
    i16 hi[N], lo[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        dif[i] = VS[i] - VT[i] - cf_co[i];
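    /* Same sign-mask clamp as SIGNED_CLAMP_ADD above, applied to dif[]. */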
    for (i = 0; i < N; i++)
        lo[i] = (dif[i] + 0x8000) >> 31;
    for (i = 0; i < N; i++)
        hi[i] = (0x7FFF - dif[i]) >> 31;
    vector_copy(VD, VACC_L);
    for (i = 0; i < N; i++)
        VD[i] &= ~lo[i];
    for (i = 0; i < N; i++)
        VD[i] |= hi[i];
    for (i = 0; i < N; i++)
        VD[i] ^= 0x8000 & (hi[i] | lo[i]);
    return;
}
#endif

INLINE static void clr_ci(pi16 VD, pi16 VS, pi16 VT)
{ /* clear CARRY and carry in to accumulators */
    register unsigned int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] + VT[i] + cf_co[i];
    SIGNED_CLAMP_ADD(VD, VS, VT);

    /* CTC2 $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);
    return;
}

INLINE static void clr_bi(pi16 VD, pi16 VS, pi16 VT)
{ /* clear CARRY and borrow in to accumulators */
    register unsigned int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] - VT[i] - cf_co[i];
    SIGNED_CLAMP_SUB(VD, VS, VT);

    /* CTC2 $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);
    return;
}

/*
 * -1:  VT *= -1, because VS < 0 // VT ^= -2 if even, or ^= -1, += 1
 *  0:  VT *=  0, because VS = 0 // VT ^= VT
 * +1:  VT *= +1, because VS > 0 // VT ^=  0
 * VT ^= -1, "negate" -32768 as ~+32767 (corner case hack for N64 SP)
 */
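/*
 * Illustrative corner case (worked through, not from the original comment):
 * with VS < 0 and VT = -32768, the multiply by -1 wraps back to -32768 in
 * 16 bits, and the cch subtraction below turns it into -32769 mod 2^16 =
 * +32767, matching the RSP's abs(-32768) == +32767 behavior.
 */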
INLINE static void do_abs(pi16 VD, pi16 VS, pi16 VT)
{
    i16 neg[N], pos[N];
    i16 nez[N], cch[N]; /* corner case hack -- abs(-32768) == +32767 */
    ALIGNED i16 res[N];
    register unsigned int i;

    vector_copy(res, VT);
    for (i = 0; i < N; i++)
        cch[i] = (res[i] == -32768);

    for (i = 0; i < N; i++)
        neg[i] = (VS[i] < 0x0000);
    for (i = 0; i < N; i++)
        pos[i] = (VS[i] > 0x0000);
//  vector_wipe(nez);
    memset(&nez, 0, sizeof(nez));

    for (i = 0; i < N; i++)
        nez[i] -= neg[i];
    for (i = 0; i < N; i++)
        nez[i] += pos[i];

    for (i = 0; i < N; i++)
        res[i] *= nez[i];
    for (i = 0; i < N; i++)
        res[i] -= cch[i];
    vector_copy(VACC_L, res);
    vector_copy(VD, VACC_L);
    return;
}

INLINE static void set_co(pi16 VD, pi16 VS, pi16 VT)
{ /* set CARRY and carry out from sum */
    i32 sum[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        sum[i] = (u16)(VS[i]) + (u16)(VT[i]);
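    /*
     * With both operands widened as unsigned, sum[i] lies in [0, 131070],
     * so the shift by 16 further down yields exactly 0 or 1 as the
     * carry-out flag.
     */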
    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] + VT[i];
    vector_copy(VD, VACC_L);

    vector_wipe(cf_ne);
    for (i = 0; i < N; i++)
        cf_co[i] = sum[i] >> 16; /* native: (sum[i] > +65535) */
    return;
}

INLINE static void set_bo(pi16 VD, pi16 VS, pi16 VT)
{ /* set CARRY and borrow out from difference */
    i32 dif[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        dif[i] = (u16)(VS[i]) - (u16)(VT[i]);
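    /*
     * dif[i] goes negative exactly when the unsigned VS element is smaller
     * than the unsigned VT element, i.e. when the subtraction borrows;
     * cf_ne additionally records which elements differed at all.  Both
     * halves of VCF[0] are consumed later by VSUBC-dependent compares.
     */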
    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] - VT[i];
    for (i = 0; i < N; i++)
        cf_ne[i] = (VS[i] != VT[i]);
    for (i = 0; i < N; i++)
        cf_co[i] = (dif[i] < 0);
    vector_copy(VD, VACC_L);
    return;
}
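
/*
 * Each VECTOR_OPERATION wrapper below follows the same marshalling pattern:
 * under ARCH_MIN_SSE2 the incoming XMM vectors are spilled to ALIGNED i16
 * arrays so the scalar helpers can index elements, and the result array is
 * reloaded into a register on return; without SSE2, v16 is presumed here to
 * act as an element pointer, and the result goes out through V_result.
 */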

VECTOR_OPERATION VADD(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    clr_ci(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSUB(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    clr_bi(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VABS(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_abs(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VADDC(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    set_co(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSUBC(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    set_bo(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSAW(v16 vs, v16 vt)
{
    unsigned int element;

    element = 0xF & (inst_word >> 21);
    element ^= 0x8; /* Convert scalar whole elements 8:F to 0:7. */
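
    /*
     * VSAW encodes its accumulator selector in the instruction's element
     * field:  e = 8, 9 and 0xA pick the high, middle and low accumulator
     * slices, which the XOR above maps to indices 0, 1 and 2 into VACC.
     * Any value above 2 after the remap names no accumulator slice, hence
     * the guard below.
     */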
    if (element > 0x2) {
        message("VSAW\nIllegal mask.");
#ifdef ARCH_MIN_SSE2
        vector_wipe(vs);
#else
        vector_wipe(V_result);
#endif
    } else {
#ifdef ARCH_MIN_SSE2
        vs = *(v16 *)VACC[element];
#else
        vector_copy(V_result, VACC[element]);
#endif
    }
#ifdef ARCH_MIN_SSE2
    return (vt = vs);
#else
    if (vt == vs)
        return; /* -Wunused-but-set-parameter */
    return;
#endif
}