/* mupen64plus-rsp-cxd4/vu/select.c */

/******************************************************************************\
* Project:  MSP Simulation Layer for Vector Unit Computational Test Selects   *
* Authors:  Iconoclast                                                        *
* Release:  2018.11.26                                                        *
* License:  CC0 Public Domain Dedication                                      *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all          *
* copyright and related and neighboring rights to this software to the        *
* public domain worldwide.  This software is distributed without any          *
* warranty.                                                                    *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along   *
* with this software.                                                          *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.            *
\******************************************************************************/
#include "select.h"
/*
 * vector select merge (`VMRG`) formula
 *
 * This is really just a vectorizer for ternary conditional storage.
 * I've named it so because it directly maps to the VMRG op-code.
 * -- example --
 * for (i = 0; i < N; i++)
 *     if (c_pass)
 *         dest = element_a;
 *     else
 *         dest = element_b;
 */
static void merge(pi16 VD, pi16 cmp, pi16 pass, pi16 fail)
{
    register unsigned int i;
#if (0 != 0)
/* Do not use this version yet, as it still does not vectorize to SSE2. */
    for (i = 0; i < N; i++)
        VD[i] = (cmp[i] != 0) ? pass[i] : fail[i];
#else
    i16 diff[N];

    for (i = 0; i < N; i++)
        diff[i] = pass[i] - fail[i];
    for (i = 0; i < N; i++)
        VD[i] = fail[i] + cmp[i]*diff[i]; /* actually `(cmp[i] != 0)*diff[i]` */
#endif
    return;
}
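
/*
 * Illustrative sketch, not part of the simulation:  the enabled branch of
 * merge() relies on the identity
 *
 *     fail + cond*(pass - fail) == (cond ? pass : fail)
 *
 * which holds whenever cond is exactly 0 or 1 (16-bit wraparound keeps the
 * intermediate difference harmless).  The hypothetical self-test below is
 * compiled out and only demonstrates the identity on a few sample values.
 */
#if (0)
#include <assert.h>
static void merge_identity_check(void)
{
    static const i16 pass[4] = {  1, -2, 32767, -32768 };
    static const i16 fail[4] = { -1,  2,     0,  32767 };
    i16 cond, out;
    register unsigned int i;

    for (i = 0; i < 4; i++)
        for (cond = 0; cond <= 1; cond++) {
            out = (i16)(fail[i] + cond*(pass[i] - fail[i]));
            assert(out == (cond ? pass[i] : fail[i]));
        }
}
#endif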

INLINE static void do_lt(pi16 VD, pi16 VS, pi16 VT)
{
    i16 cn[N];
    i16 eq[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        eq[i] = (VS[i] == VT[i]);
    for (i = 0; i < N; i++)
        cn[i] = cf_ne[i] & cf_co[i];
    for (i = 0; i < N; i++)
        eq[i] = eq[i] & cn[i];
    for (i = 0; i < N; i++)
        cf_comp[i] = (VS[i] < VT[i]); /* less than */
    for (i = 0; i < N; i++)
        cf_comp[i] = cf_comp[i] | eq[i]; /* ... or equal (uncommonly) */

    merge(VACC_L, cf_comp, VS, VT);
    vector_copy(VD, VACC_L);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);

    vector_wipe(cf_clip);
    return;
}

INLINE static void do_eq(pi16 VD, pi16 VS, pi16 VT)
{
    register unsigned int i;

    for (i = 0; i < N; i++)
        cf_comp[i] = (VS[i] == VT[i]);
    for (i = 0; i < N; i++)
        cf_comp[i] = cf_comp[i] & (cf_ne[i] ^ 1);
#if (0)
    merge(VACC_L, cf_comp, VS, VT); /* correct but redundant */
#else
    vector_copy(VACC_L, VT);
#endif
    vector_copy(VD, VACC_L);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);

    vector_wipe(cf_clip);
    return;
}

INLINE static void do_ne(pi16 VD, pi16 VS, pi16 VT)
{
    register unsigned int i;

    for (i = 0; i < N; i++)
        cf_comp[i] = (VS[i] != VT[i]);
    for (i = 0; i < N; i++)
        cf_comp[i] = cf_comp[i] | cf_ne[i];
#if (0)
    merge(VACC_L, cf_comp, VS, VT); /* correct but redundant */
#else
    vector_copy(VACC_L, VS);
#endif
    vector_copy(VD, VACC_L);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);

    vector_wipe(cf_clip);
    return;
}

INLINE static void do_ge(pi16 VD, pi16 VS, pi16 VT)
{
    i16 ce[N];
    i16 eq[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        eq[i] = (VS[i] == VT[i]);
    for (i = 0; i < N; i++)
        ce[i] = (cf_ne[i] & cf_co[i]) ^ 1;
    for (i = 0; i < N; i++)
        eq[i] = eq[i] & ce[i];
    for (i = 0; i < N; i++)
        cf_comp[i] = (VS[i] > VT[i]); /* greater than */
    for (i = 0; i < N; i++)
        cf_comp[i] = cf_comp[i] | eq[i]; /* ... or equal (commonly) */

    merge(VACC_L, cf_comp, VS, VT);
    vector_copy(VD, VACC_L);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);

    vector_wipe(cf_clip);
    return;
}

INLINE static void do_cl(pi16 VD, pi16 VS, pi16 VT)
{
    ALIGNED u16 VB[N], VC[N];
    ALIGNED i16 eq[N], ge[N], le[N];
    ALIGNED i16 gen[N], len[N], lz[N], uz[N], sn[N];
    i16 diff[N];
    i16 cmp[N];
    register unsigned int i;

    vector_copy((pi16)VB, VS);
    vector_copy((pi16)VC, VT);
/*
    for (i = 0; i < N; i++)
        ge[i] = cf_clip[i];
    for (i = 0; i < N; i++)
        le[i] = cf_comp[i];
 */
    for (i = 0; i < N; i++)
        eq[i] = cf_ne[i] ^ 1;
    vector_copy(sn, cf_co);

/*
 * Now that we have extracted all the flags, we will essentially be masking
 * them back in where they came from redundantly, unless the corresponding
 * NOTEQUAL bit from VCO upper was not set....
 */
    for (i = 0; i < N; i++)
        VC[i] = VC[i] ^ -sn[i];
    for (i = 0; i < N; i++)
        VC[i] = VC[i] + sn[i]; /* conditional negation, if sn */
    for (i = 0; i < N; i++)
        diff[i] = VB[i] - VC[i];
    for (i = 0; i < N; i++)
        uz[i] = (VB[i] + (u16)VT[i] - 65536) >> 31;
    for (i = 0; i < N; i++)
        lz[i] = (diff[i] == 0x0000);
    for (i = 0; i < N; i++)
        gen[i] = lz[i] | uz[i];
    for (i = 0; i < N; i++)
        len[i] = lz[i] & uz[i];
    for (i = 0; i < N; i++)
        gen[i] = gen[i] & cf_vce[i];
    for (i = 0; i < N; i++)
        len[i] = len[i] & (cf_vce[i] ^ 1);
    for (i = 0; i < N; i++)
        len[i] = len[i] | gen[i];
    for (i = 0; i < N; i++)
        gen[i] = (VB[i] >= VC[i]);

    for (i = 0; i < N; i++)
        cmp[i] = eq[i] & sn[i];
    merge(le, cmp, len, cf_comp);
    for (i = 0; i < N; i++)
        cmp[i] = eq[i] & (sn[i] ^ 1);
    merge(ge, cmp, gen, cf_clip);
    merge(cmp, sn, le, ge);
    merge(VACC_L, cmp, (pi16)VC, VS);
    vector_copy(VD, VACC_L);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);

    vector_copy(cf_clip, ge);
    vector_copy(cf_comp, le);

/* CTC2    $0, $vce # zeroing RSP flags VCF[2] */
    vector_wipe(cf_vce);
    return;
}
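
/*
 * Illustrative sketch, not part of the simulation:  in do_cl() above, the
 * expression (VB[i] + (u16)VT[i] - 65536) >> 31 yields ~0 when the 16-bit
 * unsigned sum VB[i] + VT[i] produces no carry out of bit 15, and 0 when
 * it does.  This assumes the usual two's-complement int with an arithmetic
 * right shift, as the surrounding code already does.  Compiled-out demo:
 */
#if (0)
#include <assert.h>
static void do_cl_carry_mask_demo(void)
{
    const u16 a = 0x8000, b = 0x7FFF; /* 0x8000 + 0x7FFF == 0xFFFF:  no carry */
    const u16 c = 0x8000, d = 0x8000; /* 0x8000 + 0x8000 == 0x10000:  carry */

    assert(((a + b - 65536) >> 31) == ~0);
    assert(((c + d - 65536) >> 31) == 0);
}
#endif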

INLINE static void do_ch(pi16 VD, pi16 VS, pi16 VT)
{
    ALIGNED i16 VC[N];
    ALIGNED i16 eq[N], ge[N], le[N];
    ALIGNED i16 sn[N];
#ifndef _DEBUG
    i16 diff[N];
#endif
    i16 cch[N]; /* corner case hack:  -(-32768) with undefined sign */
    register unsigned int i;

    for (i = 0; i < N; i++)
        cch[i] = (VT[i] == -32768) ? ~0 : 0; /* -(-32768) might not be >= 0. */
    vector_copy(VC, VT);
    for (i = 0; i < N; i++)
        sn[i] = VS[i] ^ VT[i];
    for (i = 0; i < N; i++)
        sn[i] = (sn[i] < 0) ? ~0 : 0; /* signed SRA (sn), 15 */
    for (i = 0; i < N; i++)
        VC[i] ^= sn[i]; /* if (sn == ~0) {VT = ~VT;} else {VT = VT;} */
    for (i = 0; i < N; i++)
        cf_vce[i] = (VS[i] == VC[i]); /* 2's complement:  VC = -VT - 1 = ~VT */
    for (i = 0; i < N; i++)
        cf_vce[i] &= sn[i];

/*
 * if (sign flag), then converts ~(VT) into -(VT) a.k.a. ~(VT) - (-1)
 * Note that if (VT == INT16_MIN) a.k.a. cch[i], -(-32768) is undefined.
 */
    for (i = 0; i < N; i++)
        VC[i] -= sn[i] & ~cch[i]; /* cch[i] causes -(-32768) to stay ~-32768. */
    for (i = 0; i < N; i++)
        eq[i] = (VS[i] == VC[i]) & ~cch[i]; /* VS = -(-32768) never happens. */
    for (i = 0; i < N; i++)
        eq[i] |= cf_vce[i];

#ifdef _DEBUG
    for (i = 0; i < N; i++)
        le[i] = sn[i] ? (VS[i] <= VC[i]) : (VC[i] < 0);
    for (i = 0; i < N; i++)
        ge[i] = sn[i] ? (VC[i] > 0x0000) : (VS[i] >= VC[i]);
#elif (0)
    for (i = 0; i < N; i++)
        le[i] = sn[i] ? (VT[i] <= -VS[i]) : (VT[i] <= ~0x0000);
    for (i = 0; i < N; i++)
        ge[i] = sn[i] ? (~0x0000 >= VT[i]) : (VS[i] >= VT[i]);
#else
    for (i = 0; i < N; i++)
        diff[i] = sn[i] | VS[i];
    for (i = 0; i < N; i++)
        ge[i] = (diff[i] >= VT[i]);
    for (i = 0; i < N; i++)
        sn[i] = (u16)(sn[i]) >> 15; /* ~0 to 1, 0 to 0 */
    for (i = 0; i < N; i++)
        diff[i] = VC[i] - VS[i];
    for (i = 0; i < N; i++)
        diff[i] = (diff[i] >= 0);
    for (i = 0; i < N; i++)
        le[i] = (VT[i] < 0);
    merge(le, sn, diff, le);
#endif
    merge(cf_comp, sn, le, ge);
    merge(VACC_L, cf_comp, VC, VS);
    vector_copy(VD, VACC_L);

    vector_copy(cf_clip, ge);
    vector_copy(cf_comp, le);
    for (i = 0; i < N; i++)
        cf_ne[i] = eq[i] ^ 1;
    vector_copy(cf_co, sn);
    return;
}
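
/*
 * Illustrative sketch, not part of the simulation:  do_ch() above negates
 * VT lane-wise without branching through the two's-complement identity
 *
 *     -x == (x ^ ~0) - ~0    (flip all bits, then subtract -1)
 *
 * applied only where the sign mask sn is ~0.  The single exception is
 * x == -32768, whose negation does not fit in an i16; the cch[] mask pins
 * that lane at the one's-complement value instead.  Compiled-out demo:
 */
#if (0)
#include <assert.h>
static void conditional_negate_demo(void)
{
    const i16 x = 1234;
    i16 sn;

    sn = ~0; /* sign mask set:  conditional negation engages */
    assert((i16)((x ^ sn) - sn) == -1234);
    sn = 0;  /* sign mask clear:  value passes through unchanged */
    assert((i16)((x ^ sn) - sn) == 1234);
}
#endif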

INLINE static void do_cr(pi16 VD, pi16 VS, pi16 VT)
{
    ALIGNED i16 ge[N], le[N], sn[N];
    ALIGNED i16 VC[N];
    i16 cmp[N];
    register unsigned int i;

    vector_copy(VC, VT);
    for (i = 0; i < N; i++)
        sn[i] = VS[i] ^ VT[i];
    for (i = 0; i < N; i++)
        sn[i] = (sn[i] < 0) ? ~0 : 0;
#ifdef _DEBUG
    for (i = 0; i < N; i++)
        le[i] = sn[i] ? (VT[i] <= ~VS[i]) : (VT[i] <= ~0x0000);
    for (i = 0; i < N; i++)
        ge[i] = sn[i] ? (~0x0000 >= VT[i]) : (VS[i] >= VT[i]);
#else
    for (i = 0; i < N; i++)
        cmp[i] = ~(VS[i] & sn[i]);
    for (i = 0; i < N; i++)
        le[i] = (VT[i] <= cmp[i]);
    for (i = 0; i < N; i++)
        cmp[i] = (VS[i] | sn[i]);
    for (i = 0; i < N; i++)
        ge[i] = (cmp[i] >= VT[i]);
#endif
    for (i = 0; i < N; i++)
        VC[i] ^= sn[i]; /* if (sn == ~0) {VT = ~VT;} else {VT = VT;} */
    merge(cmp, sn, le, ge);
    merge(VACC_L, cmp, VC, VS);
    vector_copy(VD, VACC_L);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);

    vector_copy(cf_clip, ge);
    vector_copy(cf_comp, le);

/* CTC2    $0, $vce # zeroing RSP flags VCF[2] */
    vector_wipe(cf_vce);
    return;
}
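
/*
 * Illustrative sketch, not part of the simulation:  the release path of
 * do_cr() above folds the two sign cases from the _DEBUG branch into
 * branchless masks:  ~(VS & sn) equals ~VS when sn == ~0 but ~0 when
 * sn == 0, and (VS | sn) equals ~0 when sn == ~0 but VS when sn == 0.
 * The compiled-out check below exercises both forms on sample lanes:
 */
#if (0)
#include <assert.h>
static void do_cr_mask_equivalence_demo(void)
{
    static const i16 vs[3] = { 100, -100, 32767 };
    static const i16 vt[3] = { -50,   50,     1 };
    i16 sn;
    register unsigned int i;
    int s;

    for (i = 0; i < 3; i++)
        for (s = 0; s < 2; s++) {
            sn = s ? ~0 : 0;
            assert((vt[i] <= (i16)~(vs[i] & sn)) ==
                   (sn ? (vt[i] <= (i16)~vs[i]) : (vt[i] <= ~0x0000)));
            assert(((i16)(vs[i] | sn) >= vt[i]) ==
                   (sn ? (~0x0000 >= vt[i]) : (vs[i] >= vt[i])));
        }
}
#endif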

INLINE static void do_mrg(pi16 VD, pi16 VS, pi16 VT)
{
    merge(VACC_L, cf_comp, VS, VT);
    vector_copy(VD, VACC_L);
    return;
}

VECTOR_OPERATION VLT(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_lt(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VEQ(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_eq(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VNE(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_ne(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VGE(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_ge(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VCL(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_cl(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VCH(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_ch(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VCR(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_cr(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VMRG(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_mrg(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}