/******************************************************************************\
* Project:  MSP Simulation Layer for Vector Unit Computational Adds           *
* Authors:  Iconoclast                                                        *
* Release:  2018.03.18                                                        *
* License:  CC0 Public Domain Dedication                                      *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all copyright*
* and related and neighboring rights to this software to the public domain    *
* worldwide.  This software is distributed without any warranty.              *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along   *
* with this software.                                                          *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.            *
\******************************************************************************/

#include <string.h>

#include "add.h"

#ifdef ARCH_MIN_SSE2
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 max, min;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    /*
     * Due to premature clamping in between adds, sometimes we need to add
     * the LESSER of two integers, either VS or VT, to the carry-in flag
     * matching the current vector register slice, BEFORE finally adding the
     * greater integer.
     */
    max = _mm_max_epi16(dst, src);
    min = _mm_min_epi16(dst, src);

    min = _mm_adds_epi16(min, vco);
    max = _mm_adds_epi16(max, min);
    _mm_store_si128((v16 *)VD, max);
    return;
}
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 dif, res, xmm;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    res = _mm_subs_epi16(src, dst);

    /*
     * Due to premature clamps in-between subtracting two of the three
     * operands, we must be careful not to offset the result accidentally
     * when subtracting the corresponding VCO flag AFTER the saturation from
     * doing (VS - VT).
     */
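    /*
     * In effect (one reading of the mask logic below, stated cautiously):
     * the borrow may only be subtracted from lanes where (VS - VT) did not
     * already overflow past +32767.  A lane overflows exactly when VS >= 0,
     * VT < 0 and the wrapped 16-bit difference comes out negative; in such
     * a lane the saturated result is already +32767, and subtracting the
     * borrow again would wrongly give +32766.
     */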
    dif = _mm_add_epi16(res, vco);
    dif = _mm_xor_si128(dif, res); /* Adding one suddenly inverts the sign? */
    dif = _mm_and_si128(dif, dst); /* Sign change due to subtracting a neg. */
    xmm = _mm_sub_epi16(src, dst);
    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
    xmm = _mm_and_si128(xmm, src); /* VS - VT != INT16_MIN; VS - VT >= +32768 */
    xmm = _mm_srli_epi16(xmm, 15); /* src = (INT16_MAX + 1 === INT16_MIN) ? */

    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */
    res = _mm_subs_epi16(res, xmm);
    _mm_store_si128((v16 *)VD, res);
    return;
}
#else
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    i32 sum[N];
    i16 hi[N], lo[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        sum[i] = VS[i] + VT[i] + cf_co[i];
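    /*
     * Sign-mask clamping (a note on the loops below):  lo[i] becomes 0xFFFF
     * exactly when sum[i] < -32768 (sum[i] + 0x8000 goes negative), and
     * hi[i] becomes 0xFFFF exactly when sum[i] > +32767, via the arithmetic
     * right shift of the 32-bit sign bit.  The masks then force VD[i] to
     * -32768 or +32767 without any branches.
     */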
    for (i = 0; i < N; i++)
        lo[i] = (sum[i] + 0x8000) >> 31;
    for (i = 0; i < N; i++)
        hi[i] = (0x7FFF - sum[i]) >> 31;
    vector_copy(VD, VACC_L);
    for (i = 0; i < N; i++)
        VD[i] &= ~lo[i];
    for (i = 0; i < N; i++)
        VD[i] |= hi[i];
    for (i = 0; i < N; i++)
        VD[i] ^= 0x8000 & (hi[i] | lo[i]);
    return;
}
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    i32 dif[N];
    i16 hi[N], lo[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        dif[i] = VS[i] - VT[i] - cf_co[i];
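    /* Same sign-mask clamp as SIGNED_CLAMP_ADD above, applied to dif[]. */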
    for (i = 0; i < N; i++)
        lo[i] = (dif[i] + 0x8000) >> 31;
    for (i = 0; i < N; i++)
        hi[i] = (0x7FFF - dif[i]) >> 31;
    vector_copy(VD, VACC_L);
    for (i = 0; i < N; i++)
        VD[i] &= ~lo[i];
    for (i = 0; i < N; i++)
        VD[i] |= hi[i];
    for (i = 0; i < N; i++)
        VD[i] ^= 0x8000 & (hi[i] | lo[i]);
    return;
}
#endif

INLINE static void clr_ci(pi16 VD, pi16 VS, pi16 VT)
{ /* clear CARRY and carry in to accumulators */
    register unsigned int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] + VT[i] + cf_co[i];
    SIGNED_CLAMP_ADD(VD, VS, VT);

    /* CTC2 $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);
    return;
}

INLINE static void clr_bi(pi16 VD, pi16 VS, pi16 VT)
{ /* clear CARRY and borrow in to accumulators */
    register unsigned int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] - VT[i] - cf_co[i];
    SIGNED_CLAMP_SUB(VD, VS, VT);

    /* CTC2 $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);
    return;
}

/*
 * -1:  VT *= -1, because VS < 0 // VT ^= -2 if even, or ^= -1, += 1
 *  0:  VT *=  0, because VS = 0 // VT ^= VT
 * +1:  VT *= +1, because VS > 0 // VT ^=  0
 * VT ^= -1, "negate" -32768 as ~+32767 (corner case hack for N64 SP)
 */
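/*
 * Illustrative corner case (worked through, not from the original comment):
 * with VS < 0 and VT = -32768, the multiply by -1 wraps back to -32768 in
 * 16 bits, and the cch subtraction below turns it into -32769 mod 2^16 =
 * +32767, matching the RSP's abs(-32768) == +32767 behavior.
 */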
INLINE static void do_abs(pi16 VD, pi16 VS, pi16 VT)
{
    i16 neg[N], pos[N];
    i16 nez[N], cch[N]; /* corner case hack -- abs(-32768) == +32767 */
    ALIGNED i16 res[N];
    register unsigned int i;

    vector_copy(res, VT);
    for (i = 0; i < N; i++)
        cch[i] = (res[i] == -32768);

    for (i = 0; i < N; i++)
        neg[i] = (VS[i] < 0x0000);
    for (i = 0; i < N; i++)
        pos[i] = (VS[i] > 0x0000);
//  vector_wipe(nez);
    memset(&nez, 0, sizeof(nez));

    for (i = 0; i < N; i++)
        nez[i] -= neg[i];
    for (i = 0; i < N; i++)
        nez[i] += pos[i];

    for (i = 0; i < N; i++)
        res[i] *= nez[i];
    for (i = 0; i < N; i++)
        res[i] -= cch[i];
    vector_copy(VACC_L, res);
    vector_copy(VD, VACC_L);
    return;
}

INLINE static void set_co(pi16 VD, pi16 VS, pi16 VT)
{ /* set CARRY and carry out from sum */
    i32 sum[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        sum[i] = (u16)(VS[i]) + (u16)(VT[i]);
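    /*
     * With both operands widened as unsigned, sum[i] lies in [0, 131070],
     * so the shift by 16 further down yields exactly 0 or 1 as the
     * carry-out flag.
     */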
    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] + VT[i];
    vector_copy(VD, VACC_L);

    vector_wipe(cf_ne);
    for (i = 0; i < N; i++)
        cf_co[i] = sum[i] >> 16; /* native: (sum[i] > +65535) */
    return;
}

INLINE static void set_bo(pi16 VD, pi16 VS, pi16 VT)
{ /* set CARRY and borrow out from difference */
    i32 dif[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        dif[i] = (u16)(VS[i]) - (u16)(VT[i]);
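    /*
     * dif[i] goes negative exactly when the unsigned VS element is smaller
     * than the unsigned VT element, i.e. when the subtraction borrows;
     * cf_ne additionally records which elements differed at all.  Both
     * halves of VCF[0] are consumed later by VSUBC-dependent compares.
     */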
    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] - VT[i];
    for (i = 0; i < N; i++)
        cf_ne[i] = (VS[i] != VT[i]);
    for (i = 0; i < N; i++)
        cf_co[i] = (dif[i] < 0);
    vector_copy(VD, VACC_L);
    return;
}
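
/*
 * Each VECTOR_OPERATION wrapper below follows the same marshalling pattern:
 * under ARCH_MIN_SSE2 the incoming XMM vectors are spilled to ALIGNED i16
 * arrays so the scalar helpers can index elements, and the result array is
 * reloaded into a register on return; without SSE2, v16 is presumed here to
 * act as an element pointer, and the result goes out through V_result.
 */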

VECTOR_OPERATION VADD(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    clr_ci(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSUB(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    clr_bi(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VABS(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_abs(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VADDC(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    set_co(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSUBC(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    set_bo(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSAW(v16 vs, v16 vt)
{
    unsigned int element;

    element = 0xF & (inst_word >> 21);
    element ^= 0x8; /* Convert scalar whole elements 8:F to 0:7. */
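
    /*
     * VSAW encodes its accumulator selector in the instruction's element
     * field:  e = 8, 9 and 0xA pick the high, middle and low accumulator
     * slices, which the XOR above maps to indices 0, 1 and 2 into VACC.
     * Any value above 2 after the remap names no accumulator slice, hence
     * the guard below.
     */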
    if (element > 0x2) {
        message("VSAW\nIllegal mask.");
#ifdef ARCH_MIN_SSE2
        vector_wipe(vs);
#else
        vector_wipe(V_result);
#endif
    } else {
#ifdef ARCH_MIN_SSE2
        vs = *(v16 *)VACC[element];
#else
        vector_copy(V_result, VACC[element]);
#endif
    }
#ifdef ARCH_MIN_SSE2
    return (vt = vs);
#else
    if (vt == vs)
        return; /* -Wunused-but-set-parameter */
    return;
#endif
}