/******************************************************************************\
* Project:  MSP Simulation Layer for Vector Unit Computational Adds           *
* Authors:  Iconoclast                                                        *
* Release:  2018.03.18                                                        *
* License:  CC0 Public Domain Dedication                                      *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all copyright *
* and related and neighboring rights to this software to the public domain    *
* worldwide.  This software is distributed without any warranty.              *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along   *
* with this software.                                                         *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.            *
\******************************************************************************/

#include <string.h>

#include "add.h"

#ifdef ARCH_MIN_SSE2
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 max, min;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

/*
 * Due to premature clamping in between adds, sometimes we need to add the
 * LESSER of two integers, either VS or VT, to the carry-in flag matching the
 * current vector register slice, BEFORE finally adding the greater integer.
 */
    max = _mm_max_epi16(dst, src);
    min = _mm_min_epi16(dst, src);

    min = _mm_adds_epi16(min, vco);
    max = _mm_adds_epi16(max, min);
    _mm_store_si128((v16 *)VD, max);
    return;
}

static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 dif, res, xmm;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    res = _mm_subs_epi16(src, dst);

/*
 * Due to premature clamps in-between subtracting two of the three operands,
 * we must be careful not to offset the result accidentally when subtracting
 * the corresponding VCO flag AFTER the saturation from doing (VS - VT).
 */
    dif = _mm_add_epi16(res, vco);
    dif = _mm_xor_si128(dif, res); /* Adding one suddenly inverts the sign? */
    dif = _mm_and_si128(dif, dst); /* Sign change due to subtracting a neg. */
    xmm = _mm_sub_epi16(src, dst);
    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
    xmm = _mm_and_si128(xmm, src); /* VS + VT != INT16_MIN; VS + VT >= +32768 */
    xmm = _mm_srli_epi16(xmm, 15); /* src = (INT16_MAX + 1 === INT16_MIN) ? */
    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */

    res = _mm_subs_epi16(res, xmm);
    _mm_store_si128((v16 *)VD, res);
    return;
}
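
/*
 * Worked example of the ordering above (illustrative; not from the original
 * sources):  VS = -32768, VT = -1, VCO carry-in = 1.  The exact result is
 * clamp(-32768 - 1 + 1) = -32768.  Summing VS and VT first would saturate
 * too early:  adds(-32768, -1) = -32768, and adding the carry afterwards
 * yields -32767.  Adding the carry to the LESSER operand first cannot
 * distort the result, since min + 1 only saturates when both operands are
 * already +32767:  adds(-32768, 1) = -32767, then adds(-1, -32767) = -32768.
 */
#if 0
/*
 * Compile-excluded reference sketch of the same computation done the exact
 * way:  widen to 32 bits, add once, clamp once.  The helper name is local
 * to this example and is not part of the module's interface.
 */
static i16 clamp_add_reference(i16 vs, i16 vt, i16 carry_in)
{
    const i32 sum = (i32)vs + (i32)vt + (i32)carry_in;

    if (sum < -32768)
        return (i16)(-32768);
    if (sum > +32767)
        return (i16)(+32767);
    return (i16)sum;
}
#endif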
#else
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{
    i32 sum[N];
    i16 hi[N], lo[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        sum[i] = VS[i] + VT[i] + cf_co[i];
    for (i = 0; i < N; i++)
        lo[i] = (sum[i] + 0x8000) >> 31;
    for (i = 0; i < N; i++)
        hi[i] = (0x7FFF - sum[i]) >> 31;
    vector_copy(VD, VACC_L);
    for (i = 0; i < N; i++)
        VD[i] &= ~lo[i];
    for (i = 0; i < N; i++)
        VD[i] |=  hi[i];
    for (i = 0; i < N; i++)
        VD[i] ^= 0x8000 & (hi[i] | lo[i]);
    return;
}

static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    i32 dif[N];
    i16 hi[N], lo[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        dif[i] = VS[i] - VT[i] - cf_co[i];
    for (i = 0; i < N; i++)
        lo[i] = (dif[i] + 0x8000) >> 31;
    for (i = 0; i < N; i++)
        hi[i] = (0x7FFF - dif[i]) >> 31;
    vector_copy(VD, VACC_L);
    for (i = 0; i < N; i++)
        VD[i] &= ~lo[i];
    for (i = 0; i < N; i++)
        VD[i] |=  hi[i];
    for (i = 0; i < N; i++)
        VD[i] ^= 0x8000 & (hi[i] | lo[i]);
    return;
}
#endif

INLINE static void clr_ci(pi16 VD, pi16 VS, pi16 VT)
{ /* clear CARRY and carry in to accumulators */
    register unsigned int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] + VT[i] + cf_co[i];
    SIGNED_CLAMP_ADD(VD, VS, VT);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);
    return;
}

INLINE static void clr_bi(pi16 VD, pi16 VS, pi16 VT)
{ /* clear CARRY and borrow in to accumulators */
    register unsigned int i;

    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] - VT[i] - cf_co[i];
    SIGNED_CLAMP_SUB(VD, VS, VT);

/* CTC2    $0, $vco # zeroing RSP flags VCF[0] */
    vector_wipe(cf_ne);
    vector_wipe(cf_co);
    return;
}

/*
 * -1:  VT *= -1, because VS < 0 // VT ^= -2 if even, or ^= -1, += 1
 *  0:  VT *=  0, because VS = 0 // VT ^= VT
 * +1:  VT *= +1, because VS > 0 // VT ^= 0
 * VT ^= -1, "negate" -32768 as ~+32767 (corner case hack for N64 SP)
 */
INLINE static void do_abs(pi16 VD, pi16 VS, pi16 VT)
{
    i16 neg[N], pos[N];
    i16 nez[N], cch[N]; /* corner case hack -- abs(-32768) == +32767 */
    ALIGNED i16 res[N];
    register unsigned int i;

    vector_copy(res, VT);
    for (i = 0; i < N; i++)
        neg[i] = (VS[i] < 0x0000);
    for (i = 0; i < N; i++)
        pos[i] = (VS[i] > 0x0000);

 /* vector_wipe(nez); */
    memset(&nez, 0, sizeof(nez));

    for (i = 0; i < N; i++)
        nez[i] -= neg[i];
    for (i = 0; i < N; i++)
        nez[i] += pos[i];

    for (i = 0; i < N; i++)
        res[i] *= nez[i]; /* -VT, 0 or +VT, depending on the sign of VS */
    for (i = 0; i < N; i++)
        cch[i] = (res[i] == -32768); /* tested after the multiply, so VS == 0 still yields 0 */
    for (i = 0; i < N; i++)
        res[i] -= cch[i]; /* 0x8000 wrapped around, so pull it back to +32767 */

    vector_copy(VACC_L, res);
    vector_copy(VD, VACC_L);
    return;
}

INLINE static void set_co(pi16 VD, pi16 VS, pi16 VT)
{ /* set CARRY and carry out from sum */
    i32 sum[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        sum[i] = (u16)(VS[i]) + (u16)(VT[i]);
    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] + VT[i];
    vector_copy(VD, VACC_L);

    vector_wipe(cf_ne);
    for (i = 0; i < N; i++)
        cf_co[i] = sum[i] >> 16; /* native:  (sum[i] > +65535) */
    return;
}

INLINE static void set_bo(pi16 VD, pi16 VS, pi16 VT)
{ /* set CARRY and borrow out from difference */
    i32 dif[N];
    register unsigned int i;

    for (i = 0; i < N; i++)
        dif[i] = (u16)(VS[i]) - (u16)(VT[i]);
    for (i = 0; i < N; i++)
        VACC_L[i] = VS[i] - VT[i];
    for (i = 0; i < N; i++)
        cf_ne[i] = (VS[i] != VT[i]);
    for (i = 0; i < N; i++)
        cf_co[i] = (dif[i] < 0);
    vector_copy(VD, VACC_L);
    return;
}
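
/*
 * Worked example for the carry/borrow flags set above (illustrative only):
 * with VS[i] = 0xFFFF and VT[i] = 0x0001 treated as unsigned halves, set_co
 * computes 0xFFFF + 0x0001 = 0x10000, so VACC_L[i] = 0x0000 and
 * cf_co[i] = 0x10000 >> 16 = 1.  With VS[i] = 0x0000 and VT[i] = 0x0001,
 * set_bo computes 0x0000 - 0x0001 = -1 < 0, so VACC_L[i] = 0xFFFF,
 * cf_co[i] = 1 (borrow out) and cf_ne[i] = 1 (operands differ).  clr_ci and
 * clr_bi then consume cf_co as the carry/borrow in and wipe both flags.
 */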
VECTOR_OPERATION VADD(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    clr_ci(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSUB(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    clr_bi(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VABS(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    do_abs(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VADDC(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    set_co(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSUBC(v16 vs, v16 vt)
{
    ALIGNED i16 VD[N];
#ifdef ARCH_MIN_SSE2
    ALIGNED i16 VS[N], VT[N];

    *(v16 *)VS = vs;
    *(v16 *)VT = vt;
#else
    v16 VS, VT;

    VS = vs;
    VT = vt;
#endif
    set_bo(VD, VS, VT);
#ifdef ARCH_MIN_SSE2
    COMPILER_FENCE();
    vs = *(v16 *)VD;
    return (vs);
#else
    vector_copy(V_result, VD);
    return;
#endif
}

VECTOR_OPERATION VSAW(v16 vs, v16 vt)
{
    unsigned int element;

    element = 0xF & (inst_word >> 21);
    element ^= 0x8; /* Convert scalar whole elements 8:F to 0:7. */
    if (element > 0x2) {
        message("VSAW\nIllegal mask.");
#ifdef ARCH_MIN_SSE2
        vector_wipe(vs);
#else
        vector_wipe(V_result);
#endif
    } else {
#ifdef ARCH_MIN_SSE2
        vs = *(v16 *)VACC[element];
#else
        vector_copy(V_result, VACC[element]);
#endif
    }
#ifdef ARCH_MIN_SSE2
    return (vt = vs);
#else
    if (vt == vs)
        return; /* -Wunused-but-set-parameter */
    return;
#endif
}
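
/*
 * Usage note (illustrative; not part of this module):  VADDC/VSUBC compute
 * the low halves and latch the carry/borrow into $vco, while VADD/VSUB then
 * consume and clear that flag, which is how RSP microcode chains the pair
 * into wider adds and subtracts.  The compile-excluded sketch below mirrors
 * that flow for a single 32-bit lane in plain C; every name in it is local
 * to the example.
 */
#if 0
static void add32_by_halves_example(void)
{
    u16 lo_a = 0x8000, lo_b = 0x9000;       /* low 16-bit halves            */
    i16 hi_a = 0x0001, hi_b = 0x0002;       /* high 16-bit halves           */
    i32 lo_sum, carry;
    u16 lo_out;
    i16 hi_out;

    lo_sum = (i32)lo_a + (i32)lo_b;         /* VADDC sum:  0x11000          */
    carry  = lo_sum >> 16;                  /* $vco carry-out:  1           */
    lo_out = (u16)lo_sum;                   /* low result half:  0x1000     */
    hi_out = (i16)(hi_a + hi_b + carry);    /* VADD high half:  0x0004      */

    (void)lo_out;
    (void)hi_out;
}
#endif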