/******************************************************************************\
* Project:  MSP Emulation Layer for Vector Unit Computational Operations       *
* Authors:  Iconoclast                                                         *
* Release:  2016.03.23                                                         *
* License:  CC0 Public Domain Dedication                                       *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all copyright *
* and related and neighboring rights to this software to the public domain     *
* worldwide. This software is distributed without any warranty.                *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along    *
* with this software.                                                          *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
\******************************************************************************/
#ifndef _VU_H_
#define _VU_H_

#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
#include <emmintrin.h>
#endif

#include "../my_types.h"

#define N      8
/* N:  number of processor elements in SIMD processor */

/*
 * Illegal, unaligned LWC2 operations on the RSP may write past the terminal
 * byte of a vector, while SWC2 operations may have to wrap stores around
 * from the end to the start of a vector.  Both of these risk out-of-bounds
 * memory access, but by doubling the number of bytes allocated (shift left)
 * for each vector register, we could stabilize and probably optimize this.
 */
#if 0
#define VR_STATIC_WRAPAROUND    0
#else
#define VR_STATIC_WRAPAROUND    1
#endif

/*
 * We are going to need this for vector operations doing scalar things.
 * The divides and VSAW need bit-wise information from the instruction word.
 */
extern u32 inst_word;

/*
 * RSP virtual registers (of vector unit)
 * The most important are the 32 general-purpose vector registers.
 * The correct way to store these accurately is as big-endian vectors.
 *
 * For ?WC2 we may need byte-precision access just as directly.
 * This is handled by the `VU_S` and `VU_B` macros defined in `rsp.h`.
 */
ALIGNED extern i16 VR[32][N << VR_STATIC_WRAPAROUND];

/*
 * The RSP accumulator is a vector of eight 48-bit integers.  Nearly all of
 * the vector operations access it, but it exists primarily for the
 * multiply-accumulate operations.
 *
 * Access dimensions would be VACC[8][3] but are inverted for SIMD benefits.
 */
ALIGNED extern i16 VACC[3][N];

/*
 * When compiling without SSE2, we need to use a pointer to a destination
 * vector instead of an XMM register in the return slot of the function.
 * The vector "result" register is emulated as a shared global serving this
 * purpose, rather than as the return slot of a function call.
 */
#ifndef ARCH_MIN_SSE2
ALIGNED extern i16 V_result[N];
#endif

/*
 * accumulator-indexing macros
 */
#define HI      00
#define MD      01
#define LO      02

#define VACC_L      (VACC[LO])
#define VACC_M      (VACC[MD])
#define VACC_H      (VACC[HI])

#define ACC_L(i)    (VACC_L)[i]
#define ACC_M(i)    (VACC_M)[i]
#define ACC_H(i)    (VACC_H)[i]
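/*
 * Illustration only (not part of the original interface):  a sketch of how
 * the full, sign-extended 48-bit accumulator for element `i` could be read
 * back out of the three 16-bit slices, assuming a 64-bit `long long` type.
 * Guarded with `#if 0` so it documents the layout without being compiled.
 */
#if 0
static long long acc_whole_48(unsigned int i)
{
    long long acc;

    acc  = (long long)ACC_H(i) << 32;      /* bits 47..32, carries the sign */
    acc |= (long long)(u16)ACC_M(i) << 16; /* bits 31..16, unsigned         */
    acc |= (long long)(u16)ACC_L(i);       /* bits 15..0,  unsigned         */
    return acc;
}
#endif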
#ifdef ARCH_MIN_SSE2
typedef __m128i v16;
#else
typedef pi16 v16;
#endif

#ifdef ARCH_MIN_SSE2
#define VECTOR_OPERATION    v16
#else
#define VECTOR_OPERATION    void
#endif
#define VECTOR_EXTERN       extern VECTOR_OPERATION

NOINLINE extern void message(const char* body);

VECTOR_EXTERN (*COP2_C2[8*7 + 8])(v16, v16);

#ifdef ARCH_MIN_SSE2

#define vector_copy(vd, vs) { \
    *(v16 *)(vd) = *(v16 *)(vs); }
#define vector_wipe(vd) { \
    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }
#define vector_fill(vd) { \
    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }

#define vector_and(vd, vs) { \
    *(v16 *)&(vd) = _mm_and_si128(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_or(vd, vs) { \
    *(v16 *)&(vd) = _mm_or_si128(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_xor(vd, vs) { \
    *(v16 *)&(vd) = _mm_xor_si128(*(v16 *)&(vd), *(v16 *)&(vs)); }

/*
 * Every competent vector unit should have at least two vector comparison
 * operations:  EQ and LT/GT.  (MMX only gives us GT; SSE's LT is just a GT
 * with the operands swapped.)
 *
 * The SSE2 versions below are the defaults when compiling for x86.
 */
#define vector_cmplt(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmplt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_cmpeq(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_cmpgt(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }

#else

#define vector_copy(vd, vs) { \
    (vd)[0] = (vs)[0]; \
    (vd)[1] = (vs)[1]; \
    (vd)[2] = (vs)[2]; \
    (vd)[3] = (vs)[3]; \
    (vd)[4] = (vs)[4]; \
    (vd)[5] = (vs)[5]; \
    (vd)[6] = (vs)[6]; \
    (vd)[7] = (vs)[7]; \
}
#define vector_wipe(vd) { \
    (vd)[0] = 0x0000; \
    (vd)[1] = 0x0000; \
    (vd)[2] = 0x0000; \
    (vd)[3] = 0x0000; \
    (vd)[4] = 0x0000; \
    (vd)[5] = 0x0000; \
    (vd)[6] = 0x0000; \
    (vd)[7] = 0x0000; \
}
#define vector_fill(vd) { \
    (vd)[0] = ~0x0000; \
    (vd)[1] = ~0x0000; \
    (vd)[2] = ~0x0000; \
    (vd)[3] = ~0x0000; \
    (vd)[4] = ~0x0000; \
    (vd)[5] = ~0x0000; \
    (vd)[6] = ~0x0000; \
    (vd)[7] = ~0x0000; \
}

#define vector_and(vd, vs) { \
    (vd)[0] &= (vs)[0]; \
    (vd)[1] &= (vs)[1]; \
    (vd)[2] &= (vs)[2]; \
    (vd)[3] &= (vs)[3]; \
    (vd)[4] &= (vs)[4]; \
    (vd)[5] &= (vs)[5]; \
    (vd)[6] &= (vs)[6]; \
    (vd)[7] &= (vs)[7]; \
}
#define vector_or(vd, vs) { \
    (vd)[0] |= (vs)[0]; \
    (vd)[1] |= (vs)[1]; \
    (vd)[2] |= (vs)[2]; \
    (vd)[3] |= (vs)[3]; \
    (vd)[4] |= (vs)[4]; \
    (vd)[5] |= (vs)[5]; \
    (vd)[6] |= (vs)[6]; \
    (vd)[7] |= (vs)[7]; \
}
#define vector_xor(vd, vs) { \
    (vd)[0] ^= (vs)[0]; \
    (vd)[1] ^= (vs)[1]; \
    (vd)[2] ^= (vs)[2]; \
    (vd)[3] ^= (vs)[3]; \
    (vd)[4] ^= (vs)[4]; \
    (vd)[5] ^= (vs)[5]; \
    (vd)[6] ^= (vs)[6]; \
    (vd)[7] ^= (vs)[7]; \
}

/* Each lane receives its own comparison result:  ~0 if true, 0 if false. */
#define vector_cmplt(vd, vs) { \
    (vd)[0] = ((vd)[0] < (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] < (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] < (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] < (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] < (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] < (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] < (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] < (vs)[7]) ? ~0x0000 : 0x0000; \
}
#define vector_cmpeq(vd, vs) { \
    (vd)[0] = ((vd)[0] == (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] == (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] == (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] == (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] == (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] == (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] == (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] == (vs)[7]) ? ~0x0000 : 0x0000; \
}
#define vector_cmpgt(vd, vs) { \
    (vd)[0] = ((vd)[0] > (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] > (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] > (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] > (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] > (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] > (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] > (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] > (vs)[7]) ? ~0x0000 : 0x0000; \
}
#endif
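/*
 * Usage sketch (illustration only; `vd` and `vs` are hypothetical register
 * indices, not names from this header):  each macro above reads and writes
 * the eight architected 16-bit lanes of one vector, so callers pass whole
 * VR or VACC rows.  Guarded with `#if 0` so it only documents the calling
 * convention shared by the SSE2 and plain-C variants.
 */
#if 0
static void vector_macro_usage(unsigned int vd, unsigned int vs)
{
    vector_copy(VR[vd], VR[vs]);  /* VR[vd] = VR[vs], all eight lanes     */
    vector_xor(VR[vd], VR[vs]);   /* VR[vd] ^= VR[vs]:  every lane now 0  */
    vector_fill(VR[vd]);          /* VR[vd] = ~0 in every lane            */
    vector_cmpeq(VR[vd], VR[vs]); /* per-lane mask:  ~0 if equal, else 0  */
}
#endif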
/*
 * Many vector units have pairs of "vector condition flags" registers.
 * In SGI's vector unit implementation, these are denoted as the
 * "vector control registers" under coprocessor 2.
 *
 * VCF-0 is the carry-out flags register:  $vco.
 * VCF-1 is the compare code flags register:  $vcc.
 * VCF-2 is the compare extension flags register:  $vce.
 * There is no fourth RSP flags register.
 */
extern u16 VCO;
extern u16 VCC;
extern u8 VCE;

ALIGNED extern i16 cf_ne[N];
ALIGNED extern i16 cf_co[N];
ALIGNED extern i16 cf_clip[N];
ALIGNED extern i16 cf_comp[N];
ALIGNED extern i16 cf_vce[N];

extern u16 get_VCO(void);
extern u16 get_VCC(void);
extern u8 get_VCE(void);

extern void set_VCO(u16 vco);
extern void set_VCC(u16 vcc);
extern void set_VCE(u8 vce);

/*
 * shuffling convenience macros for Intel SIMD
 * An 8-bit shuffle imm. of SHUFFLE(0, 1, 2, 3) should be a null operation.
 */
#define B(x)                ((x) & 3)
#define SHUFFLE(a,b,c,d)    ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0))
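/*
 * Worked example (illustration only):  SHUFFLE lists source lanes in
 * destination order, so SHUFFLE(a, b, c, d) equals _MM_SHUFFLE(d, c, b, a).
 * SHUFFLE(0, 1, 2, 3) evaluates to 0xE4, the identity permutation, which is
 * why it is a null operation with the 16-bit shuffle intrinsics, while
 * SHUFFLE(0, 0, 0, 0) broadcasts lane 0.  Guarded with `#if 0`; the body
 * assumes the ARCH_MIN_SSE2 path, where v16 is __m128i.
 */
#if 0
static v16 shuffle_null_operation(v16 vs)
{
    vs = _mm_shufflelo_epi16(vs, SHUFFLE(0, 1, 2, 3)); /* 0xE4:  unchanged */
    vs = _mm_shufflehi_epi16(vs, SHUFFLE(0, 1, 2, 3)); /* 0xE4:  unchanged */
    return vs;
}
#endif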
/*
 * RSP vector opcode function names are currently just literally named after
 * the actual opcode that is being emulated, but names this short could
 * collide with global symbols exported from somewhere else within the
 * emulation thread.  (This did happen on Linux Mupen64, with my old function
 * name "MFC0", which had to be renamed.)  Rather than uglify the function
 * names, we'll treat them as macros from now on, should the need arise.
 */
#ifndef _WIN32
#define VMULF   mulf_v_msp
#define VMULU   mulu_v_msp
#define VMULI   rndp_v_msp
#define VMULQ   mulq_v_msp
#define VMUDL   mudl_v_msp
#define VMUDM   mudm_v_msp
#define VMUDN   mudn_v_msp
#define VMUDH   mudh_v_msp
#define VMACF   macf_v_msp
#define VMACU   macu_v_msp
#define VMACI   rndn_v_msp
#define VMACQ   macq_v_msp
#define VMADL   madl_v_msp
#define VMADM   madm_v_msp
#define VMADN   madn_v_msp
#define VMADH   madh_v_msp
#define VADD    add_v_msp
#define VSUB    sub_v_msp
#define VSUT    sut_v_msp
#define VABS    abs_v_msp
#define VADDC   addc_v_msp
#define VSUBC   subc_v_msp
#define VADDB   addb_v_msp
#define VSUBB   subb_v_msp
#define VACCB   accb_v_msp
#define VSUCB   sucb_v_msp
#define VSAD    sad_v_msp
#define VSAC    sac_v_msp
#define VSUM    sum_v_msp
#define VSAW    sar_v_msp
/* #define VACC */
/* #define VSUC */
#define VLT     lt_v_msp
#define VEQ     eq_v_msp
#define VNE     ne_v_msp
#define VGE     ge_v_msp
#define VCL     cl_v_msp
#define VCH     ch_v_msp
#define VCR     cr_v_msp
#define VMRG    mrg_v_msp
#define VAND    and_v_msp
#define VNAND   nand_v_msp
#define VOR     or_v_msp
#define VNOR    nor_v_msp
#define VXOR    xor_v_msp
#define VNXOR   nxor_v_msp
#define VRCP    rcp_v_msp
#define VRCPL   rcpl_v_msp
#define VRCPH   rcph_v_msp
#define VMOV    mov_v_msp
#define VRSQ    rsq_v_msp
#define VRSQL   rsql_v_msp
#define VRSQH   rsqh_v_msp
#define VNOP    nop_v_msp
#define VEXTT   extt_v_msp
#define VEXTQ   extq_v_msp
#define VEXTN   extn_v_msp
#define VINST   inst_v_msp
#define VINSQ   insq_v_msp
#define VINSN   insn_v_msp
#define VNULLOP nop_v_msp
#endif

#endif