mupen64plus-rsp-cxd4/vu/vu.h

/******************************************************************************\
* Project: MSP Emulation Layer for Vector Unit Computational Operations        *
* Authors: Iconoclast                                                          *
* Release: 2016.03.23                                                          *
* License: CC0 Public Domain Dedication                                        *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all copyright *
* and related and neighboring rights to this software to the public domain     *
* worldwide. This software is distributed without any warranty.                *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along    *
* with this software.                                                          *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.             *
\******************************************************************************/
#ifndef _VU_H_
#define _VU_H_
#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
#include <emmintrin.h>
#endif
#include "../my_types.h"
#define N 8
/* N: number of processor elements in SIMD processor */
/*
 * Illegal, unaligned LWC2 operations on the RSP may write past the terminal
 * byte of a vector, while SWC2 operations may have to wrap stores around
 * from the end of a vector back to its start.  Both risk out-of-bounds
 * memory access, but by doubling the number of bytes allocated (a shift
 * left) for each vector register, we can absorb the overrun and probably
 * optimize these cases.  (A sketch follows the VR declaration below.)
 */
#if 0
#define VR_STATIC_WRAPAROUND 0
#else
#define VR_STATIC_WRAPAROUND 1
#endif
/*
 * We will need this for vector operations that do scalar things:  the
 * divides and VSAW take bit-wise information from the instruction word.
 */
extern u32 inst_word;
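/*
 * A minimal decoding sketch (not compiled) of the kind of bit-wise reads
 * the divides and VSAW perform on `inst_word`.  The field positions assume
 * the standard RSP COP2 computational encoding:  element in bits [24:21],
 * vt in [20:16], vs in [15:11], vd in [10:6], function code in [5:0].  The
 * function name is illustrative only.
 */
#if 0
static void decode_fields_example(u32 iw)
{
    const unsigned e    = (iw >> 21) & 0xFu;  /* element specifier (VSAW, etc.) */
    const unsigned vt   = (iw >> 16) & 0x1Fu; /* source register t */
    const unsigned vs   = (iw >> 11) & 0x1Fu; /* source register s */
    const unsigned vd   = (iw >>  6) & 0x1Fu; /* destination register */
    const unsigned func = (iw >>  0) & 0x3Fu; /* vector function code */

    (void)e; (void)vt; (void)vs; (void)vd; (void)func;
}
#endif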
/*
 * RSP virtual registers (of the vector unit)
 * The most important are the 32 general-purpose vector registers.
 * The accurate way to store these is as big-endian vectors.
 *
 * For ?WC2 transfers we may also need byte-precision access just as
 * directly.  This is handled by the `VU_S` and `VU_B` macros defined
 * in `rsp.h`.
 */
ALIGNED extern i16 VR[32][N << VR_STATIC_WRAPAROUND];
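/*
 * Illustrative sketch (not compiled):  with VR_STATIC_WRAPAROUND enabled,
 * each register owns 2*N halfwords, so a byte index that runs past element
 * 7 still lands inside VR[vt]'s own slack space instead of corrupting
 * VR[vt + 1].  The index math is hypothetical, standing in for the address
 * calculation an unaligned LWC2/SWC2 would do;  real byte access goes
 * through the `VU_B` macro in `rsp.h`.
 */
#if 0
static void overrun_byte_store_example(unsigned vt, unsigned byte_index, u8 datum)
{
    u8 * bytes = (u8 *)&VR[vt][0];
    const unsigned limit = sizeof(i16) * (N << VR_STATIC_WRAPAROUND);

    bytes[byte_index % limit] = datum; /* bytes 0..15 architectural; the rest slack */
}
#endif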
/*
 * The RSP accumulator is a vector of eight 48-bit integers.  Nearly all of
 * the vector operations access it, though it exists chiefly for the
 * multiply-accumulate operations.
 *
 * Access dimensions would be VACC[8][3], but they are inverted here for
 * SIMD benefits.
 */
ALIGNED extern i16 VACC[3][N];
/*
 * When compiling without SSE2, we cannot return results in an XMM register,
 * so we need a pointer to a destination vector instead.  The vector
 * "result" register is emulated as a shared global serving that purpose,
 * rather than as the return slot of a function call.
 */
#ifndef ARCH_MIN_SSE2
ALIGNED extern i16 V_result[N];
#endif
/*
 * accumulator-indexing macros
 */
#define HI 00
#define MD 01
#define LO 02
#define VACC_L (VACC[LO])
#define VACC_M (VACC[MD])
#define VACC_H (VACC[HI])
#define ACC_L(i) (VACC_L)[i]
#define ACC_M(i) (VACC_M)[i]
#define ACC_H(i) (VACC_H)[i]
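/*
 * A sketch (not compiled) of reading one 48-bit accumulator element through
 * the slice macros above:  ACC_H(i) holds bits 47..32, ACC_M(i) bits 31..16
 * and ACC_L(i) bits 15..0.  The 64-bit types are assumed to be provided by
 * my_types.h;  the function name is illustrative only.
 */
#if 0
static s64 read_acc_example(unsigned i)
{
    u64 wide;

    wide  = (u64)(u16)ACC_H(i) << 32;
    wide |= (u64)(u16)ACC_M(i) << 16;
    wide |= (u64)(u16)ACC_L(i) <<  0;
    return ((s64)(wide << 16) >> 16); /* sign-extend from 48 to 64 bits */
}
#endif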
#ifdef ARCH_MIN_SSE2
typedef __m128i v16;
#else
typedef pi16 v16;
#endif
#ifdef ARCH_MIN_SSE2
#define VECTOR_OPERATION v16
#else
#define VECTOR_OPERATION void
#endif
#define VECTOR_EXTERN extern VECTOR_OPERATION
NOINLINE extern void message(const char* body);
VECTOR_EXTERN (*COP2_C2[8*7 + 8])(v16, v16);
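/*
 * A hedged sketch (not compiled) of dispatching this 64-entry table on the
 * 6-bit function field.  The real dispatcher lives outside this header, and
 * the names here are illustrative only:  with SSE2 the result arrives in
 * the return slot, otherwise it is read back out of V_result.
 */
#if 0
static void dispatch_example(unsigned vd, v16 vs_data, v16 vt_shuffled)
{
#ifdef ARCH_MIN_SSE2
    *(v16 *)VR[vd] = COP2_C2[inst_word % 64](vs_data, vt_shuffled);
#else
    int i;

    COP2_C2[inst_word % 64](vs_data, vt_shuffled);
    for (i = 0; i < N; i++)
        VR[vd][i] = V_result[i]; /* result comes back via the shared global */
#endif
}
#endif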
#ifdef ARCH_MIN_SSE2
#define vector_copy(vd, vs) { \
    *(v16 *)(vd) = *(v16 *)(vs); }
#define vector_wipe(vd) { \
    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }
#define vector_fill(vd) { \
    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }
#define vector_and(vd, vs) { \
    *(v16 *)&(vd) = _mm_and_si128 (*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_or(vd, vs) { \
    *(v16 *)&(vd) = _mm_or_si128 (*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_xor(vd, vs) { \
    *(v16 *)&(vd) = _mm_xor_si128 (*(v16 *)&(vd), *(v16 *)&(vs)); }
/*
 * Every competent vector unit should have at least two vector comparison
 * operations:  EQ and LT/GT.  (MMX only gives us GT; SSE's LT is just a GT
 * with the operands swapped.)
 *
 * The default definitions below target the x86 SSE2 architecture.
 */
#define vector_cmplt(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmplt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_cmpeq(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_cmpgt(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#else
#define vector_copy(vd, vs) { \
    (vd)[0] = (vs)[0]; \
    (vd)[1] = (vs)[1]; \
    (vd)[2] = (vs)[2]; \
    (vd)[3] = (vs)[3]; \
    (vd)[4] = (vs)[4]; \
    (vd)[5] = (vs)[5]; \
    (vd)[6] = (vs)[6]; \
    (vd)[7] = (vs)[7]; \
}
#define vector_wipe(vd) { \
    (vd)[0] = 0x0000; \
    (vd)[1] = 0x0000; \
    (vd)[2] = 0x0000; \
    (vd)[3] = 0x0000; \
    (vd)[4] = 0x0000; \
    (vd)[5] = 0x0000; \
    (vd)[6] = 0x0000; \
    (vd)[7] = 0x0000; \
}
#define vector_fill(vd) { \
    (vd)[0] = ~0x0000; \
    (vd)[1] = ~0x0000; \
    (vd)[2] = ~0x0000; \
    (vd)[3] = ~0x0000; \
    (vd)[4] = ~0x0000; \
    (vd)[5] = ~0x0000; \
    (vd)[6] = ~0x0000; \
    (vd)[7] = ~0x0000; \
}
#define vector_and(vd, vs) { \
    (vd)[0] &= (vs)[0]; \
    (vd)[1] &= (vs)[1]; \
    (vd)[2] &= (vs)[2]; \
    (vd)[3] &= (vs)[3]; \
    (vd)[4] &= (vs)[4]; \
    (vd)[5] &= (vs)[5]; \
    (vd)[6] &= (vs)[6]; \
    (vd)[7] &= (vs)[7]; \
}
#define vector_or(vd, vs) { \
    (vd)[0] |= (vs)[0]; \
    (vd)[1] |= (vs)[1]; \
    (vd)[2] |= (vs)[2]; \
    (vd)[3] |= (vs)[3]; \
    (vd)[4] |= (vs)[4]; \
    (vd)[5] |= (vs)[5]; \
    (vd)[6] |= (vs)[6]; \
    (vd)[7] |= (vs)[7]; \
}
#define vector_xor(vd, vs) { \
    (vd)[0] ^= (vs)[0]; \
    (vd)[1] ^= (vs)[1]; \
    (vd)[2] ^= (vs)[2]; \
    (vd)[3] ^= (vs)[3]; \
    (vd)[4] ^= (vs)[4]; \
    (vd)[5] ^= (vs)[5]; \
    (vd)[6] ^= (vs)[6]; \
    (vd)[7] ^= (vs)[7]; \
}
#define vector_cmplt(vd, vs) { \
    (vd)[0] = ((vd)[0] < (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] < (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] < (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] < (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] < (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] < (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] < (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] < (vs)[7]) ? ~0x0000 : 0x0000; \
}
#define vector_cmpeq(vd, vs) { \
    (vd)[0] = ((vd)[0] == (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] == (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] == (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] == (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] == (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] == (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] == (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] == (vs)[7]) ? ~0x0000 : 0x0000; \
}
#define vector_cmpgt(vd, vs) { \
    (vd)[0] = ((vd)[0] > (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] > (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] > (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] > (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] > (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] > (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] > (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] > (vs)[7]) ? ~0x0000 : 0x0000; \
}
#endif
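/*
 * Usage sketch (not compiled):  the macros above all update `vd` in place,
 * and the comparisons turn each lane into an all-ones or all-zero mask, so
 * a per-lane keep/zero select can be built from them.  The register numbers
 * below are purely illustrative.
 */
#if 0
static void masked_keep_example(void)
{
    /* VR[3] := VR[1] lanes where VR[1] > VR[2], else 0x0000 */
    vector_copy(VR[3], VR[1]);
    vector_cmpgt(VR[3], VR[2]); /* per-lane mask:  ~0 where true, 0 where false */
    vector_and(VR[3], VR[1]);   /* keep only the winning lanes of VR[1] */
}
#endif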
/*
 * Many vector units have pairs of "vector condition flags" registers.
 * In SGI's vector unit implementation, these are denoted as the
 * "vector control registers" under coprocessor 2.
 *
 * VCF-0 is the carry-out flags register: $vco.
 * VCF-1 is the compare code flags register: $vcc.
 * VCF-2 is the compare extension flags register: $vce.
 * There is no fourth RSP flags register.
 */
extern u16 VCO;
extern u16 VCC;
extern u8 VCE;
ALIGNED extern i16 cf_ne[N];
ALIGNED extern i16 cf_co[N];
ALIGNED extern i16 cf_clip[N];
ALIGNED extern i16 cf_comp[N];
ALIGNED extern i16 cf_vce[N];
extern u16 get_VCO(void);
extern u16 get_VCC(void);
extern u8 get_VCE(void);
extern void set_VCO(u16 vco);
extern void set_VCC(u16 vcc);
extern void set_VCE(u8 vce);
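/*
 * A sketch (not compiled) of rebuilding a packed $vco view from the
 * per-lane flag arrays;  get_VCO() is the real accessor.  The layout
 * assumed here (carry-out in bits 0..7, not-equal in bits 8..15, one bit
 * per lane) is an illustrative assumption, not a statement of this
 * module's actual packing.
 */
#if 0
static u16 pack_vco_example(void)
{
    u16 packed = 0x0000;
    int i;

    for (i = 0; i < N; i++)
        packed |= (u16)(cf_co[i] & 1) << (i + 0);
    for (i = 0; i < N; i++)
        packed |= (u16)(cf_ne[i] & 1) << (i + 8);
    return (packed);
}
#endif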
/*
 * shuffling convenience macros for Intel SIMD
 * An 8-bit shuffle immediate of SHUFFLE(0, 1, 2, 3) should be a null operation.
 */
#define B(x) ((x) & 3)
#define SHUFFLE(a,b,c,d) ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0))
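/*
 * Usage sketch (not compiled), SSE2 only:  SHUFFLE(0, 1, 2, 3) reproduces
 * the input lane order, while repeating one index broadcasts that 16-bit
 * lane across its 64-bit half with the SSE2 shuffle intrinsics.  The
 * function name is illustrative only.
 */
#if 0
#ifdef ARCH_MIN_SSE2
static v16 broadcast_low_lane_example(v16 src)
{
    src = _mm_shufflelo_epi16(src, SHUFFLE(0, 0, 0, 0)); /* low 4 lanes := lane 0 */
    src = _mm_shufflehi_epi16(src, SHUFFLE(0, 1, 2, 3)); /* identity:  high half kept */
    return (src);
}
#endif
#endif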
/*
 * RSP vector opcode functions are currently named literally after the
 * opcodes they emulate, but names this short could collide with global
 * symbols exported from somewhere else within the emulation thread.  (This
 * did happen on Linux Mupen64 with my old function name "MFC0", which had
 * to be renamed.)  Rather than uglify the function names, we treat them as
 * macros from now on, should the need arise.
 */
*/
#ifndef _WIN32
#define VMULF mulf_v_msp
#define VMULU mulu_v_msp
#define VMULI rndp_v_msp
#define VMULQ mulq_v_msp
#define VMUDL mudl_v_msp
#define VMUDM mudm_v_msp
#define VMUDN mudn_v_msp
#define VMUDH mudh_v_msp
#define VMACF macf_v_msp
#define VMACU macu_v_msp
#define VMACI rndn_v_msp
#define VMACQ macq_v_msp
#define VMADL madl_v_msp
#define VMADM madm_v_msp
#define VMADN madn_v_msp
#define VMADH madh_v_msp
#define VADD add_v_msp
#define VSUB sub_v_msp
#define VSUT sut_v_msp
#define VABS abs_v_msp
#define VADDC addc_v_msp
#define VSUBC subc_v_msp
#define VADDB addb_v_msp
#define VSUBB subb_v_msp
#define VACCB accb_v_msp
#define VSUCB sucb_v_msp
#define VSAD sad_v_msp
#define VSAC sac_v_msp
#define VSUM sum_v_msp
#define VSAW sar_v_msp
/* #define VACC */
/* #define VSUC */
#define VLT lt_v_msp
#define VEQ eq_v_msp
#define VNE ne_v_msp
#define VGE ge_v_msp
#define VCL cl_v_msp
#define VCH ch_v_msp
#define VCR cr_v_msp
#define VMRG mrg_v_msp
#define VAND and_v_msp
#define VNAND nand_v_msp
#define VOR or_v_msp
#define VNOR nor_v_msp
#define VXOR xor_v_msp
#define VNXOR nxor_v_msp
#define VRCP rcp_v_msp
#define VRCPL rcpl_v_msp
#define VRCPH rcph_v_msp
#define VMOV mov_v_msp
#define VRSQ rsq_v_msp
#define VRSQL rsql_v_msp
#define VRSQH rsqh_v_msp
#define VNOP nop_v_msp
#define VEXTT extt_v_msp
#define VEXTQ extq_v_msp
#define VEXTN extn_v_msp
#define VINST inst_v_msp
#define VINSQ insq_v_msp
#define VINSN insn_v_msp
#define VNULLOP nop_v_msp
#endif
#endif