/******************************************************************************\
* Project: MSP Emulation Layer for Vector Unit Computational Operations       *
* Authors: Iconoclast                                                          *
* Release: 2016.03.23                                                         *
* License: CC0 Public Domain Dedication                                       *
*                                                                              *
* To the extent possible under law, the author(s) have dedicated all copyright*
* and related and neighboring rights to this software to the public domain    *
* worldwide. This software is distributed without any warranty.               *
*                                                                              *
* You should have received a copy of the CC0 Public Domain Dedication along   *
* with this software.                                                          *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.            *
\******************************************************************************/
#ifndef _VU_H_
#define _VU_H_

#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
#include <emmintrin.h>
#endif

#include "../my_types.h"

#define N 8
/* N: number of processor elements in SIMD processor */

/*
 * Illegal, unaligned LWC2 operations on the RSP may write past the terminal
 * byte of a vector, while SWC2 operations may have to wrap around stores
 * from the end to the start of a vector.  Both of these risk out-of-bounds
 * memory access, but by doubling the number of bytes allocated (shift left)
 * for each vector register, we could stabilize and probably optimize this.
 */
#if 0
#define VR_STATIC_WRAPAROUND 0
#else
#define VR_STATIC_WRAPAROUND 1
#endif

/*
 * We are going to need this for vector operations doing scalar things.
 * The divides and VSAW need bit-wise information from the instruction word.
 */
extern u32 inst_word;
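
/*
 * Illustrative sketch, not from the original header:  assuming the standard
 * RSP COP2 vector-opcode layout, the bit fields that the divides and VSAW
 * read out of inst_word could be extracted like so (the field positions are
 * an assumption here, not something this header defines):
 *
 *     const unsigned funct   = (inst_word >>  0) & 0x3F;   (vector function)
 *     const unsigned element = (inst_word >> 21) & 0x0F;   (element select)
 */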

/*
 * RSP virtual registers (of vector unit)
 * The most important are the 32 general-purpose vector registers.
 * The correct way to accurately store these is using big-endian vectors.
 *
 * For ?WC2 we may need to do byte-precision access just as directly.
 * This is amended by using the `VU_S` and `VU_B` macros defined in `rsp.h`.
 */
ALIGNED extern i16 VR[32][N << VR_STATIC_WRAPAROUND];
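
/*
 * Illustrative sketch, an assumption rather than part of the interface:
 * with VR_STATIC_WRAPAROUND enabled, each register owns 16 halfwords
 * instead of 8, so an unaligned LWC2 that runs one element past the end
 * lands in the shadow half of the same register rather than out of bounds
 * (`vt' and the halfword names below are hypothetical):
 *
 *     VR[vt][7] = last_in_bounds_halfword;
 *     VR[vt][8] = spilled_halfword;        (still inside VR[vt][])
 */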

/*
 * The RSP accumulator is a vector of eight 48-bit integers.  Nearly all of
 * the vector operations access it, though it exists primarily for the
 * multiply-accumulate operations.
 *
 * Access dimensions would be VACC[8][3] but are inverted for SIMD benefits.
 */
ALIGNED extern i16 VACC[3][N];

/*
 * When compiling without SSE2, we need to use a pointer to a destination
 * vector instead of an XMM register in the return slot of the function.
 * The vector "result" register is emulated as a shared global serving that
 * purpose, rather than as the return slot of a function call.
 */
#ifndef ARCH_MIN_SSE2
ALIGNED extern i16 V_result[N];
#endif

/*
 * accumulator-indexing macros
 */
#define HI 00
#define MD 01
#define LO 02

#define VACC_L (VACC[LO])
#define VACC_M (VACC[MD])
#define VACC_H (VACC[HI])

#define ACC_L(i) (VACC_L)[i]
#define ACC_M(i) (VACC_M)[i]
#define ACC_H(i) (VACC_H)[i]
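
/*
 * Illustrative sketch, not part of the original header:  element i of the
 * 48-bit accumulator can be viewed as the concatenation of its three
 * 16-bit slices, e.g. widened into a 64-bit signed integer:
 *
 *     long long wide_acc = ((long long)ACC_H(i) << 32)
 *                        | ((unsigned)(u16)ACC_M(i) << 16)
 *                        |  (u16)ACC_L(i);
 */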

#ifdef ARCH_MIN_SSE2
typedef __m128i v16;
#else
typedef pi16 v16;
#endif

#ifdef ARCH_MIN_SSE2
#define VECTOR_OPERATION v16
#else
#define VECTOR_OPERATION void
#endif
#define VECTOR_EXTERN extern VECTOR_OPERATION

NOINLINE extern void message(const char* body);

VECTOR_EXTERN (*COP2_C2[8*7 + 8])(v16, v16);
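
/*
 * Illustrative dispatch sketch, an assumption rather than the plugin's
 * actual interpreter code:  the 64-entry table would be indexed by the
 * 6-bit function field of the COP2 instruction.  With SSE2 the handler
 * returns the result vector; without SSE2 it returns void and leaves the
 * result in V_result instead.  (vs_vector and vt_vector are hypothetical
 * operands.)
 *
 *     result = COP2_C2[inst_word & 0x3F](vs_vector, vt_vector);
 */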

#ifdef ARCH_MIN_SSE2

#define vector_copy(vd, vs) { \
    *(v16 *)(vd) = *(v16 *)(vs); }
#define vector_wipe(vd) { \
    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }
#define vector_fill(vd) { \
    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vd)); }

#define vector_and(vd, vs) { \
    *(v16 *)&(vd) = _mm_and_si128 (*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_or(vd, vs) { \
    *(v16 *)&(vd) = _mm_or_si128 (*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_xor(vd, vs) { \
    *(v16 *)&(vd) = _mm_xor_si128 (*(v16 *)&(vd), *(v16 *)&(vs)); }

/*
 * Every competent vector unit should have at least two vector comparison
 * operations: EQ and LT/GT. (MMX makes us say GT; SSE's LT is just a GT.)
 *
 * The default definitions below are for the x86 SSE2 architecture.
 */
#define vector_cmplt(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmplt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_cmpeq(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmpeq_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
#define vector_cmpgt(vd, vs) { \
    *(v16 *)&(vd) = _mm_cmpgt_epi16(*(v16 *)&(vd), *(v16 *)&(vs)); }
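
/*
 * Illustrative note, an assumption for GT-only targets:  since (a < b) is
 * the same predicate as (b > a), a missing LT compare can be synthesized by
 * swapping the operands of the GT compare.  Per element, vector_cmplt(vd, vs)
 * behaves like "vd = (vs > vd) ? ~0 : 0", which is effectively how the SSE2
 * LT intrinsic is realized anyway.
 */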

#else

#define vector_copy(vd, vs) { \
    (vd)[0] = (vs)[0]; \
    (vd)[1] = (vs)[1]; \
    (vd)[2] = (vs)[2]; \
    (vd)[3] = (vs)[3]; \
    (vd)[4] = (vs)[4]; \
    (vd)[5] = (vs)[5]; \
    (vd)[6] = (vs)[6]; \
    (vd)[7] = (vs)[7]; \
}
#define vector_wipe(vd) { \
    (vd)[0] = 0x0000; \
    (vd)[1] = 0x0000; \
    (vd)[2] = 0x0000; \
    (vd)[3] = 0x0000; \
    (vd)[4] = 0x0000; \
    (vd)[5] = 0x0000; \
    (vd)[6] = 0x0000; \
    (vd)[7] = 0x0000; \
}
#define vector_fill(vd) { \
    (vd)[0] = ~0x0000; \
    (vd)[1] = ~0x0000; \
    (vd)[2] = ~0x0000; \
    (vd)[3] = ~0x0000; \
    (vd)[4] = ~0x0000; \
    (vd)[5] = ~0x0000; \
    (vd)[6] = ~0x0000; \
    (vd)[7] = ~0x0000; \
}
#define vector_and(vd, vs) { \
    (vd)[0] &= (vs)[0]; \
    (vd)[1] &= (vs)[1]; \
    (vd)[2] &= (vs)[2]; \
    (vd)[3] &= (vs)[3]; \
    (vd)[4] &= (vs)[4]; \
    (vd)[5] &= (vs)[5]; \
    (vd)[6] &= (vs)[6]; \
    (vd)[7] &= (vs)[7]; \
}
#define vector_or(vd, vs) { \
    (vd)[0] |= (vs)[0]; \
    (vd)[1] |= (vs)[1]; \
    (vd)[2] |= (vs)[2]; \
    (vd)[3] |= (vs)[3]; \
    (vd)[4] |= (vs)[4]; \
    (vd)[5] |= (vs)[5]; \
    (vd)[6] |= (vs)[6]; \
    (vd)[7] |= (vs)[7]; \
}
#define vector_xor(vd, vs) { \
    (vd)[0] ^= (vs)[0]; \
    (vd)[1] ^= (vs)[1]; \
    (vd)[2] ^= (vs)[2]; \
    (vd)[3] ^= (vs)[3]; \
    (vd)[4] ^= (vs)[4]; \
    (vd)[5] ^= (vs)[5]; \
    (vd)[6] ^= (vs)[6]; \
    (vd)[7] ^= (vs)[7]; \
}

/* Scalar fall-backs for the comparisons:  each element of vd becomes an
 * all-ones or all-zeroes mask. */
#define vector_cmplt(vd, vs) { \
    (vd)[0] = ((vd)[0] < (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] < (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] < (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] < (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] < (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] < (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] < (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] < (vs)[7]) ? ~0x0000 : 0x0000; \
}
#define vector_cmpeq(vd, vs) { \
    (vd)[0] = ((vd)[0] == (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] == (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] == (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] == (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] == (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] == (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] == (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] == (vs)[7]) ? ~0x0000 : 0x0000; \
}
#define vector_cmpgt(vd, vs) { \
    (vd)[0] = ((vd)[0] > (vs)[0]) ? ~0x0000 : 0x0000; \
    (vd)[1] = ((vd)[1] > (vs)[1]) ? ~0x0000 : 0x0000; \
    (vd)[2] = ((vd)[2] > (vs)[2]) ? ~0x0000 : 0x0000; \
    (vd)[3] = ((vd)[3] > (vs)[3]) ? ~0x0000 : 0x0000; \
    (vd)[4] = ((vd)[4] > (vs)[4]) ? ~0x0000 : 0x0000; \
    (vd)[5] = ((vd)[5] > (vs)[5]) ? ~0x0000 : 0x0000; \
    (vd)[6] = ((vd)[6] > (vs)[6]) ? ~0x0000 : 0x0000; \
    (vd)[7] = ((vd)[7] > (vs)[7]) ? ~0x0000 : 0x0000; \
}

#endif

/*
 * Many vector units have pairs of "vector condition flags" registers.
 * In SGI's vector unit implementation, these are denoted as the
 * "vector control registers" under coprocessor 2.
 *
 * VCF-0 is the carry-out flags register: $vco.
 * VCF-1 is the compare code flags register: $vcc.
 * VCF-2 is the compare extension flags register: $vce.
 * There is no fourth RSP flags register.
 */
extern u16 VCO;
extern u16 VCC;
extern u8 VCE;

ALIGNED extern i16 cf_ne[N];
ALIGNED extern i16 cf_co[N];
ALIGNED extern i16 cf_clip[N];
ALIGNED extern i16 cf_comp[N];
ALIGNED extern i16 cf_vce[N];

extern u16 get_VCO(void);
extern u16 get_VCC(void);
extern u8 get_VCE(void);

extern void set_VCO(u16 vco);
extern void set_VCC(u16 vcc);
extern void set_VCE(u8 vce);
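
/*
 * Illustrative sketch, an assumption rather than the actual packing code
 * (which lives in the vector unit's .c sources):  a 16-bit $vco word could
 * be folded together from the per-lane flag vectors, with carry-out bits in
 * the low byte and not-equal bits in the high byte; the lane-to-bit ordering
 * is a convention of the implementation.
 *
 *     u16 vco = 0x0000;
 *     int i;
 *     for (i = 0; i < N; i++)
 *         vco |= (u16)((cf_co[i] & 1) << i)
 *             |  (u16)((cf_ne[i] & 1) << (i + 8));
 */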

/*
 * shuffling convenience macros for Intel SIMD
 * An 8-bit shuffle imm. of SHUFFLE(0, 1, 2, 3) should be a null operation.
 */
#define B(x) ((x) & 3)
#define SHUFFLE(a,b,c,d) ((B(d)<<6) | (B(c)<<4) | (B(b)<<2) | (B(a)<<0))
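
/*
 * Illustrative usage sketch, not from the original header:  fed to
 * _mm_shuffle_epi32, the identity pattern SHUFFLE(0, 1, 2, 3) evaluates to
 * 0xE4 and leaves all four 32-bit lanes in place, while SHUFFLE(1, 0, 3, 2)
 * swaps each pair of adjacent lanes.
 *
 *     xmm = _mm_shuffle_epi32(xmm, SHUFFLE(0, 1, 2, 3));   (identity)
 *     xmm = _mm_shuffle_epi32(xmm, SHUFFLE(1, 0, 3, 2));   (swap pairs)
 */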

/*
 * RSP vector opcode functions are currently named literally after the
 * opcodes they emulate, but names this short could collide with global
 * symbols exported from somewhere else within the emulation thread.
 * (This did happen on Linux Mupen64, with my old function name "MFC0",
 * which had to be renamed.)  Rather than uglify the function names, we'll
 * treat them as macros from now on, should the need arise.
 */
#ifndef _WIN32

#define VMULF mulf_v_msp
#define VMULU mulu_v_msp
#define VMULI rndp_v_msp
#define VMULQ mulq_v_msp

#define VMUDL mudl_v_msp
#define VMUDM mudm_v_msp
#define VMUDN mudn_v_msp
#define VMUDH mudh_v_msp

#define VMACF macf_v_msp
#define VMACU macu_v_msp
#define VMACI rndn_v_msp
#define VMACQ macq_v_msp

#define VMADL madl_v_msp
#define VMADM madm_v_msp
#define VMADN madn_v_msp
#define VMADH madh_v_msp

#define VADD add_v_msp
#define VSUB sub_v_msp
#define VSUT sut_v_msp
#define VABS abs_v_msp

#define VADDC addc_v_msp
#define VSUBC subc_v_msp
#define VADDB addb_v_msp
#define VSUBB subb_v_msp

#define VACCB accb_v_msp
#define VSUCB sucb_v_msp
#define VSAD sad_v_msp
#define VSAC sac_v_msp

#define VSUM sum_v_msp
#define VSAW sar_v_msp
/* #define VACC */
/* #define VSUC */

#define VLT lt_v_msp
#define VEQ eq_v_msp
#define VNE ne_v_msp
#define VGE ge_v_msp

#define VCL cl_v_msp
#define VCH ch_v_msp
#define VCR cr_v_msp
#define VMRG mrg_v_msp

#define VAND and_v_msp
#define VNAND nand_v_msp
#define VOR or_v_msp
#define VNOR nor_v_msp
#define VXOR xor_v_msp
#define VNXOR nxor_v_msp

#define VRCP rcp_v_msp
#define VRCPL rcpl_v_msp
#define VRCPH rcph_v_msp
#define VMOV mov_v_msp

#define VRSQ rsq_v_msp
#define VRSQL rsql_v_msp
#define VRSQH rsqh_v_msp
#define VNOP nop_v_msp

#define VEXTT extt_v_msp
#define VEXTQ extq_v_msp
#define VEXTN extn_v_msp

#define VINST inst_v_msp
#define VINSQ insq_v_msp
#define VINSN insn_v_msp
#define VNULLOP nop_v_msp

#endif

#endif