/******************************************************************************\
* Project:  MSP Emulation Layer for Vector Unit Computational Operations      *
* Authors:  Iconoclast                                                        *
* Release:  2018.03.18                                                        *
* License:  CC0 Public Domain Dedication                                      *
*                                                                             *
* To the extent possible under law, the author(s) have dedicated all          *
* copyright and related and neighboring rights to this software to the        *
* public domain worldwide.  This software is distributed without any          *
* warranty.                                                                   *
*                                                                             *
* You should have received a copy of the CC0 Public Domain Dedication along   *
* with this software.                                                         *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.            *
\******************************************************************************/

#include "vu.h"

#include "multiply.h"
#include "add.h"
#include "select.h"
#include "logical.h"
#include "divide.h"
#if 0
#include "pack.h"
#endif

ALIGNED i16 VR[32][N << VR_STATIC_WRAPAROUND];
ALIGNED i16 VACC[3][N];
#ifndef ARCH_MIN_SSE2
ALIGNED i16 V_result[N];
#endif

/*
 * These normally should have type `int` because they are Boolean T/F arrays.
 * However, since SSE2 uses 128-bit XMMs, and Win32 `int` storage is 32-bit,
 * we have the problem of 32*8 > 128 bits, so we use `short` to reduce packs.
 */
ALIGNED i16 cf_ne[N];   /* $vco:  high "NOTEQUAL" */
ALIGNED i16 cf_co[N];   /* $vco:  low "carry/borrow in/out" */
ALIGNED i16 cf_clip[N]; /* $vcc:  high (clip tests:  VCL, VCH, VCR) */
ALIGNED i16 cf_comp[N]; /* $vcc:  low (VEQ, VNE, VLT, VGE, VCL, VCH, VCR) */
ALIGNED i16 cf_vce[N];  /* $vce:  vector compare extension register */

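/*
 * With i16 lanes, each of these flag arrays is exactly 8 * 16 = 128 bits,
 * so a whole Boolean vector fits in one aligned XMM load on SSE2 builds.
 */
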
VECTOR_OPERATION res_V(v16 vs, v16 vt)
{
    vt = vs; /* unused */
    message("C2\nRESERVED"); /* uncertain how to handle reserved, untested */
#ifdef ARCH_MIN_SSE2
    vs = _mm_setzero_si128();
    return (vt = vs); /* -Wunused-but-set-parameter */
#else
    vector_wipe(V_result);
    if (vt == vs)
        return; /* -Wunused-but-set-parameter */
    return;
#endif
}

VECTOR_OPERATION res_M(v16 vs, v16 vt)
{ /* Ultra64 OS did have these, so one could implement this ext. */
    message("VMUL IQ");
#ifdef ARCH_MIN_SSE2
    vs = res_V(vs, vt);
    return (vs);
#else
    res_V(vs, vt);
    return;
#endif
}

/*
 * Op-code-accurate matrix of all the known RSP vector operations.
 * To do:  Either remove VMACQ, or add VRNDP, VRNDN, and VMULQ.
 *
 * Note that these are not our literal function names, just macro names.
 */
VECTOR_OPERATION (*COP2_C2[8 * 8])(v16, v16) = {
    VMULF ,VMULU ,res_M ,res_M ,VMUDL ,VMUDM ,VMUDN ,VMUDH , /* 000 */
    VMACF ,VMACU ,res_M ,res_M ,VMADL ,VMADM ,VMADN ,VMADH , /* 001 */
    VADD  ,VSUB  ,res_V ,VABS  ,VADDC ,VSUBC ,res_V ,res_V , /* 010 */
    res_V ,res_V ,res_V ,res_V ,res_V ,VSAW  ,res_V ,res_V , /* 011 */
    VLT   ,VEQ   ,VNE   ,VGE   ,VCL   ,VCH   ,VCR   ,VMRG  , /* 100 */
    VAND  ,VNAND ,VOR   ,VNOR  ,VXOR  ,VNXOR ,res_V ,res_V , /* 101 */
    VRCP  ,VRCPL ,VRCPH ,VMOV  ,VRSQ  ,VRSQL ,VRSQH ,VNOP  , /* 110 */
    res_V ,res_V ,res_V ,res_V ,res_V ,res_V ,res_V ,res_V , /* 111 */
};  /*  000    001    010    011    100    101    110    111 */
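
/*
 * A minimal dispatch sketch, not built (following the `#if 0` convention
 * used for pack.h above):  COP2_C2 is sized for the 6-bit funct field of a
 * C2 opcode word, so a caller would index it with `inst & 63`.  The field
 * layout assumed here (vd = bits 10..6, vs = bits 15..11, vt = bits 20..16)
 * is the standard RSP COP2 vector encoding; `inst` is a hypothetical
 * parameter, the per-element shuffle of vt is omitted, and the v16 types
 * assume the SSE2 build.
 */
#if 0
static void dispatch_C2(unsigned int inst)
{
    v16 vs, vt, vd;

    vs = _mm_load_si128((v16 *)VR[(inst >> 11) & 31]);
    vt = _mm_load_si128((v16 *)VR[(inst >> 16) & 31]); /* pre-shuffle value */
    vd = COP2_C2[inst & 63](vs, vt);
    _mm_store_si128((v16 *)VR[(inst >> 6) & 31], vd);
}
#endif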
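
/*
 * CFC2 resources:  $vco packs cf_ne into bits 15..8 and cf_co into bits
 * 7..0; $vcc packs cf_clip and cf_comp the same way; $vce needs only the
 * low 8 bits.  The `0xF % 8` spellings below just map each destination bit
 * position back down to a vector lane index in 0..7.
 */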
#ifndef ARCH_MIN_SSE2
u16 get_VCO(void)
{
    register u16 vco;

    vco = 0x0000
      | (cf_ne[0xF % 8] << 0xF)
      | (cf_ne[0xE % 8] << 0xE)
      | (cf_ne[0xD % 8] << 0xD)
      | (cf_ne[0xC % 8] << 0xC)
      | (cf_ne[0xB % 8] << 0xB)
      | (cf_ne[0xA % 8] << 0xA)
      | (cf_ne[0x9 % 8] << 0x9)
      | (cf_ne[0x8 % 8] << 0x8)
      | (cf_co[0x7 % 8] << 0x7)
      | (cf_co[0x6 % 8] << 0x6)
      | (cf_co[0x5 % 8] << 0x5)
      | (cf_co[0x4 % 8] << 0x4)
      | (cf_co[0x3 % 8] << 0x3)
      | (cf_co[0x2 % 8] << 0x2)
      | (cf_co[0x1 % 8] << 0x1)
      | (cf_co[0x0 % 8] << 0x0);
    return (vco); /* Big endian becomes little. */
}

u16 get_VCC(void)
{
    register u16 vcc;

    vcc = 0x0000
      | (cf_clip[0xF % 8] << 0xF)
      | (cf_clip[0xE % 8] << 0xE)
      | (cf_clip[0xD % 8] << 0xD)
      | (cf_clip[0xC % 8] << 0xC)
      | (cf_clip[0xB % 8] << 0xB)
      | (cf_clip[0xA % 8] << 0xA)
      | (cf_clip[0x9 % 8] << 0x9)
      | (cf_clip[0x8 % 8] << 0x8)
      | (cf_comp[0x7 % 8] << 0x7)
      | (cf_comp[0x6 % 8] << 0x6)
      | (cf_comp[0x5 % 8] << 0x5)
      | (cf_comp[0x4 % 8] << 0x4)
      | (cf_comp[0x3 % 8] << 0x3)
      | (cf_comp[0x2 % 8] << 0x2)
      | (cf_comp[0x1 % 8] << 0x1)
      | (cf_comp[0x0 % 8] << 0x0);
    return (vcc); /* Big endian becomes little. */
}

u8 get_VCE(void)
{
    unsigned int result;
    register u8 vce;

    result = 0x00
      | (cf_vce[0x7] << 0x7)
      | (cf_vce[0x6] << 0x6)
      | (cf_vce[0x5] << 0x5)
      | (cf_vce[0x4] << 0x4)
      | (cf_vce[0x3] << 0x3)
      | (cf_vce[0x2] << 0x2)
      | (cf_vce[0x1] << 0x1)
      | (cf_vce[0x0] << 0x0);
    vce = (u8)(result & 0xFF);
    return (vce); /* Big endian becomes little. */
}

#else
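
/*
 * On SSE2 the same flattening takes three steps:  shift each lane's
 * Boolean from bit 0 up to bit 15, saturate-pack the two i16 vectors down
 * to one i8 vector (0x8000 clamps to 0x80, preserving the MSB), then let
 * PMOVMSKB gather the sixteen MSBs into the packed flag word.
 */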
u16 get_VCO(void)
{
    v16 xmm, hi, lo;
    register u16 vco;

    hi = _mm_load_si128((v16 *)cf_ne);
    lo = _mm_load_si128((v16 *)cf_co);

/*
 * Shift the Boolean in each lane from the LSB up to the MSB.
 */
    hi = _mm_slli_epi16(hi, 15);
    lo = _mm_slli_epi16(lo, 15);

    xmm = _mm_packs_epi16(lo, hi); /* Compress INT16 Booleans to INT8 ones. */
    vco = _mm_movemask_epi8(xmm) & 0x0000FFFF; /* PMOVMSKB combines each MSB. */
    return (vco);
}

u16 get_VCC(void)
{
    v16 xmm, hi, lo;
    register u16 vcc;

    hi = _mm_load_si128((v16 *)cf_clip);
    lo = _mm_load_si128((v16 *)cf_comp);

/*
 * Shift the Boolean in each lane from the LSB up to the MSB.
 */
    hi = _mm_slli_epi16(hi, 15);
    lo = _mm_slli_epi16(lo, 15);

    xmm = _mm_packs_epi16(lo, hi); /* Compress INT16 Booleans to INT8 ones. */
    vcc = _mm_movemask_epi8(xmm) & 0x0000FFFF; /* PMOVMSKB combines each MSB. */
    return (vcc);
}

u8 get_VCE(void)
{
    v16 xmm, hi, lo;
    register u8 vce;

    hi = _mm_setzero_si128();
    lo = _mm_load_si128((v16 *)cf_vce);

    lo = _mm_slli_epi16(lo, 15); /* Shift the Boolean from LSB to MSB. */

    xmm = _mm_packs_epi16(lo, hi); /* Compress INT16 Booleans to INT8 ones. */
    vce = _mm_movemask_epi8(xmm) & 0x000000FF; /* PMOVMSKB combines each MSB. */
    return (vce);
}
#endif
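
/*
 * The getters above flatten the per-lane Boolean arrays into the packed
 * control-register layout for CFC2 reads; the setters below scatter a
 * packed word written by CTC2 back out to one Boolean per lane.
 */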

/*
 * CTC2 resources
 * not sure how to vectorize going the other direction into SSE2
 * (a speculative, non-built sketch follows set_VCE at the end of the file)
 */
void set_VCO(u16 vco)
{
    register unsigned int i;

    for (i = 0; i < N; i++)
        cf_co[i] = (vco >> (i + 0x0)) & 1;
    for (i = 0; i < N; i++)
        cf_ne[i] = (vco >> (i + 0x8)) & 1;
    return; /* Little endian becomes big. */
}

void set_VCC(u16 vcc)
{
    register unsigned int i;

    for (i = 0; i < N; i++)
        cf_comp[i] = (vcc >> (i + 0x0)) & 1;
    for (i = 0; i < N; i++)
        cf_clip[i] = (vcc >> (i + 0x8)) & 1;
    return; /* Little endian becomes big. */
}

void set_VCE(u8 vce)
{
    register unsigned int i;

    for (i = 0; i < N; i++)
        cf_vce[i] = (vce >> i) & 1;
    return; /* Little endian becomes big. */
}
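
/*
 * A minimal sketch, assuming SSE2 and the types above, of how the scatter
 * in set_VCO might be vectorized:  broadcast the packed word to all eight
 * lanes, AND each lane with its own bit, and compare against that bit to
 * produce a 0/1 Boolean per lane.  Untested and not part of the build
 * (mirroring the `#if 0` convention used for pack.h above); `lane_bit` is
 * a name invented here for illustration only.
 */
#if 0
static const ALIGNED i16 lane_bit[N] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
};

void set_VCO(u16 vco)
{
    v16 bits, lo, hi;

    bits = _mm_load_si128((v16 *)lane_bit);

    lo = _mm_set1_epi16(vco & 0x00FF); /* low half:  carry/borrow lanes */
    lo = _mm_and_si128(lo, bits);
    lo = _mm_cmpeq_epi16(lo, bits);    /* 0xFFFF where the lane's bit is set */
    lo = _mm_srli_epi16(lo, 15);       /* normalize to Boolean 0 or 1 */
    _mm_store_si128((v16 *)cf_co, lo);

    hi = _mm_set1_epi16(vco >> 8);     /* high half:  "not equal" lanes */
    hi = _mm_and_si128(hi, bits);
    hi = _mm_cmpeq_epi16(hi, bits);
    hi = _mm_srli_epi16(hi, 15);
    _mm_store_si128((v16 *)cf_ne, hi);
}
#endif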