VFPU JIT: start setting up the infrastructure. Very incomplete. vdot works if undisabled, but prefix support is still missing.

Henrik Rydgard 2013-01-26 01:33:32 +01:00
parent 68991511ee
commit 2738417040
8 changed files with 215 additions and 23 deletions


@ -462,7 +462,9 @@ public:
// SSE/SSE2: Floating point bitwise (yes)
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
void ANDSS(X64Reg regOp, OpArg arg);
// I don't think these exist
/*
void ANDSD(X64Reg regOp, OpArg arg);
void ANDNSS(X64Reg regOp, OpArg arg);
void ANDNSD(X64Reg regOp, OpArg arg);
@ -470,6 +472,7 @@ public:
void ORSD(X64Reg regOp, OpArg arg);
void XORSS(X64Reg regOp, OpArg arg);
void XORSD(X64Reg regOp, OpArg arg);
*/
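// A typical use of these float bitwise ops (a sketch, not taken from this diff):
// clearing the sign bit with an AND mask gives fabs, and flipping it with XOR
// negates, e.g. ANDPS(xmm, M(&noSignMask)) / XORPS(xmm, M(&signBitLower)), using
// the masks the VFPU prefix code further down in this commit sets up.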
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
void ADDPS(X64Reg regOp, OpArg arg);


@ -490,7 +490,7 @@ const MIPSInstruction tableVFPU0[8] =
const MIPSInstruction tableVFPU1[8] =
{
INSTR("vmul",&Jit::Comp_Generic, Dis_VectorSet3, Int_VecDo3, IS_VFPU),
INSTR("vdot",&Jit::Comp_Generic, Dis_VectorDot, Int_VDot, IS_VFPU),
INSTR("vdot",&Jit::Comp_VDot, Dis_VectorDot, Int_VDot, IS_VFPU),
INSTR("vscl",&Jit::Comp_Generic, Dis_VScl, Int_VScl, IS_VFPU),
{-2},
INSTR("vhdp",&Jit::Comp_Generic, Dis_Generic, Int_VHdp, IS_VFPU),


@ -185,7 +185,9 @@ namespace MIPSComp
if ((doImm == &RType3_ImmAdd || doImm == &RType3_ImmOr) && (rs == 0 || rt == 0))
{
gpr.BindToRegister(rd, rd == rs || rd == rt, true);
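// With a zero operand, addu/or degenerates to a register copy; the added check
// below skips the MOV entirely when it would be a self-move.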
MOV(32, gpr.R(rd), gpr.R(rt == 0 ? rs : rt));
int rsource = rt == 0 ? rs : rt;
if (rsource != rd)
MOV(32, gpr.R(rd), gpr.R(rsource));
}
else
{


@ -23,6 +23,8 @@
#include "../MIPSVFPUUtils.h"
#include "RegCache.h"
// VERY UNFINISHED
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE.
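// As a rough sketch of the convention described above (the actual definitions are
// assumed, not part of this diff), the two macros would look something like:
//
//   #define CONDITIONAL_DISABLE ;
//   #define DISABLE { Comp_Generic(op); return; }
//
// i.e. DISABLE falls back to the interpreter-backed generic path, while
// CONDITIONAL_DISABLE is a no-op that can be redefined to DISABLE when bisecting
// a whole file.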
@ -45,6 +47,99 @@ using namespace Gen;
namespace MIPSComp
{
static const float one = 1.0f;
static const float minus_one = -1.0f;
static const float zero = 0.0f;
const u32 GC_ALIGNED16( noSignMask[4] ) = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
const u32 GC_ALIGNED16( signBitLower[4] ) = {0x80000000, 0, 0, 0};
void Jit::Comp_VPFX(u32 op)
{
int data = op & 0xFFFFF;
int regnum = (op >> 24) & 3;
switch (regnum) {
case 0: // S
js.prefixS = data;
js.prefixSKnown = true;
break;
case 1: // T
js.prefixT = data;
js.prefixTKnown = true;
break;
case 2: // D
js.prefixD = data;
js.prefixDKnown = true;
break;
}
// TODO: Defer this to end of block
MOV(32, M((void *)&mips_->vfpuCtrl[VFPU_CTRL_SPREFIX + regnum]), Imm32(data));
}
// TODO: There are register value ownership issues here. If we modify the source registers
// in place like this, we must be sure the modified values do NOT get written back!
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
if (prefix == 0xE4) return;
int n = GetNumVectorElements(sz);
u8 origV[4];
static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
for (int i = 0; i < n; i++)
{
origV[i] = vregs[i];
}
for (int i = 0; i < n; i++)
{
int regnum = (prefix >> (i*2)) & 3;
int abs = (prefix >> (8+i)) & 1;
int negate = (prefix >> (16+i)) & 1;
int constants = (prefix >> (12+i)) & 1;
if (!constants) {
vregs[i] = origV[regnum];
if (abs) {
ANDPS(fpr.VX(vregs[i]), M((void *)&noSignMask));
}
} else {
MOVSS(fpr.VX(vregs[i]), M((void *)&constantArray[regnum + (abs<<2)]));
}
if (negate)
XORPS(fpr.VX(vregs[i]), M((void *)&signBitLower));
}
}
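// For clarity, the per-lane value the code above tries to produce can be written
// as a scalar reference (a documentation sketch only, not emitted by the JIT; the
// helper name is made up):
//
//   static float PrefixLaneRef(const float origV[4], u32 prefix, int i) {
//       static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
//       int regnum    = (prefix >> (i * 2)) & 3;    // 2-bit source select per lane
//       int abs       = (prefix >> (8 + i)) & 1;    // abs flag, also selects the upper constant bank
//       int constants = (prefix >> (12 + i)) & 1;   // use a constant instead of a source lane
//       int negate    = (prefix >> (16 + i)) & 1;
//       float v = constants ? constantArray[regnum + (abs << 2)]
//                           : (abs ? fabsf(origV[regnum]) : origV[regnum]);
//       return negate ? -v : v;
//   }
//
// The identity prefix is 0xE4 (selects lanes 0,1,2,3 with no modifiers), which is
// why ApplyPrefixST returns early for it.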
void Jit::ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWriteMask) {
_assert_(js.prefixDKnown);
if (!prefix) return;
int n = GetNumVectorElements(sz);
for (int i = 0; i < n; i++)
{
int mask = (prefix >> (8 + i)) & 1;
js.writeMask[i] = mask ? true : false;
if (onlyWriteMask)
continue;
if (!mask) {
int sat = (prefix >> (i * 2)) & 3;
if (sat == 1)
{
MAXSS(fpr.VX(vregs[i]), M((void *)&zero));
MINSS(fpr.VX(vregs[i]), M((void *)&one));
}
else if (sat == 3)
{
MAXSS(fpr.VX(vregs[i]), M((void *)&minus_one));
MINSS(fpr.VX(vregs[i]), M((void *)&one));
}
}
}
}
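// Scalar reference for the two saturation modes emitted above (sketch only,
// ignoring the NaN behavior that MAXSS/MINSS pin down): sat == 1 clamps to [0, 1],
// sat == 3 clamps to [-1, 1], and masked lanes are skipped entirely.
//
//   static float SaturateRef(float v, int sat) {
//       if (sat == 1) return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v);
//       if (sat == 3) return v < -1.0f ? -1.0f : (v > 1.0f ? 1.0f : v);
//       return v;
//   }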
void Jit::Comp_SVQ(u32 op)
{
int imm = (signed short)(op&0xFFFC);
@ -58,32 +153,28 @@ void Jit::Comp_SVQ(u32 op)
if (!g_Config.bFastMemory) {
DISABLE;
}
fpr.Flush();
gpr.BindToRegister(rs, true, true);
u8 vregs[4];
GetVectorRegs(vregs, V_Quad, vt);
MOV(32, R(EAX), gpr.R(rs));
// Just copy 4 words the easiest way while not wasting registers.
#ifndef _M_X64
AND(32, R(EAX), Imm32(0x3FFFFFFF));
#endif
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
// MOVSS to prime any crazy cache mechanism that might assume that there's a float somewhere...
for (int i = 0; i < 4; i++) {
#ifdef _M_X64
MOVSS((X64Reg)(XMM0 + i), MComplex(RBX, EAX, 1, i * 4 + imm));
MOVSS(fpr.VX(vregs[i]), MComplex(RBX, EAX, 1, i * 4 + imm));
#else
MOVSS((X64Reg)(XMM0 + i), MDisp(EAX, (u32)(Memory::base + i * 4 + imm)));
MOVSS(fpr.VX(vregs[i]), MDisp(EAX, (u32)(Memory::base + i * 4 + imm)));
#endif
}
// It would be pretty nice to have these in registers for the next instruction...
// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
for (int i = 0; i < 4; i++) {
MOVSS(M((void *)&mips_->v[vregs[i]]), (X64Reg)(XMM0 + i));
}
gpr.UnlockAll();
fpr.ReleaseSpillLocks();
}
break;
@ -107,18 +198,18 @@ void Jit::Comp_SVQ(u32 op)
// It would be pretty nice to have these in registers for the next instruction...
// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
for (int i = 0; i < 4; i++) {
MOVSS((X64Reg)(XMM0 + i), M((void *)&mips_->v[vregs[i]]));
}
fpr.MapRegsV(vregs, V_Quad, 0);
for (int i = 0; i < 4; i++) {
#ifdef _M_X64
MOVSS(MComplex(RBX, EAX, 1, i * 4 + imm), (X64Reg)(XMM0 + i));
MOVSS(MComplex(RBX, EAX, 1, i * 4 + imm), fpr.VX(vregs[i]));
#else
MOVSS(MDisp(EAX, (u32)(Memory::base + i * 4 + imm)), (X64Reg)(XMM0 + i));
MOVSS(MDisp(EAX, (u32)(Memory::base + i * 4 + imm)), fpr.VX(vregs[i]));
#endif
}
fpr.ReleaseSpillLocks();
gpr.UnlockAll();
}
break;
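// Both the lv.q load and the sv.q store above use the same effective address.
// As a scalar sketch (assuming the usual fastmem layout, where RBX holds
// Memory::base on x64 and PSP addresses wrap at 0x3FFFFFFF):
//
//   const u8 *LaneAddress(u32 rsValue, int imm, int lane) {
//       return Memory::base + (rsValue & 0x3FFFFFFF) + imm + lane * 4;
//   }
//
// The mask is applied explicitly only on 32-bit builds; the x64 path relies on
// base register plus unmasked offset, which assumes a suitably large reserved
// address region.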
@ -129,4 +220,48 @@ void Jit::Comp_SVQ(u32 op)
}
}
void Jit::Comp_VDot(u32 op) {
DISABLE;
// WARNING: No prefix support!
int vd = _VD;
int vs = _VS;
int vt = _VT;
VectorSize sz = GetVecSize(op);
// TODO: Force read one of them into regs? probably not.
u8 sregs[4], tregs[4], dregs[4];
GetVectorRegs(sregs, sz, vs);
GetVectorRegs(tregs, sz, vt);
GetVectorRegs(dregs, sz, vd);
// TODO: applyprefixST here somehow (shuffle, etc...)
MOVSS(XMM0, fpr.V(sregs[0]));
MULSS(XMM0, fpr.V(tregs[0]));
// XMM0 now holds s[0]*t[0]; the loop below accumulates the remaining products into it.
int n = GetNumVectorElements(sz);
for (int i = 1; i < n; i++)
{
// sum += s[i]*t[i];
MOVSS(XMM1, fpr.V(sregs[i]));
MULSS(XMM1, fpr.V(tregs[i]));
ADDSS(XMM0, R(XMM1));
}
fpr.ReleaseSpillLocks();
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT | MAP_DIRTY);
// TODO: applyprefixD here somehow (write mask etc..)
MOVSS(fpr.V(dregs[0]), XMM0);
fpr.ReleaseSpillLocks();
js.EatPrefix();
}
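// What the emitted sequence is meant to compute, as a scalar reference (sketch
// only; prefixes are deliberately ignored to match the WARNING above, and
// MIPSState/v[] refer to the state object mips_ points at):
//
//   static float VDotRef(const MIPSState *mips, const u8 *sregs, const u8 *tregs, int n) {
//       float dot = mips->v[sregs[0]] * mips->v[tregs[0]];
//       for (int i = 1; i < n; i++)
//           dot += mips->v[sregs[i]] * mips->v[tregs[i]];
//       return dot;  // written to the single destination register dregs[0]
//   }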
}


@ -184,6 +184,7 @@ const u8 *Jit::DoJit(u32 em_address, JitBlock *b)
js.curBlock = b;
js.compiling = true;
js.inDelaySlot = false;
js.PrefixStart();
// We add a check before the block, used when entering from a linked block.
b->checkedEntry = GetCodePtr();


@ -55,6 +55,29 @@ struct JitState
int downcountAmount;
bool compiling; // TODO: get rid of this in favor of using analysis results to determine end of block
JitBlock *curBlock;
// VFPU prefix magic
u32 prefixS;
u32 prefixT;
u32 prefixD;
bool writeMask[4];
bool prefixSKnown;
bool prefixTKnown;
bool prefixDKnown;
void PrefixStart() {
prefixSKnown = false;
prefixTKnown = false;
prefixDKnown = false;
}
void EatPrefix() {
prefixSKnown = true;
prefixTKnown = true;
prefixDKnown = true;
prefixS = 0xE4;
prefixT = 0xE4;
prefixD = 0x0;
writeMask[0] = writeMask[1] = writeMask[2] = writeMask[3] = false;
}
};
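// A sketch of how a compiler function is expected to consume this state
// (illustrative only, not part of this commit): when the prefix is statically
// known it can be applied at compile time, otherwise the op has to fall back.
//
//   if (js.prefixSKnown)
//       ApplyPrefixST(sregs, js.prefixS, sz);   // 0xE4 is the identity, handled by an early return
//   else
//       DISABLE;                                // unknown prefix state, use the generic path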
enum CompileDelaySlotFlags
@ -111,6 +134,10 @@ public:
void Comp_mxc1(u32 op);
void Comp_SVQ(u32 op);
void Comp_VPFX(u32 op);
void Comp_VDot(u32 op);
void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz);
void ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWriteMask = false);
JitBlockCache *GetBlockCache() { return &blocks; }
AsmRoutineManager &Asm() { return asm_; }


@ -48,16 +48,32 @@ void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
if (p4 != 0xFF) regs[p4].locked = true;
}
void FPURegCache::SpillLockV(const u8 *v, VectorSize vsz) {
for (int i = 0; i < GetNumVectorElements(vsz); i++) {
vregs[i].locked = true;
void FPURegCache::SpillLockV(const u8 *v, VectorSize sz) {
for (int i = 0; i < GetNumVectorElements(sz); i++) {
vregs[v[i]].locked = true;
}
}
void FPURegCache::SpillLockV(int vec, VectorSize vsz) {
void FPURegCache::SpillLockV(int vec, VectorSize sz) {
u8 v[4];
GetVectorRegs(v, vsz, vec);
SpillLockV(v, vsz);
GetVectorRegs(v, sz, vec);
SpillLockV(v, sz);
}
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
u8 v[4];
GetVectorRegs(v, sz, vec);
SpillLockV(v, sz);
for (int i = 0; i < GetNumVectorElements(sz); i++) {
BindToRegister(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
}
}
void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
SpillLockV(v, sz);
for (int i = 0; i < GetNumVectorElements(sz); i++) {
BindToRegister(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
}
}
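// Typical usage of the new helpers, mirroring the lv.q path earlier in this
// commit (a usage sketch; the +32 offset reflects that the VFPU registers are
// laid out after the 32 FPRs in this combined register file):
//
//   u8 vregs[4];
//   GetVectorRegs(vregs, V_Quad, vt);
//   fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);  // spill-lock, then bind each lane
//   // ... emit code through fpr.VX(vregs[i]) / fpr.V(vregs[i]) ...
//   fpr.ReleaseSpillLocks();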
void FPURegCache::ReleaseSpillLocks() {
@ -85,6 +101,7 @@ void FPURegCache::BindToRegister(int i, bool doLoad, bool makeDirty) {
} else {
// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
xregs[RX(i)].dirty |= makeDirty;
_assert_msg_(DYNA_REC, regs[i].location.IsSimpleReg(), "not loaded and not simple.");
}
}


@ -47,6 +47,11 @@ struct MIPSCachedFPReg {
bool locked;
};
enum {
MAP_DIRTY = 1,
MAP_NOINIT = 2,
};
// The PSP has 160 FP registers: 32 FPRs + 128 VFPU registers.
// Soon we will support them all.
@ -89,6 +94,8 @@ public:
void SpillLock(int p1, int p2=0xff, int p3=0xff, int p4=0xff);
void ReleaseSpillLocks();
void MapRegsV(int vec, VectorSize vsz, int flags);
void MapRegsV(const u8 *v, VectorSize vsz, int flags);
void SpillLockV(const u8 *v, VectorSize vsz);
void SpillLockV(int vec, VectorSize vsz);