VFPU JIT: start setting up the infrastructure. Very incomplete. vdot works if undisabled, but prefix support is still missing.

Henrik Rydgard 2013-01-26 01:33:32 +01:00
parent 68991511ee
commit 2738417040
8 changed files with 215 additions and 23 deletions


@ -462,7 +462,9 @@ public:
// SSE/SSE2: Floating point bitwise (yes)
void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
void ANDSS(X64Reg regOp, OpArg arg);
// I don't think these exist
/*
void ANDSD(X64Reg regOp, OpArg arg);
void ANDNSS(X64Reg regOp, OpArg arg);
void ANDNSD(X64Reg regOp, OpArg arg);
@ -470,6 +472,7 @@ public:
void ORSD(X64Reg regOp, OpArg arg);
void XORSS(X64Reg regOp, OpArg arg);
void XORSD(X64Reg regOp, OpArg arg);
*/
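// A typical use of these float bitwise ops (a sketch, not taken from this diff):
// clearing the sign bit with an AND mask gives fabs, and flipping it with XOR
// negates, e.g. ANDPS(xmm, M(&noSignMask)) / XORPS(xmm, M(&signBitLower)), using
// the masks the VFPU prefix code further down in this commit sets up.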
// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
void ADDPS(X64Reg regOp, OpArg arg);


@ -490,7 +490,7 @@ const MIPSInstruction tableVFPU0[8] =
const MIPSInstruction tableVFPU1[8] =
{
INSTR("vmul",&Jit::Comp_Generic, Dis_VectorSet3, Int_VecDo3, IS_VFPU),
INSTR("vdot",&Jit::Comp_Generic, Dis_VectorDot, Int_VDot, IS_VFPU),
INSTR("vdot",&Jit::Comp_VDot, Dis_VectorDot, Int_VDot, IS_VFPU),
INSTR("vscl",&Jit::Comp_Generic, Dis_VScl, Int_VScl, IS_VFPU),
{-2},
INSTR("vhdp",&Jit::Comp_Generic, Dis_Generic, Int_VHdp, IS_VFPU),


@ -185,7 +185,9 @@ namespace MIPSComp
if ((doImm == &RType3_ImmAdd || doImm == &RType3_ImmOr) && (rs == 0 || rt == 0))
{
gpr.BindToRegister(rd, rd == rs || rd == rt, true);
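// With a zero operand, addu/or degenerates to a register copy; the added check
// below skips the MOV entirely when it would be a self-move.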
MOV(32, gpr.R(rd), gpr.R(rt == 0 ? rs : rt));
int rsource = rt == 0 ? rs : rt;
if (rsource != rd)
MOV(32, gpr.R(rd), gpr.R(rsource));
}
else
{


@ -23,6 +23,8 @@
#include "../MIPSVFPUUtils.h"
#include "RegCache.h"
// VERY UNFINISHED
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
// Currently known non working ones should have DISABLE.
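// As a rough sketch of the convention described above (the actual definitions are
// assumed, not part of this diff), the two macros would look something like:
//
//   #define CONDITIONAL_DISABLE ;
//   #define DISABLE { Comp_Generic(op); return; }
//
// i.e. DISABLE falls back to the interpreter-backed generic path, while
// CONDITIONAL_DISABLE is a no-op that can be redefined to DISABLE when bisecting
// a whole file.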
@ -45,6 +47,99 @@ using namespace Gen;
namespace MIPSComp
{
static const float one = 1.0f;
static const float minus_one = -1.0f;
static const float zero = 0.0f;
const u32 GC_ALIGNED16( noSignMask[4] ) = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
const u32 GC_ALIGNED16( signBitLower[4] ) = {0x80000000, 0, 0, 0};
void Jit::Comp_VPFX(u32 op)
{
int data = op & 0xFFFFF;
int regnum = (op >> 24) & 3;
switch (regnum) {
case 0: // S
js.prefixS = data;
js.prefixSKnown = true;
break;
case 1: // T
js.prefixT = data;
js.prefixTKnown = true;
break;
case 2: // D
js.prefixD = data;
js.prefixDKnown = true;
break;
}
// TODO: Defer this to end of block
MOV(32, M((void *)&mips_->vfpuCtrl[VFPU_CTRL_SPREFIX + regnum]), Imm32(data));
}
// TODO: There are register value ownership issues here. If we modify the source registers
// in place like this, we must be sure the modified values do NOT get written back!
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
if (prefix == 0xE4) return;
int n = GetNumVectorElements(sz);
u8 origV[4];
static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
for (int i = 0; i < n; i++)
{
origV[i] = vregs[i];
}
for (int i = 0; i < n; i++)
{
int regnum = (prefix >> (i*2)) & 3;
int abs = (prefix >> (8+i)) & 1;
int negate = (prefix >> (16+i)) & 1;
int constants = (prefix >> (12+i)) & 1;
if (!constants) {
vregs[i] = origV[regnum];
if (abs) {
ANDPS(fpr.VX(vregs[i]), M((void *)&noSignMask));
}
} else {
MOVSS(fpr.VX(vregs[i]), M((void *)&constantArray[regnum + (abs<<2)]));
}
if (negate)
XORPS(fpr.VX(vregs[i]), M((void *)&signBitLower));
}
}
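// For clarity, the per-lane value the code above tries to produce can be written
// as a scalar reference (a documentation sketch only, not emitted by the JIT; the
// helper name is made up):
//
//   static float PrefixLaneRef(const float origV[4], u32 prefix, int i) {
//       static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
//       int regnum    = (prefix >> (i * 2)) & 3;    // 2-bit source select per lane
//       int abs       = (prefix >> (8 + i)) & 1;    // abs flag, also selects the upper constant bank
//       int constants = (prefix >> (12 + i)) & 1;   // use a constant instead of a source lane
//       int negate    = (prefix >> (16 + i)) & 1;
//       float v = constants ? constantArray[regnum + (abs << 2)]
//                           : (abs ? fabsf(origV[regnum]) : origV[regnum]);
//       return negate ? -v : v;
//   }
//
// The identity prefix is 0xE4 (selects lanes 0,1,2,3 with no modifiers), which is
// why ApplyPrefixST returns early for it.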
void Jit::ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWriteMask) {
_assert_(js.prefixDKnown);
if (!prefix) return;
int n = GetNumVectorElements(sz);
for (int i = 0; i < n; i++)
{
int mask = (prefix >> (8 + i)) & 1;
js.writeMask[i] = mask ? true : false;
if (onlyWriteMask)
continue;
if (!mask) {
int sat = (prefix >> (i * 2)) & 3;
if (sat == 1)
{
MAXSS(fpr.VX(vregs[i]), M((void *)&zero));
MINSS(fpr.VX(vregs[i]), M((void *)&one));
}
else if (sat == 3)
{
MAXSS(fpr.VX(vregs[i]), M((void *)&minus_one));
MINSS(fpr.VX(vregs[i]), M((void *)&one));
}
}
}
}
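// Scalar reference for the two saturation modes emitted above (sketch only,
// ignoring the NaN behavior that MAXSS/MINSS pin down): sat == 1 clamps to [0, 1],
// sat == 3 clamps to [-1, 1], and masked lanes are skipped entirely.
//
//   static float SaturateRef(float v, int sat) {
//       if (sat == 1) return v < 0.0f ? 0.0f : (v > 1.0f ? 1.0f : v);
//       if (sat == 3) return v < -1.0f ? -1.0f : (v > 1.0f ? 1.0f : v);
//       return v;
//   }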
void Jit::Comp_SVQ(u32 op)
{
int imm = (signed short)(op&0xFFFC);
@ -58,32 +153,28 @@ void Jit::Comp_SVQ(u32 op)
if (!g_Config.bFastMemory) {
DISABLE;
}
fpr.Flush();
gpr.BindToRegister(rs, true, true);
u8 vregs[4];
GetVectorRegs(vregs, V_Quad, vt);
MOV(32, R(EAX), gpr.R(rs));
// Just copy 4 words the easiest way while not wasting registers.
#ifndef _M_X64
AND(32, R(EAX), Imm32(0x3FFFFFFF));
#endif
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
// MOVSS to prime any crazy cache mechanism that might assume that there's a float somewhere...
for (int i = 0; i < 4; i++) {
#ifdef _M_X64
MOVSS((X64Reg)(XMM0 + i), MComplex(RBX, EAX, 1, i * 4 + imm));
MOVSS(fpr.VX(vregs[i]), MComplex(RBX, EAX, 1, i * 4 + imm));
#else
MOVSS((X64Reg)(XMM0 + i), MDisp(EAX, (u32)(Memory::base + i * 4 + imm)));
MOVSS(fpr.VX(vregs[i]), MDisp(EAX, (u32)(Memory::base + i * 4 + imm)));
#endif
}
// It would be pretty nice to have these in registers for the next instruction...
// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
for (int i = 0; i < 4; i++) {
MOVSS(M((void *)&mips_->v[vregs[i]]), (X64Reg)(XMM0 + i));
}
gpr.UnlockAll();
fpr.ReleaseSpillLocks();
}
break;
@ -107,18 +198,18 @@ void Jit::Comp_SVQ(u32 op)
// It would be pretty nice to have these in registers for the next instruction...
// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
for (int i = 0; i < 4; i++) {
MOVSS((X64Reg)(XMM0 + i), M((void *)&mips_->v[vregs[i]]));
}
fpr.MapRegsV(vregs, V_Quad, 0);
for (int i = 0; i < 4; i++) {
#ifdef _M_X64
MOVSS(MComplex(RBX, EAX, 1, i * 4 + imm), (X64Reg)(XMM0 + i));
MOVSS(MComplex(RBX, EAX, 1, i * 4 + imm), fpr.VX(vregs[i]));
#else
MOVSS(MDisp(EAX, (u32)(Memory::base + i * 4 + imm)), (X64Reg)(XMM0 + i));
MOVSS(MDisp(EAX, (u32)(Memory::base + i * 4 + imm)), fpr.VX(vregs[i]));
#endif
}
fpr.ReleaseSpillLocks();
gpr.UnlockAll();
}
break;
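// Both the lv.q load and the sv.q store above use the same effective address.
// As a scalar sketch (assuming the usual fastmem layout, where RBX holds
// Memory::base on x64 and PSP addresses wrap at 0x3FFFFFFF):
//
//   const u8 *LaneAddress(u32 rsValue, int imm, int lane) {
//       return Memory::base + (rsValue & 0x3FFFFFFF) + imm + lane * 4;
//   }
//
// The mask is applied explicitly only on 32-bit builds; the x64 path relies on
// base register plus unmasked offset, which assumes a suitably large reserved
// address region.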
@ -129,4 +220,48 @@ void Jit::Comp_SVQ(u32 op)
}
}
void Jit::Comp_VDot(u32 op) {
DISABLE;
// WARNING: No prefix support!
int vd = _VD;
int vs = _VS;
int vt = _VT;
VectorSize sz = GetVecSize(op);
// TODO: Force read one of them into regs? probably not.
u8 sregs[4], tregs[4], dregs[4];
GetVectorRegs(sregs, sz, vs);
GetVectorRegs(tregs, sz, vt);
GetVectorRegs(dregs, sz, vd);
// TODO: applyprefixST here somehow (shuffle, etc...)
MOVSS(XMM0, fpr.V(sregs[0]));
MULSS(XMM0, fpr.V(tregs[0]));
// XMM0 now holds s[0]*t[0]; the loop below accumulates the remaining products into it.
int n = GetNumVectorElements(sz);
for (int i = 1; i < n; i++)
{
// sum += s[i]*t[i];
MOVSS(XMM1, fpr.V(sregs[i]));
MULSS(XMM1, fpr.V(tregs[i]));
ADDSS(XMM0, R(XMM1));
}
fpr.ReleaseSpillLocks();
fpr.MapRegsV(dregs, V_Single, MAP_NOINIT | MAP_DIRTY);
// TODO: applyprefixD here somehow (write mask etc..)
MOVSS(fpr.V(dregs[0]), XMM0);
fpr.ReleaseSpillLocks();
js.EatPrefix();
}
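// What the emitted sequence is meant to compute, as a scalar reference (sketch
// only; prefixes are deliberately ignored to match the WARNING above, and
// MIPSState/v[] refer to the state object mips_ points at):
//
//   static float VDotRef(const MIPSState *mips, const u8 *sregs, const u8 *tregs, int n) {
//       float dot = mips->v[sregs[0]] * mips->v[tregs[0]];
//       for (int i = 1; i < n; i++)
//           dot += mips->v[sregs[i]] * mips->v[tregs[i]];
//       return dot;  // written to the single destination register dregs[0]
//   }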
}


@ -184,6 +184,7 @@ const u8 *Jit::DoJit(u32 em_address, JitBlock *b)
js.curBlock = b;
js.compiling = true;
js.inDelaySlot = false;
js.PrefixStart();
// We add a check before the block, used when entering from a linked block.
b->checkedEntry = GetCodePtr();


@ -55,6 +55,29 @@ struct JitState
int downcountAmount;
bool compiling; // TODO: get rid of this in favor of using analysis results to determine end of block
JitBlock *curBlock;
// VFPU prefix magic
u32 prefixS;
u32 prefixT;
u32 prefixD;
bool writeMask[4];
bool prefixSKnown;
bool prefixTKnown;
bool prefixDKnown;
void PrefixStart() {
prefixSKnown = false;
prefixTKnown = false;
prefixDKnown = false;
}
void EatPrefix() {
prefixSKnown = true;
prefixTKnown = true;
prefixDKnown = true;
prefixS = 0xE4;
prefixT = 0xE4;
prefixD = 0x0;
writeMask[0] = writeMask[1] = writeMask[2] = writeMask[3] = false;
}
};
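// A sketch of how a compiler function is expected to consume this state
// (illustrative only, not part of this commit): when the prefix is statically
// known it can be applied at compile time, otherwise the op has to fall back.
//
//   if (js.prefixSKnown)
//       ApplyPrefixST(sregs, js.prefixS, sz);   // 0xE4 is the identity, handled by an early return
//   else
//       DISABLE;                                // unknown prefix state, use the generic path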
enum CompileDelaySlotFlags
@ -111,6 +134,10 @@ public:
void Comp_mxc1(u32 op);
void Comp_SVQ(u32 op);
void Comp_VPFX(u32 op);
void Comp_VDot(u32 op);
void ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz);
void ApplyPrefixD(const u8 *vregs, u32 prefix, VectorSize sz, bool onlyWriteMask = false);
JitBlockCache *GetBlockCache() { return &blocks; }
AsmRoutineManager &Asm() { return asm_; }


@ -48,16 +48,32 @@ void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
if (p4 != 0xFF) regs[p4].locked = true;
}
void FPURegCache::SpillLockV(const u8 *v, VectorSize vsz) {
for (int i = 0; i < GetNumVectorElements(vsz); i++) {
vregs[i].locked = true;
void FPURegCache::SpillLockV(const u8 *v, VectorSize sz) {
for (int i = 0; i < GetNumVectorElements(sz); i++) {
vregs[v[i]].locked = true;
}
}
void FPURegCache::SpillLockV(int vec, VectorSize vsz) {
void FPURegCache::SpillLockV(int vec, VectorSize sz) {
u8 v[4];
GetVectorRegs(v, vsz, vec);
SpillLockV(v, vsz);
GetVectorRegs(v, sz, vec);
SpillLockV(v, sz);
}
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
u8 v[4];
GetVectorRegs(v, sz, vec);
SpillLockV(v, sz);
for (int i = 0; i < GetNumVectorElements(sz); i++) {
BindToRegister(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
}
}
void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
SpillLockV(v, sz);
for (int i = 0; i < GetNumVectorElements(sz); i++) {
BindToRegister(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0);
}
}
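// Typical usage of the new helpers, mirroring the lv.q path earlier in this
// commit (a usage sketch; the +32 offset reflects that the VFPU registers are
// laid out after the 32 FPRs in this combined register file):
//
//   u8 vregs[4];
//   GetVectorRegs(vregs, V_Quad, vt);
//   fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);  // spill-lock, then bind each lane
//   // ... emit code through fpr.VX(vregs[i]) / fpr.V(vregs[i]) ...
//   fpr.ReleaseSpillLocks();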
void FPURegCache::ReleaseSpillLocks() {
@ -85,6 +101,7 @@ void FPURegCache::BindToRegister(int i, bool doLoad, bool makeDirty) {
} else {
// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
xregs[RX(i)].dirty |= makeDirty;
_assert_msg_(DYNA_REC, regs[i].location.IsSimpleReg(), "not loaded and not simple.");
}
}


@ -47,6 +47,11 @@ struct MIPSCachedFPReg {
bool locked;
};
enum {
MAP_DIRTY = 1,
MAP_NOINIT = 2,
};
// The PSP has 160 FP registers: 32 FPRs + 128 VFPU registers.
// Soon we will support them all.
@ -89,6 +94,8 @@ public:
void SpillLock(int p1, int p2=0xff, int p3=0xff, int p4=0xff);
void ReleaseSpillLocks();
void MapRegsV(int vec, VectorSize vsz, int flags);
void MapRegsV(const u8 *v, VectorSize vsz, int flags);
void SpillLockV(const u8 *v, VectorSize vsz);
void SpillLockV(int vec, VectorSize vsz);