More armjit-fpu work - the dot product now works, for example. Also add some non-working code, left DISABLEd for now.

2025-04-02 11:01:50 -04:00 · 2013-02-16 02:06:02 +01:00 · 2013-02-16 02:06:02 +01:00 · b8abb77eee
commit b8abb77eee
parent 81589b67e5
11 changed files with 307 additions and 36 deletions
--- a/Core/HLE/sceKernelThread.cpp
+++ b/Core/HLE/sceKernelThread.cpp
@ -2802,7 +2802,7 @@ std::vector<DebugThreadInfo> GetThreadsInfo()
 		DebugThreadInfo info;
 		info.id = *iter;
 		strncpy(info.name,t->GetName(),KERNELOBJECT_MAX_NAME_LENGTH);
-		info.name[KERNELOBJECT_MAX_NAME_LENGTH+1] = 0;
+		info.name[KERNELOBJECT_MAX_NAME_LENGTH] = 0;
 		info.status = t->nt.status;
 		info.entrypoint = t->nt.entrypoint;
 		info.curPC = t->context.pc;
--- a/Core/MIPS/ARM/ArmAsm.cpp
+++ b/Core/MIPS/ARM/ArmAsm.cpp
@ -159,7 +159,7 @@ void Jit::GenerateFixedCode()
 			// MOV(R0, R13);
 			// QuickCallFunction(R1, (void *)&ShowPC);

-			LDR(R0, R10, offsetof(MIPSState, pc));
+			LDR(R0, CTXREG, offsetof(MIPSState, pc));
 			BIC(R0, R0, Operand2(0xC0, 4));   // &= 0x3FFFFFFF
 			LDR(R0, R11, R0, true, true);
 			AND(R1, R0, Operand2(0xFC, 4));   // rotation is to the right, in 2-bit increments.
--- a/Core/MIPS/ARM/ArmCompFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompFPU.cpp
@ -222,7 +222,7 @@ void Jit::Comp_mxc1(u32 op)
 	{
 	case 0: // R(rt) = FI(fs); break; //mfc1
 		// Let's just go through RAM for now.
-		fpr.FlushMipsReg(fs);
+		fpr.FlushR(fs);
 		gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
 		LDR(gpr.R(rt), CTXREG, fpr.GetMipsRegOffset(fs));
 		return;
@ -233,7 +233,7 @@ void Jit::Comp_mxc1(u32 op)

 	case 4: //FI(fs) = R(rt);	break; //mtc1
 		// Let's just go through RAM for now.
-		gpr.FlushMipsReg(rt);
+		gpr.FlushR(rt);
 		fpr.MapReg(fs, MAP_DIRTY | MAP_NOINIT);
 		VLDR(fpr.R(fs), CTXREG, gpr.GetMipsRegOffset(rt));
 		return;
--- a/Core/MIPS/ARM/ArmCompVFPU.cpp
+++ b/Core/MIPS/ARM/ArmCompVFPU.cpp
@ -42,9 +42,46 @@ namespace MIPSComp
 		}
 	}

+	void Jit::Comp_SV(u32 op) {
+		CONDITIONAL_DISABLE;
+
+		s32 imm = (signed short)(op&0xFFFC);
+		int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
+		int rs = _RS;
+
+		switch (op >> 26)
+		{
+		case 50: //lv.s  // VI(vt) = Memory::Read_U32(addr);
+			{
+				gpr.MapReg(rs);
+				SetR0ToEffectiveAddress(rs, imm);
+				ADD(R0, R0, R11);
+				fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
+				fpr.ReleaseSpillLocks();
+				VLDR(fpr.V(vt), R0, 0);
+			}
+			break;
+
+		case 58: //sv.s   // Memory::Write_U32(VI(vt), addr);
+			{
+				gpr.MapReg(rs);
+				SetR0ToEffectiveAddress(rs, imm);
+				ADD(R0, R0, R11);
+				fpr.MapRegV(vt);
+				fpr.ReleaseSpillLocks();
+				VSTR(fpr.V(vt), R0, 0);
+			}
+			break;
+
+
+		default:
+			DISABLE;
+		}
+	}
+
 	void Jit::Comp_SVQ(u32 op)
 	{
-		DISABLE;
+		CONDITIONAL_DISABLE;

 		int imm = (signed short)(op&0xFFFC);
 		int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
@ -56,11 +93,12 @@ namespace MIPSComp
 			{
 				gpr.MapReg(rs);
 				SetR0ToEffectiveAddress(rs, imm);
+				ADD(R0, R0, R11);
+
 				u8 vregs[4];
 				GetVectorRegs(vregs, V_Quad, vt);
 				fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
 				fpr.ReleaseSpillLocks();
-				// Just copy 4 words the easiest way while not wasting registers.
 				for (int i = 0; i < 4; i++)
 					VLDR(fpr.V(vregs[i]), R0, i * 4);
 			}
@ -68,17 +106,14 @@ namespace MIPSComp

 		case 62: //sv.q
 			{
-				DISABLE;
-
 				gpr.MapReg(rs);
 				SetR0ToEffectiveAddress(rs, imm);
+				ADD(R0, R0, R11);

 				u8 vregs[4];
 				GetVectorRegs(vregs, V_Quad, vt);
-				// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
 				fpr.MapRegsV(vregs, V_Quad, 0);
 				fpr.ReleaseSpillLocks();
-				// Just copy 4 words the easiest way while not wasting registers.
 				for (int i = 0; i < 4; i++)
 					VSTR(fpr.V(vregs[i]), R0, i * 4);
 			}
@ -92,25 +127,205 @@ namespace MIPSComp

 	void Jit::Comp_VDot(u32 op)
 	{
-		DISABLE;
+		// DISABLE;
+		CONDITIONAL_DISABLE;
+		// WARNING: No prefix support!
+		if (js.MayHavePrefix()) {
+			Comp_Generic(op);
+			js.EatPrefix();
+			return;
+		}
+
+		int vd = _VD;
+		int vs = _VS;
+		int vt = _VT;
+		VectorSize sz = GetVecSize(op);
+
+		// TODO: Force read one of them into regs? probably not.
+		u8 sregs[4], tregs[4];
+		GetVectorRegs(sregs, sz, vs);
+		GetVectorRegs(tregs, sz, vt);
+
+		// TODO: applyprefixST here somehow (shuffle, etc...)
+		fpr.MapRegsV(sregs, sz, 0);
+		fpr.MapRegsV(tregs, sz, 0);
+		VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));
+
+		int n = GetNumVectorElements(sz);
+		for (int i = 1; i < n; i++)
+		{
+			// sum += s[i]*t[i];
+			VMUL(S1, fpr.V(sregs[i]), fpr.V(tregs[i]));
+			VADD(S0, S0, S1);
+		}
+		fpr.ReleaseSpillLocks();
+
+		fpr.MapRegV(vd, MAP_NOINIT | MAP_DIRTY);
+
+		// TODO: applyprefixD here somehow (write mask etc..)
+		VMOV(fpr.V(vd), S0);
+
+		fpr.ReleaseSpillLocks();
+
+		js.EatPrefix();
 	}

 	void Jit::Comp_VecDo3(u32 op)
 	{
-		DISABLE;
+		DISABLE;  // Still buggy
+
+		// WARNING: No prefix support!
+		if (js.MayHavePrefix())
+		{
+			Comp_Generic(op);
+			js.EatPrefix();
+			return;
+		}
+
+		int vd = _VD;
+		int vs = _VS;
+		int vt = _VT;
+		VectorSize sz = GetVecSize(op);
+
+		u8 sregs[4], tregs[4], dregs[4];
+		GetVectorRegs(sregs, sz, vs);
+		GetVectorRegs(tregs, sz, vt);
+		GetVectorRegs(dregs, sz, vd);
+
+		void (ARMXEmitter::*triop)(ARMReg, ARMReg, ARMReg) = NULL;
+		switch (op >> 26)
+		{
+		case 24: //VFPU0
+			switch ((op >> 23)&7)
+			{
+			case 0: // d[i] = s[i] + t[i]; break; //vadd
+				triop = &ARMXEmitter::VADD;
+				break;
+			case 1: // d[i] = s[i] - t[i]; break; //vsub
+				triop = &ARMXEmitter::VSUB;
+				break;
+			case 7: // d[i] = s[i] / t[i]; break; //vdiv
+				triop = &ARMXEmitter::VDIV;
+				break;
+			}
+			break;
+		case 25: //VFPU1
+			switch ((op >> 23)&7)
+			{
+			case 0: // d[i] = s[i] * t[i]; break; //vmul
+				triop = &ARMXEmitter::VMUL;
+				break;
+			}
+			break;
+		}
+
+		if (triop == NULL)
+		{
+			Comp_Generic(op);
+			js.EatPrefix();
+			return;
+		}
+
+		int n = GetNumVectorElements(sz);
+		fpr.MapRegsV(sregs, sz, 0);
+		fpr.MapRegsV(tregs, sz, 0);
+		fpr.MapReg(TEMP1);
+		fpr.MapReg(TEMP2);
+		fpr.MapReg(TEMP3);
+
+		for (int i = 0; i < n; ++i) {
+			fpr.MapReg(TEMP0 + i);
+			(this->*triop)(fpr.R(TEMP0 + i), fpr.V(sregs[i]), fpr.V(tregs[i]));
+			fpr.ReleaseSpillLock(sregs[i]);
+			fpr.ReleaseSpillLock(tregs[i]);
+		}
+		fpr.MapRegsV(dregs, sz, MAP_DIRTY | MAP_NOINIT);
+		// TODO: Can avoid this when no overlap
+		for (int i = 0; i < n; i++) {
+			VMOV(fpr.V(dregs[i]), fpr.R(TEMP0 + i));
+		}
+		fpr.ReleaseSpillLocks();
+
+		js.EatPrefix();
 	}

 	void Jit::Comp_Mftv(u32 op)
 	{
-		DISABLE;
-	}
+		// DISABLE;
+		CONDITIONAL_DISABLE;

-	void Jit::Comp_SV(u32 op) {
-		DISABLE;
+		int imm = op & 0xFF;
+		int rt = _RT;
+		switch ((op >> 21) & 0x1f)
+		{
+		case 3: //mfv / mfvc
+			// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
+			if (rt != 0) {
+				if (imm < 128) {  //R(rt) = VI(imm);
+					fpr.FlushV(imm);
+					gpr.MapReg(rt, MAP_NOINIT | MAP_DIRTY);
+					LDR(gpr.R(rt), CTXREG, fpr.GetMipsRegOffsetV(imm));
+				} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc
+					DISABLE;
+					// In case we have a saved prefix.
+					//FlushPrefixV();
+					//gpr.BindToRegister(rt, false, true);
+					//MOV(32, gpr.R(rt), M(&currentMIPS->vfpuCtrl[imm - 128]));
+				} else {
+					//ERROR - maybe need to make this value too an "interlock" value?
+					_dbg_assert_msg_(CPU,0,"mfv - invalid register");
+				}
+			}
+			break;
+
+		case 7: //mtv
+			if (imm < 128) {
+				gpr.FlushR(rt);
+				fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
+				VLDR(fpr.V(imm), CTXREG, gpr.GetMipsRegOffset(rt));
+			} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
+				DISABLE; 
+				//gpr.BindToRegister(rt, true, false);
+				//MOV(32, M(&currentMIPS->vfpuCtrl[imm - 128]), gpr.R(rt));
+
+				// TODO: Optimization if rt is Imm?
+				//if (imm - 128 == VFPU_CTRL_SPREFIX) {
+				//js.prefixSFlag = JitState::PREFIX_UNKNOWN;
+				//} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
+				//	js.prefixTFlag = JitState::PREFIX_UNKNOWN;
+				//} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
+				//	js.prefixDFlag = JitState::PREFIX_UNKNOWN;
+				//}
+			} else {
+				//ERROR
+				_dbg_assert_msg_(CPU,0,"mtv - invalid register");
+			}
+			break;
+
+		default:
+			DISABLE;
+		}
 	}

 	void Jit::Comp_Vmtvc(u32 op) {
 		DISABLE;
+
+		int vs = _VS;
+		int imm = op & 0xFF;
+		if (imm >= 128 && imm < 128 + VFPU_CTRL_MAX) {
+			fpr.MapRegV(vs, 0);
+			ADD(R0, CTXREG, offsetof(MIPSState, vfpuCtrl[0]) + (imm - 128) * 4);
+			VSTR(fpr.V(vs), R0, 0);
+			fpr.ReleaseSpillLocks();
+
+			if (imm - 128 == VFPU_CTRL_SPREFIX) {
+				js.prefixSFlag = ArmJitState::PREFIX_UNKNOWN;
+			} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
+				js.prefixTFlag = ArmJitState::PREFIX_UNKNOWN;
+			} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
+				js.prefixDFlag = ArmJitState::PREFIX_UNKNOWN;
+			}
+		}
 	}

 }
--- a/Core/MIPS/ARM/ArmRegCache.cpp
+++ b/Core/MIPS/ARM/ArmRegCache.cpp
@ -180,12 +180,7 @@ void ArmRegCache::FlushArmReg(ARMReg r) {
 	ar[r].mipsReg = -1;
 }

-void ArmRegCache::FlushMipsReg(MIPSReg r) {
-	/*
-	if (r == 0) {
-		ERROR_LOG(JIT, "Flushing r0");
-		return;
-	}*/
+void ArmRegCache::FlushR(MIPSReg r) {
 	switch (mr[r].loc) {
 	case ML_IMM:
 		// IMM is always "dirty".
@ -219,7 +214,7 @@ void ArmRegCache::FlushMipsReg(MIPSReg r) {

 void ArmRegCache::FlushAll() {
 	for (int i = 0; i < NUM_MIPSREG; i++) {
-		FlushMipsReg(i);
+		FlushR(i);
 	}
 	// Sanity check
 	for (int i = 0; i < NUM_ARMREG; i++) {
--- a/Core/MIPS/ARM/ArmRegCache.h
+++ b/Core/MIPS/ARM/ArmRegCache.h
@ -93,7 +93,7 @@ public:
 	void MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad = true);
 	void MapDirtyDirtyInIn(MIPSReg rd1, MIPSReg rd2, MIPSReg rs, MIPSReg rt, bool avoidLoad = true);
 	void FlushArmReg(ARMReg r);
-	void FlushMipsReg(MIPSReg r);
+	void FlushR(MIPSReg r);

 	void FlushAll();

--- a/Core/MIPS/ARM/ArmRegCacheFPU.cpp
+++ b/Core/MIPS/ARM/ArmRegCacheFPU.cpp
@ -47,8 +47,19 @@ static const ARMReg *GetMIPSAllocationOrder(int &count) {
 	static const ARMReg allocationOrder[] = {
 		S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15
 	};
-	count = sizeof(allocationOrder) / sizeof(const int);
-	return allocationOrder;
+	// With NEON, we'll have many more.
+	static const ARMReg allocationOrderNEON[] = {
+		S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15,
+		S16, S17, S18, S19, S20, S21, S22, S23, S24, S25, S26, S27, S28, S29, S30, S31
+	};
+	bool useNEON = false;  // TODO: Use cpu detect
+	if (useNEON) {
+		count = sizeof(allocationOrderNEON) / sizeof(const int);
+		return allocationOrderNEON;
+	} else {
+		count = sizeof(allocationOrder) / sizeof(const int);
+		return allocationOrder;
+	}
 }

 ARMReg ArmRegCacheFPU::MapReg(MIPSReg mipsReg, int mapFlags) {
@ -79,7 +90,7 @@ allocate:
 			// That means it's free. Grab it, and load the value into it (if requested).
 			ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false;
 			if (!(mapFlags & MAP_NOINIT)) {
-				if (mr[mipsReg].loc == ML_MEM) {
+				if (mr[mipsReg].loc == ML_MEM && mipsReg < TEMP0) {
 					emit->VLDR((ARMReg)(reg + S0), CTXREG, GetMipsRegOffset(mipsReg));
 				}
 			}
@ -180,7 +191,6 @@ void ArmRegCacheFPU::FlushArmReg(ARMReg r) {
 		if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == ML_ARMREG)
 		{
 			//INFO_LOG(HLE, "Flushing ARM reg %i", reg);
-
 			emit->VSTR(r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));
 		}
 		// IMMs won't be in an ARM reg.
@ -193,7 +203,7 @@ void ArmRegCacheFPU::FlushArmReg(ARMReg r) {
 	ar[reg].mipsReg = -1;
 }

-void ArmRegCacheFPU::FlushMipsReg(MIPSReg r) {
+void ArmRegCacheFPU::FlushR(MIPSReg r) {
 	switch (mr[r].loc) {
 	case ML_IMM:
 		// IMM is always "dirty".
@ -203,7 +213,7 @@ void ArmRegCacheFPU::FlushMipsReg(MIPSReg r) {

 	case ML_ARMREG:
 		if (mr[r].reg == (int)INVALID_REG) {
-			ERROR_LOG(HLE, "FlushMipsReg: MipsReg had bad ArmReg");
+			ERROR_LOG(HLE, "FlushR: MipsReg had bad ArmReg");
 		}
 		if (ar[mr[r].reg].isDirty) {
 			//INFO_LOG(HLE, "Flushing dirty reg %i", mr[r].reg);
@ -225,9 +235,42 @@ void ArmRegCacheFPU::FlushMipsReg(MIPSReg r) {
 	mr[r].reg = (int)INVALID_REG;
 }

+void ArmRegCacheFPU::DiscardR(MIPSReg r) {
+	switch (mr[r].loc) {
+	case ML_IMM:
+		// IMM is always "dirty".
+		// IMM is not allowed for FP (yet).
+		ERROR_LOG(HLE, "Imm in FP register?");
+		break;
+
+	case ML_ARMREG:
+		if (mr[r].reg == (int)INVALID_REG) {
+			ERROR_LOG(HLE, "DiscardR: MipsReg had bad ArmReg");
+		}
+		// Note that we DO NOT write it back here. That's the whole point of Discard.
+		ar[mr[r].reg].isDirty = false;
+		ar[mr[r].reg].mipsReg = -1;
+		break;
+
+	case ML_MEM:
+		// Already there, nothing to do.
+		break;
+
+	default:
+		//BAD
+		break;
+	}
+	mr[r].loc = ML_MEM;
+	mr[r].reg = (int)INVALID_REG;
+}
+
 void ArmRegCacheFPU::FlushAll() {
+	// Discard temps!
+	for (int i = TEMP0; i < TEMP0 + NUM_TEMPS; i++) {
+		DiscardR(i);
+	}
 	for (int i = 0; i < NUM_MIPSFPUREG; i++) {
-		FlushMipsReg(i);
+		FlushR(i);
 	}
 	// Sanity check
 	for (int i = 0; i < NUM_ARMFPUREG; i++) {
@ -239,7 +282,7 @@ void ArmRegCacheFPU::FlushAll() {

 int ArmRegCacheFPU::GetMipsRegOffset(MIPSReg r) {
 	// These are offsets within the MIPSState structure. First there are the GPRS, then FPRS, then the "VFPURs".
-	if (r < 32 + 128)
+	if (r < 32 + 128 + NUM_TEMPS)
 		return (r + 32) << 2;
 	ERROR_LOG(JIT, "bad mips register %i", r);
 	return 0;  // or what?
--- a/Core/MIPS/ARM/ArmRegCacheFPU.h
+++ b/Core/MIPS/ARM/ArmRegCacheFPU.h
@ -28,7 +28,12 @@
 using namespace ArmGen;

 enum {
-	TOTAL_MAPPABLE_MIPSFPUREGS = 32 + 128,
+	NUM_TEMPS = 4,
+	TEMP0 = 32 + 128,
+	TEMP1 = TEMP0 + 1,
+	TEMP2 = TEMP0 + 2,
+	TEMP3 = TEMP0 + 3,
+	TOTAL_MAPPABLE_MIPSFPUREGS = 32 + 128 + NUM_TEMPS,
 };

 struct FPURegARM {
@ -59,6 +64,10 @@ public:
 	// it's being kept allocated.
 	void SpillLock(MIPSReg reg, MIPSReg reg2 = -1, MIPSReg reg3 = -1, MIPSReg reg4 = -1);
 	void ReleaseSpillLocks();
+	void ReleaseSpillLock(int mipsreg)
+	{
+		mr[mipsreg].spillLock = false;
+	}

 	void SetImm(MIPSReg reg, u32 immVal);
 	bool IsImm(MIPSReg reg) const;
@ -71,7 +80,10 @@ public:
 	void MapDirtyIn(MIPSReg rd, MIPSReg rs, bool avoidLoad = true);
 	void MapDirtyInIn(MIPSReg rd, MIPSReg rs, MIPSReg rt, bool avoidLoad = true);
 	void FlushArmReg(ARMReg r);
-	void FlushMipsReg(MIPSReg r);
+	void FlushR(MIPSReg r);
+	void FlushV(MIPSReg r) { FlushR(r + 32); }
+	void DiscardR(MIPSReg r);
+	void DiscardV(MIPSReg r) { DiscardR(r + 32);}

 	void FlushAll();

@ -81,7 +93,7 @@ public:
 	
 	ARMReg V(int vreg) { return R(vreg + 32); }

-	void MapRegV(int vreg, int flags);
+	void MapRegV(int vreg, int flags = 0);

 	// NOTE: These require you to release spill locks manually!
 	void MapRegsV(int vec, VectorSize vsz, int flags);
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -304,7 +304,7 @@ void Jit::Comp_VDot(u32 op) {
 	u8 sregs[4], tregs[4], dregs[4];
 	GetVectorRegs(sregs, sz, vs);
 	GetVectorRegs(tregs, sz, vt);
-	GetVectorRegs(dregs, sz, vd);
+	GetVectorRegs(dregs, V_Single, vd);

 	// TODO: applyprefixST here somehow (shuffle, etc...)

--- a/Core/MIPS/x86/RegCacheFPU.cpp
+++ b/Core/MIPS/x86/RegCacheFPU.cpp
@ -79,6 +79,11 @@ void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
 	}
 }

+void FPURegCache::ReleaseSpillLock(int mipsreg)
+{
+	regs[mipsreg].locked = false;
+}
+
 void FPURegCache::ReleaseSpillLocks() {
 	for (int i = 0; i < NUM_MIPS_FPRS; i++)
 		regs[i].locked = false;
--- a/Core/MIPS/x86/RegCacheFPU.h
+++ b/Core/MIPS/x86/RegCacheFPU.h
@ -95,6 +95,7 @@ public:

 	// Register locking. Prevents them from being spilled.
 	void SpillLock(int p1, int p2=0xff, int p3=0xff, int p4=0xff);
+	void ReleaseSpillLock(int mipsrega);
 	void ReleaseSpillLocks();

 	void MapRegV(int vreg, int flags);