Merge pull request #2647 from unknownbrackets/jit-minor

Fix x64 jit bugs (memchecks and lwl/lwr/swl/swr)
This commit is contained in:
Henrik Rydgård 2013-07-06 09:35:12 -07:00
commit 8988d697e3
6 changed files with 133 additions and 61 deletions

View file

@ -559,6 +559,7 @@ void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(R14);
PUSH(R15);
PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
// TODO: XMM?
}
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {

View file

@ -57,6 +57,14 @@ public:
Shutdown();
}
void *ProtectFunction(void *function, int num_params);
// Entry point of the emitted stub that saves caller-saved registers.
const u8 *GetSaveRegsFunction() const { return save_regs; }
// Entry point of the emitted stub that restores caller-saved registers.
const u8 *GetLoadRegsFunction() const { return load_regs; }
private:
void Init();
void Shutdown();

View file

@ -104,25 +104,28 @@ namespace MIPSComp
void Jit::CompITypeMemUnpairedLR(u32 op, bool isStore)
{
// TODO: ECX getting overwritten? Why?
DISABLE;
CONDITIONAL_DISABLE;
int o = op>>26;
int offset = (signed short)(op&0xFFFF);
int rt = _RT;
int rs = _RS;
X64Reg shiftReg = ECX;
gpr.FlushLockX(ECX, EDX);
#ifdef _M_X64
// On x64, we need ECX for CL, but it's also the first arg and gets lost. Annoying.
gpr.FlushLockX(R9);
shiftReg = R9;
#endif
gpr.Lock(rt);
gpr.BindToRegister(rt, true, !isStore);
// Grab the offset from alignment for shifting (<< 3 for bytes -> bits.)
MOV(32, R(ECX), gpr.R(rs));
ADD(32, R(ECX), Imm32(offset));
AND(32, R(ECX), Imm32(3));
SHL(32, R(ECX), Imm8(3));
MOV(32, R(shiftReg), gpr.R(rs));
ADD(32, R(shiftReg), Imm32(offset));
AND(32, R(shiftReg), Imm32(3));
SHL(32, R(shiftReg), Imm8(3));
{
JitSafeMem safe(this, rs, offset, ~3);
@ -133,10 +136,10 @@ namespace MIPSComp
if (!src.IsSimpleReg(EAX))
MOV(32, R(EAX), src);
CompITypeMemUnpairedLRInner(op);
CompITypeMemUnpairedLRInner(op, shiftReg);
}
if (safe.PrepareSlowRead((void *) &Memory::Read_U32))
CompITypeMemUnpairedLRInner(op);
CompITypeMemUnpairedLRInner(op, shiftReg);
safe.Finish();
}
@ -156,37 +159,69 @@ namespace MIPSComp
gpr.UnlockAllX();
}
void Jit::CompITypeMemUnpairedLRInner(u32 op)
void Jit::CompITypeMemUnpairedLRInner(u32 op, X64Reg shiftReg)
{
CONDITIONAL_DISABLE;
int o = op>>26;
int rt = _RT;
// Make sure we have the shift for the target in ECX.
if (shiftReg != ECX)
MOV(32, R(ECX), R(shiftReg));
// Now use that shift (left on target, right on source.)
switch (o)
{
case 34: //lwl
// First clear the target bits.
MOV(32, R(EDX), Imm32(0x00ffffff));
SHR(32, R(EDX), R(CL));
AND(32, gpr.R(rt), R(EDX));
break;
// Adjust the shift to the bits we want.
case 38: //lwr
SHR(32, R(EAX), R(CL));
break;
case 42: //swl
MOV(32, R(EDX), Imm32(0xffffff00));
SHL(32, R(EDX), R(CL));
AND(32, R(EAX), R(EDX));
break;
case 46: //swr
MOV(32, R(EDX), gpr.R(rt));
SHL(32, R(EDX), R(CL));
// EDX is already the target value to write, but may be overwritten below. Save it.
PUSH(EDX);
break;
default:
_dbg_assert_msg_(JIT, 0, "Unsupported left/right load/store instruction.");
}
// Flip ECX around from 3 bytes / 24 bits.
if (shiftReg == ECX)
{
MOV(32, R(EDX), Imm32(24));
SUB(32, R(EDX), R(ECX));
MOV(32, R(ECX), R(EDX));
}
else
{
MOV(32, R(ECX), Imm32(24));
SUB(32, R(ECX), R(shiftReg));
}
// Use the flipped shift (left on source, right on target) and write target.
switch (o)
{
case 34: //lwl
SHL(32, R(EAX), R(CL));
OR(32, gpr.R(rt), R(EAX));
break;
case 38: //lwr
// Adjust the shift to the bits we want.
SHR(32, R(EAX), R(CL));
// Clear the target bits we're replacing.
MOV(32, R(EDX), Imm32(24));
SUB(32, R(EDX), R(ECX));
MOV(32, R(ECX), R(EDX));
MOV(32, R(EDX), Imm32(0xffffff00));
SHL(32, R(EDX), R(CL));
AND(32, gpr.R(rt), R(EDX));
@ -195,15 +230,6 @@ namespace MIPSComp
break;
case 42: //swl
// First clear the target memory bits.
MOV(32, R(EDX), Imm32(0xffffff00));
SHL(32, R(EDX), R(CL));
AND(32, R(EAX), R(EDX));
// Flip the shift, and adjust the shift in a temporary.
MOV(32, R(EDX), Imm32(24));
SUB(32, R(EDX), R(ECX));
MOV(32, R(ECX), R(EDX));
MOV(32, R(EDX), gpr.R(rt));
SHR(32, R(EDX), R(CL));
@ -211,19 +237,11 @@ namespace MIPSComp
break;
case 46: //swr
// Adjust the shift to the bits we want.
MOV(32, R(EDX), gpr.R(rt));
SHL(32, R(EDX), R(CL));
PUSH(EDX);
// Clear the target bits we're replacing.
MOV(32, R(EDX), Imm32(24));
SUB(32, R(EDX), R(ECX));
MOV(32, R(ECX), R(EDX));
MOV(32, R(EDX), Imm32(0x00ffffff));
SHR(32, R(EDX), R(CL));
AND(32, R(EAX), R(EDX));
// This is the target value we saved earlier.
POP(EDX);
OR(32, R(EDX), R(EAX));
break;

View file

@ -292,14 +292,17 @@ const u8 *Jit::DoJit(u32 em_address, JitBlock *b)
{
// TODO: Save/restore?
FlushAll();
CMP(32, M((void*)&coreState), Imm32(0));
FixupBranch skipCheck = J_CC(CC_E);
CMP(32, M((void*)&coreState), Imm32(CORE_RUNNING));
FixupBranch skipCheck1 = J_CC(CC_E);
CMP(32, M((void*)&coreState), Imm32(CORE_NEXTFRAME));
FixupBranch skipCheck2 = J_CC(CC_E);
if (js.afterOp & JitState::AFTER_REWIND_PC_BAD_STATE)
MOV(32, M(&mips_->pc), Imm32(js.compilerPC));
else
MOV(32, M(&mips_->pc), Imm32(js.compilerPC + 4));
WriteSyscallExit();
SetJumpTarget(skipCheck);
SetJumpTarget(skipCheck1);
SetJumpTarget(skipCheck2);
js.afterOp = JitState::AFTER_NONE;
}
@ -355,11 +358,14 @@ void Jit::WriteExit(u32 destination, int exit_num)
// If we need to verify coreState and rewind, we may not jump yet.
if (js.afterOp & (JitState::AFTER_CORE_STATE | JitState::AFTER_REWIND_PC_BAD_STATE))
{
CMP(32, M((void*)&coreState), Imm32(0));
FixupBranch skipCheck = J_CC(CC_E);
CMP(32, M((void*)&coreState), Imm32(CORE_RUNNING));
FixupBranch skipCheck1 = J_CC(CC_E);
CMP(32, M((void*)&coreState), Imm32(CORE_NEXTFRAME));
FixupBranch skipCheck2 = J_CC(CC_E);
MOV(32, M(&mips_->pc), Imm32(js.compilerPC));
WriteSyscallExit();
SetJumpTarget(skipCheck);
SetJumpTarget(skipCheck1);
SetJumpTarget(skipCheck2);
js.afterOp = JitState::AFTER_NONE;
}
@ -392,11 +398,14 @@ void Jit::WriteExitDestInEAX()
// If we need to verify coreState and rewind, we may not jump yet.
if (js.afterOp & (JitState::AFTER_CORE_STATE | JitState::AFTER_REWIND_PC_BAD_STATE))
{
CMP(32, M((void*)&coreState), Imm32(0));
FixupBranch skipCheck = J_CC(CC_E);
CMP(32, M((void*)&coreState), Imm32(CORE_RUNNING));
FixupBranch skipCheck1 = J_CC(CC_E);
CMP(32, M((void*)&coreState), Imm32(CORE_NEXTFRAME));
FixupBranch skipCheck2 = J_CC(CC_E);
MOV(32, M(&mips_->pc), Imm32(js.compilerPC));
WriteSyscallExit();
SetJumpTarget(skipCheck);
SetJumpTarget(skipCheck1);
SetJumpTarget(skipCheck2);
js.afterOp = JitState::AFTER_NONE;
}
@ -418,13 +427,13 @@ void Jit::WriteExitDestInEAX()
SetJumpTarget(tooLow);
SetJumpTarget(tooHigh);
ABI_CallFunctionA(thunks.ProtectFunction((void *) Memory::GetPointer, 1), R(EAX));
CallProtectedFunction((void *) Memory::GetPointer, R(EAX));
CMP(32, R(EAX), Imm32(0));
FixupBranch skip = J_CC(CC_NE);
// TODO: "Ignore" this so other threads can continue?
if (g_Config.bIgnoreBadMemAccess)
ABI_CallFunctionA(thunks.ProtectFunction((void *) Core_UpdateState, 1), Imm32(CORE_ERROR));
CallProtectedFunction((void *) Core_UpdateState, Imm32(CORE_ERROR));
SUB(32, M(&currentMIPS->downcount), Imm32(0));
JMP(asm_.dispatcherCheckCoreState, true);
@ -607,7 +616,6 @@ OpArg Jit::JitSafeMem::PrepareMemoryOpArg(ReadType type)
jit_->SUB(32, R(xaddr_), Imm32(offset_));
}
#ifdef _M_IX86
return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
@ -657,7 +665,7 @@ void Jit::JitSafeMem::DoSlowWrite(void *safeFunc, const OpArg src, int suboffset
jit_->AND(32, R(EAX), Imm32(alignMask_));
}
jit_->ABI_CallFunctionAA(jit_->thunks.ProtectFunction(safeFunc, 2), src, R(EAX));
jit_->CallProtectedFunction(safeFunc, src, R(EAX));
needsCheck_ = true;
}
@ -680,7 +688,7 @@ bool Jit::JitSafeMem::PrepareSlowRead(void *safeFunc)
jit_->AND(32, R(EAX), Imm32(alignMask_));
}
jit_->ABI_CallFunctionA(jit_->thunks.ProtectFunction(safeFunc, 1), R(EAX));
jit_->CallProtectedFunction(safeFunc, R(EAX));
needsCheck_ = true;
return true;
}
@ -710,7 +718,7 @@ void Jit::JitSafeMem::NextSlowRead(void *safeFunc, int suboffset)
jit_->AND(32, R(EAX), Imm32(alignMask_));
}
jit_->ABI_CallFunctionA(jit_->thunks.ProtectFunction(safeFunc, 1), R(EAX));
jit_->CallProtectedFunction(safeFunc, R(EAX));
}
bool Jit::JitSafeMem::ImmValid()
@ -755,7 +763,7 @@ void Jit::JitSafeMem::MemCheckImm(ReadType type)
return;
jit_->MOV(32, M(&jit_->mips_->pc), Imm32(jit_->js.compilerPC));
jit_->ABI_CallFunctionCCC(jit_->thunks.ProtectFunction((void *)&JitMemCheck, 3), iaddr_, size_, type == MEM_WRITE ? 1 : 0);
jit_->CallProtectedFunction((void *)&JitMemCheck, iaddr_, size_, type == MEM_WRITE ? 1 : 0);
jit_->CMP(32, M((void*)&coreState), Imm32(0));
skipChecks_.push_back(jit_->J_CC(CC_NE, true));
@ -790,11 +798,14 @@ void Jit::JitSafeMem::MemCheckAsm(ReadType type)
skipNext = jit_->J_CC(CC_NE);
}
jit_->PUSH(xaddr_);
// Keep the stack 16-byte aligned, just PUSH/POP 4 times.
for (int i = 0; i < 4; ++i)
jit_->PUSH(xaddr_);
jit_->MOV(32, M(&jit_->mips_->pc), Imm32(jit_->js.compilerPC));
jit_->ADD(32, R(xaddr_), Imm32(offset_));
jit_->ABI_CallFunctionACC(jit_->thunks.ProtectFunction((void *)&JitMemCheck, 3), R(xaddr_), size_, type == MEM_WRITE ? 1 : 0);
jit_->POP(xaddr_);
jit_->CallProtectedFunction((void *)&JitMemCheck, R(xaddr_), size_, type == MEM_WRITE ? 1 : 0);
for (int i = 0; i < 4; ++i)
jit_->POP(xaddr_);
jit_->SetJumpTarget(skipNext);
if (it->end != 0)
@ -809,6 +820,34 @@ void Jit::JitSafeMem::MemCheckAsm(ReadType type)
}
}
void Jit::CallProtectedFunction(void *func, const OpArg &arg1)
{
// We don't regcache RCX, so the below is safe (and also faster, maybe branch prediction?)
ABI_CallFunctionA(thunks.ProtectFunction(func, 1), arg1);
}
void Jit::CallProtectedFunction(void *func, const OpArg &arg1, const OpArg &arg2)
{
// We don't regcache RCX/RDX, so the below is safe (and also faster, maybe branch prediction?)
ABI_CallFunctionAA(thunks.ProtectFunction(func, 2), arg1, arg2);
}
// Emits a three-constant-argument call, bracketed by the save/load-regs
// stubs because on x64 the third arg uses R8, which is caller saved.
void Jit::CallProtectedFunction(void *func, const u32 arg1, const u32 arg2, const u32 arg3)
{
	void *saveStub = (void *)thunks.GetSaveRegsFunction();
	void *loadStub = (void *)thunks.GetLoadRegsFunction();

	ABI_CallFunction(saveStub);
	ABI_CallFunctionCCC(func, arg1, arg2, arg3);
	ABI_CallFunction(loadStub);
}
// Emits a call taking an OpArg plus two constants, bracketed by the
// save/load-regs stubs because on x64 the third arg uses R8, which is
// caller saved.
void Jit::CallProtectedFunction(void *func, const OpArg &arg1, const u32 arg2, const u32 arg3)
{
	void *saveStub = (void *)thunks.GetSaveRegsFunction();
	void *loadStub = (void *)thunks.GetLoadRegsFunction();

	ABI_CallFunction(saveStub);
	ABI_CallFunctionACC(func, arg1, arg2, arg3);
	ABI_CallFunction(loadStub);
}
void Jit::Comp_DoNothing(u32 op) { }
} // namespace

View file

@ -275,11 +275,16 @@ private:
void CompITypeMemRead(u32 op, u32 bits, void (XEmitter::*mov)(int, int, X64Reg, OpArg), void *safeFunc);
void CompITypeMemWrite(u32 op, u32 bits, void *safeFunc);
void CompITypeMemUnpairedLR(u32 op, bool isStore);
void CompITypeMemUnpairedLRInner(u32 op);
void CompITypeMemUnpairedLRInner(u32 op, X64Reg shiftReg);
void CompFPTriArith(u32 op, void (XEmitter::*arith)(X64Reg reg, OpArg), bool orderMatters);
void CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN = false);
void CallProtectedFunction(void *func, const OpArg &arg1);
void CallProtectedFunction(void *func, const OpArg &arg1, const OpArg &arg2);
void CallProtectedFunction(void *func, const u32 arg1, const u32 arg2, const u32 arg3);
void CallProtectedFunction(void *func, const OpArg &arg1, const u32 arg2, const u32 arg3);
JitBlockCache blocks;
JitOptions jo;
JitState js;

View file

@ -26,15 +26,16 @@ using namespace Gen;
static const int allocationOrder[] =
{
// R12, when used as base register, for example in a LEA, can generate bad code! Need to look into this.
// R12, when used as base register, for example in a LEA, can generate bad code! Need to look into this.
// On x64, RCX and RDX are the first args. CallProtectedFunction() assumes they're not regcached.
#ifdef _M_X64
#ifdef _WIN32
RSI, RDI, R13, R14, R8, R9, R10, R11, R12, //, RCX
RSI, RDI, R13, R14, R8, R9, R10, R11, R12,
#else
RBP, R13, R14, R8, R9, R10, R11, R12, //, RCX
RBP, R13, R14, R8, R9, R10, R11, R12,
#endif
#elif _M_IX86
ESI, EDI, EBP, EDX, ECX, // Let's try to free up EBX as well.
ESI, EDI, EBP, EDX, ECX, // Let's try to free up EBX as well.
#endif
};