More RIP elimination

2025-04-02 11:01:50 -04:00 · 2017-07-05 15:06:44 +02:00 · 2017-07-05 15:06:44 +02:00 · 7c3b37c561
commit 7c3b37c561
parent 7c1ae5b3e6
8 changed files with 97 additions and 65 deletions
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@ -1073,7 +1073,7 @@ public:
 class XCodeBlock : public CodeBlock<XEmitter> {
 public:
 	void PoisonMemory(int offset) override;
-	bool RipAccessible(void *ptr) const {
+	bool RipAccessible(const void *ptr) const {
 #ifdef _M_IX86
 		return true;
 #else
--- a/Core/MIPS/MIPS.h
+++ b/Core/MIPS/MIPS.h
@ -232,6 +232,8 @@ public:
 	u32 intBranchExit;
 	u32 jitBranchExit;

+	u32 savedPC;
+
 	static const u32 FCR0_VALUE = 0x00003351;

 #if defined(PPSSPP_ARCH_X86) || defined(PPSSPP_ARCH_AMD64)
--- a/Core/MIPS/x86/Asm.cpp
+++ b/Core/MIPS/x86/Asm.cpp
@ -114,15 +114,10 @@ void Jit::GenerateFixedCode(JitOptions &jo) {
 		// This is the most common situation.
 		TEST(32, MIPSSTATE_VAR(fcr31), Imm32(0x01000003));
 		FixupBranch skip = J_CC(CC_Z);
-#ifdef _M_X64
 		// TODO: Move the hasSetRounding flag somewhere we can reach it through the context pointer, or something.
-		MOV(64, R(RAX), Imm64((uintptr_t)&js.hasSetRounding));
+		MOV(PTRBITS, R(RAX), ImmPtr(&js.hasSetRounding));
 		MOV(8, MatR(RAX), Imm8(1));
-#else
-		MOV(8, M(&js.hasSetRounding), Imm8(1));
-#endif
 		SetJumpTarget(skip);
-
 		RET();
 	}

@ -153,7 +148,12 @@ void Jit::GenerateFixedCode(JitOptions &jo) {
 		FixupBranch bailCoreState = J_CC(CC_S, true);

 		SetJumpTarget(skipToCoreStateCheck);
-		CMP(32, M(&coreState), Imm32(0));
+		if (RipAccessible((const void *)&coreState)) {
+			CMP(32, M(&coreState), Imm32(0));
+		} else {
+			MOV(PTRBITS, R(RAX), ImmPtr((const void *)&coreState));
+			CMP(32, MatR(RAX), Imm32(0));
+		}
 		FixupBranch badCoreState = J_CC(CC_NZ, true);
 		FixupBranch skipToRealDispatch2 = J(); //skip the sync and compare first time

@ -210,7 +210,12 @@ void Jit::GenerateFixedCode(JitOptions &jo) {
 		SetJumpTarget(bail);
 		SetJumpTarget(bailCoreState);

-		CMP(32, M(&coreState), Imm32(0));
+		if (RipAccessible((const void *)&coreState)) {
+			CMP(32, M(&coreState), Imm32(0));
+		} else {
+			MOV(PTRBITS, R(RAX), ImmPtr((const void *)&coreState));
+			CMP(32, MatR(RAX), Imm32(0));
+		}
 		J_CC(CC_Z, outerLoop, true);

 	SetJumpTarget(badCoreState);
--- a/Core/MIPS/x86/CompBranch.cpp
+++ b/Core/MIPS/x86/CompBranch.cpp
@ -658,8 +658,6 @@ void Jit::Comp_Jump(MIPSOpcode op) {
 	js.compiling = false;
 }

-static u32 savedPC;
-
 void Jit::Comp_JumpReg(MIPSOpcode op)
 {
 	CONDITIONAL_LOG;
@ -725,21 +723,18 @@ void Jit::Comp_JumpReg(MIPSOpcode op)
 			MOV(32, R(EAX), gpr.R(rs));
 		}
 		FlushAll();
-	}
-	else
-	{
+	} else {
 		// Latch destination now - save it in memory.
 		gpr.MapReg(rs, true, false);
-		MOV(32, M(&savedPC), gpr.R(rs));
+		MOV(32, MIPSSTATE_VAR(savedPC), gpr.R(rs));
 		if (andLink)
 			gpr.SetImm(rd, GetCompilerPC() + 8);
 		CompileDelaySlot(DELAYSLOT_NICE);
-		MOV(32, R(EAX), M(&savedPC));
+		MOV(32, R(EAX), MIPSSTATE_VAR(savedPC));
 		FlushAll();
 	}

-	switch (op & 0x3f)
-	{
+	switch (op & 0x3f) {
 	case 8: //jr
 		break;
 	case 9: //jalr
--- a/Core/MIPS/x86/CompFPU.cpp
+++ b/Core/MIPS/x86/CompFPU.cpp
@ -98,8 +98,6 @@ void Jit::Comp_FPU3op(MIPSOpcode op) {
 	}
 }

-static u32 MEMORY_ALIGNED16(ssLoadStoreTemp);
-
 void Jit::Comp_FPULS(MIPSOpcode op) {
 	CONDITIONAL_DISABLE;
 	s32 offset = _IMM16;
@ -137,8 +135,8 @@ void Jit::Comp_FPULS(MIPSOpcode op) {
 				MOVSS(dest, fpr.RX(ft));
 			if (safe.PrepareSlowWrite())
 			{
-				MOVSS(M(&ssLoadStoreTemp), fpr.RX(ft));
-				safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp));
+				MOVSS(MIPSSTATE_VAR(temp), fpr.RX(ft));
+				safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp));
 			}
 			safe.Finish();

--- a/Core/MIPS/x86/CompReplace.cpp
+++ b/Core/MIPS/x86/CompReplace.cpp
@ -32,7 +32,8 @@ int Jit::Replace_fabsf() {
 	fpr.SpillLock(0, 12);
 	fpr.MapReg(0, false, true);
 	MOVSS(fpr.RX(0), fpr.R(12));
-	ANDPS(fpr.RX(0), M(&ssNoSignMask));
+	MOV(PTRBITS, R(RAX), ImmPtr(&ssNoSignMask));
+	ANDPS(fpr.RX(0), MatR(RAX));
 	fpr.ReleaseSpillLocks();
 	return 4;  // Number of instructions in the MIPS function
 }
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@ -15,6 +15,9 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

+// Table 13.10 in http://agner.org/optimize/optimizing_assembly.pdf is cool - generate constants with
+// short instruction sequences. Surprisingly many are possible.
+
 #include "ppsspp_config.h"
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

@ -66,9 +69,6 @@ const u32 MEMORY_ALIGNED16( noSignMask[4] ) = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFF
 const u32 MEMORY_ALIGNED16( signBitAll[4] ) = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
 const u32 MEMORY_ALIGNED16( signBitLower[4] ) = {0x80000000, 0, 0, 0};
 const float MEMORY_ALIGNED16( oneOneOneOne[4] ) = {1.0f, 1.0f, 1.0f, 1.0f};
-const u32 MEMORY_ALIGNED16( solidOnes[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
-const u32 MEMORY_ALIGNED16( lowOnes[4] ) = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000};
-const u32 MEMORY_ALIGNED16( lowZeroes[4] ) = {0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF};
 const u32 MEMORY_ALIGNED16( fourinfnan[4] ) = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
 const float MEMORY_ALIGNED16( identityMatrix[4][4]) = { { 1.0f, 0, 0, 0 }, { 0, 1.0f, 0, 0 }, { 0, 0, 1.0f, 0 }, { 0, 0, 0, 1.0f} };

@ -176,13 +176,15 @@ void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
 			ANDNPS(XMM0, fpr.V(vregs[i]));

 			// Retain a NAN in XMM0 (must be second operand.)
-			MOVSS(fpr.VX(vregs[i]), M(&one));
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+			MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
 			MINSS(fpr.VX(vregs[i]), R(XMM0));
 		} else if (sat == 3) {
 			fpr.MapRegV(vregs[i], MAP_DIRTY);

 			// Check for < -1.0f, but careful of NANs.
-			MOVSS(XMM1, M(&minus_one));
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
+			MOVSS(XMM1, MatR(TEMPREG));
 			MOVSS(R(XMM0), fpr.VX(vregs[i]));
 			CMPLESS(XMM0, R(XMM1));
 			// If it was NOT less, the three ops below do nothing.
@ -192,7 +194,8 @@ void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
 			ORPS(XMM0, R(XMM1));

 			// Retain a NAN in XMM0 (must be second operand.)
-			MOVSS(fpr.VX(vregs[i]), M(&one));
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+			MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
 			MINSS(fpr.VX(vregs[i]), R(XMM0));
 		}
 	}
@ -236,12 +239,10 @@ void Jit::Comp_SV(MIPSOpcode op) {
 			JitSafeMem safe(this, rs, imm);
 			safe.SetFar();
 			OpArg src;
-			if (safe.PrepareRead(src, 4))
-			{
+			if (safe.PrepareRead(src, 4)) {
 				MOVSS(fpr.VX(vt), safe.NextFastAddress(0));
 			}
-			if (safe.PrepareSlowRead(safeMemFuncs.readU32))
-			{
+			if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
 				MOVD_xmm(fpr.VX(vt), R(EAX));
 			}
 			safe.Finish();
@ -260,14 +261,12 @@ void Jit::Comp_SV(MIPSOpcode op) {
 			JitSafeMem safe(this, rs, imm);
 			safe.SetFar();
 			OpArg dest;
-			if (safe.PrepareWrite(dest, 4))
-			{
+			if (safe.PrepareWrite(dest, 4)) {
 				MOVSS(safe.NextFastAddress(0), fpr.VX(vt));
 			}
-			if (safe.PrepareSlowWrite())
-			{
-				MOVSS(M(&ssLoadStoreTemp), fpr.VX(vt));
-				safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp), 0);
+			if (safe.PrepareSlowWrite()) {
+				MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vt));
+				safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), 0);
 			}
 			safe.Finish();

@ -453,9 +452,9 @@ void Jit::Comp_SVQ(MIPSOpcode op) {
 				if (safe.PrepareSlowWrite()) {
 					MOVAPS(XMM0, fpr.VS(vregs));
 					for (int i = 0; i < 4; i++) {
-						MOVSS(M(&ssLoadStoreTemp), XMM0);
+						MOVSS(MIPSSTATE_VAR(temp), XMM0);
 						SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
-						safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp), i * 4);
+						safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
 					}
 				}
 				safe.Finish();
@ -476,8 +475,8 @@ void Jit::Comp_SVQ(MIPSOpcode op) {
 			}
 			if (safe.PrepareSlowWrite()) {
 				for (int i = 0; i < 4; i++) {
-					MOVSS(M(&ssLoadStoreTemp), fpr.VX(vregs[i]));
-					safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp), i * 4);
+					MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vregs[i]));
+					safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
 				}
 			}
 			safe.Finish();
@ -508,7 +507,12 @@ void Jit::Comp_VVectorInit(MIPSOpcode op) {
 		if (type == 6) {
 			XORPS(fpr.VSX(dregs), fpr.VS(dregs));
 		} else if (type == 7) {
-			MOVAPS(fpr.VSX(dregs), M(&oneOneOneOne));
+			if (RipAccessible(&oneOneOneOne)) {
+				MOVAPS(fpr.VSX(dregs), M(&oneOneOneOne));
+			} else {
+				MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
+				MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
+			}
 		} else {
 			DISABLE;
 		}
@ -522,7 +526,12 @@ void Jit::Comp_VVectorInit(MIPSOpcode op) {
 		XORPS(XMM0, R(XMM0));
 		break;
 	case 7: // v=ones; break;   //vone
-		MOVSS(XMM0, M(&one));
+		if (RipAccessible(&one)) {
+			MOVSS(XMM0, M(&one));
+		} else {
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+			MOVSS(XMM0, MatR(TEMPREG));
+		}
 		break;
 	default:
 		DISABLE;
@ -558,7 +567,12 @@ void Jit::Comp_VIdt(MIPSOpcode op) {
 	}

 	XORPS(XMM0, R(XMM0));
-	MOVSS(XMM1, M(&one));
+	if (RipAccessible(&one)) {
+		MOVSS(XMM1, M(&one));
+	} else {
+		MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+		MOVSS(XMM1, MatR(TEMPREG));
+	}
 	fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
 	switch (sz) {
 	case V_Pair:
@ -1426,13 +1440,16 @@ void Jit::Comp_Vcmp(MIPSOpcode op) {

 		// Finalize the comparison for ES/NS.
 		if (cond == VC_ES || cond == VC_NS) {
-			ANDPS(XMM0, M(&fourinfnan));
-			PCMPEQD(XMM0, M(&fourinfnan));  // Integer comparison
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
+			ANDPS(XMM0, MatR(TEMPREG));
+			PCMPEQD(XMM0, MatR(TEMPREG));  // Integer comparison
 			// It's inversed below for NS.
 		}

 		if (inverse) {
-			XORPS(XMM0, M(&solidOnes));
+			// The canonical way to generate a bunch of ones, see https://stackoverflow.com/questions/35085059/what-are-the-best-instruction-sequences-to-generate-vector-constants-on-the-fly
+			PCMPEQW(XMM1, R(XMM1));
+			XORPS(XMM0, R(XMM1));
 		}
 		ANDPS(XMM0, M(vcmpMask[n - 1]));
 		MOVAPS(M(vcmpResult), XMM0);
@ -1451,8 +1468,9 @@ void Jit::Comp_Vcmp(MIPSOpcode op) {
 	} else {
 		// Finalize the comparison for ES/NS.
 		if (cond == VC_ES || cond == VC_NS) {
-			ANDPS(XMM0, M(&fourinfnan));
-			PCMPEQD(XMM0, M(&fourinfnan));  // Integer comparison
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
+			ANDPS(XMM0, MatR(TEMPREG));
+			PCMPEQD(XMM0, MatR(TEMPREG));  // Integer comparison
 			// It's inversed below for NS.
 		}

@ -1983,26 +2001,22 @@ void Jit::Comp_Vocp(MIPSOpcode op) {
 	fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

 	X64Reg tempxregs[4];
-	for (int i = 0; i < n; ++i)
-	{
-		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs))
-		{
+	for (int i = 0; i < n; ++i) {
+		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
 			int reg = fpr.GetTempV();
 			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
 			fpr.SpillLockV(reg);
 			tempxregs[i] = fpr.VX(reg);
-		}
-		else
-		{
+		} else {
 			fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
 			fpr.SpillLockV(dregs[i]);
 			tempxregs[i] = fpr.VX(dregs[i]);
 		}
 	}

-	MOVSS(XMM1, M(&one));
-	for (int i = 0; i < n; ++i)
-	{
+	MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+	MOVSS(XMM1, MatR(TEMPREG));
+	for (int i = 0; i < n; ++i) {
 		MOVSS(XMM0, R(XMM1));
 		SUBSS(XMM0, fpr.V(sregs[i]));
 		MOVSS(tempxregs[i], R(XMM0));
@ -2274,13 +2288,23 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 			MINSS(tempxregs[i], R(XMM0));
 			break;
 		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
-			MOVSS(XMM0, M(&one));
+			if (RipAccessible(&one)) {
+				MOVSS(XMM0, M(&one));
+			} else {
+				MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+				MOVSS(XMM0, MatR(TEMPREG));
+			}
 			DIVSS(XMM0, fpr.V(sregs[i]));
 			MOVSS(tempxregs[i], R(XMM0));
 			break;
 		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
 			SQRTSS(XMM0, fpr.V(sregs[i]));
-			MOVSS(tempxregs[i], M(&one));
+			if (RipAccessible(&one)) {
+				MOVSS(tempxregs[i], M(&one));
+			} else {
+				MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
+				MOVSS(tempxregs[i], MatR(TEMPREG));
+			}
 			DIVSS(tempxregs[i], R(XMM0));
 			break;
 		case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
@ -2306,7 +2330,9 @@ void Jit::Comp_VV2Op(MIPSOpcode op) {
 			MOVSS(tempxregs[i], M(&sincostemp[0]));
 			break;
 		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
-			MOVSS(XMM0, M(&minus_one));
+			// Rare so let's not bother checking for RipAccessible.
+			MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
+			MOVSS(XMM0, MatR(TEMPREG));
 			DIVSS(XMM0, fpr.V(sregs[i]));
 			MOVSS(tempxregs[i], R(XMM0));
 			break;
--- a/Core/MIPS/x86/Jit.cpp
+++ b/Core/MIPS/x86/Jit.cpp
@ -374,7 +374,12 @@ const u8 *Jit::DoJit(u32 em_address, JitBlock *b) {
 			// If we're rewinding, CORE_NEXTFRAME should not cause a rewind.
 			// It doesn't really matter either way if we're not rewinding.
 			// CORE_RUNNING is <= CORE_NEXTFRAME.
-			CMP(32, M(&coreState), Imm32(CORE_NEXTFRAME));
+			if (RipAccessible((const void *)&coreState)) {
+				CMP(32, M(&coreState), Imm32(CORE_NEXTFRAME));
+			} else {
+				MOV(PTRBITS, R(RAX), ImmPtr((const void *)&coreState));
+				CMP(32, MatR(RAX), Imm32(CORE_NEXTFRAME));
+			}
 			FixupBranch skipCheck = J_CC(CC_LE);
 			if (js.afterOp & JitState::AFTER_REWIND_PC_BAD_STATE)
 				MOV(32, MIPSSTATE_VAR(pc), Imm32(GetCompilerPC()));