From 7e38df077fc347f4b5ecb7370ebc8ec15b266d5c Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 10 Apr 2015 19:57:13 -0700 Subject: [PATCH 1/5] x86jit: Prefer MOVAPS over MOVSS for reg->reg. --- Core/MIPS/x86/CompFPU.cpp | 53 ++++++++++++++++++++++++++------------- Core/MIPS/x86/Jit.h | 1 + 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/Core/MIPS/x86/CompFPU.cpp b/Core/MIPS/x86/CompFPU.cpp index d19fb1f5d8..c0f2fc25a4 100644 --- a/Core/MIPS/x86/CompFPU.cpp +++ b/Core/MIPS/x86/CompFPU.cpp @@ -47,6 +47,14 @@ namespace MIPSComp { using namespace Gen; using namespace X64JitConstants; +void Jit::CopyFPReg(X64Reg dst, OpArg src) { + if (src.IsSimpleReg()) { + MOVAPS(dst, src); + } else { + MOVSS(dst, src); + } +} + void Jit::CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(X64Reg reg, OpArg), bool orderMatters) { int ft = _FT; int fs = _FS; @@ -62,14 +70,14 @@ void Jit::CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(X64Reg reg, OpAr } else if (ft != fd) { // fs can't be fd (handled above.) fpr.MapReg(fd, false, true); - MOVSS(fpr.RX(fd), fpr.R(fs)); + CopyFPReg(fpr.RX(fd), fpr.R(fs)); (this->*arith)(fpr.RX(fd), fpr.R(ft)); } else { - // fd must be ft. + // fd must be ft, and order must matter. fpr.MapReg(fd, true, true); - MOVSS(XMM0, fpr.R(fs)); + CopyFPReg(XMM0, fpr.R(fs)); (this->*arith)(XMM0, fpr.R(ft)); - MOVSS(fpr.RX(fd), R(XMM0)); + MOVAPS(fpr.RX(fd), R(XMM0)); } fpr.ReleaseSpillLocks(); } @@ -151,14 +159,14 @@ void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN) { // This means that NaN also means true, e.g. !<> or !>, etc. if (allowNaN) { - MOVSS(XMM0, fpr.R(lhs)); - MOVSS(XMM1, fpr.R(lhs)); + CopyFPReg(XMM0, fpr.R(lhs)); + CopyFPReg(XMM1, fpr.R(lhs)); CMPSS(XMM0, fpr.R(rhs), compare); CMPUNORDSS(XMM1, fpr.R(rhs)); POR(XMM0, R(XMM1)); } else { - MOVSS(XMM0, fpr.R(lhs)); + CopyFPReg(XMM0, fpr.R(lhs)); CMPSS(XMM0, fpr.R(rhs), compare); } @@ -226,7 +234,8 @@ void Jit::Comp_FPU2op(MIPSOpcode op) { int fd = _FD; auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) { - fpr.SpillLock(fs); + fpr.SpillLock(fd, fs); + fpr.MapReg(fd, fs == fd, true); // Small optimization: 0 is our default mode anyway. 
if (setMXCSR == 0 && !js.hasSetRounding) {
@@ -268,32 +277,42 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 case 5: //F(fd) = fabsf(F(fs)); break; //abs
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
- if (fd != fs) {
- MOVSS(fpr.RX(fd), fpr.R(fs));
+ if (fd != fs && fpr.IsMapped(fs)) {
+ MOVAPS(fpr.RX(fd), M(ssNoSignMask));
+ ANDPS(fpr.RX(fd), fpr.R(fs));
+ } else {
+ if (fd != fs) {
+ MOVSS(fpr.RX(fd), fpr.R(fs));
+ }
+ ANDPS(fpr.RX(fd), M(ssNoSignMask));
 }
- ANDPS(fpr.RX(fd), M(ssNoSignMask));
 break;

 case 6: //F(fd) = F(fs); break; //mov
 if (fd != fs) {
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
- MOVSS(fpr.RX(fd), fpr.R(fs));
+ CopyFPReg(fpr.RX(fd), fpr.R(fs));
 }
 break;

 case 7: //F(fd) = -F(fs); break; //neg
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
- if (fd != fs) {
- MOVSS(fpr.RX(fd), fpr.R(fs));
+ if (fd != fs && fpr.IsMapped(fs)) {
+ MOVAPS(fpr.RX(fd), M(ssSignBits2));
+ XORPS(fpr.RX(fd), fpr.R(fs));
+ } else {
+ if (fd != fs) {
+ MOVSS(fpr.RX(fd), fpr.R(fs));
+ }
+ XORPS(fpr.RX(fd), M(ssSignBits2));
 }
- XORPS(fpr.RX(fd), M(ssSignBits2));
 break;

 case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
- fpr.SpillLock(fd, fs); // this probably works, just badly tested
+ fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
 SQRTSS(fpr.RX(fd), fpr.R(fs));
 break;
@@ -305,7 +324,7 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fs == fd, true);
- if (fpr.R(fs).IsSimpleReg()) {
+ if (fpr.IsMapped(fs)) {
 CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
 } else {
 // If fs was fd, we'd be in the case above since we mapped fd.
diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h
index 3e3b20b6ec..d062b9123b 100644
--- a/Core/MIPS/x86/Jit.h
+++ b/Core/MIPS/x86/Jit.h
@@ -238,6 +238,7 @@ private:
 static Gen::CCFlags FlipCCFlag(Gen::CCFlags flag);
 static Gen::CCFlags SwapCCFlag(Gen::CCFlags flag);

+ void CopyFPReg(Gen::X64Reg dst, Gen::OpArg src);
 void CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(Gen::X64Reg reg, Gen::OpArg), bool orderMatters);
 void CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN = false);
 void CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin);

From e58eb5e1862e6eca8773b5fde06b529c5a8aa897 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 19:58:56 -0700
Subject: [PATCH 2/5] x86jit: Small optimization for fd->fd fp convert.

We just generate a little less code. It's also generally slightly faster.

--- Core/MIPS/x86/CompFPU.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Core/MIPS/x86/CompFPU.cpp b/Core/MIPS/x86/CompFPU.cpp
index c0f2fc25a4..e90452afe4 100644
--- a/Core/MIPS/x86/CompFPU.cpp
+++ b/Core/MIPS/x86/CompFPU.cpp
@@ -255,18 +255,19 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 // Did we get an indefinite integer value?
 CMP(32, R(TEMPREG), Imm32(0x80000000));
 FixupBranch skip = J_CC(CC_NE);
- MOVSS(XMM0, fpr.R(fs));
+ if (fd != fs) {
+ CopyFPReg(fpr.RX(fd), fpr.R(fs));
+ }
 XORPS(XMM1, R(XMM1));
- CMPSS(XMM0, R(XMM1), CMP_LT);
+ CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

 // At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
 // We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
- MOVD_xmm(R(TEMPREG), XMM0);
+ MOVD_xmm(R(TEMPREG), fpr.RX(fd));
 XOR(32, R(TEMPREG), Imm32(0x7fffffff));
 SetJumpTarget(skip);

- fpr.DiscardR(fd);
- MOV(32, fpr.R(fd), R(TEMPREG));
+ MOVD_xmm(fpr.RX(fd), R(TEMPREG));

 if (setMXCSR != -1) {
 LDMXCSR(M(&mxcsrTemp));

From eaed080add176292239d6aa8029ac093dc329e9a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 20:25:29 -0700
Subject: [PATCH 3/5] x86jit: Fix immediate kernel addresses.

Using a signed add + a value with the top bit set = bad. Will have to live with losing the kernel bit here, should be fine.

--- Core/MIPS/x86/JitSafeMem.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Core/MIPS/x86/JitSafeMem.cpp b/Core/MIPS/x86/JitSafeMem.cpp
index 481f1524a0..d08ed74b8d 100644
--- a/Core/MIPS/x86/JitSafeMem.cpp
+++ b/Core/MIPS/x86/JitSafeMem.cpp
@@ -51,8 +51,9 @@ JitSafeMem::JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask) {
 // This makes it more instructions, so let's play it safe and say we need a far jump.
 far_ = !g_Config.bIgnoreBadMemAccess || !CBreakPoints::GetMemChecks().empty();

+ // Mask out the kernel RAM bit, because we'll end up with a negative offset to MEMBASEREG.
 if (jit_->gpr.IsImm(raddr_))
- iaddr_ = jit_->gpr.GetImm(raddr_) + offset_;
+ iaddr_ = (jit_->gpr.GetImm(raddr_) + offset_) & 0x7FFFFFFF;
 else
 iaddr_ = (u32) -1;

@@ -123,9 +124,9 @@ bool JitSafeMem::PrepareRead(OpArg &src, int size)

 OpArg JitSafeMem::NextFastAddress(int suboffset)
 {
- if (jit_->gpr.IsImm(raddr_))
+ if (iaddr_ != (u32) -1)
 {
- u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;
+ u32 addr = (iaddr_ + suboffset) & alignMask_;

 #ifdef _M_IX86
 return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));

From 7ea9bcbc13c5a68d3daf5861f1dbc7dfe15d5558 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 20:26:54 -0700
Subject: [PATCH 4/5] x86jit: Avoid mapping rs in vfpu load/store.

This allows immediate address load/store, when possible, which can be faster (especially with slow mem enabled).

--- Core/MIPS/x86/CompVFPU.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index 89f76fabce..10f186ea88 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -239,7 +239,6 @@ void Jit::Comp_SV(MIPSOpcode op) {
 case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);
 fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);

 JitSafeMem safe(this, rs, imm);
@@ -263,9 +262,7 @@ void Jit::Comp_SV(MIPSOpcode op) {
 case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);
-
 // Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
 fpr.MapRegV(vt, 0);

 JitSafeMem safe(this, rs, imm);
@@ -380,7 +377,6 @@ void Jit::Comp_SVQ(MIPSOpcode op)
 case 54: //lv.q
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);

 u8 vregs[4];
 GetVectorRegs(vregs, V_Quad, vt);
@@ -429,7 +425,6 @@ void Jit::Comp_SVQ(MIPSOpcode op)
 case 62: //sv.q
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);

 u8 vregs[4];
 GetVectorRegs(vregs, V_Quad, vt);

From 56f071d26a87bd874eb1086061466978f7d4dbee Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 20:44:39 -0700
Subject: [PATCH 5/5] x86jit: Support SIMD load/store with fastmem off.

This is a lot faster, since it usually takes the fast path.
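
The slow path below builds the quad one 32-bit word at a time, rotating
lanes with SHUFPS so that after four rotates every element is back in
place. Roughly the same idea as a standalone intrinsics sketch
(ReadU32Slow is a hypothetical stand-in for the JIT's
safeMemFuncs.readU32 helper):

    #include <emmintrin.h>
    #include <cstdint>

    uint32_t ReadU32Slow(uint32_t addr);  // hypothetical stand-in for the safe read

    __m128 LoadQuadSlow(uint32_t addr) {
        __m128 v = _mm_setzero_ps();
        for (int i = 0; i < 4; i++) {
            // MOVD: loaded word -> lane 0 of a temp, lanes 1-3 zeroed.
            __m128 t = _mm_castsi128_ps(_mm_cvtsi32_si128((int)ReadU32Slow(addr + i * 4)));
            // MOVSS reg,reg: replace lane 0 only, keep lanes 1-3.
            v = _mm_move_ss(v, t);
            // SHUFPS self-rotate: lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0.
            v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
        }
        return v;
    }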
--- Core/MIPS/x86/CompVFPU.cpp | 41 ++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 10f186ea88..cb1f5ff575 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -381,14 +381,28 @@ void Jit::Comp_SVQ(MIPSOpcode op) u8 vregs[4]; GetVectorRegs(vregs, V_Quad, vt); - if (g_Config.bFastMemory && fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) { + if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) { JitSafeMem safe(this, rs, imm); safe.SetFar(); OpArg src; if (safe.PrepareRead(src, 16)) { - MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0)); - } else { - // Hmm... probably never happens. + // Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode. + if (g_Config.bFastMemory) { + MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0)); + } else { + MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0)); + } + } + if (safe.PrepareSlowRead(safeMemFuncs.readU32)) { + for (int i = 0; i < 4; i++) { + safe.NextSlowRead(safeMemFuncs.readU32, i * 4); + // We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits. + MOVD_xmm(XMM0, R(EAX)); + MOVSS(fpr.VSX(vregs), R(XMM0)); + // Rotate things so we can read in the next higher float. + // By the end (4 rotates), they'll all be back into place. + SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1)); + } } safe.Finish(); gpr.UnlockAll(); @@ -429,14 +443,25 @@ void Jit::Comp_SVQ(MIPSOpcode op) u8 vregs[4]; GetVectorRegs(vregs, V_Quad, vt); - if (g_Config.bFastMemory && fpr.TryMapRegsVS(vregs, V_Quad, 0)) { + if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) { JitSafeMem safe(this, rs, imm); safe.SetFar(); OpArg dest; if (safe.PrepareWrite(dest, 16)) { - MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs)); - } else { - // Hmm... probably never happens. + // Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode. + if (g_Config.bFastMemory) { + MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs)); + } else { + MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs)); + } + } + if (safe.PrepareSlowWrite()) { + MOVAPS(XMM0, fpr.VS(vregs)); + for (int i = 0; i < 4; i++) { + MOVSS(M(&ssLoadStoreTemp), XMM0); + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp), i * 4); + } } safe.Finish(); gpr.UnlockAll();
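
Background on two of the changes above: MOVSS xmm,xmm writes only the low
lane, so it keeps a false dependency on the destination's stale upper
lanes, while MOVAPS copies the whole register; that is likely the win
behind PATCH 1's reg->reg preference. PATCH 3's masking matters because
x86-64 sign-extends the 32-bit displacement in [MEMBASEREG + disp32]. A
standalone C++ sketch of the latter (the base value is invented for the
demo; the real Memory::base is chosen at runtime):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t membase = 0x100000000ULL;  // invented MEMBASEREG value
        const uint32_t iaddr = 0x88010000;        // kernel-segment address, top bit set

        // What the addressing mode computes: the displacement sign-extends,
        // so the effective address lands far below membase + 0x88010000.
        uint64_t wrong = membase + (uint64_t)(int64_t)(int32_t)iaddr;   // 0x88010000
        // With the kernel bit masked off, the displacement stays positive.
        uint64_t right = membase + (iaddr & 0x7FFFFFFF);                // 0x108010000

        printf("sign-extended: 0x%llx\n", (unsigned long long)wrong);
        printf("masked:        0x%llx\n", (unsigned long long)right);
        return 0;
    }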