From 7e38df077fc347f4b5ecb7370ebc8ec15b266d5c Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 10 Apr 2015 19:57:13 -0700 Subject: [PATCH 1/5] x86jit: Prefer MOVAPS over MOVSS for reg->reg. --- Core/MIPS/x86/CompFPU.cpp | 53 ++++++++++++++++++++++++++------------- Core/MIPS/x86/Jit.h | 1 + 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/Core/MIPS/x86/CompFPU.cpp b/Core/MIPS/x86/CompFPU.cpp index d19fb1f5d8..c0f2fc25a4 100644 --- a/Core/MIPS/x86/CompFPU.cpp +++ b/Core/MIPS/x86/CompFPU.cpp @@ -47,6 +47,14 @@ namespace MIPSComp { using namespace Gen; using namespace X64JitConstants; +void Jit::CopyFPReg(X64Reg dst, OpArg src) { + if (src.IsSimpleReg()) { + MOVAPS(dst, src); + } else { + MOVSS(dst, src); + } +} + void Jit::CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(X64Reg reg, OpArg), bool orderMatters) { int ft = _FT; int fs = _FS; @@ -62,14 +70,14 @@ void Jit::CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(X64Reg reg, OpAr } else if (ft != fd) { // fs can't be fd (handled above.) fpr.MapReg(fd, false, true); - MOVSS(fpr.RX(fd), fpr.R(fs)); + CopyFPReg(fpr.RX(fd), fpr.R(fs)); (this->*arith)(fpr.RX(fd), fpr.R(ft)); } else { - // fd must be ft. + // fd must be ft, and order must matter. fpr.MapReg(fd, true, true); - MOVSS(XMM0, fpr.R(fs)); + CopyFPReg(XMM0, fpr.R(fs)); (this->*arith)(XMM0, fpr.R(ft)); - MOVSS(fpr.RX(fd), R(XMM0)); + MOVAPS(fpr.RX(fd), R(XMM0)); } fpr.ReleaseSpillLocks(); } @@ -151,14 +159,14 @@ void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN) { // This means that NaN also means true, e.g. !<> or !>, etc. if (allowNaN) { - MOVSS(XMM0, fpr.R(lhs)); - MOVSS(XMM1, fpr.R(lhs)); + CopyFPReg(XMM0, fpr.R(lhs)); + CopyFPReg(XMM1, fpr.R(lhs)); CMPSS(XMM0, fpr.R(rhs), compare); CMPUNORDSS(XMM1, fpr.R(rhs)); POR(XMM0, R(XMM1)); } else { - MOVSS(XMM0, fpr.R(lhs)); + CopyFPReg(XMM0, fpr.R(lhs)); CMPSS(XMM0, fpr.R(rhs), compare); } @@ -226,7 +234,8 @@ void Jit::Comp_FPU2op(MIPSOpcode op) { int fd = _FD; auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) { - fpr.SpillLock(fs); + fpr.SpillLock(fd, fs); + fpr.MapReg(fd, fs == fd, true); // Small optimization: 0 is our default mode anyway. 
if (setMXCSR == 0 && !js.hasSetRounding) {
@@ -268,32 +277,42 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 case 5: //F(fd) = fabsf(F(fs)); break; //abs
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
- if (fd != fs) {
- MOVSS(fpr.RX(fd), fpr.R(fs));
+ if (fd != fs && fpr.IsMapped(fs)) {
+ MOVAPS(fpr.RX(fd), M(ssNoSignMask));
+ ANDPS(fpr.RX(fd), fpr.R(fs));
+ } else {
+ if (fd != fs) {
+ MOVSS(fpr.RX(fd), fpr.R(fs));
+ }
+ ANDPS(fpr.RX(fd), M(ssNoSignMask));
 }
- ANDPS(fpr.RX(fd), M(ssNoSignMask));
 break;

 case 6: //F(fd) = F(fs); break; //mov
 if (fd != fs) {
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
- MOVSS(fpr.RX(fd), fpr.R(fs));
+ CopyFPReg(fpr.RX(fd), fpr.R(fs));
 }
 break;

 case 7: //F(fd) = -F(fs); break; //neg
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
- if (fd != fs) {
- MOVSS(fpr.RX(fd), fpr.R(fs));
+ if (fd != fs && fpr.IsMapped(fs)) {
+ MOVAPS(fpr.RX(fd), M(ssSignBits2));
+ XORPS(fpr.RX(fd), fpr.R(fs));
+ } else {
+ if (fd != fs) {
+ MOVSS(fpr.RX(fd), fpr.R(fs));
+ }
+ XORPS(fpr.RX(fd), M(ssSignBits2));
 }
- XORPS(fpr.RX(fd), M(ssSignBits2));
 break;

 case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
- fpr.SpillLock(fd, fs); // this probably works, just badly tested
+ fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fd == fs, true);
 SQRTSS(fpr.RX(fd), fpr.R(fs));
 break;
@@ -305,7 +324,7 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
 fpr.SpillLock(fd, fs);
 fpr.MapReg(fd, fs == fd, true);
- if (fpr.R(fs).IsSimpleReg()) {
+ if (fpr.IsMapped(fs)) {
 CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
 } else {
 // If fs was fd, we'd be in the case above since we mapped fd.
diff --git a/Core/MIPS/x86/Jit.h b/Core/MIPS/x86/Jit.h
index 3e3b20b6ec..d062b9123b 100644
--- a/Core/MIPS/x86/Jit.h
+++ b/Core/MIPS/x86/Jit.h
@@ -238,6 +238,7 @@ private:
 static Gen::CCFlags FlipCCFlag(Gen::CCFlags flag);
 static Gen::CCFlags SwapCCFlag(Gen::CCFlags flag);

+ void CopyFPReg(Gen::X64Reg dst, Gen::OpArg src);
 void CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(Gen::X64Reg reg, Gen::OpArg), bool orderMatters);
 void CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN = false);
 void CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin);

From e58eb5e1862e6eca8773b5fde06b529c5a8aa897 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 19:58:56 -0700
Subject: [PATCH 2/5] x86jit: Small optimization for fd->fd fp convert.

We just generate a little less code. It's also generally slightly faster.

--- Core/MIPS/x86/CompFPU.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Core/MIPS/x86/CompFPU.cpp b/Core/MIPS/x86/CompFPU.cpp
index c0f2fc25a4..e90452afe4 100644
--- a/Core/MIPS/x86/CompFPU.cpp
+++ b/Core/MIPS/x86/CompFPU.cpp
@@ -255,18 +255,19 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 // Did we get an indefinite integer value?
 CMP(32, R(TEMPREG), Imm32(0x80000000));
 FixupBranch skip = J_CC(CC_NE);
- MOVSS(XMM0, fpr.R(fs));
+ if (fd != fs) {
+ CopyFPReg(fpr.RX(fd), fpr.R(fs));
+ }
 XORPS(XMM1, R(XMM1));
- CMPSS(XMM0, R(XMM1), CMP_LT);
+ CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

 // At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
 // We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
- MOVD_xmm(R(TEMPREG), XMM0);
+ MOVD_xmm(R(TEMPREG), fpr.RX(fd));
 XOR(32, R(TEMPREG), Imm32(0x7fffffff));
 SetJumpTarget(skip);

- fpr.DiscardR(fd);
- MOV(32, fpr.R(fd), R(TEMPREG));
+ MOVD_xmm(fpr.RX(fd), R(TEMPREG));

 if (setMXCSR != -1) {
 LDMXCSR(M(&mxcsrTemp));

From eaed080add176292239d6aa8029ac093dc329e9a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 20:25:29 -0700
Subject: [PATCH 3/5] x86jit: Fix immediate kernel addresses.

Using a signed add + a value with the top bit set = bad. Will have to live with losing the kernel bit here, should be fine.

--- Core/MIPS/x86/JitSafeMem.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Core/MIPS/x86/JitSafeMem.cpp b/Core/MIPS/x86/JitSafeMem.cpp
index 481f1524a0..d08ed74b8d 100644
--- a/Core/MIPS/x86/JitSafeMem.cpp
+++ b/Core/MIPS/x86/JitSafeMem.cpp
@@ -51,8 +51,9 @@ JitSafeMem::JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask) {
 // This makes it more instructions, so let's play it safe and say we need a far jump.
 far_ = !g_Config.bIgnoreBadMemAccess || !CBreakPoints::GetMemChecks().empty();

+ // Mask out the kernel RAM bit, because we'll end up with a negative offset to MEMBASEREG.
 if (jit_->gpr.IsImm(raddr_))
- iaddr_ = jit_->gpr.GetImm(raddr_) + offset_;
+ iaddr_ = (jit_->gpr.GetImm(raddr_) + offset_) & 0x7FFFFFFF;
 else
 iaddr_ = (u32) -1;

@@ -123,9 +124,9 @@ bool JitSafeMem::PrepareRead(OpArg &src, int size)

 OpArg JitSafeMem::NextFastAddress(int suboffset)
 {
- if (jit_->gpr.IsImm(raddr_))
+ if (iaddr_ != (u32) -1)
 {
- u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;
+ u32 addr = (iaddr_ + suboffset) & alignMask_;

 #ifdef _M_IX86
 return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));

From 7ea9bcbc13c5a68d3daf5861f1dbc7dfe15d5558 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 20:26:54 -0700
Subject: [PATCH 4/5] x86jit: Avoid mapping rs in vfpu load/store.

This allows immediate address load/store, when possible, which can be faster (especially with slow mem enabled).

--- Core/MIPS/x86/CompVFPU.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index 89f76fabce..10f186ea88 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -239,7 +239,6 @@ void Jit::Comp_SV(MIPSOpcode op) {
 case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);
 fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);

 JitSafeMem safe(this, rs, imm);
@@ -263,9 +262,7 @@ void Jit::Comp_SV(MIPSOpcode op) {
 case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);
-
 // Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
 fpr.MapRegV(vt, 0);

 JitSafeMem safe(this, rs, imm);
@@ -380,7 +377,6 @@ void Jit::Comp_SVQ(MIPSOpcode op)
 case 54: //lv.q
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);

 u8 vregs[4];
 GetVectorRegs(vregs, V_Quad, vt);
@@ -429,7 +425,6 @@ void Jit::Comp_SVQ(MIPSOpcode op)
 case 62: //sv.q
 {
 gpr.Lock(rs);
- gpr.MapReg(rs, true, false);

 u8 vregs[4];
 GetVectorRegs(vregs, V_Quad, vt);

From 56f071d26a87bd874eb1086061466978f7d4dbee Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Fri, 10 Apr 2015 20:44:39 -0700
Subject: [PATCH 5/5] x86jit: Support SIMD load/store with fastmem off.

This is a lot faster, since it usually takes the fast path.
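
The slow path below builds the quad one 32-bit word at a time, rotating
lanes with SHUFPS so that after four rotates every element is back in
place. Roughly the same idea as a standalone intrinsics sketch
(ReadU32Slow is a hypothetical stand-in for the JIT's
safeMemFuncs.readU32 helper):

    #include <emmintrin.h>
    #include <cstdint>

    uint32_t ReadU32Slow(uint32_t addr);  // hypothetical stand-in for the safe read

    __m128 LoadQuadSlow(uint32_t addr) {
        __m128 v = _mm_setzero_ps();
        for (int i = 0; i < 4; i++) {
            // MOVD: loaded word -> lane 0 of a temp, lanes 1-3 zeroed.
            __m128 t = _mm_castsi128_ps(_mm_cvtsi32_si128((int)ReadU32Slow(addr + i * 4)));
            // MOVSS reg,reg: replace lane 0 only, keep lanes 1-3.
            v = _mm_move_ss(v, t);
            // SHUFPS self-rotate: lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0.
            v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
        }
        return v;
    }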
--- Core/MIPS/x86/CompVFPU.cpp | 41 ++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp index 10f186ea88..cb1f5ff575 100644 --- a/Core/MIPS/x86/CompVFPU.cpp +++ b/Core/MIPS/x86/CompVFPU.cpp @@ -381,14 +381,28 @@ void Jit::Comp_SVQ(MIPSOpcode op) u8 vregs[4]; GetVectorRegs(vregs, V_Quad, vt); - if (g_Config.bFastMemory && fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) { + if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) { JitSafeMem safe(this, rs, imm); safe.SetFar(); OpArg src; if (safe.PrepareRead(src, 16)) { - MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0)); - } else { - // Hmm... probably never happens. + // Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode. + if (g_Config.bFastMemory) { + MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0)); + } else { + MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0)); + } + } + if (safe.PrepareSlowRead(safeMemFuncs.readU32)) { + for (int i = 0; i < 4; i++) { + safe.NextSlowRead(safeMemFuncs.readU32, i * 4); + // We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits. + MOVD_xmm(XMM0, R(EAX)); + MOVSS(fpr.VSX(vregs), R(XMM0)); + // Rotate things so we can read in the next higher float. + // By the end (4 rotates), they'll all be back into place. + SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1)); + } } safe.Finish(); gpr.UnlockAll(); @@ -429,14 +443,25 @@ void Jit::Comp_SVQ(MIPSOpcode op) u8 vregs[4]; GetVectorRegs(vregs, V_Quad, vt); - if (g_Config.bFastMemory && fpr.TryMapRegsVS(vregs, V_Quad, 0)) { + if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) { JitSafeMem safe(this, rs, imm); safe.SetFar(); OpArg dest; if (safe.PrepareWrite(dest, 16)) { - MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs)); - } else { - // Hmm... probably never happens. + // Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode. + if (g_Config.bFastMemory) { + MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs)); + } else { + MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs)); + } + } + if (safe.PrepareSlowWrite()) { + MOVAPS(XMM0, fpr.VS(vregs)); + for (int i = 0; i < 4; i++) { + MOVSS(M(&ssLoadStoreTemp), XMM0); + SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1)); + safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp), i * 4); + } } safe.Finish(); gpr.UnlockAll();
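
Background on two of the changes above: MOVSS xmm,xmm writes only the low
lane, so it keeps a false dependency on the destination's stale upper
lanes, while MOVAPS copies the whole register; that is likely the win
behind PATCH 1's reg->reg preference. PATCH 3's masking matters because
x86-64 sign-extends the 32-bit displacement in [MEMBASEREG + disp32]. A
standalone C++ sketch of the latter (the base value is invented for the
demo; the real Memory::base is chosen at runtime):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t membase = 0x100000000ULL;  // invented MEMBASEREG value
        const uint32_t iaddr = 0x88010000;        // kernel-segment address, top bit set

        // What the addressing mode computes: the displacement sign-extends,
        // so the effective address lands far below membase + 0x88010000.
        uint64_t wrong = membase + (uint64_t)(int64_t)(int32_t)iaddr;   // 0x88010000
        // With the kernel bit masked off, the displacement stays positive.
        uint64_t right = membase + (iaddr & 0x7FFFFFFF);                // 0x108010000

        printf("sign-extended: 0x%llx\n", (unsigned long long)wrong);
        printf("masked:        0x%llx\n", (unsigned long long)right);
        return 0;
    }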