mirror of https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00

Merge pull request #7672 from unknownbrackets/jit-minor

More x86jit micro optimizations for the FPU

commit a1f5c537d4

4 changed files with 80 additions and 38 deletions
@@ -47,6 +47,14 @@ namespace MIPSComp {
 using namespace Gen;
 using namespace X64JitConstants;
 
+void Jit::CopyFPReg(X64Reg dst, OpArg src) {
+	if (src.IsSimpleReg()) {
+		MOVAPS(dst, src);
+	} else {
+		MOVSS(dst, src);
+	}
+}
+
 void Jit::CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(X64Reg reg, OpArg), bool orderMatters) {
 	int ft = _FT;
 	int fs = _FS;
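Annotation (not part of the commit): MOVAPS between two registers copies all 128 bits, so it never merges into the destination's stale upper lanes the way MOVSS reg, reg does, which avoids a false dependency on the destination's previous value. From memory, MOVSS loads exactly 32 bits and zeroes the upper lanes, so it remains the right choice there. A minimal intrinsics sketch of the two cases CopyFPReg distinguishes:

    #include <xmmintrin.h>

    // Illustration only: scalar-float copy the way CopyFPReg chooses it.
    __m128 copyFromReg(__m128 src) {
        return src;               // MOVAPS dst, src: full 128-bit copy, no merge
    }
    __m128 copyFromMem(const float *src) {
        return _mm_load_ss(src);  // MOVSS dst, [mem]: loads 32 bits, zeroes upper lanes
    }
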
@@ -62,14 +70,14 @@ void Jit::CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(X64Reg reg, OpArg), bool orderMatters) {
 	} else if (ft != fd) {
 		// fs can't be fd (handled above.)
 		fpr.MapReg(fd, false, true);
-		MOVSS(fpr.RX(fd), fpr.R(fs));
+		CopyFPReg(fpr.RX(fd), fpr.R(fs));
 		(this->*arith)(fpr.RX(fd), fpr.R(ft));
 	} else {
-		// fd must be ft.
+		// fd must be ft, and order must matter.
 		fpr.MapReg(fd, true, true);
-		MOVSS(XMM0, fpr.R(fs));
+		CopyFPReg(XMM0, fpr.R(fs));
 		(this->*arith)(XMM0, fpr.R(ft));
-		MOVSS(fpr.RX(fd), R(XMM0));
+		MOVAPS(fpr.RX(fd), R(XMM0));
 	}
 	fpr.ReleaseSpillLocks();
 }
@@ -151,14 +159,14 @@ void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN) {
 
 	// This means that NaN also means true, e.g. !<> or !>, etc.
 	if (allowNaN) {
-		MOVSS(XMM0, fpr.R(lhs));
-		MOVSS(XMM1, fpr.R(lhs));
+		CopyFPReg(XMM0, fpr.R(lhs));
+		CopyFPReg(XMM1, fpr.R(lhs));
 		CMPSS(XMM0, fpr.R(rhs), compare);
 		CMPUNORDSS(XMM1, fpr.R(rhs));
 
 		POR(XMM0, R(XMM1));
 	} else {
-		MOVSS(XMM0, fpr.R(lhs));
+		CopyFPReg(XMM0, fpr.R(lhs));
 		CMPSS(XMM0, fpr.R(rhs), compare);
 	}
 
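Annotation (not part of the commit): CMPSS writes an all-ones or all-zeros mask for the ordered comparison, CMPUNORDSS writes all-ones when either operand is NaN, and POR merges the two so that an unordered result also counts as true. A scalar model of the allowNaN branch, assuming a less-than compare:

    #include <cmath>
    #include <cstdio>

    // Illustration only: the mask logic of the allowNaN branch, scalarized.
    static bool lessOrUnordered(float lhs, float rhs) {
        bool cmp = lhs < rhs;                             // CMPSS XMM0, rhs, CMP_LT
        bool unord = std::isnan(lhs) || std::isnan(rhs);  // CMPUNORDSS XMM1, rhs
        return cmp || unord;                              // POR XMM0, XMM1
    }

    int main() {
        std::printf("%d %d\n", lessOrUnordered(1.0f, 2.0f), lessOrUnordered(NAN, 2.0f));  // 1 1
        return 0;
    }
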
@@ -226,7 +234,8 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 	int fd = _FD;
 
 	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
-		fpr.SpillLock(fs);
+		fpr.SpillLock(fd, fs);
+		fpr.MapReg(fd, fs == fd, true);
 
 		// Small optimization: 0 is our default mode anyway.
 		if (setMXCSR == 0 && !js.hasSetRounding) {
@@ -246,18 +255,19 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 		// Did we get an indefinite integer value?
 		CMP(32, R(TEMPREG), Imm32(0x80000000));
 		FixupBranch skip = J_CC(CC_NE);
-		MOVSS(XMM0, fpr.R(fs));
+		if (fd != fs) {
+			CopyFPReg(fpr.RX(fd), fpr.R(fs));
+		}
 		XORPS(XMM1, R(XMM1));
-		CMPSS(XMM0, R(XMM1), CMP_LT);
+		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);
 
 		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
 		// We want -inf to be 0x80000000, inf/nan to be 0x7fffffff, so we flip those bits.
-		MOVD_xmm(R(TEMPREG), XMM0);
+		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
 		XOR(32, R(TEMPREG), Imm32(0x7fffffff));
 
 		SetJumpTarget(skip);
-		fpr.DiscardR(fd);
-		MOV(32, fpr.R(fd), R(TEMPREG));
+		MOVD_xmm(fpr.RX(fd), R(TEMPREG));
 
 		if (setMXCSR != -1) {
 			LDMXCSR(M(&mxcsrTemp));
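Annotation (not part of the commit): x86 CVTSS2SI/CVTTSS2SI return the "integer indefinite" value 0x80000000 for NaN and out-of-range inputs, while the MIPS FPU saturates to 0x7FFFFFFF for +inf/NaN and 0x80000000 for -inf. The fixup compares the source against zero (all-ones mask for negative, zero otherwise) and XORs with 0x7fffffff to pick the right value. A scalar model:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Illustration only: the saturation fixup modeled as scalar code.
    static int32_t fixupIndefinite(float fs) {
        uint32_t mask = (fs < 0.0f) ? 0xFFFFFFFFu : 0u;  // CMPSS fd, 0.0f, CMP_LT
        return (int32_t)(mask ^ 0x7FFFFFFFu);            // XOR TEMPREG, 0x7fffffff
    }

    int main() {
        float inf = std::numeric_limits<float>::infinity();
        std::printf("%d %d\n", fixupIndefinite(-inf), fixupIndefinite(inf));  // -2147483648 2147483647
        return 0;
    }
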
@@ -268,32 +278,42 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 	case 5: //F(fd) = fabsf(F(fs)); break; //abs
 		fpr.SpillLock(fd, fs);
 		fpr.MapReg(fd, fd == fs, true);
-		if (fd != fs) {
-			MOVSS(fpr.RX(fd), fpr.R(fs));
+		if (fd != fs && fpr.IsMapped(fs)) {
+			MOVAPS(fpr.RX(fd), M(ssNoSignMask));
+			ANDPS(fpr.RX(fd), fpr.R(fs));
+		} else {
+			if (fd != fs) {
+				MOVSS(fpr.RX(fd), fpr.R(fs));
+			}
+			ANDPS(fpr.RX(fd), M(ssNoSignMask));
 		}
-		ANDPS(fpr.RX(fd), M(ssNoSignMask));
 		break;
 
 	case 6: //F(fd) = F(fs); break; //mov
 		if (fd != fs) {
 			fpr.SpillLock(fd, fs);
 			fpr.MapReg(fd, fd == fs, true);
-			MOVSS(fpr.RX(fd), fpr.R(fs));
+			CopyFPReg(fpr.RX(fd), fpr.R(fs));
 		}
 		break;
 
 	case 7: //F(fd) = -F(fs); break; //neg
 		fpr.SpillLock(fd, fs);
 		fpr.MapReg(fd, fd == fs, true);
-		if (fd != fs) {
-			MOVSS(fpr.RX(fd), fpr.R(fs));
+		if (fd != fs && fpr.IsMapped(fs)) {
+			MOVAPS(fpr.RX(fd), M(ssSignBits2));
+			XORPS(fpr.RX(fd), fpr.R(fs));
+		} else {
+			if (fd != fs) {
+				MOVSS(fpr.RX(fd), fpr.R(fs));
+			}
+			XORPS(fpr.RX(fd), M(ssSignBits2));
 		}
-		XORPS(fpr.RX(fd), M(ssSignBits2));
 		break;
 
 
 	case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
-		fpr.SpillLock(fd, fs); // this probably works, just badly tested
+		fpr.SpillLock(fd, fs);
 		fpr.MapReg(fd, fd == fs, true);
 		SQRTSS(fpr.RX(fd), fpr.R(fs));
 		break;
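Annotation (not part of the commit): assuming ssNoSignMask holds 0x7FFFFFFF and ssSignBits2 holds 0x80000000 in each lane, ANDPS clears the sign bit (fabsf) and XORPS flips it (negation). A runnable scalar demonstration of the bit tricks:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Illustration only: float abs/neg as bit operations, the trick ANDPS/XORPS
    // perform with ssNoSignMask / ssSignBits2 (mask values assumed).
    static uint32_t bits(float f) { uint32_t u; std::memcpy(&u, &f, sizeof(u)); return u; }
    static float fromBits(uint32_t u) { float f; std::memcpy(&f, &u, sizeof(f)); return f; }

    int main() {
        float x = -3.5f;
        std::printf("abs: %f\n", fromBits(bits(x) & 0x7FFFFFFFu));  // clears sign bit -> 3.5
        std::printf("neg: %f\n", fromBits(bits(x) ^ 0x80000000u));  // flips sign bit -> 3.5
        return 0;
    }
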
@@ -305,7 +325,7 @@ void Jit::Comp_FPU2op(MIPSOpcode op) {
 	case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
 		fpr.SpillLock(fd, fs);
 		fpr.MapReg(fd, fs == fd, true);
-		if (fpr.R(fs).IsSimpleReg()) {
+		if (fpr.IsMapped(fs)) {
 			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
 		} else {
 			// If fs was fd, we'd be in the case above since we mapped fd.
@@ -239,7 +239,6 @@ void Jit::Comp_SV(MIPSOpcode op) {
 	case 50: //lv.s  // VI(vt) = Memory::Read_U32(addr);
 		{
-			gpr.Lock(rs);
 			gpr.MapReg(rs, true, false);
 			fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
 
 			JitSafeMem safe(this, rs, imm);
@@ -263,9 +262,7 @@ void Jit::Comp_SV(MIPSOpcode op) {
 	case 58: //sv.s  // Memory::Write_U32(VI(vt), addr);
 		{
-			gpr.Lock(rs);
 			gpr.MapReg(rs, true, false);
 
-			// Even if we don't use real SIMD there's still 8 or 16 scalar float registers.
 			fpr.MapRegV(vt, 0);
 
 			JitSafeMem safe(this, rs, imm);
@@ -380,19 +377,32 @@ void Jit::Comp_SVQ(MIPSOpcode op)
 	case 54: //lv.q
 		{
-			gpr.Lock(rs);
 			gpr.MapReg(rs, true, false);
 
 			u8 vregs[4];
 			GetVectorRegs(vregs, V_Quad, vt);
 
-			if (g_Config.bFastMemory && fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) {
+			if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) {
 				JitSafeMem safe(this, rs, imm);
 				safe.SetFar();
 				OpArg src;
 				if (safe.PrepareRead(src, 16)) {
-					MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0));
-				} else {
-					// Hmm... probably never happens.
+					// Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode.
+					if (g_Config.bFastMemory) {
+						MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0));
+					} else {
+						MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0));
+					}
+				}
+				if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
+					for (int i = 0; i < 4; i++) {
+						safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
+						// We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits.
+						MOVD_xmm(XMM0, R(EAX));
+						MOVSS(fpr.VSX(vregs), R(XMM0));
+						// Rotate things so we can read in the next higher float.
+						// By the end (4 rotates), they'll all be back into place.
+						SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1));
+					}
 				}
 				safe.Finish();
 				gpr.UnlockAll();
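Annotation (not part of the commit): the slow path reads one 32-bit word at a time into lane 0 and then rotates the whole vector one lane with SHUFPS; after four insert-and-rotate steps each element ends up back in its own lane. A scalar model:

    #include <cstdio>

    // Illustration only: scalar model of the lv.q slow path. Each word read by
    // readU32 lands in lane 0, then SHUFPS(0,3,2,1) rotates the vector one lane.
    int main() {
        float v[4] = {0, 0, 0, 0};
        const float loaded[4] = {1.0f, 2.0f, 3.0f, 4.0f};  // words at +0, +4, +8, +12
        for (int i = 0; i < 4; i++) {
            v[0] = loaded[i];                               // MOVD_xmm + MOVSS into lane 0
            float t = v[0];                                 // SHUFPS v, v, _MM_SHUFFLE(0,3,2,1)
            v[0] = v[1]; v[1] = v[2]; v[2] = v[3]; v[3] = t;
        }
        std::printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);  // 1 2 3 4
        return 0;
    }
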
@@ -429,19 +439,29 @@ void Jit::Comp_SVQ(MIPSOpcode op)
 	case 62: //sv.q
 		{
-			gpr.Lock(rs);
 			gpr.MapReg(rs, true, false);
 
 			u8 vregs[4];
 			GetVectorRegs(vregs, V_Quad, vt);
 
-			if (g_Config.bFastMemory && fpr.TryMapRegsVS(vregs, V_Quad, 0)) {
+			if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) {
 				JitSafeMem safe(this, rs, imm);
 				safe.SetFar();
 				OpArg dest;
 				if (safe.PrepareWrite(dest, 16)) {
-					MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs));
-				} else {
-					// Hmm... probably never happens.
+					// Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode.
+					if (g_Config.bFastMemory) {
+						MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs));
+					} else {
+						MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs));
+					}
+				}
+				if (safe.PrepareSlowWrite()) {
+					MOVAPS(XMM0, fpr.VS(vregs));
+					for (int i = 0; i < 4; i++) {
+						MOVSS(M(&ssLoadStoreTemp), XMM0);
+						SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
+						safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp), i * 4);
+					}
 				}
 				safe.Finish();
 				gpr.UnlockAll();
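Annotation (not part of the commit): the store path is the mirror image; lane 0 of a scratch copy in XMM0 is written out, then SHUFPS(3, 3, 2, 1) shifts the remaining lanes down, duplicating lane 3, which is harmless because the copy is discarded after the four stores. A scalar model:

    #include <cstdio>

    // Illustration only: scalar model of the sv.q slow path.
    int main() {
        float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};  // scratch copy (MOVAPS XMM0, vreg)
        for (int i = 0; i < 4; i++) {
            std::printf("store %g at +%d\n", x[0], i * 4);  // DoSlowWrite of lane 0
            x[0] = x[1]; x[1] = x[2]; x[2] = x[3];          // SHUFPS x, x, _MM_SHUFFLE(3,3,2,1)
        }
        return 0;
    }
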
@@ -228,6 +228,7 @@ private:
 	static Gen::CCFlags FlipCCFlag(Gen::CCFlags flag);
 	static Gen::CCFlags SwapCCFlag(Gen::CCFlags flag);
 
+	void CopyFPReg(Gen::X64Reg dst, Gen::OpArg src);
 	void CompFPTriArith(MIPSOpcode op, void (XEmitter::*arith)(Gen::X64Reg reg, Gen::OpArg), bool orderMatters);
 	void CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN = false);
 	void CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin);
@@ -51,8 +51,9 @@ JitSafeMem::JitSafeMem(Jit *jit, MIPSGPReg raddr, s32 offset, u32 alignMask)
 {
 	// This makes it more instructions, so let's play it safe and say we need a far jump.
 	far_ = !g_Config.bIgnoreBadMemAccess || !CBreakPoints::GetMemChecks().empty();
+	// Mask out the kernel RAM bit, because we'll end up with a negative offset to MEMBASEREG.
 	if (jit_->gpr.IsImm(raddr_))
-		iaddr_ = jit_->gpr.GetImm(raddr_) + offset_;
+		iaddr_ = (jit_->gpr.GetImm(raddr_) + offset_) & 0x7FFFFFFF;
 	else
 		iaddr_ = (u32) -1;
 
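Annotation (not part of the commit, example address assumed): PSP kernel-mode addresses set bit 31 and mirror the same RAM, so clearing that bit keeps iaddr_ a small positive offset from MEMBASEREG instead of a 32-bit value that would read as a negative displacement:

    #include <cstdint>
    #include <cstdio>

    // Illustration only: 0x88001000 is an assumed kernel-mirror address.
    int main() {
        uint32_t raddr = 0x88001000u, offset = 0x10u;
        uint32_t iaddr = (raddr + offset) & 0x7FFFFFFFu;  // kernel bit cleared
        std::printf("0x%08X -> 0x%08X\n", raddr + offset, iaddr);  // 0x88001010 -> 0x08001010
        return 0;
    }
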
@@ -123,9 +124,9 @@ bool JitSafeMem::PrepareRead(OpArg &src, int size)
 
 OpArg JitSafeMem::NextFastAddress(int suboffset)
 {
-	if (jit_->gpr.IsImm(raddr_))
+	if (iaddr_ != (u32) -1)
 	{
-		u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;
+		u32 addr = (iaddr_ + suboffset) & alignMask_;
 
 #ifdef _M_IX86
 		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));