From 9742aaaffe34c9ea7da3bc63651f487bd1694139 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 23 Sep 2023 11:52:42 -0700 Subject: [PATCH 1/3] x86jit: Use MOVAPS directly when we can. May help older processors or reduce total bytes. --- Core/MIPS/x86/X64IRRegCache.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Core/MIPS/x86/X64IRRegCache.cpp b/Core/MIPS/x86/X64IRRegCache.cpp index a169a43791..d3f554e32b 100644 --- a/Core/MIPS/x86/X64IRRegCache.cpp +++ b/Core/MIPS/x86/X64IRRegCache.cpp @@ -353,6 +353,8 @@ void X64IRRegCache::LoadNativeReg(IRNativeReg nreg, IRReg first, int lanes) { emit_->MOVSS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first))); else if (lanes == 2) emit_->MOVLPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first))); + else if (lanes == 4 && (first & 3) == 0) + emit_->MOVAPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first))); else if (lanes == 4) emit_->MOVUPS(r, MDisp(CTXREG, -128 + GetMipsRegOffset(first))); else @@ -381,6 +383,8 @@ void X64IRRegCache::StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) { emit_->MOVSS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r); else if (lanes == 2) emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r); + else if (lanes == 4 && (first & 3) == 0) + emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r); else if (lanes == 4) emit_->MOVUPS(MDisp(CTXREG, -128 + GetMipsRegOffset(first)), r); else From decccf199ae9751ea19417d487404bd08a2ff7ce Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 23 Sep 2023 12:09:36 -0700 Subject: [PATCH 2/3] x86jit: Flush floats together if possible. 
--- Core/MIPS/ARM64/Arm64IRRegCache.cpp | 16 ++++---- Core/MIPS/x86/X64IRRegCache.cpp | 61 +++++++++++++++++++++++++++++ Core/MIPS/x86/X64IRRegCache.h | 2 + 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/Core/MIPS/ARM64/Arm64IRRegCache.cpp b/Core/MIPS/ARM64/Arm64IRRegCache.cpp index 0ce5422fd5..0420a808ce 100644 --- a/Core/MIPS/ARM64/Arm64IRRegCache.cpp +++ b/Core/MIPS/ARM64/Arm64IRRegCache.cpp @@ -437,19 +437,21 @@ void Arm64IRRegCache::FlushAll(bool gprs, bool fprs) { // Note: make sure not to change the registers when flushing: // Branching code may expect the armreg to retain its value. + auto needsFlush = [&](IRReg i) { + if (mr[i].loc == MIPSLoc::MEM || mr[i].isStatic) + return false; + if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty) + return false; + return true; + }; + // Try to flush in pairs when possible. for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) { - if (mr[i].loc == MIPSLoc::MEM || mr[i].loc == MIPSLoc::MEM || mr[i].isStatic || mr[i + 1].isStatic) + if (!needsFlush(i) || !needsFlush(i + 1)) continue; // Ignore multilane regs. Could handle with more smartness... if (mr[i].lane != -1 || mr[i + 1].lane != -1) continue; - if (mr[i].nReg != -1 && !nr[mr[i].nReg].isDirty) - continue; - if (mr[i + 1].nReg != -1 && !nr[mr[i + 1].nReg].isDirty) - continue; - if (mr[i].loc == MIPSLoc::MEM || mr[i + 1].loc == MIPSLoc::MEM) - continue; int offset = GetMipsRegOffset(i); diff --git a/Core/MIPS/x86/X64IRRegCache.cpp b/Core/MIPS/x86/X64IRRegCache.cpp index d3f554e32b..ee176546c8 100644 --- a/Core/MIPS/x86/X64IRRegCache.cpp +++ b/Core/MIPS/x86/X64IRRegCache.cpp @@ -147,6 +147,67 @@ void X64IRRegCache::FlushBeforeCall() { #endif } +void X64IRRegCache::FlushAll(bool gprs, bool fprs) { + // Note: make sure not to change the registers when flushing: + // Branching code may expect the x64reg to retain its value. 
+ + auto needsFlush = [&](IRReg i) { + if (mr[i].loc == MIPSLoc::MEM || mr[i].isStatic) + return false; + if (mr[i].nReg == -1 || !nr[mr[i].nReg].isDirty) + return false; + return true; + }; + + auto isSingleFloat = [&](IRReg i) { + if (mr[i].lane != -1 || mr[i].loc != MIPSLoc::FREG) + return false; + return true; + }; + + // Sometimes, float/vector regs may be in separate regs in a sequence. + // It's worth combining and flushing together. + for (int i = 1; i < TOTAL_MAPPABLE_IRREGS - 1; ++i) { + if (!needsFlush(i) || !needsFlush(i + 1)) + continue; + // GPRs are probably not worth it. Merging Vec2s might be, but pretty uncommon. + if (!isSingleFloat(i) || !isSingleFloat(i + 1)) + continue; + + X64Reg regs[4]{ INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG }; + regs[0] = FromNativeReg(mr[i + 0].nReg); + regs[1] = FromNativeReg(mr[i + 1].nReg); + + bool flushVec4 = i + 3 < TOTAL_MAPPABLE_IRREGS && needsFlush(i + 2) && needsFlush(i + 3); + if (flushVec4 && isSingleFloat(i + 2) && isSingleFloat(i + 3) && (i & 3) == 0) { + regs[2] = FromNativeReg(mr[i + 2].nReg); + regs[3] = FromNativeReg(mr[i + 3].nReg); + + // Note that this doesn't change the low lane of any of these regs. + emit_->UNPCKLPS(regs[1], ::R(regs[3])); + emit_->UNPCKLPS(regs[0], ::R(regs[2])); + emit_->UNPCKLPS(regs[0], ::R(regs[1])); + emit_->MOVAPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]); + + for (int j = 0; j < 4; ++j) + DiscardReg(i + j); + i += 3; + continue; + } + + // TODO: Maybe this isn't always worth doing. 
+ emit_->UNPCKLPS(regs[0], ::R(regs[1])); + emit_->MOVLPS(MDisp(CTXREG, -128 + GetMipsRegOffset(i)), regs[0]); + + DiscardReg(i); + DiscardReg(i + 1); + ++i; + continue; + } + + IRNativeRegCacheBase::FlushAll(gprs, fprs); +} + X64Reg X64IRRegCache::TryMapTempImm(IRReg r, X64Map flags) { _dbg_assert_(IsValidGPR(r)); diff --git a/Core/MIPS/x86/X64IRRegCache.h b/Core/MIPS/x86/X64IRRegCache.h index 90e0259914..f33e4e8d89 100644 --- a/Core/MIPS/x86/X64IRRegCache.h +++ b/Core/MIPS/x86/X64IRRegCache.h @@ -92,6 +92,8 @@ public: void MapWithFlags(IRInst inst, X64IRJitConstants::X64Map destFlags, X64IRJitConstants::X64Map src1Flags = X64IRJitConstants::X64Map::NONE, X64IRJitConstants::X64Map src2Flags = X64IRJitConstants::X64Map::NONE); + // Note: may change the high lanes of single-register XMMs. + void FlushAll(bool gprs = true, bool fprs = true) override; void FlushBeforeCall(); Gen::X64Reg GetAndLockTempGPR(); From d36728e532aac8f5df765fdfaf72ce185285d7b5 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 24 Sep 2023 07:04:14 -0700 Subject: [PATCH 3/3] x86jit: Load common float vals from constants. 
--- Core/MIPS/x86/X64IRCompSystem.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Core/MIPS/x86/X64IRCompSystem.cpp b/Core/MIPS/x86/X64IRCompSystem.cpp index c038eee02d..b544c06854 100644 --- a/Core/MIPS/x86/X64IRCompSystem.cpp +++ b/Core/MIPS/x86/X64IRCompSystem.cpp @@ -62,6 +62,20 @@ void X64JitBackend::CompIR_Basic(IRInst inst) { regs_.Map(inst); if (inst.constant == 0) { XORPS(regs_.FX(inst.dest), regs_.F(inst.dest)); + } else if (inst.constant == 0x7FFFFFFF) { + MOVSS(regs_.FX(inst.dest), M(constants.noSignMask)); // rip accessible + } else if (inst.constant == 0x80000000) { + MOVSS(regs_.FX(inst.dest), M(constants.signBitAll)); // rip accessible + } else if (inst.constant == 0x7F800000) { + MOVSS(regs_.FX(inst.dest), M(constants.positiveInfinity)); // rip accessible + } else if (inst.constant == 0x7FC00000) { + MOVSS(regs_.FX(inst.dest), M(constants.qNAN)); // rip accessible + } else if (inst.constant == 0x3F800000) { + MOVSS(regs_.FX(inst.dest), M(constants.positiveOnes)); // rip accessible + } else if (inst.constant == 0xBF800000) { + MOVSS(regs_.FX(inst.dest), M(constants.negativeOnes)); // rip accessible + } else if (inst.constant == 0x4EFFFFFF) { + MOVSS(regs_.FX(inst.dest), M(constants.maxIntBelowAsFloat)); // rip accessible } else { MOV(32, R(SCRATCH1), Imm32(inst.constant)); MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));