diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index 73e84a3739..c2a5ba8c4d 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -1689,6 +1689,9 @@ void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp,
 
 void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
 
+void XEmitter::MOVSHDUP(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVHPfromRM, regOp, arg); }
+void XEmitter::MOVSLDUP(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVLPfromRM, regOp, arg); }
+
 void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
 void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
 
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index 4dcfda6d70..16f30a35b0 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -746,6 +746,10 @@ public:
 	void MOVD_xmm(const OpArg &arg, X64Reg src);
 	void MOVQ_xmm(OpArg arg, X64Reg src);
 
+	// SSE3: Some additional moves.
+	void MOVSHDUP(X64Reg regOp1, OpArg arg);
+	void MOVSLDUP(X64Reg regOp1, OpArg arg);
+
 	// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
 	void MOVMSKPS(X64Reg dest, OpArg arg);
 	void MOVMSKPD(X64Reg dest, OpArg arg);
diff --git a/Core/MIPS/IR/IRRegCache.cpp b/Core/MIPS/IR/IRRegCache.cpp
index 73ee3fa52d..1f10205cea 100644
--- a/Core/MIPS/IR/IRRegCache.cpp
+++ b/Core/MIPS/IR/IRRegCache.cpp
@@ -159,6 +159,15 @@ bool IRNativeRegCacheBase::IsFPRMapped(IRReg fpr) {
 	return mr[fpr + 32].loc == MIPSLoc::FREG || mr[fpr + 32].loc == MIPSLoc::VREG;
 }
 
+int IRNativeRegCacheBase::GetFPRLane(IRReg fpr) {
+	_dbg_assert_(IsValidFPR(fpr));
+	if (mr[fpr + 32].loc == MIPSLoc::FREG || mr[fpr + 32].loc == MIPSLoc::VREG) {
+		int l = mr[fpr + 32].lane;
+		return l == -1 ? 0 : l;
+	}
+	return -1;
+}
+
 bool IRNativeRegCacheBase::IsGPRMappedAsPointer(IRReg gpr) {
 	_dbg_assert_(IsValidGPR(gpr));
 	if (mr[gpr].loc == MIPSLoc::REG) {
diff --git a/Core/MIPS/IR/IRRegCache.h b/Core/MIPS/IR/IRRegCache.h
index 3f7d78686a..2504ad718d 100644
--- a/Core/MIPS/IR/IRRegCache.h
+++ b/Core/MIPS/IR/IRRegCache.h
@@ -163,6 +163,7 @@ public:
 	bool IsFPRMapped(IRReg fpr);
 	bool IsGPRMappedAsPointer(IRReg gpr);
 	bool IsGPRMappedAsStaticPointer(IRReg gpr);
+	int GetFPRLane(IRReg fpr);
 
 	bool IsGPRImm(IRReg gpr);
 	bool IsGPR2Imm(IRReg base);
diff --git a/Core/MIPS/RiscV/RiscVCompFPU.cpp b/Core/MIPS/RiscV/RiscVCompFPU.cpp
index 37296ab40f..00c4ff8661 100644
--- a/Core/MIPS/RiscV/RiscVCompFPU.cpp
+++ b/Core/MIPS/RiscV/RiscVCompFPU.cpp
@@ -578,9 +578,10 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
 #error Currently hard float is required.
 #endif
 
-	auto callFuncF_F = [&](float (*func)(float)){
+	auto callFuncF_F = [&](float (*func)(float)) {
 		regs_.FlushBeforeCall();
 		// It might be in a non-volatile register.
+		// TODO: May have to handle a transfer if SIMD here.
 		if (regs_.IsFPRMapped(inst.src1)) {
 			FMV(32, F10, regs_.F(inst.src1));
 		} else {
diff --git a/Core/MIPS/x86/X64IRCompFPU.cpp b/Core/MIPS/x86/X64IRCompFPU.cpp
index 9012de9c24..460363fc2d 100644
--- a/Core/MIPS/x86/X64IRCompFPU.cpp
+++ b/Core/MIPS/x86/X64IRCompFPU.cpp
@@ -45,6 +45,7 @@ alignas(16) const u32 reverseQNAN[4] = { 0x803FFFFF, 0x803FFFFF, 0x803FFFFF, 0x8
 alignas(16) const u32 noSignMask[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
 alignas(16) const u32 positiveInfinity[4] = { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 };
 alignas(16) const u32 signBitAll[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
+alignas(16) const u32 ones[4] = { 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 };
 } simdConstants;
 
 void X64JitBackend::CompIR_FArith(IRInst inst) {
@@ -646,16 +647,154 @@ void X64JitBackend::CompIR_FSat(IRInst inst) {
 	}
 }
 
+#if X64JIT_USE_XMM_CALL
+static float X64JIT_XMM_CALL x64_sin(float f) {
+	return vfpu_sin(f);
+}
+
+static float X64JIT_XMM_CALL x64_cos(float f) {
+	return vfpu_cos(f);
+}
+
+static float X64JIT_XMM_CALL x64_asin(float f) {
+	return vfpu_asin(f);
+}
+#else
+static uint32_t x64_sin(uint32_t v) {
+	float f;
+	memcpy(&f, &v, sizeof(v));
+	f = vfpu_sin(f);
+	memcpy(&v, &f, sizeof(v));
+	return v;
+}
+
+static uint32_t x64_cos(uint32_t v) {
+	float f;
+	memcpy(&f, &v, sizeof(v));
+	f = vfpu_cos(f);
+	memcpy(&v, &f, sizeof(v));
+	return v;
+}
+
+static uint32_t x64_asin(uint32_t v) {
+	float f;
+	memcpy(&f, &v, sizeof(v));
+	f = vfpu_asin(f);
+	memcpy(&v, &f, sizeof(v));
+	return v;
+}
+#endif
+
 void X64JitBackend::CompIR_FSpecial(IRInst inst) {
 	CONDITIONAL_DISABLE;
 
+	// TODO: Regcache... maybe emitter helper too?
+	auto laneToReg0 = [&](X64Reg dest, X64Reg src, int lane) {
+		if (lane == 0) {
+			if (dest != src)
+				MOVAPS(dest, R(src));
+		} else if (lane == 1 && cpu_info.bSSE3) {
+			MOVSHDUP(dest, R(src));
+		} else if (lane == 2) {
+			MOVHLPS(dest, src);
+		} else if (cpu_info.bAVX) {
+			VPERMILPS(128, dest, R(src), VFPU_SWIZZLE(lane, lane, lane, lane));
+		} else {
+			if (dest != src)
+				MOVAPS(dest, R(src));
+			SHUFPS(dest, R(dest), VFPU_SWIZZLE(lane, lane, lane, lane));
+		}
+	};
+
+	auto callFuncF_F = [&](const void *func) {
+		regs_.FlushBeforeCall();
+
+#if X64JIT_USE_XMM_CALL
+		if (regs_.IsFPRMapped(inst.src1)) {
+			int lane = regs_.GetFPRLane(inst.src1);
+			laneToReg0(XMM0, regs_.FX(inst.src1), lane);
+		} else {
+			// Account for CTXREG being increased by 128 to reduce imm sizes.
+			int offset = offsetof(MIPSState, f) + inst.src1 * 4 - 128;
+			MOVSS(XMM0, MDisp(CTXREG, offset));
+		}
+		ABI_CallFunction((const void *)func);
+
+		// It's already in place, NOINIT won't modify.
+		regs_.MapFPR(inst.dest, MIPSMap::NOINIT | X64Map::XMM0);
+#else
+		if (regs_.IsFPRMapped(inst.src1)) {
+			int lane = regs_.GetFPRLane(inst.src1);
+			if (lane == 0) {
+				MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src1));
+			} else {
+				laneToReg0(XMM0, regs_.FX(inst.src1), lane);
+				MOVD_xmm(R(SCRATCH1), XMM0);
+			}
+		} else {
+			int offset = offsetof(MIPSState, f) + inst.src1 * 4;
+			MOV(32, R(SCRATCH1), MDisp(CTXREG, offset));
+		}
+		ABI_CallFunctionR((const void *)func, SCRATCH1);
+
+		regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
+		MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
+#endif
+	};
+
 	switch (inst.op) {
 	case IROp::FSin:
+		callFuncF_F((const void *)&x64_sin);
+		break;
+
 	case IROp::FCos:
+		callFuncF_F((const void *)&x64_cos);
+		break;
+
 	case IROp::FRSqrt:
+	{
+		X64Reg tempReg = regs_.MapWithFPRTemp(inst);
+		SQRTSS(tempReg, regs_.F(inst.src1));
+
+		if (RipAccessible(&simdConstants.ones)) {
+			MOVSS(regs_.FX(inst.dest), M(&simdConstants.ones)); // rip accessible
+		} else {
+			MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
+			MOVSS(regs_.FX(inst.dest), MatR(SCRATCH1));
+		}
+		DIVSS(regs_.FX(inst.dest), R(tempReg));
+		break;
+	}
+
 	case IROp::FRecip:
+		if (inst.dest != inst.src1) {
+			regs_.Map(inst);
+			if (RipAccessible(&simdConstants.ones)) {
+				MOVSS(regs_.FX(inst.dest), M(&simdConstants.ones)); // rip accessible
+			} else {
+				MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
+				MOVSS(regs_.FX(inst.dest), MatR(SCRATCH1));
+			}
+			DIVSS(regs_.FX(inst.dest), regs_.F(inst.src1));
+		} else {
+			X64Reg tempReg = regs_.MapWithFPRTemp(inst);
+			if (RipAccessible(&simdConstants.ones)) {
+				MOVSS(tempReg, M(&simdConstants.ones)); // rip accessible
+			} else {
+				MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
+				MOVSS(tempReg, MatR(SCRATCH1));
+			}
+			if (cpu_info.bAVX) {
+				VDIVSS(regs_.FX(inst.dest), tempReg, regs_.F(inst.src1));
+			} else {
+				DIVSS(tempReg, regs_.F(inst.src1));
+				MOVSS(regs_.FX(inst.dest), R(tempReg));
+			}
+		}
+		break;
+
 	case IROp::FAsin:
-		CompIR_Generic(inst);
+		callFuncF_F((const void *)&x64_asin);
 		break;
 
 	default:
diff --git a/Core/MIPS/x86/X64IRJit.h b/Core/MIPS/x86/X64IRJit.h
index f061603535..fb50106e88 100644
--- a/Core/MIPS/x86/X64IRJit.h
+++ b/Core/MIPS/x86/X64IRJit.h
@@ -32,7 +32,7 @@
 #if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
 #define X64JIT_XMM_CALL __vectorcall
 #define X64JIT_USE_XMM_CALL 1
-#elif PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(WINDOWS)
+#elif PPSSPP_ARCH(AMD64)
 // SystemV ABI supports XMM registers.
 #define X64JIT_XMM_CALL
 #define X64JIT_USE_XMM_CALL 1
diff --git a/Core/MIPS/x86/X64IRRegCache.cpp b/Core/MIPS/x86/X64IRRegCache.cpp
index 906d952142..6193fbbc21 100644
--- a/Core/MIPS/x86/X64IRRegCache.cpp
+++ b/Core/MIPS/x86/X64IRRegCache.cpp
@@ -103,6 +103,13 @@ const int *X64IRRegCache::GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &c
 #endif
 		};
 
+		if ((flags & X64Map::MASK) == X64Map::XMM0) {
+			// Certain cases require this reg.
+			static const int blendReg[] = { XMM0 };
+			count = 1;
+			return blendReg;
+		}
+
 		count = ARRAY_SIZE(allocationOrder);
 		return allocationOrder;
 	} else {
@@ -238,6 +245,11 @@ void X64IRRegCache::MapWithFlags(IRInst inst, X64Map destFlags, X64Map src1Flags
 			flushReg(RDX);
 			break;
 
+		case X64Map::XMM0:
+			if (nr[XMMToNativeReg(XMM0)].mipsReg != mapping[i].reg)
+				flushReg(XMMToNativeReg(XMM0));
+			break;
+
 		default:
 			break;
 		}
diff --git a/Core/MIPS/x86/X64IRRegCache.h b/Core/MIPS/x86/X64IRRegCache.h
index 42dd7a3041..a263eb157c 100644
--- a/Core/MIPS/x86/X64IRRegCache.h
+++ b/Core/MIPS/x86/X64IRRegCache.h
@@ -45,10 +45,12 @@ enum class X64Map : uint8_t {
 	NONE = 0,
 	// On 32-bit: EAX, EBX, ECX, EDX
 	LOW_SUBREG = 0x10,
-	// EDX/RDX
+	// EDX/RDX for DIV/MUL/similar.
 	HIGH_DATA = 0x20,
-	// ECX/RCX
+	// ECX/RCX only, for shifts.
 	SHIFT = 0x30,
+	// XMM0 for BLENDVPS, funcs.
+	XMM0 = 0x40,
 	MASK = 0xF0,
 };
 static inline MIPSMap operator |(const MIPSMap &lhs, const X64Map &rhs) {
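
For reference, a scalar model of what the laneToReg0 helper in the patch arranges: before the scalar call, the requested source lane just has to end up in element 0 of the destination XMM register, and the JIT picks the cheapest move that does it. This is an illustrative sketch only, not part of the change; LaneToReg0Model is a hypothetical name, and the cases mirror the MOVAPS / MOVSHDUP / MOVHLPS / SHUFPS choices on plain float arrays.

#include <cstdio>

// Scalar stand-in for one XMM register holding four packed floats.
// MOVSHDUP yields {src1, src1, src3, src3}; MOVHLPS copies the high 64 bits of
// src into the low 64 bits of dest; SHUFPS/VPERMILPS can broadcast any lane.
static void LaneToReg0Model(float dest[4], const float src[4], int lane) {
	switch (lane) {
	case 0:
		// MOVAPS: plain copy, lane 0 is already where we want it.
		for (int i = 0; i < 4; ++i)
			dest[i] = src[i];
		break;
	case 1:
		// MOVSHDUP: duplicate the odd lanes downward.
		dest[0] = src[1]; dest[1] = src[1]; dest[2] = src[3]; dest[3] = src[3];
		break;
	case 2:
		// MOVHLPS: high pair of src into the low pair of dest (upper half of dest untouched).
		dest[0] = src[2]; dest[1] = src[3];
		break;
	default:
		// SHUFPS / VPERMILPS with a broadcast swizzle of the requested lane.
		for (int i = 0; i < 4; ++i)
			dest[i] = src[lane];
		break;
	}
}

int main() {
	const float src[4] = { 10.0f, 20.0f, 30.0f, 40.0f };
	float dest[4] = {};
	for (int lane = 0; lane < 4; ++lane) {
		LaneToReg0Model(dest, src, lane);
		printf("lane %d -> dest[0] = %.1f\n", lane, dest[0]);
	}
	return 0;
}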