x86jit: Implement trig/reciprocals.

Unknown W. Brackets 2023-08-27 06:32:48 -07:00
parent 4b1c809886
commit 61a99b4bac
9 changed files with 176 additions and 5 deletions

View file

@@ -1689,6 +1689,9 @@ void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp,
void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src) {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
void XEmitter::MOVSHDUP(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVHPfromRM, regOp, arg); }
void XEmitter::MOVSLDUP(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVLPfromRM, regOp, arg); }
void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
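
As a reference for the new emitter entries, a minimal host-side sketch of what MOVMSKPS computes, written with the standard SSE intrinsic (illustration only, not part of the commit):

#include <xmmintrin.h>
#include <cstdio>

int main() {
    // _mm_movemask_ps mirrors MOVMSKPS: it packs the sign bit of each of the
    // four packed floats into the low 4 bits of a general-purpose register.
    __m128 v = _mm_set_ps(-4.0f, 3.0f, -2.0f, 1.0f); // lanes 3..0
    printf("%d\n", _mm_movemask_ps(v)); // prints 10 (binary 1010: lanes 1 and 3 are negative)
    return 0;
}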

View file

@@ -746,6 +746,10 @@ public:
void MOVD_xmm(const OpArg &arg, X64Reg src);
void MOVQ_xmm(OpArg arg, X64Reg src);
// SSE3: Some additional moves.
void MOVSHDUP(X64Reg regOp1, OpArg arg);
void MOVSLDUP(X64Reg regOp1, OpArg arg);
// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
void MOVMSKPS(X64Reg dest, OpArg arg);
void MOVMSKPD(X64Reg dest, OpArg arg);
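
MOVSHDUP is what the x86 backend below uses to pull lane 1 down to lane 0 without a shuffle immediate. A sketch of its semantics via the SSE3 intrinsic (illustration only):

#include <pmmintrin.h> // SSE3

// _mm_movehdup_ps mirrors MOVSHDUP: result = { src[1], src[1], src[3], src[3] },
// so lane 1 of the source lands in lane 0 of the destination.
__m128 DupOddLanes(__m128 src) {
    return _mm_movehdup_ps(src);
}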

View file

@@ -159,6 +159,15 @@ bool IRNativeRegCacheBase::IsFPRMapped(IRReg fpr) {
return mr[fpr + 32].loc == MIPSLoc::FREG || mr[fpr + 32].loc == MIPSLoc::VREG;
}
int IRNativeRegCacheBase::GetFPRLane(IRReg fpr) {
_dbg_assert_(IsValidFPR(fpr));
if (mr[fpr + 32].loc == MIPSLoc::FREG || mr[fpr + 32].loc == MIPSLoc::VREG) {
int l = mr[fpr + 32].lane;
return l == -1 ? 0 : l;
}
return -1;
}
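
The intended caller contract, as a hypothetical sketch (mirroring the x86 backend below): a non-negative result is the SIMD lane currently holding the value, with scalar mappings reporting lane 0, and -1 means the value is not in a host register at all.

int lane = regs_.GetFPRLane(fpr);
if (lane >= 0) {
    // Mapped: shuffle that lane down to lane 0 before a scalar operation.
} else {
    // Unmapped: load the value from the MIPSState context instead.
}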
bool IRNativeRegCacheBase::IsGPRMappedAsPointer(IRReg gpr) {
_dbg_assert_(IsValidGPR(gpr));
if (mr[gpr].loc == MIPSLoc::REG) {

View file

@@ -163,6 +163,7 @@ public:
bool IsFPRMapped(IRReg fpr);
bool IsGPRMappedAsPointer(IRReg gpr);
bool IsGPRMappedAsStaticPointer(IRReg gpr);
int GetFPRLane(IRReg fpr);
bool IsGPRImm(IRReg gpr);
bool IsGPR2Imm(IRReg base);

View file

@@ -578,9 +578,10 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
#error Currently hard float is required.
#endif
-auto callFuncF_F = [&](float (*func)(float)){
+auto callFuncF_F = [&](float (*func)(float)) {
regs_.FlushBeforeCall();
// It might be in a non-volatile register.
// TODO: May have to handle a transfer if SIMD here.
if (regs_.IsFPRMapped(inst.src1)) {
FMV(32, F10, regs_.F(inst.src1));
} else {

View file

@@ -45,6 +45,7 @@ alignas(16) const u32 reverseQNAN[4] = { 0x803FFFFF, 0x803FFFFF, 0x803FFFFF, 0x8
alignas(16) const u32 noSignMask[4] = { 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF };
alignas(16) const u32 positiveInfinity[4] = { 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000 };
alignas(16) const u32 signBitAll[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 };
alignas(16) const u32 ones[4] = { 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 };
} simdConstants;
void X64JitBackend::CompIR_FArith(IRInst inst) {
@@ -646,16 +647,154 @@ void X64JitBackend::CompIR_FSat(IRInst inst) {
}
}
#if X64JIT_USE_XMM_CALL
static float X64JIT_XMM_CALL x64_sin(float f) {
return vfpu_sin(f);
}
static float X64JIT_XMM_CALL x64_cos(float f) {
return vfpu_cos(f);
}
static float X64JIT_XMM_CALL x64_asin(float f) {
return vfpu_asin(f);
}
#else
static uint32_t x64_sin(uint32_t v) {
float f;
memcpy(&f, &v, sizeof(v));
f = vfpu_sin(f);
memcpy(&v, &f, sizeof(v));
return v;
}
static uint32_t x64_cos(uint32_t v) {
float f;
memcpy(&f, &v, sizeof(v));
f = vfpu_cos(f);
memcpy(&v, &f, sizeof(v));
return v;
}
static uint32_t x64_asin(uint32_t v) {
float f;
memcpy(&f, &v, sizeof(v));
f = vfpu_asin(f);
memcpy(&v, &f, sizeof(v));
return v;
}
#endif
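
The memcpy pairs above are the portable pre-C++20 way to reinterpret float bits. Assuming a C++20 toolchain were available, the same wrapper could be written with std::bit_cast (shown for comparison only):

#include <bit>
#include <cstdint>

static uint32_t x64_sin_alt(uint32_t v) {
    return std::bit_cast<uint32_t>(vfpu_sin(std::bit_cast<float>(v)));
}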
void X64JitBackend::CompIR_FSpecial(IRInst inst) {
CONDITIONAL_DISABLE;
// TODO: Regcache... maybe emitter helper too?
auto laneToReg0 = [&](X64Reg dest, X64Reg src, int lane) {
if (lane == 0) {
if (dest != src)
MOVAPS(dest, R(src));
} else if (lane == 1 && cpu_info.bSSE3) {
MOVSHDUP(dest, R(src));
} else if (lane == 2) {
MOVHLPS(dest, src);
} else if (cpu_info.bAVX) {
VPERMILPS(128, dest, R(src), VFPU_SWIZZLE(lane, lane, lane, lane));
} else {
if (dest != src)
MOVAPS(dest, R(src));
SHUFPS(dest, R(dest), VFPU_SWIZZLE(lane, lane, lane, lane));
}
};
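
For reference, the swizzle immediate presumably follows the standard SHUFPS encoding of two bits per destination lane; under that assumption, a broadcast of lane 3 is the immediate 0xFF:

// Assumed equivalent of VFPU_SWIZZLE: two bits select the source lane for
// each destination lane, so (3, 3, 3, 3) yields 0xFF and replicates lane 3.
constexpr int SwizzleImm(int a, int b, int c, int d) {
    return a | (b << 2) | (c << 4) | (d << 6);
}
static_assert(SwizzleImm(3, 3, 3, 3) == 0xFF, "broadcast of lane 3");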
auto callFuncF_F = [&](const void *func) {
regs_.FlushBeforeCall();
#if X64JIT_USE_XMM_CALL
if (regs_.IsFPRMapped(inst.src1)) {
int lane = regs_.GetFPRLane(inst.src1);
laneToReg0(XMM0, regs_.FX(inst.src1), lane);
} else {
// Account for CTXREG being increased by 128 to reduce imm sizes.
int offset = offsetof(MIPSState, f) + inst.src1 * 4 - 128;
MOVSS(XMM0, MDisp(CTXREG, offset));
}
ABI_CallFunction((const void *)func);
// It's already in place; NOINIT won't modify it.
regs_.MapFPR(inst.dest, MIPSMap::NOINIT | X64Map::XMM0);
#else
if (regs_.IsFPRMapped(inst.src1)) {
int lane = regs_.GetFPRLane(inst.src1);
if (lane == 0) {
MOVD_xmm(R(SCRATCH1), regs_.FX(inst.src1));
} else {
laneToReg0(XMM0, regs_.FX(inst.src1), lane);
MOVD_xmm(R(SCRATCH1), XMM0);
}
} else {
int offset = offsetof(MIPSState, f) + inst.src1 * 4;
MOV(32, R(SCRATCH1), MDisp(CTXREG, offset));
}
ABI_CallFunctionR((const void *)func, SCRATCH1);
regs_.MapFPR(inst.dest, MIPSMap::NOINIT);
MOVD_xmm(regs_.FX(inst.dest), R(SCRATCH1));
#endif
};
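
To illustrate the -128 adjustment: x86 encodes displacements in [-128, 127] as a single sign-extended byte, so biasing CTXREG by +128 roughly doubles the range of context fields reachable with the short form. With a hypothetical offsetof(MIPSState, f) of 64:

// 64 + i * 4 - 128 stays within [-128, 127] for i up to 47, whereas an
// unbiased pointer would reach only f[0]..f[15] with a one-byte displacement.
static_assert(64 + 47 * 4 - 128 <= 127, "f[47] still fits a disp8 with the bias");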
switch (inst.op) {
case IROp::FSin:
callFuncF_F((const void *)&x64_sin);
break;
case IROp::FCos:
callFuncF_F((const void *)&x64_cos);
break;
case IROp::FRSqrt:
{
X64Reg tempReg = regs_.MapWithFPRTemp(inst);
SQRTSS(tempReg, regs_.F(inst.src1));
if (RipAccessible(&simdConstants.ones)) {
MOVSS(regs_.FX(inst.dest), M(&simdConstants.ones)); // rip accessible
} else {
MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
MOVSS(regs_.FX(inst.dest), MatR(SCRATCH1));
}
DIVSS(regs_.FX(inst.dest), R(tempReg));
break;
}
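
This computes a full-precision 1.0f / sqrtf(x) rather than using RSQRTSS, which is only an approximation (roughly 12 bits of relative precision). A host-side reference for what the emitted sequence does:

#include <cmath>

static float frsqrt_ref(float x) {
    return 1.0f / sqrtf(x); // SQRTSS into the temp, then ones / temp via DIVSS
}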
case IROp::FRecip:
if (inst.dest != inst.src1) {
regs_.Map(inst);
if (RipAccessible(&simdConstants.ones)) {
MOVSS(regs_.FX(inst.dest), M(&simdConstants.ones)); // rip accessible
} else {
MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
MOVSS(regs_.FX(inst.dest), MatR(SCRATCH1));
}
DIVSS(regs_.FX(inst.dest), regs_.F(inst.src1));
} else {
X64Reg tempReg = regs_.MapWithFPRTemp(inst);
if (RipAccessible(&simdConstants.ones)) {
MOVSS(tempReg, M(&simdConstants.ones)); // rip accessible
} else {
MOV(PTRBITS, R(SCRATCH1), ImmPtr(&simdConstants.ones));
MOVSS(tempReg, MatR(SCRATCH1));
}
if (cpu_info.bAVX) {
VDIVSS(regs_.FX(inst.dest), tempReg, regs_.F(inst.src1));
} else {
DIVSS(tempReg, regs_.F(inst.src1));
MOVSS(regs_.FX(inst.dest), R(tempReg));
}
}
break;
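
Both branches compute dest = 1.0f / src1. When dest aliases src1, the constant cannot be loaded into dest first without destroying the operand; AVX's three-operand VDIVSS sidesteps that by reading src1 as an explicit source, while the SSE fallback divides into the temp and moves the result over. A reference sketch:

static float frecip_ref(float x) {
    return 1.0f / x;
}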
case IROp::FAsin:
-CompIR_Generic(inst);
+callFuncF_F((const void *)&x64_asin);
break;
default:

View file

@@ -32,7 +32,7 @@
#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
#define X64JIT_XMM_CALL __vectorcall
#define X64JIT_USE_XMM_CALL 1
-#elif PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(WINDOWS)
+#elif PPSSPP_ARCH(AMD64)
// SystemV ABI supports XMM registers.
#define X64JIT_XMM_CALL
#define X64JIT_USE_XMM_CALL 1
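
The Windows exclusion can be dropped because both relevant AMD64 conventions already behave the right way here: the SysV ABI and the Microsoft x64 ABI each pass a lone float argument in XMM0 and return the float result in XMM0. A hedged sketch of the helper shape this enables (hypothetical function):

// No GPR round-trip: the argument arrives in XMM0 and the result leaves in XMM0.
static float X64JIT_XMM_CALL x64_identity(float f) {
    return f;
}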

View file

@@ -103,6 +103,13 @@ const int *X64IRRegCache::GetAllocationOrder(MIPSLoc type, MIPSMap flags, int &c
#endif
};
if ((flags & X64Map::MASK) == X64Map::XMM0) {
// Certain cases require this reg.
static const int blendReg[] = { XMM0 };
count = 1;
return blendReg;
}
count = ARRAY_SIZE(allocationOrder);
return allocationOrder;
} else {
@@ -238,6 +245,11 @@ void X64IRRegCache::MapWithFlags(IRInst inst, X64Map destFlags, X64Map src1Flags
flushReg(RDX);
break;
case X64Map::XMM0:
if (nr[XMMToNativeReg(XMM0)].mipsReg != mapping[i].reg)
flushReg(XMMToNativeReg(XMM0));
break;
default:
break;
}
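
The flush matters when another guest register currently occupies XMM0: it must be spilled before XMM0 can be handed to the mapping that requested it. The FSpecial path above relies on this when pinning the call result, as in:

regs_.MapFPR(inst.dest, MIPSMap::NOINIT | X64Map::XMM0); // result is already in XMM0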

View file

@@ -45,10 +45,12 @@ enum class X64Map : uint8_t {
NONE = 0,
// On 32-bit: EAX, EBX, ECX, EDX
LOW_SUBREG = 0x10,
// EDX/RDX
// EDX/RDX for DIV/MUL/similar.
HIGH_DATA = 0x20,
// ECX/RCX
// ECX/RCX only, for shifts.
SHIFT = 0x30,
// XMM0 only, for BLENDVPS and function calls.
XMM0 = 0x40,
MASK = 0xF0,
};
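
The X64Map values sit in the high nibble (hence MASK = 0xF0), which presumably leaves the low nibble free for the MIPSMap flags they are combined with, so both fit in one byte:

// Request a NOINIT mapping pinned to XMM0; the two flag sets do not collide.
MIPSMap flags = MIPSMap::NOINIT | X64Map::XMM0;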
static inline MIPSMap operator |(const MIPSMap &lhs, const X64Map &rhs) {