Merge pull request #17783 from unknownbrackets/riscv-jit

Implement float/vec operations in RISC-V jit
Henrik Rydgård 2023-07-28 08:38:19 +02:00 committed by GitHub
commit 4aa2b1fcac
21 changed files with 1088 additions and 142 deletions

View file

@ -180,7 +180,7 @@ void Arm64RegCache::MapRegTo(ARM64Reg reg, MIPSGPReg mipsReg, int mapFlags) {
ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false;
if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {
if (mipsReg == MIPS_REG_ZERO) {
// If we get a request to load the zero register, at least we won't spend
// If we get a request to map the zero register, at least we won't spend
// time on a memory access...
emit_->MOVI2R(reg, 0);

View file

@ -319,6 +319,7 @@ void Arm64RegCacheFPU::FlushR(MIPSReg r) {
if (mr[r].reg == INVALID_REG) {
ERROR_LOG(JIT, "FlushR: MipsReg had bad ArmReg");
}
FlushArmReg((ARM64Reg)(S0 + mr[r].reg));
break;
case ML_MEM:
@ -329,8 +330,6 @@ void Arm64RegCacheFPU::FlushR(MIPSReg r) {
//BAD
break;
}
mr[r].loc = ML_MEM;
mr[r].reg = (int)INVALID_REG;
}
Arm64Gen::ARM64Reg Arm64RegCacheFPU::ARM64RegForFlush(int r) {

View file

@ -17,8 +17,6 @@
#pragma once
#pragma once
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/ARM64/Arm64RegCache.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
@ -165,7 +163,6 @@ private:
MIPSComp::JitOptions *jo_;
int numARMFpuReg_;
int qTime_;
enum {
// On ARM64, each of the 32 registers is a full 128 bits. No sharing of components!

View file

@ -455,6 +455,7 @@ void IRFrontend::Comp_Syscall(MIPSOpcode op) {
}
void IRFrontend::Comp_Break(MIPSOpcode op) {
ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC()));
ir.Write(IROp::Break);
js.compiling = false;
}

View file

@ -203,7 +203,8 @@ void IRFrontend::Comp_mxc1(MIPSOpcode op) {
return;
}
if (fs == 31) {
DISABLE; // TODO: Add a new op
// This needs to insert fpcond.
ir.Write(IROp::FpCtrlToReg, rt);
} else if (fs == 0) {
ir.Write(IROp::SetConst, rt, ir.AddConstant(MIPSState::FCR0_VALUE));
} else {
@ -219,7 +220,10 @@ void IRFrontend::Comp_mxc1(MIPSOpcode op) {
case 6: //ctc1
if (fs == 31) {
// Set rounding mode
DISABLE;
RestoreRoundingMode();
ir.Write(IROp::FpCtrlFromReg, 0, rt);
UpdateRoundingMode();
ApplyRoundingMode();
} else {
Comp_Generic(op);
}

View file

@ -112,6 +112,8 @@ static const IRMeta irMeta[] = {
{ IROp::FMovToGPR, "FMovToGPR", "GF" },
{ IROp::ZeroFpCond, "ZeroFpCond", "" },
{ IROp::FpCondToReg, "FpCondToReg", "G" },
{ IROp::FpCtrlFromReg, "FpCtrlFromReg", "_G" },
{ IROp::FpCtrlToReg, "FpCtrlToReg", "G" },
{ IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" },
{ IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" },
{ IROp::SetCtrlVFPUReg, "SetCtrlVFPUReg", "TG" },

View file

@ -135,6 +135,8 @@ enum class IROp : u8 {
FSatMinus1_1,
FpCondToReg,
FpCtrlFromReg,
FpCtrlToReg,
VfpuCtrlToReg,
ZeroFpCond,

View file

@ -768,9 +768,11 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2];
break;
case IROp::FMin:
// TODO: This doesn't handle VFPU ordering right.
mips->f[inst->dest] = std::min(mips->f[inst->src1], mips->f[inst->src2]);
break;
case IROp::FMax:
// TODO: This doesn't handle VFPU ordering right.
mips->f[inst->dest] = std::max(mips->f[inst->src1], mips->f[inst->src2]);
break;
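
For context on the TODOs above: std::min and std::max are asymmetric when a NaN is involved, while the VFPU picks its result deterministically. A minimal sketch of the problem (helper name is ours, not PPSSPP's):

    #include <algorithm>
    #include <cmath>

    // std::min(a, b) is b < a ? b : a, so a NaN in the second slot is
    // silently dropped while a NaN in the first slot is returned.
    static void ShowMinAsymmetry() {
        float a = std::min(std::nanf(""), 1.0f);  // NaN: comparison false, first arg wins
        float b = std::min(1.0f, std::nanf(""));  // 1.0f: comparison false, first arg wins
        (void)a; (void)b;
    }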
@ -811,6 +813,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
case IROp::FpCondToReg:
mips->r[inst->dest] = mips->fpcond;
break;
case IROp::FpCtrlFromReg:
mips->fcr31 = mips->r[inst->src1] & 0x0181FFFF;
// Extract the new fpcond value.
// TODO: Is it really helping us to keep it separate?
mips->fpcond = (mips->fcr31 >> 23) & 1;
break;
case IROp::FpCtrlToReg:
// Update the fpcond bit first.
mips->fcr31 = (mips->fcr31 & ~(1 << 23)) | ((mips->fpcond & 1) << 23);
mips->r[inst->dest] = mips->fcr31;
break;
case IROp::VfpuCtrlToReg:
mips->r[inst->dest] = mips->vfpuCtrl[inst->src1];
break;
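
For reference, the bit layout these two ops rely on, restated as a scalar sketch (names ours): the fpcond flag lives at bit 23 of fcr31, and 0x0181FFFF masks the writable fields on a write.

    #include <cstdint>

    // FpCtrlToReg: refresh bit 23 from the cached fpcond before reading.
    static uint32_t ReadFcr31(uint32_t fcr31, uint32_t fpcond) {
        return (fcr31 & ~(1u << 23)) | ((fpcond & 1) << 23);
    }

    // FpCtrlFromReg: keep only writable fields, then pull out fpcond.
    static uint32_t WriteFcr31(uint32_t value, uint32_t *fpcond) {
        uint32_t fcr31 = value & 0x0181FFFF;
        *fpcond = (fcr31 >> 23) & 1;
        return fcr31;
    }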

View file

@ -694,6 +694,13 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts
out.Write(inst);
}
break;
case IROp::FpCtrlFromReg:
gpr.MapDirtyIn(IRREG_FCR31, inst.src1);
gpr.MapDirty(IRREG_FPCOND);
goto doDefault;
case IROp::FpCtrlToReg:
gpr.MapDirtyInIn(inst.dest, IRREG_FPCOND, IRREG_FCR31);
goto doDefault;
case IROp::Vec4Init:
case IROp::Vec4Mov:

View file

@ -75,9 +75,8 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
loadStaticRegisters_ = nullptr;
}
// TODO: Do we actually need updateRoundingMode_? Hm.
//applyRoundingMode_ = AlignCode16();
if (false) {
applyRoundingMode_ = AlignCode16();
{
// Not sure if RISC-V has any flush-to-zero capability; leaving it off for now...
LWU(SCRATCH2, CTXREG, offsetof(MIPSState, fcr31));
@ -105,30 +104,6 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
RET();
}
//updateRoundingMode_ = AlignCode16();
if (false) {
LWU(SCRATCH2, CTXREG, offsetof(MIPSState, fcr31));
// Set SCRATCH2 to FZ:RM (FZ is bit 24, and RM are lowest 2 bits.)
ANDI(SCRATCH1, SCRATCH2, 1 << 24);
ANDI(SCRATCH2, SCRATCH2, 3);
SRLI(SCRATCH1, SCRATCH1, 22);
OR(SCRATCH2, SCRATCH2, SCRATCH1);
// Let's update js.currentRoundingFunc with the right convertS0ToSCRATCH1 func.
//LI(SCRATCH1, convertS0ToSCRATCH1);
if (cpu_info.RiscV_Zba) {
SH_ADD(3, SCRATCH1, SCRATCH2, SCRATCH1);
} else {
SLLI(SCRATCH2, SCRATCH2, 3);
ADD(SCRATCH1, SCRATCH1, SCRATCH2);
}
LD(SCRATCH2, SCRATCH1, 0);
//LI(SCRATCH1, &js.currentRoundingFunc);
SW(SCRATCH2, SCRATCH1, 0);
RET();
}
enterDispatcher_ = AlignCode16();
// Start by saving some regs on the stack. There are 12 GPs and 12 FPs we want.
@ -280,15 +255,6 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
SW(SCRATCH2, SCRATCH1, 0);
J(quitLoop);
// TODO: Do we need this?
static const Round roundModes[8] = { Round::NEAREST_EVEN, Round::TOZERO, Round::UP, Round::DOWN, Round::NEAREST_EVEN, Round::TOZERO, Round::UP, Round::DOWN };
for (size_t i = 0; i < ARRAY_SIZE(roundModes); ++i) {
//convertS0ToSCRATCH1[i] = AlignCode16();
//FCVT(FConv::W, FConv::S, SCRATCH1, F0, roundModes[i]);
//RET();
}
// Leave this at the end, add more stuff above.
if (enableDisasm) {
std::vector<std::string> lines = DisassembleRV64(start, GetCodePtr() - start);
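
The removed updateRoundingMode_ stub above packs FZ:RM into a table index. Sketched in C++ for clarity (helper name is ours): MIPS fcr31 keeps the rounding mode in bits 0-1 (0 nearest, 1 toward zero, 2 up, 3 down) and flush-to-zero at bit 24, matching the eight-entry roundModes table above.

    #include <cstdint>

    static int RoundingFuncIndex(uint32_t fcr31) {
        uint32_t rm = fcr31 & 3;          // bits 0-1: rounding mode
        uint32_t fz = (fcr31 >> 24) & 1;  // bit 24: flush-to-zero
        return (int)((fz << 2) | rm);     // 0-7; FZ variants occupy 4-7
    }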

View file

@ -55,7 +55,7 @@ void RiscVJit::CompIR_Exit(IRInst inst) {
case IROp::ExitToPC:
FlushAll();
QuickJ(R_RA, dispatcher_);
QuickJ(R_RA, dispatcherCheckCoreState_);
break;
default:
@ -134,7 +134,8 @@ void RiscVJit::CompIR_ExitIf(IRInst inst) {
case IROp::ExitToConstIfFpTrue:
case IROp::ExitToConstIfFpFalse:
CompIR_Generic(inst);
// Note: not used.
DISABLE;
break;
default:

View file

@ -39,12 +39,67 @@ void RiscVJit::CompIR_FArith(IRInst inst) {
switch (inst.op) {
case IROp::FAdd:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FADD(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IROp::FSub:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FSUB(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IROp::FMul:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
// TODO: If FMUL consistently produces NAN across chip vendors, we can skip this.
// Luckily this does match the RISC-V canonical NAN.
if (inst.src1 != inst.src2) {
// These will output 0x80/0x01 if infinity, 0x10/0x08 if zero.
// We need to check if one is infinity and the other zero.
// First, try inf * zero.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
ANDI(R_RA, SCRATCH1, 0x81);
FixupBranch lhsNotInf = BEQ(R_RA, R_ZERO);
ANDI(R_RA, SCRATCH2, 0x18);
FixupBranch infZero = BNE(R_RA, R_ZERO);
// Okay, what about the other order?
SetJumpTarget(lhsNotInf);
ANDI(R_RA, SCRATCH1, 0x18);
FixupBranch lhsNotZero = BEQ(R_RA, R_ZERO);
ANDI(R_RA, SCRATCH2, 0x81);
FixupBranch zeroInf = BNE(R_RA, R_ZERO);
// Nope, all good.
SetJumpTarget(lhsNotZero);
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
FixupBranch skip = J();
SetJumpTarget(infZero);
SetJumpTarget(zeroInf);
LI(SCRATCH1, 0x7FC00000);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
SetJumpTarget(skip);
} else {
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
}
break;
case IROp::FDiv:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FDIV(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IROp::FSqrt:
fpr.MapDirtyIn(inst.dest, inst.src1);
FSQRT(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
case IROp::FNeg:
CompIR_Generic(inst);
fpr.MapDirtyIn(inst.dest, inst.src1);
FNEG(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
default:
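
The FCLASS dance in the FMul case is easier to read in scalar form. FCLASS sets bit 0/7 for -inf/+inf and bit 3/4 for -0/+0, so 0x81 tests "infinity" and 0x18 tests "zero". A sketch of the semantics being enforced (helper name is ours):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    static float MulWithCanonicalNaN(float a, float b) {
        if ((std::isinf(a) && b == 0.0f) || (a == 0.0f && std::isinf(b))) {
            // Force the RISC-V canonical NaN so games see a stable payload.
            uint32_t qnan = 0x7FC00000;
            float f;
            std::memcpy(&f, &qnan, sizeof(f));
            return f;
        }
        return a * b;
    }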
@ -59,6 +114,7 @@ void RiscVJit::CompIR_FCondAssign(IRInst inst) {
switch (inst.op) {
case IROp::FMin:
case IROp::FMax:
// TODO: These are tricky, have to handle order correctly.
CompIR_Generic(inst);
break;
@ -73,11 +129,39 @@ void RiscVJit::CompIR_FAssign(IRInst inst) {
switch (inst.op) {
case IROp::FMov:
case IROp::FAbs:
case IROp::FSign:
CompIR_Generic(inst);
fpr.MapDirtyIn(inst.dest, inst.src1);
FMV(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
case IROp::FAbs:
fpr.MapDirtyIn(inst.dest, inst.src1);
FABS(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
case IROp::FSign:
{
fpr.MapDirtyIn(inst.dest, inst.src1);
// Check if it's a zero: FCLASS sets 0x10 for +0 and 0x08 for -0.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x18);
SEQZ(SCRATCH1, SCRATCH1);
// Okay, it's zero if zero, 1 otherwise. Convert 1 to a constant 1.0.
// Probably non-zero is the common case, so we make that the straight line.
FixupBranch skipOne = BEQ(SCRATCH1, R_ZERO);
LI(SCRATCH1, 1.0f);
// Now we just need the sign from it.
FMV(FMv::X, FMv::W, SCRATCH2, fpr.R(inst.src1));
// Use a wall to isolate the sign, and combine.
SRAIW(SCRATCH2, SCRATCH2, 31);
SLLIW(SCRATCH2, SCRATCH2, 31);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
SetJumpTarget(skipOne);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
break;
}
default:
INVALIDOP;
break;
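
A scalar sketch of the FSign lowering above (helper name is ours): ±0 maps to +0, anything else to ±1.0f carrying the source's sign bit, which is what the SRAIW/SLLIW "wall" isolates.

    #include <cstdint>
    #include <cstring>

    static float FSignSketch(float v) {
        uint32_t bits;
        std::memcpy(&bits, &v, sizeof(bits));
        if ((bits & 0x7FFFFFFF) == 0)
            return 0.0f;  // +0 and -0 both yield +0 (SCRATCH1 stays zero)
        uint32_t result = 0x3F800000u | (bits & 0x80000000u);  // 1.0f with v's sign
        float f;
        std::memcpy(&f, &result, sizeof(f));
        return f;
    }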
@ -135,7 +219,6 @@ void RiscVJit::CompIR_FCompare(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::ZeroFpCond:
case IROp::FCmp:
case IROp::FCmovVfpuCC:
case IROp::FCmpVfpuBit:
@ -154,9 +237,15 @@ void RiscVJit::CompIR_RoundingMode(IRInst inst) {
switch (inst.op) {
case IROp::RestoreRoundingMode:
RestoreRoundingMode();
break;
case IROp::ApplyRoundingMode:
ApplyRoundingMode();
break;
case IROp::UpdateRoundingMode:
CompIR_Generic(inst);
// We don't need to do anything; instructions allow a "dynamic" rounding mode.
break;
default:

View file

@ -53,8 +53,8 @@ void RiscVJit::SetScratch1ToSrc1Address(IRReg src1) {
#endif
}
int32_t RiscVJit::AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant) {
if (constant < -2048 || constant > 2047) {
int32_t RiscVJit::AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant, int32_t range) {
if (constant < -2048 || constant + range > 2047) {
LI(SCRATCH2, constant);
ADD(SCRATCH1, *reg, SCRATCH2);
*reg = SCRATCH1;
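
The new range parameter exists because RISC-V load/store immediates are signed 12-bit: every offset in [constant, constant + range] must be encodable, so a Vec4 access passes range = 12 to keep imm + 12 within reach. A sketch of the constraint (helper name is ours):

    #include <cstdint>

    static bool FitsInLoadStoreImm(int32_t constant, int32_t range) {
        // RISC-V I-type and S-type immediates span [-2048, 2047].
        return constant >= -2048 && constant + range <= 2047;
    }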
@ -124,7 +124,8 @@ void RiscVJit::CompIR_LoadShift(IRInst inst) {
switch (inst.op) {
case IROp::Load32Left:
case IROp::Load32Right:
CompIR_Generic(inst);
// Should not happen if the pass to split is active.
DISABLE;
break;
default:
@ -136,9 +137,28 @@ void RiscVJit::CompIR_LoadShift(IRInst inst) {
void RiscVJit::CompIR_FLoad(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::LoadFloat:
CompIR_Generic(inst);
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
FL(32, fpr.R(inst.dest), addrReg, imm);
break;
default:
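
All four float/vec load and store paths in this file build the effective address the same way. A hedged sketch, assuming PPSSPP's usual layout where MEMBASEREG holds the host base of PSP memory (helper name is ours):

    #include <cstdint>

    static const uint8_t *HostAddress(const uint8_t *membase, uint32_t mipsAddr, int32_t offset) {
    #ifdef MASKED_PSP_MEMORY
        mipsAddr &= Memory::MEMVIEW32_MASK;  // mirrors the masking above
    #endif
        return membase + mipsAddr + offset;
    }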
@ -150,9 +170,32 @@ void RiscVJit::CompIR_FLoad(IRInst inst) {
void RiscVJit::CompIR_VecLoad(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
// We need to be able to address the whole 16 bytes, so offset of 12.
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant, 12);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::LoadVec4:
CompIR_Generic(inst);
for (int i = 0; i < 4; ++i) {
// Spilling is okay.
fpr.MapReg(inst.dest + i, MIPSMap::NOINIT);
FL(32, fpr.R(inst.dest + i), addrReg, imm + 4 * i);
}
break;
default:
@ -212,7 +255,8 @@ void RiscVJit::CompIR_StoreShift(IRInst inst) {
switch (inst.op) {
case IROp::Store32Left:
case IROp::Store32Right:
CompIR_Generic(inst);
// Should not happen if the pass to split is active.
DISABLE;
break;
default:
@ -224,9 +268,28 @@ void RiscVJit::CompIR_StoreShift(IRInst inst) {
void RiscVJit::CompIR_FStore(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::StoreFloat:
CompIR_Generic(inst);
fpr.MapReg(inst.src3);
FS(32, fpr.R(inst.src3), addrReg, imm);
break;
default:
@ -238,9 +301,32 @@ void RiscVJit::CompIR_FStore(IRInst inst) {
void RiscVJit::CompIR_VecStore(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
// We need to be able to address the whole 16 bytes, so offset of 12.
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant, 12);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::StoreVec4:
CompIR_Generic(inst);
for (int i = 0; i < 4; ++i) {
// Spilling is okay, though not ideal.
fpr.MapReg(inst.src3 + i);
FS(32, fpr.R(inst.src3 + i), addrReg, imm + 4 * i);
}
break;
default:

View file

@ -15,7 +15,12 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Common/Profiler/Profiler.h"
#include "Core/Core.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/RiscV/RiscVJit.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
@ -45,7 +50,15 @@ void RiscVJit::CompIR_Basic(IRInst inst) {
break;
case IROp::SetConstF:
CompIR_Generic(inst);
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
if (inst.constant == 0) {
FCVT(FConv::S, FConv::W, fpr.R(inst.dest), R_ZERO);
} else {
// TODO: In the future, could use FLI if it's approved.
// Also, is FCVT faster?
LI(SCRATCH1, (int32_t)inst.constant);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
}
break;
case IROp::Downcount:
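
The SetConstF path above works because inst.constant already holds the float's IEEE-754 bit pattern, so moving it bitwise into an FPR (FMV.W.X) is enough; zero takes the FCVT-from-x0 shortcut to skip the integer LI. In scalar terms (helper name is ours):

    #include <cstdint>
    #include <cstring>

    static float ConstFToFloat(uint32_t constant) {
        float f;
        std::memcpy(&f, &constant, sizeof(f));  // the LI + FMV.W.X pair, in C++
        return f;
    }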
@ -78,13 +91,85 @@ void RiscVJit::CompIR_Transfer(IRInst inst) {
switch (inst.op) {
case IROp::SetCtrlVFPU:
gpr.SetImm(IRREG_VFPU_CTRL_BASE + inst.dest, (int32_t)inst.constant);
break;
case IROp::SetCtrlVFPUReg:
gpr.MapDirtyIn(IRREG_VFPU_CTRL_BASE + inst.dest, inst.src1);
MV(gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), gpr.R(inst.src1));
gpr.MarkDirty(gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), gpr.IsNormalized32(inst.src1));
break;
case IROp::SetCtrlVFPUFReg:
gpr.MapReg(IRREG_VFPU_CTRL_BASE + inst.dest, MIPSMap::NOINIT);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), fpr.R(inst.src1));
break;
case IROp::FpCondToReg:
gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
MV(gpr.R(inst.dest), gpr.R(IRREG_FPCOND));
gpr.MarkDirty(gpr.R(inst.dest), gpr.IsNormalized32(IRREG_FPCOND));
break;
case IROp::ZeroFpCond:
gpr.SetImm(IRREG_FPCOND, 0);
break;
case IROp::FpCtrlFromReg:
gpr.MapDirtyIn(IRREG_FPCOND, inst.src1, MapType::AVOID_LOAD_MARK_NORM32);
LI(SCRATCH1, 0x0181FFFF);
AND(SCRATCH1, gpr.R(inst.src1), SCRATCH1);
// Extract the new fpcond value.
if (cpu_info.RiscV_Zbs) {
BEXTI(gpr.R(IRREG_FPCOND), SCRATCH1, 23);
} else {
SRLI(gpr.R(IRREG_FPCOND), SCRATCH1, 23);
ANDI(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND), 1);
}
SW(SCRATCH1, CTXREG, IRREG_FCR31 * 4);
break;
case IROp::FpCtrlToReg:
gpr.MapDirtyIn(inst.dest, IRREG_FPCOND, MapType::AVOID_LOAD_MARK_NORM32);
// Load fcr31 and clear the fpcond bit.
LW(SCRATCH1, CTXREG, IRREG_FCR31 * 4);
if (cpu_info.RiscV_Zbs) {
BCLRI(SCRATCH1, SCRATCH1, 23);
} else {
LI(SCRATCH2, ~(1 << 23));
AND(SCRATCH1, SCRATCH1, SCRATCH2);
}
// Now get the correct fpcond bit.
ANDI(SCRATCH2, gpr.R(IRREG_FPCOND), 1);
SLLI(SCRATCH2, SCRATCH2, 23);
OR(gpr.R(inst.dest), SCRATCH1, SCRATCH2);
// Also update mips->fcr31 while we're here.
SW(gpr.R(inst.dest), CTXREG, IRREG_FCR31 * 4);
break;
case IROp::VfpuCtrlToReg:
gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
MV(gpr.R(inst.dest), gpr.R(IRREG_VFPU_CTRL_BASE + inst.src1));
gpr.MarkDirty(gpr.R(inst.dest), gpr.IsNormalized32(IRREG_VFPU_CTRL_BASE + inst.src1));
break;
case IROp::FMovFromGPR:
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
if (gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
FCVT(FConv::S, FConv::W, fpr.R(inst.dest), R_ZERO);
} else {
gpr.MapReg(inst.src1);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), gpr.R(inst.src1));
}
break;
case IROp::FMovToGPR:
CompIR_Generic(inst);
gpr.MapReg(inst.dest, MIPSMap::NOINIT);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(inst.dest), fpr.R(inst.src1));
break;
default:
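
For readers unfamiliar with the Zbs extension used in the FpCtrl cases above, the two single-bit instructions have simple scalar equivalents (bit 23 being the fpcond position assumed throughout):

    #include <cstdint>

    static uint32_t Bexti23(uint32_t v) { return (v >> 23) & 1; }    // BEXTI rd, rs, 23
    static uint32_t Bclri23(uint32_t v) { return v & ~(1u << 23); }  // BCLRI rd, rs, 23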
@ -98,10 +183,61 @@ void RiscVJit::CompIR_System(IRInst inst) {
switch (inst.op) {
case IROp::Interpret:
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(MIPSOpcode(inst.constant)));
LoadStaticRegisters();
break;
case IROp::Syscall:
FlushAll();
SaveStaticRegisters();
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
LI(X10, (int32_t)inst.constant);
QuickCallFunction(&CallSyscall);
#else
// Skip the CallSyscall where possible.
{
MIPSOpcode op(inst.constant);
void *quickFunc = GetQuickSyscallFunc(op);
if (quickFunc) {
LI(X10, (uintptr_t)GetSyscallFuncPointer(op));
QuickCallFunction((const u8 *)quickFunc);
} else {
LI(X10, (int32_t)inst.constant);
QuickCallFunction(&CallSyscall);
}
}
#endif
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10);
break;
case IROp::Break:
CompIR_Generic(inst);
FlushAll();
// This doesn't naturally have restore/apply around it.
RestoreRoundingMode(true);
SaveStaticRegisters();
MovFromPC(X10);
QuickCallFunction(&Core_Break);
LoadStaticRegisters();
ApplyRoundingMode(true);
MovFromPC(SCRATCH1);
ADDI(SCRATCH1, SCRATCH1, 4);
QuickJ(R_RA, dispatcherPCInSCRATCH1_);
break;
default:

View file

@ -39,9 +39,88 @@ void RiscVJit::CompIR_VecAssign(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Init:
for (int i = 0; i < 4; ++i)
fpr.SpillLock(inst.dest + i);
for (int i = 0; i < 4; ++i)
fpr.MapReg(inst.dest + i, MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i)
fpr.ReleaseSpillLock(inst.dest + i);
// TODO: Check if FCVT/FMV/FL is better.
switch ((Vec4Init)inst.src1) {
case Vec4Init::AllZERO:
for (int i = 0; i < 4; ++i)
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
break;
case Vec4Init::AllONE:
LI(SCRATCH1, 1.0f);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.dest));
break;
case Vec4Init::AllMinusONE:
LI(SCRATCH1, -1.0f);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.dest));
break;
case Vec4Init::Set_1000:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 0)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
case Vec4Init::Set_0100:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 1)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
case Vec4Init::Set_0010:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 2)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
case Vec4Init::Set_0001:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 3)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
}
break;
case IROp::Vec4Shuffle:
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i) {
int lane = (inst.src2 >> (i * 2)) & 3;
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + lane));
}
break;
case IROp::Vec4Mov:
CompIR_Generic(inst);
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i)
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i));
break;
default:
@ -55,13 +134,48 @@ void RiscVJit::CompIR_VecArith(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Add:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FADD(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Sub:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FSUB(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Mul:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FMUL(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Div:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FDIV(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Scale:
fpr.SpillLock(inst.src2);
fpr.MapReg(inst.src2);
fpr.Map4DirtyIn(inst.dest, inst.src1);
fpr.ReleaseSpillLock(inst.src2);
for (int i = 0; i < 4; ++i)
FMUL(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2));
break;
case IROp::Vec4Neg:
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i)
FNEG(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i));
break;
case IROp::Vec4Abs:
CompIR_Generic(inst);
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i)
FABS(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i));
break;
default:
@ -75,7 +189,39 @@ void RiscVJit::CompIR_VecHoriz(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Dot:
CompIR_Generic(inst);
// TODO: Maybe some option to call the slow accurate mode?
fpr.SpillLock(inst.dest);
for (int i = 0; i < 4; ++i) {
fpr.SpillLock(inst.src1 + i);
fpr.SpillLock(inst.src2 + i);
}
for (int i = 0; i < 4; ++i) {
fpr.MapReg(inst.src1 + i);
fpr.MapReg(inst.src2 + i);
}
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i) {
fpr.ReleaseSpillLock(inst.src1 + i);
fpr.ReleaseSpillLock(inst.src2 + i);
}
fpr.ReleaseSpillLock(inst.dest);
if ((inst.dest < inst.src1 + 4 && inst.dest >= inst.src1) || (inst.dest < inst.src2 + 4 && inst.dest >= inst.src2)) {
// This means inst.dest overlaps one of src1 or src2. We have to do that one first.
// Technically this may impact -0.0 and such, but dots accurately need to be aligned anyway.
for (int i = 0; i < 4; ++i) {
if (inst.dest == inst.src1 + i || inst.dest == inst.src2 + i)
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
}
for (int i = 0; i < 4; ++i) {
if (inst.dest != inst.src1 + i && inst.dest != inst.src2 + i)
FMADD(32, fpr.R(inst.dest), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i), fpr.R(inst.dest));
}
} else {
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
for (int i = 1; i < 4; ++i)
FMADD(32, fpr.R(inst.dest), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i), fpr.R(inst.dest));
}
break;
default:
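
A scalar sketch of the Vec4Dot lowering (helper name is ours): the FMADD chain accumulates into dest, so when dest aliases a source lane, that lane's product must be taken first or a later step would read an already-overwritten value.

    static float Vec4DotSketch(const float *s1, const float *s2) {
        float acc = s1[0] * s2[0];      // FMUL seeds the accumulator
        for (int i = 1; i < 4; ++i)
            acc = s1[i] * s2[i] + acc;  // FMADD per remaining lane
        return acc;
    }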

View file

@ -26,7 +26,7 @@ namespace MIPSComp {
using namespace RiscVGen;
using namespace RiscVJitConstants;
RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo) {
RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo), fpr(mipsState, &jo) {
// Automatically disable incompatible options.
if (((intptr_t)Memory::base & 0x00000000FFFFFFFFUL) != 0) {
jo.enablePointerify = false;
@ -40,7 +40,7 @@ RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo)
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
gpr.Init(this);
// TODO: fpr
fpr.Init(this);
GenerateFixedCode(jo);
}
@ -79,7 +79,7 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
blockStartAddrs_[block_num] = GetCodePointer();
gpr.Start();
// TODO: fpr.
fpr.Start();
for (const IRInst &inst : instructions) {
CompileIRInst(inst);
@ -87,9 +87,8 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
if (jo.Disabled(JitDisable::REGALLOC_GPR)) {
gpr.FlushAll();
}
// TODO
if (jo.Disabled(JitDisable::REGALLOC_FPR)) {
//fpr.FlushAll();
fpr.FlushAll();
}
// Safety check, in case we get a bunch of really large jit ops without a lot of branching.
@ -107,13 +106,6 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
return true;
}
static u32 DoIRInst(uint64_t value) {
IRInst inst;
memcpy(&inst, &value, sizeof(inst));
return IRInterpret(currentMIPS, &inst, 1);
}
void RiscVJit::CompileIRInst(IRInst inst) {
switch (inst.op) {
case IROp::Nop:
@ -281,7 +273,6 @@ void RiscVJit::CompileIRInst(IRInst inst) {
CompIR_FSat(inst);
break;
case IROp::ZeroFpCond:
case IROp::FCmp:
case IROp::FCmovVfpuCC:
case IROp::FCmpVfpuBit:
@ -299,6 +290,9 @@ void RiscVJit::CompileIRInst(IRInst inst) {
case IROp::SetCtrlVFPUReg:
case IROp::SetCtrlVFPUFReg:
case IROp::FpCondToReg:
case IROp::ZeroFpCond:
case IROp::FpCtrlFromReg:
case IROp::FpCtrlToReg:
case IROp::VfpuCtrlToReg:
case IROp::FMovFromGPR:
case IROp::FMovToGPR:
@ -392,9 +386,15 @@ void RiscVJit::CompileIRInst(IRInst inst) {
}
}
static u32 DoIRInst(uint64_t value) {
IRInst inst;
memcpy(&inst, &value, sizeof(inst));
return IRInterpret(currentMIPS, &inst, 1);
}
void RiscVJit::CompIR_Generic(IRInst inst) {
// For now, we're gonna do it the slow and ugly way.
// Maybe there's a smarter way to fall back?
// If we got here, we're going the slow way.
uint64_t value;
memcpy(&value, &inst, sizeof(inst));
@ -403,20 +403,24 @@ void RiscVJit::CompIR_Generic(IRInst inst) {
SaveStaticRegisters();
QuickCallFunction(&DoIRInst);
LoadStaticRegisters();
// Result in X10 aka SCRATCH1.
_assert_(X10 == SCRATCH1);
if (BInRange(dispatcherPCInSCRATCH1_)) {
BNE(X10, R_ZERO, dispatcherPCInSCRATCH1_);
} else {
FixupBranch skip = BEQ(X10, R_ZERO);
QuickJ(R_RA, dispatcherPCInSCRATCH1_);
SetJumpTarget(skip);
// We only need to check the return value if it's a potential exit.
if ((GetIRMeta(inst.op)->flags & IRFLAG_EXIT) != 0) {
// Result in X10 aka SCRATCH1.
_assert_(X10 == SCRATCH1);
if (BInRange(dispatcherPCInSCRATCH1_)) {
BNE(X10, R_ZERO, dispatcherPCInSCRATCH1_);
} else {
FixupBranch skip = BEQ(X10, R_ZERO);
QuickJ(R_RA, dispatcherPCInSCRATCH1_);
SetJumpTarget(skip);
}
}
}
void RiscVJit::FlushAll() {
gpr.FlushAll();
// TODO: fpr.
fpr.FlushAll();
}
bool RiscVJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
@ -433,6 +437,8 @@ bool RiscVJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
name = "loadStaticRegisters";
} else if (ptr == enterDispatcher_) {
name = "enterDispatcher";
} else if (ptr == applyRoundingMode_) {
name = "applyRoundingMode";
} else if (!IsInSpace(ptr)) {
return false;
} else {
@ -492,20 +498,12 @@ void RiscVJit::ClearCache() {
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
}
void RiscVJit::UpdateFCR31() {
IRJit::UpdateFCR31();
// TODO: Handle rounding modes?
}
void RiscVJit::RestoreRoundingMode(bool force) {
// TODO: Could maybe skip if not hasSetRounding? But that's on IRFrontend...
FSRMI(Round::NEAREST_EVEN);
}
void RiscVJit::ApplyRoundingMode(bool force) {
// TODO: Also could maybe sometimes skip?
//QuickCallFunction(applyRoundingMode_);
QuickCallFunction(applyRoundingMode_);
}
void RiscVJit::MovFromPC(RiscVReg r) {

View file

@ -24,6 +24,7 @@
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/MIPS/JitCommon/JitCommon.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
#include "Core/MIPS/RiscV/RiscVRegCacheFPU.h"
namespace MIPSComp {
@ -41,7 +42,6 @@ public:
const u8 *GetCrashHandler() const override;
void ClearCache() override;
void UpdateFCR31() override;
// TODO: GetBlockCacheDebugInterface, block linking?
@ -107,12 +107,13 @@ private:
void SetScratch1ToSrc1Address(IRReg src1);
// Modifies SCRATCH regs.
int32_t AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant);
int32_t AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant, int32_t range = 0);
void NormalizeSrc1(IRInst inst, RiscVGen::RiscVReg *reg, RiscVGen::RiscVReg tempReg, bool allowOverlap);
void NormalizeSrc12(IRInst inst, RiscVGen::RiscVReg *lhs, RiscVGen::RiscVReg *rhs, RiscVGen::RiscVReg lhsTempReg, RiscVGen::RiscVReg rhsTempReg, bool allowOverlap);
RiscVGen::RiscVReg NormalizeR(IRRegIndex rs, IRRegIndex rd, RiscVGen::RiscVReg tempReg);
RiscVRegCache gpr;
RiscVRegCacheFPU fpr;
static constexpr int MAX_ALLOWED_JIT_BLOCKS = 262144;
@ -125,6 +126,7 @@ private:
const u8 *dispatcher_ = nullptr;
const u8 *dispatcherNoCheck_ = nullptr;
const u8 *dispatcherFetch_ = nullptr;
const u8 *applyRoundingMode_ = nullptr;
const u8 *saveStaticRegisters_ = nullptr;
const u8 *loadStaticRegisters_ = nullptr;

View file

@ -15,15 +15,15 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#ifndef offsetof
#include <cstddef>
#endif
#include "Common/CPUDetect.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/Reporting.h"
#ifndef offsetof
#include "stddef.h"
#endif
using namespace RiscVGen;
using namespace RiscVJitConstants;
@ -36,20 +36,14 @@ void RiscVRegCache::Init(RiscVEmitter *emitter) {
}
void RiscVRegCache::Start() {
for (int i = 0; i < NUM_RVREG; i++) {
ar[i].mipsReg = IRREG_INVALID;
ar[i].isDirty = false;
ar[i].pointerified = false;
ar[i].tempLocked = false;
ar[i].normalized32 = false;
}
for (int i = 0; i < NUM_MIPSREG; i++) {
mr[i].loc = MIPSLoc::MEM;
mr[i].reg = INVALID_REG;
mr[i].imm = -1;
mr[i].spillLock = false;
mr[i].isStatic = false;
if (!initialReady_) {
SetupInitialRegs();
initialReady_ = true;
}
memcpy(ar, arInitial_, sizeof(ar));
memcpy(mr, mrInitial_, sizeof(mr));
int numStatics;
const StaticAllocation *statics = GetStaticAllocations(numStatics);
for (int i = 0; i < numStatics; i++) {
@ -61,24 +55,41 @@ void RiscVRegCache::Start() {
mr[statics[i].mr].isStatic = true;
mr[statics[i].mr].spillLock = true;
}
}
void RiscVRegCache::SetupInitialRegs() {
for (int i = 0; i < NUM_RVREG; i++) {
arInitial_[i].mipsReg = IRREG_INVALID;
arInitial_[i].isDirty = false;
arInitial_[i].pointerified = false;
arInitial_[i].tempLocked = false;
arInitial_[i].normalized32 = false;
}
for (int i = 0; i < NUM_MIPSREG; i++) {
mrInitial_[i].loc = MIPSLoc::MEM;
mrInitial_[i].reg = INVALID_REG;
mrInitial_[i].imm = -1;
mrInitial_[i].spillLock = false;
mrInitial_[i].isStatic = false;
}
// Treat R_ZERO a bit specially, but it's basically static alloc too.
ar[R_ZERO].mipsReg = MIPS_REG_ZERO;
ar[R_ZERO].normalized32 = true;
mr[MIPS_REG_ZERO].loc = MIPSLoc::RVREG_IMM;
mr[MIPS_REG_ZERO].reg = R_ZERO;
mr[MIPS_REG_ZERO].imm = 0;
mr[MIPS_REG_ZERO].isStatic = true;
arInitial_[R_ZERO].mipsReg = MIPS_REG_ZERO;
arInitial_[R_ZERO].normalized32 = true;
mrInitial_[MIPS_REG_ZERO].loc = MIPSLoc::RVREG_IMM;
mrInitial_[MIPS_REG_ZERO].reg = R_ZERO;
mrInitial_[MIPS_REG_ZERO].imm = 0;
mrInitial_[MIPS_REG_ZERO].isStatic = true;
}
const RiscVReg *RiscVRegCache::GetMIPSAllocationOrder(int &count) {
// X8 and X9 are the most ideal for static alloc because they can be used with compression.
// Otherwise we stick to saved regs - might not be necessary.
static const RiscVReg allocationOrder[] = {
X7, X8, X9, X12, X13, X14, X5, X6, X15, X16, X17, X18, X19, X20, X21, X22, X23, X28, X29, X30, X31,
X8, X9, X12, X13, X14, X15, X5, X6, X7, X16, X17, X18, X19, X20, X21, X22, X23, X28, X29, X30, X31,
};
static const RiscVReg allocationOrderStaticAlloc[] = {
X7, X12, X13, X14, X5, X6, X15, X16, X17, X21, X22, X23, X28, X29, X30, X31,
X12, X13, X14, X15, X5, X6, X7, X16, X17, X21, X22, X23, X28, X29, X30, X31,
};
if (jo_->useStaticAlloc) {
@ -432,6 +443,7 @@ RiscVReg RiscVRegCache::GetAndLockTempR() {
RiscVReg reg = AllocateReg();
if (reg != INVALID_REG) {
ar[reg].tempLocked = true;
pendingUnlock_ = true;
}
return reg;
}
@ -958,14 +970,6 @@ bool RiscVRegCache::IsImm(IRRegIndex r) const {
return mr[r].loc == MIPSLoc::IMM || mr[r].loc == MIPSLoc::RVREG_IMM;
}
bool RiscVRegCache::IsPureImm(IRRegIndex r) const {
_dbg_assert_(IsValidReg(r));
if (r == MIPS_REG_ZERO)
return true;
else
return mr[r].loc == MIPSLoc::IMM;
}
u64 RiscVRegCache::GetImm(IRRegIndex r) const {
_dbg_assert_(IsValidReg(r));
if (r == MIPS_REG_ZERO)
@ -1016,9 +1020,13 @@ void RiscVRegCache::SpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRReg
if (r2 != IRREG_INVALID) mr[r2].spillLock = true;
if (r3 != IRREG_INVALID) mr[r3].spillLock = true;
if (r4 != IRREG_INVALID) mr[r4].spillLock = true;
pendingUnlock_ = true;
}
void RiscVRegCache::ReleaseSpillLocksAndDiscardTemps() {
if (!pendingUnlock_)
return;
for (int i = 0; i < NUM_MIPSREG; i++) {
if (!mr[i].isStatic)
mr[i].spillLock = false;
@ -1026,6 +1034,8 @@ void RiscVRegCache::ReleaseSpillLocksAndDiscardTemps() {
for (int i = 0; i < NUM_RVREG; i++) {
ar[i].tempLocked = false;
}
pendingUnlock_ = false;
}
void RiscVRegCache::ReleaseSpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRRegIndex r4) {

View file

@ -68,10 +68,6 @@ enum class MapType {
} // namespace RiscVJitConstants
namespace MIPSAnalyst {
struct AnalysisResults;
};
namespace MIPSComp {
struct JitOptions;
}
@ -116,10 +112,7 @@ public:
void SetImm(IRRegIndex reg, u64 immVal);
bool IsImm(IRRegIndex reg) const;
bool IsPureImm(IRRegIndex reg) const;
u64 GetImm(IRRegIndex reg) const;
// Optimally set a register to an imm value (possibly using another register.)
void SetRegImm(RiscVGen::RiscVReg reg, u64 imm);
// May fail and return INVALID_REG if it needs flushing.
RiscVGen::RiscVReg TryMapTempImm(IRRegIndex);
@ -144,7 +137,6 @@ public:
void MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, RiscVJitConstants::MapType type = RiscVJitConstants::MapType::AVOID_LOAD);
void MapDirtyDirtyIn(IRRegIndex rd1, IRRegIndex rd2, IRRegIndex rs, RiscVJitConstants::MapType type = RiscVJitConstants::MapType::AVOID_LOAD);
void MapDirtyDirtyInIn(IRRegIndex rd1, IRRegIndex rd2, IRRegIndex rs, IRRegIndex rt, RiscVJitConstants::MapType type = RiscVJitConstants::MapType::AVOID_LOAD);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void FlushBeforeCall();
void FlushAll();
void FlushR(IRRegIndex r);
@ -171,12 +163,16 @@ private:
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void SetRegImm(RiscVGen::RiscVReg reg, u64 imm);
void AddMemBase(RiscVGen::RiscVReg reg);
int GetMipsRegOffset(IRRegIndex r);
bool IsValidReg(IRRegIndex r) const;
bool IsValidRegNoZero(IRRegIndex r) const;
void SetupInitialRegs();
MIPSState *mips_;
RiscVGen::RiscVEmitter *emit_ = nullptr;
MIPSComp::JitOptions *jo_;
@ -188,4 +184,9 @@ private:
RegStatusRiscV ar[NUM_RVREG]{};
RegStatusMIPS mr[NUM_MIPSREG]{};
bool initialReady_ = false;
bool pendingUnlock_ = false;
RegStatusRiscV arInitial_[NUM_RVREG];
RegStatusMIPS mrInitial_[NUM_MIPSREG];
};

View file

@ -14,3 +14,401 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#ifndef offsetof
#include <cstddef>
#endif
#include "Common/CPUDetect.h"
#include "Core/MIPS/RiscV/RiscVRegCacheFPU.h"
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/Reporting.h"
using namespace RiscVGen;
using namespace RiscVJitConstants;
RiscVRegCacheFPU::RiscVRegCacheFPU(MIPSState *mipsState, MIPSComp::JitOptions *jo)
: mips_(mipsState), jo_(jo) {}
void RiscVRegCacheFPU::Init(RiscVEmitter *emitter) {
emit_ = emitter;
}
void RiscVRegCacheFPU::Start() {
if (!initialReady_) {
SetupInitialRegs();
initialReady_ = true;
}
memcpy(ar, arInitial_, sizeof(ar));
memcpy(mr, mrInitial_, sizeof(mr));
pendingFlush_ = false;
}
void RiscVRegCacheFPU::SetupInitialRegs() {
for (int i = 0; i < NUM_RVFPUREG; i++) {
arInitial_[i].mipsReg = IRREG_INVALID;
arInitial_[i].isDirty = false;
}
for (int i = 0; i < NUM_MIPSFPUREG; i++) {
mrInitial_[i].loc = MIPSLoc::MEM;
mrInitial_[i].reg = (int)INVALID_REG;
mrInitial_[i].spillLock = false;
}
}
const RiscVReg *RiscVRegCacheFPU::GetMIPSAllocationOrder(int &count) {
// F8 through F15 can be encoded in compressed instructions, so they are great.
// TODO: Maybe we could remove some saved regs since we rarely need that many? Or maybe they're worth keeping?
static const RiscVReg allocationOrder[] = {
F8, F9, F10, F11, F12, F13, F14, F15,
F0, F1, F2, F3, F4, F5, F6, F7,
F16, F17, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, F31,
};
count = ARRAY_SIZE(allocationOrder);
return allocationOrder;
}
bool RiscVRegCacheFPU::IsInRAM(IRRegIndex reg) {
_dbg_assert_(IsValidReg(reg));
return mr[reg].loc == MIPSLoc::MEM;
}
bool RiscVRegCacheFPU::IsMapped(IRRegIndex mipsReg) {
_dbg_assert_(IsValidReg(mipsReg));
return mr[mipsReg].loc == MIPSLoc::RVREG;
}
RiscVReg RiscVRegCacheFPU::MapReg(IRRegIndex mipsReg, MIPSMap mapFlags) {
_dbg_assert_(IsValidReg(mipsReg));
_dbg_assert_(mr[mipsReg].loc == MIPSLoc::MEM || mr[mipsReg].loc == MIPSLoc::RVREG);
pendingFlush_ = true;
// Let's see if it's already mapped. If so we just need to update the dirty flag.
// We don't need to check for NOINIT because we assume that anyone who maps
// with that flag immediately writes a "known" value to the register.
if (mr[mipsReg].loc == MIPSLoc::RVREG) {
_assert_msg_(ar[mr[mipsReg].reg].mipsReg == mipsReg, "FPU mapping out of sync, IR=%i", mipsReg);
if ((mapFlags & MIPSMap::DIRTY) == MIPSMap::DIRTY) {
ar[mr[mipsReg].reg].isDirty = true;
}
return (RiscVReg)(mr[mipsReg].reg + F0);
}
// Okay, not mapped, so we need to allocate an RV register.
RiscVReg reg = AllocateReg();
if (reg != INVALID_REG) {
// That means it's free. Grab it, and load the value into it (if requested).
ar[reg - F0].isDirty = (mapFlags & MIPSMap::DIRTY) == MIPSMap::DIRTY;
if ((mapFlags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
if (mr[mipsReg].loc == MIPSLoc::MEM) {
emit_->FL(32, reg, CTXREG, GetMipsRegOffset(mipsReg));
}
}
ar[reg - F0].mipsReg = mipsReg;
mr[mipsReg].loc = MIPSLoc::RVREG;
mr[mipsReg].reg = reg - F0;
return reg;
}
return reg;
}
RiscVReg RiscVRegCacheFPU::AllocateReg() {
int allocCount = 0;
const RiscVReg *allocOrder = GetMIPSAllocationOrder(allocCount);
allocate:
for (int i = 0; i < allocCount; i++) {
RiscVReg reg = allocOrder[i];
if (ar[reg - F0].mipsReg == IRREG_INVALID) {
return reg;
}
}
// Still nothing. Let's spill a reg and goto 10.
// TODO: Use age or something to choose which register to spill?
// TODO: Spill dirty regs first? or opposite?
bool clobbered;
RiscVReg bestToSpill = FindBestToSpill(true, &clobbered);
if (bestToSpill == INVALID_REG) {
bestToSpill = FindBestToSpill(false, &clobbered);
}
if (bestToSpill != INVALID_REG) {
if (clobbered) {
DiscardR(ar[bestToSpill - F0].mipsReg);
} else {
FlushRiscVReg(bestToSpill);
}
// Now one must be free.
goto allocate;
}
// Uh oh, we have all of them spill-locked...
ERROR_LOG_REPORT(JIT, "Out of spillable registers near PC %08x", mips_->pc);
_assert_(bestToSpill != INVALID_REG);
return INVALID_REG;
}
RiscVReg RiscVRegCacheFPU::FindBestToSpill(bool unusedOnly, bool *clobbered) {
int allocCount = 0;
const RiscVReg *allocOrder = GetMIPSAllocationOrder(allocCount);
static const int UNUSED_LOOKAHEAD_OPS = 30;
*clobbered = false;
for (int i = 0; i < allocCount; i++) {
RiscVReg reg = allocOrder[i];
if (ar[reg - F0].mipsReg != IRREG_INVALID && mr[ar[reg - F0].mipsReg].spillLock)
continue;
// TODO: Look for clobbering in the IRInst array with index?
// Not awesome. A used reg. Let's try to avoid spilling.
// TODO: Actually check if we'd be spilling.
if (unusedOnly) {
continue;
}
return reg;
}
return INVALID_REG;
}
void RiscVRegCacheFPU::MapInIn(IRRegIndex rd, IRRegIndex rs) {
SpillLock(rd, rs);
MapReg(rd);
MapReg(rs);
ReleaseSpillLock(rd);
ReleaseSpillLock(rs);
}
void RiscVRegCacheFPU::MapDirtyIn(IRRegIndex rd, IRRegIndex rs, bool avoidLoad) {
SpillLock(rd, rs);
bool load = !avoidLoad || rd == rs;
MapReg(rd, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
MapReg(rs);
ReleaseSpillLock(rd);
ReleaseSpillLock(rs);
}
void RiscVRegCacheFPU::MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, bool avoidLoad) {
SpillLock(rd, rs, rt);
bool load = !avoidLoad || (rd == rs || rd == rt);
MapReg(rd, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
MapReg(rt);
MapReg(rs);
ReleaseSpillLock(rd);
ReleaseSpillLock(rs);
ReleaseSpillLock(rt);
}
void RiscVRegCacheFPU::Map4DirtyIn(IRRegIndex rdbase, IRRegIndex rsbase, bool avoidLoad) {
for (int i = 0; i < 4; ++i)
SpillLock(rdbase + i, rsbase + i);
bool load = !avoidLoad || (rdbase < rsbase + 4 && rdbase + 4 > rsbase);
for (int i = 0; i < 4; ++i)
MapReg(rdbase + i, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i)
MapReg(rsbase + i);
for (int i = 0; i < 4; ++i)
ReleaseSpillLock(rdbase + i, rsbase + i);
}
void RiscVRegCacheFPU::Map4DirtyInIn(IRRegIndex rdbase, IRRegIndex rsbase, IRRegIndex rtbase, bool avoidLoad) {
for (int i = 0; i < 4; ++i)
SpillLock(rdbase + i, rsbase + i, rtbase + i);
bool load = !avoidLoad || (rdbase < rsbase + 4 && rdbase + 4 > rsbase) || (rdbase < rtbase + 4 && rdbase + 4 > rtbase);
for (int i = 0; i < 4; ++i)
MapReg(rdbase + i, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i)
MapReg(rsbase + i);
for (int i = 0; i < 4; ++i)
MapReg(rtbase + i);
for (int i = 0; i < 4; ++i)
ReleaseSpillLock(rdbase + i, rsbase + i, rtbase + i);
}
void RiscVRegCacheFPU::FlushRiscVReg(RiscVReg r) {
_dbg_assert_(r >= F0 && r <= F31);
int reg = r - F0;
if (ar[reg].mipsReg == IRREG_INVALID) {
// Nothing to do, reg not mapped.
return;
}
if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == MIPSLoc::RVREG) {
emit_->FS(32, r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));
}
mr[ar[reg].mipsReg].loc = MIPSLoc::MEM;
mr[ar[reg].mipsReg].reg = (int)INVALID_REG;
ar[reg].mipsReg = IRREG_INVALID;
ar[reg].isDirty = false;
}
void RiscVRegCacheFPU::FlushR(IRRegIndex r) {
_dbg_assert_(IsValidReg(r));
RiscVReg reg = RiscVRegForFlush(r);
if (reg != INVALID_REG)
FlushRiscVReg(reg);
}
RiscVReg RiscVRegCacheFPU::RiscVRegForFlush(IRRegIndex r) {
_dbg_assert_(IsValidReg(r));
switch (mr[r].loc) {
case MIPSLoc::RVREG:
_assert_msg_(mr[r].reg != INVALID_REG, "RiscVRegForFlush: IR %d had bad RiscVReg", r);
if (mr[r].reg == INVALID_REG) {
return INVALID_REG;
}
return (RiscVReg)(F0 + mr[r].reg);
case MIPSLoc::MEM:
return INVALID_REG;
default:
_assert_(false);
return INVALID_REG;
}
}
void RiscVRegCacheFPU::FlushAll() {
if (!pendingFlush_) {
// Nothing allocated. FPU regs are not nearly as common as GPRs.
return;
}
int numRVRegs = 0;
const RiscVReg *order = GetMIPSAllocationOrder(numRVRegs);
for (int i = 0; i < numRVRegs; i++) {
int a = order[i] - F0;
int m = ar[a].mipsReg;
if (ar[a].isDirty) {
_assert_(m != MIPS_REG_INVALID);
emit_->FS(32, order[i], CTXREG, GetMipsRegOffset(m));
mr[m].loc = MIPSLoc::MEM;
mr[m].reg = (int)INVALID_REG;
ar[a].mipsReg = IRREG_INVALID;
ar[a].isDirty = false;
} else {
if (m != IRREG_INVALID) {
mr[m].loc = MIPSLoc::MEM;
mr[m].reg = (int)INVALID_REG;
}
ar[a].mipsReg = IRREG_INVALID;
}
}
pendingFlush_ = false;
}
void RiscVRegCacheFPU::DiscardR(IRRegIndex r) {
_dbg_assert_(IsValidReg(r));
switch (mr[r].loc) {
case MIPSLoc::RVREG:
_assert_(mr[r].reg != INVALID_REG);
if (mr[r].reg != INVALID_REG) {
// Note that we DO NOT write it back here. That's the whole point of Discard.
ar[mr[r].reg].isDirty = false;
ar[mr[r].reg].mipsReg = IRREG_INVALID;
}
break;
case MIPSLoc::MEM:
// Already there, nothing to do.
break;
default:
_assert_(false);
break;
}
mr[r].loc = MIPSLoc::MEM;
mr[r].reg = (int)INVALID_REG;
mr[r].spillLock = false;
}
int RiscVRegCacheFPU::GetMipsRegOffset(IRRegIndex r) {
_assert_(IsValidReg(r));
// These are offsets within the MIPSState structure.
// IR gives us an index that already skips the 32 GPRs, so add them back to compute the offset.
return (32 + r) * 4;
}
void RiscVRegCacheFPU::SpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRRegIndex r4) {
_dbg_assert_(IsValidReg(r1));
_dbg_assert_(r2 == IRREG_INVALID || IsValidReg(r2));
_dbg_assert_(r3 == IRREG_INVALID || IsValidReg(r3));
_dbg_assert_(r4 == IRREG_INVALID || IsValidReg(r4));
mr[r1].spillLock = true;
if (r2 != IRREG_INVALID)
mr[r2].spillLock = true;
if (r3 != IRREG_INVALID)
mr[r3].spillLock = true;
if (r4 != IRREG_INVALID)
mr[r4].spillLock = true;
pendingUnlock_ = true;
}
void RiscVRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
if (!pendingUnlock_)
return;
for (int i = 0; i < NUM_MIPSFPUREG; i++) {
mr[i].spillLock = false;
}
pendingUnlock_ = false;
}
void RiscVRegCacheFPU::ReleaseSpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRRegIndex r4) {
_dbg_assert_(IsValidReg(r1));
_dbg_assert_(r2 == IRREG_INVALID || IsValidReg(r2));
_dbg_assert_(r3 == IRREG_INVALID || IsValidReg(r3));
_dbg_assert_(r4 == IRREG_INVALID || IsValidReg(r4));
mr[r1].spillLock = false;
if (r2 != IRREG_INVALID)
mr[r2].spillLock = false;
if (r3 != IRREG_INVALID)
mr[r3].spillLock = false;
if (r4 != IRREG_INVALID)
mr[r4].spillLock = false;
}
RiscVReg RiscVRegCacheFPU::R(IRRegIndex mipsReg) {
_dbg_assert_(IsValidReg(mipsReg));
_dbg_assert_(mr[mipsReg].loc == MIPSLoc::RVREG);
if (mr[mipsReg].loc == MIPSLoc::RVREG) {
return (RiscVReg)(mr[mipsReg].reg + F0);
} else {
ERROR_LOG_REPORT(JIT, "Reg %i not in riscv reg", mipsReg);
return INVALID_REG; // BAAAD
}
}
bool RiscVRegCacheFPU::IsValidReg(IRRegIndex r) const {
if (r < 0 || r >= NUM_MIPSFPUREG)
return false;
// See MIPSState for these offsets.
int index = r + 32;
// Allow FPU or VFPU regs here.
if (index >= 32 && index < 32 + 32 + 128)
return true;
// Also allow VFPU temps.
if (index >= 224 && index < 224 + 16)
return true;
// Nothing else is allowed for the FPU side cache.
return false;
}

View file

@ -16,3 +16,91 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#pragma once
#include "Common/RiscVEmitter.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
struct FPURegStatusRiscV {
int mipsReg; // if -1, no mipsreg attached.
bool isDirty; // Should the register be written back?
};
struct FPURegStatusMIPS {
// Where is this MIPS register?
RiscVJitConstants::MIPSLoc loc;
// Index from F0.
int reg;
bool spillLock; // if true, this register cannot be spilled.
// If loc == ML_MEM, it's back in its location in the CPU context struct.
};
namespace MIPSComp {
struct JitOptions;
}
class RiscVRegCacheFPU {
public:
RiscVRegCacheFPU(MIPSState *mipsState, MIPSComp::JitOptions *jo);
~RiscVRegCacheFPU() {}
void Init(RiscVGen::RiscVEmitter *emitter);
// TODO: Maybe pass in IR block and start PC for logging/debugging?
void Start();
// Protect the RISC-V register containing a MIPS register from spilling, to ensure that
// it's being kept allocated.
void SpillLock(IRRegIndex reg, IRRegIndex reg2 = IRREG_INVALID, IRRegIndex reg3 = IRREG_INVALID, IRRegIndex reg4 = IRREG_INVALID);
void ReleaseSpillLock(IRRegIndex reg, IRRegIndex reg2 = IRREG_INVALID, IRRegIndex reg3 = IRREG_INVALID, IRRegIndex reg4 = IRREG_INVALID);
void ReleaseSpillLocksAndDiscardTemps();
// Returns a RISC-V register containing the requested MIPS register.
RiscVGen::RiscVReg MapReg(IRRegIndex reg, RiscVJitConstants::MIPSMap mapFlags = RiscVJitConstants::MIPSMap::INIT);
bool IsMapped(IRRegIndex r);
bool IsInRAM(IRRegIndex r);
void MapInIn(IRRegIndex rd, IRRegIndex rs);
void MapDirtyIn(IRRegIndex rd, IRRegIndex rs, bool avoidLoad = true);
void MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, bool avoidLoad = true);
void Map4Dirty(IRRegIndex rdbase, bool avoidLoad = true);
void Map4DirtyIn(IRRegIndex rdbase, IRRegIndex rsbase, bool avoidLoad = true);
void Map4DirtyInIn(IRRegIndex rdbase, IRRegIndex rsbase, IRRegIndex rtbase, bool avoidLoad = true);
void FlushAll();
void FlushR(IRRegIndex r);
void DiscardR(IRRegIndex r);
RiscVGen::RiscVReg R(int preg); // Returns a cached register
private:
const RiscVGen::RiscVReg *GetMIPSAllocationOrder(int &count);
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
int GetMipsRegOffset(IRRegIndex r);
bool IsValidReg(IRRegIndex r) const;
void SetupInitialRegs();
MIPSState *mips_;
RiscVGen::RiscVEmitter *emit_ = nullptr;
MIPSComp::JitOptions *jo_;
enum {
// On RISC-V, each of the 32 FPU registers is its own full register. No sharing of components!
NUM_RVFPUREG = 32,
NUM_MIPSFPUREG = RiscVJitConstants::TOTAL_MAPPABLE_MIPSREGS - 32,
};
FPURegStatusRiscV ar[NUM_RVFPUREG];
FPURegStatusMIPS mr[NUM_MIPSFPUREG];
bool pendingFlush_ = false;
bool pendingUnlock_ = false;
bool initialReady_ = false;
FPURegStatusRiscV arInitial_[NUM_RVFPUREG];
FPURegStatusMIPS mrInitial_[NUM_MIPSFPUREG];
};