Merge pull request #17800 from unknownbrackets/riscv-jit

More RISC-V jit ops
Henrik Rydgård 2023-07-30 09:26:22 +02:00 committed by GitHub
commit b93275bb35
12 changed files with 572 additions and 137 deletions


@ -85,7 +85,8 @@ void IRJit::Compile(u32 em_address) {
if (block_num != -1) {
IRBlock *b = blocks_.GetBlock(block_num);
// Okay, let's link and finalize the block now.
b->Finalize(block_num);
int cookie = b->GetTargetOffset() < 0 ? block_num : b->GetTargetOffset();
b->Finalize(cookie);
if (b->IsValid()) {
// Success, we're done.
return;
@ -128,13 +129,13 @@ bool IRJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32
b->SetOriginalSize(mipsBytes);
if (preload) {
// Hash, then only update page stats, don't link yet.
b->UpdateHash();
blocks_.FinalizeBlock(block_num, true);
} else {
// Overwrites the first instruction, and also updates stats.
// TODO: Should we always hash? Then we can reuse blocks.
blocks_.FinalizeBlock(block_num);
b->UpdateHash();
}
if (!CompileTargetBlock(b, block_num, preload))
return false;
// Overwrites the first instruction, and also updates stats.
blocks_.FinalizeBlock(block_num, preload);
return true;
}
@ -264,7 +265,8 @@ void IRJit::UnlinkBlock(u8 *checkedEntry, u32 originalAddress) {
void IRBlockCache::Clear() {
for (int i = 0; i < (int)blocks_.size(); ++i) {
blocks_[i].Destroy(i);
int cookie = blocks_[i].GetTargetOffset() < 0 ? i : blocks_[i].GetTargetOffset();
blocks_[i].Destroy(cookie);
}
blocks_.clear();
byPage_.clear();
@ -283,7 +285,8 @@ void IRBlockCache::InvalidateICache(u32 address, u32 length) {
for (int i : blocksInPage) {
if (blocks_[i].OverlapsRange(address, length)) {
// Not removing from the page, hopefully doesn't build up with small recompiles.
blocks_[i].Destroy(i);
int cookie = blocks_[i].GetTargetOffset() < 0 ? i : blocks_[i].GetTargetOffset();
blocks_[i].Destroy(cookie);
}
}
}
@ -291,7 +294,8 @@ void IRBlockCache::InvalidateICache(u32 address, u32 length) {
void IRBlockCache::FinalizeBlock(int i, bool preload) {
if (!preload) {
blocks_[i].Finalize(i);
int cookie = blocks_[i].GetTargetOffset() < 0 ? i : blocks_[i].GetTargetOffset();
blocks_[i].Finalize(cookie);
}
u32 startAddr, size;
@ -331,13 +335,30 @@ int IRBlockCache::FindPreloadBlock(u32 em_address) {
return -1;
}
int IRBlockCache::FindByCookie(int cookie) {
if (blocks_.empty())
return -1;
// TODO: Maybe a flag to determine target offset mode?
if (blocks_[0].GetTargetOffset() < 0)
return cookie;
for (int i = 0; i < GetNumBlocks(); ++i) {
int offset = blocks_[i].GetTargetOffset();
if (offset == cookie)
return i;
}
return -1;
}
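A standalone sketch of the cookie scheme above, using simplified hypothetical names (BlockInfo, CookieFor, FindBlockByCookie are illustrative, not the real API): the cookie written into the emuhack opcode is the block's jit target offset when one exists, otherwise the block number, and lookup only needs the linear scan in the offset case.

#include <cstdint>
#include <vector>

struct BlockInfo {
	int targetOffset = -1;  // -1 means "no native target offset" (pure IR mode)
};

// Pick the value that gets stored in the emuhack opcode for a block.
static int CookieFor(const std::vector<BlockInfo> &blocks, int blockNum) {
	const BlockInfo &b = blocks[blockNum];
	return b.targetOffset < 0 ? blockNum : b.targetOffset;
}

// Map a cookie back to a block number.
static int FindBlockByCookie(const std::vector<BlockInfo> &blocks, int cookie) {
	if (blocks.empty())
		return -1;
	// Without target offsets, the cookie simply is the block number.
	if (blocks[0].targetOffset < 0)
		return cookie;
	for (int i = 0; i < (int)blocks.size(); ++i) {
		if (blocks[i].targetOffset == cookie)
			return i;
	}
	return -1;
}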
std::vector<u32> IRBlockCache::SaveAndClearEmuHackOps() {
std::vector<u32> result;
result.resize(blocks_.size());
for (int number = 0; number < (int)blocks_.size(); ++number) {
IRBlock &b = blocks_[number];
if (b.IsValid() && b.RestoreOriginalFirstOp(number)) {
int cookie = b.GetTargetOffset() < 0 ? number : b.GetTargetOffset();
if (b.IsValid() && b.RestoreOriginalFirstOp(cookie)) {
result[number] = number;
} else {
result[number] = 0;
@ -357,7 +378,8 @@ void IRBlockCache::RestoreSavedEmuHackOps(std::vector<u32> saved) {
IRBlock &b = blocks_[number];
// Only if we restored it, write it back.
if (b.IsValid() && saved[number] != 0 && b.HasOriginalFirstOp()) {
b.Finalize(number);
int cookie = b.GetTargetOffset() < 0 ? number : b.GetTargetOffset();
b.Finalize(cookie);
}
}
}
@ -441,8 +463,8 @@ bool IRBlock::HasOriginalFirstOp() const {
return Memory::ReadUnchecked_U32(origAddr_) == origFirstOpcode_.encoding;
}
bool IRBlock::RestoreOriginalFirstOp(int number) {
const u32 emuhack = MIPS_EMUHACK_OPCODE | number;
bool IRBlock::RestoreOriginalFirstOp(int cookie) {
const u32 emuhack = MIPS_EMUHACK_OPCODE | cookie;
if (Memory::ReadUnchecked_U32(origAddr_) == emuhack) {
Memory::Write_Opcode_JIT(origAddr_, origFirstOpcode_);
return true;
@ -450,19 +472,19 @@ bool IRBlock::RestoreOriginalFirstOp(int number) {
return false;
}
void IRBlock::Finalize(int number) {
void IRBlock::Finalize(int cookie) {
// Check it wasn't invalidated, in case this is after preload.
// TODO: Allow reusing blocks when the code matches hash_ again, instead.
if (origAddr_) {
origFirstOpcode_ = Memory::Read_Opcode_JIT(origAddr_);
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | number);
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | cookie);
Memory::Write_Opcode_JIT(origAddr_, opcode);
}
}
void IRBlock::Destroy(int number) {
void IRBlock::Destroy(int cookie) {
if (origAddr_) {
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | number);
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | cookie);
if (Memory::ReadUnchecked_U32(origAddr_) == opcode.encoding)
Memory::Write_Opcode_JIT(origAddr_, origFirstOpcode_);
@ -496,7 +518,7 @@ bool IRBlock::OverlapsRange(u32 addr, u32 size) const {
}
MIPSOpcode IRJit::GetOriginalOp(MIPSOpcode op) {
IRBlock *b = blocks_.GetBlock(op.encoding & 0xFFFFFF);
IRBlock *b = blocks_.GetBlock(blocks_.FindByCookie(op.encoding & 0xFFFFFF));
if (b) {
return b->GetOriginalFirstOp();
}
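For reference, the emuhack opcode the cookie rides in keeps its payload in the low 24 bits; a minimal sketch of the encoding (the 0x68000000 constant and helper names are assumptions for illustration, consistent with the 0x68FFFFFF default visible in the header diff below):

#include <cstdint>

constexpr uint32_t kEmuHackOpcode = 0x68000000;     // assumed value of MIPS_EMUHACK_OPCODE
constexpr uint32_t kEmuHackValueMask = 0x00FFFFFF;  // low 24 bits carry the cookie

constexpr uint32_t EncodeEmuHack(uint32_t cookie) {
	return kEmuHackOpcode | (cookie & kEmuHackValueMask);
}

constexpr uint32_t DecodeCookie(uint32_t op) {
	return op & kEmuHackValueMask;
}

static_assert(DecodeCookie(EncodeEmuHack(0x1234)) == 0x1234, "cookie round-trips");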


@ -38,15 +38,16 @@ namespace MIPSComp {
// TODO : Use arena allocators. For now let's just malloc.
class IRBlock {
public:
IRBlock() : instr_(nullptr), numInstructions_(0), origAddr_(0), origSize_(0) {}
IRBlock(u32 emAddr) : instr_(nullptr), numInstructions_(0), origAddr_(emAddr), origSize_(0) {}
IRBlock() {}
IRBlock(u32 emAddr) : origAddr_(emAddr) {}
IRBlock(IRBlock &&b) {
instr_ = b.instr_;
numInstructions_ = b.numInstructions_;
hash_ = b.hash_;
origAddr_ = b.origAddr_;
origSize_ = b.origSize_;
origFirstOpcode_ = b.origFirstOpcode_;
hash_ = b.hash_;
targetOffset_ = b.targetOffset_;
numInstructions_ = b.numInstructions_;
b.instr_ = nullptr;
}
@ -71,6 +72,12 @@ public:
void SetOriginalSize(u32 size) {
origSize_ = size;
}
void SetTargetOffset(int offset) {
targetOffset_ = offset;
}
int GetTargetOffset() const {
return targetOffset_;
}
void UpdateHash() {
hash_ = CalculateHash();
}
@ -90,12 +97,13 @@ public:
private:
u64 CalculateHash() const;
IRInst *instr_;
u16 numInstructions_;
u32 origAddr_;
u32 origSize_;
IRInst *instr_ = nullptr;
u64 hash_ = 0;
u32 origAddr_ = 0;
u32 origSize_ = 0;
MIPSOpcode origFirstOpcode_ = MIPSOpcode(0x68FFFFFF);
int targetOffset_ = -1;
u16 numInstructions_ = 0;
};
class IRBlockCache : public JitBlockCacheDebugInterface {
@ -118,6 +126,7 @@ public:
}
int FindPreloadBlock(u32 em_address);
int FindByCookie(int cookie);
std::vector<u32> SaveAndClearEmuHackOps();
void RestoreSavedEmuHackOps(std::vector<u32> saved);
@ -172,6 +181,7 @@ public:
protected:
virtual bool CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32 &mipsBytes, bool preload);
virtual bool CompileTargetBlock(IRBlock *block, int block_num, bool preload) { return true; }
JitOptions jo;
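The new CompileTargetBlock hook lets a native backend emit machine code after the shared IR build; a minimal sketch of that shape, with hypothetical names (BaseJit, NativeJit, CompileTarget are illustrative only, not the real classes):

struct Block {};

class BaseJit {
public:
	virtual ~BaseJit() {}
	bool Compile(Block *b) {
		// ... build IR instructions for the block here ...
		if (!CompileTarget(b))
			return false;  // backend out of space; caller can clear the cache and retry
		// ... finalize, write the emuhack opcode ...
		return true;
	}
protected:
	// IR-only backends keep the default no-op.
	virtual bool CompileTarget(Block *) { return true; }
};

class NativeJit : public BaseJit {
protected:
	bool CompileTarget(Block *b) override {
		// Emit machine code for b and record its target offset.
		return true;
	}
};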


@ -112,7 +112,7 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
static constexpr RiscVReg regs_to_save[]{ R_RA, X8, X9, X18, X19, X20, X21, X22, X23, X24, X25, X26, X27 };
// TODO: Maybe we shouldn't regalloc all of these? Is it worth it?
static constexpr RiscVReg regs_to_save_fp[]{ F8, F9, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27 };
int saveSize = 8 * (int)(ARRAY_SIZE(regs_to_save) + ARRAY_SIZE(regs_to_save_fp));
int saveSize = (XLEN / 8) * (int)(ARRAY_SIZE(regs_to_save) + ARRAY_SIZE(regs_to_save_fp));
if (saveSize & 0xF)
saveSize += 8;
_assert_msg_((saveSize & 0xF) == 0, "Stack must be kept aligned");
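Worked out for RV64 (XLEN == 64), the frame math above comes to 13 integer plus 12 FP registers at 8 bytes each, 200 bytes, padded by 8 to stay 16-byte aligned. A small compile-time check of that arithmetic (constants restated here only for the sketch):

constexpr int XLEN = 64;        // RV64 assumed for this example
constexpr int kIntRegsSaved = 13;
constexpr int kFpRegsSaved = 12;

constexpr int SaveSize() {
	int size = (XLEN / 8) * (kIntRegsSaved + kFpRegsSaved);  // 8 * 25 = 200
	if (size & 0xF)
		size += 8;                                           // 200 -> 208
	return size;
}

static_assert(SaveSize() % 16 == 0, "stack frame stays 16-byte aligned");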
@ -120,18 +120,18 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
ADDI(R_SP, R_SP, -saveSize);
for (RiscVReg r : regs_to_save) {
SD(r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
for (RiscVReg r : regs_to_save_fp) {
FS(64, r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
_assert_(saveOffset <= saveSize);
// Fixed registers, these are always kept when in Jit context.
LI(MEMBASEREG, Memory::base, SCRATCH1);
LI(CTXREG, mips_, SCRATCH1);
LI(JITBASEREG, blockStartAddrs_, SCRATCH1);
LI(JITBASEREG, GetBasePtr(), SCRATCH1);
LoadStaticRegisters();
MovFromPC(SCRATCH1);
@ -183,35 +183,11 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
// In other words, we're comparing against the top 8 bits of MIPS_EMUHACK_OPCODE by subtracting.
ADDI(SCRATCH2, SCRATCH2, -(MIPS_EMUHACK_OPCODE >> 24));
FixupBranch needsCompile = BNE(SCRATCH2, R_ZERO);
// Use a wall to mask by 0x00FFFFFF and extract the block number.
// Use a wall to mask by 0x00FFFFFF and extract the block jit offset.
SLLI(SCRATCH1, SCRATCH1, XLEN - 24);
// But actually, we want * 8, so skip shifting back just a bit.
_assert_msg_(sizeof(blockStartAddrs_[0]) == 8, "RiscVAsm currently assumes pointers are 64-bit");
SRLI(SCRATCH1, SCRATCH1, XLEN - 24 - 3);
if (enableDebug) {
// Let's do some extra validation of the block number in debug mode for testing.
LI(SCRATCH2, MAX_ALLOWED_JIT_BLOCKS * 8);
FixupBranch highBlockNum = BGEU(SCRATCH1, SCRATCH2);
ADD(SCRATCH1, JITBASEREG, SCRATCH1);
// TODO: Consider replacing the block nums after all, just trying to use IR block cache.
LD(SCRATCH1, SCRATCH1, 0);
LI(SCRATCH2, 2);
FixupBranch invalidBlockNum = BEQ(SCRATCH1, R_ZERO);
JR(SCRATCH1);
SetJumpTarget(highBlockNum);
LI(SCRATCH2, 1);
SetJumpTarget(invalidBlockNum);
MV(X10, SCRATCH2);
QuickCallFunction(&ShowBlockError);
} else {
ADD(SCRATCH1, JITBASEREG, SCRATCH1);
// TODO: Consider replacing the block nums after all, just trying to use IR block cache.
LD(SCRATCH1, SCRATCH1, 0);
JR(SCRATCH1);
}
SRLI(SCRATCH1, SCRATCH1, XLEN - 24);
ADD(SCRATCH1, JITBASEREG, SCRATCH1);
JR(SCRATCH1);
SetJumpTarget(needsCompile);
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
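The shift pair above is the usual "wall" trick for masking without loading a constant: shifting left by XLEN - 24 discards the top 8 opcode bits, and shifting back right leaves only the 24-bit jit offset. In plain C++ (XLEN assumed to be 64 here):

#include <cstdint>

constexpr int kXlen = 64;

constexpr uint64_t ExtractLow24(uint64_t op) {
	uint64_t x = op << (kXlen - 24);  // SLLI SCRATCH1, SCRATCH1, XLEN - 24
	return x >> (kXlen - 24);         // SRLI SCRATCH1, SCRATCH1, XLEN - 24
}

static_assert(ExtractLow24(0x68ABCDEF) == 0xABCDEF, "only the low 24 bits survive");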
@ -238,17 +214,16 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
saveOffset = 0;
for (RiscVReg r : regs_to_save) {
LD(r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
for (RiscVReg r : regs_to_save_fp) {
FL(64, r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
ADDI(R_SP, R_SP, saveSize);
RET();
// TODO
crashHandler_ = GetCodePtr();
LI(SCRATCH1, &coreState, SCRATCH2);
LI(SCRATCH2, CORE_RUNTIME_ERROR);


@ -254,7 +254,13 @@ void RiscVJit::CompIR_Bits(IRInst inst) {
break;
case IROp::Clz:
CompIR_Generic(inst);
if (cpu_info.RiscV_Zbb) {
gpr.MapDirtyIn(inst.dest, inst.src1, MapType::AVOID_LOAD_MARK_NORM32);
// This even sets to 32 when zero, perfect.
CLZW(gpr.R(inst.dest), gpr.R(inst.src1));
} else {
CompIR_Generic(inst);
}
break;
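CLZW's behavior on zero is what makes the direct mapping work: MIPS CLZ of 0 is 32, and Zbb's CLZW likewise returns 32 for a zero input. A portable scalar model of what the single instruction computes:

#include <cstdint>

// Leading zeros of a 32-bit value, 32 for zero, matching MIPS CLZ and Zbb CLZW.
static int Clz32(uint32_t v) {
	int n = 0;
	while (n < 32 && (v & 0x80000000u) == 0) {
		v <<= 1;
		++n;
	}
	return n;
}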
default:
@ -640,10 +646,53 @@ void RiscVJit::CompIR_Mult(IRInst inst) {
void RiscVJit::CompIR_Div(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg numReg, denomReg;
switch (inst.op) {
case IROp::Div:
gpr.MapDirtyDirtyInIn(IRREG_LO, IRREG_HI, inst.src1, inst.src2, MapType::AVOID_LOAD_MARK_NORM32);
// We have to do this because of the divide by zero and overflow checks below.
NormalizeSrc12(inst, &numReg, &denomReg, SCRATCH1, SCRATCH2, true);
DIVW(gpr.R(IRREG_LO), numReg, denomReg);
REMW(gpr.R(IRREG_HI), numReg, denomReg);
// Now some tweaks for divide by zero and overflow.
{
// Start with divide by zero, remainder is fine.
FixupBranch skipNonZero = BNE(denomReg, R_ZERO);
FixupBranch keepNegOne = BGE(numReg, R_ZERO);
LI(gpr.R(IRREG_LO), 1);
SetJumpTarget(keepNegOne);
SetJumpTarget(skipNonZero);
// For overflow, RISC-V sets LO right, but remainder to zero.
// Cheating a bit by using R_RA as a temp...
LI(R_RA, (int32_t)0x80000000);
FixupBranch notMostNegative = BNE(numReg, R_RA);
LI(R_RA, -1);
FixupBranch notNegativeOne = BNE(denomReg, R_RA);
LI(gpr.R(IRREG_HI), -1);
SetJumpTarget(notNegativeOne);
SetJumpTarget(notMostNegative);
}
break;
case IROp::DivU:
CompIR_Generic(inst);
gpr.MapDirtyDirtyInIn(IRREG_LO, IRREG_HI, inst.src1, inst.src2, MapType::AVOID_LOAD_MARK_NORM32);
// We have to do this because of the divide by zero check below.
NormalizeSrc12(inst, &numReg, &denomReg, SCRATCH1, SCRATCH2, true);
DIVUW(gpr.R(IRREG_LO), numReg, denomReg);
REMUW(gpr.R(IRREG_HI), numReg, denomReg);
// On divide by zero, everything is correct already except the 0xFFFF case.
{
FixupBranch skipNonZero = BNE(denomReg, R_ZERO);
// Luckily, we don't need SCRATCH2/denomReg anymore.
LI(SCRATCH2, 0xFFFF);
FixupBranch keepNegOne = BLTU(SCRATCH2, numReg);
MV(gpr.R(IRREG_LO), SCRATCH2);
SetJumpTarget(keepNegOne);
SetJumpTarget(skipNonZero);
}
break;
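Read off the fixups above, the intended results (as the emitted code implements them; an illustration, not an authoritative PSP reference) can be modeled in plain C++ like this:

#include <cstdint>

static void ModelDiv(int32_t num, int32_t denom, int32_t &lo, int32_t &hi) {
	if (denom == 0) {
		lo = num < 0 ? 1 : -1;       // DIVW already left -1, patched to 1 for a negative numerator
		hi = num;                    // REMW already gives the numerator, which is kept
	} else if (num == INT32_MIN && denom == -1) {
		lo = INT32_MIN;              // DIVW result is already right for the overflow case
		hi = -1;                     // REMW gives 0 here, patched to -1
	} else {
		lo = num / denom;
		hi = num % denom;
	}
}

static void ModelDivU(uint32_t num, uint32_t denom, uint32_t &lo, uint32_t &hi) {
	if (denom == 0) {
		lo = num <= 0xFFFF ? 0xFFFF : 0xFFFFFFFFu;  // DIVUW leaves all ones; small numerators get 0xFFFF
		hi = num;
	} else {
		lo = num / denom;
		hi = num % denom;
	}
}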
default:


@ -110,18 +110,68 @@ void RiscVJit::CompIR_FArith(IRInst inst) {
void RiscVJit::CompIR_FCondAssign(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::FMin:
case IROp::FMax:
// TODO: These are tricky, have to handle order correctly.
CompIR_Generic(inst);
break;
default:
if (inst.op != IROp::FMin && inst.op != IROp::FMax)
INVALIDOP;
break;
bool maxCondition = inst.op == IROp::FMax;
// FMin and FMax are used by VFPU and handle NAN/INF as just a larger exponent.
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
// If either side is a NAN, it needs to participate in the comparison.
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// NAN is either 0x100 or 0x200.
ANDI(SCRATCH1, SCRATCH1, 0x300);
FixupBranch useNormalCond = BEQ(SCRATCH1, R_ZERO);
// Time to use bits... classify won't help because it ignores -NAN.
FMV(FMv::X, FMv::W, SCRATCH1, fpr.R(inst.src1));
FMV(FMv::X, FMv::W, SCRATCH2, fpr.R(inst.src2));
// If both are negative, we flip the comparison (not two's complement.)
// We cheat and use RA...
AND(R_RA, SCRATCH1, SCRATCH2);
SRLIW(R_RA, R_RA, 31);
if (cpu_info.RiscV_Zbb) {
FixupBranch swapCompare = BNE(R_RA, R_ZERO);
if (maxCondition)
MAX(SCRATCH1, SCRATCH1, SCRATCH2);
else
MIN(SCRATCH1, SCRATCH1, SCRATCH2);
FixupBranch skipSwapCompare = J();
SetJumpTarget(swapCompare);
if (maxCondition)
MIN(SCRATCH1, SCRATCH1, SCRATCH2);
else
MAX(SCRATCH1, SCRATCH1, SCRATCH2);
SetJumpTarget(skipSwapCompare);
} else {
RiscVReg isSrc1LowerReg = gpr.GetAndLockTempR();
gpr.ReleaseSpillLocksAndDiscardTemps();
SLT(isSrc1LowerReg, SCRATCH1, SCRATCH2);
// Flip the flag (to reverse the min/max) based on if both were negative.
XOR(isSrc1LowerReg, isSrc1LowerReg, R_RA);
FixupBranch useSrc1;
if (maxCondition)
useSrc1 = BEQ(isSrc1LowerReg, R_ZERO);
else
useSrc1 = BNE(isSrc1LowerReg, R_ZERO);
MV(SCRATCH1, SCRATCH2);
SetJumpTarget(useSrc1);
}
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
FixupBranch finish = J();
SetJumpTarget(useNormalCond);
if (maxCondition)
FMAX(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
else
FMIN(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
SetJumpTarget(finish);
}
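The NaN path above compares raw encodings because IEEE floats are sign-magnitude: for two negative values, the larger bit pattern is the smaller float, hence the flip when both sign bits are set. A scalar sketch that only mirrors the emitted selection logic (VfpuMinMax is a hypothetical name):

#include <cstdint>
#include <cstring>

static float VfpuMinMax(float a, float b, bool wantMax) {
	uint32_t ua, ub;
	std::memcpy(&ua, &a, sizeof(ua));
	std::memcpy(&ub, &b, sizeof(ub));
	bool bothNegative = ((ua & ub) >> 31) != 0;
	bool aLower = (int32_t)ua < (int32_t)ub;   // SLT on the moved bit patterns
	if (bothNegative)
		aLower = !aLower;                      // the XOR with the shared sign bit
	return (wantMax ? !aLower : aLower) ? a : b;
}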
void RiscVJit::CompIR_FAssign(IRInst inst) {
@ -220,12 +270,209 @@ void RiscVJit::CompIR_FSat(IRInst inst) {
void RiscVJit::CompIR_FCompare(IRInst inst) {
CONDITIONAL_DISABLE;
constexpr IRRegIndex IRREG_VFPUL_CC = IRREG_VFPU_CTRL_BASE + VFPU_CTRL_CC;
switch (inst.op) {
case IROp::FCmp:
switch (inst.dest) {
case IRFpCompareMode::False:
gpr.SetImm(IRREG_FPCOND, 0);
break;
case IRFpCompareMode::EitherUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// NAN is 0x100 or 0x200.
ANDI(SCRATCH1, SCRATCH1, 0x300);
SNEZ(gpr.R(IRREG_FPCOND), SCRATCH1);
break;
case IRFpCompareMode::EqualOrdered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FEQ(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IRFpCompareMode::EqualUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FEQ(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
// Now let's just OR in the unordered check.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// NAN is 0x100 or 0x200.
ANDI(SCRATCH1, SCRATCH1, 0x300);
SNEZ(SCRATCH1, SCRATCH1);
OR(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND), SCRATCH1);
break;
case IRFpCompareMode::LessEqualOrdered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLE(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IRFpCompareMode::LessEqualUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLT(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src2), fpr.R(inst.src1));
SEQZ(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND));
break;
case IRFpCompareMode::LessOrdered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLT(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IRFpCompareMode::LessUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLE(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src2), fpr.R(inst.src1));
SEQZ(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND));
break;
}
break;
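The FCLASS results above make the unordered test cheap: RISC-V FCLASS reports a signaling NaN in bit 8 (0x100) and a quiet NaN in bit 9 (0x200), so OR-ing both class masks and testing 0x300 asks "is either operand a NaN?". The scalar meaning, for reference:

#include <cmath>

static bool EitherUnordered(float a, float b) {
	return std::isnan(a) || std::isnan(b);
}

static bool EqualUnordered(float a, float b) {
	return a == b || EitherUnordered(a, b);   // FEQ result OR-ed with the NaN check
}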
case IROp::FCmovVfpuCC:
gpr.MapReg(IRREG_VFPUL_CC);
fpr.MapDirtyIn(inst.dest, inst.src1, false);
if ((inst.src2 & 0xF) == 0) {
ANDI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), 1);
} else if (cpu_info.RiscV_Zbs) {
BEXTI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), inst.src2 & 0xF);
} else {
SRLI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), inst.src2 & 0xF);
ANDI(SCRATCH1, SCRATCH1, 1);
}
if ((inst.src2 >> 7) & 1) {
FixupBranch skip = BEQ(SCRATCH1, R_ZERO);
FMV(32, fpr.R(inst.dest), fpr.R(inst.src1));
SetJumpTarget(skip);
} else {
FixupBranch skip = BNE(SCRATCH1, R_ZERO);
FMV(32, fpr.R(inst.dest), fpr.R(inst.src1));
SetJumpTarget(skip);
}
break;
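FCmovVfpuCC above reads one bit of the VFPU condition register and conditionally copies the source, with bit 7 of src2 choosing whether a set or a clear bit triggers the move. Scalar model (hypothetical helper name):

#include <cstdint>

static float CmovVfpuCC(uint32_t vfpuCC, uint8_t src2, float dest, float src1) {
	bool bit = (vfpuCC >> (src2 & 0xF)) & 1;   // BEXTI, or SRLI+ANDI, above
	bool moveIfSet = (src2 >> 7) & 1;
	if (bit == moveIfSet)
		dest = src1;
	return dest;
}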
case IROp::FCmpVfpuBit:
gpr.MapReg(IRREG_VFPUL_CC, MIPSMap::DIRTY);
switch (VCondition(inst.dest & 0xF)) {
case VC_EQ:
fpr.MapInIn(inst.src1, inst.src2);
FEQ(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
break;
case VC_NE:
fpr.MapInIn(inst.src1, inst.src2);
// We could almost negate FEQ, except NAN != NAN.
// Anything != NAN is false and NAN != NAN is within that, so we only check one side.
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
// NAN is 0x100 or 0x200.
ANDI(SCRATCH2, SCRATCH2, 0x300);
SNEZ(SCRATCH2, SCRATCH2);
FEQ(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
SEQZ(SCRATCH1, SCRATCH1);
// Just OR in whether that side was a NAN so it's always not equal.
OR(SCRATCH1, SCRATCH1, SCRATCH2);
break;
case VC_LT:
fpr.MapInIn(inst.src1, inst.src2);
FLT(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
break;
case VC_LE:
fpr.MapInIn(inst.src1, inst.src2);
FLE(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
break;
case VC_GT:
fpr.MapInIn(inst.src1, inst.src2);
FLT(32, SCRATCH1, fpr.R(inst.src2), fpr.R(inst.src1));
break;
case VC_GE:
fpr.MapInIn(inst.src1, inst.src2);
FLE(32, SCRATCH1, fpr.R(inst.src2), fpr.R(inst.src1));
break;
case VC_EZ:
case VC_NZ:
fpr.MapReg(inst.src1);
// Zero is either 0x10 or 0x08.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x18);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_EN:
case VC_NN:
fpr.MapReg(inst.src1);
// NAN is either 0x100 or 0x200.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x300);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_EI:
case VC_NI:
fpr.MapReg(inst.src1);
// Infinity is either 0x80 or 0x01.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x81);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_ES:
case VC_NS:
fpr.MapReg(inst.src1);
// Infinity is either 0x80 or 0x01, NAN is either 0x100 or 0x200.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x381);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_TR:
LI(SCRATCH1, 1);
break;
case VC_FL:
LI(SCRATCH1, 0);
break;
}
ANDI(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), ~(1 << (inst.dest >> 4)));
if ((inst.dest >> 4) != 0)
SLLI(SCRATCH1, SCRATCH1, inst.dest >> 4);
OR(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), SCRATCH1);
break;
case IROp::FCmpVfpuAggregate:
CompIR_Generic(inst);
gpr.MapReg(IRREG_VFPUL_CC, MIPSMap::DIRTY);
ANDI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), inst.dest);
// This is the "any bit", easy.
SNEZ(SCRATCH2, SCRATCH1);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine those together.
SLLI(SCRATCH1, SCRATCH1, 5);
SLLI(SCRATCH2, SCRATCH2, 4);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// Reject those any/all bits and replace them with our own.
ANDI(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), ~0x30);
OR(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), SCRATCH1);
break;
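FCmpVfpuAggregate recomputes the "any" (bit 4) and "all" (bit 5) summary bits of VFPU_CC from the condition bits selected by inst.dest; the subtract-then-compare-to-zero is just an equality test against the mask. A scalar equivalent:

#include <cstdint>

static uint32_t UpdateVfpuCCAggregate(uint32_t cc, uint32_t mask) {
	uint32_t bits = cc & mask;
	uint32_t any = bits != 0 ? 1u : 0u;
	uint32_t all = bits == mask ? 1u : 0u;  // (bits - mask) == 0, as in the ADDI/SEQZ pair
	cc &= ~0x30u;                           // drop the old any/all bits
	return cc | (any << 4) | (all << 5);
}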
default:
@ -259,13 +506,70 @@ void RiscVJit::CompIR_RoundingMode(IRInst inst) {
void RiscVJit::CompIR_FSpecial(IRInst inst) {
CONDITIONAL_DISABLE;
#ifdef __riscv_float_abi_soft
#error Currently hard float is required.
#endif
auto callFuncF_F = [&](float (*func)(float)){
gpr.FlushBeforeCall();
fpr.FlushBeforeCall();
// It might be in a non-volatile register.
if (fpr.IsMapped(inst.src1)) {
FMV(32, F10, fpr.R(inst.src1));
} else {
int offset = offsetof(MIPSState, f) + inst.src1 * 4;
FL(32, F10, CTXREG, offset);
}
QuickCallFunction(func);
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
// If it's already F10, we're done - MapReg doesn't actually overwrite the reg in that case.
if (fpr.R(inst.dest) != F10) {
FMV(32, fpr.R(inst.dest), F10);
}
};
switch (inst.op) {
case IROp::FSin:
callFuncF_F(&vfpu_sin);
break;
case IROp::FCos:
callFuncF_F(&vfpu_cos);
break;
case IROp::FRSqrt:
fpr.MapDirtyIn(inst.dest, inst.src1);
FSQRT(32, fpr.R(inst.dest), fpr.R(inst.src1));
// Ugh, we can't really avoid a temp here. Probably not worth a permanent one.
LI(SCRATCH1, 1.0f);
{
// TODO: Smarter allocation of a temp reg?
RiscVReg tempReg = fpr.R(inst.dest) == F31 ? F30 : F31;
fpr.FlushRiscVReg(tempReg);
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
FDIV(32, fpr.R(inst.dest), tempReg, fpr.R(inst.dest));
}
break;
case IROp::FRecip:
fpr.MapDirtyIn(inst.dest, inst.src1);
LI(SCRATCH1, 1.0f);
if (inst.dest != inst.src1) {
// This is the easy case.
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
FDIV(32, fpr.R(inst.dest), fpr.R(inst.dest), fpr.R(inst.src1));
} else {
RiscVReg tempReg = fpr.R(inst.dest) == F31 ? F30 : F31;
fpr.FlushRiscVReg(tempReg);
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
FDIV(32, fpr.R(inst.dest), tempReg, fpr.R(inst.src1));
}
break;
case IROp::FAsin:
CompIR_Generic(inst);
callFuncF_F(&vfpu_asin);
break;
default:


@ -20,7 +20,6 @@
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/RiscV/RiscVJit.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
@ -101,7 +100,7 @@ void RiscVJit::CompIR_Transfer(IRInst inst) {
break;
case IROp::SetCtrlVFPUFReg:
gpr.MapReg(IRREG_VFPU_CTRL_BASE + inst.dest, MIPSMap::NOINIT);
gpr.MapReg(IRREG_VFPU_CTRL_BASE + inst.dest, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), fpr.R(inst.src1));
break;
@ -167,7 +166,7 @@ void RiscVJit::CompIR_Transfer(IRInst inst) {
break;
case IROp::FMovToGPR:
gpr.MapReg(inst.dest, MIPSMap::NOINIT);
gpr.MapReg(inst.dest, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(inst.dest), fpr.R(inst.src1));
break;
@ -182,15 +181,6 @@ void RiscVJit::CompIR_System(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::Interpret:
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(MIPSOpcode(inst.constant)));
LoadStaticRegisters();
break;
case IROp::Syscall:
FlushAll();
SaveStaticRegisters();


@ -241,10 +241,25 @@ void RiscVJit::CompIR_VecPack(IRInst inst) {
case IROp::Vec4Pack31To8:
case IROp::Vec4Pack32To8:
case IROp::Vec2Pack31To16:
case IROp::Vec2Pack32To16:
CompIR_Generic(inst);
break;
case IROp::Vec2Pack32To16:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src1 + 1);
FMV(FMv::X, FMv::W, SCRATCH1, fpr.R(inst.src1));
FMV(FMv::X, FMv::W, SCRATCH2, fpr.R(inst.src1 + 1));
// Keep in mind, this was sign-extended, so we have to zero the upper.
SLLI(SCRATCH1, SCRATCH1, XLEN - 32);
// Now we just set (SCRATCH2 & 0xFFFF0000) | SCRATCH1.
SRLI(SCRATCH1, SCRATCH1, XLEN - 16);
// Use a wall to mask. We can ignore the upper 32 here.
SRLI(SCRATCH2, SCRATCH2, 16);
SLLI(SCRATCH2, SCRATCH2, 16);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// Okay, to the floating point register.
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
break;
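Vec2Pack32To16 keeps only the high 16 bits of each 32-bit lane and packs them into one register, lane 0 in the low half and lane 1 in the high half; the shift sequence above does exactly that on the sign-extended FMV results. Scalar equivalent:

#include <cstdint>

static uint32_t Pack32To16(uint32_t lane0, uint32_t lane1) {
	return (lane0 >> 16) | (lane1 & 0xFFFF0000u);
}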
default:
INVALIDOP;
break;


@ -16,7 +16,9 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Common/StringUtils.h"
#include "Common/TimeUtil.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/RiscV/RiscVJit.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
#include "Common/Profiler/Profiler.h"
@ -26,19 +28,58 @@ namespace MIPSComp {
using namespace RiscVGen;
using namespace RiscVJitConstants;
static constexpr bool enableDebug = false;
static std::map<uint8_t, int> debugSeenNotCompiledIR;
static std::map<const char *, int> debugSeenNotCompiled;
double lastDebugLog = 0.0;
static void LogDebugNotCompiled() {
if (!enableDebug)
return;
double now = time_now_d();
if (now < lastDebugLog + 1.0)
return;
lastDebugLog = now;
int worstIROp = -1;
int worstIRVal = 0;
for (auto it : debugSeenNotCompiledIR) {
if (it.second > worstIRVal) {
worstIRVal = it.second;
worstIROp = it.first;
}
}
debugSeenNotCompiledIR.clear();
const char *worstName = nullptr;
int worstVal = 0;
for (auto it : debugSeenNotCompiled) {
if (it.second > worstVal) {
worstVal = it.second;
worstName = it.first;
}
}
debugSeenNotCompiled.clear();
if (worstIROp != -1)
WARN_LOG(JIT, "Most not compiled IR op: %s (%d)", GetIRMeta((IROp)worstIROp)->name, worstIRVal);
if (worstName != nullptr)
WARN_LOG(JIT, "Most not compiled op: %s (%d)", worstName, worstVal);
}
RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo), fpr(mipsState, &jo) {
// Automatically disable incompatible options.
if (((intptr_t)Memory::base & 0x00000000FFFFFFFFUL) != 0) {
jo.enablePointerify = false;
}
// Since we store the offset, this is as big as it can be.
// We could shift off one bit to double it, would need to change RiscVAsm.
AllocCodeSpace(1024 * 1024 * 16);
SetAutoCompress(true);
// TODO: Consider replacing block num method from IRJit - this is 2MB.
blockStartAddrs_ = new const u8 *[MAX_ALLOWED_JIT_BLOCKS];
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
gpr.Init(this);
fpr.Init(this);
@ -46,42 +87,35 @@ RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo)
}
RiscVJit::~RiscVJit() {
delete [] blockStartAddrs_;
}
void RiscVJit::RunLoopUntil(u64 globalticks) {
if constexpr (enableDebug) {
LogDebugNotCompiled();
}
PROFILE_THIS_SCOPE("jit");
((void (*)())enterDispatcher_)();
}
bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32 &mipsBytes, bool preload) {
// Check that we're not full (we allow fewer blocks than IR itself.)
if (blocks_.GetNumBlocks() >= MAX_ALLOWED_JIT_BLOCKS - 1)
static void NoBlockExits() {
_assert_msg_(false, "Never exited block, invalid IR?");
}
bool RiscVJit::CompileTargetBlock(IRBlock *block, int block_num, bool preload) {
if (GetSpaceLeft() < 0x800)
return false;
if (!IRJit::CompileBlock(em_address, instructions, mipsBytes, preload))
return false;
// Don't worry, the codespace isn't large enough to overflow offsets.
block->SetTargetOffset((int)GetOffset(GetCodePointer()));
// TODO: Block linking, checked entries and such.
int block_num;
if (preload) {
block_num = blocks_.GetBlockNumberFromStartAddress(em_address);
} else {
u32 first_inst = Memory::ReadUnchecked_U32(em_address);
_assert_msg_(MIPS_IS_RUNBLOCK(first_inst), "Should've written an emuhack");
block_num = first_inst & MIPS_EMUHACK_VALUE_MASK;
}
_assert_msg_(block_num >= 0 && block_num < MAX_ALLOWED_JIT_BLOCKS, "Bad block num");
_assert_msg_(blockStartAddrs_[block_num] == nullptr, "Block %d reused before clear", block_num);
blockStartAddrs_[block_num] = GetCodePointer();
gpr.Start();
fpr.Start();
for (const IRInst &inst : instructions) {
for (int i = 0; i < block->GetNumInstructions(); ++i) {
const IRInst &inst = block->GetInstructions()[i];
CompileIRInst(inst);
if (jo.Disabled(JitDisable::REGALLOC_GPR)) {
@ -97,9 +131,11 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
}
}
// Note: a properly constructed block should never get here.
// TODO: Need to do more than just this? Call a func to set an exception?
QuickJ(R_RA, crashHandler_);
// We should've written an exit above. If we didn't, bad things will happen.
if (enableDebug) {
QuickCallFunction(&NoBlockExits);
QuickJ(R_RA, crashHandler_);
}
FlushIcache();
@ -351,6 +387,9 @@ void RiscVJit::CompileIRInst(IRInst inst) {
break;
case IROp::Interpret:
CompIR_Interpret(inst);
break;
case IROp::Syscall:
case IROp::CallReplacement:
case IROp::Break:
@ -397,6 +436,9 @@ static u32 DoIRInst(uint64_t value) {
IRInst inst;
memcpy(&inst, &value, sizeof(inst));
if constexpr (enableDebug)
debugSeenNotCompiledIR[(uint8_t)inst.op]++;
return IRInterpret(currentMIPS, &inst, 1);
}
@ -425,6 +467,26 @@ void RiscVJit::CompIR_Generic(IRInst inst) {
}
}
static void DebugInterpretHit(const char *name) {
if (enableDebug)
debugSeenNotCompiled[name]++;
}
void RiscVJit::CompIR_Interpret(IRInst inst) {
MIPSOpcode op(inst.constant);
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
if (enableDebug) {
LI(X10, MIPSGetName(op));
QuickCallFunction(&DebugInterpretHit);
}
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(op));
LoadStaticRegisters();
}
void RiscVJit::FlushAll() {
gpr.FlushAll();
fpr.FlushAll();
@ -449,17 +511,14 @@ bool RiscVJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
} else if (!IsInSpace(ptr)) {
return false;
} else {
uintptr_t uptr = (uintptr_t)ptr;
int offset = (int)GetOffset(ptr);
int block_num = -1;
for (int i = 0; i < MAX_ALLOWED_JIT_BLOCKS; ++i) {
uintptr_t blockptr = (uintptr_t)blockStartAddrs_[i];
// Out of allocated blocks.
if (uptr == 0)
break;
if (uptr >= blockptr)
for (int i = 0; i < blocks_.GetNumBlocks(); ++i) {
const auto &b = blocks_.GetBlock(i);
// We allocate linearly.
if (b->GetTargetOffset() <= offset)
block_num = i;
if (uptr < blockptr)
if (b->GetTargetOffset() > offset)
break;
}
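Since blocks are emitted back to back into the code space, the block containing a pointer is the last one whose target offset does not exceed the pointer's offset; the loop above relies on that. A standalone sketch over a plain vector of offsets (hypothetical names):

#include <vector>

static int FindBlockForOffset(const std::vector<int> &targetOffsets, int offset) {
	int blockNum = -1;
	for (int i = 0; i < (int)targetOffsets.size(); ++i) {
		if (targetOffsets[i] <= offset)
			blockNum = i;     // keep the latest block that starts at or before offset
		else
			break;            // offsets are increasing, nothing later can match
	}
	return blockNum;
}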
@ -501,8 +560,6 @@ void RiscVJit::ClearCache() {
ClearCodeSpace(jitStartOffset_);
FlushIcacheSection(region + jitStartOffset_, region + region_size - jitStartOffset_);
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
}
void RiscVJit::RestoreRoundingMode(bool force) {


@ -46,7 +46,7 @@ public:
// TODO: GetBlockCacheDebugInterface, block linking?
protected:
bool CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32 &mipsBytes, bool preload) override;
bool CompileTargetBlock(IRBlock *block, int block_num, bool preload) override;
void CompileIRInst(IRInst inst);
@ -87,6 +87,7 @@ private:
void CompIR_FStore(IRInst inst);
void CompIR_Generic(IRInst inst);
void CompIR_HiLo(IRInst inst);
void CompIR_Interpret(IRInst inst);
void CompIR_Load(IRInst inst);
void CompIR_LoadShift(IRInst inst);
void CompIR_Logic(IRInst inst);
@ -116,8 +117,6 @@ private:
RiscVRegCache gpr;
RiscVRegCacheFPU fpr;
static constexpr int MAX_ALLOWED_JIT_BLOCKS = 262144;
const u8 *enterDispatcher_ = nullptr;
const u8 *outerLoop_ = nullptr;
@ -135,7 +134,6 @@ private:
const u8 *crashHandler_ = nullptr;
int jitStartOffset_ = 0;
const u8 **blockStartAddrs_ = nullptr;
};
} // namespace MIPSComp


@ -140,6 +140,7 @@ public:
void FlushBeforeCall();
void FlushAll();
void FlushR(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void DiscardR(IRRegIndex r);
RiscVGen::RiscVReg GetAndLockTempR();
@ -163,7 +164,6 @@ private:
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void SetRegImm(RiscVGen::RiscVReg reg, u64 imm);
void AddMemBase(RiscVGen::RiscVReg reg);
int GetMipsRegOffset(IRRegIndex r);


@ -27,9 +27,6 @@
using namespace RiscVGen;
using namespace RiscVJitConstants;
using namespace RiscVGen;
using namespace RiscVJitConstants;
RiscVRegCacheFPU::RiscVRegCacheFPU(MIPSState *mipsState, MIPSComp::JitOptions *jo)
: mips_(mipsState), jo_(jo) {}
@ -279,6 +276,24 @@ RiscVReg RiscVRegCacheFPU::RiscVRegForFlush(IRRegIndex r) {
}
}
void RiscVRegCacheFPU::FlushBeforeCall() {
// Note: don't set this false at the end, since we don't flush everything.
if (!pendingFlush_) {
return;
}
// These registers are not preserved by function calls.
for (int i = 0; i <= 7; ++i) {
FlushRiscVReg(RiscVReg(F0 + i));
}
for (int i = 10; i <= 17; ++i) {
FlushRiscVReg(RiscVReg(F0 + i));
}
for (int i = 28; i <= 31; ++i) {
FlushRiscVReg(RiscVReg(F0 + i));
}
}
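The three ranges flushed above correspond to the caller-saved FP registers of the standard RISC-V calling convention: ft0-ft7 (f0-f7), fa0-fa7 (f10-f17), and ft8-ft11 (f28-f31); fs0-fs11 (f8-f9, f18-f27) are callee-saved and may stay allocated across a call. A small predicate capturing that split (illustrative only):

static bool IsCallerSavedFpr(int f) {
	return (f >= 0 && f <= 7) || (f >= 10 && f <= 17) || (f >= 28 && f <= 31);
}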
void RiscVRegCacheFPU::FlushAll() {
if (!pendingFlush_) {
// Nothing allocated. FPU regs are not nearly as common as GPR.


@ -64,11 +64,12 @@ public:
void MapInIn(IRRegIndex rd, IRRegIndex rs);
void MapDirtyIn(IRRegIndex rd, IRRegIndex rs, bool avoidLoad = true);
void MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, bool avoidLoad = true);
void Map4Dirty(IRRegIndex rdbase, bool avoidLoad = true);
void Map4DirtyIn(IRRegIndex rdbase, IRRegIndex rsbase, bool avoidLoad = true);
void Map4DirtyInIn(IRRegIndex rdbase, IRRegIndex rsbase, IRRegIndex rtbase, bool avoidLoad = true);
void FlushBeforeCall();
void FlushAll();
void FlushR(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void DiscardR(IRRegIndex r);
RiscVGen::RiscVReg R(int preg); // Returns a cached register
@ -78,7 +79,6 @@ private:
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
int GetMipsRegOffset(IRRegIndex r);
bool IsValidReg(IRRegIndex r) const;