Merge pull request #17800 from unknownbrackets/riscv-jit

More RISC-V jit ops
Henrik Rydgård 2023-07-30 09:26:22 +02:00 committed by GitHub
commit b93275bb35
12 changed files with 572 additions and 137 deletions


@ -85,7 +85,8 @@ void IRJit::Compile(u32 em_address) {
if (block_num != -1) {
IRBlock *b = blocks_.GetBlock(block_num);
// Okay, let's link and finalize the block now.
b->Finalize(block_num);
int cookie = b->GetTargetOffset() < 0 ? block_num : b->GetTargetOffset();
b->Finalize(cookie);
if (b->IsValid()) {
// Success, we're done.
return;
@ -128,13 +129,13 @@ bool IRJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32
b->SetOriginalSize(mipsBytes);
if (preload) {
// Hash, then only update page stats, don't link yet.
b->UpdateHash();
blocks_.FinalizeBlock(block_num, true);
} else {
// Overwrites the first instruction, and also updates stats.
// TODO: Should we always hash? Then we can reuse blocks.
blocks_.FinalizeBlock(block_num);
b->UpdateHash();
}
if (!CompileTargetBlock(b, block_num, preload))
return false;
// Overwrites the first instruction, and also updates stats.
blocks_.FinalizeBlock(block_num, preload);
return true;
}
@ -264,7 +265,8 @@ void IRJit::UnlinkBlock(u8 *checkedEntry, u32 originalAddress) {
void IRBlockCache::Clear() {
for (int i = 0; i < (int)blocks_.size(); ++i) {
blocks_[i].Destroy(i);
int cookie = blocks_[i].GetTargetOffset() < 0 ? i : blocks_[i].GetTargetOffset();
blocks_[i].Destroy(cookie);
}
blocks_.clear();
byPage_.clear();
@ -283,7 +285,8 @@ void IRBlockCache::InvalidateICache(u32 address, u32 length) {
for (int i : blocksInPage) {
if (blocks_[i].OverlapsRange(address, length)) {
// Not removing from the page, hopefully doesn't build up with small recompiles.
blocks_[i].Destroy(i);
int cookie = blocks_[i].GetTargetOffset() < 0 ? i : blocks_[i].GetTargetOffset();
blocks_[i].Destroy(cookie);
}
}
}
@ -291,7 +294,8 @@ void IRBlockCache::InvalidateICache(u32 address, u32 length) {
void IRBlockCache::FinalizeBlock(int i, bool preload) {
if (!preload) {
blocks_[i].Finalize(i);
int cookie = blocks_[i].GetTargetOffset() < 0 ? i : blocks_[i].GetTargetOffset();
blocks_[i].Finalize(cookie);
}
u32 startAddr, size;
@ -331,13 +335,30 @@ int IRBlockCache::FindPreloadBlock(u32 em_address) {
return -1;
}
int IRBlockCache::FindByCookie(int cookie) {
if (blocks_.empty())
return -1;
// TODO: Maybe a flag to determine target offset mode?
if (blocks_[0].GetTargetOffset() < 0)
return cookie;
for (int i = 0; i < GetNumBlocks(); ++i) {
int offset = blocks_[i].GetTargetOffset();
if (offset == cookie)
return i;
}
return -1;
}
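A standalone sketch of the cookie scheme above, using simplified hypothetical names (BlockInfo, CookieFor, FindBlockByCookie are illustrative, not the real API): the cookie written into the emuhack opcode is the block's jit target offset when one exists, otherwise the block number, and lookup only needs the linear scan in the offset case.

#include <cstdint>
#include <vector>

struct BlockInfo {
	int targetOffset = -1;  // -1 means "no native target offset" (pure IR mode)
};

// Pick the value that gets stored in the emuhack opcode for a block.
static int CookieFor(const std::vector<BlockInfo> &blocks, int blockNum) {
	const BlockInfo &b = blocks[blockNum];
	return b.targetOffset < 0 ? blockNum : b.targetOffset;
}

// Map a cookie back to a block number.
static int FindBlockByCookie(const std::vector<BlockInfo> &blocks, int cookie) {
	if (blocks.empty())
		return -1;
	// Without target offsets, the cookie simply is the block number.
	if (blocks[0].targetOffset < 0)
		return cookie;
	for (int i = 0; i < (int)blocks.size(); ++i) {
		if (blocks[i].targetOffset == cookie)
			return i;
	}
	return -1;
}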
std::vector<u32> IRBlockCache::SaveAndClearEmuHackOps() {
std::vector<u32> result;
result.resize(blocks_.size());
for (int number = 0; number < (int)blocks_.size(); ++number) {
IRBlock &b = blocks_[number];
if (b.IsValid() && b.RestoreOriginalFirstOp(number)) {
int cookie = b.GetTargetOffset() < 0 ? number : b.GetTargetOffset();
if (b.IsValid() && b.RestoreOriginalFirstOp(cookie)) {
result[number] = number;
} else {
result[number] = 0;
@ -357,7 +378,8 @@ void IRBlockCache::RestoreSavedEmuHackOps(std::vector<u32> saved) {
IRBlock &b = blocks_[number];
// Only if we restored it, write it back.
if (b.IsValid() && saved[number] != 0 && b.HasOriginalFirstOp()) {
b.Finalize(number);
int cookie = b.GetTargetOffset() < 0 ? number : b.GetTargetOffset();
b.Finalize(cookie);
}
}
}
@ -441,8 +463,8 @@ bool IRBlock::HasOriginalFirstOp() const {
return Memory::ReadUnchecked_U32(origAddr_) == origFirstOpcode_.encoding;
}
bool IRBlock::RestoreOriginalFirstOp(int number) {
const u32 emuhack = MIPS_EMUHACK_OPCODE | number;
bool IRBlock::RestoreOriginalFirstOp(int cookie) {
const u32 emuhack = MIPS_EMUHACK_OPCODE | cookie;
if (Memory::ReadUnchecked_U32(origAddr_) == emuhack) {
Memory::Write_Opcode_JIT(origAddr_, origFirstOpcode_);
return true;
@ -450,19 +472,19 @@ bool IRBlock::RestoreOriginalFirstOp(int number) {
return false;
}
void IRBlock::Finalize(int number) {
void IRBlock::Finalize(int cookie) {
// Check it wasn't invalidated, in case this is after preload.
// TODO: Allow reusing blocks when the code matches hash_ again, instead.
if (origAddr_) {
origFirstOpcode_ = Memory::Read_Opcode_JIT(origAddr_);
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | number);
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | cookie);
Memory::Write_Opcode_JIT(origAddr_, opcode);
}
}
void IRBlock::Destroy(int number) {
void IRBlock::Destroy(int cookie) {
if (origAddr_) {
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | number);
MIPSOpcode opcode = MIPSOpcode(MIPS_EMUHACK_OPCODE | cookie);
if (Memory::ReadUnchecked_U32(origAddr_) == opcode.encoding)
Memory::Write_Opcode_JIT(origAddr_, origFirstOpcode_);
@ -496,7 +518,7 @@ bool IRBlock::OverlapsRange(u32 addr, u32 size) const {
}
MIPSOpcode IRJit::GetOriginalOp(MIPSOpcode op) {
IRBlock *b = blocks_.GetBlock(op.encoding & 0xFFFFFF);
IRBlock *b = blocks_.GetBlock(blocks_.FindByCookie(op.encoding & 0xFFFFFF));
if (b) {
return b->GetOriginalFirstOp();
}
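For reference, the emuhack opcode the cookie rides in keeps its payload in the low 24 bits; a minimal sketch of the encoding (the 0x68000000 constant and helper names are assumptions for illustration, consistent with the 0x68FFFFFF default visible in the header diff below):

#include <cstdint>

constexpr uint32_t kEmuHackOpcode = 0x68000000;     // assumed value of MIPS_EMUHACK_OPCODE
constexpr uint32_t kEmuHackValueMask = 0x00FFFFFF;  // low 24 bits carry the cookie

constexpr uint32_t EncodeEmuHack(uint32_t cookie) {
	return kEmuHackOpcode | (cookie & kEmuHackValueMask);
}

constexpr uint32_t DecodeCookie(uint32_t op) {
	return op & kEmuHackValueMask;
}

static_assert(DecodeCookie(EncodeEmuHack(0x1234)) == 0x1234, "cookie round-trips");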


@ -38,15 +38,16 @@ namespace MIPSComp {
// TODO : Use arena allocators. For now let's just malloc.
class IRBlock {
public:
IRBlock() : instr_(nullptr), numInstructions_(0), origAddr_(0), origSize_(0) {}
IRBlock(u32 emAddr) : instr_(nullptr), numInstructions_(0), origAddr_(emAddr), origSize_(0) {}
IRBlock() {}
IRBlock(u32 emAddr) : origAddr_(emAddr) {}
IRBlock(IRBlock &&b) {
instr_ = b.instr_;
numInstructions_ = b.numInstructions_;
hash_ = b.hash_;
origAddr_ = b.origAddr_;
origSize_ = b.origSize_;
origFirstOpcode_ = b.origFirstOpcode_;
hash_ = b.hash_;
targetOffset_ = b.targetOffset_;
numInstructions_ = b.numInstructions_;
b.instr_ = nullptr;
}
@ -71,6 +72,12 @@ public:
void SetOriginalSize(u32 size) {
origSize_ = size;
}
void SetTargetOffset(int offset) {
targetOffset_ = offset;
}
int GetTargetOffset() const {
return targetOffset_;
}
void UpdateHash() {
hash_ = CalculateHash();
}
@ -90,12 +97,13 @@ public:
private:
u64 CalculateHash() const;
IRInst *instr_;
u16 numInstructions_;
u32 origAddr_;
u32 origSize_;
IRInst *instr_ = nullptr;
u64 hash_ = 0;
u32 origAddr_ = 0;
u32 origSize_ = 0;
MIPSOpcode origFirstOpcode_ = MIPSOpcode(0x68FFFFFF);
int targetOffset_ = -1;
u16 numInstructions_ = 0;
};
class IRBlockCache : public JitBlockCacheDebugInterface {
@ -118,6 +126,7 @@ public:
}
int FindPreloadBlock(u32 em_address);
int FindByCookie(int cookie);
std::vector<u32> SaveAndClearEmuHackOps();
void RestoreSavedEmuHackOps(std::vector<u32> saved);
@ -172,6 +181,7 @@ public:
protected:
virtual bool CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32 &mipsBytes, bool preload);
virtual bool CompileTargetBlock(IRBlock *block, int block_num, bool preload) { return true; }
JitOptions jo;
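The new CompileTargetBlock hook lets a native backend emit machine code after the shared IR build; a minimal sketch of that shape, with hypothetical names (BaseJit, NativeJit, CompileTarget are illustrative only, not the real classes):

struct Block {};

class BaseJit {
public:
	virtual ~BaseJit() {}
	bool Compile(Block *b) {
		// ... build IR instructions for the block here ...
		if (!CompileTarget(b))
			return false;  // backend out of space; caller can clear the cache and retry
		// ... finalize, write the emuhack opcode ...
		return true;
	}
protected:
	// IR-only backends keep the default no-op.
	virtual bool CompileTarget(Block *) { return true; }
};

class NativeJit : public BaseJit {
protected:
	bool CompileTarget(Block *b) override {
		// Emit machine code for b and record its target offset.
		return true;
	}
};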


@ -112,7 +112,7 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
static constexpr RiscVReg regs_to_save[]{ R_RA, X8, X9, X18, X19, X20, X21, X22, X23, X24, X25, X26, X27 };
// TODO: Maybe we shouldn't regalloc all of these? Is it worth it?
static constexpr RiscVReg regs_to_save_fp[]{ F8, F9, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27 };
int saveSize = 8 * (int)(ARRAY_SIZE(regs_to_save) + ARRAY_SIZE(regs_to_save_fp));
int saveSize = (XLEN / 8) * (int)(ARRAY_SIZE(regs_to_save) + ARRAY_SIZE(regs_to_save_fp));
if (saveSize & 0xF)
saveSize += 8;
_assert_msg_((saveSize & 0xF) == 0, "Stack must be kept aligned");
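Worked out for RV64 (XLEN == 64), the frame math above comes to 13 integer plus 12 FP registers at 8 bytes each, 200 bytes, padded by 8 to stay 16-byte aligned. A small compile-time check of that arithmetic (constants restated here only for the sketch):

constexpr int XLEN = 64;        // RV64 assumed for this example
constexpr int kIntRegsSaved = 13;
constexpr int kFpRegsSaved = 12;

constexpr int SaveSize() {
	int size = (XLEN / 8) * (kIntRegsSaved + kFpRegsSaved);  // 8 * 25 = 200
	if (size & 0xF)
		size += 8;                                           // 200 -> 208
	return size;
}

static_assert(SaveSize() % 16 == 0, "stack frame stays 16-byte aligned");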
@ -120,18 +120,18 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
ADDI(R_SP, R_SP, -saveSize);
for (RiscVReg r : regs_to_save) {
SD(r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
for (RiscVReg r : regs_to_save_fp) {
FS(64, r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
_assert_(saveOffset <= saveSize);
// Fixed registers, these are always kept when in Jit context.
LI(MEMBASEREG, Memory::base, SCRATCH1);
LI(CTXREG, mips_, SCRATCH1);
LI(JITBASEREG, blockStartAddrs_, SCRATCH1);
LI(JITBASEREG, GetBasePtr(), SCRATCH1);
LoadStaticRegisters();
MovFromPC(SCRATCH1);
@ -183,35 +183,11 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
// In other words, we're comparing against the top 8 bits of MIPS_EMUHACK_OPCODE by subtracting.
ADDI(SCRATCH2, SCRATCH2, -(MIPS_EMUHACK_OPCODE >> 24));
FixupBranch needsCompile = BNE(SCRATCH2, R_ZERO);
// Use a wall to mask by 0x00FFFFFF and extract the block number.
// Use a wall to mask by 0x00FFFFFF and extract the block jit offset.
SLLI(SCRATCH1, SCRATCH1, XLEN - 24);
// But actually, we want * 8, so skip shifting back just a bit.
_assert_msg_(sizeof(blockStartAddrs_[0]) == 8, "RiscVAsm currently assumes pointers are 64-bit");
SRLI(SCRATCH1, SCRATCH1, XLEN - 24 - 3);
if (enableDebug) {
// Let's do some extra validation of the block number in debug mode for testing.
LI(SCRATCH2, MAX_ALLOWED_JIT_BLOCKS * 8);
FixupBranch highBlockNum = BGEU(SCRATCH1, SCRATCH2);
ADD(SCRATCH1, JITBASEREG, SCRATCH1);
// TODO: Consider replacing the block nums after all, just trying to use IR block cache.
LD(SCRATCH1, SCRATCH1, 0);
LI(SCRATCH2, 2);
FixupBranch invalidBlockNum = BEQ(SCRATCH1, R_ZERO);
JR(SCRATCH1);
SetJumpTarget(highBlockNum);
LI(SCRATCH2, 1);
SetJumpTarget(invalidBlockNum);
MV(X10, SCRATCH2);
QuickCallFunction(&ShowBlockError);
} else {
ADD(SCRATCH1, JITBASEREG, SCRATCH1);
// TODO: Consider replacing the block nums after all, just trying to use IR block cache.
LD(SCRATCH1, SCRATCH1, 0);
JR(SCRATCH1);
}
SRLI(SCRATCH1, SCRATCH1, XLEN - 24);
ADD(SCRATCH1, JITBASEREG, SCRATCH1);
JR(SCRATCH1);
SetJumpTarget(needsCompile);
// No block found, let's jit. We don't need to save static regs, they're all callee saved.
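The shift pair above is the usual "wall" trick for masking without loading a constant: shifting left by XLEN - 24 discards the top 8 opcode bits, and shifting back right leaves only the 24-bit jit offset. In plain C++ (XLEN assumed to be 64 here):

#include <cstdint>

constexpr int kXlen = 64;

constexpr uint64_t ExtractLow24(uint64_t op) {
	uint64_t x = op << (kXlen - 24);  // SLLI SCRATCH1, SCRATCH1, XLEN - 24
	return x >> (kXlen - 24);         // SRLI SCRATCH1, SCRATCH1, XLEN - 24
}

static_assert(ExtractLow24(0x68ABCDEF) == 0xABCDEF, "only the low 24 bits survive");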
@ -238,17 +214,16 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
saveOffset = 0;
for (RiscVReg r : regs_to_save) {
LD(r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
for (RiscVReg r : regs_to_save_fp) {
FL(64, r, R_SP, saveOffset);
saveOffset += 8;
saveOffset += XLEN / 8;
}
ADDI(R_SP, R_SP, saveSize);
RET();
// TODO
crashHandler_ = GetCodePtr();
LI(SCRATCH1, &coreState, SCRATCH2);
LI(SCRATCH2, CORE_RUNTIME_ERROR);


@ -254,7 +254,13 @@ void RiscVJit::CompIR_Bits(IRInst inst) {
break;
case IROp::Clz:
CompIR_Generic(inst);
if (cpu_info.RiscV_Zbb) {
gpr.MapDirtyIn(inst.dest, inst.src1, MapType::AVOID_LOAD_MARK_NORM32);
// This even sets to 32 when zero, perfect.
CLZW(gpr.R(inst.dest), gpr.R(inst.src1));
} else {
CompIR_Generic(inst);
}
break;
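CLZW's behavior on zero is what makes the direct mapping work: MIPS CLZ of 0 is 32, and Zbb's CLZW likewise returns 32 for a zero input. A portable scalar model of what the single instruction computes:

#include <cstdint>

// Leading zeros of a 32-bit value, 32 for zero, matching MIPS CLZ and Zbb CLZW.
static int Clz32(uint32_t v) {
	int n = 0;
	while (n < 32 && (v & 0x80000000u) == 0) {
		v <<= 1;
		++n;
	}
	return n;
}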
default:
@ -640,10 +646,53 @@ void RiscVJit::CompIR_Mult(IRInst inst) {
void RiscVJit::CompIR_Div(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg numReg, denomReg;
switch (inst.op) {
case IROp::Div:
gpr.MapDirtyDirtyInIn(IRREG_LO, IRREG_HI, inst.src1, inst.src2, MapType::AVOID_LOAD_MARK_NORM32);
// We have to do this because of the divide by zero and overflow checks below.
NormalizeSrc12(inst, &numReg, &denomReg, SCRATCH1, SCRATCH2, true);
DIVW(gpr.R(IRREG_LO), numReg, denomReg);
REMW(gpr.R(IRREG_HI), numReg, denomReg);
// Now some tweaks for divide by zero and overflow.
{
// Start with divide by zero, remainder is fine.
FixupBranch skipNonZero = BNE(denomReg, R_ZERO);
FixupBranch keepNegOne = BGE(numReg, R_ZERO);
LI(gpr.R(IRREG_LO), 1);
SetJumpTarget(keepNegOne);
SetJumpTarget(skipNonZero);
// For overflow, RISC-V sets LO right, but remainder to zero.
// Cheating a bit by using R_RA as a temp...
LI(R_RA, (int32_t)0x80000000);
FixupBranch notMostNegative = BNE(numReg, R_RA);
LI(R_RA, -1);
FixupBranch notNegativeOne = BNE(denomReg, R_RA);
LI(gpr.R(IRREG_HI), -1);
SetJumpTarget(notNegativeOne);
SetJumpTarget(notMostNegative);
}
break;
case IROp::DivU:
CompIR_Generic(inst);
gpr.MapDirtyDirtyInIn(IRREG_LO, IRREG_HI, inst.src1, inst.src2, MapType::AVOID_LOAD_MARK_NORM32);
// We have to do this because of the divide by zero check below.
NormalizeSrc12(inst, &numReg, &denomReg, SCRATCH1, SCRATCH2, true);
DIVUW(gpr.R(IRREG_LO), numReg, denomReg);
REMUW(gpr.R(IRREG_HI), numReg, denomReg);
// On divide by zero, everything is correct already except the 0xFFFF case.
{
FixupBranch skipNonZero = BNE(denomReg, R_ZERO);
// Luckily, we don't need SCRATCH2/denomReg anymore.
LI(SCRATCH2, 0xFFFF);
FixupBranch keepNegOne = BLTU(SCRATCH2, numReg);
MV(gpr.R(IRREG_LO), SCRATCH2);
SetJumpTarget(keepNegOne);
SetJumpTarget(skipNonZero);
}
break;
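Read off the fixups above, the intended results (as the emitted code implements them; an illustration, not an authoritative PSP reference) can be modeled in plain C++ like this:

#include <cstdint>

static void ModelDiv(int32_t num, int32_t denom, int32_t &lo, int32_t &hi) {
	if (denom == 0) {
		lo = num < 0 ? 1 : -1;       // DIVW already left -1, patched to 1 for a negative numerator
		hi = num;                    // REMW already gives the numerator, which is kept
	} else if (num == INT32_MIN && denom == -1) {
		lo = INT32_MIN;              // DIVW result is already right for the overflow case
		hi = -1;                     // REMW gives 0 here, patched to -1
	} else {
		lo = num / denom;
		hi = num % denom;
	}
}

static void ModelDivU(uint32_t num, uint32_t denom, uint32_t &lo, uint32_t &hi) {
	if (denom == 0) {
		lo = num <= 0xFFFF ? 0xFFFF : 0xFFFFFFFFu;  // DIVUW leaves all ones; small numerators get 0xFFFF
		hi = num;
	} else {
		lo = num / denom;
		hi = num % denom;
	}
}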
default:


@ -110,18 +110,68 @@ void RiscVJit::CompIR_FArith(IRInst inst) {
void RiscVJit::CompIR_FCondAssign(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::FMin:
case IROp::FMax:
// TODO: These are tricky, have to handle order correctly.
CompIR_Generic(inst);
break;
default:
if (inst.op != IROp::FMin && inst.op != IROp::FMax)
INVALIDOP;
break;
bool maxCondition = inst.op == IROp::FMax;
// FMin and FMax are used by VFPU and handle NAN/INF as just a larger exponent.
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
// If either side is a NAN, it needs to participate in the comparison.
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// NAN is either 0x100 or 0x200.
ANDI(SCRATCH1, SCRATCH1, 0x300);
FixupBranch useNormalCond = BEQ(SCRATCH1, R_ZERO);
// Time to use bits... classify won't help because it ignores -NAN.
FMV(FMv::X, FMv::W, SCRATCH1, fpr.R(inst.src1));
FMV(FMv::X, FMv::W, SCRATCH2, fpr.R(inst.src2));
// If both are negative, we flip the comparison (not two's complement.)
// We cheat and use RA...
AND(R_RA, SCRATCH1, SCRATCH2);
SRLIW(R_RA, R_RA, 31);
if (cpu_info.RiscV_Zbb) {
FixupBranch swapCompare = BNE(R_RA, R_ZERO);
if (maxCondition)
MAX(SCRATCH1, SCRATCH1, SCRATCH2);
else
MIN(SCRATCH1, SCRATCH1, SCRATCH2);
FixupBranch skipSwapCompare = J();
SetJumpTarget(swapCompare);
if (maxCondition)
MIN(SCRATCH1, SCRATCH1, SCRATCH2);
else
MAX(SCRATCH1, SCRATCH1, SCRATCH2);
SetJumpTarget(skipSwapCompare);
} else {
RiscVReg isSrc1LowerReg = gpr.GetAndLockTempR();
gpr.ReleaseSpillLocksAndDiscardTemps();
SLT(isSrc1LowerReg, SCRATCH1, SCRATCH2);
// Flip the flag (to reverse the min/max) based on if both were negative.
XOR(isSrc1LowerReg, isSrc1LowerReg, R_RA);
FixupBranch useSrc1;
if (maxCondition)
useSrc1 = BEQ(isSrc1LowerReg, R_ZERO);
else
useSrc1 = BNE(isSrc1LowerReg, R_ZERO);
MV(SCRATCH1, SCRATCH2);
SetJumpTarget(useSrc1);
}
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
FixupBranch finish = J();
SetJumpTarget(useNormalCond);
if (maxCondition)
FMAX(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
else
FMIN(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
SetJumpTarget(finish);
}
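The NaN path above compares raw encodings because IEEE floats are sign-magnitude: for two negative values, the larger bit pattern is the smaller float, hence the flip when both sign bits are set. A scalar sketch that only mirrors the emitted selection logic (VfpuMinMax is a hypothetical name):

#include <cstdint>
#include <cstring>

static float VfpuMinMax(float a, float b, bool wantMax) {
	uint32_t ua, ub;
	std::memcpy(&ua, &a, sizeof(ua));
	std::memcpy(&ub, &b, sizeof(ub));
	bool bothNegative = ((ua & ub) >> 31) != 0;
	bool aLower = (int32_t)ua < (int32_t)ub;   // SLT on the moved bit patterns
	if (bothNegative)
		aLower = !aLower;                      // the XOR with the shared sign bit
	return (wantMax ? !aLower : aLower) ? a : b;
}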
void RiscVJit::CompIR_FAssign(IRInst inst) {
@ -220,12 +270,209 @@ void RiscVJit::CompIR_FSat(IRInst inst) {
void RiscVJit::CompIR_FCompare(IRInst inst) {
CONDITIONAL_DISABLE;
constexpr IRRegIndex IRREG_VFPUL_CC = IRREG_VFPU_CTRL_BASE + VFPU_CTRL_CC;
switch (inst.op) {
case IROp::FCmp:
switch (inst.dest) {
case IRFpCompareMode::False:
gpr.SetImm(IRREG_FPCOND, 0);
break;
case IRFpCompareMode::EitherUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// NAN is 0x100 or 0x200.
ANDI(SCRATCH1, SCRATCH1, 0x300);
SNEZ(gpr.R(IRREG_FPCOND), SCRATCH1);
break;
case IRFpCompareMode::EqualOrdered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FEQ(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IRFpCompareMode::EqualUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FEQ(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
// Now let's just OR in the unordered check.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// NAN is 0x100 or 0x200.
ANDI(SCRATCH1, SCRATCH1, 0x300);
SNEZ(SCRATCH1, SCRATCH1);
OR(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND), SCRATCH1);
break;
case IRFpCompareMode::LessEqualOrdered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLE(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IRFpCompareMode::LessEqualUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLT(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src2), fpr.R(inst.src1));
SEQZ(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND));
break;
case IRFpCompareMode::LessOrdered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLT(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IRFpCompareMode::LessUnordered:
fpr.MapInIn(inst.src1, inst.src2);
gpr.MapReg(IRREG_FPCOND, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
FLE(32, gpr.R(IRREG_FPCOND), fpr.R(inst.src2), fpr.R(inst.src1));
SEQZ(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND));
break;
}
break;
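The FCLASS results above make the unordered test cheap: RISC-V FCLASS reports a signaling NaN in bit 8 (0x100) and a quiet NaN in bit 9 (0x200), so OR-ing both class masks and testing 0x300 asks "is either operand a NaN?". The scalar meaning, for reference:

#include <cmath>

static bool EitherUnordered(float a, float b) {
	return std::isnan(a) || std::isnan(b);
}

static bool EqualUnordered(float a, float b) {
	return a == b || EitherUnordered(a, b);   // FEQ result OR-ed with the NaN check
}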
case IROp::FCmovVfpuCC:
gpr.MapReg(IRREG_VFPUL_CC);
fpr.MapDirtyIn(inst.dest, inst.src1, false);
if ((inst.src2 & 0xF) == 0) {
ANDI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), 1);
} else if (cpu_info.RiscV_Zbs) {
BEXTI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), inst.src2 & 0xF);
} else {
SRLI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), inst.src2 & 0xF);
ANDI(SCRATCH1, SCRATCH1, 1);
}
if ((inst.src2 >> 7) & 1) {
FixupBranch skip = BEQ(SCRATCH1, R_ZERO);
FMV(32, fpr.R(inst.dest), fpr.R(inst.src1));
SetJumpTarget(skip);
} else {
FixupBranch skip = BNE(SCRATCH1, R_ZERO);
FMV(32, fpr.R(inst.dest), fpr.R(inst.src1));
SetJumpTarget(skip);
}
break;
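FCmovVfpuCC above reads one bit of the VFPU condition register and conditionally copies the source, with bit 7 of src2 choosing whether a set or a clear bit triggers the move. Scalar model (hypothetical helper name):

#include <cstdint>

static float CmovVfpuCC(uint32_t vfpuCC, uint8_t src2, float dest, float src1) {
	bool bit = (vfpuCC >> (src2 & 0xF)) & 1;   // BEXTI, or SRLI+ANDI, above
	bool moveIfSet = (src2 >> 7) & 1;
	if (bit == moveIfSet)
		dest = src1;
	return dest;
}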
case IROp::FCmpVfpuBit:
gpr.MapReg(IRREG_VFPUL_CC, MIPSMap::DIRTY);
switch (VCondition(inst.dest & 0xF)) {
case VC_EQ:
fpr.MapInIn(inst.src1, inst.src2);
FEQ(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
break;
case VC_NE:
fpr.MapInIn(inst.src1, inst.src2);
// We could almost negate FEQ, except NAN != NAN.
// Anything != NAN is false and NAN != NAN is within that, so we only check one side.
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
// NAN is 0x100 or 0x200.
ANDI(SCRATCH2, SCRATCH2, 0x300);
SNEZ(SCRATCH2, SCRATCH2);
FEQ(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
SEQZ(SCRATCH1, SCRATCH1);
// Just OR in whether that side was a NAN so it's always not equal.
OR(SCRATCH1, SCRATCH1, SCRATCH2);
break;
case VC_LT:
fpr.MapInIn(inst.src1, inst.src2);
FLT(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
break;
case VC_LE:
fpr.MapInIn(inst.src1, inst.src2);
FLE(32, SCRATCH1, fpr.R(inst.src1), fpr.R(inst.src2));
break;
case VC_GT:
fpr.MapInIn(inst.src1, inst.src2);
FLT(32, SCRATCH1, fpr.R(inst.src2), fpr.R(inst.src1));
break;
case VC_GE:
fpr.MapInIn(inst.src1, inst.src2);
FLE(32, SCRATCH1, fpr.R(inst.src2), fpr.R(inst.src1));
break;
case VC_EZ:
case VC_NZ:
fpr.MapReg(inst.src1);
// Zero is either 0x10 or 0x08.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x18);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_EN:
case VC_NN:
fpr.MapReg(inst.src1);
// NAN is either 0x100 or 0x200.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x300);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_EI:
case VC_NI:
fpr.MapReg(inst.src1);
// Infinity is either 0x80 or 0x01.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x81);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_ES:
case VC_NS:
fpr.MapReg(inst.src1);
// Infinity is either 0x80 or 0x01, NAN is either 0x100 or 0x200.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x381);
if ((inst.dest & 4) == 0)
SNEZ(SCRATCH1, SCRATCH1);
else
SEQZ(SCRATCH1, SCRATCH1);
break;
case VC_TR:
LI(SCRATCH1, 1);
break;
case VC_FL:
LI(SCRATCH1, 0);
break;
}
ANDI(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), ~(1 << (inst.dest >> 4)));
if ((inst.dest >> 4) != 0)
SLLI(SCRATCH1, SCRATCH1, inst.dest >> 4);
OR(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), SCRATCH1);
break;
case IROp::FCmpVfpuAggregate:
CompIR_Generic(inst);
gpr.MapReg(IRREG_VFPUL_CC, MIPSMap::DIRTY);
ANDI(SCRATCH1, gpr.R(IRREG_VFPUL_CC), inst.dest);
// This is the "any bit", easy.
SNEZ(SCRATCH2, SCRATCH1);
// To compare to inst.dest for "all", let's simply subtract it and compare to zero.
ADDI(SCRATCH1, SCRATCH1, -inst.dest);
SEQZ(SCRATCH1, SCRATCH1);
// Now we combine those together.
SLLI(SCRATCH1, SCRATCH1, 5);
SLLI(SCRATCH2, SCRATCH2, 4);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// Reject those any/all bits and replace them with our own.
ANDI(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), ~0x30);
OR(gpr.R(IRREG_VFPUL_CC), gpr.R(IRREG_VFPUL_CC), SCRATCH1);
break;
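FCmpVfpuAggregate recomputes the "any" (bit 4) and "all" (bit 5) summary bits of VFPU_CC from the condition bits selected by inst.dest; the subtract-then-compare-to-zero is just an equality test against the mask. A scalar equivalent:

#include <cstdint>

static uint32_t UpdateVfpuCCAggregate(uint32_t cc, uint32_t mask) {
	uint32_t bits = cc & mask;
	uint32_t any = bits != 0 ? 1u : 0u;
	uint32_t all = bits == mask ? 1u : 0u;  // (bits - mask) == 0, as in the ADDI/SEQZ pair
	cc &= ~0x30u;                           // drop the old any/all bits
	return cc | (any << 4) | (all << 5);
}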
default:
@ -259,13 +506,70 @@ void RiscVJit::CompIR_RoundingMode(IRInst inst) {
void RiscVJit::CompIR_FSpecial(IRInst inst) {
CONDITIONAL_DISABLE;
#ifdef __riscv_float_abi_soft
#error Currently hard float is required.
#endif
auto callFuncF_F = [&](float (*func)(float)){
gpr.FlushBeforeCall();
fpr.FlushBeforeCall();
// It might be in a non-volatile register.
if (fpr.IsMapped(inst.src1)) {
FMV(32, F10, fpr.R(inst.src1));
} else {
int offset = offsetof(MIPSState, f) + inst.src1 * 4;
FL(32, F10, CTXREG, offset);
}
QuickCallFunction(func);
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
// If it's already F10, we're done - MapReg doesn't actually overwrite the reg in that case.
if (fpr.R(inst.dest) != F10) {
FMV(32, fpr.R(inst.dest), F10);
}
};
switch (inst.op) {
case IROp::FSin:
callFuncF_F(&vfpu_sin);
break;
case IROp::FCos:
callFuncF_F(&vfpu_cos);
break;
case IROp::FRSqrt:
fpr.MapDirtyIn(inst.dest, inst.src1);
FSQRT(32, fpr.R(inst.dest), fpr.R(inst.src1));
// Ugh, we can't really avoid a temp here. Probably not worth a permanent one.
LI(SCRATCH1, 1.0f);
{
// TODO: Smarter allocation of a temp reg?
RiscVReg tempReg = fpr.R(inst.dest) == F31 ? F30 : F31;
fpr.FlushRiscVReg(tempReg);
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
FDIV(32, fpr.R(inst.dest), tempReg, fpr.R(inst.dest));
}
break;
case IROp::FRecip:
fpr.MapDirtyIn(inst.dest, inst.src1);
LI(SCRATCH1, 1.0f);
if (inst.dest != inst.src1) {
// This is the easy case.
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
FDIV(32, fpr.R(inst.dest), fpr.R(inst.dest), fpr.R(inst.src1));
} else {
RiscVReg tempReg = fpr.R(inst.dest) == F31 ? F30 : F31;
fpr.FlushRiscVReg(tempReg);
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
FDIV(32, fpr.R(inst.dest), tempReg, fpr.R(inst.src1));
}
break;
case IROp::FAsin:
CompIR_Generic(inst);
callFuncF_F(&vfpu_asin);
break;
default:


@ -20,7 +20,6 @@
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/RiscV/RiscVJit.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
@ -101,7 +100,7 @@ void RiscVJit::CompIR_Transfer(IRInst inst) {
break;
case IROp::SetCtrlVFPUFReg:
gpr.MapReg(IRREG_VFPU_CTRL_BASE + inst.dest, MIPSMap::NOINIT);
gpr.MapReg(IRREG_VFPU_CTRL_BASE + inst.dest, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), fpr.R(inst.src1));
break;
@ -167,7 +166,7 @@ void RiscVJit::CompIR_Transfer(IRInst inst) {
break;
case IROp::FMovToGPR:
gpr.MapReg(inst.dest, MIPSMap::NOINIT);
gpr.MapReg(inst.dest, MIPSMap::NOINIT | MIPSMap::MARK_NORM32);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(inst.dest), fpr.R(inst.src1));
break;
@ -182,15 +181,6 @@ void RiscVJit::CompIR_System(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::Interpret:
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(MIPSOpcode(inst.constant)));
LoadStaticRegisters();
break;
case IROp::Syscall:
FlushAll();
SaveStaticRegisters();


@ -241,10 +241,25 @@ void RiscVJit::CompIR_VecPack(IRInst inst) {
case IROp::Vec4Pack31To8:
case IROp::Vec4Pack32To8:
case IROp::Vec2Pack31To16:
case IROp::Vec2Pack32To16:
CompIR_Generic(inst);
break;
case IROp::Vec2Pack32To16:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src1 + 1);
FMV(FMv::X, FMv::W, SCRATCH1, fpr.R(inst.src1));
FMV(FMv::X, FMv::W, SCRATCH2, fpr.R(inst.src1 + 1));
// Keep in mind, this was sign-extended, so we have to zero the upper.
SLLI(SCRATCH1, SCRATCH1, XLEN - 32);
// Now we just set (SCRATCH2 & 0xFFFF0000) | SCRATCH1.
SRLI(SCRATCH1, SCRATCH1, XLEN - 16);
// Use a wall to mask. We can ignore the upper 32 here.
SRLI(SCRATCH2, SCRATCH2, 16);
SLLI(SCRATCH2, SCRATCH2, 16);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
// Okay, to the floating point register.
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
break;
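Vec2Pack32To16 keeps only the high 16 bits of each 32-bit lane and packs them into one register, lane 0 in the low half and lane 1 in the high half; the shift sequence above does exactly that on the sign-extended FMV results. Scalar equivalent:

#include <cstdint>

static uint32_t Pack32To16(uint32_t lane0, uint32_t lane1) {
	return (lane0 >> 16) | (lane1 & 0xFFFF0000u);
}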
default:
INVALIDOP;
break;


@ -16,7 +16,9 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Common/StringUtils.h"
#include "Common/TimeUtil.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/RiscV/RiscVJit.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
#include "Common/Profiler/Profiler.h"
@ -26,19 +28,58 @@ namespace MIPSComp {
using namespace RiscVGen;
using namespace RiscVJitConstants;
static constexpr bool enableDebug = false;
static std::map<uint8_t, int> debugSeenNotCompiledIR;
static std::map<const char *, int> debugSeenNotCompiled;
double lastDebugLog = 0.0;
static void LogDebugNotCompiled() {
if (!enableDebug)
return;
double now = time_now_d();
if (now < lastDebugLog + 1.0)
return;
lastDebugLog = now;
int worstIROp = -1;
int worstIRVal = 0;
for (auto it : debugSeenNotCompiledIR) {
if (it.second > worstIRVal) {
worstIRVal = it.second;
worstIROp = it.first;
}
}
debugSeenNotCompiledIR.clear();
const char *worstName = nullptr;
int worstVal = 0;
for (auto it : debugSeenNotCompiled) {
if (it.second > worstVal) {
worstVal = it.second;
worstName = it.first;
}
}
debugSeenNotCompiled.clear();
if (worstIROp != -1)
WARN_LOG(JIT, "Most not compiled IR op: %s (%d)", GetIRMeta((IROp)worstIROp)->name, worstIRVal);
if (worstName != nullptr)
WARN_LOG(JIT, "Most not compiled op: %s (%d)", worstName, worstVal);
}
RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo), fpr(mipsState, &jo) {
// Automatically disable incompatible options.
if (((intptr_t)Memory::base & 0x00000000FFFFFFFFUL) != 0) {
jo.enablePointerify = false;
}
// Since we store the offset, this is as big as it can be.
// We could shift off one bit to double it, would need to change RiscVAsm.
AllocCodeSpace(1024 * 1024 * 16);
SetAutoCompress(true);
// TODO: Consider replacing block num method from IRJit - this is 2MB.
blockStartAddrs_ = new const u8 *[MAX_ALLOWED_JIT_BLOCKS];
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
gpr.Init(this);
fpr.Init(this);
@ -46,42 +87,35 @@ RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo)
}
RiscVJit::~RiscVJit() {
delete [] blockStartAddrs_;
}
void RiscVJit::RunLoopUntil(u64 globalticks) {
if constexpr (enableDebug) {
LogDebugNotCompiled();
}
PROFILE_THIS_SCOPE("jit");
((void (*)())enterDispatcher_)();
}
bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32 &mipsBytes, bool preload) {
// Check that we're not full (we allow fewer blocks than IR itself.)
if (blocks_.GetNumBlocks() >= MAX_ALLOWED_JIT_BLOCKS - 1)
static void NoBlockExits() {
_assert_msg_(false, "Never exited block, invalid IR?");
}
bool RiscVJit::CompileTargetBlock(IRBlock *block, int block_num, bool preload) {
if (GetSpaceLeft() < 0x800)
return false;
if (!IRJit::CompileBlock(em_address, instructions, mipsBytes, preload))
return false;
// Don't worry, the codespace isn't large enough to overflow offsets.
block->SetTargetOffset((int)GetOffset(GetCodePointer()));
// TODO: Block linking, checked entries and such.
int block_num;
if (preload) {
block_num = blocks_.GetBlockNumberFromStartAddress(em_address);
} else {
u32 first_inst = Memory::ReadUnchecked_U32(em_address);
_assert_msg_(MIPS_IS_RUNBLOCK(first_inst), "Should've written an emuhack");
block_num = first_inst & MIPS_EMUHACK_VALUE_MASK;
}
_assert_msg_(block_num >= 0 && block_num < MAX_ALLOWED_JIT_BLOCKS, "Bad block num");
_assert_msg_(blockStartAddrs_[block_num] == nullptr, "Block %d reused before clear", block_num);
blockStartAddrs_[block_num] = GetCodePointer();
gpr.Start();
fpr.Start();
for (const IRInst &inst : instructions) {
for (int i = 0; i < block->GetNumInstructions(); ++i) {
const IRInst &inst = block->GetInstructions()[i];
CompileIRInst(inst);
if (jo.Disabled(JitDisable::REGALLOC_GPR)) {
@ -97,9 +131,11 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
}
}
// Note: a properly constructed block should never get here.
// TODO: Need to do more than just this? Call a func to set an exception?
QuickJ(R_RA, crashHandler_);
// We should've written an exit above. If we didn't, bad things will happen.
if (enableDebug) {
QuickCallFunction(&NoBlockExits);
QuickJ(R_RA, crashHandler_);
}
FlushIcache();
@ -351,6 +387,9 @@ void RiscVJit::CompileIRInst(IRInst inst) {
break;
case IROp::Interpret:
CompIR_Interpret(inst);
break;
case IROp::Syscall:
case IROp::CallReplacement:
case IROp::Break:
@ -397,6 +436,9 @@ static u32 DoIRInst(uint64_t value) {
IRInst inst;
memcpy(&inst, &value, sizeof(inst));
if constexpr (enableDebug)
debugSeenNotCompiledIR[(uint8_t)inst.op]++;
return IRInterpret(currentMIPS, &inst, 1);
}
@ -425,6 +467,26 @@ void RiscVJit::CompIR_Generic(IRInst inst) {
}
}
static void DebugInterpretHit(const char *name) {
if (enableDebug)
debugSeenNotCompiled[name]++;
}
void RiscVJit::CompIR_Interpret(IRInst inst) {
MIPSOpcode op(inst.constant);
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
if (enableDebug) {
LI(X10, MIPSGetName(op));
QuickCallFunction(&DebugInterpretHit);
}
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(op));
LoadStaticRegisters();
}
void RiscVJit::FlushAll() {
gpr.FlushAll();
fpr.FlushAll();
@ -449,17 +511,14 @@ bool RiscVJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
} else if (!IsInSpace(ptr)) {
return false;
} else {
uintptr_t uptr = (uintptr_t)ptr;
int offset = (int)GetOffset(ptr);
int block_num = -1;
for (int i = 0; i < MAX_ALLOWED_JIT_BLOCKS; ++i) {
uintptr_t blockptr = (uintptr_t)blockStartAddrs_[i];
// Out of allocated blocks.
if (uptr == 0)
break;
if (uptr >= blockptr)
for (int i = 0; i < blocks_.GetNumBlocks(); ++i) {
const auto &b = blocks_.GetBlock(i);
// We allocate linearly.
if (b->GetTargetOffset() <= offset)
block_num = i;
if (uptr < blockptr)
if (b->GetTargetOffset() > offset)
break;
}
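Since blocks are emitted back to back into the code space, the block containing a pointer is the last one whose target offset does not exceed the pointer's offset; the loop above relies on that. A standalone sketch over a plain vector of offsets (hypothetical names):

#include <vector>

static int FindBlockForOffset(const std::vector<int> &targetOffsets, int offset) {
	int blockNum = -1;
	for (int i = 0; i < (int)targetOffsets.size(); ++i) {
		if (targetOffsets[i] <= offset)
			blockNum = i;     // keep the latest block that starts at or before offset
		else
			break;            // offsets are increasing, nothing later can match
	}
	return blockNum;
}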
@ -501,8 +560,6 @@ void RiscVJit::ClearCache() {
ClearCodeSpace(jitStartOffset_);
FlushIcacheSection(region + jitStartOffset_, region + region_size - jitStartOffset_);
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
}
void RiscVJit::RestoreRoundingMode(bool force) {


@ -46,7 +46,7 @@ public:
// TODO: GetBlockCacheDebugInterface, block linking?
protected:
bool CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u32 &mipsBytes, bool preload) override;
bool CompileTargetBlock(IRBlock *block, int block_num, bool preload) override;
void CompileIRInst(IRInst inst);
@ -87,6 +87,7 @@ private:
void CompIR_FStore(IRInst inst);
void CompIR_Generic(IRInst inst);
void CompIR_HiLo(IRInst inst);
void CompIR_Interpret(IRInst inst);
void CompIR_Load(IRInst inst);
void CompIR_LoadShift(IRInst inst);
void CompIR_Logic(IRInst inst);
@ -116,8 +117,6 @@ private:
RiscVRegCache gpr;
RiscVRegCacheFPU fpr;
static constexpr int MAX_ALLOWED_JIT_BLOCKS = 262144;
const u8 *enterDispatcher_ = nullptr;
const u8 *outerLoop_ = nullptr;
@ -135,7 +134,6 @@ private:
const u8 *crashHandler_ = nullptr;
int jitStartOffset_ = 0;
const u8 **blockStartAddrs_ = nullptr;
};
} // namespace MIPSComp


@ -140,6 +140,7 @@ public:
void FlushBeforeCall();
void FlushAll();
void FlushR(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void DiscardR(IRRegIndex r);
RiscVGen::RiscVReg GetAndLockTempR();
@ -163,7 +164,6 @@ private:
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void SetRegImm(RiscVGen::RiscVReg reg, u64 imm);
void AddMemBase(RiscVGen::RiscVReg reg);
int GetMipsRegOffset(IRRegIndex r);


@ -27,9 +27,6 @@
using namespace RiscVGen;
using namespace RiscVJitConstants;
using namespace RiscVGen;
using namespace RiscVJitConstants;
RiscVRegCacheFPU::RiscVRegCacheFPU(MIPSState *mipsState, MIPSComp::JitOptions *jo)
: mips_(mipsState), jo_(jo) {}
@ -279,6 +276,24 @@ RiscVReg RiscVRegCacheFPU::RiscVRegForFlush(IRRegIndex r) {
}
}
void RiscVRegCacheFPU::FlushBeforeCall() {
// Note: don't set this false at the end, since we don't flush everything.
if (!pendingFlush_) {
return;
}
// These registers are not preserved by function calls.
for (int i = 0; i <= 7; ++i) {
FlushRiscVReg(RiscVReg(F0 + i));
}
for (int i = 10; i <= 17; ++i) {
FlushRiscVReg(RiscVReg(F0 + i));
}
for (int i = 28; i <= 31; ++i) {
FlushRiscVReg(RiscVReg(F0 + i));
}
}
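The three ranges flushed above correspond to the caller-saved FP registers of the standard RISC-V calling convention: ft0-ft7 (f0-f7), fa0-fa7 (f10-f17), and ft8-ft11 (f28-f31); fs0-fs11 (f8-f9, f18-f27) are callee-saved and may stay allocated across a call. A small predicate capturing that split (illustrative only):

static bool IsCallerSavedFpr(int f) {
	return (f >= 0 && f <= 7) || (f >= 10 && f <= 17) || (f >= 28 && f <= 31);
}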
void RiscVRegCacheFPU::FlushAll() {
if (!pendingFlush_) {
// Nothing allocated. FPU regs are not nearly as common as GPR.


@ -64,11 +64,12 @@ public:
void MapInIn(IRRegIndex rd, IRRegIndex rs);
void MapDirtyIn(IRRegIndex rd, IRRegIndex rs, bool avoidLoad = true);
void MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, bool avoidLoad = true);
void Map4Dirty(IRRegIndex rdbase, bool avoidLoad = true);
void Map4DirtyIn(IRRegIndex rdbase, IRRegIndex rsbase, bool avoidLoad = true);
void Map4DirtyInIn(IRRegIndex rdbase, IRRegIndex rsbase, IRRegIndex rtbase, bool avoidLoad = true);
void FlushBeforeCall();
void FlushAll();
void FlushR(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void DiscardR(IRRegIndex r);
RiscVGen::RiscVReg R(int preg); // Returns a cached register
@ -78,7 +79,6 @@ private:
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
int GetMipsRegOffset(IRRegIndex r);
bool IsValidReg(IRRegIndex r) const;