Merge pull request #17783 from unknownbrackets/riscv-jit

Implement float/vec operations in RISC-V jit
Henrik Rydgård 2023-07-28 08:38:19 +02:00 committed by GitHub
commit 4aa2b1fcac
21 changed files with 1088 additions and 142 deletions

View file

@ -180,7 +180,7 @@ void Arm64RegCache::MapRegTo(ARM64Reg reg, MIPSGPReg mipsReg, int mapFlags) {
ar[reg].isDirty = (mapFlags & MAP_DIRTY) ? true : false;
if ((mapFlags & MAP_NOINIT) != MAP_NOINIT) {
if (mipsReg == MIPS_REG_ZERO) {
// If we get a request to load the zero register, at least we won't spend
// If we get a request to map the zero register, at least we won't spend
// time on a memory access...
emit_->MOVI2R(reg, 0);

View file

@ -319,6 +319,7 @@ void Arm64RegCacheFPU::FlushR(MIPSReg r) {
if (mr[r].reg == INVALID_REG) {
ERROR_LOG(JIT, "FlushR: MipsReg had bad ArmReg");
}
FlushArmReg((ARM64Reg)(S0 + mr[r].reg));
break;
case ML_MEM:
@ -329,8 +330,6 @@ void Arm64RegCacheFPU::FlushR(MIPSReg r) {
//BAD
break;
}
mr[r].loc = ML_MEM;
mr[r].reg = (int)INVALID_REG;
}
Arm64Gen::ARM64Reg Arm64RegCacheFPU::ARM64RegForFlush(int r) {

View file

@ -17,8 +17,6 @@
#pragma once
#pragma once
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/ARM64/Arm64RegCache.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
@ -165,7 +163,6 @@ private:
MIPSComp::JitOptions *jo_;
int numARMFpuReg_;
int qTime_;
enum {
// On ARM64, each of the 32 registers is a full 128 bits. No sharing of components!

View file

@ -455,6 +455,7 @@ void IRFrontend::Comp_Syscall(MIPSOpcode op) {
}
void IRFrontend::Comp_Break(MIPSOpcode op) {
ir.Write(IROp::SetPCConst, 0, ir.AddConstant(GetCompilerPC()));
ir.Write(IROp::Break);
js.compiling = false;
}

View file

@ -203,7 +203,8 @@ void IRFrontend::Comp_mxc1(MIPSOpcode op) {
return;
}
if (fs == 31) {
DISABLE; // TODO: Add a new op
// This needs to insert fpcond.
ir.Write(IROp::FpCtrlToReg, rt);
} else if (fs == 0) {
ir.Write(IROp::SetConst, rt, ir.AddConstant(MIPSState::FCR0_VALUE));
} else {
@ -219,7 +220,10 @@ void IRFrontend::Comp_mxc1(MIPSOpcode op) {
case 6: //ctc1
if (fs == 31) {
// Set rounding mode
DISABLE;
RestoreRoundingMode();
ir.Write(IROp::FpCtrlFromReg, 0, rt);
UpdateRoundingMode();
ApplyRoundingMode();
} else {
Comp_Generic(op);
}

View file

@ -112,6 +112,8 @@ static const IRMeta irMeta[] = {
{ IROp::FMovToGPR, "FMovToGPR", "GF" },
{ IROp::ZeroFpCond, "ZeroFpCond", "" },
{ IROp::FpCondToReg, "FpCondToReg", "G" },
{ IROp::FpCtrlFromReg, "FpCtrlFromReg", "_G" },
{ IROp::FpCtrlToReg, "FpCtrlToReg", "G" },
{ IROp::VfpuCtrlToReg, "VfpuCtrlToReg", "GI" },
{ IROp::SetCtrlVFPU, "SetCtrlVFPU", "TC" },
{ IROp::SetCtrlVFPUReg, "SetCtrlVFPUReg", "TG" },

View file

@ -135,6 +135,8 @@ enum class IROp : u8 {
FSatMinus1_1,
FpCondToReg,
FpCtrlFromReg,
FpCtrlToReg,
VfpuCtrlToReg,
ZeroFpCond,

View file

@ -768,9 +768,11 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2];
break;
case IROp::FMin:
// TODO: This doesn't handle VFPU ordering right.
mips->f[inst->dest] = std::min(mips->f[inst->src1], mips->f[inst->src2]);
break;
case IROp::FMax:
// TODO: This doesn't handle VFPU ordering right.
mips->f[inst->dest] = std::max(mips->f[inst->src1], mips->f[inst->src2]);
break;
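
For context on the TODOs above: std::min and std::max are asymmetric when a NaN is involved, while the VFPU picks its result deterministically. A minimal sketch of the problem (helper name is ours, not PPSSPP's):

    #include <algorithm>
    #include <cmath>

    // std::min(a, b) is b < a ? b : a, so a NaN in the second slot is
    // silently dropped while a NaN in the first slot is returned.
    static void ShowMinAsymmetry() {
        float a = std::min(std::nanf(""), 1.0f);  // NaN: comparison false, first arg wins
        float b = std::min(1.0f, std::nanf(""));  // 1.0f: comparison false, first arg wins
        (void)a; (void)b;
    }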
@ -811,6 +813,17 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst, int count) {
case IROp::FpCondToReg:
mips->r[inst->dest] = mips->fpcond;
break;
case IROp::FpCtrlFromReg:
mips->fcr31 = mips->r[inst->src1] & 0x0181FFFF;
// Extract the new fpcond value.
// TODO: Is it really helping us to keep it separate?
mips->fpcond = (mips->fcr31 >> 23) & 1;
break;
case IROp::FpCtrlToReg:
// Update the fpcond bit first.
mips->fcr31 = (mips->fcr31 & ~(1 << 23)) | ((mips->fpcond & 1) << 23);
mips->r[inst->dest] = mips->fcr31;
break;
case IROp::VfpuCtrlToReg:
mips->r[inst->dest] = mips->vfpuCtrl[inst->src1];
break;
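
For reference, the bit layout these two ops rely on, restated as a scalar sketch (names ours): the fpcond flag lives at bit 23 of fcr31, and 0x0181FFFF masks the writable fields on a write.

    #include <cstdint>

    // FpCtrlToReg: refresh bit 23 from the cached fpcond before reading.
    static uint32_t ReadFcr31(uint32_t fcr31, uint32_t fpcond) {
        return (fcr31 & ~(1u << 23)) | ((fpcond & 1) << 23);
    }

    // FpCtrlFromReg: keep only writable fields, then pull out fpcond.
    static uint32_t WriteFcr31(uint32_t value, uint32_t *fpcond) {
        uint32_t fcr31 = value & 0x0181FFFF;
        *fpcond = (fcr31 >> 23) & 1;
        return fcr31;
    }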

View file

@ -694,6 +694,13 @@ bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts
out.Write(inst);
}
break;
case IROp::FpCtrlFromReg:
gpr.MapDirtyIn(IRREG_FCR31, inst.src1);
gpr.MapDirty(IRREG_FPCOND);
goto doDefault;
case IROp::FpCtrlToReg:
gpr.MapDirtyInIn(inst.dest, IRREG_FPCOND, IRREG_FCR31);
goto doDefault;
case IROp::Vec4Init:
case IROp::Vec4Mov:

View file

@ -75,9 +75,8 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
loadStaticRegisters_ = nullptr;
}
// TODO: Do we actually need updateRoundingMode_? Hm.
//applyRoundingMode_ = AlignCode16();
if (false) {
applyRoundingMode_ = AlignCode16();
{
// Not sure if RISC-V has any flush-to-zero capability; leaving it off for now...
LWU(SCRATCH2, CTXREG, offsetof(MIPSState, fcr31));
@ -105,30 +104,6 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
RET();
}
//updateRoundingMode_ = AlignCode16();
if (false) {
LWU(SCRATCH2, CTXREG, offsetof(MIPSState, fcr31));
// Set SCRATCH2 to FZ:RM (FZ is bit 24, and RM are lowest 2 bits.)
ANDI(SCRATCH1, SCRATCH2, 1 << 24);
ANDI(SCRATCH2, SCRATCH2, 3);
SRLI(SCRATCH1, SCRATCH1, 22);
OR(SCRATCH2, SCRATCH2, SCRATCH1);
// Let's update js.currentRoundingFunc with the right convertS0ToSCRATCH1 func.
//LI(SCRATCH1, convertS0ToSCRATCH1);
if (cpu_info.RiscV_Zba) {
SH_ADD(3, SCRATCH1, SCRATCH2, SCRATCH1);
} else {
SLLI(SCRATCH2, SCRATCH2, 3);
ADD(SCRATCH1, SCRATCH1, SCRATCH2);
}
LD(SCRATCH2, SCRATCH1, 0);
//LI(SCRATCH1, &js.currentRoundingFunc);
SW(SCRATCH2, SCRATCH1, 0);
RET();
}
enterDispatcher_ = AlignCode16();
// Start by saving some regs on the stack. There are 12 GPs and 12 FPs we want.
@ -280,15 +255,6 @@ void RiscVJit::GenerateFixedCode(const JitOptions &jo) {
SW(SCRATCH2, SCRATCH1, 0);
J(quitLoop);
// TODO: Do we need this?
static const Round roundModes[8] = { Round::NEAREST_EVEN, Round::TOZERO, Round::UP, Round::DOWN, Round::NEAREST_EVEN, Round::TOZERO, Round::UP, Round::DOWN };
for (size_t i = 0; i < ARRAY_SIZE(roundModes); ++i) {
//convertS0ToSCRATCH1[i] = AlignCode16();
//FCVT(FConv::W, FConv::S, SCRATCH1, F0, roundModes[i]);
//RET();
}
// Leave this at the end, add more stuff above.
if (enableDisasm) {
std::vector<std::string> lines = DisassembleRV64(start, GetCodePtr() - start);
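
The removed updateRoundingMode_ stub above packs FZ:RM into a table index. Sketched in C++ for clarity (helper name is ours): MIPS fcr31 keeps the rounding mode in bits 0-1 (0 nearest, 1 toward zero, 2 up, 3 down) and flush-to-zero at bit 24, matching the eight-entry roundModes table above.

    #include <cstdint>

    static int RoundingFuncIndex(uint32_t fcr31) {
        uint32_t rm = fcr31 & 3;          // bits 0-1: rounding mode
        uint32_t fz = (fcr31 >> 24) & 1;  // bit 24: flush-to-zero
        return (int)((fz << 2) | rm);     // 0-7; FZ variants occupy 4-7
    }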

View file

@ -55,7 +55,7 @@ void RiscVJit::CompIR_Exit(IRInst inst) {
case IROp::ExitToPC:
FlushAll();
QuickJ(R_RA, dispatcher_);
QuickJ(R_RA, dispatcherCheckCoreState_);
break;
default:
@ -134,7 +134,8 @@ void RiscVJit::CompIR_ExitIf(IRInst inst) {
case IROp::ExitToConstIfFpTrue:
case IROp::ExitToConstIfFpFalse:
CompIR_Generic(inst);
// Note: not used.
DISABLE;
break;
default:

View file

@ -39,12 +39,67 @@ void RiscVJit::CompIR_FArith(IRInst inst) {
switch (inst.op) {
case IROp::FAdd:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FADD(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IROp::FSub:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FSUB(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IROp::FMul:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
// TODO: If FMUL consistently produces NAN across chip vendors, we can skip this.
// Luckily this does match the RISC-V canonical NAN.
if (inst.src1 != inst.src2) {
// These will output 0x80/0x01 if infinity, 0x10/0x08 if zero.
// We need to check if one is infinity and the other zero.
// First, try inf * zero.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
FCLASS(32, SCRATCH2, fpr.R(inst.src2));
ANDI(R_RA, SCRATCH1, 0x81);
FixupBranch lhsNotInf = BEQ(R_RA, R_ZERO);
ANDI(R_RA, SCRATCH2, 0x18);
FixupBranch infZero = BNE(R_RA, R_ZERO);
// Okay, what about the other order?
SetJumpTarget(lhsNotInf);
ANDI(R_RA, SCRATCH1, 0x18);
FixupBranch lhsNotZero = BEQ(R_RA, R_ZERO);
ANDI(R_RA, SCRATCH2, 0x81);
FixupBranch zeroInf = BNE(R_RA, R_ZERO);
// Nope, all good.
SetJumpTarget(lhsNotZero);
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
FixupBranch skip = J();
SetJumpTarget(infZero);
SetJumpTarget(zeroInf);
LI(SCRATCH1, 0x7FC00000);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
SetJumpTarget(skip);
} else {
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
}
break;
case IROp::FDiv:
fpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2);
FDIV(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
break;
case IROp::FSqrt:
fpr.MapDirtyIn(inst.dest, inst.src1);
FSQRT(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
case IROp::FNeg:
CompIR_Generic(inst);
fpr.MapDirtyIn(inst.dest, inst.src1);
FNEG(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
default:
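
The FCLASS dance in the FMul case is easier to read in scalar form. FCLASS sets bit 0/7 for -inf/+inf and bit 3/4 for -0/+0, so 0x81 tests "infinity" and 0x18 tests "zero". A sketch of the semantics being enforced (helper name is ours):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    static float MulWithCanonicalNaN(float a, float b) {
        if ((std::isinf(a) && b == 0.0f) || (a == 0.0f && std::isinf(b))) {
            // Force the RISC-V canonical NaN so games see a stable payload.
            uint32_t qnan = 0x7FC00000;
            float f;
            std::memcpy(&f, &qnan, sizeof(f));
            return f;
        }
        return a * b;
    }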
@ -59,6 +114,7 @@ void RiscVJit::CompIR_FCondAssign(IRInst inst) {
switch (inst.op) {
case IROp::FMin:
case IROp::FMax:
// TODO: These are tricky, have to handle order correctly.
CompIR_Generic(inst);
break;
@ -73,11 +129,39 @@ void RiscVJit::CompIR_FAssign(IRInst inst) {
switch (inst.op) {
case IROp::FMov:
case IROp::FAbs:
case IROp::FSign:
CompIR_Generic(inst);
fpr.MapDirtyIn(inst.dest, inst.src1);
FMV(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
case IROp::FAbs:
fpr.MapDirtyIn(inst.dest, inst.src1);
FABS(32, fpr.R(inst.dest), fpr.R(inst.src1));
break;
case IROp::FSign:
{
fpr.MapDirtyIn(inst.dest, inst.src1);
// Check if it's a zero: FCLASS sets 0x10 for +0 and 0x08 for -0.
FCLASS(32, SCRATCH1, fpr.R(inst.src1));
ANDI(SCRATCH1, SCRATCH1, 0x18);
SEQZ(SCRATCH1, SCRATCH1);
// Okay, it's zero if zero, 1 otherwise. Convert 1 to a constant 1.0.
// Probably non-zero is the common case, so we make that the straight line.
FixupBranch skipOne = BEQ(SCRATCH1, R_ZERO);
LI(SCRATCH1, 1.0f);
// Now we just need the sign from it.
FMV(FMv::X, FMv::W, SCRATCH2, fpr.R(inst.src1));
// Use a wall to isolate the sign, and combine.
SRAIW(SCRATCH2, SCRATCH2, 31);
SLLIW(SCRATCH2, SCRATCH2, 31);
OR(SCRATCH1, SCRATCH1, SCRATCH2);
SetJumpTarget(skipOne);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
break;
}
default:
INVALIDOP;
break;
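
A scalar sketch of the FSign lowering above (helper name is ours): ±0 maps to +0, anything else to ±1.0f carrying the source's sign bit, which is what the SRAIW/SLLIW "wall" isolates.

    #include <cstdint>
    #include <cstring>

    static float FSignSketch(float v) {
        uint32_t bits;
        std::memcpy(&bits, &v, sizeof(bits));
        if ((bits & 0x7FFFFFFF) == 0)
            return 0.0f;  // +0 and -0 both yield +0 (SCRATCH1 stays zero)
        uint32_t result = 0x3F800000u | (bits & 0x80000000u);  // 1.0f with v's sign
        float f;
        std::memcpy(&f, &result, sizeof(f));
        return f;
    }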
@ -135,7 +219,6 @@ void RiscVJit::CompIR_FCompare(IRInst inst) {
CONDITIONAL_DISABLE;
switch (inst.op) {
case IROp::ZeroFpCond:
case IROp::FCmp:
case IROp::FCmovVfpuCC:
case IROp::FCmpVfpuBit:
@ -154,9 +237,15 @@ void RiscVJit::CompIR_RoundingMode(IRInst inst) {
switch (inst.op) {
case IROp::RestoreRoundingMode:
RestoreRoundingMode();
break;
case IROp::ApplyRoundingMode:
ApplyRoundingMode();
break;
case IROp::UpdateRoundingMode:
CompIR_Generic(inst);
// We don't need to do anything; instructions allow a "dynamic" rounding mode.
break;
default:

View file

@ -53,8 +53,8 @@ void RiscVJit::SetScratch1ToSrc1Address(IRReg src1) {
#endif
}
int32_t RiscVJit::AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant) {
if (constant < -2048 || constant > 2047) {
int32_t RiscVJit::AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant, int32_t range) {
if (constant < -2048 || constant + range > 2047) {
LI(SCRATCH2, constant);
ADD(SCRATCH1, *reg, SCRATCH2);
*reg = SCRATCH1;
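
The new range parameter exists because RISC-V load/store immediates are signed 12-bit: every offset in [constant, constant + range] must be encodable, so a Vec4 access passes range = 12 to keep imm + 12 within reach. A sketch of the constraint (helper name is ours):

    #include <cstdint>

    static bool FitsInLoadStoreImm(int32_t constant, int32_t range) {
        // RISC-V I-type and S-type immediates span [-2048, 2047].
        return constant >= -2048 && constant + range <= 2047;
    }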
@ -124,7 +124,8 @@ void RiscVJit::CompIR_LoadShift(IRInst inst) {
switch (inst.op) {
case IROp::Load32Left:
case IROp::Load32Right:
CompIR_Generic(inst);
// Should not happen if the pass to split is active.
DISABLE;
break;
default:
@ -136,9 +137,28 @@ void RiscVJit::CompIR_LoadShift(IRInst inst) {
void RiscVJit::CompIR_FLoad(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::LoadFloat:
CompIR_Generic(inst);
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
FL(32, fpr.R(inst.dest), addrReg, imm);
break;
default:
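
All four float/vec load and store paths in this file build the effective address the same way. A hedged sketch, assuming PPSSPP's usual layout where MEMBASEREG holds the host base of PSP memory (helper name is ours):

    #include <cstdint>

    static const uint8_t *HostAddress(const uint8_t *membase, uint32_t mipsAddr, int32_t offset) {
    #ifdef MASKED_PSP_MEMORY
        mipsAddr &= Memory::MEMVIEW32_MASK;  // mirrors the masking above
    #endif
        return membase + mipsAddr + offset;
    }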
@ -150,9 +170,32 @@ void RiscVJit::CompIR_FLoad(IRInst inst) {
void RiscVJit::CompIR_VecLoad(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
// We need to be able to address the whole 16 bytes, so offset of 12.
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant, 12);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::LoadVec4:
CompIR_Generic(inst);
for (int i = 0; i < 4; ++i) {
// Spilling is okay.
fpr.MapReg(inst.dest + i, MIPSMap::NOINIT);
FL(32, fpr.R(inst.dest + i), addrReg, imm + 4 * i);
}
break;
default:
@ -212,7 +255,8 @@ void RiscVJit::CompIR_StoreShift(IRInst inst) {
switch (inst.op) {
case IROp::Store32Left:
case IROp::Store32Right:
CompIR_Generic(inst);
// Should not happen if the pass to split is active.
DISABLE;
break;
default:
@ -224,9 +268,28 @@ void RiscVJit::CompIR_StoreShift(IRInst inst) {
void RiscVJit::CompIR_FStore(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::StoreFloat:
CompIR_Generic(inst);
fpr.MapReg(inst.src3);
FS(32, fpr.R(inst.src3), addrReg, imm);
break;
default:
@ -238,9 +301,32 @@ void RiscVJit::CompIR_FStore(IRInst inst) {
void RiscVJit::CompIR_VecStore(IRInst inst) {
CONDITIONAL_DISABLE;
RiscVReg addrReg = INVALID_REG;
if (inst.src1 == MIPS_REG_ZERO) {
// This will get changed by AdjustForAddressOffset.
addrReg = MEMBASEREG;
#ifdef MASKED_PSP_MEMORY
inst.constant &= Memory::MEMVIEW32_MASK;
#endif
} else if (jo.cachePointers || gpr.IsMappedAsPointer(inst.src1)) {
addrReg = gpr.MapRegAsPointer(inst.src1);
} else {
SetScratch1ToSrc1Address(inst.src1);
addrReg = SCRATCH1;
}
// We need to be able to address the whole 16 bytes, so offset of 12.
s32 imm = AdjustForAddressOffset(&addrReg, inst.constant, 12);
// TODO: Safe memory? Or enough to have crash handler + validate?
switch (inst.op) {
case IROp::StoreVec4:
CompIR_Generic(inst);
for (int i = 0; i < 4; ++i) {
// Spilling is okay, though not ideal.
fpr.MapReg(inst.src3 + i);
FS(32, fpr.R(inst.src3 + i), addrReg, imm + 4 * i);
}
break;
default:

View file

@ -15,7 +15,12 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Common/Profiler/Profiler.h"
#include "Core/Core.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/ReplaceTables.h"
#include "Core/MemMap.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/MIPS/RiscV/RiscVJit.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
@ -45,7 +50,15 @@ void RiscVJit::CompIR_Basic(IRInst inst) {
break;
case IROp::SetConstF:
CompIR_Generic(inst);
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
if (inst.constant == 0) {
FCVT(FConv::S, FConv::W, fpr.R(inst.dest), R_ZERO);
} else {
// TODO: In the future, could use FLI if it's approved.
// Also, is FCVT faster?
LI(SCRATCH1, (int32_t)inst.constant);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
}
break;
case IROp::Downcount:
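
The SetConstF path above works because inst.constant already holds the float's IEEE-754 bit pattern, so moving it bitwise into an FPR (FMV.W.X) is enough; zero takes the FCVT-from-x0 shortcut to skip the integer LI. In scalar terms (helper name is ours):

    #include <cstdint>
    #include <cstring>

    static float ConstFToFloat(uint32_t constant) {
        float f;
        std::memcpy(&f, &constant, sizeof(f));  // the LI + FMV.W.X pair, in C++
        return f;
    }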
@ -78,13 +91,85 @@ void RiscVJit::CompIR_Transfer(IRInst inst) {
switch (inst.op) {
case IROp::SetCtrlVFPU:
gpr.SetImm(IRREG_VFPU_CTRL_BASE + inst.dest, (int32_t)inst.constant);
break;
case IROp::SetCtrlVFPUReg:
gpr.MapDirtyIn(IRREG_VFPU_CTRL_BASE + inst.dest, inst.src1);
MV(gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), gpr.R(inst.src1));
gpr.MarkDirty(gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), gpr.IsNormalized32(inst.src1));
break;
case IROp::SetCtrlVFPUFReg:
gpr.MapReg(IRREG_VFPU_CTRL_BASE + inst.dest, MIPSMap::NOINIT);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(IRREG_VFPU_CTRL_BASE + inst.dest), fpr.R(inst.src1));
break;
case IROp::FpCondToReg:
gpr.MapDirtyIn(inst.dest, IRREG_FPCOND);
MV(gpr.R(inst.dest), gpr.R(IRREG_FPCOND));
gpr.MarkDirty(gpr.R(inst.dest), gpr.IsNormalized32(IRREG_FPCOND));
break;
case IROp::ZeroFpCond:
gpr.SetImm(IRREG_FPCOND, 0);
break;
case IROp::FpCtrlFromReg:
gpr.MapDirtyIn(IRREG_FPCOND, inst.src1, MapType::AVOID_LOAD_MARK_NORM32);
LI(SCRATCH1, 0x0181FFFF);
AND(SCRATCH1, gpr.R(inst.src1), SCRATCH1);
// Extract the new fpcond value.
if (cpu_info.RiscV_Zbs) {
BEXTI(gpr.R(IRREG_FPCOND), SCRATCH1, 23);
} else {
SRLI(gpr.R(IRREG_FPCOND), SCRATCH1, 23);
ANDI(gpr.R(IRREG_FPCOND), gpr.R(IRREG_FPCOND), 1);
}
SW(SCRATCH1, CTXREG, IRREG_FCR31 * 4);
break;
case IROp::FpCtrlToReg:
gpr.MapDirtyIn(inst.dest, IRREG_FPCOND, MapType::AVOID_LOAD_MARK_NORM32);
// Load fcr31 and clear the fpcond bit.
LW(SCRATCH1, CTXREG, IRREG_FCR31 * 4);
if (cpu_info.RiscV_Zbs) {
BCLRI(SCRATCH1, SCRATCH1, 23);
} else {
LI(SCRATCH2, ~(1 << 23));
AND(SCRATCH1, SCRATCH1, SCRATCH2);
}
// Now get the correct fpcond bit.
ANDI(SCRATCH2, gpr.R(IRREG_FPCOND), 1);
SLLI(SCRATCH2, SCRATCH2, 23);
OR(gpr.R(inst.dest), SCRATCH1, SCRATCH2);
// Also update mips->fcr31 while we're here.
SW(gpr.R(inst.dest), CTXREG, IRREG_FCR31 * 4);
break;
case IROp::VfpuCtrlToReg:
gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1);
MV(gpr.R(inst.dest), gpr.R(IRREG_VFPU_CTRL_BASE + inst.src1));
gpr.MarkDirty(gpr.R(inst.dest), gpr.IsNormalized32(IRREG_VFPU_CTRL_BASE + inst.src1));
break;
case IROp::FMovFromGPR:
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
if (gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) {
FCVT(FConv::S, FConv::W, fpr.R(inst.dest), R_ZERO);
} else {
gpr.MapReg(inst.src1);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), gpr.R(inst.src1));
}
break;
case IROp::FMovToGPR:
CompIR_Generic(inst);
gpr.MapReg(inst.dest, MIPSMap::NOINIT);
fpr.MapReg(inst.src1);
FMV(FMv::X, FMv::W, gpr.R(inst.dest), fpr.R(inst.src1));
break;
default:
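
For readers unfamiliar with the Zbs extension used in the FpCtrl cases above, the two single-bit instructions have simple scalar equivalents (bit 23 being the fpcond position assumed throughout):

    #include <cstdint>

    static uint32_t Bexti23(uint32_t v) { return (v >> 23) & 1; }    // BEXTI rd, rs, 23
    static uint32_t Bclri23(uint32_t v) { return v & ~(1u << 23); }  // BCLRI rd, rs, 23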
@ -98,10 +183,61 @@ void RiscVJit::CompIR_System(IRInst inst) {
switch (inst.op) {
case IROp::Interpret:
// IR protects us against this being a branching instruction (well, hopefully.)
FlushAll();
SaveStaticRegisters();
LI(X10, (int32_t)inst.constant);
QuickCallFunction((const u8 *)MIPSGetInterpretFunc(MIPSOpcode(inst.constant)));
LoadStaticRegisters();
break;
case IROp::Syscall:
FlushAll();
SaveStaticRegisters();
#ifdef USE_PROFILER
// When profiling, we can't skip CallSyscall, since it times syscalls.
LI(X10, (int32_t)inst.constant);
QuickCallFunction(&CallSyscall);
#else
// Skip the CallSyscall where possible.
{
MIPSOpcode op(inst.constant);
void *quickFunc = GetQuickSyscallFunc(op);
if (quickFunc) {
LI(X10, (uintptr_t)GetSyscallFuncPointer(op));
QuickCallFunction((const u8 *)quickFunc);
} else {
LI(X10, (int32_t)inst.constant);
QuickCallFunction(&CallSyscall);
}
}
#endif
LoadStaticRegisters();
// This is always followed by an ExitToPC, where we check coreState.
break;
case IROp::CallReplacement:
FlushAll();
SaveStaticRegisters();
QuickCallFunction(GetReplacementFunc(inst.constant)->replaceFunc);
LoadStaticRegisters();
SUB(DOWNCOUNTREG, DOWNCOUNTREG, X10);
break;
case IROp::Break:
CompIR_Generic(inst);
FlushAll();
// This doesn't naturally have restore/apply around it.
RestoreRoundingMode(true);
SaveStaticRegisters();
MovFromPC(X10);
QuickCallFunction(&Core_Break);
LoadStaticRegisters();
ApplyRoundingMode(true);
MovFromPC(SCRATCH1);
ADDI(SCRATCH1, SCRATCH1, 4);
QuickJ(R_RA, dispatcherPCInSCRATCH1_);
break;
default:

View file

@ -39,9 +39,88 @@ void RiscVJit::CompIR_VecAssign(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Init:
for (int i = 0; i < 4; ++i)
fpr.SpillLock(inst.dest + i);
for (int i = 0; i < 4; ++i)
fpr.MapReg(inst.dest + i, MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i)
fpr.ReleaseSpillLock(inst.dest + i);
// TODO: Check if FCVT/FMV/FL is better.
switch ((Vec4Init)inst.src1) {
case Vec4Init::AllZERO:
for (int i = 0; i < 4; ++i)
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
break;
case Vec4Init::AllONE:
LI(SCRATCH1, 1.0f);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.dest));
break;
case Vec4Init::AllMinusONE:
LI(SCRATCH1, -1.0f);
FMV(FMv::W, FMv::X, fpr.R(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.dest));
break;
case Vec4Init::Set_1000:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 0)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
case Vec4Init::Set_0100:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 1)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
case Vec4Init::Set_0010:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 2)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
case Vec4Init::Set_0001:
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 3)
FMV(FMv::W, FMv::X, fpr.R(inst.dest + i), SCRATCH1);
else
FCVT(FConv::S, FConv::W, fpr.R(inst.dest + i), R_ZERO);
}
break;
}
break;
case IROp::Vec4Shuffle:
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i) {
int lane = (inst.src2 >> (i * 2)) & 3;
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + lane));
}
break;
case IROp::Vec4Mov:
CompIR_Generic(inst);
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i)
FMV(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i));
break;
default:
@ -55,13 +134,48 @@ void RiscVJit::CompIR_VecArith(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Add:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FADD(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Sub:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FSUB(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Mul:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FMUL(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Div:
fpr.Map4DirtyInIn(inst.dest, inst.src1, inst.src2);
for (int i = 0; i < 4; ++i)
FDIV(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
break;
case IROp::Vec4Scale:
fpr.SpillLock(inst.src2);
fpr.MapReg(inst.src2);
fpr.Map4DirtyIn(inst.dest, inst.src1);
fpr.ReleaseSpillLock(inst.src2);
for (int i = 0; i < 4; ++i)
FMUL(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i), fpr.R(inst.src2));
break;
case IROp::Vec4Neg:
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i)
FNEG(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i));
break;
case IROp::Vec4Abs:
CompIR_Generic(inst);
fpr.Map4DirtyIn(inst.dest, inst.src1);
for (int i = 0; i < 4; ++i)
FABS(32, fpr.R(inst.dest + i), fpr.R(inst.src1 + i));
break;
default:
@ -75,7 +189,39 @@ void RiscVJit::CompIR_VecHoriz(IRInst inst) {
switch (inst.op) {
case IROp::Vec4Dot:
CompIR_Generic(inst);
// TODO: Maybe some option to call the slow accurate mode?
fpr.SpillLock(inst.dest);
for (int i = 0; i < 4; ++i) {
fpr.SpillLock(inst.src1 + i);
fpr.SpillLock(inst.src2 + i);
}
for (int i = 0; i < 4; ++i) {
fpr.MapReg(inst.src1 + i);
fpr.MapReg(inst.src2 + i);
}
fpr.MapReg(inst.dest, MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i) {
fpr.ReleaseSpillLock(inst.src1 + i);
fpr.ReleaseSpillLock(inst.src2 + i);
}
fpr.ReleaseSpillLock(inst.dest);
if ((inst.dest < inst.src1 + 4 && inst.dest >= inst.src1) || (inst.dest < inst.src2 + 4 && inst.dest >= inst.src2)) {
// This means inst.dest overlaps one of src1 or src2. We have to do that one first.
// Technically this may impact -0.0 and such, but dots accurately need to be aligned anyway.
for (int i = 0; i < 4; ++i) {
if (inst.dest == inst.src1 + i || inst.dest == inst.src2 + i)
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i));
}
for (int i = 0; i < 4; ++i) {
if (inst.dest != inst.src1 + i && inst.dest != inst.src2 + i)
FMADD(32, fpr.R(inst.dest), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i), fpr.R(inst.dest));
}
} else {
FMUL(32, fpr.R(inst.dest), fpr.R(inst.src1), fpr.R(inst.src2));
for (int i = 1; i < 4; ++i)
FMADD(32, fpr.R(inst.dest), fpr.R(inst.src1 + i), fpr.R(inst.src2 + i), fpr.R(inst.dest));
}
break;
default:
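
A scalar sketch of the Vec4Dot lowering (helper name is ours): the FMADD chain accumulates into dest, so when dest aliases a source lane, that lane's product must be taken first or a later step would read an already-overwritten value.

    static float Vec4DotSketch(const float *s1, const float *s2) {
        float acc = s1[0] * s2[0];      // FMUL seeds the accumulator
        for (int i = 1; i < 4; ++i)
            acc = s1[i] * s2[i] + acc;  // FMADD per remaining lane
        return acc;
    }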

View file

@ -26,7 +26,7 @@ namespace MIPSComp {
using namespace RiscVGen;
using namespace RiscVJitConstants;
RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo) {
RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo), fpr(mipsState, &jo) {
// Automatically disable incompatible options.
if (((intptr_t)Memory::base & 0x00000000FFFFFFFFUL) != 0) {
jo.enablePointerify = false;
@ -40,7 +40,7 @@ RiscVJit::RiscVJit(MIPSState *mipsState) : IRJit(mipsState), gpr(mipsState, &jo)
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
gpr.Init(this);
// TODO: fpr
fpr.Init(this);
GenerateFixedCode(jo);
}
@ -79,7 +79,7 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
blockStartAddrs_[block_num] = GetCodePointer();
gpr.Start();
// TODO: fpr.
fpr.Start();
for (const IRInst &inst : instructions) {
CompileIRInst(inst);
@ -87,9 +87,8 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
if (jo.Disabled(JitDisable::REGALLOC_GPR)) {
gpr.FlushAll();
}
// TODO
if (jo.Disabled(JitDisable::REGALLOC_FPR)) {
//fpr.FlushAll();
fpr.FlushAll();
}
// Safety check, in case we get a bunch of really large jit ops without a lot of branching.
@ -107,13 +106,6 @@ bool RiscVJit::CompileBlock(u32 em_address, std::vector<IRInst> &instructions, u
return true;
}
static u32 DoIRInst(uint64_t value) {
IRInst inst;
memcpy(&inst, &value, sizeof(inst));
return IRInterpret(currentMIPS, &inst, 1);
}
void RiscVJit::CompileIRInst(IRInst inst) {
switch (inst.op) {
case IROp::Nop:
@ -281,7 +273,6 @@ void RiscVJit::CompileIRInst(IRInst inst) {
CompIR_FSat(inst);
break;
case IROp::ZeroFpCond:
case IROp::FCmp:
case IROp::FCmovVfpuCC:
case IROp::FCmpVfpuBit:
@ -299,6 +290,9 @@ void RiscVJit::CompileIRInst(IRInst inst) {
case IROp::SetCtrlVFPUReg:
case IROp::SetCtrlVFPUFReg:
case IROp::FpCondToReg:
case IROp::ZeroFpCond:
case IROp::FpCtrlFromReg:
case IROp::FpCtrlToReg:
case IROp::VfpuCtrlToReg:
case IROp::FMovFromGPR:
case IROp::FMovToGPR:
@ -392,9 +386,15 @@ void RiscVJit::CompileIRInst(IRInst inst) {
}
}
static u32 DoIRInst(uint64_t value) {
IRInst inst;
memcpy(&inst, &value, sizeof(inst));
return IRInterpret(currentMIPS, &inst, 1);
}
void RiscVJit::CompIR_Generic(IRInst inst) {
// For now, we're gonna do it the slow and ugly way.
// Maybe there's a smarter way to fall back?
// If we got here, we're going the slow way.
uint64_t value;
memcpy(&value, &inst, sizeof(inst));
@ -403,20 +403,24 @@ void RiscVJit::CompIR_Generic(IRInst inst) {
SaveStaticRegisters();
QuickCallFunction(&DoIRInst);
LoadStaticRegisters();
// Result in X10 aka SCRATCH1.
_assert_(X10 == SCRATCH1);
if (BInRange(dispatcherPCInSCRATCH1_)) {
BNE(X10, R_ZERO, dispatcherPCInSCRATCH1_);
} else {
FixupBranch skip = BEQ(X10, R_ZERO);
QuickJ(R_RA, dispatcherPCInSCRATCH1_);
SetJumpTarget(skip);
// We only need to check the return value if it's a potential exit.
if ((GetIRMeta(inst.op)->flags & IRFLAG_EXIT) != 0) {
// Result in X10 aka SCRATCH1.
_assert_(X10 == SCRATCH1);
if (BInRange(dispatcherPCInSCRATCH1_)) {
BNE(X10, R_ZERO, dispatcherPCInSCRATCH1_);
} else {
FixupBranch skip = BEQ(X10, R_ZERO);
QuickJ(R_RA, dispatcherPCInSCRATCH1_);
SetJumpTarget(skip);
}
}
}
void RiscVJit::FlushAll() {
gpr.FlushAll();
// TODO: fpr.
fpr.FlushAll();
}
bool RiscVJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
@ -433,6 +437,8 @@ bool RiscVJit::DescribeCodePtr(const u8 *ptr, std::string &name) {
name = "loadStaticRegisters";
} else if (ptr == enterDispatcher_) {
name = "enterDispatcher";
} else if (ptr == applyRoundingMode_) {
name = "applyRoundingMode";
} else if (!IsInSpace(ptr)) {
return false;
} else {
@ -492,20 +498,12 @@ void RiscVJit::ClearCache() {
memset(blockStartAddrs_, 0, sizeof(blockStartAddrs_[0]) * MAX_ALLOWED_JIT_BLOCKS);
}
void RiscVJit::UpdateFCR31() {
IRJit::UpdateFCR31();
// TODO: Handle rounding modes?
}
void RiscVJit::RestoreRoundingMode(bool force) {
// TODO: Could maybe skip if not hasSetRounding? But that's on IRFrontend...
FSRMI(Round::NEAREST_EVEN);
}
void RiscVJit::ApplyRoundingMode(bool force) {
// TODO: Also could maybe sometimes skip?
//QuickCallFunction(applyRoundingMode_);
QuickCallFunction(applyRoundingMode_);
}
void RiscVJit::MovFromPC(RiscVReg r) {

View file

@ -24,6 +24,7 @@
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/MIPS/JitCommon/JitCommon.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
#include "Core/MIPS/RiscV/RiscVRegCacheFPU.h"
namespace MIPSComp {
@ -41,7 +42,6 @@ public:
const u8 *GetCrashHandler() const override;
void ClearCache() override;
void UpdateFCR31() override;
// TODO: GetBlockCacheDebugInterface, block linking?
@ -107,12 +107,13 @@ private:
void SetScratch1ToSrc1Address(IRReg src1);
// Modifies SCRATCH regs.
int32_t AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant);
int32_t AdjustForAddressOffset(RiscVGen::RiscVReg *reg, int32_t constant, int32_t range = 0);
void NormalizeSrc1(IRInst inst, RiscVGen::RiscVReg *reg, RiscVGen::RiscVReg tempReg, bool allowOverlap);
void NormalizeSrc12(IRInst inst, RiscVGen::RiscVReg *lhs, RiscVGen::RiscVReg *rhs, RiscVGen::RiscVReg lhsTempReg, RiscVGen::RiscVReg rhsTempReg, bool allowOverlap);
RiscVGen::RiscVReg NormalizeR(IRRegIndex rs, IRRegIndex rd, RiscVGen::RiscVReg tempReg);
RiscVRegCache gpr;
RiscVRegCacheFPU fpr;
static constexpr int MAX_ALLOWED_JIT_BLOCKS = 262144;
@ -125,6 +126,7 @@ private:
const u8 *dispatcher_ = nullptr;
const u8 *dispatcherNoCheck_ = nullptr;
const u8 *dispatcherFetch_ = nullptr;
const u8 *applyRoundingMode_ = nullptr;
const u8 *saveStaticRegisters_ = nullptr;
const u8 *loadStaticRegisters_ = nullptr;

View file

@ -15,15 +15,15 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#ifndef offsetof
#include <cstddef>
#endif
#include "Common/CPUDetect.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/Reporting.h"
#ifndef offsetof
#include "stddef.h"
#endif
using namespace RiscVGen;
using namespace RiscVJitConstants;
@ -36,20 +36,14 @@ void RiscVRegCache::Init(RiscVEmitter *emitter) {
}
void RiscVRegCache::Start() {
for (int i = 0; i < NUM_RVREG; i++) {
ar[i].mipsReg = IRREG_INVALID;
ar[i].isDirty = false;
ar[i].pointerified = false;
ar[i].tempLocked = false;
ar[i].normalized32 = false;
}
for (int i = 0; i < NUM_MIPSREG; i++) {
mr[i].loc = MIPSLoc::MEM;
mr[i].reg = INVALID_REG;
mr[i].imm = -1;
mr[i].spillLock = false;
mr[i].isStatic = false;
if (!initialReady_) {
SetupInitialRegs();
initialReady_ = true;
}
memcpy(ar, arInitial_, sizeof(ar));
memcpy(mr, mrInitial_, sizeof(mr));
int numStatics;
const StaticAllocation *statics = GetStaticAllocations(numStatics);
for (int i = 0; i < numStatics; i++) {
@ -61,24 +55,41 @@ void RiscVRegCache::Start() {
mr[statics[i].mr].isStatic = true;
mr[statics[i].mr].spillLock = true;
}
}
void RiscVRegCache::SetupInitialRegs() {
for (int i = 0; i < NUM_RVREG; i++) {
arInitial_[i].mipsReg = IRREG_INVALID;
arInitial_[i].isDirty = false;
arInitial_[i].pointerified = false;
arInitial_[i].tempLocked = false;
arInitial_[i].normalized32 = false;
}
for (int i = 0; i < NUM_MIPSREG; i++) {
mrInitial_[i].loc = MIPSLoc::MEM;
mrInitial_[i].reg = INVALID_REG;
mrInitial_[i].imm = -1;
mrInitial_[i].spillLock = false;
mrInitial_[i].isStatic = false;
}
// Treat R_ZERO a bit specially, but it's basically static alloc too.
ar[R_ZERO].mipsReg = MIPS_REG_ZERO;
ar[R_ZERO].normalized32 = true;
mr[MIPS_REG_ZERO].loc = MIPSLoc::RVREG_IMM;
mr[MIPS_REG_ZERO].reg = R_ZERO;
mr[MIPS_REG_ZERO].imm = 0;
mr[MIPS_REG_ZERO].isStatic = true;
arInitial_[R_ZERO].mipsReg = MIPS_REG_ZERO;
arInitial_[R_ZERO].normalized32 = true;
mrInitial_[MIPS_REG_ZERO].loc = MIPSLoc::RVREG_IMM;
mrInitial_[MIPS_REG_ZERO].reg = R_ZERO;
mrInitial_[MIPS_REG_ZERO].imm = 0;
mrInitial_[MIPS_REG_ZERO].isStatic = true;
}
const RiscVReg *RiscVRegCache::GetMIPSAllocationOrder(int &count) {
// X8 and X9 are the most ideal for static alloc because they can be used with compression.
// Otherwise we stick to saved regs - might not be necessary.
static const RiscVReg allocationOrder[] = {
X7, X8, X9, X12, X13, X14, X5, X6, X15, X16, X17, X18, X19, X20, X21, X22, X23, X28, X29, X30, X31,
X8, X9, X12, X13, X14, X15, X5, X6, X7, X16, X17, X18, X19, X20, X21, X22, X23, X28, X29, X30, X31,
};
static const RiscVReg allocationOrderStaticAlloc[] = {
X7, X12, X13, X14, X5, X6, X15, X16, X17, X21, X22, X23, X28, X29, X30, X31,
X12, X13, X14, X15, X5, X6, X7, X16, X17, X21, X22, X23, X28, X29, X30, X31,
};
if (jo_->useStaticAlloc) {
@ -432,6 +443,7 @@ RiscVReg RiscVRegCache::GetAndLockTempR() {
RiscVReg reg = AllocateReg();
if (reg != INVALID_REG) {
ar[reg].tempLocked = true;
pendingUnlock_ = true;
}
return reg;
}
@ -958,14 +970,6 @@ bool RiscVRegCache::IsImm(IRRegIndex r) const {
return mr[r].loc == MIPSLoc::IMM || mr[r].loc == MIPSLoc::RVREG_IMM;
}
bool RiscVRegCache::IsPureImm(IRRegIndex r) const {
_dbg_assert_(IsValidReg(r));
if (r == MIPS_REG_ZERO)
return true;
else
return mr[r].loc == MIPSLoc::IMM;
}
u64 RiscVRegCache::GetImm(IRRegIndex r) const {
_dbg_assert_(IsValidReg(r));
if (r == MIPS_REG_ZERO)
@ -1016,9 +1020,13 @@ void RiscVRegCache::SpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRReg
if (r2 != IRREG_INVALID) mr[r2].spillLock = true;
if (r3 != IRREG_INVALID) mr[r3].spillLock = true;
if (r4 != IRREG_INVALID) mr[r4].spillLock = true;
pendingUnlock_ = true;
}
void RiscVRegCache::ReleaseSpillLocksAndDiscardTemps() {
if (!pendingUnlock_)
return;
for (int i = 0; i < NUM_MIPSREG; i++) {
if (!mr[i].isStatic)
mr[i].spillLock = false;
@ -1026,6 +1034,8 @@ void RiscVRegCache::ReleaseSpillLocksAndDiscardTemps() {
for (int i = 0; i < NUM_RVREG; i++) {
ar[i].tempLocked = false;
}
pendingUnlock_ = false;
}
void RiscVRegCache::ReleaseSpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRRegIndex r4) {

View file

@ -68,10 +68,6 @@ enum class MapType {
} // namespace RiscVJitConstants
namespace MIPSAnalyst {
struct AnalysisResults;
};
namespace MIPSComp {
struct JitOptions;
}
@ -116,10 +112,7 @@ public:
void SetImm(IRRegIndex reg, u64 immVal);
bool IsImm(IRRegIndex reg) const;
bool IsPureImm(IRRegIndex reg) const;
u64 GetImm(IRRegIndex reg) const;
// Optimally set a register to an imm value (possibly using another register.)
void SetRegImm(RiscVGen::RiscVReg reg, u64 imm);
// May fail and return INVALID_REG if it needs flushing.
RiscVGen::RiscVReg TryMapTempImm(IRRegIndex);
@ -144,7 +137,6 @@ public:
void MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, RiscVJitConstants::MapType type = RiscVJitConstants::MapType::AVOID_LOAD);
void MapDirtyDirtyIn(IRRegIndex rd1, IRRegIndex rd2, IRRegIndex rs, RiscVJitConstants::MapType type = RiscVJitConstants::MapType::AVOID_LOAD);
void MapDirtyDirtyInIn(IRRegIndex rd1, IRRegIndex rd2, IRRegIndex rs, IRRegIndex rt, RiscVJitConstants::MapType type = RiscVJitConstants::MapType::AVOID_LOAD);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void FlushBeforeCall();
void FlushAll();
void FlushR(IRRegIndex r);
@ -171,12 +163,16 @@ private:
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
void SetRegImm(RiscVGen::RiscVReg reg, u64 imm);
void AddMemBase(RiscVGen::RiscVReg reg);
int GetMipsRegOffset(IRRegIndex r);
bool IsValidReg(IRRegIndex r) const;
bool IsValidRegNoZero(IRRegIndex r) const;
void SetupInitialRegs();
MIPSState *mips_;
RiscVGen::RiscVEmitter *emit_ = nullptr;
MIPSComp::JitOptions *jo_;
@ -188,4 +184,9 @@ private:
RegStatusRiscV ar[NUM_RVREG]{};
RegStatusMIPS mr[NUM_MIPSREG]{};
bool initialReady_ = false;
bool pendingUnlock_ = false;
RegStatusRiscV arInitial_[NUM_RVREG];
RegStatusMIPS mrInitial_[NUM_MIPSREG];
};

View file

@ -14,3 +14,401 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#ifndef offsetof
#include <cstddef>
#endif
#include "Common/CPUDetect.h"
#include "Core/MIPS/RiscV/RiscVRegCacheFPU.h"
#include "Core/MIPS/JitCommon/JitState.h"
#include "Core/Reporting.h"
using namespace RiscVGen;
using namespace RiscVJitConstants;
RiscVRegCacheFPU::RiscVRegCacheFPU(MIPSState *mipsState, MIPSComp::JitOptions *jo)
: mips_(mipsState), jo_(jo) {}
void RiscVRegCacheFPU::Init(RiscVEmitter *emitter) {
emit_ = emitter;
}
void RiscVRegCacheFPU::Start() {
if (!initialReady_) {
SetupInitialRegs();
initialReady_ = true;
}
memcpy(ar, arInitial_, sizeof(ar));
memcpy(mr, mrInitial_, sizeof(mr));
pendingFlush_ = false;
}
void RiscVRegCacheFPU::SetupInitialRegs() {
for (int i = 0; i < NUM_RVFPUREG; i++) {
arInitial_[i].mipsReg = IRREG_INVALID;
arInitial_[i].isDirty = false;
}
for (int i = 0; i < NUM_MIPSFPUREG; i++) {
mrInitial_[i].loc = MIPSLoc::MEM;
mrInitial_[i].reg = (int)INVALID_REG;
mrInitial_[i].spillLock = false;
}
}
const RiscVReg *RiscVRegCacheFPU::GetMIPSAllocationOrder(int &count) {
// F8 through F15 can be encoded in compressed instructions, so they are great.
// TODO: Maybe we could remove some saved regs since we rarely need that many? Or maybe they're worth keeping?
static const RiscVReg allocationOrder[] = {
F8, F9, F10, F11, F12, F13, F14, F15,
F0, F1, F2, F3, F4, F5, F6, F7,
F16, F17, F18, F19, F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30, F31,
};
count = ARRAY_SIZE(allocationOrder);
return allocationOrder;
}
bool RiscVRegCacheFPU::IsInRAM(IRRegIndex reg) {
_dbg_assert_(IsValidReg(reg));
return mr[reg].loc == MIPSLoc::MEM;
}
bool RiscVRegCacheFPU::IsMapped(IRRegIndex mipsReg) {
_dbg_assert_(IsValidReg(mipsReg));
return mr[mipsReg].loc == MIPSLoc::RVREG;
}
RiscVReg RiscVRegCacheFPU::MapReg(IRRegIndex mipsReg, MIPSMap mapFlags) {
_dbg_assert_(IsValidReg(mipsReg));
_dbg_assert_(mr[mipsReg].loc == MIPSLoc::MEM || mr[mipsReg].loc == MIPSLoc::RVREG);
pendingFlush_ = true;
// Let's see if it's already mapped. If so we just need to update the dirty flag.
// We don't need to check for NOINIT because we assume that anyone who maps
// with that flag immediately writes a "known" value to the register.
if (mr[mipsReg].loc == MIPSLoc::RVREG) {
_assert_msg_(ar[mr[mipsReg].reg].mipsReg == mipsReg, "FPU mapping out of sync, IR=%i", mipsReg);
if ((mapFlags & MIPSMap::DIRTY) == MIPSMap::DIRTY) {
ar[mr[mipsReg].reg].isDirty = true;
}
return (RiscVReg)(mr[mipsReg].reg + F0);
}
// Okay, not mapped, so we need to allocate an RV register.
RiscVReg reg = AllocateReg();
if (reg != INVALID_REG) {
// That means it's free. Grab it, and load the value into it (if requested).
ar[reg - F0].isDirty = (mapFlags & MIPSMap::DIRTY) == MIPSMap::DIRTY;
if ((mapFlags & MIPSMap::NOINIT) != MIPSMap::NOINIT) {
if (mr[mipsReg].loc == MIPSLoc::MEM) {
emit_->FL(32, reg, CTXREG, GetMipsRegOffset(mipsReg));
}
}
ar[reg - F0].mipsReg = mipsReg;
mr[mipsReg].loc = MIPSLoc::RVREG;
mr[mipsReg].reg = reg - F0;
return reg;
}
return reg;
}
RiscVReg RiscVRegCacheFPU::AllocateReg() {
int allocCount = 0;
const RiscVReg *allocOrder = GetMIPSAllocationOrder(allocCount);
allocate:
for (int i = 0; i < allocCount; i++) {
RiscVReg reg = allocOrder[i];
if (ar[reg - F0].mipsReg == IRREG_INVALID) {
return reg;
}
}
// Still nothing. Let's spill a reg and goto 10.
// TODO: Use age or something to choose which register to spill?
// TODO: Spill dirty regs first? or opposite?
bool clobbered;
RiscVReg bestToSpill = FindBestToSpill(true, &clobbered);
if (bestToSpill == INVALID_REG) {
bestToSpill = FindBestToSpill(false, &clobbered);
}
if (bestToSpill != INVALID_REG) {
if (clobbered) {
DiscardR(ar[bestToSpill - F0].mipsReg);
} else {
FlushRiscVReg(bestToSpill);
}
// Now one must be free.
goto allocate;
}
// Uh oh, we have all of them spill-locked...
ERROR_LOG_REPORT(JIT, "Out of spillable registers near PC %08x", mips_->pc);
_assert_(bestToSpill != INVALID_REG);
return INVALID_REG;
}
RiscVReg RiscVRegCacheFPU::FindBestToSpill(bool unusedOnly, bool *clobbered) {
int allocCount = 0;
const RiscVReg *allocOrder = GetMIPSAllocationOrder(allocCount);
static const int UNUSED_LOOKAHEAD_OPS = 30;
*clobbered = false;
for (int i = 0; i < allocCount; i++) {
RiscVReg reg = allocOrder[i];
if (ar[reg - F0].mipsReg != IRREG_INVALID && mr[ar[reg - F0].mipsReg].spillLock)
continue;
// TODO: Look for clobbering in the IRInst array with index?
// Not awesome. A used reg. Let's try to avoid spilling.
// TODO: Actually check if we'd be spilling.
if (unusedOnly) {
continue;
}
return reg;
}
return INVALID_REG;
}
void RiscVRegCacheFPU::MapInIn(IRRegIndex rd, IRRegIndex rs) {
SpillLock(rd, rs);
MapReg(rd);
MapReg(rs);
ReleaseSpillLock(rd);
ReleaseSpillLock(rs);
}
void RiscVRegCacheFPU::MapDirtyIn(IRRegIndex rd, IRRegIndex rs, bool avoidLoad) {
SpillLock(rd, rs);
bool load = !avoidLoad || rd == rs;
MapReg(rd, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
MapReg(rs);
ReleaseSpillLock(rd);
ReleaseSpillLock(rs);
}
void RiscVRegCacheFPU::MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, bool avoidLoad) {
SpillLock(rd, rs, rt);
bool load = !avoidLoad || (rd == rs || rd == rt);
MapReg(rd, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
MapReg(rt);
MapReg(rs);
ReleaseSpillLock(rd);
ReleaseSpillLock(rs);
ReleaseSpillLock(rt);
}
void RiscVRegCacheFPU::Map4DirtyIn(IRRegIndex rdbase, IRRegIndex rsbase, bool avoidLoad) {
for (int i = 0; i < 4; ++i)
SpillLock(rdbase + i, rsbase + i);
bool load = !avoidLoad || (rdbase < rsbase + 4 && rdbase + 4 > rsbase);
for (int i = 0; i < 4; ++i)
MapReg(rdbase + i, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i)
MapReg(rsbase + i);
for (int i = 0; i < 4; ++i)
ReleaseSpillLock(rdbase + i, rsbase + i);
}
void RiscVRegCacheFPU::Map4DirtyInIn(IRRegIndex rdbase, IRRegIndex rsbase, IRRegIndex rtbase, bool avoidLoad) {
for (int i = 0; i < 4; ++i)
SpillLock(rdbase + i, rsbase + i, rtbase + i);
bool load = !avoidLoad || (rdbase < rsbase + 4 && rdbase + 4 > rsbase) || (rdbase < rtbase + 4 && rdbase + 4 > rtbase);
for (int i = 0; i < 4; ++i)
MapReg(rdbase + i, load ? MIPSMap::DIRTY : MIPSMap::NOINIT);
for (int i = 0; i < 4; ++i)
MapReg(rsbase + i);
for (int i = 0; i < 4; ++i)
MapReg(rtbase + i);
for (int i = 0; i < 4; ++i)
ReleaseSpillLock(rdbase + i, rsbase + i, rtbase + i);
}
void RiscVRegCacheFPU::FlushRiscVReg(RiscVReg r) {
_dbg_assert_(r >= F0 && r <= F31);
int reg = r - F0;
if (ar[reg].mipsReg == IRREG_INVALID) {
// Nothing to do, reg not mapped.
return;
}
if (ar[reg].isDirty && mr[ar[reg].mipsReg].loc == MIPSLoc::RVREG) {
emit_->FS(32, r, CTXREG, GetMipsRegOffset(ar[reg].mipsReg));
}
mr[ar[reg].mipsReg].loc = MIPSLoc::MEM;
mr[ar[reg].mipsReg].reg = (int)INVALID_REG;
ar[reg].mipsReg = IRREG_INVALID;
ar[reg].isDirty = false;
}
void RiscVRegCacheFPU::FlushR(IRRegIndex r) {
_dbg_assert_(IsValidReg(r));
RiscVReg reg = RiscVRegForFlush(r);
if (reg != INVALID_REG)
FlushRiscVReg(reg);
}
RiscVReg RiscVRegCacheFPU::RiscVRegForFlush(IRRegIndex r) {
_dbg_assert_(IsValidReg(r));
switch (mr[r].loc) {
case MIPSLoc::RVREG:
_assert_msg_(mr[r].reg != INVALID_REG, "RiscVRegForFlush: IR %d had bad RiscVReg", r);
if (mr[r].reg == INVALID_REG) {
return INVALID_REG;
}
return (RiscVReg)(F0 + mr[r].reg);
case MIPSLoc::MEM:
return INVALID_REG;
default:
_assert_(false);
return INVALID_REG;
}
}
void RiscVRegCacheFPU::FlushAll() {
if (!pendingFlush_) {
// Nothing allocated. FPU regs are not nearly as common as GPRs.
return;
}
int numRVRegs = 0;
const RiscVReg *order = GetMIPSAllocationOrder(numRVRegs);
for (int i = 0; i < numRVRegs; i++) {
int a = order[i] - F0;
int m = ar[a].mipsReg;
if (ar[a].isDirty) {
_assert_(m != MIPS_REG_INVALID);
emit_->FS(32, order[i], CTXREG, GetMipsRegOffset(m));
mr[m].loc = MIPSLoc::MEM;
mr[m].reg = (int)INVALID_REG;
ar[a].mipsReg = IRREG_INVALID;
ar[a].isDirty = false;
} else {
if (m != IRREG_INVALID) {
mr[m].loc = MIPSLoc::MEM;
mr[m].reg = (int)INVALID_REG;
}
ar[a].mipsReg = IRREG_INVALID;
}
}
pendingFlush_ = false;
}
void RiscVRegCacheFPU::DiscardR(IRRegIndex r) {
_dbg_assert_(IsValidReg(r));
switch (mr[r].loc) {
case MIPSLoc::RVREG:
_assert_(mr[r].reg != INVALID_REG);
if (mr[r].reg != INVALID_REG) {
// Note that we DO NOT write it back here. That's the whole point of Discard.
ar[mr[r].reg].isDirty = false;
ar[mr[r].reg].mipsReg = IRREG_INVALID;
}
break;
case MIPSLoc::MEM:
// Already there, nothing to do.
break;
default:
_assert_(false);
break;
}
mr[r].loc = MIPSLoc::MEM;
mr[r].reg = (int)INVALID_REG;
mr[r].spillLock = false;
}
int RiscVRegCacheFPU::GetMipsRegOffset(IRRegIndex r) {
_assert_(IsValidReg(r));
// These are offsets within the MIPSState structure.
// IR gives us an index that already skips the 32 GPRs, so add them back to compute the offset.
return (32 + r) * 4;
}
void RiscVRegCacheFPU::SpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRRegIndex r4) {
_dbg_assert_(IsValidReg(r1));
_dbg_assert_(r2 == IRREG_INVALID || IsValidReg(r2));
_dbg_assert_(r3 == IRREG_INVALID || IsValidReg(r3));
_dbg_assert_(r4 == IRREG_INVALID || IsValidReg(r4));
mr[r1].spillLock = true;
if (r2 != IRREG_INVALID)
mr[r2].spillLock = true;
if (r3 != IRREG_INVALID)
mr[r3].spillLock = true;
if (r4 != IRREG_INVALID)
mr[r4].spillLock = true;
pendingUnlock_ = true;
}
void RiscVRegCacheFPU::ReleaseSpillLocksAndDiscardTemps() {
if (!pendingUnlock_)
return;
for (int i = 0; i < NUM_MIPSFPUREG; i++) {
mr[i].spillLock = false;
}
pendingUnlock_ = false;
}
void RiscVRegCacheFPU::ReleaseSpillLock(IRRegIndex r1, IRRegIndex r2, IRRegIndex r3, IRRegIndex r4) {
_dbg_assert_(IsValidReg(r1));
_dbg_assert_(r2 == IRREG_INVALID || IsValidReg(r2));
_dbg_assert_(r3 == IRREG_INVALID || IsValidReg(r3));
_dbg_assert_(r4 == IRREG_INVALID || IsValidReg(r4));
mr[r1].spillLock = false;
if (r2 != IRREG_INVALID)
mr[r2].spillLock = false;
if (r3 != IRREG_INVALID)
mr[r3].spillLock = false;
if (r4 != IRREG_INVALID)
mr[r4].spillLock = false;
}
RiscVReg RiscVRegCacheFPU::R(IRRegIndex mipsReg) {
_dbg_assert_(IsValidReg(mipsReg));
_dbg_assert_(mr[mipsReg].loc == MIPSLoc::RVREG);
if (mr[mipsReg].loc == MIPSLoc::RVREG) {
return (RiscVReg)(mr[mipsReg].reg + F0);
} else {
ERROR_LOG_REPORT(JIT, "Reg %i not in riscv reg", mipsReg);
return INVALID_REG; // BAAAD
}
}
bool RiscVRegCacheFPU::IsValidReg(IRRegIndex r) const {
if (r < 0 || r >= NUM_MIPSFPUREG)
return false;
// See MIPSState for these offsets.
int index = r + 32;
// Allow FPU or VFPU regs here.
if (index >= 32 && index < 32 + 32 + 128)
return true;
// Also allow VFPU temps.
if (index >= 224 && index < 224 + 16)
return true;
// Nothing else is allowed for the FPU side cache.
return false;
}

View file

@ -16,3 +16,91 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#pragma once
#include "Common/RiscVEmitter.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/RiscV/RiscVRegCache.h"
struct FPURegStatusRiscV {
int mipsReg; // if -1, no mipsreg attached.
bool isDirty; // Should the register be written back?
};
struct FPURegStatusMIPS {
// Where is this MIPS register?
RiscVJitConstants::MIPSLoc loc;
// Index from F0.
int reg;
bool spillLock; // if true, this register cannot be spilled.
// If loc == ML_MEM, it's back in its location in the CPU context struct.
};
namespace MIPSComp {
struct JitOptions;
}
class RiscVRegCacheFPU {
public:
RiscVRegCacheFPU(MIPSState *mipsState, MIPSComp::JitOptions *jo);
~RiscVRegCacheFPU() {}
void Init(RiscVGen::RiscVEmitter *emitter);
// TODO: Maybe pass in IR block and start PC for logging/debugging?
void Start();
// Protect the RISC-V register containing a MIPS register from spilling, to ensure that
// it's being kept allocated.
void SpillLock(IRRegIndex reg, IRRegIndex reg2 = IRREG_INVALID, IRRegIndex reg3 = IRREG_INVALID, IRRegIndex reg4 = IRREG_INVALID);
void ReleaseSpillLock(IRRegIndex reg, IRRegIndex reg2 = IRREG_INVALID, IRRegIndex reg3 = IRREG_INVALID, IRRegIndex reg4 = IRREG_INVALID);
void ReleaseSpillLocksAndDiscardTemps();
// Returns a RISC-V register containing the requested MIPS register.
RiscVGen::RiscVReg MapReg(IRRegIndex reg, RiscVJitConstants::MIPSMap mapFlags = RiscVJitConstants::MIPSMap::INIT);
bool IsMapped(IRRegIndex r);
bool IsInRAM(IRRegIndex r);
void MapInIn(IRRegIndex rd, IRRegIndex rs);
void MapDirtyIn(IRRegIndex rd, IRRegIndex rs, bool avoidLoad = true);
void MapDirtyInIn(IRRegIndex rd, IRRegIndex rs, IRRegIndex rt, bool avoidLoad = true);
void Map4Dirty(IRRegIndex rdbase, bool avoidLoad = true);
void Map4DirtyIn(IRRegIndex rdbase, IRRegIndex rsbase, bool avoidLoad = true);
void Map4DirtyInIn(IRRegIndex rdbase, IRRegIndex rsbase, IRRegIndex rtbase, bool avoidLoad = true);
void FlushAll();
void FlushR(IRRegIndex r);
void DiscardR(IRRegIndex r);
RiscVGen::RiscVReg R(int preg); // Returns a cached register
private:
const RiscVGen::RiscVReg *GetMIPSAllocationOrder(int &count);
RiscVGen::RiscVReg AllocateReg();
RiscVGen::RiscVReg FindBestToSpill(bool unusedOnly, bool *clobbered);
RiscVGen::RiscVReg RiscVRegForFlush(IRRegIndex r);
void FlushRiscVReg(RiscVGen::RiscVReg r);
int GetMipsRegOffset(IRRegIndex r);
bool IsValidReg(IRRegIndex r) const;
void SetupInitialRegs();
MIPSState *mips_;
RiscVGen::RiscVEmitter *emit_ = nullptr;
MIPSComp::JitOptions *jo_;
enum {
// On RISC-V, each of the 32 FPU registers is its own full register. No sharing of components!
NUM_RVFPUREG = 32,
NUM_MIPSFPUREG = RiscVJitConstants::TOTAL_MAPPABLE_MIPSREGS - 32,
};
FPURegStatusRiscV ar[NUM_RVFPUREG];
FPURegStatusMIPS mr[NUM_MIPSFPUREG];
bool pendingFlush_ = false;
bool pendingUnlock_ = false;
bool initialReady_ = false;
FPURegStatusRiscV arInitial_[NUM_RVFPUREG];
FPURegStatusMIPS mrInitial_[NUM_MIPSFPUREG];
};