Merge pull request #16957 from unknownbrackets/riscv-vertexjit

riscv: Initial vertexjit
Henrik Rydgård 2023-02-13 08:33:05 +01:00 committed by GitHub
commit 86a19cebfd
14 changed files with 657 additions and 7 deletions

View file

@@ -1485,6 +1485,10 @@ list(APPEND CoreExtra
Core/MIPS/MIPS/MipsJit.h
)
list(APPEND CoreExtra
GPU/Common/VertexDecoderRiscV.cpp
)
if(NOT MOBILE_DEVICE)
set(CoreExtra ${CoreExtra}
Core/AVIDump.cpp

View file

@@ -36,6 +36,8 @@
#define Crash() {asm ("bkpt #0");}
#elif PPSSPP_ARCH(ARM64)
#define Crash() {asm ("brk #0");}
#elif PPSSPP_ARCH(RISCV64)
#define Crash() {asm ("ebreak");}
#else
#include <signal.h>
#define Crash() {kill(getpid(), SIGINT);}

View file

@@ -1021,8 +1021,12 @@ void RiscVEmitter::ReserveCodeSpace(u32 bytes) {
_assert_msg_((bytes & 3) == 0 || SupportsCompressed(), "Code space should be aligned (no compressed)");
for (u32 i = 0; i < bytes / 4; i++)
EBREAK();
if (bytes & 2)
Write16(0);
if (bytes & 2) {
if (SupportsCompressed())
C_EBREAK();
else
Write16(0);
}
}
const u8 *RiscVEmitter::AlignCode16() {
@@ -4278,4 +4282,18 @@ void RiscVEmitter::C_SDSP(RiscVReg rs2, u32 uimm9) {
Write16(EncodeCSS(Opcode16::C2, rs2, imm5_4_3_8_7_6, Funct3::C_SDSP));
}
void RiscVCodeBlock::PoisonMemory(int offset) {
// So we can adjust region to writable space. Might be zero.
ptrdiff_t writable = writable_ - code_;
u32 *ptr = (u32 *)(region + offset + writable);
u32 *maxptr = (u32 *)(region + region_size - offset + writable);
// This will only write an even multiple of u32, but not much else to do.
// RiscV: 0x00100073 = EBREAK, 0x9002 = C.EBREAK
while (ptr + 1 <= maxptr)
*ptr++ = 0x00100073;
if (SupportsCompressed() && ptr < maxptr && (intptr_t)maxptr - (intptr_t)ptr >= 2)
*(u16 *)ptr = 0x9002;
}
};
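
As a cross-check, here is a standalone plain-C++ sketch (not PPSSPP API; names are invented) of the poison pattern above: fill with the 32-bit EBREAK encoding, then finish a trailing halfword with C.EBREAK when the compressed extension is available, so any stray jump into the region traps.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical mirror of RiscVCodeBlock::PoisonMemory's fill logic.
void PoisonSketch(uint8_t *start, size_t size, bool supportsCompressed) {
    const uint32_t kEbreak = 0x00100073;   // EBREAK
    const uint16_t kCEbreak = 0x9002;      // C.EBREAK
    size_t i = 0;
    for (; i + 4 <= size; i += 4)
        memcpy(start + i, &kEbreak, 4);    // whole words first
    if (supportsCompressed && i + 2 <= size)
        memcpy(start + i, &kCEbreak, 2);   // odd halfword tail
}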

View file

@@ -47,6 +47,8 @@ enum RiscVReg {
V8, V9, V10, V11, V12, V13, V14, V15,
V16, V17, V18, V19, V20, V21, V22, V23,
V24, V25, V26, V27, V28, V29, V30, V31,
INVALID_REG = 0xFFFFFFFF,
};
enum class FixupBranchType {
@@ -302,7 +304,7 @@ public:
void ECALL();
void EBREAK();
// 64-bit instructions - oens ending in W sign extend result to 32 bits.
// 64-bit instructions - ones ending in W sign extend result to 32 bits.
void LWU(RiscVReg rd, RiscVReg addr, s32 simm12);
void LD(RiscVReg rd, RiscVReg addr, s32 simm12);
void SD(RiscVReg rs2, RiscVReg addr, s32 simm12);
@@ -1024,13 +1026,14 @@ private:
writable_ += 2;
}
protected:
const u8 *code_ = nullptr;
u8 *writable_ = nullptr;
const u8 *lastCacheFlushEnd_ = nullptr;
bool autoCompress_ = false;
};
class MIPSCodeBlock : public CodeBlock<RiscVEmitter> {
class RiscVCodeBlock : public CodeBlock<RiscVEmitter> {
private:
void PoisonMemory(int offset) override;
};

View file

@@ -350,6 +350,7 @@ int PSPOskDialog::Init(u32 oskPtr) {
// Eat any keys pressed before the dialog inited.
UpdateButtons();
InitCommon();
std::lock_guard<std::mutex> guard(nativeMutex_);
nativeStatus_ = PSPOskNativeStatus::IDLE;
@@ -954,6 +955,7 @@ int PSPOskDialog::Update(int animSpeed) {
const int framesHeldRepeatRate = 5;
UpdateButtons();
UpdateCommon();
int selectedRow = selectedChar / numKeyCols[currentKeyboard];
int selectedExtra = selectedChar % numKeyCols[currentKeyboard];

View file

@@ -698,7 +698,7 @@ JitBlockDebugInfo JitBlockCache::GetBlockDebugInfo(int blockNum) const {
debugInfo.targetDisasm = DisassembleArm64(block->normalEntry, block->codeSize);
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
debugInfo.targetDisasm = DisassembleX86(block->normalEntry, block->codeSize);
#elif PPSSPP_ARCH(ARM64)
#elif PPSSPP_ARCH(RISCV64)
debugInfo.targetDisasm = DisassembleRV64(block->normalEntry, block->codeSize);
#endif

View file

@@ -60,7 +60,7 @@ namespace MIPSComp {
useStaticAlloc = false;
enablePointerify = false;
#if PPSSPP_ARCH(ARM64)
#if PPSSPP_ARCH(ARM64) || PPSSPP_ARCH(RISCV64)
useStaticAlloc = !Disabled(JitDisable::STATIC_ALLOC);
// iOS/etc. may disable at runtime if Memory::base is not nicely aligned.
enablePointerify = !Disabled(JitDisable::POINTERIFY);

View file

@@ -722,7 +722,7 @@ void VertexDecoderJitCache::Jit_NormalS16() {
void VertexDecoderJitCache::Jit_NormalFloat() {
// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
if ((dec_->posoff & 7) == 0 && (dec_->decFmt.posoff & 7) == 0) {
if ((dec_->nrmoff & 7) == 0 && (dec_->decFmt.nrmoff & 7) == 0) {
LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->nrmoff);
STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.nrmoff);
} else {

View file

@@ -37,6 +37,8 @@
#include "Common/Arm64Emitter.h"
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#include "Common/x64Emitter.h"
#elif PPSSPP_ARCH(RISCV64)
#include "Common/RiscVEmitter.h"
#else
#include "Common/FakeEmitter.h"
#endif
@@ -489,6 +491,8 @@ public:
#define VERTEXDECODER_JIT_BACKEND Arm64Gen::ARM64CodeBlock
#elif PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
#define VERTEXDECODER_JIT_BACKEND Gen::XCodeBlock
#elif PPSSPP_ARCH(RISCV64)
#define VERTEXDECODER_JIT_BACKEND RiscVGen::RiscVCodeBlock
#endif

View file

@@ -0,0 +1,604 @@
// Copyright (c) 2023- PPSSPP Project.
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "ppsspp_config.h"
#if PPSSPP_ARCH(RISCV64)
#include "Common/CPUDetect.h"
#include "Common/Log.h"
#include "Common/RiscVEmitter.h"
#include "Core/MIPS/JitCommon/JitCommon.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"
static const float by128 = 1.0f / 128.0f;
static const float by16384 = 1.0f / 16384.0f;
static const float by32768 = 1.0f / 32768.0f;
static const float const65535 = 65535.0f;
using namespace RiscVGen;
static const RiscVReg srcReg = X10;
static const RiscVReg dstReg = X11;
static const RiscVReg counterReg = X12;
static const RiscVReg tempReg1 = X13;
static const RiscVReg tempReg2 = X14;
static const RiscVReg tempReg3 = X15;
static const RiscVReg scratchReg = X16;
static const RiscVReg fullAlphaReg = X17;
static const RiscVReg boundsMinUReg = X28;
static const RiscVReg boundsMinVReg = X29;
static const RiscVReg boundsMaxUReg = X30;
static const RiscVReg boundsMaxVReg = X31;
static const RiscVReg fpScratchReg1 = F10;
static const RiscVReg fpScratchReg2 = F11;
static const RiscVReg fpScratchReg3 = F12;
static const RiscVReg fpSrc[3] = { F13, F14, F15 };
struct UVScaleRegs {
struct {
RiscVReg u;
RiscVReg v;
} scale;
struct {
RiscVReg u;
RiscVReg v;
} offset;
};
static const UVScaleRegs prescaleRegs = { { F0, F1 }, { F2, F3 } };
// TODO: Use vector, where supported.
static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_TcU8ToFloat, &VertexDecoderJitCache::Jit_TcU8ToFloat},
{&VertexDecoder::Step_TcU16ToFloat, &VertexDecoderJitCache::Jit_TcU16ToFloat},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_TcU8Prescale, &VertexDecoderJitCache::Jit_TcU8Prescale},
{&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale},
{&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale},
{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloatThrough},
{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
{&VertexDecoder::Step_Color5551, &VertexDecoderJitCache::Jit_Color5551},
};
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int32_t *jittedSize) {
dec_ = &dec;
BeginWrite(4096);
const u8 *start = AlignCode16();
SetAutoCompress(true);
bool log = false;
bool prescaleStep = false;
// Look for prescaled texcoord steps
for (int i = 0; i < dec.numSteps_; i++) {
if (dec.steps_[i] == &VertexDecoder::Step_TcU8Prescale ||
dec.steps_[i] == &VertexDecoder::Step_TcU16Prescale ||
dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) {
prescaleStep = true;
}
}
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
LI(tempReg1, &gstate_c.uv);
FL(32, prescaleRegs.scale.u, tempReg1, 0);
FL(32, prescaleRegs.scale.v, tempReg1, 4);
FL(32, prescaleRegs.offset.u, tempReg1, 8);
FL(32, prescaleRegs.offset.v, tempReg1, 12);
if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_8BIT) {
LI(scratchReg, by128);
FMV(FMv::W, FMv::X, fpScratchReg1, scratchReg);
FMUL(32, prescaleRegs.scale.u, prescaleRegs.scale.u, fpScratchReg1);
FMUL(32, prescaleRegs.scale.v, prescaleRegs.scale.v, fpScratchReg1);
FMUL(32, prescaleRegs.offset.u, prescaleRegs.offset.u, fpScratchReg1);
FMUL(32, prescaleRegs.offset.v, prescaleRegs.offset.v, fpScratchReg1);
} else if ((dec.VertexType() & GE_VTYPE_TC_MASK) == GE_VTYPE_TC_16BIT) {
LI(scratchReg, by32768);
FMV(FMv::W, FMv::X, fpScratchReg1, scratchReg);
FMUL(32, prescaleRegs.scale.u, prescaleRegs.scale.u, fpScratchReg1);
FMUL(32, prescaleRegs.scale.v, prescaleRegs.scale.v, fpScratchReg1);
FMUL(32, prescaleRegs.offset.u, prescaleRegs.offset.u, fpScratchReg1);
FMUL(32, prescaleRegs.offset.v, prescaleRegs.offset.v, fpScratchReg1);
}
}
if (dec.col) {
// Or LDB and skip the conditional? This is probably cheaper.
LI(fullAlphaReg, 0xFF);
}
if (dec.tc && dec.throughmode) {
// TODO: Smarter, only when doing bounds.
LI(tempReg1, &gstate_c.vertBounds.minU);
LH(boundsMinUReg, tempReg1, offsetof(KnownVertexBounds, minU));
LH(boundsMaxUReg, tempReg1, offsetof(KnownVertexBounds, maxU));
LH(boundsMinVReg, tempReg1, offsetof(KnownVertexBounds, minV));
LH(boundsMaxVReg, tempReg1, offsetof(KnownVertexBounds, maxV));
}
// TODO: Skipping, prescale.
const u8 *loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
EndWrite();
// Reset the code ptr (effectively undoing what we generated) and return zero to indicate that we failed.
ResetCodePtr(GetOffset(start));
char temp[1024]{};
dec.ToString(temp);
ERROR_LOG(G3D, "Could not compile vertex decoder, failed at step %d: %s", i, temp);
return nullptr;
}
}
ADDI(srcReg, srcReg, dec.VertexSize());
ADDI(dstReg, dstReg, dec.decFmt.stride);
ADDI(counterReg, counterReg, -1);
BLT(R_ZERO, counterReg, loopStart);
if (dec.col) {
LI(tempReg1, &gstate_c.vertexFullAlpha);
FixupBranch skip = BNE(R_ZERO, fullAlphaReg);
SB(fullAlphaReg, tempReg1, 0);
SetJumpTarget(skip);
}
if (dec.tc && dec.throughmode) {
// TODO: Smarter, only when doing bounds.
LI(tempReg1, &gstate_c.vertBounds.minU);
SH(boundsMinUReg, tempReg1, offsetof(KnownVertexBounds, minU));
SH(boundsMaxUReg, tempReg1, offsetof(KnownVertexBounds, maxU));
SH(boundsMinVReg, tempReg1, offsetof(KnownVertexBounds, minV));
SH(boundsMaxVReg, tempReg1, offsetof(KnownVertexBounds, maxV));
}
RET();
FlushIcache();
if (log) {
char temp[1024]{};
dec.ToString(temp);
INFO_LOG(JIT, "=== %s (%d bytes) ===", temp, (int)(GetCodePtr() - start));
std::vector<std::string> lines = DisassembleRV64(start, (int)(GetCodePtr() - start));
for (auto line : lines) {
INFO_LOG(JIT, "%s", line.c_str());
}
INFO_LOG(JIT, "==========");
}
*jittedSize = (int)(GetCodePtr() - start);
EndWrite();
return (JittedVertexDecoder)start;
}
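
The loop the generated code implements is easier to see in scalar form. A minimal C++ sketch of the shape (function and parameter names are invented; the per-attribute conversions are what CompileStep inlines into the body):

#include <cstdint>

// Shape of the emitted decode loop; comments map lines back to the emitter calls.
void EmittedLoopShape(const uint8_t *src, uint8_t *dst, int count,
                      int vertexSize, int stride) {
    do {
        // ...inlined per-step conversions read from src, write to dst...
        src += vertexSize;  // ADDI(srcReg, srcReg, dec.VertexSize())
        dst += stride;      // ADDI(dstReg, dstReg, dec.decFmt.stride)
        count -= 1;         // ADDI(counterReg, counterReg, -1)
    } while (count > 0);    // BLT(R_ZERO, counterReg, loopStart)
}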
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
// See if we find a matching JIT function.
for (size_t i = 0; i < ARRAY_SIZE(jitLookup); i++) {
if (dec.steps_[step] == jitLookup[i].func) {
((*this).*jitLookup[i].jitFunc)();
return true;
}
}
return false;
}
void VertexDecoderJitCache::Jit_TcU8ToFloat() {
Jit_AnyU8ToFloat(dec_->tcoff, 16);
FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcU16ToFloat() {
Jit_AnyU16ToFloat(dec_->tcoff, 32);
FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcFloat() {
// Just copy 64 bits. Might be nice if we could detect misaligned load perf.
LW(tempReg1, srcReg, dec_->tcoff);
LW(tempReg2, srcReg, dec_->tcoff + 4);
SW(tempReg1, dstReg, dec_->decFmt.uvoff);
SW(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcU16ThroughToFloat() {
LHU(tempReg1, srcReg, dec_->tcoff + 0);
LHU(tempReg2, srcReg, dec_->tcoff + 2);
if (cpu_info.RiscV_B) {
MINU(boundsMinUReg, boundsMinUReg, tempReg1);
MAXU(boundsMaxUReg, boundsMaxUReg, tempReg1);
MINU(boundsMinVReg, boundsMinVReg, tempReg2);
MAXU(boundsMaxVReg, boundsMaxVReg, tempReg2);
} else {
auto updateSide = [&](RiscVReg src, bool greater, RiscVReg dst) {
FixupBranch skip = BLT(greater ? dst : src, greater ? src : dst);
MV(dst, src);
SetJumpTarget(skip);
};
updateSide(tempReg1, false, boundsMinUReg);
updateSide(tempReg1, true, boundsMaxUReg);
updateSide(tempReg2, false, boundsMinVReg);
updateSide(tempReg2, true, boundsMaxVReg);
}
FCVT(FConv::S, FConv::WU, fpSrc[0], tempReg1, Round::TOZERO);
FCVT(FConv::S, FConv::WU, fpSrc[1], tempReg2, Round::TOZERO);
FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.uvoff + 4);
}
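
In scalar terms (a sketch under assumed names, not the real decoder API), this step reads a u16 UV pair, widens the running bounds, and converts to float; the Zbb MINU/MAXU path and the branchy fallback compute the same min/max:

#include <algorithm>
#include <cstdint>
#include <cstring>

void TcU16ThroughScalar(const uint8_t *src, float uv[2],
                        uint16_t &minU, uint16_t &maxU,
                        uint16_t &minV, uint16_t &maxV) {
    uint16_t u, v;
    memcpy(&u, src + 0, 2);                              // LHU tempReg1
    memcpy(&v, src + 2, 2);                              // LHU tempReg2
    minU = std::min(minU, u); maxU = std::max(maxU, u);  // MINU/MAXU (or branches)
    minV = std::min(minV, v); maxV = std::max(maxV, v);
    uv[0] = (float)u;                                    // FCVT.S.WU
    uv[1] = (float)v;
}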
void VertexDecoderJitCache::Jit_TcFloatThrough() {
// Just copy 64 bits. Might be nice if we could detect misaligned load perf.
LW(tempReg1, srcReg, dec_->tcoff);
LW(tempReg2, srcReg, dec_->tcoff + 4);
SW(tempReg1, dstReg, dec_->decFmt.uvoff);
SW(tempReg2, dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcU8Prescale() {
LBU(tempReg1, srcReg, dec_->tcoff + 0);
LBU(tempReg2, srcReg, dec_->tcoff + 1);
FCVT(FConv::S, FConv::WU, fpSrc[0], tempReg1, Round::TOZERO);
FCVT(FConv::S, FConv::WU, fpSrc[1], tempReg2, Round::TOZERO);
FMADD(32, fpSrc[0], fpSrc[0], prescaleRegs.scale.u, prescaleRegs.offset.u);
FMADD(32, fpSrc[1], fpSrc[1], prescaleRegs.scale.v, prescaleRegs.offset.v);
FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcU16Prescale() {
LHU(tempReg1, srcReg, dec_->tcoff + 0);
LHU(tempReg2, srcReg, dec_->tcoff + 2);
FCVT(FConv::S, FConv::WU, fpSrc[0], tempReg1, Round::TOZERO);
FCVT(FConv::S, FConv::WU, fpSrc[1], tempReg2, Round::TOZERO);
FMADD(32, fpSrc[0], fpSrc[0], prescaleRegs.scale.u, prescaleRegs.offset.u);
FMADD(32, fpSrc[1], fpSrc[1], prescaleRegs.scale.v, prescaleRegs.offset.v);
FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
FL(32, fpSrc[0], srcReg, dec_->tcoff + 0);
FL(32, fpSrc[1], srcReg, dec_->tcoff + 4);
FMADD(32, fpSrc[0], fpSrc[0], prescaleRegs.scale.u, prescaleRegs.offset.u);
FMADD(32, fpSrc[1], fpSrc[1], prescaleRegs.scale.v, prescaleRegs.offset.v);
FS(32, fpSrc[0], dstReg, dec_->decFmt.uvoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.uvoff + 4);
}
void VertexDecoderJitCache::Jit_NormalS8() {
LB(tempReg1, srcReg, dec_->nrmoff);
LB(tempReg2, srcReg, dec_->nrmoff + 1);
LB(tempReg3, srcReg, dec_->nrmoff + 2);
SB(tempReg1, dstReg, dec_->decFmt.nrmoff);
SB(tempReg2, dstReg, dec_->decFmt.nrmoff + 1);
SB(tempReg3, dstReg, dec_->decFmt.nrmoff + 2);
SB(R_ZERO, dstReg, dec_->decFmt.nrmoff + 3);
}
void VertexDecoderJitCache::Jit_NormalS16() {
LH(tempReg1, srcReg, dec_->nrmoff);
LH(tempReg2, srcReg, dec_->nrmoff + 2);
LH(tempReg3, srcReg, dec_->nrmoff + 4);
SH(tempReg1, dstReg, dec_->decFmt.nrmoff);
SH(tempReg2, dstReg, dec_->decFmt.nrmoff + 2);
SH(tempReg3, dstReg, dec_->decFmt.nrmoff + 4);
SH(R_ZERO, dstReg, dec_->decFmt.nrmoff + 6);
}
void VertexDecoderJitCache::Jit_NormalFloat() {
// Just copy 12 bytes, play with over read/write later.
LW(tempReg1, srcReg, dec_->nrmoff);
LW(tempReg2, srcReg, dec_->nrmoff + 4);
LW(tempReg3, srcReg, dec_->nrmoff + 8);
SW(tempReg1, dstReg, dec_->decFmt.nrmoff);
SW(tempReg2, dstReg, dec_->decFmt.nrmoff + 4);
SW(tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
}
void VertexDecoderJitCache::Jit_PosS8() {
Jit_AnyS8ToFloat(dec_->posoff);
FS(32, fpSrc[0], dstReg, dec_->decFmt.posoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.posoff + 4);
FS(32, fpSrc[2], dstReg, dec_->decFmt.posoff + 8);
}
void VertexDecoderJitCache::Jit_PosS16() {
Jit_AnyS16ToFloat(dec_->posoff);
FS(32, fpSrc[0], dstReg, dec_->decFmt.posoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.posoff + 4);
FS(32, fpSrc[2], dstReg, dec_->decFmt.posoff + 8);
}
void VertexDecoderJitCache::Jit_PosFloat() {
// Just copy 12 bytes, play with over read/write later.
LW(tempReg1, srcReg, dec_->posoff);
LW(tempReg2, srcReg, dec_->posoff + 4);
LW(tempReg3, srcReg, dec_->posoff + 8);
SW(tempReg1, dstReg, dec_->decFmt.posoff);
SW(tempReg2, dstReg, dec_->decFmt.posoff + 4);
SW(tempReg3, dstReg, dec_->decFmt.posoff + 8);
}
void VertexDecoderJitCache::Jit_PosS8Through() {
// 8-bit positions in throughmode always decode to 0, depth included.
SW(R_ZERO, dstReg, dec_->decFmt.posoff);
SW(R_ZERO, dstReg, dec_->decFmt.posoff + 4);
SW(R_ZERO, dstReg, dec_->decFmt.posoff + 8);
}
void VertexDecoderJitCache::Jit_PosS16Through() {
// Start with X and Y (which are signed.)
LH(tempReg1, srcReg, dec_->posoff + 0);
LH(tempReg2, srcReg, dec_->posoff + 2);
// This one, Z, has to be unsigned.
LHU(tempReg3, srcReg, dec_->posoff + 4);
FCVT(FConv::S, FConv::W, fpSrc[0], tempReg1, Round::TOZERO);
FCVT(FConv::S, FConv::W, fpSrc[1], tempReg2, Round::TOZERO);
FCVT(FConv::S, FConv::WU, fpSrc[2], tempReg3, Round::TOZERO);
FS(32, fpSrc[0], dstReg, dec_->decFmt.posoff);
FS(32, fpSrc[1], dstReg, dec_->decFmt.posoff + 4);
FS(32, fpSrc[2], dstReg, dec_->decFmt.posoff + 8);
}
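
The signed/unsigned split matters here: X and Y are signed 16-bit, Z is unsigned, which is why the converts differ. A scalar sketch (invented names) of what the loads and converts above compute:

#include <cstdint>
#include <cstring>

void PosS16ThroughScalar(const uint8_t *src, float pos[3]) {
    int16_t x, y;
    uint16_t z;
    memcpy(&x, src + 0, 2);  // LH (sign-extends)
    memcpy(&y, src + 2, 2);  // LH
    memcpy(&z, src + 4, 2);  // LHU (zero-extends)
    pos[0] = (float)x;       // FCVT.S.W
    pos[1] = (float)y;       // FCVT.S.W
    pos[2] = (float)z;       // FCVT.S.WU
}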
void VertexDecoderJitCache::Jit_PosFloatThrough() {
// Start by copying 8 bytes, then handle Z separately to clamp it.
LW(tempReg1, srcReg, dec_->posoff);
LW(tempReg2, srcReg, dec_->posoff + 4);
FL(32, fpSrc[2], srcReg, dec_->posoff + 8);
SW(tempReg1, dstReg, dec_->decFmt.posoff);
SW(tempReg2, dstReg, dec_->decFmt.posoff + 4);
// Load the constants to clamp. Maybe could static alloc this constant in a reg.
LI(scratchReg, const65535);
FMV(FMv::W, FMv::X, fpScratchReg1, R_ZERO);
FMV(FMv::W, FMv::X, fpScratchReg2, scratchReg);
FMAX(32, fpSrc[2], fpSrc[2], fpScratchReg1);
FMIN(32, fpSrc[2], fpSrc[2], fpScratchReg2);
FS(32, fpSrc[2], dstReg, dec_->decFmt.posoff + 8);
}
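
The Z clamp reduces to a min/max pair against 0 and 65535; in scalar C++ (sketch):

#include <algorithm>

// FMAX against 0.0f, then FMIN against 65535.0f.
inline float ClampThroughZ(float z) {
    return std::min(std::max(z, 0.0f), 65535.0f);
}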
void VertexDecoderJitCache::Jit_Color8888() {
LW(tempReg1, srcReg, dec_->coloff);
// Set tempReg2=-1 if full alpha, 0 otherwise.
SRLI(tempReg2, tempReg1, 24);
SLTIU(tempReg2, tempReg2, 0xFF);
ADDI(tempReg2, tempReg2, -1);
// Now use that as a mask to clear fullAlpha.
AND(fullAlphaReg, fullAlphaReg, tempReg2);
SW(tempReg1, dstReg, dec_->decFmt.c0off);
}
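
The SLTIU/ADDI pair is a branch-free way to turn "alpha == 0xFF" into an all-ones or all-zeros mask. A scalar sketch (names invented) of the same computation:

#include <cstdint>

uint32_t Color8888Scalar(uint32_t c, uint32_t &fullAlphaFlag) {
    uint32_t alpha = c >> 24;         // SRLI
    uint32_t notFull = alpha < 0xFF;  // SLTIU: 1 if not full alpha
    uint32_t mask = notFull - 1;      // ADDI -1: 0 if not full, ~0u if full
    fullAlphaFlag &= mask;            // AND: clears flag on non-full alpha
    return c;                         // color stored unchanged
}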
void VertexDecoderJitCache::Jit_Color4444() {
LHU(tempReg1, srcReg, dec_->coloff);
// Red...
ANDI(tempReg2, tempReg1, 0x0F);
// Move green left to position 8.
ANDI(tempReg3, tempReg1, 0xF0);
SLLI(tempReg3, tempReg3, 4);
OR(tempReg2, tempReg2, tempReg3);
// For blue, we modify tempReg1 since immediates are sign extended after 11 bits.
SRLI(tempReg1, tempReg1, 8);
ANDI(tempReg3, tempReg1, 0x0F);
SLLI(tempReg3, tempReg3, 16);
OR(tempReg2, tempReg2, tempReg3);
// And now alpha, shifted left by 20 to land at bit 24.
ANDI(tempReg3, tempReg1, 0xF0);
SLLI(tempReg3, tempReg3, 20);
OR(tempReg2, tempReg2, tempReg3);
// Now we swizzle.
SLLI(tempReg3, tempReg2, 4);
OR(tempReg2, tempReg2, tempReg3);
// Color is done, now let's set the fullAlphaReg flag from tempReg1 (still has alpha.)
// Set tempReg1=-1 if full alpha, 0 otherwise.
SLTIU(tempReg1, tempReg1, 0xF0);
ADDI(tempReg1, tempReg1, -1);
// Now use that as a mask to clear fullAlpha.
AND(fullAlphaReg, fullAlphaReg, tempReg1);
SW(tempReg2, dstReg, dec_->decFmt.c0off);
}
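
What the 4444 expansion computes, as a scalar sketch: gather the four nibbles into byte lanes, then OR the value with itself shifted left by 4 so each nibble 0xA becomes 0xAA.

#include <cstdint>

uint32_t Color4444Scalar(uint16_t c) {
    uint32_t r = c & 0xF;
    uint32_t g = (c >> 4) & 0xF;
    uint32_t b = (c >> 8) & 0xF;
    uint32_t a = (c >> 12) & 0xF;
    uint32_t nibbles = r | (g << 8) | (b << 16) | (a << 24);
    return nibbles | (nibbles << 4);  // replicate each nibble into a full byte
}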
void VertexDecoderJitCache::Jit_Color565() {
LHU(tempReg1, srcReg, dec_->coloff);
// Start by extracting green.
SRLI(tempReg2, tempReg1, 5);
ANDI(tempReg2, tempReg2, 0x3F);
// And now swizzle 6 -> 8, using a wall to clear bits.
SRLI(tempReg3, tempReg2, 4);
SLLI(tempReg3, tempReg3, 8);
SLLI(tempReg2, tempReg2, 2 + 8);
OR(tempReg2, tempReg2, tempReg3);
// Now pull blue out using a wall to isolate it.
SRLI(tempReg3, tempReg1, 11);
// And now isolate red and combine them.
ANDI(tempReg1, tempReg1, 0x1F);
SLLI(tempReg3, tempReg3, 16);
OR(tempReg1, tempReg1, tempReg3);
// Now we swizzle them together.
SRLI(tempReg3, tempReg1, 2);
SLLI(tempReg1, tempReg1, 3);
OR(tempReg1, tempReg1, tempReg3);
// But we have to clear the bits now which is annoying.
LI(tempReg3, 0x00FF00FF);
AND(tempReg1, tempReg1, tempReg3);
// Now add green back in, and then make an alpha FF and add it too.
OR(tempReg1, tempReg1, tempReg2);
LI(tempReg3, (s32)0xFF000000);
OR(tempReg1, tempReg1, tempReg3);
SW(tempReg1, dstReg, dec_->decFmt.c0off);
}
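
The 565 path is the standard bit-replication expansion: r/b go 5 -> 8 via (x << 3) | (x >> 2), g goes 6 -> 8 via (x << 2) | (x >> 4), and alpha is forced to 0xFF. As a scalar sketch:

#include <cstdint>

uint32_t Color565Scalar(uint16_t c) {
    uint32_t r = c & 0x1F;
    uint32_t g = (c >> 5) & 0x3F;
    uint32_t b = (c >> 11) & 0x1F;
    uint32_t r8 = (r << 3) | (r >> 2);  // 5 -> 8 bits
    uint32_t g8 = (g << 2) | (g >> 4);  // 6 -> 8 bits
    uint32_t b8 = (b << 3) | (b >> 2);
    return r8 | (g8 << 8) | (b8 << 16) | 0xFF000000u;  // LI 0xFF000000; OR
}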
void VertexDecoderJitCache::Jit_Color5551() {
LHU(tempReg1, srcReg, dec_->coloff);
// Separate each color.
SRLI(tempReg2, tempReg1, 5);
SRLI(tempReg3, tempReg1, 10);
// Set scratchReg to -1 if the alpha bit is set.
SLLIW(scratchReg, tempReg1, 16);
SRAIW(scratchReg, scratchReg, 31);
// Now we can mask the flag.
AND(fullAlphaReg, fullAlphaReg, scratchReg);
// Let's move alpha into position.
SLLI(scratchReg, scratchReg, 24);
// Mask each.
ANDI(tempReg1, tempReg1, 0x1F);
ANDI(tempReg2, tempReg2, 0x1F);
ANDI(tempReg3, tempReg3, 0x1F);
// And shift into position.
SLLI(tempReg2, tempReg2, 8);
SLLI(tempReg3, tempReg3, 16);
// Combine RGB together.
OR(tempReg1, tempReg1, tempReg2);
OR(tempReg1, tempReg1, tempReg3);
// Swizzle our 5 -> 8
SRLI(tempReg2, tempReg1, 2);
SLLI(tempReg1, tempReg1, 3);
// Mask out the overflow in tempReg2 and combine.
LI(tempReg3, 0x00070707);
AND(tempReg2, tempReg2, tempReg3);
OR(tempReg1, tempReg1, tempReg2);
// Add in alpha and we're done.
OR(tempReg1, tempReg1, scratchReg);
SW(tempReg1, dstReg, dec_->decFmt.c0off);
}
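
And the 5551 path in scalar form (a sketch, names invented): expand the three 5-bit channels in parallel byte lanes, broadcast the alpha bit to a full mask via arithmetic shift, and clear the full-alpha flag when that bit is 0.

#include <cstdint>

uint32_t Color5551Scalar(uint16_t c, uint32_t &fullAlphaFlag) {
    uint32_t r = c & 0x1F;
    uint32_t g = (c >> 5) & 0x1F;
    uint32_t b = (c >> 10) & 0x1F;
    uint32_t alphaMask = (c & 0x8000) ? ~0u : 0u;  // SLLIW/SRAIW broadcast
    fullAlphaFlag &= alphaMask;                    // AND fullAlphaReg
    uint32_t rgb = r | (g << 8) | (b << 16);       // channels in byte lanes
    rgb = (rgb << 3) | ((rgb >> 2) & 0x00070707);  // 5 -> 8 in parallel
    return rgb | (alphaMask & 0xFF000000u);        // SLLI 24; OR
}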
void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
LB(tempReg1, srcReg, srcoff + 0);
LB(tempReg2, srcReg, srcoff + 1);
LB(tempReg3, srcReg, srcoff + 2);
// TODO: Could maybe static alloc?
LI(scratchReg, by128);
FMV(FMv::W, FMv::X, fpScratchReg1, scratchReg);
FCVT(FConv::S, FConv::W, fpSrc[0], tempReg1, Round::TOZERO);
FCVT(FConv::S, FConv::W, fpSrc[1], tempReg2, Round::TOZERO);
FCVT(FConv::S, FConv::W, fpSrc[2], tempReg3, Round::TOZERO);
FMUL(32, fpSrc[0], fpSrc[0], fpScratchReg1);
FMUL(32, fpSrc[1], fpSrc[1], fpScratchReg1);
FMUL(32, fpSrc[2], fpSrc[2], fpScratchReg1);
}
void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
LH(tempReg1, srcReg, srcoff + 0);
LH(tempReg2, srcReg, srcoff + 2);
LH(tempReg3, srcReg, srcoff + 4);
// TODO: Could maybe static alloc?
LI(scratchReg, by32768);
FMV(FMv::W, FMv::X, fpScratchReg1, scratchReg);
FCVT(FConv::S, FConv::W, fpSrc[0], tempReg1, Round::TOZERO);
FCVT(FConv::S, FConv::W, fpSrc[1], tempReg2, Round::TOZERO);
FCVT(FConv::S, FConv::W, fpSrc[2], tempReg3, Round::TOZERO);
FMUL(32, fpSrc[0], fpSrc[0], fpScratchReg1);
FMUL(32, fpSrc[1], fpSrc[1], fpScratchReg1);
FMUL(32, fpSrc[2], fpSrc[2], fpScratchReg1);
}
void VertexDecoderJitCache::Jit_AnyU8ToFloat(int srcoff, u32 bits) {
_dbg_assert_msg_((bits & ~(16 | 8)) == 0, "Bits must be a multiple of 8.");
_dbg_assert_msg_(bits >= 8 && bits <= 24, "Bits must be between 8 and 24.");
LBU(tempReg1, srcReg, srcoff + 0);
if (bits >= 16)
LBU(tempReg2, srcReg, srcoff + 1);
if (bits >= 24)
LBU(tempReg3, srcReg, srcoff + 2);
// TODO: Could maybe static alloc?
LI(scratchReg, by128);
FMV(FMv::W, FMv::X, fpScratchReg1, scratchReg);
FCVT(FConv::S, FConv::WU, fpSrc[0], tempReg1, Round::TOZERO);
if (bits >= 16)
FCVT(FConv::S, FConv::WU, fpSrc[1], tempReg2, Round::TOZERO);
if (bits >= 24)
FCVT(FConv::S, FConv::WU, fpSrc[2], tempReg3, Round::TOZERO);
FMUL(32, fpSrc[0], fpSrc[0], fpScratchReg1);
if (bits >= 16)
FMUL(32, fpSrc[1], fpSrc[1], fpScratchReg1);
if (bits >= 24)
FMUL(32, fpSrc[2], fpSrc[2], fpScratchReg1);
}
void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff, u32 bits) {
_dbg_assert_msg_((bits & ~(32 | 16)) == 0, "Bits must be a multiple of 16.");
_dbg_assert_msg_(bits >= 16 && bits <= 48, "Bits must be between 16 and 48.");
LHU(tempReg1, srcReg, srcoff + 0);
if (bits >= 32)
LHU(tempReg2, srcReg, srcoff + 2);
if (bits >= 48)
LHU(tempReg3, srcReg, srcoff + 4);
// TODO: Could maybe static alloc?
LI(scratchReg, by32768);
FMV(FMv::W, FMv::X, fpScratchReg1, scratchReg);
FCVT(FConv::S, FConv::WU, fpSrc[0], tempReg1, Round::TOZERO);
if (bits >= 32)
FCVT(FConv::S, FConv::WU, fpSrc[1], tempReg2, Round::TOZERO);
if (bits >= 48)
FCVT(FConv::S, FConv::WU, fpSrc[2], tempReg3, Round::TOZERO);
FMUL(32, fpSrc[0], fpSrc[0], fpScratchReg1);
if (bits >= 32)
FMUL(32, fpSrc[1], fpSrc[1], fpScratchReg1);
if (bits >= 48)
FMUL(32, fpSrc[2], fpSrc[2], fpScratchReg1);
}
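
All four AnyXToFloat helpers reduce to load, int-to-float convert, multiply by a reciprocal constant. Per-element scalar equivalents (sketch):

#include <cstdint>

float S8ToFloat(int8_t v)    { return (float)v * (1.0f / 128.0f); }    // by128
float U8ToFloat(uint8_t v)   { return (float)v * (1.0f / 128.0f); }
float S16ToFloat(int16_t v)  { return (float)v * (1.0f / 32768.0f); }  // by32768
float U16ToFloat(uint16_t v) { return (float)v * (1.0f / 32768.0f); }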
#endif // PPSSPP_ARCH(RISCV64)

View file

@@ -498,6 +498,7 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|ARM'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="Common\VertexDecoderCommon.cpp" />
<ClCompile Include="Common\VertexDecoderRiscV.cpp" />
<ClCompile Include="Common\VertexDecoderX86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|ARM'">true</ExcludedFromBuild>

View file

@@ -521,6 +521,9 @@
<ClCompile Include="GLES\StencilBufferGLES.cpp">
<Filter>GLES</Filter>
</ClCompile>
<ClCompile Include="Common\VertexDecoderRiscV.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<FxCompile Include="..\assets\shaders\tex_4xbrz.csh">

View file

@@ -145,6 +145,8 @@ void RegCache::SetupABI(const std::vector<Purpose> &args, bool forceRetain) {
for (Reg r : vecTemps)
Add(r, VEC_INVALID);
#endif
#elif PPSSPP_ARCH(RISCV64)
_assert_msg_(false, "Not yet implemented (no vector calling standard yet)");
#elif PPSSPP_ARCH(MIPS)
_assert_msg_(false, "Not yet implemented");
#else

View file

@@ -44,6 +44,8 @@
#include "Common/x64Emitter.h"
#elif PPSSPP_ARCH(MIPS)
#include "Common/MipsEmitter.h"
#elif PPSSPP_ARCH(RISCV64)
#include "Common/RiscVEmitter.h"
#else
#include "Common/FakeEmitter.h"
#endif
@@ -60,6 +62,8 @@ typedef Arm64Gen::ARM64CodeBlock BaseCodeBlock;
typedef Gen::XCodeBlock BaseCodeBlock;
#elif PPSSPP_ARCH(MIPS)
typedef MIPSGen::MIPSCodeBlock BaseCodeBlock;
#elif PPSSPP_ARCH(RISCV64)
typedef RiscVGen::RiscVCodeBlock BaseCodeBlock;
#else
typedef FakeGen::FakeXCodeBlock BaseCodeBlock;
#endif
@@ -169,6 +173,9 @@ struct RegCache {
#elif PPSSPP_ARCH(MIPS)
typedef MIPSGen::MIPSReg Reg;
static constexpr Reg REG_INVALID_VALUE = MIPSGen::INVALID_REG;
#elif PPSSPP_ARCH(RISCV64)
typedef RiscVGen::RiscVReg Reg;
static constexpr Reg REG_INVALID_VALUE = RiscVGen::INVALID_REG;
#else
typedef int Reg;
static constexpr Reg REG_INVALID_VALUE = -1;