From 15cb782f85ff92cd95c5ee89f57a51281503789a Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 1 Dec 2023 20:31:25 -0800 Subject: [PATCH] riscv: Implement Zfa encoding. Not yet enabled/detected. --- Common/RiscVEmitter.cpp | 163 ++++++++++++++++++++++++++++ Common/RiscVEmitter.h | 20 ++++ Core/MIPS/RiscV/RiscVCompFPU.cpp | 18 +-- Core/MIPS/RiscV/RiscVCompSystem.cpp | 10 +- Core/MIPS/RiscV/RiscVCompVec.cpp | 78 +++++++++---- GPU/Common/VertexDecoderRiscV.cpp | 6 +- 6 files changed, 248 insertions(+), 47 deletions(-) diff --git a/Common/RiscVEmitter.cpp b/Common/RiscVEmitter.cpp index 6f11dbe2cc..d1cf5d38ff 100644 --- a/Common/RiscVEmitter.cpp +++ b/Common/RiscVEmitter.cpp @@ -75,6 +75,11 @@ static inline bool SupportsFloatHalf(bool allowMin = false) { return false; } +static inline bool SupportsFloatExtra() { + // TODO: cpu_info.RiscV_Zfa + return false; +} + enum class Opcode32 { // Note: invalid, just used for FixupBranch. ZERO = 0b0000000, @@ -156,6 +161,8 @@ enum class Funct3 { FMIN = 0b000, FMAX = 0b001, + FMINM = 0b010, + FMAXM = 0b011, FMV = 0b000, FCLASS = 0b001, @@ -2216,6 +2223,162 @@ void RiscVEmitter::FCLASS(int bits, RiscVReg rd, RiscVReg rs1) { Write32(EncodeR(Opcode32::OP_FP, rd, Funct3::FCLASS, rs1, F0, BitsToFunct2(bits), Funct5::FMV_TOX)); } +static const uint32_t FLIvalues[32] = { + 0xBF800000, // -1.0 + 0x00800000, // FLT_MIN (note: a bit special) + 0x37800000, // pow(2, -16) + 0x38000000, // pow(2, -15) + 0x3B800000, // pow(2, -8) + 0x3C000000, // pow(2, -7) + 0x3D800000, // 0.0625 + 0x3E000000, // 0.125 + 0x3E800000, // 0.25 + 0x3EA00000, // 0.3125 + 0x3EC00000, // 0.375 + 0x3EE00000, // 0.4375 + 0x3F000000, // 0.5 + 0x3F200000, // 0.625 + 0x3F400000, // 0.75 + 0x3F600000, // 0.875 + 0x3F800000, // 1.0 + 0x3FA00000, // 1.25 + 0x3FC00000, // 1.5 + 0x3FE00000, // 1.75 + 0x40000000, // 2.0 + 0x40200000, // 2.5 + 0x40400000, // 3.0 + 0x40800000, // 4.0 + 0x41000000, // 8.0 + 0x41800000, // 16.0 + 0x43000000, // 128.0 + 0x43800000, // 
256.0 + 0x47000000, // pow(2, 15) + 0x47800000, // pow(2, 16) + 0x7F800000, // INFINITY + 0x7FC00000, // NAN +}; + +static RiscVReg EncodeFLImm(int bits, double v) { + float f = (float)v; + int index = -1; + for (size_t i = 0; i < ARRAY_SIZE(FLIvalues); ++i) { + if (memcmp(&f, &FLIvalues[i], sizeof(float)) == 0) { + index = (int)i; + break; + } + } + + // For 16-bit, 2/3 are subnormal and 29 is not possible. Just avoid for now. + if (index != -1 && index != 1 && (bits > 16 || (index != 2 && index != 3 && index != 29))) + return (RiscVReg)index; + + if (bits == 64) { + uint64_t dmin = 0x0010000000000000ULL; + if (memcmp(&v, &dmin, 8) == 0) + return F1; + } else if (bits == 32 && index == 1) { + return F1; + } else if (bits == 16) { + uint64_t hmin = 0x3F10000000000000ULL; + if (memcmp(&v, &hmin, 8) == 0) + return F1; + } + + return INVALID_REG; +} + +bool RiscVEmitter::CanFLI(int bits, double v) const { + if (!SupportsFloatExtra()) + return false; + if (bits == 16 && !SupportsFloatHalf()) + return false; + if (bits > FloatBitsSupported()) + return false; + return EncodeFLImm(bits, v) != INVALID_REG; +} + +bool RiscVEmitter::CanFLI(int bits, uint32_t pattern) const { + float f; + memcpy(&f, &pattern, sizeof(f)); + return CanFLI(bits, f); +} + +void RiscVEmitter::FLI(int bits, RiscVReg rd, double v) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + _assert_msg_(bits <= FloatBitsSupported(), "FLI cannot be used for %d bits, only %d/%d supported", bits, BitsSupported(), FloatBitsSupported()); + _assert_msg_(IsFPR(rd), "%s rd of wrong type", __func__); + + RiscVReg imm = EncodeFLImm(bits, v); + _assert_msg_(imm != INVALID_REG, "FLI with unsupported constant %f for %d bits", v, bits); + Write32(EncodeR(Opcode32::OP_FP, rd, Funct3::FMV, imm, F1, BitsToFunct2(bits, false), Funct5::FMV_FROMX)); +} + +void RiscVEmitter::FLI(int bits, RiscVReg rd, uint32_t pattern) { + float f; + memcpy(&f, &pattern, sizeof(f)); + FLI(bits, rd, f); +} + +void 
RiscVEmitter::FMINM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + Write32(EncodeFR(Opcode32::OP_FP, rd, Funct3::FMINM, rs1, rs2, BitsToFunct2(bits), Funct5::FMINMAX)); +} + +void RiscVEmitter::FMAXM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + Write32(EncodeFR(Opcode32::OP_FP, rd, Funct3::FMAXM, rs1, rs2, BitsToFunct2(bits), Funct5::FMINMAX)); +} + +void RiscVEmitter::FROUND(int bits, RiscVReg rd, RiscVReg rs1, Round rm) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + _assert_msg_(bits <= FloatBitsSupported(), "FROUND for %d float bits, only %d supported", bits, FloatBitsSupported()); + _assert_msg_(IsFPR(rd), "%s rd of wrong type", __func__); + _assert_msg_(IsFPR(rs1), "%s rs1 of wrong type", __func__); + + Funct2 toFmt = BitsToFunct2(bits, false); + Write32(EncodeR(Opcode32::OP_FP, rd, (Funct3)rm, rs1, F4, toFmt, Funct5::FCVT_SZ)); +} + +void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, double v, RiscVReg scratchReg) { + if (CanFLI(bits, v)) { + FLI(bits, rd, v); + } else if (bits == 64) { + LI(scratchReg, v); + FMV(FMv::D, FMv::X, rd, scratchReg); + } else if (bits <= 32) { + QuickFLI(32, rd, (float)v, scratchReg); + } else { + _assert_msg_(false, "Unsupported QuickFLI bits"); + } +} + +void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, uint32_t pattern, RiscVReg scratchReg) { + if (CanFLI(bits, pattern)) { + FLI(bits, rd, pattern); + } else if (bits == 32) { + LI(scratchReg, (int32_t)pattern); + FMV(FMv::W, FMv::X, rd, scratchReg); + } else if (bits == 16) { + LI(scratchReg, (int16_t)pattern); + FMV(FMv::H, FMv::X, rd, scratchReg); + } else { + _assert_msg_(false, "Unsupported QuickFLI bits"); + } +} + +void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, float v, RiscVReg scratchReg) { + if (CanFLI(bits, v)) { + FLI(bits, rd, v); + } else if (bits 
== 64) { + QuickFLI(32, rd, (double)v, scratchReg); + } else if (bits == 32) { + LI(scratchReg, v); + FMV(FMv::W, FMv::X, rd, scratchReg); + } else { + _assert_msg_(false, "Unsupported QuickFLI bits"); + } +} + void RiscVEmitter::CSRRW(RiscVReg rd, Csr csr, RiscVReg rs1) { _assert_msg_(SupportsZicsr(), "%s instruction not supported", __func__); _assert_msg_((u32)csr <= 0x00000FFF, "%s with invalid CSR number", __func__); diff --git a/Common/RiscVEmitter.h b/Common/RiscVEmitter.h index 9fe5d87164..e885f52e6e 100644 --- a/Common/RiscVEmitter.h +++ b/Common/RiscVEmitter.h @@ -421,6 +421,26 @@ public: void FLE(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2); void FCLASS(int bits, RiscVReg rd, RiscVReg rs1); + // Additional floating point (Zfa.) + bool CanFLI(int bits, double v) const; + bool CanFLI(int bits, uint32_t pattern) const; + bool CanFLI(int bits, float v) const { + return CanFLI(bits, (double)v); + } + void FLI(int bits, RiscVReg rd, double v); + void FLI(int bits, RiscVReg rd, uint32_t pattern); + void FLI(int bits, RiscVReg rd, float v) { + FLI(bits, rd, (double)v); + } + void FMINM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2); + void FMAXM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2); + void FROUND(int bits, RiscVReg rd, RiscVReg rs1, Round rm = Round::DYNAMIC); + + // Convenience helper for FLI support. + void QuickFLI(int bits, RiscVReg rd, double v, RiscVReg scratchReg); + void QuickFLI(int bits, RiscVReg rd, uint32_t pattern, RiscVReg scratchReg); + void QuickFLI(int bits, RiscVReg rd, float v, RiscVReg scratchReg); + // Control state register manipulation. 
void CSRRW(RiscVReg rd, Csr csr, RiscVReg rs1); void CSRRS(RiscVReg rd, Csr csr, RiscVReg rs1); diff --git a/Core/MIPS/RiscV/RiscVCompFPU.cpp b/Core/MIPS/RiscV/RiscVCompFPU.cpp index 132ef8e58c..eb26e5caed 100644 --- a/Core/MIPS/RiscV/RiscVCompFPU.cpp +++ b/Core/MIPS/RiscV/RiscVCompFPU.cpp @@ -246,8 +246,7 @@ void RiscVJitBackend::CompIR_FCvt(IRInst inst) { tempReg = regs_.MapWithFPRTemp(inst); // Prepare the multiplier. - LI(SCRATCH1, 1UL << (inst.src2 & 0x1F)); - FCVT(FConv::S, FConv::WU, tempReg, SCRATCH1, rm); + QuickFLI(32, tempReg, (float)(1UL << (inst.src2 & 0x1F)), SCRATCH1); FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1), tempReg, rm); // NAN and clamping should all be correct. @@ -264,8 +263,7 @@ void RiscVJitBackend::CompIR_FCvt(IRInst inst) { FCVT(FConv::S, FConv::W, regs_.F(inst.dest), SCRATCH1); // Pre-divide so we can avoid any actual divide. - LI(SCRATCH1, 1.0f / (1UL << (inst.src2 & 0x1F))); - FMV(FMv::W, FMv::X, tempReg, SCRATCH1); + QuickFLI(32, tempReg, 1.0f / (1UL << (inst.src2 & 0x1F)), SCRATCH1); FMUL(32, regs_.F(inst.dest), regs_.F(inst.dest), tempReg); break; @@ -292,8 +290,7 @@ void RiscVJitBackend::CompIR_FSat(IRInst inst) { FCVT(FConv::S, FConv::W, tempReg, R_ZERO); // FLE here is intentional to convert -0.0 to +0.0. FLE(32, SCRATCH1, regs_.F(inst.src1), tempReg); - LI(SCRATCH2, 1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH2); + QuickFLI(32, tempReg, 1.0f, SCRATCH2); FLT(32, SCRATCH2, tempReg, regs_.F(inst.src1)); skipLower = BEQ(SCRATCH1, R_ZERO); @@ -315,8 +312,7 @@ void RiscVJitBackend::CompIR_FSat(IRInst inst) { FMV(32, regs_.F(inst.dest), regs_.F(inst.src1)); // First, set SCRATCH1 = clamp to negative, SCRATCH2 = clamp to positive. 
- LI(SCRATCH2, -1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH2); + QuickFLI(32, tempReg, -1.0f, SCRATCH2); FLT(32, SCRATCH1, regs_.F(inst.src1), tempReg); FNEG(32, tempReg, tempReg); FLT(32, SCRATCH2, tempReg, regs_.F(inst.src1)); @@ -621,8 +617,7 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) { FSQRT(32, regs_.F(inst.dest), regs_.F(inst.src1)); // Ugh, we can't really avoid a temp here. Probably not worth a permanent one. - LI(SCRATCH1, 1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH1); + QuickFLI(32, tempReg, 1.0f, SCRATCH1); FDIV(32, regs_.F(inst.dest), tempReg, regs_.F(inst.dest)); break; @@ -635,8 +630,7 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) { FDIV(32, regs_.F(inst.dest), regs_.F(inst.dest), regs_.F(inst.src1)); } else { tempReg = regs_.MapWithFPRTemp(inst); - LI(SCRATCH1, 1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH1); + QuickFLI(32, tempReg, 1.0f, SCRATCH1); FDIV(32, regs_.F(inst.dest), tempReg, regs_.F(inst.src1)); } break; diff --git a/Core/MIPS/RiscV/RiscVCompSystem.cpp b/Core/MIPS/RiscV/RiscVCompSystem.cpp index a291d85930..4bba8a9ece 100644 --- a/Core/MIPS/RiscV/RiscVCompSystem.cpp +++ b/Core/MIPS/RiscV/RiscVCompSystem.cpp @@ -50,14 +50,10 @@ void RiscVJitBackend::CompIR_Basic(IRInst inst) { case IROp::SetConstF: regs_.Map(inst); - if (inst.constant == 0) { + if (inst.constant == 0) FCVT(FConv::S, FConv::W, regs_.F(inst.dest), R_ZERO); - } else { - // TODO: In the future, could use FLI if it's approved. - // Also, is FCVT faster? 
- LI(SCRATCH1, (int32_t)inst.constant); - FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); - } + else + QuickFLI(32, regs_.F(inst.dest), inst.constant, SCRATCH1); break; case IROp::Downcount: diff --git a/Core/MIPS/RiscV/RiscVCompVec.cpp b/Core/MIPS/RiscV/RiscVCompVec.cpp index b220d0ce8e..8118fc7195 100644 --- a/Core/MIPS/RiscV/RiscVCompVec.cpp +++ b/Core/MIPS/RiscV/RiscVCompVec.cpp @@ -54,56 +54,86 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) { break; case Vec4Init::AllONE: - LI(SCRATCH1, 1.0f); - FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); - for (int i = 1; i < 4; ++i) - FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + if (CanFLI(32, 1.0f)) { + for (int i = 0; i < 4; ++i) + FLI(32, regs_.F(inst.dest + i), 1.0f); + } else { + LI(SCRATCH1, 1.0f); + FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); + for (int i = 1; i < 4; ++i) + FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + } break; case Vec4Init::AllMinusONE: - LI(SCRATCH1, -1.0f); - FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); - for (int i = 1; i < 4; ++i) - FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + if (CanFLI(32, -1.0f)) { + for (int i = 0; i < 4; ++i) + FLI(32, regs_.F(inst.dest + i), -1.0f); + } else { + LI(SCRATCH1, -1.0f); + FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); + for (int i = 1; i < 4; ++i) + FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + } break; case Vec4Init::Set_1000: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 0) - FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 0) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; case Vec4Init::Set_0100: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 1) - FMV(FMv::W, FMv::X, 
regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 1) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; case Vec4Init::Set_0010: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 2) - FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 2) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; case Vec4Init::Set_0001: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 3) - FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 3) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; } diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp index 41e44f7106..6a69f7fce5 100644 --- a/GPU/Common/VertexDecoderRiscV.cpp +++ b/GPU/Common/VertexDecoderRiscV.cpp @@ -223,10 +223,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int } // TODO: Only load these when needed? - LI(scratchReg, by128); - FMV(FMv::W, FMv::X, by128Reg, scratchReg); - LI(scratchReg, by32768); - FMV(FMv::W, FMv::X, by32768Reg, scratchReg); + QuickFLI(32, by128Reg, by128, scratchReg); + QuickFLI(32, by32768Reg, by32768, scratchReg); if (posThroughStep) { LI(scratchReg, const65535); FMV(FMv::W, FMv::X, const65535Reg, scratchReg);