From 15cb782f85ff92cd95c5ee89f57a51281503789a Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Fri, 1 Dec 2023 20:31:25 -0800 Subject: [PATCH] riscv: Implement Zfa encoding. Not yet enabled/detected. --- Common/RiscVEmitter.cpp | 163 ++++++++++++++++++++++++++++ Common/RiscVEmitter.h | 20 ++++ Core/MIPS/RiscV/RiscVCompFPU.cpp | 18 +-- Core/MIPS/RiscV/RiscVCompSystem.cpp | 10 +- Core/MIPS/RiscV/RiscVCompVec.cpp | 78 +++++++++---- GPU/Common/VertexDecoderRiscV.cpp | 6 +- 6 files changed, 248 insertions(+), 47 deletions(-) diff --git a/Common/RiscVEmitter.cpp b/Common/RiscVEmitter.cpp index 6f11dbe2cc..d1cf5d38ff 100644 --- a/Common/RiscVEmitter.cpp +++ b/Common/RiscVEmitter.cpp @@ -75,6 +75,11 @@ static inline bool SupportsFloatHalf(bool allowMin = false) { return false; } +static inline bool SupportsFloatExtra() { + // TODO: cpu_info.RiscV_Zfa + return false; +} + enum class Opcode32 { // Note: invalid, just used for FixupBranch. ZERO = 0b0000000, @@ -156,6 +161,8 @@ enum class Funct3 { FMIN = 0b000, FMAX = 0b001, + FMINM = 0b010, + FMAXM = 0b011, FMV = 0b000, FCLASS = 0b001, @@ -2216,6 +2223,162 @@ void RiscVEmitter::FCLASS(int bits, RiscVReg rd, RiscVReg rs1) { Write32(EncodeR(Opcode32::OP_FP, rd, Funct3::FCLASS, rs1, F0, BitsToFunct2(bits), Funct5::FMV_TOX)); } +static const uint32_t FLIvalues[32] = { + 0xBF800000, // -1.0 + 0x00800000, // FLT_MIN (note: a bit special) + 0x37800000, // pow(2, -16) + 0x38000000, // pow(2, -15) + 0x3B800000, // pow(2, -8) + 0x3C000000, // pow(2, -7) + 0x3D800000, // 0.0625 + 0x3E000000, // 0.125 + 0x3E800000, // 0.25 + 0x3EA00000, // 0.3125 + 0x3EC00000, // 0.375 + 0x3EE00000, // 0.4375 + 0x3F000000, // 0.5 + 0x3F200000, // 0.625 + 0x3F400000, // 0.75 + 0x3F600000, // 0.875 + 0x3F800000, // 1.0 + 0x3FA00000, // 1.25 + 0x3FC00000, // 1.5 + 0x3FE00000, // 1.75 + 0x40000000, // 2.0 + 0x40200000, // 2.5 + 0x40400000, // 3.0 + 0x40800000, // 4.0 + 0x41000000, // 8.0 + 0x41800000, // 16.0 + 0x43000000, // 128.0 + 0x43800000, // 
256.0 + 0x47000000, // pow(2, 15) + 0x47800000, // pow(2, 16) + 0x7F800000, // INFINITY + 0x7FC00000, // NAN +}; + +static RiscVReg EncodeFLImm(int bits, double v) { + float f = (float)v; + int index = -1; + for (size_t i = 0; i < ARRAY_SIZE(FLIvalues); ++i) { + if (memcmp(&f, &FLIvalues[i], sizeof(float)) == 0) { + index = (int)i; + break; + } + } + + // For 16-bit, 2/3 are subnormal and 29 is not possible. Just avoid for now. + if (index != -1 && index != 1 && (bits > 16 || (index != 2 && index != 3 && index != 29))) + return (RiscVReg)index; + + if (bits == 64) { + uint64_t dmin = 0x0010000000000000ULL; + if (memcmp(&v, &dmin, 8) == 0) + return F1; + } else if (bits == 32 && index == 1) { + return F1; + } else if (bits == 16) { + uint64_t hmin = 0x3F10000000000000ULL; + if (memcmp(&v, &hmin, 8) == 0) + return F1; + } + + return INVALID_REG; +} + +bool RiscVEmitter::CanFLI(int bits, double v) const { + if (!SupportsFloatExtra()) + return false; + if (bits == 16 && !SupportsFloatHalf()) + return false; + if (bits > FloatBitsSupported()) + return false; + return EncodeFLImm(bits, v) != INVALID_REG; +} + +bool RiscVEmitter::CanFLI(int bits, uint32_t pattern) const { + float f; + memcpy(&f, &pattern, sizeof(f)); + return CanFLI(bits, f); +} + +void RiscVEmitter::FLI(int bits, RiscVReg rd, double v) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + _assert_msg_(bits <= FloatBitsSupported(), "FLI cannot be used for %d bits, only %d/%d supported", bits, BitsSupported(), FloatBitsSupported()); + _assert_msg_(IsFPR(rd), "%s rd of wrong type", __func__); + + RiscVReg imm = EncodeFLImm(bits, v); + _assert_msg_(imm != INVALID_REG, "FLI with unsupported constant %f for %d bits", v, bits); + Write32(EncodeR(Opcode32::OP_FP, rd, Funct3::FMV, imm, F1, BitsToFunct2(bits, false), Funct5::FMV_FROMX)); +} + +void RiscVEmitter::FLI(int bits, RiscVReg rd, uint32_t pattern) { + float f; + memcpy(&f, &pattern, sizeof(f)); + FLI(bits, rd, f); +} + +void 
RiscVEmitter::FMINM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + Write32(EncodeFR(Opcode32::OP_FP, rd, Funct3::FMINM, rs1, rs2, BitsToFunct2(bits), Funct5::FMINMAX)); +} + +void RiscVEmitter::FMAXM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + Write32(EncodeFR(Opcode32::OP_FP, rd, Funct3::FMAXM, rs1, rs2, BitsToFunct2(bits), Funct5::FMINMAX)); +} + +void RiscVEmitter::FROUND(int bits, RiscVReg rd, RiscVReg rs1, Round rm) { + _assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__); + _assert_msg_(bits <= FloatBitsSupported(), "FROUND for %d float bits, only %d supported", bits, FloatBitsSupported()); + _assert_msg_(IsFPR(rd), "%s rd of wrong type", __func__); + _assert_msg_(IsFPR(rs1), "%s rs1 of wrong type", __func__); + + Funct2 toFmt = BitsToFunct2(bits, false); + Write32(EncodeR(Opcode32::OP_FP, rd, (Funct3)rm, rs1, F4, toFmt, Funct5::FCVT_SZ)); +} + +void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, double v, RiscVReg scratchReg) { + if (CanFLI(bits, v)) { + FLI(bits, rd, v); + } else if (bits == 64) { + LI(scratchReg, v); + FMV(FMv::D, FMv::X, rd, scratchReg); + } else if (bits <= 32) { + QuickFLI(32, rd, (float)v, scratchReg); + } else { + _assert_msg_(false, "Unsupported QuickFLI bits"); + } +} + +void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, uint32_t pattern, RiscVReg scratchReg) { + if (CanFLI(bits, pattern)) { + FLI(bits, rd, pattern); + } else if (bits == 32) { + LI(scratchReg, (int32_t)pattern); + FMV(FMv::W, FMv::X, rd, scratchReg); + } else if (bits == 16) { + LI(scratchReg, (int16_t)pattern); + FMV(FMv::H, FMv::X, rd, scratchReg); + } else { + _assert_msg_(false, "Unsupported QuickFLI bits"); + } +} + +void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, float v, RiscVReg scratchReg) { + if (CanFLI(bits, v)) { + FLI(bits, rd, v); + } else if (bits 
== 64) { + QuickFLI(32, rd, (double)v, scratchReg); + } else if (bits == 32) { + LI(scratchReg, v); + FMV(FMv::W, FMv::X, rd, scratchReg); + } else { + _assert_msg_(false, "Unsupported QuickFLI bits"); + } +} + void RiscVEmitter::CSRRW(RiscVReg rd, Csr csr, RiscVReg rs1) { _assert_msg_(SupportsZicsr(), "%s instruction not supported", __func__); _assert_msg_((u32)csr <= 0x00000FFF, "%s with invalid CSR number", __func__); diff --git a/Common/RiscVEmitter.h b/Common/RiscVEmitter.h index 9fe5d87164..e885f52e6e 100644 --- a/Common/RiscVEmitter.h +++ b/Common/RiscVEmitter.h @@ -421,6 +421,26 @@ public: void FLE(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2); void FCLASS(int bits, RiscVReg rd, RiscVReg rs1); + // Additional floating point (Zfa.) + bool CanFLI(int bits, double v) const; + bool CanFLI(int bits, uint32_t pattern) const; + bool CanFLI(int bits, float v) const { + return CanFLI(bits, (double)v); + } + void FLI(int bits, RiscVReg rd, double v); + void FLI(int bits, RiscVReg rd, uint32_t pattern); + void FLI(int bits, RiscVReg rd, float v) { + FLI(bits, rd, (double)v); + } + void FMINM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2); + void FMAXM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2); + void FROUND(int bits, RiscVReg rd, RiscVReg rs1, Round rm = Round::DYNAMIC); + + // Convenience helper for FLI support. + void QuickFLI(int bits, RiscVReg rd, double v, RiscVReg scratchReg); + void QuickFLI(int bits, RiscVReg rd, uint32_t pattern, RiscVReg scratchReg); + void QuickFLI(int bits, RiscVReg rd, float v, RiscVReg scratchReg); + // Control state register manipulation. 
void CSRRW(RiscVReg rd, Csr csr, RiscVReg rs1); void CSRRS(RiscVReg rd, Csr csr, RiscVReg rs1); diff --git a/Core/MIPS/RiscV/RiscVCompFPU.cpp b/Core/MIPS/RiscV/RiscVCompFPU.cpp index 132ef8e58c..eb26e5caed 100644 --- a/Core/MIPS/RiscV/RiscVCompFPU.cpp +++ b/Core/MIPS/RiscV/RiscVCompFPU.cpp @@ -246,8 +246,7 @@ void RiscVJitBackend::CompIR_FCvt(IRInst inst) { tempReg = regs_.MapWithFPRTemp(inst); // Prepare the multiplier. - LI(SCRATCH1, 1UL << (inst.src2 & 0x1F)); - FCVT(FConv::S, FConv::WU, tempReg, SCRATCH1, rm); + QuickFLI(32, tempReg, (float)(1UL << (inst.src2 & 0x1F)), SCRATCH1); FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1), tempReg, rm); // NAN and clamping should all be correct. @@ -264,8 +263,7 @@ void RiscVJitBackend::CompIR_FCvt(IRInst inst) { FCVT(FConv::S, FConv::W, regs_.F(inst.dest), SCRATCH1); // Pre-divide so we can avoid any actual divide. - LI(SCRATCH1, 1.0f / (1UL << (inst.src2 & 0x1F))); - FMV(FMv::W, FMv::X, tempReg, SCRATCH1); + QuickFLI(32, tempReg, 1.0f / (1UL << (inst.src2 & 0x1F)), SCRATCH1); FMUL(32, regs_.F(inst.dest), regs_.F(inst.dest), tempReg); break; @@ -292,8 +290,7 @@ void RiscVJitBackend::CompIR_FSat(IRInst inst) { FCVT(FConv::S, FConv::W, tempReg, R_ZERO); // FLE here is intentional to convert -0.0 to +0.0. FLE(32, SCRATCH1, regs_.F(inst.src1), tempReg); - LI(SCRATCH2, 1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH2); + QuickFLI(32, tempReg, 1.0f, SCRATCH2); FLT(32, SCRATCH2, tempReg, regs_.F(inst.src1)); skipLower = BEQ(SCRATCH1, R_ZERO); @@ -315,8 +312,7 @@ void RiscVJitBackend::CompIR_FSat(IRInst inst) { FMV(32, regs_.F(inst.dest), regs_.F(inst.src1)); // First, set SCRATCH1 = clamp to negative, SCRATCH2 = clamp to positive. 
- LI(SCRATCH2, -1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH2); + QuickFLI(32, tempReg, -1.0f, SCRATCH2); FLT(32, SCRATCH1, regs_.F(inst.src1), tempReg); FNEG(32, tempReg, tempReg); FLT(32, SCRATCH2, tempReg, regs_.F(inst.src1)); @@ -621,8 +617,7 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) { FSQRT(32, regs_.F(inst.dest), regs_.F(inst.src1)); // Ugh, we can't really avoid a temp here. Probably not worth a permanent one. - LI(SCRATCH1, 1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH1); + QuickFLI(32, tempReg, 1.0f, SCRATCH1); FDIV(32, regs_.F(inst.dest), tempReg, regs_.F(inst.dest)); break; @@ -635,8 +630,7 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) { FDIV(32, regs_.F(inst.dest), regs_.F(inst.dest), regs_.F(inst.src1)); } else { tempReg = regs_.MapWithFPRTemp(inst); - LI(SCRATCH1, 1.0f); - FMV(FMv::W, FMv::X, tempReg, SCRATCH1); + QuickFLI(32, tempReg, 1.0f, SCRATCH1); FDIV(32, regs_.F(inst.dest), tempReg, regs_.F(inst.src1)); } break; diff --git a/Core/MIPS/RiscV/RiscVCompSystem.cpp b/Core/MIPS/RiscV/RiscVCompSystem.cpp index a291d85930..4bba8a9ece 100644 --- a/Core/MIPS/RiscV/RiscVCompSystem.cpp +++ b/Core/MIPS/RiscV/RiscVCompSystem.cpp @@ -50,14 +50,10 @@ void RiscVJitBackend::CompIR_Basic(IRInst inst) { case IROp::SetConstF: regs_.Map(inst); - if (inst.constant == 0) { + if (inst.constant == 0) FCVT(FConv::S, FConv::W, regs_.F(inst.dest), R_ZERO); - } else { - // TODO: In the future, could use FLI if it's approved. - // Also, is FCVT faster? 
- LI(SCRATCH1, (int32_t)inst.constant); - FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); - } + else + QuickFLI(32, regs_.F(inst.dest), inst.constant, SCRATCH1); break; case IROp::Downcount: diff --git a/Core/MIPS/RiscV/RiscVCompVec.cpp b/Core/MIPS/RiscV/RiscVCompVec.cpp index b220d0ce8e..8118fc7195 100644 --- a/Core/MIPS/RiscV/RiscVCompVec.cpp +++ b/Core/MIPS/RiscV/RiscVCompVec.cpp @@ -54,56 +54,86 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) { break; case Vec4Init::AllONE: - LI(SCRATCH1, 1.0f); - FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); - for (int i = 1; i < 4; ++i) - FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + if (CanFLI(32, 1.0f)) { + for (int i = 0; i < 4; ++i) + FLI(32, regs_.F(inst.dest + i), 1.0f); + } else { + LI(SCRATCH1, 1.0f); + FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); + for (int i = 1; i < 4; ++i) + FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + } break; case Vec4Init::AllMinusONE: - LI(SCRATCH1, -1.0f); - FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); - for (int i = 1; i < 4; ++i) - FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + if (CanFLI(32, -1.0f)) { + for (int i = 0; i < 4; ++i) + FLI(32, regs_.F(inst.dest + i), -1.0f); + } else { + LI(SCRATCH1, -1.0f); + FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1); + for (int i = 1; i < 4; ++i) + FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest)); + } break; case Vec4Init::Set_1000: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 0) - FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 0) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; case Vec4Init::Set_0100: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 1) - FMV(FMv::W, FMv::X, 
regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 1) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; case Vec4Init::Set_0010: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 2) - FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 2) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; case Vec4Init::Set_0001: - LI(SCRATCH1, 1.0f); + if (!CanFLI(32, 1.0f)) + LI(SCRATCH1, 1.0f); for (int i = 0; i < 4; ++i) { - if (i == 3) - FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); - else + if (i == 3) { + if (CanFLI(32, 1.0f)) + FLI(32, regs_.F(inst.dest + i), 1.0f); + else + FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1); + } else { FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO); + } } break; } diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp index 41e44f7106..6a69f7fce5 100644 --- a/GPU/Common/VertexDecoderRiscV.cpp +++ b/GPU/Common/VertexDecoderRiscV.cpp @@ -223,10 +223,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int } // TODO: Only load these when needed? - LI(scratchReg, by128); - FMV(FMv::W, FMv::X, by128Reg, scratchReg); - LI(scratchReg, by32768); - FMV(FMv::W, FMv::X, by32768Reg, scratchReg); + QuickFLI(32, by128Reg, by128, scratchReg); + QuickFLI(32, by32768Reg, by32768, scratchReg); if (posThroughStep) { LI(scratchReg, const65535); FMV(FMv::W, FMv::X, const65535Reg, scratchReg);