riscv: Implement Zfa encoding.

Not yet enabled/detected.
This commit is contained in:
Unknown W. Brackets 2023-12-01 20:31:25 -08:00
parent ecb7f93418
commit 15cb782f85
6 changed files with 248 additions and 47 deletions

View file

@ -75,6 +75,11 @@ static inline bool SupportsFloatHalf(bool allowMin = false) {
return false;
}
// Reports whether the Zfa instructions (FLI, FMINM/FMAXM, FROUND) may be emitted.
// Detection is not wired up yet, so this is currently always false.
static inline bool SupportsFloatExtra() {
	// TODO: cpu_info.RiscV_Zfa
	const bool hasZfa = false;
	return hasZfa;
}
enum class Opcode32 {
// Note: invalid, just used for FixupBranch.
ZERO = 0b0000000,
@ -156,6 +161,8 @@ enum class Funct3 {
FMIN = 0b000,
FMAX = 0b001,
FMINM = 0b010,
FMAXM = 0b011,
FMV = 0b000,
FCLASS = 0b001,
@ -2216,6 +2223,162 @@ void RiscVEmitter::FCLASS(int bits, RiscVReg rd, RiscVReg rs1) {
Write32(EncodeR(Opcode32::OP_FP, rd, Funct3::FCLASS, rs1, F0, BitsToFunct2(bits), Funct5::FMV_TOX));
}
// Single precision bit patterns for the 32 constants FLI can load.
// The position in this table is the immediate index used when encoding FLI.
static const uint32_t FLIvalues[32] = {
	// Special low entries.
	0xBF800000,  //  0: -1.0
	0x00800000,  //  1: FLT_MIN (a bit special: EncodeFLImm handles this slot per width)

	// Small powers of two.
	0x37800000,  //  2: pow(2, -16)
	0x38000000,  //  3: pow(2, -15)
	0x3B800000,  //  4: pow(2, -8)
	0x3C000000,  //  5: pow(2, -7)
	0x3D800000,  //  6: 0.0625
	0x3E000000,  //  7: 0.125

	// Fractions below one.
	0x3E800000,  //  8: 0.25
	0x3EA00000,  //  9: 0.3125
	0x3EC00000,  // 10: 0.375
	0x3EE00000,  // 11: 0.4375
	0x3F000000,  // 12: 0.5
	0x3F200000,  // 13: 0.625
	0x3F400000,  // 14: 0.75
	0x3F600000,  // 15: 0.875

	// One and above.
	0x3F800000,  // 16: 1.0
	0x3FA00000,  // 17: 1.25
	0x3FC00000,  // 18: 1.5
	0x3FE00000,  // 19: 1.75
	0x40000000,  // 20: 2.0
	0x40200000,  // 21: 2.5
	0x40400000,  // 22: 3.0
	0x40800000,  // 23: 4.0
	0x41000000,  // 24: 8.0
	0x41800000,  // 25: 16.0
	0x43000000,  // 26: 128.0
	0x43800000,  // 27: 256.0
	0x47000000,  // 28: pow(2, 15)
	0x47800000,  // 29: pow(2, 16)

	// Non-finite entries.
	0x7F800000,  // 30: INFINITY
	0x7FC00000,  // 31: NAN
};
// Finds the FLI immediate index for the constant v at the given width.
//
// bits is the destination float width (16, 32, or 64) and v is the desired value.
// Returns the FLIvalues index (0-31) cast to RiscVReg when FLI can produce v,
// or INVALID_REG when it cannot.
static RiscVReg EncodeFLImm(int bits, double v) {
	// The table stores single precision patterns, so match at that width.
	float f = (float)v;
	int index = -1;
	for (size_t i = 0; i < ARRAY_SIZE(FLIvalues); ++i) {
		if (memcmp(&f, &FLIvalues[i], sizeof(float)) == 0) {
			index = (int)i;
			break;
		}
	}

	// For a 64-bit destination, a double that merely rounds to a table entry is NOT
	// that entry (e.g. 1.0 + 1e-10, or a huge value that narrows to infinity.)
	// Only accept the match when widening the float reproduces v bit for bit.
	// A bitwise compare is used so the canonical NAN entry still matches.
	if (bits == 64 && index != -1) {
		double widened = (double)f;
		if (memcmp(&widened, &v, sizeof(double)) != 0)
			index = -1;
	}

	// For 16-bit, 2/3 are subnormal and 29 is not possible.  Just avoid for now.
	if (index != -1 && index != 1 && (bits > 16 || (index != 2 && index != 3 && index != 29)))
		return (RiscVReg)index;

	// Index 1 is the smallest normal of the destination format, so it differs per width.
	if (bits == 64) {
		uint64_t dmin = 0x0010000000000000ULL;
		if (memcmp(&v, &dmin, 8) == 0)
			return F1;
	} else if (bits == 32 && index == 1) {
		return F1;
	} else if (bits == 16) {
		uint64_t hmin = 0x3F10000000000000ULL;
		if (memcmp(&v, &hmin, 8) == 0)
			return F1;
	}

	return INVALID_REG;
}
// Reports whether FLI can be used to load v at the given width.
// All of these must hold, checked in this order: Zfa is available, half float support
// exists when bits is 16, the width is within FloatBitsSupported(), and v is encodable.
bool RiscVEmitter::CanFLI(int bits, double v) const {
	return SupportsFloatExtra()
		&& (bits != 16 || SupportsFloatHalf())
		&& bits <= FloatBitsSupported()
		&& EncodeFLImm(bits, v) != INVALID_REG;
}
// Same check as the double overload, but takes the constant as raw single precision bits.
// NOTE(review): pattern is reinterpreted as a 32-bit float even when bits is 16 - confirm
// that callers never pass a half precision bit pattern here.
bool RiscVEmitter::CanFLI(int bits, uint32_t pattern) const {
	float asFloat;
	memcpy(&asFloat, &pattern, sizeof(asFloat));
	return CanFLI(bits, (double)asFloat);
}
// Emits FLI, loading the constant v into the float register rd at the given width.
// Asserts that Zfa is available, the width is supported, rd is an FPR, and v is encodable;
// use CanFLI() first when the constant is not known to be one FLI can load.
void RiscVEmitter::FLI(int bits, RiscVReg rd, double v) {
	_assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__);
	_assert_msg_(bits <= FloatBitsSupported(), "FLI cannot be used for %d bits, only %d/%d supported", bits, BitsSupported(), FloatBitsSupported());
	_assert_msg_(IsFPR(rd), "%s rd of wrong type", __func__);

	// The table index is carried in the first source register field.
	RiscVReg imm = EncodeFLImm(bits, v);
	_assert_msg_(imm != INVALID_REG, "FLI with unsupported constant %f for %d bits", v, bits);

	const auto fmt = BitsToFunct2(bits, false);
	Write32(EncodeR(Opcode32::OP_FP, rd, Funct3::FMV, imm, F1, fmt, Funct5::FMV_FROMX));
}
// Emits FLI for a constant given as raw single precision bits.
// The bits are reinterpreted as a float and forwarded to the double overload.
void RiscVEmitter::FLI(int bits, RiscVReg rd, uint32_t pattern) {
	float asFloat;
	memcpy(&asFloat, &pattern, sizeof(asFloat));
	FLI(bits, rd, (double)asFloat);
}
// Emits FMINM (Zfa) with destination rd and sources rs1/rs2 at the given float width.
// Asserts that Zfa is available.
void RiscVEmitter::FMINM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2) {
	_assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__);

	const auto fmt = BitsToFunct2(bits);
	Write32(EncodeFR(Opcode32::OP_FP, rd, Funct3::FMINM, rs1, rs2, fmt, Funct5::FMINMAX));
}
// Emits FMAXM (Zfa) with destination rd and sources rs1/rs2 at the given float width.
// Asserts that Zfa is available.
void RiscVEmitter::FMAXM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2) {
	_assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__);

	const auto fmt = BitsToFunct2(bits);
	Write32(EncodeFR(Opcode32::OP_FP, rd, Funct3::FMAXM, rs1, rs2, fmt, Funct5::FMINMAX));
}
void RiscVEmitter::FROUND(int bits, RiscVReg rd, RiscVReg rs1, Round rm) {
_assert_msg_(SupportsFloatExtra(), "%s cannot be used without Zfa", __func__);
_assert_msg_(bits <= FloatBitsSupported(), "FROUND for %d float bits, only %d supported", bits, FloatBitsSupported());
_assert_msg_(IsFPR(rd), "%s rd of wrong type", __func__);
_assert_msg_(IsFPR(rs1), "%s rs1 of wrong type", __func__);
Funct2 toFmt = BitsToFunct2(bits, false);
Write32(EncodeR(Opcode32::OP_FP, rd, (Funct3)rm, rs1, F4, toFmt, Funct5::FCVT_SZ));
}
// Loads the constant v into float register rd, preferring FLI when it is usable.
// Otherwise the value goes through scratchReg: directly for 64 bits, or by narrowing
// to float and deferring to the float overload for 32 bits and below.
void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, double v, RiscVReg scratchReg) {
	if (CanFLI(bits, v)) {
		FLI(bits, rd, v);
		return;
	}

	if (bits == 64) {
		LI(scratchReg, v);
		FMV(FMv::D, FMv::X, rd, scratchReg);
		return;
	}

	if (bits <= 32) {
		// Anything narrower than a double is loaded as a single.
		QuickFLI(32, rd, (float)v, scratchReg);
		return;
	}

	_assert_msg_(false, "Unsupported QuickFLI bits");
}
// Loads a constant given as raw bits into float register rd, preferring FLI when usable.
// Otherwise the bits are put in scratchReg and moved over with the FMV of matching width.
// Only 32 and 16 bit fallbacks exist; other widths assert.
void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, uint32_t pattern, RiscVReg scratchReg) {
	if (CanFLI(bits, pattern)) {
		FLI(bits, rd, pattern);
		return;
	}

	switch (bits) {
	case 32:
		LI(scratchReg, (int32_t)pattern);
		FMV(FMv::W, FMv::X, rd, scratchReg);
		break;

	case 16:
		LI(scratchReg, (int16_t)pattern);
		FMV(FMv::H, FMv::X, rd, scratchReg);
		break;

	default:
		_assert_msg_(false, "Unsupported QuickFLI bits");
		break;
	}
}
// Loads the single precision constant v into float register rd, preferring FLI when usable.
//
// bits is the width rd should hold (32 or 64); scratchReg is clobbered by the fallback path.
// Other widths assert when FLI cannot be used.
void RiscVEmitter::QuickFLI(int bits, RiscVReg rd, float v, RiscVReg scratchReg) {
	if (CanFLI(bits, v)) {
		FLI(bits, rd, v);
	} else if (bits == 64) {
		// Widen the value but keep the requested 64-bit width, so rd ends up holding
		// a double.  Forwarding with 32 here would leave a single in rd instead.
		QuickFLI(64, rd, (double)v, scratchReg);
	} else if (bits == 32) {
		LI(scratchReg, v);
		// A 32-bit constant takes the word-sized move, same as the 32-bit pattern overload.
		FMV(FMv::W, FMv::X, rd, scratchReg);
	} else {
		_assert_msg_(false, "Unsupported QuickFLI bits");
	}
}
void RiscVEmitter::CSRRW(RiscVReg rd, Csr csr, RiscVReg rs1) {
_assert_msg_(SupportsZicsr(), "%s instruction not supported", __func__);
_assert_msg_((u32)csr <= 0x00000FFF, "%s with invalid CSR number", __func__);

View file

@ -421,6 +421,26 @@ public:
void FLE(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2);
void FCLASS(int bits, RiscVReg rd, RiscVReg rs1);
// Additional floating point (Zfa.)

// CanFLI: true when FLI can load the given constant at the given width.
// The constant may be passed as a double, as raw single precision bits, or as a float.
bool CanFLI(int bits, double v) const;
bool CanFLI(int bits, uint32_t pattern) const;
bool CanFLI(int bits, float v) const {
// Widen and defer to the double overload.
return CanFLI(bits, (double)v);
}

// FLI: emit a float load-immediate of the given constant into rd.
// Asserts if the constant cannot be encoded, so check CanFLI() first (or use QuickFLI.)
void FLI(int bits, RiscVReg rd, double v);
void FLI(int bits, RiscVReg rd, uint32_t pattern);
void FLI(int bits, RiscVReg rd, float v) {
// Widen and defer to the double overload.
FLI(bits, rd, (double)v);
}

// Zfa min/max and round instructions.  FROUND uses the dynamic rounding mode by default.
void FMINM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2);
void FMAXM(int bits, RiscVReg rd, RiscVReg rs1, RiscVReg rs2);
void FROUND(int bits, RiscVReg rd, RiscVReg rs1, Round rm = Round::DYNAMIC);

// Convenience helper for FLI support.
// Uses FLI when CanFLI() allows it, otherwise loads the constant through scratchReg.
void QuickFLI(int bits, RiscVReg rd, double v, RiscVReg scratchReg);
void QuickFLI(int bits, RiscVReg rd, uint32_t pattern, RiscVReg scratchReg);
void QuickFLI(int bits, RiscVReg rd, float v, RiscVReg scratchReg);
// Control state register manipulation.
void CSRRW(RiscVReg rd, Csr csr, RiscVReg rs1);
void CSRRS(RiscVReg rd, Csr csr, RiscVReg rs1);

View file

@ -246,8 +246,7 @@ void RiscVJitBackend::CompIR_FCvt(IRInst inst) {
tempReg = regs_.MapWithFPRTemp(inst);
// Prepare the multiplier.
LI(SCRATCH1, 1UL << (inst.src2 & 0x1F));
FCVT(FConv::S, FConv::WU, tempReg, SCRATCH1, rm);
QuickFLI(32, tempReg, (float)(1UL << (inst.src2 & 0x1F)), SCRATCH1);
FMUL(32, regs_.F(inst.dest), regs_.F(inst.src1), tempReg, rm);
// NAN and clamping should all be correct.
@ -264,8 +263,7 @@ void RiscVJitBackend::CompIR_FCvt(IRInst inst) {
FCVT(FConv::S, FConv::W, regs_.F(inst.dest), SCRATCH1);
// Pre-divide so we can avoid any actual divide.
LI(SCRATCH1, 1.0f / (1UL << (inst.src2 & 0x1F)));
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
QuickFLI(32, tempReg, 1.0f / (1UL << (inst.src2 & 0x1F)), SCRATCH1);
FMUL(32, regs_.F(inst.dest), regs_.F(inst.dest), tempReg);
break;
@ -292,8 +290,7 @@ void RiscVJitBackend::CompIR_FSat(IRInst inst) {
FCVT(FConv::S, FConv::W, tempReg, R_ZERO);
// FLE here is intentional to convert -0.0 to +0.0.
FLE(32, SCRATCH1, regs_.F(inst.src1), tempReg);
LI(SCRATCH2, 1.0f);
FMV(FMv::W, FMv::X, tempReg, SCRATCH2);
QuickFLI(32, tempReg, 1.0f, SCRATCH2);
FLT(32, SCRATCH2, tempReg, regs_.F(inst.src1));
skipLower = BEQ(SCRATCH1, R_ZERO);
@ -315,8 +312,7 @@ void RiscVJitBackend::CompIR_FSat(IRInst inst) {
FMV(32, regs_.F(inst.dest), regs_.F(inst.src1));
// First, set SCRATCH1 = clamp to negative, SCRATCH2 = clamp to positive.
LI(SCRATCH2, -1.0f);
FMV(FMv::W, FMv::X, tempReg, SCRATCH2);
QuickFLI(32, tempReg, -1.0f, SCRATCH2);
FLT(32, SCRATCH1, regs_.F(inst.src1), tempReg);
FNEG(32, tempReg, tempReg);
FLT(32, SCRATCH2, tempReg, regs_.F(inst.src1));
@ -621,8 +617,7 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
FSQRT(32, regs_.F(inst.dest), regs_.F(inst.src1));
// Ugh, we can't really avoid a temp here. Probably not worth a permanent one.
LI(SCRATCH1, 1.0f);
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
QuickFLI(32, tempReg, 1.0f, SCRATCH1);
FDIV(32, regs_.F(inst.dest), tempReg, regs_.F(inst.dest));
break;
@ -635,8 +630,7 @@ void RiscVJitBackend::CompIR_FSpecial(IRInst inst) {
FDIV(32, regs_.F(inst.dest), regs_.F(inst.dest), regs_.F(inst.src1));
} else {
tempReg = regs_.MapWithFPRTemp(inst);
LI(SCRATCH1, 1.0f);
FMV(FMv::W, FMv::X, tempReg, SCRATCH1);
QuickFLI(32, tempReg, 1.0f, SCRATCH1);
FDIV(32, regs_.F(inst.dest), tempReg, regs_.F(inst.src1));
}
break;

View file

@ -50,14 +50,10 @@ void RiscVJitBackend::CompIR_Basic(IRInst inst) {
case IROp::SetConstF:
regs_.Map(inst);
if (inst.constant == 0) {
if (inst.constant == 0)
FCVT(FConv::S, FConv::W, regs_.F(inst.dest), R_ZERO);
} else {
// TODO: In the future, could use FLI if it's approved.
// Also, is FCVT faster?
LI(SCRATCH1, (int32_t)inst.constant);
FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
}
else
QuickFLI(32, regs_.F(inst.dest), inst.constant, SCRATCH1);
break;
case IROp::Downcount:

View file

@ -54,56 +54,86 @@ void RiscVJitBackend::CompIR_VecAssign(IRInst inst) {
break;
case Vec4Init::AllONE:
LI(SCRATCH1, 1.0f);
FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));
if (CanFLI(32, 1.0f)) {
for (int i = 0; i < 4; ++i)
FLI(32, regs_.F(inst.dest + i), 1.0f);
} else {
LI(SCRATCH1, 1.0f);
FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));
}
break;
case Vec4Init::AllMinusONE:
LI(SCRATCH1, -1.0f);
FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));
if (CanFLI(32, -1.0f)) {
for (int i = 0; i < 4; ++i)
FLI(32, regs_.F(inst.dest + i), -1.0f);
} else {
LI(SCRATCH1, -1.0f);
FMV(FMv::W, FMv::X, regs_.F(inst.dest), SCRATCH1);
for (int i = 1; i < 4; ++i)
FMV(32, regs_.F(inst.dest + i), regs_.F(inst.dest));
}
break;
case Vec4Init::Set_1000:
LI(SCRATCH1, 1.0f);
if (!CanFLI(32, 1.0f))
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 0)
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
else
if (i == 0) {
if (CanFLI(32, 1.0f))
FLI(32, regs_.F(inst.dest + i), 1.0f);
else
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
} else {
FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
}
}
break;
case Vec4Init::Set_0100:
LI(SCRATCH1, 1.0f);
if (!CanFLI(32, 1.0f))
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 1)
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
else
if (i == 1) {
if (CanFLI(32, 1.0f))
FLI(32, regs_.F(inst.dest + i), 1.0f);
else
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
} else {
FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
}
}
break;
case Vec4Init::Set_0010:
LI(SCRATCH1, 1.0f);
if (!CanFLI(32, 1.0f))
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 2)
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
else
if (i == 2) {
if (CanFLI(32, 1.0f))
FLI(32, regs_.F(inst.dest + i), 1.0f);
else
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
} else {
FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
}
}
break;
case Vec4Init::Set_0001:
LI(SCRATCH1, 1.0f);
if (!CanFLI(32, 1.0f))
LI(SCRATCH1, 1.0f);
for (int i = 0; i < 4; ++i) {
if (i == 3)
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
else
if (i == 3) {
if (CanFLI(32, 1.0f))
FLI(32, regs_.F(inst.dest + i), 1.0f);
else
FMV(FMv::W, FMv::X, regs_.F(inst.dest + i), SCRATCH1);
} else {
FCVT(FConv::S, FConv::W, regs_.F(inst.dest + i), R_ZERO);
}
}
break;
}

View file

@ -223,10 +223,8 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
}
// TODO: Only load these when needed?
LI(scratchReg, by128);
FMV(FMv::W, FMv::X, by128Reg, scratchReg);
LI(scratchReg, by32768);
FMV(FMv::W, FMv::X, by32768Reg, scratchReg);
QuickFLI(32, by128Reg, by128, scratchReg);
QuickFLI(32, by32768Reg, by32768, scratchReg);
if (posThroughStep) {
LI(scratchReg, const65535);
FMV(FMv::W, FMv::X, const65535Reg, scratchReg);