From c8f888fab0942312d01eb587ddf2a46ce8d48b2b Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Tue, 5 Sep 2023 23:47:52 -0700
Subject: [PATCH] arm64jit: Implement FMin/FMax.

---
 Common/Arm64Emitter.cpp            | 61 ++++++++++++++++++++++++++++++
 Common/Arm64Emitter.h              | 14 +++++++
 Core/MIPS/ARM64/Arm64IRCompFPU.cpp | 39 ++++++++++++++++++-
 3 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index 2192a8a6e6..6b3221e76f 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -3011,6 +3011,12 @@ void ARM64FloatEmitter::BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
 	EmitThreeSame(1, 1, 3, Rd, Rn, Rm);
 }
 
+void ARM64FloatEmitter::BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	EmitThreeSame(1, 2, 3, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	EmitThreeSame(1, 3, 3, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
 {
 	u32 imm5 = 0;
@@ -3184,6 +3190,61 @@ void ARM64FloatEmitter::XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
 	Emit2RegMisc(true, 0, dest_size >> 4, 0x12, Rd, Rn);
 }
 
+void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	EmitThreeSame(true, size >> 4, 0b10001, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	EmitThreeSame(false, size >> 4, 0b00111, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	EmitThreeSame(false, size >> 4, 0b00110, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	EmitThreeSame(true, size >> 4, 0b00110, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	EmitThreeSame(true, size >> 4, 0b00111, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	EmitThreeSame(false, size >> 4, 0b10001, Rd, Rn, Rm);
+}
+
+void ARM64FloatEmitter::CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01001, Rd, Rn);
+}
+
+void ARM64FloatEmitter::CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	Emit2RegMisc(IsQuad(Rd), true, size >> 4, 0b01000, Rd, Rn);
+}
+
+void ARM64FloatEmitter::CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01000, Rd, Rn);
+}
+
+void ARM64FloatEmitter::CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	Emit2RegMisc(IsQuad(Rd), true, size >> 4, 0b01001, Rd, Rn);
+}
+
+void ARM64FloatEmitter::CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn) {
+	_assert_msg_(!IsQuad(Rd) || size != 64, "%s cannot be used for scalar double", __FUNCTION__);
+	Emit2RegMisc(IsQuad(Rd), false, size >> 4, 0b01010, Rd, Rn);
+}
+
 // Move
 void ARM64FloatEmitter::DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn)
 {
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 70a7253c28..f800e16137 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -851,6 +851,8 @@ public:
 	void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void BSL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void BIT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void BIF(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 	void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
@@ -894,6 +896,18 @@ public:
 	void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 
+	void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void CMHI(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void CMHS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void CMTST(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void CMEQ(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+	void CMGE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+	void CMGT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+	void CMLE(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+	void CMLT(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+
 	// Move
 	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void INS(u8 size, ARM64Reg Rd, u8 index, ARM64Reg Rn);
diff --git a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp
index 27f7b40a82..74d7bfe8df 100644
--- a/Core/MIPS/ARM64/Arm64IRCompFPU.cpp
+++ b/Core/MIPS/ARM64/Arm64IRCompFPU.cpp
@@ -143,16 +143,53 @@ void Arm64JitBackend::CompIR_FCompare(IRInst inst) {
 void Arm64JitBackend::CompIR_FCondAssign(IRInst inst) {
 	CONDITIONAL_DISABLE;
 
+	// For Vec4, we could basically just ORR FCMPGE/FCMPLE together, but overlap is trickier.
+	regs_.Map(inst);
+	fp_.FCMP(regs_.F(inst.src1), regs_.F(inst.src2));
+	FixupBranch unordered = B(CC_VS);
+
 	switch (inst.op) {
 	case IROp::FMin:
+		fp_.FMIN(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));
+		break;
+
 	case IROp::FMax:
-		CompIR_Generic(inst);
+		fp_.FMAX(regs_.F(inst.dest), regs_.F(inst.src1), regs_.F(inst.src2));
 		break;
 
 	default:
 		INVALIDOP;
 		break;
 	}
+
+	FixupBranch orderedDone = B();
+
+	// Not sure if this path is fast, trying to optimize it to be small but correct.
+	// Probably an uncommon path.
+	SetJumpTarget(unordered);
+	fp_.AND(EncodeRegToDouble(SCRATCHF1), regs_.FD(inst.src1), regs_.FD(inst.src2));
+	// SCRATCHF1 = 0xFFFFFFFF if sign bit set on both, 0x00000000 otherwise.
+	fp_.CMLT(32, EncodeRegToDouble(SCRATCHF1), EncodeRegToDouble(SCRATCHF1));
+
+	switch (inst.op) {
+	case IROp::FMin:
+		fp_.SMAX(32, EncodeRegToDouble(SCRATCHF2), regs_.FD(inst.src1), regs_.FD(inst.src2));
+		fp_.SMIN(32, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src2));
+		break;
+
+	case IROp::FMax:
+		fp_.SMIN(32, EncodeRegToDouble(SCRATCHF2), regs_.FD(inst.src1), regs_.FD(inst.src2));
+		fp_.SMAX(32, regs_.FD(inst.dest), regs_.FD(inst.src1), regs_.FD(inst.src2));
+		break;
+
+	default:
+		INVALIDOP;
+		break;
+	}
+	// Replace dest with SCRATCHF2 if both were less than zero.
+	fp_.BIT(regs_.FD(inst.dest), EncodeRegToDouble(SCRATCHF2), EncodeRegToDouble(SCRATCHF1));
+
+	SetJumpTarget(orderedDone);
 }
 
 void Arm64JitBackend::CompIR_FCvt(IRInst inst) {
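Note on the new BIT/BIF emitters: these encode the standard AArch64 SIMD bitwise-insert operations. BIT takes bits from the first source where the mask register is 1 and keeps the destination elsewhere; BIF is the complement. That is what lets the mask built by CMLT select a result without branching. A scalar model of the semantics (the helper names below are illustrative, not from the patch):

    #include <cstdint>

    // BIT Vd, Vn, Vm: take bits from n where mask m is 1, keep d where m is 0.
    static inline uint32_t bit_insert_if_true(uint32_t d, uint32_t n, uint32_t m) {
        return (d & ~m) | (n & m);
    }

    // BIF Vd, Vn, Vm: the complement, take bits from n where mask m is 0.
    static inline uint32_t bit_insert_if_false(uint32_t d, uint32_t n, uint32_t m) {
        return (d & m) | (n & ~m);
    }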
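Note on the unordered path in CompIR_FCondAssign: when FCMP reports an unordered result (either input is NaN, so B(CC_VS) is taken), the code falls back to comparing the raw float bit patterns as signed 32-bit integers, which appears to match how the IR interpreter defines FMin/FMax for NaN operands. Signed-integer ordering of IEEE-754 bit patterns agrees with float ordering except when both values are negative, where it reverses; the AND + CMLT mask and the final BIT handle exactly that case by swapping in the opposite SMIN/SMAX result. A rough standalone sketch of the FMin selection in plain C++ (the helper name is illustrative):

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // What the NaN fallback computes for FMin; FMax is the mirror image
    // (SMAX for the result, SMIN swapped in for the both-negative case).
    static float fmin_unordered(float a, float b) {
        int32_t ia, ib;
        std::memcpy(&ia, &a, sizeof(ia));
        std::memcpy(&ib, &b, sizeof(ib));
        // AND + CMLT(32): mask is all-ones only when both sign bits are set.
        bool bothNegative = (ia & ib) < 0;
        // SMIN goes to dest, SMAX is kept in a scratch register; BIT then
        // replaces dest with the scratch value where the mask is set.
        int32_t result = bothNegative ? std::max(ia, ib) : std::min(ia, ib);
        float out;
        std::memcpy(&out, &result, sizeof(out));
        return out;
    }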