From 885ae5c80540b25260fb6fc5eca35e557bdcb02e Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 4 Sep 2023 12:27:39 -0700 Subject: [PATCH 1/2] arm64jit: Implement shuffle optimizer. --- Common/Arm64Emitter.cpp | 14 ++ Common/Arm64Emitter.h | 2 + Core/MIPS/ARM64/Arm64IRCompVec.cpp | 260 ++++++++++++++++++++++++++++- 3 files changed, 274 insertions(+), 2 deletions(-) diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp index 2323d289e2..f74838d1d4 100644 --- a/Common/Arm64Emitter.cpp +++ b/Common/Arm64Emitter.cpp @@ -3450,6 +3450,20 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) EmitPermute(size, 7, Rd, Rn, Rm); } +void ARM64FloatEmitter::EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, int index) { + _assert_msg_(!IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); + + bool quad = IsQuad(Rd); + _assert_msg_(index >= 0 && index < 16 && (quad || index < 8), "%s start index out of bounds", __FUNCTION__); + _assert_msg_(IsQuad(Rd) == IsQuad(Rn) && IsQuad(Rd) == IsQuad(Rm), "%s operands not same size", __FUNCTION__); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (0x17 << 25) | (Rm << 16) | (index << 11) | (Rn << 5) | Rd); +} + // Shift by immediate void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h index 6dbfa3deed..27152a18bc 100644 --- a/Common/Arm64Emitter.h +++ b/Common/Arm64Emitter.h @@ -934,6 +934,8 @@ public: void UZP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void TRN2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + // Related to permute, extract vector from pair (always by byte arrangement.) + void EXT(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, int index); // Shift by immediate void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift); diff --git a/Core/MIPS/ARM64/Arm64IRCompVec.cpp b/Core/MIPS/ARM64/Arm64IRCompVec.cpp index 8cd3331ddb..b06e1531f1 100644 --- a/Core/MIPS/ARM64/Arm64IRCompVec.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompVec.cpp @@ -84,19 +84,275 @@ void Arm64JitBackend::CompIR_VecArith(IRInst inst) { } } +enum class Arm64Shuffle { + DUP0_AAAA, + DUP1_BBBB, + DUP2_CCCC, + DUP3_DDDD, + MOV_ABCD, + TRN1_AACC, + TRN2_BBDD, + UZP1_ACAC, + UZP2_BDBD, + ZIP1_AABB, + ZIP2_CCDD, + REV64_BADC, + EXT4_BCDA, + EXT8_CDAB, + EXT12_DABC, + + // These steps are more expensive and use a temp. + REV64_EXT8_CDBA, + REV64_EXT8_DCAB, + EXT4_UZP1_BDAC, + EXT4_UZP2_CABD, + + // Any that mutate dest must be after this point. + INS0_TO_1, + INS0_TO_2, + INS0_TO_3, + INS1_TO_0, + INS1_TO_2, + INS1_TO_3, + INS2_TO_0, + INS2_TO_1, + INS2_TO_3, + INS3_TO_0, + INS3_TO_1, + INS3_TO_2, + XTN2, + + COUNT, + COUNT_NOPREV = INS0_TO_1, +}; + +uint8_t Arm64ShuffleMask(Arm64Shuffle method) { + // Hopefully optimized into a lookup table, this is a bit less confusing to read... 
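+	// Two bits per output lane, lane 0 in the low bits: each pair selects the
+	// source lane (A=0 .. D=3), so MOV_ABCD's 0xE4 (11 10 01 00) is the identity
+	// and the enum suffix lists output lanes 0..3 left to right.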
+ switch (method) { + case Arm64Shuffle::DUP0_AAAA: return 0x00; + case Arm64Shuffle::DUP1_BBBB: return 0x55; + case Arm64Shuffle::DUP2_CCCC: return 0xAA; + case Arm64Shuffle::DUP3_DDDD: return 0xFF; + case Arm64Shuffle::MOV_ABCD: return 0xE4; + case Arm64Shuffle::TRN1_AACC: return 0xA0; + case Arm64Shuffle::TRN2_BBDD: return 0xF5; + case Arm64Shuffle::UZP1_ACAC: return 0x88; + case Arm64Shuffle::UZP2_BDBD: return 0xDD; + case Arm64Shuffle::ZIP1_AABB: return 0x50; + case Arm64Shuffle::ZIP2_CCDD: return 0xFA; + case Arm64Shuffle::REV64_BADC: return 0xB1; + case Arm64Shuffle::EXT4_BCDA: return 0x39; + case Arm64Shuffle::EXT8_CDAB: return 0x4E; + case Arm64Shuffle::EXT12_DABC: return 0x93; + case Arm64Shuffle::REV64_EXT8_CDBA: return 0x1E; + case Arm64Shuffle::REV64_EXT8_DCAB: return 0x4B; + case Arm64Shuffle::EXT4_UZP1_BDAC: return 0x8D; + case Arm64Shuffle::EXT4_UZP2_CABD: return 0xD2; + case Arm64Shuffle::INS0_TO_1: return 0xE0; + case Arm64Shuffle::INS0_TO_2: return 0xC4; + case Arm64Shuffle::INS0_TO_3: return 0x24; + case Arm64Shuffle::INS1_TO_0: return 0xE5; + case Arm64Shuffle::INS1_TO_2: return 0xD4; + case Arm64Shuffle::INS1_TO_3: return 0x64; + case Arm64Shuffle::INS2_TO_0: return 0xE6; + case Arm64Shuffle::INS2_TO_1: return 0xE8; + case Arm64Shuffle::INS2_TO_3: return 0xA4; + case Arm64Shuffle::INS3_TO_0: return 0xE7; + case Arm64Shuffle::INS3_TO_1: return 0xEC; + case Arm64Shuffle::INS3_TO_2: return 0xF4; + case Arm64Shuffle::XTN2: return 0x84; + default: + _assert_(false); + return 0; + } +} + +void Arm64ShuffleApply(ARM64FloatEmitter &fp, Arm64Shuffle method, ARM64Reg vd, ARM64Reg vs) { + switch (method) { + case Arm64Shuffle::DUP0_AAAA: fp.DUP(32, vd, vs, 0); return; + case Arm64Shuffle::DUP1_BBBB: fp.DUP(32, vd, vs, 1); return; + case Arm64Shuffle::DUP2_CCCC: fp.DUP(32, vd, vs, 2); return; + case Arm64Shuffle::DUP3_DDDD: fp.DUP(32, vd, vs, 3); return; + case Arm64Shuffle::MOV_ABCD: _assert_(vd != vs); fp.MOV(vd, vs); return; + case Arm64Shuffle::TRN1_AACC: fp.TRN1(32, vd, vs, vs); return; + case Arm64Shuffle::TRN2_BBDD: fp.TRN2(32, vd, vs, vs); return; + case Arm64Shuffle::UZP1_ACAC: fp.UZP1(32, vd, vs, vs); return; + case Arm64Shuffle::UZP2_BDBD: fp.UZP2(32, vd, vs, vs); return; + case Arm64Shuffle::ZIP1_AABB: fp.ZIP1(32, vd, vs, vs); return; + case Arm64Shuffle::ZIP2_CCDD: fp.ZIP2(32, vd, vs, vs); return; + case Arm64Shuffle::REV64_BADC: fp.REV64(32, vd, vs); return; + case Arm64Shuffle::EXT4_BCDA: fp.EXT(vd, vs, vs, 4); return; + case Arm64Shuffle::EXT8_CDAB: fp.EXT(vd, vs, vs, 8); return; + case Arm64Shuffle::EXT12_DABC: fp.EXT(vd, vs, vs, 12); return; + + case Arm64Shuffle::REV64_EXT8_CDBA: + fp.REV64(32, EncodeRegToQuad(SCRATCHF1), vs); + fp.EXT(vd, vs, EncodeRegToQuad(SCRATCHF1), 8); + return; + + case Arm64Shuffle::REV64_EXT8_DCAB: + fp.REV64(32, EncodeRegToQuad(SCRATCHF1), vs); + fp.EXT(vd, EncodeRegToQuad(SCRATCHF1), vs, 8); + return; + + case Arm64Shuffle::EXT4_UZP1_BDAC: + fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 4); + fp.UZP1(32, vd, EncodeRegToQuad(SCRATCHF1), vs); + return; + + case Arm64Shuffle::EXT4_UZP2_CABD: + fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 4); + fp.UZP2(32, vd, EncodeRegToQuad(SCRATCHF1), vs); + return; + + case Arm64Shuffle::INS0_TO_1: fp.INS(32, vd, 1, vs, 0); return; + case Arm64Shuffle::INS0_TO_2: fp.INS(32, vd, 2, vs, 0); return; + case Arm64Shuffle::INS0_TO_3: fp.INS(32, vd, 3, vs, 0); return; + case Arm64Shuffle::INS1_TO_0: fp.INS(32, vd, 0, vs, 1); return; + case Arm64Shuffle::INS1_TO_2: fp.INS(32, vd, 2, vs, 1); return; + case 
Arm64Shuffle::INS1_TO_3: fp.INS(32, vd, 3, vs, 1); return; + case Arm64Shuffle::INS2_TO_0: fp.INS(32, vd, 0, vs, 2); return; + case Arm64Shuffle::INS2_TO_1: fp.INS(32, vd, 1, vs, 2); return; + case Arm64Shuffle::INS2_TO_3: fp.INS(32, vd, 3, vs, 2); return; + case Arm64Shuffle::INS3_TO_0: fp.INS(32, vd, 0, vs, 3); return; + case Arm64Shuffle::INS3_TO_1: fp.INS(32, vd, 1, vs, 3); return; + case Arm64Shuffle::INS3_TO_2: fp.INS(32, vd, 2, vs, 3); return; + + case Arm64Shuffle::XTN2: fp.XTN2(32, vd, vs); return; + + default: + _assert_(false); + return; + } +} + +uint8_t Arm64ShuffleResult(uint8_t mask, uint8_t prev) { + if (prev == 0xE4) + return mask; + + uint8_t result = 0; + for (int i = 0; i < 4; ++i) { + int takeLane = (mask >> (i * 2)) & 3; + int lane = (prev >> (takeLane * 2)) & 3; + result |= lane << (i * 2); + } + return result; +} + +int Arm64ShuffleScore(uint8_t shuf, uint8_t goal, int steps = 1) { + if (shuf == goal) + return 100; + + int score = 0; + bool needs[4]{}; + bool gets[4]{}; + for (int i = 0; i < 4; ++i) { + uint8_t mask = 3 << (i * 2); + needs[(goal & mask) >> (i * 2)] = true; + gets[(shuf & mask) >> (i * 2)] = true; + if ((shuf & mask) == (goal & mask)) + score += 4; + } + + for (int i = 0; i < 4; ++i) { + if (needs[i] && !gets[i]) + return 0; + } + + // We need to look one level deeper to solve some, such as 1B (common) well. + if (steps > 0) { + int bestNextScore = 0; + for (int m = 0; m < (int)Arm64Shuffle::COUNT; ++m) { + uint8_t next = Arm64ShuffleResult(Arm64ShuffleMask((Arm64Shuffle)m), shuf); + int nextScore = Arm64ShuffleScore(next, goal, steps - 1); + if (nextScore > score) { + bestNextScore = nextScore; + if (bestNextScore == 100) + break; + } + } + + score += bestNextScore / 2; + } + + return score; +} + +Arm64Shuffle Arm64BestShuffle(uint8_t goal, uint8_t prev, bool needsCopy) { + // needsCopy true means insert isn't possible. + int attempts = needsCopy ? (int)Arm64Shuffle::COUNT_NOPREV : (int)Arm64Shuffle::COUNT; + + Arm64Shuffle best = Arm64Shuffle::MOV_ABCD; + int bestScore = 0; + for (int m = 0; m < attempts; ++m) { + uint8_t result = Arm64ShuffleResult(Arm64ShuffleMask((Arm64Shuffle)m), prev); + int score = Arm64ShuffleScore(result, goal); + if (score > bestScore) { + best = (Arm64Shuffle)m; + bestScore = score; + } + } + + _assert_(bestScore > 0); + return best; +} + + +static void Arm64ShufflePerform(ARM64FloatEmitter &fp, ARM64Reg vd, ARM64Reg vs, u8 shuf) { + // This performs all shuffles within 3 "steps" (some are two instructions, though.) + _assert_msg_(shuf != 0xE4, "Non-shuffles shouldn't get here"); + + uint8_t state = 0xE4; + // If they're not the same, the first step needs to be a copy. + bool needsCopy = vd != vs; + for (int i = 0; i < 4 && state != shuf; ++i) { + // Figure out the next step and write it out. + Arm64Shuffle method = Arm64BestShuffle(shuf, state, needsCopy); + Arm64ShuffleApply(fp, method, vd, needsCopy ? vs : vd); + + // Update our state to where we've ended up, for next time. + needsCopy = false; + state = Arm64ShuffleResult(Arm64ShuffleMask(method), state); + } + + _assert_msg_(state == shuf, "Arm64ShufflePerform failed to resolve shuffle"); +} + void Arm64JitBackend::CompIR_VecAssign(IRInst inst) { CONDITIONAL_DISABLE; switch (inst.op) { case IROp::Vec4Init: + CompIR_Generic(inst); + break; + case IROp::Vec4Shuffle: + // There's not really an easy shuffle op on ARM64... + if (regs_.GetFPRLaneCount(inst.src1) == 1 && (inst.src1 & 3) == 0 && inst.src2 == 0x00) { + // This is a broadcast. 
If dest == src1, this won't clear it. + regs_.SpillLockFPR(inst.src1); + regs_.MapVec4(inst.dest, MIPSMap::NOINIT); + fp_.DUP(32, regs_.FQ(inst.dest), regs_.FQ(inst.src1), 0); + } else if (inst.src2 == 0xE4) { + if (inst.dest != inst.src1) { + regs_.Map(inst); + fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1)); + } + } else { + regs_.Map(inst); + Arm64ShufflePerform(fp_, regs_.FQ(inst.dest), regs_.FQ(inst.src1), inst.src2); + } + break; + case IROp::Vec4Blend: CompIR_Generic(inst); break; case IROp::Vec4Mov: - regs_.Map(inst); - fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1)); + if (inst.dest != inst.src1) { + regs_.Map(inst); + fp_.MOV(regs_.FQ(inst.dest), regs_.FQ(inst.src1)); + } break; default: From 17ffc9c26191ff44d88324524ac2aa41ecad2017 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 4 Sep 2023 14:28:32 -0700 Subject: [PATCH 2/2] arm64jit: Special case some shuffles. To avoid 4 instruction shuffles. --- Core/MIPS/ARM64/Arm64IRCompVec.cpp | 57 +++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/Core/MIPS/ARM64/Arm64IRCompVec.cpp b/Core/MIPS/ARM64/Arm64IRCompVec.cpp index b06e1531f1..d83ace73d4 100644 --- a/Core/MIPS/ARM64/Arm64IRCompVec.cpp +++ b/Core/MIPS/ARM64/Arm64IRCompVec.cpp @@ -106,8 +106,10 @@ enum class Arm64Shuffle { REV64_EXT8_DCAB, EXT4_UZP1_BDAC, EXT4_UZP2_CABD, + EXT8_ZIP1_ACBD, + EXT8_ZIP2_CADB, - // Any that mutate dest must be after this point. + // Any that don't fully replace dest must be after this point. INS0_TO_1, INS0_TO_2, INS0_TO_3, @@ -122,7 +124,12 @@ enum class Arm64Shuffle { INS3_TO_2, XTN2, - COUNT, + // These hacks to prevent 4 instructions, but scoring isn't smart enough to avoid. + EXT12_ZIP1_ADBA, + DUP3_UZP1_DDAC, + + COUNT_NORMAL = EXT12_ZIP1_ADBA, + COUNT_SIMPLE = REV64_EXT8_CDBA, COUNT_NOPREV = INS0_TO_1, }; @@ -148,6 +155,8 @@ uint8_t Arm64ShuffleMask(Arm64Shuffle method) { case Arm64Shuffle::REV64_EXT8_DCAB: return 0x4B; case Arm64Shuffle::EXT4_UZP1_BDAC: return 0x8D; case Arm64Shuffle::EXT4_UZP2_CABD: return 0xD2; + case Arm64Shuffle::EXT8_ZIP1_ACBD: return 0xD8; + case Arm64Shuffle::EXT8_ZIP2_CADB: return 0x72; case Arm64Shuffle::INS0_TO_1: return 0xE0; case Arm64Shuffle::INS0_TO_2: return 0xC4; case Arm64Shuffle::INS0_TO_3: return 0x24; @@ -161,6 +170,8 @@ uint8_t Arm64ShuffleMask(Arm64Shuffle method) { case Arm64Shuffle::INS3_TO_1: return 0xEC; case Arm64Shuffle::INS3_TO_2: return 0xF4; case Arm64Shuffle::XTN2: return 0x84; + case Arm64Shuffle::EXT12_ZIP1_ADBA: return 0x1C; + case Arm64Shuffle::DUP3_UZP1_DDAC: return 0x8F; default: _assert_(false); return 0; @@ -205,6 +216,16 @@ void Arm64ShuffleApply(ARM64FloatEmitter &fp, Arm64Shuffle method, ARM64Reg vd, fp.UZP2(32, vd, EncodeRegToQuad(SCRATCHF1), vs); return; + case Arm64Shuffle::EXT8_ZIP1_ACBD: + fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 8); + fp.ZIP1(32, vd, vs, EncodeRegToQuad(SCRATCHF1)); + return; + + case Arm64Shuffle::EXT8_ZIP2_CADB: + fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 8); + fp.ZIP2(32, vd, vs, EncodeRegToQuad(SCRATCHF1)); + return; + case Arm64Shuffle::INS0_TO_1: fp.INS(32, vd, 1, vs, 0); return; case Arm64Shuffle::INS0_TO_2: fp.INS(32, vd, 2, vs, 0); return; case Arm64Shuffle::INS0_TO_3: fp.INS(32, vd, 3, vs, 0); return; @@ -220,6 +241,16 @@ void Arm64ShuffleApply(ARM64FloatEmitter &fp, Arm64Shuffle method, ARM64Reg vd, case Arm64Shuffle::XTN2: fp.XTN2(32, vd, vs); return; + case Arm64Shuffle::EXT12_ZIP1_ADBA: + fp.EXT(EncodeRegToQuad(SCRATCHF1), vs, vs, 12); + fp.ZIP1(32, vd, vs, EncodeRegToQuad(SCRATCHF1)); 
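+		// With vs = ABCD, the EXT by 12 bytes forms DABC; ZIP1 of the low halves then gives A D B A.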
+ return; + + case Arm64Shuffle::DUP3_UZP1_DDAC: + fp.DUP(32, EncodeRegToQuad(SCRATCHF1), vs, 3); + fp.UZP1(32, vd, EncodeRegToQuad(SCRATCHF1), vs); + return; + default: _assert_(false); return; @@ -262,13 +293,16 @@ int Arm64ShuffleScore(uint8_t shuf, uint8_t goal, int steps = 1) { // We need to look one level deeper to solve some, such as 1B (common) well. if (steps > 0) { int bestNextScore = 0; - for (int m = 0; m < (int)Arm64Shuffle::COUNT; ++m) { + for (int m = 0; m < (int)Arm64Shuffle::COUNT_NORMAL; ++m) { uint8_t next = Arm64ShuffleResult(Arm64ShuffleMask((Arm64Shuffle)m), shuf); int nextScore = Arm64ShuffleScore(next, goal, steps - 1); if (nextScore > score) { bestNextScore = nextScore; - if (bestNextScore == 100) + if (bestNextScore == 100) { + // Take the earliest that gives us two steps, it's cheaper (not 2 instructions.) + score = 0; break; + } } } @@ -279,14 +313,27 @@ int Arm64ShuffleScore(uint8_t shuf, uint8_t goal, int steps = 1) { } Arm64Shuffle Arm64BestShuffle(uint8_t goal, uint8_t prev, bool needsCopy) { + // A couple special cases for optimal shuffles. + if (goal == 0x7C && prev == 0xE4) + return Arm64Shuffle::REV64_BADC; + if (goal == 0x2B && prev == 0xE4) + return Arm64Shuffle::EXT8_CDAB; + if ((goal == 0x07 || goal == 0x1C) && prev == 0xE4) + return Arm64Shuffle::EXT12_ZIP1_ADBA; + if ((goal == 0x8F || goal == 0x2F) && prev == 0xE4) + return Arm64Shuffle::DUP3_UZP1_DDAC; + // needsCopy true means insert isn't possible. - int attempts = needsCopy ? (int)Arm64Shuffle::COUNT_NOPREV : (int)Arm64Shuffle::COUNT; + int attempts = needsCopy ? (int)Arm64Shuffle::COUNT_NOPREV : (int)Arm64Shuffle::COUNT_NORMAL; Arm64Shuffle best = Arm64Shuffle::MOV_ABCD; int bestScore = 0; for (int m = 0; m < attempts; ++m) { uint8_t result = Arm64ShuffleResult(Arm64ShuffleMask((Arm64Shuffle)m), prev); int score = Arm64ShuffleScore(result, goal); + // Slightly discount options that involve an extra instruction. + if (m >= (int)Arm64Shuffle::COUNT_SIMPLE && m < (int)Arm64Shuffle::COUNT_NOPREV) + score--; if (score > bestScore) { best = (Arm64Shuffle)m; bestScore = score;
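
For reference (not part of the patches above): a self-contained sketch of the mask algebra the optimizer relies on. The helper below mirrors Arm64ShuffleResult(), and the check shows why the one-step lookahead in Arm64ShuffleScore() matters: composing REV64 (mask 0xB1) with EXT8 (mask 0x4E) reaches the common full reversal 0x1B in two instructions. Names here are illustrative only.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Mirrors Arm64ShuffleResult(): output lane i takes prev's lane ((mask >> (i * 2)) & 3).
static uint8_t ComposeShuffle(uint8_t mask, uint8_t prev) {
	uint8_t result = 0;
	for (int i = 0; i < 4; ++i) {
		int takeLane = (mask >> (i * 2)) & 3;
		int lane = (prev >> (takeLane * 2)) & 3;
		result |= lane << (i * 2);
	}
	return result;
}

int main() {
	const uint8_t identity = 0xE4;  // A B C D
	const uint8_t rev64 = 0xB1;     // B A D C
	const uint8_t ext8 = 0x4E;      // C D A B

	// Starting from ABCD: REV64 then EXT8 produces DCBA (0x1B), the common full reversal.
	uint8_t step1 = ComposeShuffle(rev64, identity);
	uint8_t step2 = ComposeShuffle(ext8, step1);
	assert(step1 == 0xB1 && step2 == 0x1B);
	printf("0x%02X -> 0x%02X -> 0x%02X\n", identity, step1, step2);
	return 0;
}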