From 00e691d6339b035b089f63004801ee92bbfe9c80 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sun, 10 Sep 2023 15:18:44 -0700
Subject: [PATCH] arm64jit: Try shifted MOVI in MOVI2FDUP().

Any penalty from int/float or size change should be less than GPR load.
---
 Common/Arm64Emitter.cpp | 125 ++++++++++++++++++++++++++++++++++++++++
 Common/Arm64Emitter.h   |   4 ++
 2 files changed, 129 insertions(+)

diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index a5d87c5a11..1d2c8b0438 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -4204,6 +4204,14 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
 		if (negate) {
 			FNEG(32, Rd, Rd);
 		}
+	} else if (TryAnyMOVI(32, Rd, ival)) {
+		if (negate) {
+			FNEG(32, Rd, Rd);
+		}
+	} else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) {
+		if (!negate) {
+			FNEG(32, Rd, Rd);
+		}
 	} else {
 		_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
 		if (negate) {
@@ -4214,6 +4222,123 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
 	}
 }
 
+// Tries to fill every size-bit element of Rd with elementValue using one MOVI/MVNI.
+// Returns false (emitting nothing) when no immediate encoding fits; size 8 always fits.
+bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
+	if (size == 8) {
+		// Can always do 8.
+		MOVI(size, Rd, elementValue & 0xFF);
+		return true;
+	} else if (size == 16) {
+		if ((elementValue & 0xFF00) == 0) {
+			MOVI(size, Rd, elementValue & 0xFF, 0);
+			return true;
+		} else if ((elementValue & 0x00FF) == 0) {
+			MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8);
+			return true;
+		} else if ((elementValue & 0xFF00) == 0xFF00) {
+			MVNI(size, Rd, ~elementValue & 0xFF, 0);
+			return true;
+		} else if ((elementValue & 0x00FF) == 0x00FF) {
+			MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8);
+			return true;
+		}
+
+		return false;
+	} else if (size == 32) {
+		// One arbitrary byte, with the other three all zeros (MOVI) or all ones (MVNI.)
+		for (int shift = 0; shift < 32; shift += 8) {
+			uint32_t mask = ~(0xFFU << shift);
+			if ((elementValue & mask) == 0) {
+				MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift);
+				return true;
+			} else if ((elementValue & mask) == mask) {
+				MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift);
+				return true;
+			}
+		}
+
+		// Maybe an MSL shift will work?  That shifts ones in below the byte.
+		for (int shift = 8; shift <= 16; shift += 8) {
+			uint32_t mask = ~(0xFFU << shift);
+			uint32_t ones = (1U << shift) - 1;
+			uint32_t notOnes = 0xFFFFFF00U << shift;
+			if ((elementValue & mask) == ones) {
+				MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true);
+				return true;
+			} else if ((elementValue & mask) == notOnes) {
+				// MVNI inverts the whole expanded immediate, so pass the inverted byte.
+				MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift, true);
+				return true;
+			}
+		}
+
+		return false;
+	} else if (size == 64) {
+		uint8_t imm8 = 0;
+		for (int i = 0; i < 8; ++i) {
+			uint8_t byte = (elementValue >> (i * 8)) & 0xFF;
+			if (byte != 0 && byte != 0xFF)
+				return false;
+
+			if (byte == 0xFF)
+				imm8 |= 1 << i;
+		}
+
+		// Didn't run into any partial bytes, so size 64 is doable.
+		// NOTE(review): this assumes MOVI(64, ...) takes the per-byte mask - confirm.
+		MOVI(size, Rd, imm8);
+		return true;
+	}
+	return false;
+}
+
+// Like TryMOVI(), but also tries other element sizes that give the same register
+// contents.  Returns false (emitting nothing) if no single MOVI/MVNI can do it.
+bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
+	// Anything else can't be replicated below (and TryMOVI() would reject it anyway.)
+	if (size != 8 && size != 16 && size != 32 && size != 64)
+		return false;
+
+	// Try the original size first in case that's more optimal.
+	if (TryMOVI(size, Rd, elementValue))
+		return true;
+
+	// Repeat the element to fill 64 bits, so it can be viewed as any element size.
+	uint64_t value = elementValue;
+	if (size != 64) {
+		// Must be a 64-bit shift: size may be 32, and each copy is size bits apart.
+		const uint64_t masked = elementValue & ((1ULL << size) - 1);
+		value = masked;
+		for (int i = size; i < 64; i += size) {
+			value |= masked << i;
+		}
+	}
+
+	for (int attempt = 8; attempt <= 64; attempt += attempt) {
+		// Original size was already attempted above.
+		if (attempt == size)
+			continue;
+
+		// A narrower element only works if repeating it gives back the full pattern.
+		// Otherwise TryMOVI() would just look at the low bits and drop the rest.
+		if (attempt < size) {
+			const uint64_t part = value & ((1ULL << attempt) - 1);
+			uint64_t repeated = part;
+			for (int i = attempt; i < 64; i += attempt) {
+				repeated |= part << i;
+			}
+			if (repeated != value)
+				continue;
+		}
+
+		if (TryMOVI(attempt, Rd, value))
+			return true;
+	}
+
+	return false;
+}
+
 void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
 	u32 val;
 	bool shift;
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index cd4a54cb73..0c3603d1bf 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -925,6 +925,10 @@ public:
 	void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
 	void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
 
+	bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value);
+	// Allow using a different size. Unclear if there's a penalty.
+	bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value);
+
 	// One source
 	void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);
