From 00e691d6339b035b089f63004801ee92bbfe9c80 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 10 Sep 2023 15:18:44 -0700
Subject: [PATCH] arm64jit: Try shifted MOVI in MOVI2FDUP().

Any penalty from int/float or size change should be less than GPR load.
---
 Common/Arm64Emitter.cpp | 98 +++++++++++++++++++++++++++++++++++++++++
 Common/Arm64Emitter.h   |  4 ++
 2 files changed, 102 insertions(+)

diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index a5d87c5a11..1d2c8b0438 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -4204,6 +4204,14 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
 		if (negate) {
 			FNEG(32, Rd, Rd);
 		}
+	} else if (TryAnyMOVI(32, Rd, ival)) {
+		if (negate) {
+			FNEG(32, Rd, Rd);
+		}
+	} else if (TryAnyMOVI(32, Rd, ival ^ 0x80000000)) {
+		if (!negate) {
+			FNEG(32, Rd, Rd);
+		}
 	} else {
 		_assert_msg_(scratch != INVALID_REG, "Failed to find a way to generate FP immediate %f without scratch", value);
 		if (negate) {
@@ -4214,6 +4222,96 @@ void ARM64FloatEmitter::MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch, bo
 	}
 }
 
+// Tries to emit one MOVI/MVNI setting Rd's size-bit (8/16/32/64) elements to elementValue.
+// Ignores bits above size.  Returns false, emitting nothing, if no encoding fits.
+bool ARM64FloatEmitter::TryMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
+	if (size == 8) {
+		// Can always do 8.
+		MOVI(size, Rd, elementValue & 0xFF);
+		return true;
+	} else if (size == 16) {
+		if ((elementValue & 0xFF00) == 0) {
+			MOVI(size, Rd, elementValue & 0xFF, 0);
+			return true;
+		} else if ((elementValue & 0x00FF) == 0) {
+			MOVI(size, Rd, (elementValue >> 8) & 0xFF, 8);
+			return true;
+		} else if ((elementValue & 0xFF00) == 0xFF00) {
+			MVNI(size, Rd, ~elementValue & 0xFF, 0);
+			return true;
+		} else if ((elementValue & 0x00FF) == 0x00FF) {
+			MVNI(size, Rd, (~elementValue >> 8) & 0xFF, 8);
+			return true;
+		}
+		return false;
+	} else if (size == 32) {
+		// Plain shift: one byte is the immediate, the other three are all zeros or all ones.
+		for (int shift = 0; shift < 32; shift += 8) {
+			const uint32_t others = 0xFFFFFFFFu & ~(0xFFu << shift);
+			if ((elementValue & others) == 0) {
+				MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift);
+				return true;
+			} else if ((elementValue & others) == others) {
+				MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift);
+				return true;
+			}
+		}
+		// Maybe an MSL shift will work?  That fills the bits below the immediate with ones.
+		for (int shift = 8; shift <= 16; shift += 8) {
+			const uint32_t others = 0xFFFFFFFFu & ~(0xFFu << shift);
+			const uint32_t ones = (1u << shift) - 1;
+			const uint32_t notOnes = 0xFFFFFF00u << shift;
+			if ((elementValue & others) == ones) {
+				MOVI(size, Rd, (elementValue >> shift) & 0xFF, shift, true);
+				return true;
+			} else if ((elementValue & others) == notOnes) {
+				// MVNI inverts the whole element, so pass the inverted byte (as done above.)
+				MVNI(size, Rd, (~elementValue >> shift) & 0xFF, shift, true);
+				return true;
+			}
+		}
+		return false;
+	} else if (size == 64) {
+		// One imm8 bit per all-ones byte.  NOTE(review): confirm MOVI(64) expects this packed form.
+		uint8_t imm8 = 0;
+		for (int i = 0; i < 8; ++i) {
+			uint8_t byte = (elementValue >> (i * 8)) & 0xFF;
+			if (byte != 0 && byte != 0xFF)
+				return false;
+			if (byte == 0xFF)
+				imm8 |= 1 << i;
+		}
+		// Didn't run into any partial bytes, so size 64 is doable.
+		MOVI(size, Rd, imm8);
+		return true;
+	}
+	return false;
+}
+
+// Like TryMOVI(), but also tries other element sizes that produce the same repeated bits.
+// Returns false, without emitting anything, when no element size can encode the pattern.
+bool ARM64FloatEmitter::TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t elementValue) {
+	// Try the original size first in case that's more optimal.
+	if (TryMOVI(size, Rd, elementValue))
+		return true;
+	if (size != 16 && size != 32 && size != 64)
+		return false;
+	// Repeat the size-bit element across all 64 bits, doubling the filled width each step.
+	uint64_t value = size < 64 ? elementValue & ((1ULL << size) - 1) : elementValue;
+	for (int width = size; width < 64; width += width)
+		value |= value << width;
+
+	for (int attempt = 8; attempt <= 64; attempt += attempt) {
+		// Skip the size tried above.  A narrower element must repeat to the very same bits.
+		uint64_t repeated = attempt < 64 ? value & ((1ULL << attempt) - 1) : value;
+		for (int width = attempt; width < 64; width += width)
+			repeated |= repeated << width;
+		if (attempt != size && repeated == value && TryMOVI(attempt, Rd, value))
+			return true;
+	}
+	return false;
+}
+
 void ARM64XEmitter::SUBSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
 	u32 val;
 	bool shift;
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index cd4a54cb73..0c3603d1bf 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -925,6 +925,10 @@ public:
 	void ORR(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
 	void BIC(u8 size, ARM64Reg Rd, u8 imm8, u8 shift = 0);
 
+	bool TryMOVI(u8 size, ARM64Reg Rd, uint64_t value);  // Emits a MOVI/MVNI if value is encodable at this size, else returns false.
+	// Same, but allows using a different element size.  Unclear if there's a penalty for that.
+	bool TryAnyMOVI(u8 size, ARM64Reg Rd, uint64_t value);
+
 	// One source
 	void FCVT(u8 size_to, u8 size_from, ARM64Reg Rd, ARM64Reg Rn);