Better logical-immediate support in the ARM64 emitter. The encodability check is taken from V8.

2025-04-02 11:01:50 -04:00 · 2015-03-07 14:44:15 +01:00 · 2015-03-07 14:44:15 +01:00 · 3aebc06329
commit 3aebc06329
parent b309c83973
4 changed files with 298 additions and 4 deletions
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@ -6,6 +6,8 @@
 #include <algorithm>
 #include <cmath>

+#include "base/basictypes.h"
+
 #include "Arm64Emitter.h"
 #include "MathUtil.h"
 #include "CommonTypes.h"
@ -13,6 +15,232 @@
 namespace Arm64Gen
 {

+// Register widths in bits. IsImmLogical compares its `width` argument against
+// kWRegSizeInBits and passes kXRegSizeInBits to CountLeadingZeros.
+const int kWRegSizeInBits = 32;
+const int kXRegSizeInBits = 64;
+
+// The next few helper functions are taken from V8.
+// Counts the zero bits above the highest set bit of `value`, looking only at
+// the low `width` bits. Returns `width` when none of those bits is set.
+// `width` is expected to be in the range 1..64; every caller in this file
+// passes kXRegSizeInBits.
+int CountLeadingZeros(uint64_t value, int width) {
+	// TODO(jbramley): Optimize this for ARM64 hosts.
+	int count = 0;
+	// Build the probe bit in a 64-bit type. `1UL` is only 32 bits wide where
+	// `unsigned long` is 32 bits (e.g. Windows or 32-bit builds), and shifting
+	// it by 63 would be undefined behaviour.
+	uint64_t bit_test = static_cast<uint64_t>(1) << (width - 1);
+	while ((count < width) && ((bit_test & value) == 0)) {
+		count++;
+		bit_test >>= 1;
+	}
+	return count;
+}
+
+// Returns the lowest set bit of `value` as a power-of-two mask, which is the
+// largest power of two dividing `value`. Returns 0 when `value` is 0.
+uint64_t LargestPowerOf2Divisor(uint64_t value) {
+	// Two's-complement negation of the unsigned input, written out explicitly.
+	const uint64_t negated = ~value + 1;
+	return value & negated;
+}
+
+// Returns true when exactly one bit of `x` is set. Zero is not a power of two.
+bool IsPowerOfTwo(uint64_t x) {
+	if (x == 0) {
+		return false;
+	}
+	// Clearing the lowest set bit leaves nothing behind only for a single-bit value.
+	const uint64_t without_lowest_bit = x & (x - 1);
+	return without_lowest_bit == 0;
+}
+
+// Casts its argument to uint64_t; IsImmLogical below uses it to build 64-bit
+// constants.
+#define V8_UINT64_C(x) ((uint64_t)(x))
+
+// Decides whether `value` can be encoded as an ARM64 logical immediate for a
+// register of `width` bits (kWRegSizeInBits or kXRegSizeInBits).
+//
+// On success returns true and stores the encoding fields in *n, *imm_s and
+// *imm_r. On failure returns false and leaves the three outputs untouched.
+// An input of zero, or one that is all 1 bits across `width`, is reported as
+// not encodable. The output pointers are written without a null check, so
+// all three must be valid.
+static bool IsImmLogical(uint64_t value,
+                          unsigned int width,
+                          unsigned int *n,
+                          unsigned int *imm_s,
+                          unsigned int *imm_r) {
+  //DCHECK((n != NULL) && (imm_s != NULL) && (imm_r != NULL));
+  // DCHECK((width == kWRegSizeInBits) || (width == kXRegSizeInBits));
+
+  bool negate = false;
+
+  // Logical immediates are encoded using parameters n, imm_s and imm_r using
+  // the following table:
+  //
+  //    N   imms    immr    size        S             R
+  //    1  ssssss  rrrrrr    64    UInt(ssssss)  UInt(rrrrrr)
+  //    0  0sssss  xrrrrr    32    UInt(sssss)   UInt(rrrrr)
+  //    0  10ssss  xxrrrr    16    UInt(ssss)    UInt(rrrr)
+  //    0  110sss  xxxrrr     8    UInt(sss)     UInt(rrr)
+  //    0  1110ss  xxxxrr     4    UInt(ss)      UInt(rr)
+  //    0  11110s  xxxxxr     2    UInt(s)       UInt(r)
+  // (s bits must not be all set)
+  //
+  // A pattern is constructed of size bits, where the least significant S+1 bits
+  // are set. The pattern is rotated right by R, and repeated across a 32 or
+  // 64-bit value, depending on destination register width.
+  //
+  // Put another way: the basic format of a logical immediate is a single
+  // contiguous stretch of 1 bits, repeated across the whole word at intervals
+  // given by a power of 2. To identify them quickly, we first locate the
+  // lowest stretch of 1 bits, then the next 1 bit above that; that combination
+  // is different for every logical immediate, so it gives us all the
+  // information we need to identify the only logical immediate that our input
+  // could be, and then we simply check if that's the value we actually have.
+  //
+  // (The rotation parameter does give the possibility of the stretch of 1 bits
+  // going 'round the end' of the word. To deal with that, we observe that in
+  // any situation where that happens the bitwise NOT of the value is also a
+  // valid logical immediate. So we simply invert the input whenever its low bit
+  // is set, and then we know that the rotated case can't arise.)
+
+  if (value & 1) {
+    // If the low bit is 1, negate the value, and set a flag to remember that we
+    // did (so that we can adjust the return values appropriately).
+    negate = true;
+    value = ~value;
+  }
+
+  if (width == kWRegSizeInBits) {
+    // To handle 32-bit logical immediates, the very easiest thing is to repeat
+    // the input value twice to make a 64-bit word. The correct encoding of that
+    // as a logical immediate will also be the correct encoding of the 32-bit
+    // value.
+
+    // The most-significant 32 bits may not be zero (ie. negate is true) so
+    // shift the value left before duplicating it.
+    value <<= kWRegSizeInBits;
+    value |= value >> kWRegSizeInBits;
+  }
+
+  // The basic analysis idea: imagine our input word looks like this.
+  //
+  //    0011111000111110001111100011111000111110001111100011111000111110
+  //                                                          c  b    a
+  //                                                          |<--d-->|
+  //
+  // We find the lowest set bit (as an actual power-of-2 value, not its index)
+  // and call it a. Then we add a to our original number, which wipes out the
+  // bottommost stretch of set bits and replaces it with a 1 carried into the
+  // next zero bit. Then we look for the new lowest set bit, which is in
+  // position b, and subtract it, so now our number is just like the original
+  // but with the lowest stretch of set bits completely gone. Now we find the
+  // lowest set bit again, which is position c in the diagram above. Then we'll
+  // measure the distance d between bit positions a and c (using CLZ), and that
+  // tells us that the only valid logical immediate that could possibly be equal
+  // to this number is the one in which a stretch of bits running from a to just
+  // below b is replicated every d bits.
+  uint64_t a = LargestPowerOf2Divisor(value);
+  uint64_t value_plus_a = value + a;
+  uint64_t b = LargestPowerOf2Divisor(value_plus_a);
+  uint64_t value_plus_a_minus_b = value_plus_a - b;
+  uint64_t c = LargestPowerOf2Divisor(value_plus_a_minus_b);
+
+  int d, clz_a, out_n;
+  uint64_t mask;
+
+  if (c != 0) {
+    // The general case, in which there is more than one stretch of set bits.
+    // Compute the repeat distance d, and set up a bitmask covering the basic
+    // unit of repetition (i.e. a word with the bottom d bits set). Also, in all
+    // of these cases the N bit of the output will be zero.
+    clz_a = CountLeadingZeros(a, kXRegSizeInBits);
+    int clz_c = CountLeadingZeros(c, kXRegSizeInBits);
+    d = clz_a - clz_c;
+    mask = ((V8_UINT64_C(1) << d) - 1);
+    out_n = 0;
+  } else {
+    // Handle degenerate cases.
+    //
+    // If any of those 'find lowest set bit' operations didn't find a set bit at
+    // all, then the word will have been zero thereafter, so in particular the
+    // last lowest_set_bit operation will have returned zero. So we can test for
+    // all the special case conditions in one go by seeing if c is zero.
+    if (a == 0) {
+      // The input was zero (or all 1 bits, which will come to here too after we
+      // inverted it at the start of the function), for which we just return
+      // false.
+      return false;
+    } else {
+      // Otherwise, if c was zero but a was not, then there's just one stretch
+      // of set bits in our word, meaning that we have the trivial case of
+      // d == 64 and only one 'repetition'. Set up all the same variables as in
+      // the general case above, and set the N bit in the output.
+      clz_a = CountLeadingZeros(a, kXRegSizeInBits);
+      d = 64;
+      mask = ~V8_UINT64_C(0);
+      out_n = 1;
+    }
+  }
+
+  // If the repeat period d is not a power of two, it can't be encoded.
+  if (!IsPowerOfTwo(d)) {
+    return false;
+  }
+
+  if (((b - a) & ~mask) != 0) {
+    // If the bit stretch (b - a) does not fit within the mask derived from the
+    // repeat period, then fail.
+    return false;
+  }
+
+  // The only possible option is b - a repeated every d bits. Now we're going to
+  // actually construct the valid logical immediate derived from that
+  // specification, and see if it equals our original input.
+  //
+  // To repeat a value every d bits, we multiply it by a number of the form
+  // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can
+  // be derived using a table lookup on CLZ(d).
+  static const uint64_t multipliers[] = {
+    0x0000000000000001UL,
+    0x0000000100000001UL,
+    0x0001000100010001UL,
+    0x0101010101010101UL,
+    0x1111111111111111UL,
+    0x5555555555555555UL,
+  };
+  int multiplier_idx = CountLeadingZeros(d, kXRegSizeInBits) - 57;
+  // Ensure that the index to the multipliers array is within bounds.
+  _dbg_assert_(JIT, (multiplier_idx >= 0) &&
+         (static_cast<size_t>(multiplier_idx) < ARRAY_SIZE(multipliers)));
+  uint64_t multiplier = multipliers[multiplier_idx];
+  uint64_t candidate = (b - a) * multiplier;
+
+  if (value != candidate) {
+    // The candidate pattern doesn't match our input value, so fail.
+    return false;
+  }
+
+  // We have a match! This is a valid logical immediate, so now we have to
+  // construct the bits and pieces of the instruction encoding that generates
+  // it.
+
+  // Count the set bits in our basic stretch. The special case of clz(0) == -1
+  // makes the answer come out right for stretches that reach the very top of
+  // the word (e.g. numbers like 0xffffc00000000000).
+  int clz_b = (b == 0) ? -1 : CountLeadingZeros(b, kXRegSizeInBits);
+  int s = clz_a - clz_b;
+
+  // Decide how many bits to rotate right by, to put the low bit of that basic
+  // stretch in position a.
+  int r;
+  if (negate) {
+    // If we inverted the input right at the start of this function, here's
+    // where we compensate: the number of set bits becomes the number of clear
+    // bits, and the rotation count is based on position b rather than position
+    // a (since b is the location of the 'lowest' 1 bit after inversion).
+    s = d - s;
+    r = (clz_b + 1) & (d - 1);
+  } else {
+    r = (clz_a + 1) & (d - 1);
+  }
+
+  // Now we're done, except for having to encode the S output in such a way that
+  // it gives both the number of set bits and the length of the repeated
+  // segment. The s field is encoded like this:
+  //
+  //     imms    size        S
+  //    ssssss    64    UInt(ssssss)
+  //    0sssss    32    UInt(sssss)
+  //    10ssss    16    UInt(ssss)
+  //    110sss     8    UInt(sss)
+  //    1110ss     4    UInt(ss)
+  //    11110s     2    UInt(s)
+  //
+  // So we 'or' (-d * 2) with our computed s to form imms. The doubling is
+  // written as a multiplication rather than (-d << 1) because left-shifting a
+  // negative int is undefined behaviour before C++20; the resulting bit
+  // pattern is the same.
+  *n = out_n;
+  *imm_s = ((-d * 2) | (s - 1)) & 0x3f;
+  *imm_r = r;
+
+  return true;
+}
+
 void ARM64XEmitter::SetCodePtr(u8* ptr)
 {
 	m_code = ptr;
@ -704,6 +932,12 @@ void ARM64XEmitter::BL(const void* ptr)
 	EncodeUnconditionalBranchInst(1, ptr);
 }

+// Emits a call to `func` by forwarding to BL(func). `scratchreg` is accepted
+// but not used yet (see the TODO below).
+// NOTE(review): no fallback is emitted here when `func` is out of BL's reach;
+// confirm how BL reports or handles that case.
+void ARM64XEmitter::QuickCallFunction(ARM64Reg scratchreg, const void *func) {
+	// TODO: Add special code to use the scratch reg if the call distance is too great.
+	BL(func);
+}
+
+
 // Unconditional Branch (register)
 void ARM64XEmitter::BR(ARM64Reg Rn)
 {
@ -1114,7 +1348,7 @@ void ARM64XEmitter::ORR(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
 }
 void ARM64XEmitter::TST(ARM64Reg Rn, u32 immr, u32 imms)
 {
-	EncodeLogicalImmInst(3, SP, Rn, immr, imms);
+	// Pass the destination whose width matches Rn (SP for a 64-bit Rn, WSP
+	// otherwise) instead of always passing the 64-bit SP.
+	EncodeLogicalImmInst(3, Is64Bit(Rn) ? SP : WSP, Rn, immr, imms);
 }

 // Add/subtract (immediate)
@ -2784,5 +3018,35 @@ void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mas
 	}
 }

+// Emits Rd = Rn & imm. When `imm` is encodable as a logical immediate for
+// Rn's width, the immediate form of AND is used. Otherwise `imm` is loaded
+// into `scratch` with MOVI2R (overwriting it) and the register form is used.
+// NOTE(review): the N field computed by IsImmLogical is not forwarded to
+// AND(); confirm the immediate overload derives it correctly for 64-bit
+// registers.
+void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
+	const unsigned int reg_width = Is64Bit(Rn) ? 64 : 32;
+	unsigned int n_field;
+	unsigned int s_field;
+	unsigned int r_field;
+	if (!IsImmLogical(imm, reg_width, &n_field, &s_field, &r_field)) {
+		// Not encodable: materialise the constant and use the register form.
+		MOVI2R(scratch, imm);
+		AND(Rd, Rn, scratch);
+		return;
+	}
+	AND(Rd, Rn, r_field, s_field);
+}
+
+// Emits Rd = Rn | imm. When `imm` is encodable as a logical immediate for
+// Rn's width, the immediate form of ORR is used. Otherwise `imm` is loaded
+// into `scratch` with MOVI2R (overwriting it) and the register form is used.
+// NOTE(review): the N field computed by IsImmLogical is not forwarded to
+// ORR(); confirm the immediate overload derives it correctly for 64-bit
+// registers.
+void ARM64XEmitter::ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
+	const unsigned int reg_width = Is64Bit(Rn) ? 64 : 32;
+	unsigned int n_field;
+	unsigned int s_field;
+	unsigned int r_field;
+	if (!IsImmLogical(imm, reg_width, &n_field, &s_field, &r_field)) {
+		// Not encodable: materialise the constant and use the register form.
+		MOVI2R(scratch, imm);
+		ORR(Rd, Rn, scratch);
+		return;
+	}
+	ORR(Rd, Rn, r_field, s_field);
+}
+
+// Emits ANDS Rd, Rn, imm. When `imm` is encodable as a logical immediate for
+// Rn's width, the immediate form of ANDS is used. Otherwise `imm` is loaded
+// into `scratch` with MOVI2R (overwriting it) and the register form is used.
+// NOTE(review): the N field computed by IsImmLogical is not forwarded to
+// ANDS(); confirm the immediate overload derives it correctly for 64-bit
+// registers.
+void ARM64XEmitter::ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
+	const unsigned int reg_width = Is64Bit(Rn) ? 64 : 32;
+	unsigned int n_field;
+	unsigned int s_field;
+	unsigned int r_field;
+	if (!IsImmLogical(imm, reg_width, &n_field, &s_field, &r_field)) {
+		// Not encodable: materialise the constant and use the register form.
+		MOVI2R(scratch, imm);
+		ANDS(Rd, Rn, scratch);
+		return;
+	}
+	ANDS(Rd, Rn, r_field, s_field);
+}
+
 }

--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@ -79,7 +79,7 @@ enum ARM64Reg
 	INVALID_REG = 0xFFFFFFFF
 };

-inline bool Is64Bit(ARM64Reg reg) { return reg & 0x20; }
+// Reports whether bit 0x20 is set in `reg`. The explicit `!= 0` makes the
+// result a real bool instead of relying on an implicit int-to-bool conversion.
+inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
 inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; }
 inline bool IsDouble(ARM64Reg reg) { return (reg & 0xC0) == 0x80; }
 inline bool IsQuad(ARM64Reg reg) { return (reg & 0xC0) == 0xC0; }
@ -500,6 +500,17 @@ public:
 	void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
 	void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
 	void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ArithOption Shift);
+
+	// Three-operand convenience forms of the shifted-register logical ops above.
+	// Each forwards to the four-argument overload with ArithOption(Rd, 0).
+	// NOTE(review): ArithOption(Rd, 0) presumably means "no shift"; confirm
+	// against the ArithOption constructor.
+	void AND(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { AND(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void BIC(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BIC(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void ORR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORR(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void ORN(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ORN(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void EOR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EOR(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void EON(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { EON(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void ANDS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { ANDS(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+	void BICS(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { BICS(Rd, Rn, Rm, ArithOption(Rd, 0)); }
+
 	void MOV(ARM64Reg Rd, ARM64Reg Rm);
 	void MVN(ARM64Reg Rd, ARM64Reg Rm);

@ -611,6 +622,12 @@ public:
 	// Wrapper around MOVZ+MOVK
 	void MOVI2R(ARM64Reg Rd, u64 imm, bool optimize = true);

+	// Wrappers around AND/ANDS/ORR with an arbitrary immediate. The definitions
+	// use the immediate instruction form when `imm` is encodable as a logical
+	// immediate for Rn's width; otherwise they load `imm` into `scratch` with
+	// MOVI2R (overwriting it) and use the register form.
+	void ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
+	void ANDSI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
+	// Forwards to ANDSI2R with SP (64-bit Rn) or WSP (32-bit Rn) as the destination.
+	void TSTI2R(ARM64Reg Rn, u64 imm, ARM64Reg scratch) { ANDSI2R(Is64Bit(Rn) ? SP : WSP, Rn, imm, scratch); }
+	void ORI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch);
+
 	// ABI related
 	void ABI_PushRegisters(BitSet32 registers);
 	void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
@ -637,6 +654,12 @@ public:
 		MOVI2R(X0, (u64)const_cast<void*>((const void*)f));
 		return X30;
 	}
+
+	// Plain function call. The out-of-line definition currently emits BL(func)
+	// and does not use `scratchreg`.
+	void QuickCallFunction(ARM64Reg scratchreg, const void *func);
+	// Convenience overload: casts `func` to const void* and forwards to the
+	// overload above.
+	template <typename T> void QuickCallFunction(ARM64Reg scratchreg, T func) {
+		QuickCallFunction(scratchreg, (const void *)func);
+	}
 };

 class ARM64FloatEmitter
@ -748,6 +771,7 @@ public:
 	// vector x indexed element
 	void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);

+
 	// ABI related
 	void ABI_PushRegisters(BitSet32 registers);
 	void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
--- a/Common/ArmCPUDetect.cpp
+++ b/Common/ArmCPUDetect.cpp
@ -160,7 +160,7 @@ void CPUInfo::Detect()
 {
 	// Set some defaults here
 	HTT = false;
-#ifdef _M_ARM_64
+#ifdef ARM64
 	OS64bit = true;
 	CPU64bit = true;
 	Mode64bit = true;
--- a/Common/MemArena.cpp
+++ b/Common/MemArena.cpp
@ -229,7 +229,13 @@ u8* MemArena::Find4GBBase()
 	return reinterpret_cast<u8*>(0x2300000000ULL);
 #endif

-#else // 32 bit
+#elif defined(ARM64)
+
+	// Very precarious - mmap cannot return an error when trying to map already used pages.
+	// This makes the Windows approach above unusable on Linux, so we will simply pray...
+	return reinterpret_cast<u8*>(0x2300000000ULL);
+
+#else

 #ifdef _WIN32
 	u8* base = (u8*)VirtualAlloc(0, 0x10000000, MEM_RESERVE, PAGE_READWRITE);