diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index a8c7984584..a818eb6ced 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -1913,17 +1913,25 @@ inline int64_t abs64(int64_t x) {
 	return x >= 0 ? x : -x;
 }
 
+static int Count(const bool part[4]) {
+	int cnt = 0;
+	for (int i = 0; i < 4; i++) {
+		if (part[i])
+			cnt++;
+	}
+	return cnt;
+}
+
 // Wrapper around MOVZ+MOVK (and later MOVN)
 void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
 {
 	unsigned int parts = Is64Bit(Rd) ? 4 : 2;
-	BitSet32 upload_part(0);
+	bool upload_part[4]{};  // Zero-initialized, like the BitSet32(0) it replaces.
 
 	// Always start with a movz! Kills the dependency on the register.
 	bool use_movz = true;
 
-	if (!imm)
-	{
+	if (!imm) {
 		// Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks clearer in disasm too.
 		MOVZ(Rd, 0, SHIFT_0);
 		return;
@@ -1961,7 +1969,7 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
 		u64 aligned_pc = (u64)GetCodePointer() & ~0xFFF;
 		s64 aligned_offset = (s64)imm - (s64)aligned_pc;
 
-		if (upload_part.Count() > 1 && abs64(aligned_offset) < 0xFFFFFFFFLL)
+		if (Count(upload_part) > 1 && abs64(aligned_offset) < 0xFFFFFFFFLL)
 		{
 			// Immediate we are loading is within 4GB of our aligned range
 			// Most likely a address that we can load in one or two instructions
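Note: the Count() helper above does for a plain bool[4] what BitSet32::Count() used to do for MOVI2R: it counts how many 16-bit chunks of the immediate actually need a MOVZ/MOVK each. A standalone host-side sketch of that splitting (CountImmParts is a hypothetical name, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Counts the non-zero 16-bit parts of imm; each one costs MOVI2R one
    // MOVZ (the first) or MOVK (each subsequent) instruction.
    static int CountImmParts(uint64_t imm, int parts) {
        int count = 0;
        for (int i = 0; i < parts; i++) {
            if ((imm >> (i * 16)) & 0xFFFF)
                count++;
        }
        return count;
    }

    int main() {
        printf("%d\n", CountImmParts(0x0000123400005678ULL, 4));  // 2: MOVZ + one MOVK
        printf("%d\n", CountImmParts(0xFFFFFFFFFFFF1234ULL, 4));  // 4: the "later MOVN" case
        return 0;
    }
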
@@ -2015,115 +2023,11 @@ void ARM64XEmitter::POP(ARM64Reg Rd) {
 void ARM64XEmitter::PUSH2(ARM64Reg Rd, ARM64Reg Rn) {
 	STP(INDEX_PRE, Rd, Rn, SP, -16);
 }
+
 void ARM64XEmitter::POP2(ARM64Reg Rd, ARM64Reg Rn) {
 	LDP(INDEX_POST, Rd, Rn, SP, 16);
 }
-
-void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
-{
-	int num_regs = registers.Count();
-
-	if (num_regs % 2)
-	{
-		bool first = true;
-
-		// Stack is required to be quad-word aligned.
-		u32 stack_size = ROUND_UP(num_regs * 8, 16);
-		u32 current_offset = 0;
-		std::vector<ARM64Reg> reg_pair;
-
-		for (auto it : registers)
-		{
-			if (first)
-			{
-				STR(INDEX_PRE, (ARM64Reg)(X0 + it), SP, -(s32)stack_size);
-				first = false;
-				current_offset += 16;
-			}
-			else
-			{
-				reg_pair.push_back((ARM64Reg)(X0 + it));
-				if (reg_pair.size() == 2)
-				{
-					STP(INDEX_UNSIGNED, reg_pair[0], reg_pair[1], SP, current_offset);
-					reg_pair.clear();
-					current_offset += 16;
-				}
-			}
-		}
-	}
-	else
-	{
-		std::vector<ARM64Reg> reg_pair;
-
-		for (auto it : registers)
-		{
-			reg_pair.push_back((ARM64Reg)(X0 + it));
-			if (reg_pair.size() == 2)
-			{
-				STP(INDEX_PRE, reg_pair[0], reg_pair[1], SP, -16);
-				reg_pair.clear();
-			}
-		}
-	}
-}
-
-void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
-{
-	int num_regs = registers.Count();
-
-	if (num_regs % 2)
-	{
-		bool first = true;
-
-		std::vector<ARM64Reg> reg_pair;
-
-		for (auto it : registers)
-		{
-			if (ignore_mask[it])
-				it = WSP;
-
-			if (first)
-			{
-				LDR(INDEX_POST, (ARM64Reg)(X0 + it), SP, 16);
-				first = false;
-			}
-			else
-			{
-				reg_pair.push_back((ARM64Reg)(X0 + it));
-				if (reg_pair.size() == 2)
-				{
-					LDP(INDEX_POST, reg_pair[0], reg_pair[1], SP, 16);
-					reg_pair.clear();
-				}
-			}
-		}
-	}
-	else
-	{
-		std::vector<ARM64Reg> reg_pair;
-
-		for (int i = 31; i >= 0; --i)
-		{
-			if (!registers[i])
-				continue;
-
-			int reg = i;
-
-			if (ignore_mask[reg])
-				reg = WSP;
-
-			reg_pair.push_back((ARM64Reg)(X0 + reg));
-			if (reg_pair.size() == 2)
-			{
-				LDP(INDEX_POST, reg_pair[1], reg_pair[0], SP, 16);
-				reg_pair.clear();
-			}
-		}
-	}
-}
-
 // Float Emitter
 void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
 {
@@ -3658,161 +3562,92 @@ void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 	EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
 }
 
-void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
-{
-	bool bundled_loadstore = false;
-
-	for (int i = 0; i < 32; ++i)
-	{
-		if (!registers[i])
-			continue;
-
-		int count = 0;
-		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-		if (count > 1)
-		{
-			bundled_loadstore = true;
-			break;
-		}
-	}
-
-	if (bundled_loadstore && tmp != INVALID_REG)
-	{
-		int num_regs = registers.Count();
-		m_emit->SUB(SP, SP, num_regs * 16);
-		m_emit->ADD(tmp, SP, 0);
-		std::vector<ARM64Reg> island_regs;
-		for (int i = 0; i < 32; ++i)
-		{
-			if (!registers[i])
-				continue;
-
-			int count = 0;
-
-			// 0 = true
-			// 1 < 4 && registers[i + 1] true!
-			// 2 < 4 && registers[i + 2] true!
-			// 3 < 4 && registers[i + 3] true!
-			// 4 < 4 && registers[i + 4] false!
-			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-
-			if (count == 1)
-				island_regs.push_back((ARM64Reg)(Q0 + i));
-			else
-				ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
-
-			i += count - 1;
-		}
-
-		// Handle island registers
-		std::vector<ARM64Reg> pair_regs;
-		for (auto& it : island_regs)
-		{
-			pair_regs.push_back(it);
-			if (pair_regs.size() == 2)
-			{
-				STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
-				pair_regs.clear();
-			}
-		}
-		if (pair_regs.size())
-			STR(128, INDEX_POST, pair_regs[0], tmp, 16);
-	}
-	else
-	{
-		std::vector<ARM64Reg> pair_regs;
-		for (auto it : registers)
-		{
-			pair_regs.push_back((ARM64Reg)(Q0 + it));
-			if (pair_regs.size() == 2)
-			{
-				STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
-				pair_regs.clear();
-			}
-		}
-		if (pair_regs.size())
-			STR(128, INDEX_PRE, pair_regs[0], SP, -16);
-	}
-}
+void ARM64FloatEmitter::ABI_PushRegisters(uint32_t registers, uint32_t fp_registers) {
+	_assert_msg_(DYNA_REC, (registers & 0x60000000) == 0, "ABI_PushRegisters: Do not include FP and LR, those are handled non-conditionally");
+
+	ARM64Reg gprs[32]{}, fprs[32]{};
+	int num_gprs = 0, num_fprs = 0;
+	for (int i = 0; i < 29; i++) {
+		if (registers & (1U << i))
+			gprs[num_gprs++] = (ARM64Reg)(X0 + i);
+	}
+
+	for (int i = 0; i < 32; i++) {
+		if (fp_registers & (1U << i))
+			fprs[num_fprs++] = (ARM64Reg)(D0 + i);
+	}
+
+	u32 stack_size = 16 + ROUND_UP(num_gprs * 8, 16) + ROUND_UP(num_fprs * 8, 16);
+
+	// Stack is required to be quad-word aligned.
+	if (stack_size < 256) {
+		m_emit->STP(INDEX_PRE, FP, LR, SP, -(s32)stack_size);
+	} else {
+		m_emit->SUB(SP, SP, stack_size);
+		m_emit->STP(INDEX_SIGNED, FP, LR, SP, 0);
+	}
+	m_emit->MOVfromSP(X29);  // Set new frame pointer
+	int offset = 16;
+	for (int i = 0; i < num_gprs / 2; i++) {
+		m_emit->STP(INDEX_SIGNED, gprs[i * 2], gprs[i * 2 + 1], X29, offset);
+		offset += 16;
+	}
+	// Do the straggler. Advance a full 16 bytes to keep the FPR area aligned.
+	if (num_gprs & 1) {
+		m_emit->STR(INDEX_UNSIGNED, gprs[num_gprs - 1], X29, offset);
+		offset += 16;
+	}
+
+	if (num_fprs) {
+		// OK, and now for the FPRs. X29 still points at the frame base, so
+		// plain unsigned offsets from it are enough.
+		for (int i = 0; i < num_fprs; i++) {
+			STR(64, INDEX_UNSIGNED, fprs[i], X29, offset);
+			offset += 8;
+		}
+	}
+}
 
-void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
-{
-	bool bundled_loadstore = false;
-	int num_regs = registers.Count();
-
-	for (int i = 0; i < 32; ++i)
-	{
-		if (!registers[i])
-			continue;
-
-		int count = 0;
-		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-		if (count > 1)
-		{
-			bundled_loadstore = true;
-			break;
-		}
-	}
-
-	if (bundled_loadstore && tmp != INVALID_REG)
-	{
-		// The temporary register is only used to indicate that we can use this code path
-		std::vector<ARM64Reg> island_regs;
-		for (int i = 0; i < 32; ++i)
-		{
-			if (!registers[i])
-				continue;
-
-			int count = 0;
-			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-
-			if (count == 1)
-				island_regs.push_back((ARM64Reg)(Q0 + i));
-			else
-				LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
-
-			i += count - 1;
-		}
-
-		// Handle island registers
-		std::vector<ARM64Reg> pair_regs;
-		for (auto& it : island_regs)
-		{
-			pair_regs.push_back(it);
-			if (pair_regs.size() == 2)
-			{
-				LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
-				pair_regs.clear();
-			}
-		}
-		if (pair_regs.size())
-			LDR(128, INDEX_POST, pair_regs[0], SP, 16);
-	}
-	else
-	{
-		bool odd = (num_regs % 2) != 0;
-		std::vector<ARM64Reg> pair_regs;
-		for (int i = 31; i >= 0; --i)
-		{
-			if (!registers[i])
-				continue;
-
-			if (odd)
-			{
-				// First load must be a regular LDR if odd
-				odd = false;
-				LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
-			}
-			else
-			{
-				pair_regs.push_back((ARM64Reg)(Q0 + i));
-				if (pair_regs.size() == 2)
-				{
-					LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
-					pair_regs.clear();
-				}
-			}
-		}
-	}
-}
+void ARM64FloatEmitter::ABI_PopRegisters(uint32_t registers, uint32_t fp_registers) {
+	ARM64Reg gprs[32]{}, fprs[32]{};
+	int num_gprs = 0, num_fprs = 0;
+	for (int i = 0; i < 29; i++) {
+		if (registers & (1U << i))
+			gprs[num_gprs++] = (ARM64Reg)(X0 + i);
+	}
+
+	for (int i = 0; i < 32; i++) {
+		if (fp_registers & (1U << i))
+			fprs[num_fprs++] = (ARM64Reg)(D0 + i);
+	}
+
+	// Must mirror the frame layout and size computed by ABI_PushRegisters exactly.
+	u32 stack_size = 16 + ROUND_UP(num_gprs * 8, 16) + ROUND_UP(num_fprs * 8, 16);
+
+	// SP points to the bottom. We're gonna walk it upwards.
+	// Reload FP, LR.
+	m_emit->LDP(INDEX_SIGNED, FP, LR, SP, 0);
+	int offset = 16;
+	for (int i = 0; i < num_gprs / 2; i++) {
+		m_emit->LDP(INDEX_SIGNED, gprs[i * 2], gprs[i * 2 + 1], SP, offset);
+		offset += 16;
+	}
+	// Do the straggler.
+	if (num_gprs & 1) {
+		m_emit->LDR(INDEX_UNSIGNED, gprs[num_gprs - 1], SP, offset);
+		offset += 16;
+	}
+
+	// Time for the FP regs.
+	for (int i = 0; i < num_fprs; i++) {
+		LDR(64, INDEX_UNSIGNED, fprs[i], SP, offset);
+		offset += 8;
+	}
+
+	// Restore the stack pointer.
+	m_emit->ADD(SP, SP, stack_size);
+}
 
 void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
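Note: the push/pop pair above builds a conventional AArch64 frame: FP/LR at the bottom, then GPR pairs, then the FPR spill area, each piece rounded so SP stays quad-word aligned. A host-side sketch of the size math (illustrative only; RoundUp16 stands in for the emitter's ROUND_UP):

    #include <cstdint>
    #include <cstdio>

    static uint32_t RoundUp16(uint32_t x) { return (x + 15) & ~15u; }

    int main() {
        // With ALL_CALLEE_SAVED (R19-R28) and ALL_CALLEE_SAVED_FP (D8-D15):
        int num_gprs = 10, num_fprs = 8;
        // Frame, from SP upwards:
        //   [SP +  0]  FP, LR                 (16 bytes)
        //   [SP + 16]  GPR pairs + straggler  (RoundUp16(num_gprs * 8))
        //   [SP + ..]  FPR spill area         (RoundUp16(num_fprs * 8))
        uint32_t stack_size = 16 + RoundUp16(num_gprs * 8) + RoundUp16(num_fprs * 8);
        printf("stack_size = %u\n", stack_size);  // 160, quad-word aligned
        return 0;
    }
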
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index df4c78f3d8..1b65e69d66 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -85,12 +85,14 @@ enum ARM64Reg
 	WZR = WSP,
 	ZR = SP,
 
+	FP = X29,
+	LR = X30,
 	INVALID_REG = 0xFFFFFFFF
 };
 
-// R19-R28, R29 (FP), R30 (LR). FP seems questionable?
-const u32 ALL_CALLEE_SAVED = 0x7FF80000;
+// R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately.
+const u32 ALL_CALLEE_SAVED = 0x1FF80000;
 const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15
 
 inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
@@ -720,10 +722,6 @@ public:
 	bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
 	bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
 
-	// ABI related
-	void ABI_PushRegisters(BitSet32 registers);
-	void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
-
 	// Pseudo-instruction for convenience. PUSH pushes 16 bytes even though we only push a single register.
 	// This is so the stack pointer is always 16-byte aligned, which is checked by hardware!
 	void PUSH(ARM64Reg Rd);
@@ -943,8 +941,8 @@ public:
 	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
 
 	// ABI related
-	void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
-	void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+	void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
+	void ABI_PopRegisters(uint32_t gpr_registers, uint32_t fp_registers);
 
 private:
 	ARM64XEmitter* m_emit;
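Note: with BitSet32 gone, these constants are plain bit masks where bit n stands for Xn (or Dn in the FP mask). A quick host-side sketch of how 0x1FF80000 corresponds to R19-R28 (standalone, not part of the header):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t mask = 0;
        for (int i = 19; i <= 28; i++)  // the AAPCS64 callee-saved GPRs
            mask |= 1u << i;
        printf("0x%08X\n", mask);       // prints 0x1FF80000 == ALL_CALLEE_SAVED
        return 0;
    }
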
diff --git a/Common/BitSet.h b/Common/BitSet.h
index d60b507dd4..2fbc80011b 100644
--- a/Common/BitSet.h
+++ b/Common/BitSet.h
@@ -3,8 +3,6 @@
 #pragma once
 
 #include <cstddef>
-#include <initializer_list>
-#include <type_traits>
 #include "CommonTypes.h"
 
 // Helper functions:
@@ -12,8 +10,7 @@
 #ifdef _WIN32
 #include <intrin.h>
 template <typename T>
-static inline int CountSetBits(T v)
-{
+inline int CountSetBits(T v) {
 	// from https://graphics.stanford.edu/~seander/bithacks.html
 	// GCC has this built in, but MSVC's intrinsic will only emit the actual
 	// POPCNT instruction, which we're not depending on
@@ -22,14 +19,14 @@
 	v = (v + (v >> 4)) & (T)~(T)0/255*15;
 	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
 }
-static inline int LeastSignificantSetBit(u32 val)
+inline int LeastSignificantSetBit(u32 val)
 {
 	unsigned long index;
 	_BitScanForward(&index, val);
 	return (int)index;
 }
 #ifdef _M_X64
-static inline int LeastSignificantSetBit(u64 val)
+inline int LeastSignificantSetBit(u64 val)
 {
 	unsigned long index;
 	_BitScanForward64(&index, val);
@@ -37,134 +34,8 @@ static inline int LeastSignificantSetBit(u64 val)
 }
 #endif
 #else
-static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
-static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
-static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
-static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
+inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
+inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
+inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
 #endif
-
-// namespace avoids conflict with OS X Carbon; don't use BitSet directly
-namespace BS
-{
-
-// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
-// using the set bits of an integer to represent a set of integers. Like that
-// class, it acts like an array of bools:
-//     BitSet32 bs;
-//     bs[1] = true;
-// but also like the underlying integer ([0] = least significant bit):
-//     BitSet32 bs2 = ...;
-//     bs = (bs ^ bs2) & BitSet32(0xffff);
-// The following additional functionality is provided:
-// - Construction using an initializer list.
-//     BitSet bs { 1, 2, 4, 8 };
-// - Efficiently iterating through the set bits:
-//     for (int i : bs)
-//         [i is the *index* of a set bit]
-//   (This uses the appropriate CPU instruction to find the next set bit in one
-//   operation.)
-// - Counting set bits using .Count() - see comment on that method.
-
-// TODO: use constexpr when MSVC gets out of the Dark Ages
-
-template <typename IntTy>
-class BitSet
-{
-	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
-public:
-	// A reference to a particular bit, returned from operator[].
-	class Ref
-	{
-	public:
-		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
-		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
-		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
-		bool operator=(bool set)
-		{
-			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
-			return set;
-		}
-	private:
-		BitSet* m_bs;
-		IntTy m_mask;
-	};
-
-	// A STL-like iterator is required to be able to use range-based for loops.
-	class Iterator
-	{
-	public:
-		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
-		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
-		Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
-		int operator*() { return m_bit; }
-		Iterator& operator++()
-		{
-			if (m_val == 0)
-			{
-				m_bit = -1;
-			}
-			else
-			{
-				int bit = LeastSignificantSetBit(m_val);
-				m_val &= ~(1 << bit);
-				m_bit = bit;
-			}
-			return *this;
-		}
-		Iterator operator++(int _)
-		{
-			Iterator other(*this);
-			++*this;
-			return other;
-		}
-		bool operator==(Iterator other) const { return m_bit == other.m_bit; }
-		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
-	private:
-		IntTy m_val;
-		int m_bit;
-	};
-
-	BitSet() : m_val(0) {}
-	explicit BitSet(IntTy val) : m_val(val) {}
-	BitSet(std::initializer_list<int> init)
-	{
-		m_val = 0;
-		for (int bit : init)
-			m_val |= (IntTy)1 << bit;
-	}
-
-	static BitSet AllTrue(size_t count)
-	{
-		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
-	}
-
-	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
-	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
-	bool operator==(BitSet other) const { return m_val == other.m_val; }
-	bool operator!=(BitSet other) const { return m_val != other.m_val; }
-	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
-	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
-	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
-	BitSet operator~() const { return BitSet(~m_val); }
-	BitSet& operator|=(BitSet other) { return *this = *this | other; }
-	BitSet& operator&=(BitSet other) { return *this = *this & other; }
-	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
-	operator u32() = delete;
-	operator bool() { return m_val != 0; }
-
-	// Warning: Even though on modern CPUs this is a single fast instruction,
-	// Dolphin's official builds do not currently assume POPCNT support on x86,
-	// so slower explicit bit twiddling is generated. Still should generally
-	// be faster than a loop.
-	unsigned int Count() const { return CountSetBits(m_val); }
-
-	Iterator begin() const { Iterator it(m_val, 0); return ++it; }
-	Iterator end() const { return Iterator(m_val, -1); }
-
-	IntTy m_val;
-};
-
-}
-
-typedef BS::BitSet<u32> BitSet32;
-typedef BS::BitSet<u64> BitSet64;
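Note: callers that previously wrote `for (int i : bs)` over a BitSet32 can iterate a raw mask with the helpers that remain in this header. A minimal standalone sketch (GCC/Clang path shown; on MSVC this goes through _BitScanForward instead):

    #include <cstdint>
    #include <cstdio>

    inline int LeastSignificantSetBit(uint32_t val) { return __builtin_ctz(val); }

    int main() {
        uint32_t mask = 0x1FF80000;  // R19-R28
        while (mask) {
            int i = LeastSignificantSetBit(mask);
            mask &= mask - 1;        // clear the lowest set bit
            printf("X%d\n", i);
        }
        return 0;
    }
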
diff --git a/Core/MIPS/ARM64/Arm64Asm.cpp b/Core/MIPS/ARM64/Arm64Asm.cpp
index 291e73e780..8e163e8756 100644
--- a/Core/MIPS/ARM64/Arm64Asm.cpp
+++ b/Core/MIPS/ARM64/Arm64Asm.cpp
@@ -35,7 +35,7 @@ using namespace Arm64Gen;
 //static int temp32; // unused?
 static const bool enableDebug = false;
-static const bool enableDisasm = false;
+static const bool enableDisasm = true;
 
 //static bool enableStatistics = false; //unused?
@@ -198,10 +198,9 @@ void Arm64Jit::GenerateFixedCode(const JitOptions &jo) {
 	enterDispatcher = AlignCode16();
 
-	BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
-	BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
-	ABI_PushRegisters(regs_to_save);
-	fp.ABI_PushRegisters(regs_to_save_fp);
+	uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
+	uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	// Fixed registers, these are always kept when in Jit context.
 	MOVP2R(MEMBASEREG, Memory::base);
@@ -290,8 +289,7 @@ void Arm64Jit::GenerateFixedCode(const JitOptions &jo) {
 	SaveStaticRegisters();
 	RestoreRoundingMode(true);
 
-	fp.ABI_PopRegisters(regs_to_save_fp);
-	ABI_PopRegisters(regs_to_save);
+	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
 
 	RET();
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index 16b1dc0ac5..dfed30d0be 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -175,10 +175,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	// if (skinning) log = true;
 
-	BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
-	BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
-	ABI_PushRegisters(regs_to_save);
-	fp.ABI_PushRegisters(regs_to_save_fp);
+	uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
+	uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
@@ -279,8 +278,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
 	}
 
-	fp.ABI_PopRegisters(regs_to_save_fp);
-	ABI_PopRegisters(regs_to_save);
+	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
 
 	RET();
diff --git a/android/jni/Arm64EmitterTest.cpp b/android/jni/Arm64EmitterTest.cpp
index 872810c812..b9240fc352 100644
--- a/android/jni/Arm64EmitterTest.cpp
+++ b/android/jni/Arm64EmitterTest.cpp
@@ -39,10 +39,9 @@ void TestCode::Generate()
 {
 	const u8 *start = AlignCode16();
 
-	BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
-	BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
-	ABI_PushRegisters(regs_to_save);
-	fp.ABI_PushRegisters(regs_to_save_fp);
+	uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
+	uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	PUSH(X3);
 	POP(X3);
@@ -54,8 +53,7 @@ void TestCode::Generate()
 	fp.SCVTF(S3, W12);
 	MOVI2R(X0, 1337);
 
-	ABI_PopRegisters(regs_to_save);
-	fp.ABI_PopRegisters(regs_to_save_fp);
+	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
 
 	RET();
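Note: the assert added to ABI_PushRegisters rejects masks that include FP (X29) or LR (X30), since the prologue saves those unconditionally. A quick host-side check that the masks used by the call sites above satisfy it (constants copied from the header; standalone sketch):

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t ALL_CALLEE_SAVED = 0x1FF80000;         // R19-R28
        const uint32_t FP_LR_BITS = (1u << 29) | (1u << 30);  // 0x60000000
        assert((ALL_CALLEE_SAVED & FP_LR_BITS) == 0);
        return 0;
    }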