diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index a8c7984584..a818eb6ced 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -1913,17 +1913,25 @@ inline int64_t abs64(int64_t x) {
 	return x >= 0 ? x : -x;
 }
 
+static int Count(const bool part[4]) {
+	int cnt = 0;
+	for (int i = 0; i < 4; i++) {
+		if (part[i])
+			cnt++;
+	}
+	return cnt;
+}
+
 // Wrapper around MOVZ+MOVK (and later MOVN)
 void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
 {
 	unsigned int parts = Is64Bit(Rd) ? 4 : 2;
-	BitSet32 upload_part(0);
+	bool upload_part[4]{};  // Zero-initialized, like the BitSet32(0) it replaces.
 
 	// Always start with a movz! Kills the dependency on the register.
 	bool use_movz = true;
 
-	if (!imm)
-	{
+	if (!imm) {
 		// Zero immediate, just clear the register. EOR is pointless when we have MOVZ, which looks clearer in disasm too.
 		MOVZ(Rd, 0, SHIFT_0);
 		return;
@@ -1961,7 +1969,7 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize)
 		u64 aligned_pc = (u64)GetCodePointer() & ~0xFFF;
 		s64 aligned_offset = (s64)imm - (s64)aligned_pc;
 
-		if (upload_part.Count() > 1 && abs64(aligned_offset) < 0xFFFFFFFFLL)
+		if (Count(upload_part) > 1 && abs64(aligned_offset) < 0xFFFFFFFFLL)
 		{
 			// Immediate we are loading is within 4GB of our aligned range
 			// Most likely a address that we can load in one or two instructions
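Note: the Count() helper above does for a plain bool[4] what BitSet32::Count() used to do for MOVI2R: it counts how many 16-bit chunks of the immediate actually need a MOVZ/MOVK each. A standalone host-side sketch of that splitting (CountImmParts is a hypothetical name, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Counts the non-zero 16-bit parts of imm; each one costs MOVI2R one
    // MOVZ (the first) or MOVK (each subsequent) instruction.
    static int CountImmParts(uint64_t imm, int parts) {
        int count = 0;
        for (int i = 0; i < parts; i++) {
            if ((imm >> (i * 16)) & 0xFFFF)
                count++;
        }
        return count;
    }

    int main() {
        printf("%d\n", CountImmParts(0x0000123400005678ULL, 4));  // 2: MOVZ + one MOVK
        printf("%d\n", CountImmParts(0xFFFFFFFFFFFF1234ULL, 4));  // 4: the "later MOVN" case
        return 0;
    }
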
@@ -2015,115 +2023,11 @@ void ARM64XEmitter::POP(ARM64Reg Rd) {
 void ARM64XEmitter::PUSH2(ARM64Reg Rd, ARM64Reg Rn) {
 	STP(INDEX_PRE, Rd, Rn, SP, -16);
 }
+
 void ARM64XEmitter::POP2(ARM64Reg Rd, ARM64Reg Rn) {
 	LDP(INDEX_POST, Rd, Rn, SP, 16);
 }
-
-void ARM64XEmitter::ABI_PushRegisters(BitSet32 registers)
-{
-	int num_regs = registers.Count();
-
-	if (num_regs % 2)
-	{
-		bool first = true;
-
-		// Stack is required to be quad-word aligned.
-		u32 stack_size = ROUND_UP(num_regs * 8, 16);
-		u32 current_offset = 0;
-		std::vector<ARM64Reg> reg_pair;
-
-		for (auto it : registers)
-		{
-			if (first)
-			{
-				STR(INDEX_PRE, (ARM64Reg)(X0 + it), SP, -(s32)stack_size);
-				first = false;
-				current_offset += 16;
-			}
-			else
-			{
-				reg_pair.push_back((ARM64Reg)(X0 + it));
-				if (reg_pair.size() == 2)
-				{
-					STP(INDEX_UNSIGNED, reg_pair[0], reg_pair[1], SP, current_offset);
-					reg_pair.clear();
-					current_offset += 16;
-				}
-			}
-		}
-	}
-	else
-	{
-		std::vector<ARM64Reg> reg_pair;
-
-		for (auto it : registers)
-		{
-			reg_pair.push_back((ARM64Reg)(X0 + it));
-			if (reg_pair.size() == 2)
-			{
-				STP(INDEX_PRE, reg_pair[0], reg_pair[1], SP, -16);
-				reg_pair.clear();
-			}
-		}
-	}
-}
-
-void ARM64XEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
-{
-	int num_regs = registers.Count();
-
-	if (num_regs % 2)
-	{
-		bool first = true;
-
-		std::vector<ARM64Reg> reg_pair;
-
-		for (auto it : registers)
-		{
-			if (ignore_mask[it])
-				it = WSP;
-
-			if (first)
-			{
-				LDR(INDEX_POST, (ARM64Reg)(X0 + it), SP, 16);
-				first = false;
-			}
-			else
-			{
-				reg_pair.push_back((ARM64Reg)(X0 + it));
-				if (reg_pair.size() == 2)
-				{
-					LDP(INDEX_POST, reg_pair[0], reg_pair[1], SP, 16);
-					reg_pair.clear();
-				}
-			}
-		}
-	}
-	else
-	{
-		std::vector<ARM64Reg> reg_pair;
-
-		for (int i = 31; i >= 0; --i)
-		{
-			if (!registers[i])
-				continue;
-
-			int reg = i;
-
-			if (ignore_mask[reg])
-				reg = WSP;
-
-			reg_pair.push_back((ARM64Reg)(X0 + reg));
-			if (reg_pair.size() == 2)
-			{
-				LDP(INDEX_POST, reg_pair[1], reg_pair[0], SP, 16);
-				reg_pair.clear();
-			}
-		}
-	}
-}
-
 // Float Emitter
 void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
 {
@@ -3658,161 +3562,92 @@ void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 	EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
 }
 
-void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
-{
-	bool bundled_loadstore = false;
-
-	for (int i = 0; i < 32; ++i)
-	{
-		if (!registers[i])
-			continue;
-
-		int count = 0;
-		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-		if (count > 1)
-		{
-			bundled_loadstore = true;
-			break;
-		}
-	}
-
-	if (bundled_loadstore && tmp != INVALID_REG)
-	{
-		int num_regs = registers.Count();
-		m_emit->SUB(SP, SP, num_regs * 16);
-		m_emit->ADD(tmp, SP, 0);
-		std::vector<ARM64Reg> island_regs;
-		for (int i = 0; i < 32; ++i)
-		{
-			if (!registers[i])
-				continue;
-
-			int count = 0;
-
-			// 0 = true
-			// 1 < 4 && registers[i + 1] true!
-			// 2 < 4 && registers[i + 2] true!
-			// 3 < 4 && registers[i + 3] true!
-			// 4 < 4 && registers[i + 4] false!
-			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-
-			if (count == 1)
-				island_regs.push_back((ARM64Reg)(Q0 + i));
-			else
-				ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
-
-			i += count - 1;
-		}
-
-		// Handle island registers
-		std::vector<ARM64Reg> pair_regs;
-		for (auto& it : island_regs)
-		{
-			pair_regs.push_back(it);
-			if (pair_regs.size() == 2)
-			{
-				STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
-				pair_regs.clear();
-			}
-		}
-		if (pair_regs.size())
-			STR(128, INDEX_POST, pair_regs[0], tmp, 16);
-	}
-	else
-	{
-		std::vector<ARM64Reg> pair_regs;
-		for (auto it : registers)
-		{
-			pair_regs.push_back((ARM64Reg)(Q0 + it));
-			if (pair_regs.size() == 2)
-			{
-				STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
-				pair_regs.clear();
-			}
-		}
-		if (pair_regs.size())
-			STR(128, INDEX_PRE, pair_regs[0], SP, -16);
-	}
-}
+void ARM64FloatEmitter::ABI_PushRegisters(uint32_t registers, uint32_t fp_registers) {
+	_assert_msg_(DYNA_REC, (registers & 0x60000000) == 0, "ABI_PushRegisters: Do not include FP and LR, those are handled non-conditionally");
+
+	ARM64Reg gprs[32]{}, fprs[32]{};
+	int num_gprs = 0, num_fprs = 0;
+	for (int i = 0; i < 29; i++) {
+		if (registers & (1U << i))
+			gprs[num_gprs++] = (ARM64Reg)(X0 + i);
+	}
+
+	for (int i = 0; i < 32; i++) {
+		if (fp_registers & (1U << i))
+			fprs[num_fprs++] = (ARM64Reg)(D0 + i);
+	}
+
+	u32 stack_size = 16 + ROUND_UP(num_gprs * 8, 16) + ROUND_UP(num_fprs * 8, 16);
+
+	// Stack is required to be quad-word aligned.
+	if (stack_size < 256) {
+		m_emit->STP(INDEX_PRE, FP, LR, SP, -(s32)stack_size);
+	} else {
+		m_emit->SUB(SP, SP, stack_size);
+		m_emit->STP(INDEX_SIGNED, FP, LR, SP, 0);
+	}
+	m_emit->MOVfromSP(X29);  // Set new frame pointer
+	int offset = 16;
+	for (int i = 0; i < num_gprs / 2; i++) {
+		m_emit->STP(INDEX_SIGNED, gprs[i * 2], gprs[i * 2 + 1], X29, offset);
+		offset += 16;
+	}
+	// Do the straggler. Advance a full 16 bytes to keep the FPR area aligned.
+	if (num_gprs & 1) {
+		m_emit->STR(INDEX_UNSIGNED, gprs[num_gprs - 1], X29, offset);
+		offset += 16;
+	}
+
+	if (num_fprs) {
+		// OK, and now for the FPRs. X29 still points at the frame base, so
+		// plain unsigned offsets from it are enough.
+		for (int i = 0; i < num_fprs; i++) {
+			STR(64, INDEX_UNSIGNED, fprs[i], X29, offset);
+			offset += 8;
+		}
+	}
+}
 
-void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
-{
-	bool bundled_loadstore = false;
-	int num_regs = registers.Count();
-
-	for (int i = 0; i < 32; ++i)
-	{
-		if (!registers[i])
-			continue;
-
-		int count = 0;
-		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-		if (count > 1)
-		{
-			bundled_loadstore = true;
-			break;
-		}
-	}
-
-	if (bundled_loadstore && tmp != INVALID_REG)
-	{
-		// The temporary register is only used to indicate that we can use this code path
-		std::vector<ARM64Reg> island_regs;
-		for (int i = 0; i < 32; ++i)
-		{
-			if (!registers[i])
-				continue;
-
-			int count = 0;
-			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
-
-			if (count == 1)
-				island_regs.push_back((ARM64Reg)(Q0 + i));
-			else
-				LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
-
-			i += count - 1;
-		}
-
-		// Handle island registers
-		std::vector<ARM64Reg> pair_regs;
-		for (auto& it : island_regs)
-		{
-			pair_regs.push_back(it);
-			if (pair_regs.size() == 2)
-			{
-				LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
-				pair_regs.clear();
-			}
-		}
-		if (pair_regs.size())
-			LDR(128, INDEX_POST, pair_regs[0], SP, 16);
-	}
-	else
-	{
-		bool odd = (num_regs % 2) != 0;
-		std::vector<ARM64Reg> pair_regs;
-		for (int i = 31; i >= 0; --i)
-		{
-			if (!registers[i])
-				continue;
-
-			if (odd)
-			{
-				// First load must be a regular LDR if odd
-				odd = false;
-				LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
-			}
-			else
-			{
-				pair_regs.push_back((ARM64Reg)(Q0 + i));
-				if (pair_regs.size() == 2)
-				{
-					LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
-					pair_regs.clear();
-				}
-			}
-		}
-	}
-}
+void ARM64FloatEmitter::ABI_PopRegisters(uint32_t registers, uint32_t fp_registers) {
+	ARM64Reg gprs[32]{}, fprs[32]{};
+	int num_gprs = 0, num_fprs = 0;
+	for (int i = 0; i < 29; i++) {
+		if (registers & (1U << i))
+			gprs[num_gprs++] = (ARM64Reg)(X0 + i);
+	}
+
+	for (int i = 0; i < 32; i++) {
+		if (fp_registers & (1U << i))
+			fprs[num_fprs++] = (ARM64Reg)(D0 + i);
+	}
+
+	// Must mirror the frame layout and size computed by ABI_PushRegisters exactly.
+	u32 stack_size = 16 + ROUND_UP(num_gprs * 8, 16) + ROUND_UP(num_fprs * 8, 16);
+
+	// SP points to the bottom. We're gonna walk it upwards.
+	// Reload FP, LR.
+	m_emit->LDP(INDEX_SIGNED, FP, LR, SP, 0);
+	int offset = 16;
+	for (int i = 0; i < num_gprs / 2; i++) {
+		m_emit->LDP(INDEX_SIGNED, gprs[i * 2], gprs[i * 2 + 1], SP, offset);
+		offset += 16;
+	}
+	// Do the straggler.
+	if (num_gprs & 1) {
+		m_emit->LDR(INDEX_UNSIGNED, gprs[num_gprs - 1], SP, offset);
+		offset += 16;
+	}
+
+	// Time for the FP regs.
+	for (int i = 0; i < num_fprs; i++) {
+		LDR(64, INDEX_UNSIGNED, fprs[i], SP, offset);
+		offset += 8;
+	}
+
+	// Restore the stack pointer.
+	m_emit->ADD(SP, SP, stack_size);
+}
 
 void ARM64XEmitter::ANDI2R(ARM64Reg Rd, ARM64Reg Rn, u64 imm, ARM64Reg scratch) {
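Note: the push/pop pair above builds a conventional AArch64 frame: FP/LR at the bottom, then GPR pairs, then the FPR spill area, each piece rounded so SP stays quad-word aligned. A host-side sketch of the size math (illustrative only; RoundUp16 stands in for the emitter's ROUND_UP):

    #include <cstdint>
    #include <cstdio>

    static uint32_t RoundUp16(uint32_t x) { return (x + 15) & ~15u; }

    int main() {
        // With ALL_CALLEE_SAVED (R19-R28) and ALL_CALLEE_SAVED_FP (D8-D15):
        int num_gprs = 10, num_fprs = 8;
        // Frame, from SP upwards:
        //   [SP +  0]  FP, LR                 (16 bytes)
        //   [SP + 16]  GPR pairs + straggler  (RoundUp16(num_gprs * 8))
        //   [SP + ..]  FPR spill area         (RoundUp16(num_fprs * 8))
        uint32_t stack_size = 16 + RoundUp16(num_gprs * 8) + RoundUp16(num_fprs * 8);
        printf("stack_size = %u\n", stack_size);  // 160, quad-word aligned
        return 0;
    }
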
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index df4c78f3d8..1b65e69d66 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -85,12 +85,14 @@ enum ARM64Reg
 	WZR = WSP,
 	ZR = SP,
 
+	FP = X29,
+	LR = X30,
 	INVALID_REG = 0xFFFFFFFF
 };
 
-// R19-R28, R29 (FP), R30 (LR). FP seems questionable?
-const u32 ALL_CALLEE_SAVED = 0x7FF80000;
+// R19-R28. R29 (FP), R30 (LR) are always saved and FP updated appropriately.
+const u32 ALL_CALLEE_SAVED = 0x1FF80000;
 const u32 ALL_CALLEE_SAVED_FP = 0x0000FF00; // d8-d15
 
 inline bool Is64Bit(ARM64Reg reg) { return (reg & 0x20) != 0; }
@@ -720,10 +722,6 @@ public:
 	bool TryORRI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
 	bool TryEORI2R(ARM64Reg Rd, ARM64Reg Rn, u32 imm);
 
-	// ABI related
-	void ABI_PushRegisters(BitSet32 registers);
-	void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
-
 	// Pseudo-instruction for convenience. PUSH pushes 16 bytes even though we only push a single register.
 	// This is so the stack pointer is always 16-byte aligned, which is checked by hardware!
 	void PUSH(ARM64Reg Rd);
@@ -943,8 +941,8 @@ public:
 	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
 
 	// ABI related
-	void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
-	void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+	void ABI_PushRegisters(uint32_t gpr_registers, uint32_t fp_registers);
+	void ABI_PopRegisters(uint32_t gpr_registers, uint32_t fp_registers);
 
 private:
 	ARM64XEmitter* m_emit;
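Note: with BitSet32 gone, these constants are plain bit masks where bit n stands for Xn (or Dn in the FP mask). A quick host-side sketch of how 0x1FF80000 corresponds to R19-R28 (standalone, not part of the header):

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t mask = 0;
        for (int i = 19; i <= 28; i++)  // the AAPCS64 callee-saved GPRs
            mask |= 1u << i;
        printf("0x%08X\n", mask);       // prints 0x1FF80000 == ALL_CALLEE_SAVED
        return 0;
    }
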
diff --git a/Common/BitSet.h b/Common/BitSet.h
index d60b507dd4..2fbc80011b 100644
--- a/Common/BitSet.h
+++ b/Common/BitSet.h
@@ -3,8 +3,6 @@
 #pragma once
 
 #include <cstddef>
-#include <initializer_list>
-#include <type_traits>
 #include "CommonTypes.h"
 
 // Helper functions:
@@ -12,8 +10,7 @@
 #ifdef _WIN32
 #include <intrin.h>
 template <typename T>
-static inline int CountSetBits(T v)
-{
+inline int CountSetBits(T v) {
 	// from https://graphics.stanford.edu/~seander/bithacks.html
 	// GCC has this built in, but MSVC's intrinsic will only emit the actual
 	// POPCNT instruction, which we're not depending on
@@ -22,14 +19,14 @@
 	v = (v + (v >> 4)) & (T)~(T)0/255*15;
 	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
 }
-static inline int LeastSignificantSetBit(u32 val)
+inline int LeastSignificantSetBit(u32 val)
 {
 	unsigned long index;
 	_BitScanForward(&index, val);
 	return (int)index;
 }
 #ifdef _M_X64
-static inline int LeastSignificantSetBit(u64 val)
+inline int LeastSignificantSetBit(u64 val)
 {
 	unsigned long index;
 	_BitScanForward64(&index, val);
@@ -37,134 +34,8 @@ static inline int LeastSignificantSetBit(u64 val)
 }
 #endif
 #else
-static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
-static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
-static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
-static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
+inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
+inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
+inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
 #endif
-
-// namespace avoids conflict with OS X Carbon; don't use BitSet directly
-namespace BS
-{
-
-// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
-// using the set bits of an integer to represent a set of integers. Like that
-// class, it acts like an array of bools:
-//     BitSet32 bs;
-//     bs[1] = true;
-// but also like the underlying integer ([0] = least significant bit):
-//     BitSet32 bs2 = ...;
-//     bs = (bs ^ bs2) & BitSet32(0xffff);
-// The following additional functionality is provided:
-// - Construction using an initializer list.
-//     BitSet bs { 1, 2, 4, 8 };
-// - Efficiently iterating through the set bits:
-//     for (int i : bs)
-//         [i is the *index* of a set bit]
-//   (This uses the appropriate CPU instruction to find the next set bit in one
-//   operation.)
-// - Counting set bits using .Count() - see comment on that method.
-
-// TODO: use constexpr when MSVC gets out of the Dark Ages
-
-template <typename IntTy>
-class BitSet
-{
-	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
-public:
-	// A reference to a particular bit, returned from operator[].
-	class Ref
-	{
-	public:
-		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
-		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
-		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
-		bool operator=(bool set)
-		{
-			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
-			return set;
-		}
-	private:
-		BitSet* m_bs;
-		IntTy m_mask;
-	};
-
-	// A STL-like iterator is required to be able to use range-based for loops.
-	class Iterator
-	{
-	public:
-		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
-		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
-		Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
-		int operator*() { return m_bit; }
-		Iterator& operator++()
-		{
-			if (m_val == 0)
-			{
-				m_bit = -1;
-			}
-			else
-			{
-				int bit = LeastSignificantSetBit(m_val);
-				m_val &= ~(1 << bit);
-				m_bit = bit;
-			}
-			return *this;
-		}
-		Iterator operator++(int _)
-		{
-			Iterator other(*this);
-			++*this;
-			return other;
-		}
-		bool operator==(Iterator other) const { return m_bit == other.m_bit; }
-		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
-	private:
-		IntTy m_val;
-		int m_bit;
-	};
-
-	BitSet() : m_val(0) {}
-	explicit BitSet(IntTy val) : m_val(val) {}
-	BitSet(std::initializer_list<int> init)
-	{
-		m_val = 0;
-		for (int bit : init)
-			m_val |= (IntTy)1 << bit;
-	}
-
-	static BitSet AllTrue(size_t count)
-	{
-		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
-	}
-
-	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
-	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
-	bool operator==(BitSet other) const { return m_val == other.m_val; }
-	bool operator!=(BitSet other) const { return m_val != other.m_val; }
-	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
-	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
-	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
-	BitSet operator~() const { return BitSet(~m_val); }
-	BitSet& operator|=(BitSet other) { return *this = *this | other; }
-	BitSet& operator&=(BitSet other) { return *this = *this & other; }
-	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
-	operator u32() = delete;
-	operator bool() { return m_val != 0; }
-
-	// Warning: Even though on modern CPUs this is a single fast instruction,
-	// Dolphin's official builds do not currently assume POPCNT support on x86,
-	// so slower explicit bit twiddling is generated. Still should generally
-	// be faster than a loop.
-	unsigned int Count() const { return CountSetBits(m_val); }
-
-	Iterator begin() const { Iterator it(m_val, 0); return ++it; }
-	Iterator end() const { return Iterator(m_val, -1); }
-
-	IntTy m_val;
-};
-
-}
-
-typedef BS::BitSet<u32> BitSet32;
-typedef BS::BitSet<u64> BitSet64;
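Note: callers that previously wrote `for (int i : bs)` over a BitSet32 can iterate a raw mask with the helpers that remain in this header. A minimal standalone sketch (GCC/Clang path shown; on MSVC this goes through _BitScanForward instead):

    #include <cstdint>
    #include <cstdio>

    inline int LeastSignificantSetBit(uint32_t val) { return __builtin_ctz(val); }

    int main() {
        uint32_t mask = 0x1FF80000;  // R19-R28
        while (mask) {
            int i = LeastSignificantSetBit(mask);
            mask &= mask - 1;        // clear the lowest set bit
            printf("X%d\n", i);
        }
        return 0;
    }
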
diff --git a/Core/MIPS/ARM64/Arm64Asm.cpp b/Core/MIPS/ARM64/Arm64Asm.cpp
index 291e73e780..8e163e8756 100644
--- a/Core/MIPS/ARM64/Arm64Asm.cpp
+++ b/Core/MIPS/ARM64/Arm64Asm.cpp
@@ -35,7 +35,7 @@ using namespace Arm64Gen;
 //static int temp32; // unused?
 static const bool enableDebug = false;
-static const bool enableDisasm = false;
+static const bool enableDisasm = true;
 
 //static bool enableStatistics = false; //unused?
@@ -198,10 +198,9 @@ void Arm64Jit::GenerateFixedCode(const JitOptions &jo) {
 	enterDispatcher = AlignCode16();
 
-	BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
-	BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
-	ABI_PushRegisters(regs_to_save);
-	fp.ABI_PushRegisters(regs_to_save_fp);
+	uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
+	uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	// Fixed registers, these are always kept when in Jit context.
 	MOVP2R(MEMBASEREG, Memory::base);
@@ -290,8 +289,7 @@ void Arm64Jit::GenerateFixedCode(const JitOptions &jo) {
 	SaveStaticRegisters();
 	RestoreRoundingMode(true);
 
-	fp.ABI_PopRegisters(regs_to_save_fp);
-	ABI_PopRegisters(regs_to_save);
+	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
 
 	RET();
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index 16b1dc0ac5..dfed30d0be 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -175,10 +175,9 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 	// if (skinning) log = true;
 
-	BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
-	BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
-	ABI_PushRegisters(regs_to_save);
-	fp.ABI_PushRegisters(regs_to_save_fp);
+	uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
+	uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	// Keep the scale/offset in a few fp registers if we need it.
 	if (prescaleStep) {
@@ -279,8 +278,7 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int
 		STRH(INDEX_UNSIGNED, boundsMaxVReg, scratchReg64, offsetof(KnownVertexBounds, maxV));
 	}
 
-	fp.ABI_PopRegisters(regs_to_save_fp);
-	ABI_PopRegisters(regs_to_save);
+	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
 
 	RET();
diff --git a/android/jni/Arm64EmitterTest.cpp b/android/jni/Arm64EmitterTest.cpp
index 872810c812..b9240fc352 100644
--- a/android/jni/Arm64EmitterTest.cpp
+++ b/android/jni/Arm64EmitterTest.cpp
@@ -39,10 +39,9 @@ void TestCode::Generate()
 {
 	const u8 *start = AlignCode16();
 
-	BitSet32 regs_to_save(Arm64Gen::ALL_CALLEE_SAVED);
-	BitSet32 regs_to_save_fp(Arm64Gen::ALL_CALLEE_SAVED_FP);
-	ABI_PushRegisters(regs_to_save);
-	fp.ABI_PushRegisters(regs_to_save_fp);
+	uint32_t regs_to_save = Arm64Gen::ALL_CALLEE_SAVED;
+	uint32_t regs_to_save_fp = Arm64Gen::ALL_CALLEE_SAVED_FP;
+	fp.ABI_PushRegisters(regs_to_save, regs_to_save_fp);
 
 	PUSH(X3);
 	POP(X3);
@@ -54,8 +53,7 @@ void TestCode::Generate()
 	fp.SCVTF(S3, W12);
 	MOVI2R(X0, 1337);
 
-	ABI_PopRegisters(regs_to_save);
-	fp.ABI_PopRegisters(regs_to_save_fp);
+	fp.ABI_PopRegisters(regs_to_save, regs_to_save_fp);
 
 	RET();
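Note: the assert added to ABI_PushRegisters rejects masks that include FP (X29) or LR (X30), since the prologue saves those unconditionally. A quick host-side check that the masks used by the call sites above satisfy it (constants copied from the header; standalone sketch):

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t ALL_CALLEE_SAVED = 0x1FF80000;         // R19-R28
        const uint32_t FP_LR_BITS = (1u << 29) | (1u << 30);  // 0x60000000
        assert((ALL_CALLEE_SAVED & FP_LR_BITS) == 0);
        return 0;
    }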