diff --git a/Common/Arm64Emitter.cpp b/Common/Arm64Emitter.cpp
index 1b626fc063..5dfa4e3b0d 100644
--- a/Common/Arm64Emitter.cpp
+++ b/Common/Arm64Emitter.cpp
@@ -394,7 +394,7 @@ static const u32 LogicalEnc[][2] = {
 };
 
 // Load/Store Exclusive
-static u32 LoadStoreExcEnc[][5] = {
+static const u32 LoadStoreExcEnc[][5] = {
 	{0, 0, 0, 0, 0}, // STXRB
 	{0, 0, 0, 0, 1}, // STLXRB
 	{0, 0, 1, 0, 0}, // LDXRB
@@ -434,11 +434,11 @@ void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr
 	bool b64Bit = Is64Bit(Rt);
 	s64 distance = (s64)ptr - (s64)m_code;
 
-	_assert_msg_(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %lx", __FUNCTION__, (int)distance);
+	_assert_msg_(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %llx", __FUNCTION__, distance);
 
 	distance >>= 2;
 
-	_assert_msg_(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, "%s: Received too large distance: %lx", __FUNCTION__, (int)distance);
+	_assert_msg_(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, "%s: Received too large distance: %llx", __FUNCTION__, distance);
 
 	Rt = DecodeReg(Rt);
 	Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | \
@@ -744,8 +744,9 @@ void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 i
 	            (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
 }
 
-void ARM64XEmitter::EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
 {
+	bool b64Bit = Is64Bit(Rt);
 	u32 type_encode = 0;
 
 	switch (type)
 	{
@@ -758,30 +759,18 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type
 	case INDEX_PRE:
 		type_encode = 3;
 		break;
+	case INDEX_SIGNED:
+		_assert_msg_(DYNA_REC, false, "%s doesn't support INDEX_SIGNED!", __FUNCTION__);
+		break;
 	}
 
-	// ASIMD
-	if (V) {
+	if (b64Bit) {
+		op |= 2;
+		imm >>= 3;
+	}
+	else
+	{
 		imm >>= 2;
-		if (IsQuad(Rt)) {
-			op = 2;
-			if (type_encode == 2)
-				imm >>= 2;
-		} else if (IsDouble(Rt)) {
-			op = 1;
-			if (type_encode == 2)
-				imm >>= 1;
-		} else {
-			op = 0;
-		}
-	} else {
-		bool b64Bit = Is64Bit(Rt);
-		if (b64Bit) {
-			op |= 2;
-			imm >>= 3;
-		} else {
-			imm >>= 2;
-		}
 	}
 
 	_assert_msg_(JIT, imm >= -64 && imm <= 63, "%s recieved too large imm: %d", imm);
@@ -790,10 +779,9 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type
 	Rt2 = DecodeReg(Rt2);
 	Rn = DecodeReg(Rn);
 
-	Write32((op << 30) | (5 << 27) | (V << 26) | (type_encode << 23) | (load << 22) | \
+	Write32((op << 30) | (5 << 27) | (type_encode << 23) | (load << 22) | \
 	        (((uint32_t)imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
 }
-
 void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm)
 {
 	Rd = DecodeReg(Rd);
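The rewritten GPR EncodeLoadStorePair scales the byte offset into a signed 7-bit field, so the reachable range depends on register width: X-register pairs cover multiples of 8 in [-512, 504], W-register pairs multiples of 4 in [-256, 252]. A minimal sketch of the encodability test this implies (hypothetical helper, not part of the patch):

    // Mirrors the scaling and the imm range assert in EncodeLoadStorePair.
    static bool FitsLoadStorePairImm(bool is64Bit, int32_t imm) {
        int shift = is64Bit ? 3 : 2;     // imm >>= 3 (X regs) or 2 (W regs)
        if (imm & ((1 << shift) - 1))
            return false;                // offset must be size-aligned
        imm >>= shift;
        return imm >= -64 && imm <= 63;  // signed 7-bit field
    }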
@@ -849,21 +837,21 @@ void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch)
 		Not = true;
 	case 0: // CBZ
 	{
-		_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, (int)distance);
+		_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %llx", __FUNCTION__, branch.type, distance);
 		bool b64Bit = Is64Bit(branch.reg);
 		ARM64Reg reg = DecodeReg(branch.reg);
 		inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg;
 	}
 	break;
 	case 2: // B (conditional)
-		_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance);
+		_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %llx", __FUNCTION__, branch.type, distance);
 		inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond;
 		break;
 	case 4: // TBNZ
 		Not = true;
 	case 3: // TBZ
 	{
-		_assert_msg_(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, (int)distance);
+		_assert_msg_(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %llx", __FUNCTION__, branch.type, distance);
 		ARM64Reg reg = DecodeReg(branch.reg);
 		inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | (MaskImm14(distance) << 5) | reg;
 	}
@@ -961,7 +949,7 @@ void ARM64XEmitter::B(CCFlags cond, const void* ptr)
 
 	distance >>= 2;
 
-	_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s: Received too large distance: %p->%p %lld %llx", __FUNCTION__, m_code, ptr, distance, distance);
+	_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s: Received too large distance: %p->%p %ld %lx", __FUNCTION__, m_code, ptr, distance, distance);
 
 	Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond);
 }
@@ -1517,6 +1505,21 @@ void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
 {
 	EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms);
 }
+
+void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+	u32 size = Is64Bit(Rn) ? 64 : 32;
+	_assert_msg_(DYNA_REC, (lsb + width) <= size, "%s passed lsb %d and width %d which is greater than the register size!",
+	             __FUNCTION__, lsb, width);
+	EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1);
+}
+void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
+{
+	u32 size = Is64Bit(Rn) ? 64 : 32;
+	_assert_msg_(DYNA_REC, (lsb + width) <= size, "%s passed lsb %d and width %d which is greater than the register size!",
+	             __FUNCTION__, lsb, width);
+	EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1);
+}
 void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) {
 	bool sf = Is64Bit(Rd);
 	bool N = sf;
@@ -1564,15 +1567,15 @@ void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm)
 // Load/Store pair
 void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
 {
-	EncodeLoadStorePair(0, false, 1, type, Rt, Rt2, Rn, imm);
+	EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm);
 }
 void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
 {
-	EncodeLoadStorePair(1, false, 1, type, Rt, Rt2, Rn, imm);
+	EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm);
 }
 void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
 {
-	EncodeLoadStorePair(0, false, 0, type, Rt, Rt2, Rn, imm);
+	EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm);
 }
 
 // Load/Store Exclusive
@@ -2344,6 +2347,27 @@ void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opc
 	        (encoded_size << 10) | (Rn << 5) | Rt);
 }
 
+void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	bool quad = IsQuad(Rt);
+	u32 encoded_size = 0;
+
+	if (size == 16)
+		encoded_size = 1;
+	else if (size == 32)
+		encoded_size = 2;
+	else if (size == 64)
+		encoded_size = 3;
+
+	Rt = DecodeReg(Rt);
+	Rn = DecodeReg(Rn);
+	Rm = DecodeReg(Rm);
+
+	Write32((quad << 30) | (0x19 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | \
+	        (encoded_size << 10) | (Rn << 5) | Rt);
+
+}
+
 void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
 {
 	_assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__);
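In this post-indexed multiple-structure encoding, a value of 31 in the Rm slot selects writeback by immediate, where the base register advances by the total bytes transferred; any ordinary GPR advances the base by that register instead. That is why the corresponding header declarations default Rm to SP, which decodes to 31. A usage sketch with assumed registers:

    fp.LD1(64, 2, INDEX_POST, Q8, X0);      // load Q8/Q9 from [X0], X0 += 32
    fp.LD1(64, 2, INDEX_POST, Q8, X0, X1);  // same load, but X0 += X1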
@@ -2376,6 +2400,99 @@ void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM
 	Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
 }
 
+void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+	u32 type_encode = 0;
+	u32 opc = 0;
+
+	switch (type)
+	{
+	case INDEX_SIGNED:
+		type_encode = 2;
+		break;
+	case INDEX_POST:
+		type_encode = 1;
+		break;
+	case INDEX_PRE:
+		type_encode = 3;
+		break;
+	case INDEX_UNSIGNED:
+		_assert_msg_(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __FUNCTION__);
+		break;
+	}
+
+	if (size == 128)
+	{
+		_assert_msg_(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
+		opc = 2;
+		imm >>= 4;
+	}
+	else if (size == 64)
+	{
+		_assert_msg_(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
+		opc = 1;
+		imm >>= 3;
+	}
+	else if (size == 32)
+	{
+		_assert_msg_(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
+		opc = 0;
+		imm >>= 2;
+	}
+
+	Rt = DecodeReg(Rt);
+	Rt2 = DecodeReg(Rt2);
+	Rn = DecodeReg(Rn);
+
+	Write32((opc << 30) | (0xB << 26) | (type_encode << 23) | (load << 22) | \
+	        ((imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
+
+}
+
+void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+	_assert_msg_(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, "%s must contain an extended reg as Rm!", __FUNCTION__);
+
+	u32 encoded_size = 0;
+	u32 encoded_op = 0;
+
+	if (size == 8)
+	{
+		encoded_size = 0;
+		encoded_op = 0;
+	}
+	else if (size == 16)
+	{
+		encoded_size = 1;
+		encoded_op = 0;
+	}
+	else if (size == 32)
+	{
+		encoded_size = 2;
+		encoded_op = 0;
+	}
+	else if (size == 64)
+	{
+		encoded_size = 3;
+		encoded_op = 0;
+	}
+	else if (size == 128)
+	{
+		encoded_size = 0;
+		encoded_op = 2;
+	}
+
+	if (load)
+		encoded_op |= 1;
+
+	Rt = DecodeReg(Rt);
+	Rn = DecodeReg(Rn);
+	ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg());
+
+	Write32((encoded_size << 30) | (encoded_op << 22) | (0x1E1 << 21) | (decoded_Rm << 16) | \
+	        Rm.GetData() | (1 << 11) | (Rn << 5) | Rt);
+}
+
 void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
 {
 	EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm);
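The FP pair encoder divides the offset by the transfer size before packing it into the same 7-bit field, so the 128-bit form needs 16-byte-aligned offsets and reaches [-1024, 1008] bytes; the 64-bit form covers multiples of 8 in [-512, 504] and the 32-bit form multiples of 4 in [-256, 252]. For example (registers assumed):

    fp.LDP(128, INDEX_SIGNED, Q8, Q9, X0, 32);  // Q8 = [X0+32], Q9 = [X0+48]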
@@ -2565,6 +2682,18 @@ void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
 {
 	EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn);
 }
+void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
+{
+	EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn);
+}
+void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm);
+}
 
 void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn)
 {
@@ -2687,6 +2816,22 @@ void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 		opcode = 2;
 	EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn);
 }
+void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
+	_assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
+
+	u32 opcode = 0;
+	if (count == 1)
+		opcode = 7;
+	else if (count == 2)
+		opcode = 0xA;
+	else if (count == 3)
+		opcode = 6;
+	else if (count == 4)
+		opcode = 2;
+	EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm);
+}
 void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 {
 	_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
@@ -2701,6 +2846,22 @@ void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
 		opcode = 2;
 	EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn);
 }
+void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
+{
+	_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
+	_assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
+
+	u32 opcode = 0;
+	if (count == 1)
+		opcode = 7;
+	else if (count == 2)
+		opcode = 0xA;
+	else if (count == 3)
+		opcode = 6;
+	else if (count == 4)
+		opcode = 2;
+	EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
+}
 
 // Scalar - 1 Source
 void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
@@ -2709,7 +2870,6 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
 		EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn);
 	} else {
 		_assert_msg_(JIT, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads");
-		int type = 0;
 		int rmode = 0;
 		int opcode = 6;
 		int sf = 0;
@@ -2728,6 +2888,26 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
 	}
 }
 
+// Loadstore paired
+void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+	EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm);
+}
+void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
+{
+	EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm);
+}
+
+// Loadstore register offset
+void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+	EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm);
+}
+void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
+{
+	EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm);
+}
+
 void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
 {
 	EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn);
@@ -2854,13 +3034,25 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
 	EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
 }
+void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
+{
+	EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm);
+}
 void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
 	EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm);
 }
-void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn, bool source_upper)
+void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 {
-	Emit2RegMisc(source_upper, 0, size >> 6, 0x17, Rd, Rn);
+	EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm);
+}
+void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+	Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn);
+}
+void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn)
+{
+	Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn);
 }
 void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
 {
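FMAX and FMIN share three-same opcode 0b11110 with U = 0; the distinguishing bit is the top of the size field, which FMIN sets through `2 | size >> 6`. FCVTL and FCVTL2 likewise differ only in the Q bit, selecting the low or the high half of the source. A usage sketch with assumed registers:

    fp.FMAX(32, Q0, Q1, Q2);  // per-lane single-precision maximum
    fp.FMIN(32, Q0, Q1, Q2);  // same opcode, size-field high bit set
    fp.FCVTL(64, Q3, D4);     // widen the low two singles to doubles
    fp.FCVTL2(64, Q3, Q4);    // widen the upper two lanes instead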
@@ -3254,6 +3446,47 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
 }
 
 // Shift by immediate
+void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+	SSHLL(src_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+	SSHLL(src_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+	SHRN(dest_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+	SHRN(dest_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+	USHLL(src_size, Rd, Rn, shift, false);
+}
+void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+{
+	USHLL(src_size, Rd, Rn, shift, true);
+}
+void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+	SXTL(src_size, Rd, Rn, false);
+}
+void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+	SXTL(src_size, Rd, Rn, true);
+}
+void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+	UXTL(src_size, Rd, Rn, false);
+}
+void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
+{
+	UXTL(src_size, Rd, Rn, true);
+}
+
 void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
 {
 	_assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__);
@@ -3296,7 +3529,7 @@ void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
 	EmitShiftImm(upper, 1, immh, immb, 0x14, Rd, Rn);
 }
 
-void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
+void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
 {
 	_assert_msg_(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", __FUNCTION__);
 	u32 immh = 0;
@@ -3314,7 +3547,7 @@ void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
 	{
 		immh = 4 | ((shift >> 3) & 3);;
 	}
-	EmitShiftImm(IsQuad(Rd), 1, immh, immb, 0x10, Rd, Rn);
+	EmitShiftImm(upper, 1, immh, immb, 0x10, Rd, Rn);
 }
 
 void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper)
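The `2`-suffixed wrappers replace the old boolean `upper` parameter with the standard AArch64 mnemonics: the plain form works through the low 64 bits of the source or destination, the `2` form through the upper half. Usage sketch (assumed registers):

    fp.SXTL(8, Q0, D1);      // sign-extend the low 8 bytes to 8 halfwords
    fp.SXTL2(8, Q0, Q1);     // same, sourced from the upper 64 bits of Q1
    fp.SHRN(16, D2, Q3, 5);  // narrow four words to four halfwords after >> 5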
@@ -3360,32 +3593,160 @@ void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 	EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
 }
 
-void ARM64FloatEmitter::LDP(IndexType index_type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) {
-	m_emit->EncodeLoadStorePair(0, true, 1, index_type, Rt, Rt2, Rn, imm);
-}
-
-void ARM64FloatEmitter::STP(IndexType index_type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) {
-	m_emit->EncodeLoadStorePair(0, true, 0, index_type, Rt, Rt2, Rn, imm);
-}
-
-// TODO: According to the ABI, we really only need to save the bottom 64 bits of D8-D15.
-void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
+void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 {
-	for (auto it : registers)
-		STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
-}
+	bool bundled_loadstore = false;
 
-void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
-{
-	for (int i = 31; i >= 0; --i)
+	for (int i = 0; i < 32; ++i)
 	{
 		if (!registers[i])
 			continue;
 
-		if (ignore_mask[i])
-			m_emit->ADD(SP, SP, 16);
-		else
-			LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+		int count = 0;
+		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+		if (count > 1)
+		{
+			bundled_loadstore = true;
+			break;
+		}
+	}
+
+	if (bundled_loadstore && tmp != INVALID_REG)
+	{
+		int num_regs = registers.Count();
+		m_emit->SUB(SP, SP, num_regs * 16);
+		m_emit->ADD(tmp, SP, 0);
+		std::vector<ARM64Reg> island_regs;
+		for (int i = 0; i < 32; ++i)
+		{
+			if (!registers[i])
+				continue;
+
+			int count = 0;
+
+			// 0 = true
+			// 1 < 4 && registers[i + 1] true!
+			// 2 < 4 && registers[i + 2] true!
+			// 3 < 4 && registers[i + 3] true!
+			// 4 < 4 && registers[i + 4] false!
+			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+
+			if (count == 1)
+				island_regs.push_back((ARM64Reg)(Q0 + i));
+			else
+				ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
+
+			i += count - 1;
+		}
+
+		// Handle island registers
+		std::vector<ARM64Reg> pair_regs;
+		for (auto& it : island_regs)
+		{
+			pair_regs.push_back(it);
+			if (pair_regs.size() == 2)
+			{
+				STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
+				pair_regs.clear();
+			}
+		}
+		if (pair_regs.size())
+			STR(128, INDEX_POST, pair_regs[0], tmp, 16);
+	}
+	else
+	{
+		std::vector<ARM64Reg> pair_regs;
+		for (auto it : registers)
+		{
+			pair_regs.push_back((ARM64Reg)(Q0 + it));
+			if (pair_regs.size() == 2)
+			{
+				STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
+				pair_regs.clear();
+			}
+		}
+		if (pair_regs.size())
+			STR(128, INDEX_PRE, pair_regs[0], SP, -16);
+	}
+}
+void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
+{
+	bool bundled_loadstore = false;
+	int num_regs = registers.Count();
+
+	for (int i = 0; i < 32; ++i)
+	{
+		if (!registers[i])
+			continue;
+
+		int count = 0;
+		while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+		if (count > 1)
+		{
+			bundled_loadstore = true;
+			break;
+		}
+	}
+
+	if (bundled_loadstore && tmp != INVALID_REG)
+	{
+		// The temporary register is only used to indicate that we can use this code path
+		std::vector<ARM64Reg> island_regs;
+		for (int i = 0; i < 32; ++i)
+		{
+			if (!registers[i])
+				continue;
+
+			int count = 0;
+			while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
+
+			if (count == 1)
+				island_regs.push_back((ARM64Reg)(Q0 + i));
+			else
+				LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
+
+			i += count - 1;
+		}
+
+		// Handle island registers
+		std::vector<ARM64Reg> pair_regs;
+		for (auto& it : island_regs)
+		{
+			pair_regs.push_back(it);
+			if (pair_regs.size() == 2)
+			{
+				LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
+				pair_regs.clear();
+			}
+		}
+		if (pair_regs.size())
+			LDR(128, INDEX_POST, pair_regs[0], SP, 16);
+	}
+	else
+	{
+		bool odd = num_regs % 2;
+		std::vector<ARM64Reg> pair_regs;
+		for (int i = 31; i >= 0; --i)
+		{
+			if (!registers[i])
+				continue;
+
+			if (odd)
+			{
+				// First load must be a regular LDR if odd
+				odd = false;
+				LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
+			}
+			else
+			{
+				pair_regs.push_back((ARM64Reg)(Q0 + i));
+				if (pair_regs.size() == 2)
+				{
+					LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
+					pair_regs.clear();
+				}
+			}
+		}
 	}
 }
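A worked example of the bundled push path, assuming the set {Q4, Q8, Q9, Q10} and a usable tmp: Q8-Q10 form a run and Q4 is an island, so the emitted sequence is roughly:

    // SUB  SP, SP, #64                 ; 4 regs * 16 bytes
    // ADD  tmp, SP, #0
    // ST1  {v8.2d-v10.2d}, [tmp], #48  ; the run, one instruction
    // STR  Q4, [tmp], #16              ; leftover island register

Without a usable tmp the fallback path pairs registers with pre-indexed STP/STR pushes instead, as in the else branch above.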
diff --git a/Common/Arm64Emitter.h b/Common/Arm64Emitter.h
index 9cae8bf9f8..43e746aba0 100644
--- a/Common/Arm64Emitter.h
+++ b/Common/Arm64Emitter.h
@@ -1,5 +1,5 @@
-// Copyright 2013 Dolphin Emulator Project
-// Licensed under GPLv2
+// Copyright 2015 Dolphin Emulator Project
+// Licensed under GPLv2+
 // Refer to the license.txt file included.
 
 #pragma once
@@ -139,7 +139,7 @@ enum IndexType
 	INDEX_UNSIGNED = 0,
 	INDEX_POST = 1,
 	INDEX_PRE = 2,
-	INDEX_SIGNED = INDEX_UNSIGNED // used in LDP/STP
+	INDEX_SIGNED = 3, // used in LDP/STP
 };
 
 enum ShiftAmount
@@ -309,7 +309,7 @@ public:
 	{
 		return m_type;
 	}
-	ARM64Reg GetReg()
+	ARM64Reg GetReg() const
 	{
 		return m_destReg;
 	}
@@ -367,7 +367,7 @@ private:
 	void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
 	void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
 	void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n);
-	void EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+	void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
 	void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
 	void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
 
@@ -592,10 +592,8 @@ public:
 	void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
 	void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
 	void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
-
-	void BFI(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) {
-		BFM(Rd, Rn, (64 - lsb) % (Is64Bit(Rd) ? 64 : 32), width - 1);
-	}
+	void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
+	void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
 
 	// Extract register (ROR with two inputs, if same then faster on A67)
 	void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);
@@ -773,12 +771,25 @@ public:
 	void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
 	void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
 	void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+	void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
+	void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
+	void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
 	void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
 	void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
 
 	// Loadstore multiple structure
 	void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+	void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
 	void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
+	void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
+
+	// Loadstore paired
+	void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+	void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+
+	// Loadstore register offset
+	void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+	void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
 
 	// Scalar - 1 Source
 	void FABS(ARM64Reg Rd, ARM64Reg Rn);
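Usage sketch for the new paired declarations (registers assumed): a pre-indexed STP forms a push and a post-indexed LDP the matching pop.

    fp.STP(128, INDEX_PRE, Q0, Q1, SP, -32);  // push Q0/Q1
    fp.LDP(128, INDEX_POST, Q0, Q1, SP, 32);  // pop them back

The register-offset STR/LDR overloads require an extended-register operand; the encoder asserts Rm.GetType() == ArithOption::TYPE_EXTENDEDREG.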
@@ -812,9 +823,12 @@ public:
 	void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
 	void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
-	void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn, bool source_upper = false);
+	void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
+	void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
+	void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@@ -835,11 +849,11 @@ public:
 	void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
 	void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
 	void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
-	void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
+	void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 	void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
 
 	// Move
@@ -892,11 +906,16 @@ public:
 	void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 
 	// Shift by immediate
-	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper = false);
-	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper = false);
+	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+	void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+	void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
 	void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
-	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper = false);
-	void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper = false);
+	void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
+	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+	void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+	void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
+	void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
 
 	// vector x indexed element
 	void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
@@ -905,12 +924,9 @@ public:
 	void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
 	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
 
-	void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
-	void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
-
 	// ABI related
-	void ABI_PushRegisters(BitSet32 registers);
-	void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
+	void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
+	void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
 
 private:
 	ARM64XEmitter* m_emit;
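The tmp argument is what opts a caller into the bundled ST1/LD1 path; the INVALID_REG default keeps the plain paired pushes and pops. A hedged usage sketch, assuming X12 is free at the call site:

    BitSet32 regs((1 << 4) | (1 << 8) | (1 << 9) | (1 << 10));  // Q4, Q8-Q10
    fp.ABI_PushRegisters(regs, X12);
    // ... code that clobbers those vector registers ...
    fp.ABI_PopRegisters(regs, X12);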
@@ -934,11 +950,20 @@ private:
 	void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
+	void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
 	void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
 	void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
 	void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
 	void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
+	void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
+	void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+
+	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+	void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
+	void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
+	void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
 };
 
 class ARM64CodeBlock : public CodeBlock
@@ -956,4 +981,3 @@ private:
 	}
 };
 }
-
diff --git a/Core/MIPS/ARM/ArmRegCache.cpp b/Core/MIPS/ARM/ArmRegCache.cpp
index dd25de6575..f09e3dbc28 100644
--- a/Core/MIPS/ARM/ArmRegCache.cpp
+++ b/Core/MIPS/ARM/ArmRegCache.cpp
@@ -299,7 +299,7 @@ allocate:
 		goto allocate;
 	}
 
-	// Uh oh, we have all them spilllocked....
+	// Uh oh, we have all of them spilllocked....
 	ERROR_LOG_REPORT(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc);
 	return INVALID_REG;
 }
diff --git a/Core/MIPS/ARM64/Arm64RegCache.cpp b/Core/MIPS/ARM64/Arm64RegCache.cpp
index 7ff57a3f6d..12b67ec328 100644
--- a/Core/MIPS/ARM64/Arm64RegCache.cpp
+++ b/Core/MIPS/ARM64/Arm64RegCache.cpp
@@ -226,7 +226,7 @@ allocate:
 		goto allocate;
 	}
 
-	// Uh oh, we have all them spilllocked....
+	// Uh oh, we have all of them spilllocked....
 	ERROR_LOG_REPORT(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc);
 	return INVALID_REG;
 }
@@ -347,6 +347,10 @@ void Arm64RegCache::DiscardR(MIPSGPReg mipsReg) {
 ARM64Reg Arm64RegCache::ARM64RegForFlush(MIPSGPReg r) {
 	switch (mr[r].loc) {
 	case ML_IMM:
+		// Zero is super easy.
+		if (mr[r].imm == 0) {
+			return WZR;
+		}
 		// Could we get lucky? Check for an exact match in another armreg.
 		for (int i = 0; i < NUM_MIPSREG; ++i) {
 			if (mr[i].loc == ML_ARMREG_IMM && mr[i].imm == mr[r].imm) {
@@ -434,13 +438,24 @@ void Arm64RegCache::FlushAll() {
 	// Flush it first so we don't get it confused.
 	FlushR(MIPS_REG_LO);
 
-	// TODO: We could do a pass to allocate regs for imms for more paired stores.
+	// 1 because MIPS_REG_ZERO isn't flushable anyway.
 	// 31 because 30 and 31 are the last possible pair - MIPS_REG_FPCOND, etc. are too far away.
-	for (int i = 0; i < 31; i++) {
+	for (int i = 1; i < 31; i++) {
 		MIPSGPReg mreg1 = MIPSGPReg(i);
 		MIPSGPReg mreg2 = MIPSGPReg(i + 1);
 		ARM64Reg areg1 = ARM64RegForFlush(mreg1);
 		ARM64Reg areg2 = ARM64RegForFlush(mreg2);
+
+		// If either one doesn't have a reg yet, try flushing imms to scratch regs.
+		if (areg1 == INVALID_REG && IsImm(mreg1)) {
+			SetRegImm(SCRATCH1, GetImm(mreg1));
+			areg1 = SCRATCH1;
+		}
+		if (areg2 == INVALID_REG && IsImm(mreg2)) {
+			SetRegImm(SCRATCH2, GetImm(mreg2));
+			areg2 = SCRATCH2;
+		}
+
 		if (areg1 != INVALID_REG && areg2 != INVALID_REG) {
 			// We can use a paired store, awesome.
 			emit_->STP(INDEX_SIGNED, areg1, areg2, CTXREG, GetMipsRegOffset(mreg1));
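The FlushAll change makes paired stores far more common: a zero immediate now flushes through WZR for free, and other immediates are materialized into the scratch registers so two adjacent guest registers can share one STP. A sketch of the resulting pattern (registers and offsets assumed):

    // mreg1 caches imm 0, mreg2 is live in W20:
    //   areg1 = ARM64RegForFlush(mreg1);  // returns WZR, no MOV needed
    //   emit_->STP(INDEX_SIGNED, WZR, W20, CTXREG, GetMipsRegOffset(mreg1));
    // One paired store covers both 32-bit slots in the MIPS context.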
diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp
index aa345a160b..a7c447df91 100644
--- a/GPU/Common/VertexDecoderArm64.cpp
+++ b/GPU/Common/VertexDecoderArm64.cpp
@@ -307,8 +307,8 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
 			break;
 		default:
 			// Matrices 4+ need to be loaded from memory.
-			fp.LDP(INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
-			fp.LDP(INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
+			fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
+			fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
 			fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3);
 			fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3);
 			fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3);
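The explicit 128 selects the Q-register pair form, whose encoder asserts 16-byte alignment of the offset; each 4x4 bone-matrix row is four floats, exactly 16 bytes, so consecutive rows step by 16 and two LDPs fetch all four rows. A tiny sketch of the addressing this relies on (hypothetical helper):

    // Byte offset of matrix row k, as used by the two LDPs above.
    static constexpr int32_t MatrixRowOffset(int k) {
        return k * 16;  // 16-byte aligned, as LDP(128, ...) requires
    }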