Merge pull request #7843 from unknownbrackets/arm64-micro

Update ARM64 emitter from Dolphin, minor flushing tweaks
This commit is contained in:
Henrik Rydgård 2015-07-04 09:46:20 +02:00
commit 511d8bd3d4
5 changed files with 486 additions and 86 deletions

View file

@ -394,7 +394,7 @@ static const u32 LogicalEnc[][2] = {
};
// Load/Store Exclusive
static u32 LoadStoreExcEnc[][5] = {
static const u32 LoadStoreExcEnc[][5] = {
{0, 0, 0, 0, 0}, // STXRB
{0, 0, 0, 0, 1}, // STLXRB
{0, 0, 1, 0, 0}, // LDXRB
@ -434,11 +434,11 @@ void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr
bool b64Bit = Is64Bit(Rt);
s64 distance = (s64)ptr - (s64)m_code;
_assert_msg_(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %lx", __FUNCTION__, (int)distance);
_assert_msg_(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %llx", __FUNCTION__, distance);
distance >>= 2;
_assert_msg_(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, "%s: Received too large distance: %lx", __FUNCTION__, (int)distance);
_assert_msg_(DYNA_REC, distance >= -0x40000 && distance <= 0x3FFFF, "%s: Received too large distance: %llx", __FUNCTION__, distance);
Rt = DecodeReg(Rt);
Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | \
@ -744,8 +744,9 @@ void ARM64XEmitter::EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 i
(immr << 16) | (imms << 10) | (Rn << 5) | Rd);
}
void ARM64XEmitter::EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
void ARM64XEmitter::EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
bool b64Bit = Is64Bit(Rt);
u32 type_encode = 0;
switch (type) {
@ -758,30 +759,18 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type
case INDEX_PRE:
type_encode = 3;
break;
case INDEX_SIGNED:
_assert_msg_(DYNA_REC, false, "%s doesn't support INDEX_SIGNED!", __FUNCTION__);
break;
}
// ASIMD
if (V) {
if (b64Bit) {
op |= 2;
imm >>= 3;
}
else
{
imm >>= 2;
if (IsQuad(Rt)) {
op = 2;
if (type_encode == 2)
imm >>= 2;
} else if (IsDouble(Rt)) {
op = 1;
if (type_encode == 2)
imm >>= 1;
} else {
op = 0;
}
} else {
bool b64Bit = Is64Bit(Rt);
if (b64Bit) {
op |= 2;
imm >>= 3;
} else {
imm >>= 2;
}
}
_assert_msg_(JIT, imm >= -64 && imm <= 63, "%s recieved too large imm: %d", imm);
@ -790,10 +779,9 @@ void ARM64XEmitter::EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type
Rt2 = DecodeReg(Rt2);
Rn = DecodeReg(Rn);
Write32((op << 30) | (5 << 27) | (V << 26) | (type_encode << 23) | (load << 22) | \
Write32((op << 30) | (5 << 27) | (type_encode << 23) | (load << 22) | \
(((uint32_t)imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
}
void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm)
{
Rd = DecodeReg(Rd);
@ -849,21 +837,21 @@ void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch)
Not = true;
case 0: // CBZ
{
_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, (int)distance);
_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %llx", __FUNCTION__, branch.type, distance);
bool b64Bit = Is64Bit(branch.reg);
ARM64Reg reg = DecodeReg(branch.reg);
inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (MaskImm19(distance) << 5) | reg;
}
break;
case 2: // B (conditional)
_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance);
_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s(%d): Received too large distance: %llx", __FUNCTION__, branch.type, distance);
inst = (0x2A << 25) | (MaskImm19(distance) << 5) | branch.cond;
break;
case 4: // TBNZ
Not = true;
case 3: // TBZ
{
_assert_msg_(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, (int)distance);
_assert_msg_(DYNA_REC, IsInRangeImm14(distance), "%s(%d): Received too large distance: %llx", __FUNCTION__, branch.type, distance);
ARM64Reg reg = DecodeReg(branch.reg);
inst = ((branch.bit & 0x20) << 26) | (0x1B << 25) | (Not << 24) | ((branch.bit & 0x1F) << 19) | (MaskImm14(distance) << 5) | reg;
}
@ -961,7 +949,7 @@ void ARM64XEmitter::B(CCFlags cond, const void* ptr)
distance >>= 2;
_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s: Received too large distance: %p->%p %lld %llx", __FUNCTION__, m_code, ptr, distance, distance);
_assert_msg_(DYNA_REC, IsInRangeImm19(distance), "%s: Received too large distance: %p->%p %ld %lx", __FUNCTION__, m_code, ptr, distance, distance);
Write32((0x54 << 24) | (MaskImm19(distance) << 5) | cond);
}
@ -1517,6 +1505,21 @@ void ARM64XEmitter::UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms)
{
EncodeBitfieldMOVInst(2, Rd, Rn, immr, imms);
}
void ARM64XEmitter::BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
{
u32 size = Is64Bit(Rn) ? 64 : 32;
_assert_msg_(DYNA_REC, (lsb + width) <= size, "%s passed lsb %d and width %d which is greater than the register size!",
__FUNCTION__, lsb, width);
EncodeBitfieldMOVInst(1, Rd, Rn, (size - lsb) % size, width - 1);
}
void ARM64XEmitter::UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width)
{
u32 size = Is64Bit(Rn) ? 64 : 32;
_assert_msg_(DYNA_REC, (lsb + width) <= size, "%s passed lsb %d and width %d which is greater than the register size!",
__FUNCTION__, lsb, width);
EncodeBitfieldMOVInst(2, Rd, Rn, (size - lsb) % size, width - 1);
}
void ARM64XEmitter::EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift) {
bool sf = Is64Bit(Rd);
bool N = sf;
@ -1564,15 +1567,15 @@ void ARM64XEmitter::PRFM(ARM64Reg Rt, u32 imm)
// Load/Store pair
void ARM64XEmitter::LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
EncodeLoadStorePair(0, false, 1, type, Rt, Rt2, Rn, imm);
EncodeLoadStorePair(0, 1, type, Rt, Rt2, Rn, imm);
}
void ARM64XEmitter::LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
EncodeLoadStorePair(1, false, 1, type, Rt, Rt2, Rn, imm);
EncodeLoadStorePair(1, 1, type, Rt, Rt2, Rn, imm);
}
void ARM64XEmitter::STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
EncodeLoadStorePair(0, false, 0, type, Rt, Rt2, Rn, imm);
EncodeLoadStorePair(0, 0, type, Rt, Rt2, Rn, imm);
}
// Load/Store Exclusive
@ -2344,6 +2347,27 @@ void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opc
(encoded_size << 10) | (Rn << 5) | Rt);
}
void ARM64FloatEmitter::EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
bool quad = IsQuad(Rt);
u32 encoded_size = 0;
if (size == 16)
encoded_size = 1;
else if (size == 32)
encoded_size = 2;
else if (size == 64)
encoded_size = 3;
Rt = DecodeReg(Rt);
Rn = DecodeReg(Rn);
Rm = DecodeReg(Rm);
Write32((quad << 30) | (0x19 << 23) | (L << 22) | (Rm << 16) | (opcode << 12) | \
(encoded_size << 10) | (Rn << 5) | Rt);
}
void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn)
{
_assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__);
@ -2376,6 +2400,99 @@ void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM
Write32((size << 30) | (0xF << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt);
}
void ARM64FloatEmitter::EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
u32 type_encode = 0;
u32 opc = 0;
switch (type)
{
case INDEX_SIGNED:
type_encode = 2;
break;
case INDEX_POST:
type_encode = 1;
break;
case INDEX_PRE:
type_encode = 3;
break;
case INDEX_UNSIGNED:
_assert_msg_(DYNA_REC, false, "%s doesn't support INDEX_UNSIGNED!", __FUNCTION__);
break;
}
if (size == 128)
{
_assert_msg_(DYNA_REC, !(imm & 0xF), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
opc = 2;
imm >>= 4;
}
else if (size == 64)
{
_assert_msg_(DYNA_REC, !(imm & 0x7), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
opc = 1;
imm >>= 3;
}
else if (size == 32)
{
_assert_msg_(DYNA_REC, !(imm & 0x3), "%s received invalid offset 0x%x!", __FUNCTION__, imm);
opc = 0;
imm >>= 2;
}
Rt = DecodeReg(Rt);
Rt2 = DecodeReg(Rt2);
Rn = DecodeReg(Rn);
Write32((opc << 30) | (0xB << 26) | (type_encode << 23) | (load << 22) | \
((imm & 0x7F) << 15) | (Rt2 << 10) | (Rn << 5) | Rt);
}
void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
_assert_msg_(DYNA_REC, Rm.GetType() == ArithOption::TYPE_EXTENDEDREG, "%s must contain an extended reg as Rm!", __FUNCTION__);
u32 encoded_size = 0;
u32 encoded_op = 0;
if (size == 8)
{
encoded_size = 0;
encoded_op = 0;
}
else if (size == 16)
{
encoded_size = 1;
encoded_op = 0;
}
else if (size == 32)
{
encoded_size = 2;
encoded_op = 0;
}
else if (size == 64)
{
encoded_size = 3;
encoded_op = 0;
}
else if (size == 128)
{
encoded_size = 0;
encoded_op = 2;
}
if (load)
encoded_op |= 1;
Rt = DecodeReg(Rt);
Rn = DecodeReg(Rn);
ARM64Reg decoded_Rm = DecodeReg(Rm.GetReg());
Write32((encoded_size << 30) | (encoded_op << 22) | (0x1E1 << 21) | (decoded_Rm << 16) | \
Rm.GetData() | (1 << 11) | (Rn << 5) | Rt);
}
void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
{
EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm);
@ -2565,6 +2682,18 @@ void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
{
EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn);
}
void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn)
{
EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn);
}
void ARM64FloatEmitter::LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
EmitLoadStoreSingleStructure(1, 0, 6, 0, size >> 4, Rt, Rn, Rm);
}
void ARM64FloatEmitter::LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
EmitLoadStoreSingleStructure(1, 1, 6, 0, size >> 4, Rt, Rn, Rm);
}
void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn)
{
@ -2687,6 +2816,22 @@ void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
opcode = 2;
EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn);
}
void ARM64FloatEmitter::LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
_assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
u32 opcode = 0;
if (count == 1)
opcode = 7;
else if (count == 2)
opcode = 0xA;
else if (count == 3)
opcode = 6;
else if (count == 4)
opcode = 2;
EmitLoadStoreMultipleStructurePost(size, 1, opcode, Rt, Rn, Rm);
}
void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
{
_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
@ -2701,6 +2846,22 @@ void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn)
opcode = 2;
EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn);
}
void ARM64FloatEmitter::ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm)
{
_assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__);
_assert_msg_(DYNA_REC, type == INDEX_POST, "%s only supports post indexing!", __FUNCTION__);
u32 opcode = 0;
if (count == 1)
opcode = 7;
else if (count == 2)
opcode = 0xA;
else if (count == 3)
opcode = 6;
else if (count == 4)
opcode = 2;
EmitLoadStoreMultipleStructurePost(size, 0, opcode, Rt, Rn, Rm);
}
// Scalar - 1 Source
void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
@ -2709,7 +2870,6 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
EmitScalar1Source(0, 0, IsDouble(Rd), 0, Rd, Rn);
} else {
_assert_msg_(JIT, !IsQuad(Rd) && !IsQuad(Rn), "FMOV can't move to/from quads");
int type = 0;
int rmode = 0;
int opcode = 6;
int sf = 0;
@ -2728,6 +2888,26 @@ void ARM64FloatEmitter::FMOV(ARM64Reg Rd, ARM64Reg Rn, bool top)
}
}
// Loadstore paired
void ARM64FloatEmitter::LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
EncodeLoadStorePair(size, true, type, Rt, Rt2, Rn, imm);
}
void ARM64FloatEmitter::STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm)
{
EncodeLoadStorePair(size, false, type, Rt, Rt2, Rn, imm);
}
// Loadstore register offset
void ARM64FloatEmitter::STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
EncodeLoadStoreRegisterOffset(size, false, Rt, Rn, Rm);
}
void ARM64FloatEmitter::LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm)
{
EncodeLoadStoreRegisterOffset(size, true, Rt, Rn, Rm);
}
void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn)
{
EmitScalar1Source(0, 0, IsDouble(Rd), 1, Rd, Rn);
@ -2854,13 +3034,25 @@ void ARM64FloatEmitter::FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0x1A, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0b11110, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
EmitThreeSame(0, size >> 6, 0x19, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn, bool source_upper)
void ARM64FloatEmitter::FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
{
Emit2RegMisc(source_upper, 0, size >> 6, 0x17, Rd, Rn);
EmitThreeSame(0, 2 | size >> 6, 0b11110, Rd, Rn, Rm);
}
void ARM64FloatEmitter::FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(false, 0, size >> 6, 0x17, Rd, Rn);
}
void ARM64FloatEmitter::FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn)
{
Emit2RegMisc(true, 0, size >> 6, 0x17, Rd, Rn);
}
void ARM64FloatEmitter::FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn)
{
@ -3254,6 +3446,47 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm)
}
// Shift by immediate
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
SSHLL(src_size, Rd, Rn, shift, false);
}
void ARM64FloatEmitter::SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
SSHLL(src_size, Rd, Rn, shift, true);
}
void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
SHRN(dest_size, Rd, Rn, shift, false);
}
void ARM64FloatEmitter::SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
SHRN(dest_size, Rd, Rn, shift, true);
}
void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
USHLL(src_size, Rd, Rn, shift, false);
}
void ARM64FloatEmitter::USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
USHLL(src_size, Rd, Rn, shift, true);
}
void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
{
SXTL(src_size, Rd, Rn, false);
}
void ARM64FloatEmitter::SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
{
SXTL(src_size, Rd, Rn, true);
}
void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
{
UXTL(src_size, Rd, Rn, false);
}
void ARM64FloatEmitter::UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn)
{
UXTL(src_size, Rd, Rn, true);
}
void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
{
_assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__);
@ -3296,7 +3529,7 @@ void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift,
EmitShiftImm(upper, 1, immh, immb, 0x14, Rd, Rn);
}
void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper)
{
_assert_msg_(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", __FUNCTION__);
u32 immh = 0;
@ -3314,7 +3547,7 @@ void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift)
{
immh = 4 | ((shift >> 3) & 3);;
}
EmitShiftImm(IsQuad(Rd), 1, immh, immb, 0x10, Rd, Rn);
EmitShiftImm(upper, 1, immh, immb, 0x10, Rd, Rn);
}
void ARM64FloatEmitter::SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper)
@ -3360,32 +3593,160 @@ void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
}
void ARM64FloatEmitter::LDP(IndexType index_type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) {
m_emit->EncodeLoadStorePair(0, true, 1, index_type, Rt, Rt2, Rn, imm);
}
void ARM64FloatEmitter::STP(IndexType index_type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm) {
m_emit->EncodeLoadStorePair(0, true, 0, index_type, Rt, Rt2, Rn, imm);
}
// TODO: According to the ABI, we really only need to save the bottom 64 bits of D8-D15.
void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers)
void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
{
for (auto it : registers)
STR(128, INDEX_PRE, (ARM64Reg)(Q0 + it), SP, -16);
}
bool bundled_loadstore = false;
void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask)
{
for (int i = 31; i >= 0; --i)
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
if (ignore_mask[i])
m_emit->ADD(SP, SP, 16);
else
LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
int count = 0;
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count > 1)
{
bundled_loadstore = true;
break;
}
}
if (bundled_loadstore && tmp != INVALID_REG)
{
int num_regs = registers.Count();
m_emit->SUB(SP, SP, num_regs * 16);
m_emit->ADD(tmp, SP, 0);
std::vector<ARM64Reg> island_regs;
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
// 0 = true
// 1 < 4 && registers[i + 1] true!
// 2 < 4 && registers[i + 2] true!
// 3 < 4 && registers[i + 3] true!
// 4 < 4 && registers[i + 4] false!
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count == 1)
island_regs.push_back((ARM64Reg)(Q0 + i));
else
ST1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), tmp);
i += count - 1;
}
// Handle island registers
std::vector<ARM64Reg> pair_regs;
for (auto& it : island_regs)
{
pair_regs.push_back(it);
if (pair_regs.size() == 2)
{
STP(128, INDEX_POST, pair_regs[0], pair_regs[1], tmp, 32);
pair_regs.clear();
}
}
if (pair_regs.size())
STR(128, INDEX_POST, pair_regs[0], tmp, 16);
}
else
{
std::vector<ARM64Reg> pair_regs;
for (auto it : registers)
{
pair_regs.push_back((ARM64Reg)(Q0 + it));
if (pair_regs.size() == 2)
{
STP(128, INDEX_PRE, pair_regs[0], pair_regs[1], SP, -32);
pair_regs.clear();
}
}
if (pair_regs.size())
STR(128, INDEX_PRE, pair_regs[0], SP, -16);
}
}
void ARM64FloatEmitter::ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp)
{
bool bundled_loadstore = false;
int num_regs = registers.Count();
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count > 1)
{
bundled_loadstore = true;
break;
}
}
if (bundled_loadstore && tmp != INVALID_REG)
{
// The temporary register is only used to indicate that we can use this code path
std::vector<ARM64Reg> island_regs;
for (int i = 0; i < 32; ++i)
{
if (!registers[i])
continue;
int count = 0;
while (++count < 4 && (i + count) < 32 && registers[i + count]) {}
if (count == 1)
island_regs.push_back((ARM64Reg)(Q0 + i));
else
LD1(64, count, INDEX_POST, (ARM64Reg)(Q0 + i), SP);
i += count - 1;
}
// Handle island registers
std::vector<ARM64Reg> pair_regs;
for (auto& it : island_regs)
{
pair_regs.push_back(it);
if (pair_regs.size() == 2)
{
LDP(128, INDEX_POST, pair_regs[0], pair_regs[1], SP, 32);
pair_regs.clear();
}
}
if (pair_regs.size())
LDR(128, INDEX_POST, pair_regs[0], SP, 16);
}
else
{
bool odd = num_regs % 2;
std::vector<ARM64Reg> pair_regs;
for (int i = 31; i >= 0; --i)
{
if (!registers[i])
continue;
if (odd)
{
// First load must be a regular LDR if odd
odd = false;
LDR(128, INDEX_POST, (ARM64Reg)(Q0 + i), SP, 16);
}
else
{
pair_regs.push_back((ARM64Reg)(Q0 + i));
if (pair_regs.size() == 2)
{
LDP(128, INDEX_POST, pair_regs[1], pair_regs[0], SP, 32);
pair_regs.clear();
}
}
}
}
}

View file

@ -1,5 +1,5 @@
// Copyright 2013 Dolphin Emulator Project
// Licensed under GPLv2
// Copyright 2015 Dolphin Emulator Project
// Licensed under GPLv2+
// Refer to the license.txt file included.
#pragma once
@ -139,7 +139,7 @@ enum IndexType
INDEX_UNSIGNED = 0,
INDEX_POST = 1,
INDEX_PRE = 2,
INDEX_SIGNED = INDEX_UNSIGNED // used in LDP/STP
INDEX_SIGNED = 3, // used in LDP/STP
};
enum ShiftAmount
@ -309,7 +309,7 @@ public:
{
return m_type;
}
ARM64Reg GetReg()
ARM64Reg GetReg() const
{
return m_destReg;
}
@ -367,7 +367,7 @@ private:
void EncodeLoadStoreRegisterOffset(u32 size, u32 opc, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
void EncodeAddSubImmInst(u32 op, bool flags, u32 shift, u32 imm, ARM64Reg Rn, ARM64Reg Rd);
void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms, int n);
void EncodeLoadStorePair(u32 op, bool V, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm);
void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
@ -592,10 +592,8 @@ public:
void BFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
void SBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
void UBFM(ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms);
void BFI(ARM64Reg Rd, ARM64Reg Rn, int lsb, int width) {
BFM(Rd, Rn, (64 - lsb) % (Is64Bit(Rd) ? 64 : 32), width - 1);
}
void BFI(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
void UBFIZ(ARM64Reg Rd, ARM64Reg Rn, u32 lsb, u32 width);
// Extract register (ROR with two inputs, if same then faster on A67)
void EXTR(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u32 shift);
@ -773,12 +771,25 @@ public:
void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn);
void LD1R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
void LD2R(u8 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn);
void ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm);
// Loadstore multiple structure
void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
void LD1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn);
void ST1(u8 size, u8 count, IndexType type, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm = SP);
// Loadstore paired
void LDP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void STP(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
// Loadstore register offset
void STR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
void LDR(u8 size, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
// Scalar - 1 Source
void FABS(ARM64Reg Rd, ARM64Reg Rn);
@ -812,9 +823,12 @@ public:
void DUP(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index);
void FABS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FADD(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMAX(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FMLS(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn, bool source_upper = false);
void FMIN(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void FCVTL(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTL2(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZS(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void FCVTZU(u8 size, ARM64Reg Rd, ARM64Reg Rn);
@ -835,11 +849,11 @@ public:
void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn);
void SCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
void UCVTF(u8 size, ARM64Reg Rd, ARM64Reg Rn, int scale);
void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void SQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void SQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void UQXTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void UQXTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void XTN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
void XTN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn);
// Move
@ -892,11 +906,16 @@ public:
void ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
// Shift by immediate
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper = false);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper = false);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SSHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void USHLL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper = false);
void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper = false);
void SHRN2(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift);
void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
void SXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
void UXTL2(u8 src_size, ARM64Reg Rd, ARM64Reg Rn);
// vector x indexed element
void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
@ -905,12 +924,9 @@ public:
void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void STP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
// ABI related
void ABI_PushRegisters(BitSet32 registers);
void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0));
void ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
void ABI_PopRegisters(BitSet32 registers, ARM64Reg tmp = INVALID_REG);
private:
ARM64XEmitter* m_emit;
@ -934,11 +950,20 @@ private:
void EmitShiftImm(bool Q, bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitScalarShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn);
void EmitLoadStoreMultipleStructurePost(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm);
void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn);
void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm);
void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm);
void EmitConvertScalarToInt(ARM64Reg Rd, ARM64Reg Rn, RoundingMode round, bool sign);
void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, bool upper);
};
class ARM64CodeBlock : public CodeBlock<ARM64XEmitter>
@ -956,4 +981,3 @@ private:
}
};
}

View file

@ -299,7 +299,7 @@ allocate:
goto allocate;
}
// Uh oh, we have all them spilllocked....
// Uh oh, we have all of them spilllocked....
ERROR_LOG_REPORT(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc);
return INVALID_REG;
}

View file

@ -226,7 +226,7 @@ allocate:
goto allocate;
}
// Uh oh, we have all them spilllocked....
// Uh oh, we have all of them spilllocked....
ERROR_LOG_REPORT(JIT, "Out of spillable registers at PC %08x!!!", mips_->pc);
return INVALID_REG;
}
@ -347,6 +347,10 @@ void Arm64RegCache::DiscardR(MIPSGPReg mipsReg) {
ARM64Reg Arm64RegCache::ARM64RegForFlush(MIPSGPReg r) {
switch (mr[r].loc) {
case ML_IMM:
// Zero is super easy.
if (mr[r].imm == 0) {
return WZR;
}
// Could we get lucky? Check for an exact match in another armreg.
for (int i = 0; i < NUM_MIPSREG; ++i) {
if (mr[i].loc == ML_ARMREG_IMM && mr[i].imm == mr[r].imm) {
@ -434,13 +438,24 @@ void Arm64RegCache::FlushAll() {
// Flush it first so we don't get it confused.
FlushR(MIPS_REG_LO);
// TODO: We could do a pass to allocate regs for imms for more paired stores.
// 1 because MIPS_REG_ZERO isn't flushable anyway.
// 31 because 30 and 31 are the last possible pair - MIPS_REG_FPCOND, etc. are too far away.
for (int i = 0; i < 31; i++) {
for (int i = 1; i < 31; i++) {
MIPSGPReg mreg1 = MIPSGPReg(i);
MIPSGPReg mreg2 = MIPSGPReg(i + 1);
ARM64Reg areg1 = ARM64RegForFlush(mreg1);
ARM64Reg areg2 = ARM64RegForFlush(mreg2);
// If either one doesn't have a reg yet, try flushing imms to scratch regs.
if (areg1 == INVALID_REG && IsImm(mreg1)) {
SetRegImm(SCRATCH1, GetImm(mreg1));
areg1 = SCRATCH1;
}
if (areg2 == INVALID_REG && IsImm(mreg2)) {
SetRegImm(SCRATCH2, GetImm(mreg2));
areg2 = SCRATCH2;
}
if (areg1 != INVALID_REG && areg2 != INVALID_REG) {
// We can use a paired store, awesome.
emit_->STP(INDEX_SIGNED, areg1, areg2, CTXREG, GetMipsRegOffset(mreg1));

View file

@ -307,8 +307,8 @@ void VertexDecoderJitCache::Jit_ApplyWeights() {
break;
default:
// Matrices 4+ need to be loaded from memory.
fp.LDP(INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
fp.LDP(INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
fp.LDP(128, INDEX_SIGNED, Q8, Q9, scratchReg64, 0);
fp.LDP(128, INDEX_SIGNED, Q10, Q11, scratchReg64, 2 * 16);
fp.FMLA(32, Q4, Q8, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q5, Q9, neonWeightRegsQ[i >> 2], i & 3);
fp.FMLA(32, Q6, Q10, neonWeightRegsQ[i >> 2], i & 3);