diff --git a/Core/MIPS/ARM64/Arm64IRRegCache.cpp b/Core/MIPS/ARM64/Arm64IRRegCache.cpp
index f48207fa5f..f58e9ee780 100644
--- a/Core/MIPS/ARM64/Arm64IRRegCache.cpp
+++ b/Core/MIPS/ARM64/Arm64IRRegCache.cpp
@@ -433,6 +433,295 @@ void Arm64IRRegCache::StoreRegValue(IRReg mreg, uint32_t imm) {
 	emit_->STR(INDEX_UNSIGNED, storeReg, CTXREG, GetMipsRegOffset(mreg));
 }
 
+bool Arm64IRRegCache::TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) {
+	bool allowed = !mr[nr[nreg].mipsReg].isStatic;
+	// There's currently no support for non-FREGs here.
+	allowed = allowed && type == MIPSLoc::FREG;
+
+	if (dest == -1)
+		dest = nreg;
+
+	if (allowed && (flags == MIPSMap::INIT || flags == MIPSMap::DIRTY)) {
+		// Alright, changing lane count (possibly including lane position.)
+		IRReg oldfirst = nr[nreg].mipsReg;
+		int oldlanes = 0;
+		while (mr[oldfirst + oldlanes].nReg == nreg)
+			oldlanes++;
+		_assert_msg_(oldlanes != 0, "TransferNativeReg encountered nreg mismatch");
+		_assert_msg_(oldlanes != lanes, "TransferNativeReg transfer to same lanecount, misaligned?");
+
+		if (lanes == 1 && TransferVecTo1(nreg, dest, first, oldlanes))
+			return true;
+		if (oldlanes == 1 && Transfer1ToVec(nreg, dest, first, lanes))
+			return true;
+	}
+
+	return IRNativeRegCacheBase::TransferNativeReg(nreg, dest, type, first, lanes, flags);
+}
+
+bool Arm64IRRegCache::TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes) {
+	IRReg oldfirst = nr[nreg].mipsReg;
+
+	// Is it worth preserving any of the old regs?
+	int numKept = 0;
+	for (int i = 0; i < oldlanes; ++i) {
+		// Skip whichever one this is extracting.
+		if (oldfirst + i == first)
+			continue;
+		// If 0 isn't being transferred, easy to keep in its original reg.
+ if (i == 0 && dest != nreg) { + numKept++; + continue; + } + + IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT); + if (freeReg != -1 && IsRegRead(MIPSLoc::FREG, oldfirst + i)) { + // If there's one free, use it. Don't modify nreg, though. + fp_->DUP(32, FromNativeReg(freeReg), FromNativeReg(nreg), i); + + // Update accounting. + nr[freeReg].isDirty = nr[nreg].isDirty; + nr[freeReg].mipsReg = oldfirst + i; + mr[oldfirst + i].lane = -1; + mr[oldfirst + i].nReg = freeReg; + numKept++; + } + } + + // Unless all other lanes were kept, store. + if (nr[nreg].isDirty && numKept < oldlanes - 1) { + StoreNativeReg(nreg, oldfirst, oldlanes); + // Set false even for regs that were split out, since they were flushed too. + for (int i = 0; i < oldlanes; ++i) { + if (mr[oldfirst + i].nReg != -1) + nr[mr[oldfirst + i].nReg].isDirty = false; + } + } + + // Next, move the desired element into first place. + if (mr[first].lane > 0) { + fp_->DUP(32, FromNativeReg(dest), FromNativeReg(nreg), mr[first].lane); + } else if (mr[first].lane <= 0 && dest != nreg) { + fp_->DUP(32, FromNativeReg(dest), FromNativeReg(nreg), 0); + } + + // Now update accounting. + for (int i = 0; i < oldlanes; ++i) { + auto &mreg = mr[oldfirst + i]; + if (oldfirst + i == first) { + mreg.lane = -1; + mreg.nReg = dest; + } else if (mreg.nReg == nreg && i == 0 && nreg != dest) { + // Still in the same register, but no longer a vec. + mreg.lane = -1; + } else if (mreg.nReg == nreg) { + // No longer in a register. 
+ mreg.nReg = -1; + mreg.lane = -1; + mreg.loc = MIPSLoc::MEM; + } + } + + if (dest != nreg) { + nr[dest].isDirty = nr[nreg].isDirty; + if (oldfirst == first) { + nr[nreg].mipsReg = -1; + nr[nreg].isDirty = false; + } + } + nr[dest].mipsReg = first; + + return true; +} + +bool Arm64IRRegCache::Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes) { + ARM64Reg destReg = FromNativeReg(dest); + ARM64Reg cur[4]{}; + int numInRegs = 0; + u8 blendMask = 0; + for (int i = 0; i < lanes; ++i) { + if (mr[first + i].lane != -1 || (i != 0 && mr[first + i].spillLockIRIndex >= irIndex_)) { + // Can't do it, either double mapped or overlapping vec. + return false; + } + + if (mr[first + i].nReg == -1) { + cur[i] = INVALID_REG; + blendMask |= 1 << i; + } else { + cur[i] = FromNativeReg(mr[first + i].nReg); + numInRegs++; + } + } + + // Shouldn't happen, this should only get called to transfer one in a reg. + if (numInRegs == 0) + return false; + + // If everything's currently in a reg, move it into this reg. + if (lanes == 4) { + // Go with an exhaustive approach, only 15 possibilities... + if (blendMask == 0) { + // y = yw##, x = xz##, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3])); + fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1])); + } else if (blendMask == 0b0001) { + // y = yw##, w = x###, w = xz##, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3])); + fp_->LDR(32, INDEX_UNSIGNED, cur[3], CTXREG, GetMipsRegOffset(first + 0)); + fp_->ZIP1(32, EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[2])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[1])); + } else if (blendMask == 0b0010) { + // x = xz##, z = y###, z = yw##, dest = xyzw. 
+ fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2])); + fp_->LDR(32, INDEX_UNSIGNED, cur[2], CTXREG, GetMipsRegOffset(first + 1)); + fp_->ZIP1(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[3])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2])); + } else if (blendMask == 0b0011 && (first & 1) == 0) { + // z = zw##, w = xy##, dest = xyzw. Mixed lane sizes. + fp_->ZIP1(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[3])); + fp_->LDR(64, INDEX_UNSIGNED, EncodeRegToDouble(cur[3]), CTXREG, GetMipsRegOffset(first + 0)); + fp_->ZIP1(64, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[2])); + } else if (blendMask == 0b0100) { + // y = yw##, w = z###, x = xz##, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3])); + fp_->LDR(32, INDEX_UNSIGNED, cur[3], CTXREG, GetMipsRegOffset(first + 2)); + fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[3])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1])); + } else if (blendMask == 0b0101 && (first & 3) == 0) { + // y = yw##, w=x#z#, w = xz##, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[3])); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(cur[3]), CTXREG, GetMipsRegOffset(first)); + fp_->UZP1(32, EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[1])); + } else if (blendMask == 0b0110 && (first & 3) == 0) { + if (destReg == cur[0]) { + // w = wx##, dest = #yz#, dest = xyz#, dest = xyzw. 
+ fp_->ZIP1(32, EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[3]), EncodeRegToQuad(cur[0])); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(cur[3]), 1); + fp_->INS(32, EncodeRegToQuad(destReg), 3, EncodeRegToQuad(cur[3]), 0); + } else { + // Assumes destReg may equal cur[3]. + // x = xw##, dest = #yz#, dest = xyz#, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[3])); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(cur[0]), 0); + fp_->INS(32, EncodeRegToQuad(destReg), 3, EncodeRegToQuad(cur[0]), 1); + } + } else if (blendMask == 0b0111 && (first & 3) == 0 && destReg != cur[3]) { + // dest = xyz#, dest = xyzw. + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 3, EncodeRegToQuad(cur[3]), 0); + } else if (blendMask == 0b1000) { + // x = xz##, z = w###, y = yw##, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2])); + fp_->LDR(32, INDEX_UNSIGNED, cur[2], CTXREG, GetMipsRegOffset(first + 3)); + fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[2])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1])); + } else if (blendMask == 0b1001 && (first & 3) == 0) { + if (destReg == cur[1]) { + // w = zy##, dest = x##w, dest = xy#w, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[1])); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 1, EncodeRegToQuad(cur[2]), 1); + fp_->INS(32, EncodeRegToQuad(destReg), 2, EncodeRegToQuad(cur[2]), 0); + } else { + // Assumes destReg may equal cur[2]. 
+ // y = yz##, dest = x##w, dest = xy#w, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[1]), EncodeRegToQuad(cur[2])); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 1, EncodeRegToQuad(cur[1]), 0); + fp_->INS(32, EncodeRegToQuad(destReg), 2, EncodeRegToQuad(cur[1]), 1); + } + } else if (blendMask == 0b1010 && (first & 3) == 0) { + // x = xz##, z = #y#w, z=yw##, dest = xyzw. + fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2])); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(cur[2]), CTXREG, GetMipsRegOffset(first)); + fp_->UZP2(32, EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2]), EncodeRegToQuad(cur[2])); + fp_->ZIP1(32, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[2])); + } else if (blendMask == 0b1011 && (first & 3) == 0 && destReg != cur[2]) { + // dest = xy#w, dest = xyzw. + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 2, EncodeRegToQuad(cur[2]), 0); + } else if (blendMask == 0b1100 && (first & 1) == 0) { + // x = xy##, y = zw##, dest = xyzw. Mixed lane sizes. + fp_->ZIP1(32, EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1])); + fp_->LDR(64, INDEX_UNSIGNED, EncodeRegToDouble(cur[1]), CTXREG, GetMipsRegOffset(first + 2)); + fp_->ZIP1(64, EncodeRegToQuad(destReg), EncodeRegToQuad(cur[0]), EncodeRegToQuad(cur[1])); + } else if (blendMask == 0b1101 && (first & 3) == 0 && destReg != cur[1]) { + // dest = x#zw, dest = xyzw. + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 1, EncodeRegToQuad(cur[1]), 0); + } else if (blendMask == 0b1110 && (first & 3) == 0 && destReg != cur[0]) { + // dest = #yzw, dest = xyzw. 
+ fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(cur[0]), 0); + } else if (blendMask == 0b1110 && (first & 3) == 0) { + // If dest == cur[0] (which may be common), we need a temp... + IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT); + // Very unfortunate. + if (freeReg == INVALID_REG) + return false; + + // free = x###, dest = #yzw, dest = xyzw. + fp_->DUP(32, EncodeRegToQuad(FromNativeReg(freeReg)), EncodeRegToQuad(cur[0]), 0); + fp_->LDR(128, INDEX_UNSIGNED, EncodeRegToQuad(destReg), CTXREG, GetMipsRegOffset(first)); + fp_->INS(32, EncodeRegToQuad(destReg), 0, EncodeRegToQuad(FromNativeReg(freeReg)), 0); + } else { + return false; + } + } else if (lanes == 2) { + if (cur[0] != INVALID_REG && cur[1] != INVALID_REG) { + fp_->ZIP1(32, EncodeRegToDouble(destReg), EncodeRegToDouble(cur[0]), EncodeRegToDouble(cur[1])); + } else if (cur[0] == INVALID_REG && dest != nreg) { + fp_->LDR(32, INDEX_UNSIGNED, destReg, CTXREG, GetMipsRegOffset(first + 0)); + fp_->INS(32, EncodeRegToDouble(destReg), 1, EncodeRegToDouble(cur[1]), 0); + } else { + IRNativeReg freeReg = FindFreeReg(MIPSLoc::FREG, MIPSMap::INIT); + if (freeReg == INVALID_REG) + return false; + + if (cur[0] == INVALID_REG) { + fp_->LDR(32, INDEX_UNSIGNED, FromNativeReg(freeReg), CTXREG, GetMipsRegOffset(first + 0)); + fp_->ZIP1(32, EncodeRegToDouble(destReg), EncodeRegToDouble(FromNativeReg(freeReg)), EncodeRegToDouble(cur[1])); + } else { + fp_->LDR(32, INDEX_UNSIGNED, FromNativeReg(freeReg), CTXREG, GetMipsRegOffset(first + 1)); + fp_->ZIP1(32, EncodeRegToDouble(destReg), EncodeRegToDouble(cur[0]), EncodeRegToDouble(FromNativeReg(freeReg))); + } + } + } else { + return false; + } + + mr[first].lane = 0; + for (int i = 0; i < lanes; ++i) { + if (mr[first + i].nReg != -1) { + // If this was dirty, the combined reg is now dirty. 
+			if (nr[mr[first + i].nReg].isDirty)
+				nr[dest].isDirty = true;
+
+			// Throw away the other register we're no longer using.
+			if (i != 0)
+				DiscardNativeReg(mr[first + i].nReg);
+		}
+
+		// And set it as using the new one.
+		mr[first + i].lane = i;
+		mr[first + i].loc = MIPSLoc::FREG;
+		mr[first + i].nReg = dest;
+	}
+
+	if (dest != nreg) {
+		nr[dest].mipsReg = first;
+		nr[nreg].mipsReg = -1;
+		nr[nreg].isDirty = false;
+	}
+
+	return true;
+}
+
 void Arm64IRRegCache::FlushAll(bool gprs, bool fprs) {
 	// Note: make sure not to change the registers when flushing:
 	// Branching code may expect the armreg to retain its value.
diff --git a/Core/MIPS/ARM64/Arm64IRRegCache.h b/Core/MIPS/ARM64/Arm64IRRegCache.h
index 9f0b0cbbac..f3f0b58b6f 100644
--- a/Core/MIPS/ARM64/Arm64IRRegCache.h
+++ b/Core/MIPS/ARM64/Arm64IRRegCache.h
@@ -91,8 +91,12 @@ protected:
 	void StoreNativeReg(IRNativeReg nreg, IRReg first, int lanes) override;
 	void SetNativeRegValue(IRNativeReg nreg, uint32_t imm) override;
 	void StoreRegValue(IRReg mreg, uint32_t imm) override;
+	bool TransferNativeReg(IRNativeReg nreg, IRNativeReg dest, MIPSLoc type, IRReg first, int lanes, MIPSMap flags) override;
 
 private:
+	bool TransferVecTo1(IRNativeReg nreg, IRNativeReg dest, IRReg first, int oldlanes);
+	bool Transfer1ToVec(IRNativeReg nreg, IRNativeReg dest, IRReg first, int lanes);
+
 	IRNativeReg GPRToNativeReg(Arm64Gen::ARM64Reg r);
 	IRNativeReg VFPToNativeReg(Arm64Gen::ARM64Reg r);
 	Arm64Gen::ARM64Reg FromNativeReg(IRNativeReg r);