#include <algorithm> #include <cstring> #include <map> #include <utility> #include "Common/BitSet.h" #include "Common/Data/Convert/SmallDataConvert.h" #include "Common/Log.h" #include "Core/Config.h" #include "Core/MIPS/MIPSVFPUUtils.h" #include "Core/MIPS/IR/IRAnalysis.h" #include "Core/MIPS/IR/IRInterpreter.h" #include "Core/MIPS/IR/IRPassSimplify.h" #include "Core/MIPS/IR/IRRegCache.h" // #define CONDITIONAL_DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; } #define CONDITIONAL_DISABLE #define DISABLE { for (IRInst inst : in.GetInstructions()) { out.Write(inst); } return false; } u32 Evaluate(u32 a, u32 b, IROp op) { switch (op) { case IROp::Add: case IROp::AddConst: return a + b; case IROp::Sub: case IROp::SubConst: return a - b; case IROp::And: case IROp::AndConst: return a & b; case IROp::Or: case IROp::OrConst: return a | b; case IROp::Xor: case IROp::XorConst: return a ^ b; case IROp::Shr: case IROp::ShrImm: return a >> b; case IROp::Sar: case IROp::SarImm: return (s32)a >> b; case IROp::Ror: case IROp::RorImm: return (a >> b) | (a << (32 - b)); case IROp::Shl: case IROp::ShlImm: return a << b; case IROp::Slt: case IROp::SltConst: return ((s32)a < (s32)b); case IROp::SltU: case IROp::SltUConst: return (a < b); default: _assert_msg_(false, "Unable to evaluate two op %d", (int)op); return -1; } } u32 Evaluate(u32 a, IROp op) { switch (op) { case IROp::Not: return ~a; case IROp::Neg: return -(s32)a; case IROp::BSwap16: return ((a & 0xFF00FF00) >> 8) | ((a & 0x00FF00FF) << 8); case IROp::BSwap32: return swap32(a); case IROp::Ext8to32: return SignExtend8ToU32(a); case IROp::Ext16to32: return SignExtend16ToU32(a); case IROp::ReverseBits: return ReverseBits32(a); case IROp::Clz: { int x = 31; int count = 0; while (x >= 0 && !(a & (1 << x))) { count++; x--; } return count; } default: _assert_msg_(false, "Unable to evaluate one op %d", (int)op); return -1; } } IROp ArithToArithConst(IROp op) { switch (op) { case IROp::Add: return IROp::AddConst; case IROp::Sub: return IROp::SubConst; case IROp::And: return IROp::AndConst; case IROp::Or: return IROp::OrConst; case IROp::Xor: return IROp::XorConst; case IROp::Slt: return IROp::SltConst; case IROp::SltU: return IROp::SltUConst; default: _assert_msg_(false, "Invalid ArithToArithConst for op %d", (int)op); return (IROp)-1; } } IROp ShiftToShiftImm(IROp op) { switch (op) { case IROp::Shl: return IROp::ShlImm; case IROp::Shr: return IROp::ShrImm; case IROp::Ror: return IROp::RorImm; case IROp::Sar: return IROp::SarImm; default: _assert_msg_(false, "Invalid ShiftToShiftImm for op %d", (int)op); return (IROp)-1; } } bool IRApplyPasses(const IRPassFunc *passes, size_t c, const IRWriter &in, IRWriter &out, const IROptions &opts) { out.Reserve(in.GetInstructions().size()); if (c == 1) { return passes[0](in, out, opts); } bool logBlocks = false; IRWriter temp[2]; const IRWriter *nextIn = &in; IRWriter *nextOut = &temp[1]; temp[1].Reserve(nextIn->GetInstructions().size()); for (size_t i = 0; i < c - 1; ++i) { if (passes[i](*nextIn, *nextOut, opts)) { logBlocks = true; } temp[0] = std::move(temp[1]); nextIn = &temp[0]; temp[1].Clear(); temp[1].Reserve(nextIn->GetInstructions().size()); } out.Reserve(nextIn->GetInstructions().size()); if (passes[c - 1](*nextIn, out, opts)) { logBlocks = true; } return logBlocks; } bool OptimizeFPMoves(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; bool logBlocks = false; IRInst prev{ IROp::Nop }; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; switch
(inst.op) { case IROp::FMovFromGPR: //FMovToGPR a0, f12 //FMovFromGPR f14, a0 // to //FMovToGPR a0, f12 //FMov f14, f12 if (prev.op == IROp::FMovToGPR && prev.dest == inst.src1) { inst.op = IROp::FMov; inst.src1 = prev.src1; // Skip it entirely if it's just a copy to and back. if (inst.dest != inst.src1) out.Write(inst); } else { out.Write(inst); } break; // This will need to scan forward or keep track of more information to be useful. // Just doing one isn't. /* case IROp::LoadVec4: // AddConst a0, sp, 0x30 // LoadVec4 v16, a0, 0x0 // to // AddConst a0, sp, 0x30 // LoadVec4 v16, sp, 0x30 if (prev.op == IROp::AddConst && prev.dest == inst.src1 && prev.dest != prev.src1 && prev.src1 == MIPS_REG_SP) { inst.constant += prev.constant; inst.src1 = prev.src1; logBlocks = 1; } else { goto doDefault; } out.Write(inst); break; */ default: out.Write(inst); break; } prev = inst; } return logBlocks; } // Might be useful later on x86. bool ThreeOpToTwoOp(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; bool logBlocks = false; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; switch (inst.op) { case IROp::Sub: case IROp::Slt: case IROp::SltU: case IROp::Add: case IROp::And: case IROp::Or: case IROp::Xor: if (inst.src1 != inst.dest && inst.src2 != inst.dest) { out.Write(IROp::Mov, inst.dest, inst.src1); out.Write(inst.op, inst.dest, inst.dest, inst.src2); } else { out.Write(inst); } break; case IROp::FMul: case IROp::FAdd: if (inst.src1 != inst.dest && inst.src2 != inst.dest) { out.Write(IROp::FMov, inst.dest, inst.src1); out.Write(inst.op, inst.dest, inst.dest, inst.src2); } else { out.Write(inst); } break; case IROp::Vec4Add: case IROp::Vec4Sub: case IROp::Vec4Mul: case IROp::Vec4Div: if (inst.src1 != inst.dest && inst.src2 != inst.dest) { out.Write(IROp::Vec4Mov, inst.dest, inst.src1); out.Write(inst.op, inst.dest, inst.dest, inst.src2); } else { out.Write(inst); } break; default: out.Write(inst); break; } } return logBlocks; } bool RemoveLoadStoreLeftRight(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; bool logBlocks = false; for (int i = 0, n = (int)in.GetInstructions().size(); i < n; ++i) { const IRInst &inst = in.GetInstructions()[i]; // TODO: Reorder or look ahead to combine? auto nextOp = [&]() -> const IRInst &{ return in.GetInstructions()[i + 1]; }; auto combineOpposite = [&](IROp matchOp, int matchOff, IROp replaceOp, int replaceOff) { if (i + 1 >= n) return false; const IRInst &next = nextOp(); if (next.op != matchOp || next.dest != inst.dest || next.src1 != inst.src1) return false; if (inst.constant + matchOff != next.constant) return false; if (opts.unalignedLoadStore) { // Write out one unaligned op. out.Write(replaceOp, inst.dest, inst.src1, out.AddConstant(inst.constant + replaceOff)); } else if (replaceOp == IROp::Load32) { // We can still combine to a simpler set of two loads. // We start by isolating the address and shift amount. 
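// E.g. (illustrative regs) the pair
//   Load32Left t0, a0, 3
//   Load32Right t0, a0, 0
// is a single 32-bit load of [a0]; without unaligned load/store support
// it becomes the two aligned Load32s merged by shifts below.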
// IRTEMP_LR_ADDR = rs + imm out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant + replaceOff)); // IRTEMP_LR_SHIFT = (addr & 3) * 8 out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3)); out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3); // IRTEMP_LR_ADDR = addr & 0xfffffffc out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC)); // IRTEMP_LR_VALUE = low_word, dest = high_word out.Write(IROp::Load32, inst.dest, IRTEMP_LR_ADDR, out.AddConstant(0)); out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(4)); // Now we just need to adjust and combine dest and IRTEMP_LR_VALUE. // inst.dest >>= shift (putting its bits in the right spot.) out.Write(IROp::Shr, inst.dest, inst.dest, IRTEMP_LR_SHIFT); // We can't shift by 32, so we compromise by shifting twice. out.Write(IROp::ShlImm, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, 8); // IRTEMP_LR_SHIFT = 24 - shift out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); // IRTEMP_LR_VALUE <<= (24 - shift) out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT); // At this point the values are aligned, and we just merge. out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE); } else { return false; } // Skip the next one, replaced. i++; return true; }; auto addCommonProlog = [&]() { // IRTEMP_LR_ADDR = rs + imm out.Write(IROp::AddConst, IRTEMP_LR_ADDR, inst.src1, out.AddConstant(inst.constant)); // IRTEMP_LR_SHIFT = (addr & 3) * 8 out.Write(IROp::AndConst, IRTEMP_LR_SHIFT, IRTEMP_LR_ADDR, out.AddConstant(3)); out.Write(IROp::ShlImm, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, 3); // IRTEMP_LR_ADDR = addr & 0xfffffffc (for stores, later) out.Write(IROp::AndConst, IRTEMP_LR_ADDR, IRTEMP_LR_ADDR, out.AddConstant(0xFFFFFFFC)); // IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR) out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(0)); }; auto addCommonStore = [&](int off = 0) { // RAM(IRTEMP_LR_ADDR) = IRTEMP_LR_VALUE out.Write(IROp::Store32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(off)); }; switch (inst.op) { case IROp::Load32Left: if (!combineOpposite(IROp::Load32Right, -3, IROp::Load32, -3)) { addCommonProlog(); // dest &= (0x00ffffff >> shift) // Alternatively, could shift to a wall and back (but would require two shifts each way.) 
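// Concretely: for an address with low bits k, lwl replaces the top
// k + 1 bytes of dest, so the mask keeps the low 3 - k bytes
// (IRTEMP_LR_SHIFT already holds k * 8 at this point.)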
out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff); out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT); out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK); // IRTEMP_LR_SHIFT = 24 - shift out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); // IRTEMP_LR_VALUE <<= (24 - shift) out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT); // dest |= IRTEMP_LR_VALUE out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE); bool src1Dirty = inst.dest == inst.src1; while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) { // IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta) out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant)); // dest &= IRTEMP_LR_MASK out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK); // IRTEMP_LR_VALUE <<= (24 - shift) out.Write(IROp::Shl, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT); // dest |= IRTEMP_LR_VALUE out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE); src1Dirty = nextOp().dest == inst.src1; ++i; } } break; case IROp::Load32Right: if (!combineOpposite(IROp::Load32Left, 3, IROp::Load32, 0)) { addCommonProlog(); // IRTEMP_LR_VALUE >>= shift out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT); // IRTEMP_LR_SHIFT = 24 - shift out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); // dest &= (0xffffff00 << (24 - shift)) // Alternatively, could shift to a wall and back (but would require two shifts each way.) out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00); out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT); out.Write(IROp::And, inst.dest, inst.dest, IRTEMP_LR_MASK); // dest |= IRTEMP_LR_VALUE out.Write(IROp::Or, inst.dest, inst.dest, IRTEMP_LR_VALUE); // Building display lists sometimes involves a bunch of lwr in a row. // We can generate more optimal code by combining. 
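// E.g. (illustrative) lwr t0, 0(a0); lwr t1, 4(a0); lwr t2, 8(a0):
// the prolog above is shared, so each extra lwr in the run costs only
// a Load32 plus Shr/And/Or below.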
bool shiftNeedsReverse = true; bool src1Dirty = inst.dest == inst.src1; while (i + 1 < n && !src1Dirty && nextOp().op == inst.op && nextOp().src1 == inst.src1 && (nextOp().constant & 3) == (inst.constant & 3)) { // IRTEMP_LR_VALUE = RAM(IRTEMP_LR_ADDR + offsetDelta) out.Write(IROp::Load32, IRTEMP_LR_VALUE, IRTEMP_LR_ADDR, out.AddConstant(nextOp().constant - inst.constant)); if (shiftNeedsReverse) { // IRTEMP_LR_SHIFT = shift again out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); shiftNeedsReverse = false; } // IRTEMP_LR_VALUE >>= IRTEMP_LR_SHIFT out.Write(IROp::Shr, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_SHIFT); // dest &= IRTEMP_LR_MASK out.Write(IROp::And, nextOp().dest, nextOp().dest, IRTEMP_LR_MASK); // dest |= IRTEMP_LR_VALUE out.Write(IROp::Or, nextOp().dest, nextOp().dest, IRTEMP_LR_VALUE); src1Dirty = nextOp().dest == inst.src1; ++i; } } break; case IROp::Store32Left: if (!combineOpposite(IROp::Store32Right, -3, IROp::Store32, -3)) { addCommonProlog(); // IRTEMP_LR_VALUE &= 0xffffff00 << shift out.WriteSetConstant(IRTEMP_LR_MASK, 0xffffff00); out.Write(IROp::Shl, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT); out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK); // IRTEMP_LR_SHIFT = 24 - shift out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); // IRTEMP_LR_VALUE |= src3 >> (24 - shift) out.Write(IROp::Shr, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT); out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK); addCommonStore(0); } break; case IROp::Store32Right: if (!combineOpposite(IROp::Store32Left, 3, IROp::Store32, 0)) { addCommonProlog(); // IRTEMP_LR_VALUE &= 0x00ffffff << (24 - shift) out.WriteSetConstant(IRTEMP_LR_MASK, 0x00ffffff); out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); out.Write(IROp::Shr, IRTEMP_LR_MASK, IRTEMP_LR_MASK, IRTEMP_LR_SHIFT); out.Write(IROp::And, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK); out.Write(IROp::Neg, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT); out.Write(IROp::AddConst, IRTEMP_LR_SHIFT, IRTEMP_LR_SHIFT, out.AddConstant(24)); // IRTEMP_LR_VALUE |= src3 << shift out.Write(IROp::Shl, IRTEMP_LR_MASK, inst.src3, IRTEMP_LR_SHIFT); out.Write(IROp::Or, IRTEMP_LR_VALUE, IRTEMP_LR_VALUE, IRTEMP_LR_MASK); addCommonStore(0); } break; default: out.Write(inst); break; } } return logBlocks; } bool PropagateConstants(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; IRImmRegCache gpr(&out); bool logBlocks = false; bool skipNextExitToConst = false; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; bool symmetric = true; switch (inst.op) { case IROp::SetConst: gpr.SetImm(inst.dest, inst.constant); break; case IROp::SetConstF: goto doDefault; case IROp::Sub: if (gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0 && !gpr.IsImm(inst.src2)) { // Morph into a Neg. gpr.MapDirtyIn(inst.dest, inst.src2); out.Write(IROp::Neg, inst.dest, inst.src2); break; } else if (inst.src1 == inst.src2) { // Seen sometimes, yet another way of producing zero. 
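// (e.g. "subu v0, a0, a0".)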
gpr.SetImm(inst.dest, 0); break; } #if __cplusplus >= 201703 || _MSC_VER > 1910 [[fallthrough]]; #endif case IROp::Slt: case IROp::SltU: symmetric = false; #if __cplusplus >= 201703 || _MSC_VER > 1910 [[fallthrough]]; #endif case IROp::Add: case IROp::And: case IROp::Or: case IROp::Xor: // Regularize, for the add/or check below. if (symmetric && inst.src2 == inst.dest && inst.src1 != inst.src2) { std::swap(inst.src1, inst.src2); } if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op)); } else if (inst.op == IROp::And && gpr.IsImm(inst.src1) && gpr.GetImm(inst.src1) == 0) { gpr.SetImm(inst.dest, 0); } else if (inst.op == IROp::And && gpr.IsImm(inst.src2) && gpr.GetImm(inst.src2) == 0) { gpr.SetImm(inst.dest, 0); } else if (gpr.IsImm(inst.src2)) { const u32 imm2 = gpr.GetImm(inst.src2); gpr.MapDirtyIn(inst.dest, inst.src1); if (imm2 == 0 && (inst.op == IROp::Add || inst.op == IROp::Sub || inst.op == IROp::Or || inst.op == IROp::Xor)) { // Add / Sub / Or / Xor with zero is just a Mov. Add / Or are most common. if (inst.dest != inst.src1) out.Write(IROp::Mov, inst.dest, inst.src1); } else { out.Write(ArithToArithConst(inst.op), inst.dest, inst.src1, out.AddConstant(imm2)); } } else if (symmetric && gpr.IsImm(inst.src1)) { const u32 imm1 = gpr.GetImm(inst.src1); gpr.MapDirtyIn(inst.dest, inst.src2); if (imm1 == 0 && (inst.op == IROp::Add || inst.op == IROp::Or || inst.op == IROp::Xor)) { // Add / Or / Xor with zero is just a Mov. if (inst.dest != inst.src2) out.Write(IROp::Mov, inst.dest, inst.src2); } else { out.Write(ArithToArithConst(inst.op), inst.dest, inst.src2, out.AddConstant(imm1)); } } else { gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); goto doDefault; } break; case IROp::Neg: case IROp::Not: case IROp::BSwap16: case IROp::BSwap32: case IROp::Ext8to32: case IROp::Ext16to32: case IROp::ReverseBits: case IROp::Clz: if (gpr.IsImm(inst.src1)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.op)); } else { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::AddConst: case IROp::SubConst: case IROp::AndConst: case IROp::OrConst: case IROp::XorConst: case IROp::SltConst: case IROp::SltUConst: // And 0 is otherwise set to 0. Happens when optimizing lwl. if (inst.op == IROp::AndConst && inst.constant == 0) { gpr.SetImm(inst.dest, 0); } else if (gpr.IsImm(inst.src1)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.constant, inst.op)); } else if (inst.constant == 0 && (inst.op == IROp::AddConst || inst.op == IROp::SubConst || inst.op == IROp::OrConst || inst.op == IROp::XorConst)) { // Convert an Add/Sub/Or/Xor with a constant zero to a Mov (just like with reg zero.) 
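// E.g. "addiu v0, a0, 0" is really just "move v0, a0".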
gpr.MapDirtyIn(inst.dest, inst.src1); if (inst.dest != inst.src1) out.Write(IROp::Mov, inst.dest, inst.src1); } else { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::Shl: case IROp::Shr: case IROp::Ror: case IROp::Sar: if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), gpr.GetImm(inst.src2), inst.op)); } else if (gpr.IsImm(inst.src2)) { const u8 sa = gpr.GetImm(inst.src2) & 31; gpr.MapDirtyIn(inst.dest, inst.src1); if (sa == 0) { if (inst.dest != inst.src1) out.Write(IROp::Mov, inst.dest, inst.src1); } else { out.Write(ShiftToShiftImm(inst.op), inst.dest, inst.src1, sa); } } else { gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); goto doDefault; } break; case IROp::ShlImm: case IROp::ShrImm: case IROp::RorImm: case IROp::SarImm: if (gpr.IsImm(inst.src1)) { gpr.SetImm(inst.dest, Evaluate(gpr.GetImm(inst.src1), inst.src2, inst.op)); } else { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::Mov: if (inst.dest == inst.src1) { // Nop } else if (gpr.IsImm(inst.src1)) { gpr.SetImm(inst.dest, gpr.GetImm(inst.src1)); } else { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::Mult: case IROp::MultU: case IROp::Madd: case IROp::MaddU: case IROp::Msub: case IROp::MsubU: case IROp::Div: case IROp::DivU: gpr.MapInIn(inst.src1, inst.src2); goto doDefault; case IROp::MovZ: case IROp::MovNZ: gpr.MapInInIn(inst.dest, inst.src1, inst.src2); goto doDefault; case IROp::Min: case IROp::Max: gpr.MapDirtyInIn(inst.dest, inst.src1, inst.src2); goto doDefault; case IROp::FMovFromGPR: if (gpr.IsImm(inst.src1)) { out.Write(IROp::SetConstF, inst.dest, out.AddConstant(gpr.GetImm(inst.src1))); } else { gpr.MapIn(inst.src1); goto doDefault; } break; case IROp::FMovToGPR: gpr.MapDirty(inst.dest); goto doDefault; case IROp::MfHi: case IROp::MfLo: gpr.MapDirty(inst.dest); goto doDefault; case IROp::MtHi: case IROp::MtLo: gpr.MapIn(inst.src1); goto doDefault; case IROp::Store8: case IROp::Store16: case IROp::Store32: case IROp::Store32Left: case IROp::Store32Right: case IROp::Store32Conditional: if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) { gpr.MapIn(inst.dest); out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant)); } else { gpr.MapInIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::StoreFloat: case IROp::StoreVec4: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant)); } else { gpr.MapIn(inst.src1); goto doDefault; } break; case IROp::Load8: case IROp::Load8Ext: case IROp::Load16: case IROp::Load16Ext: case IROp::Load32: case IROp::Load32Linked: if (gpr.IsImm(inst.src1) && inst.src1 != inst.dest) { gpr.MapDirty(inst.dest); out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant)); } else { gpr.MapDirtyIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::LoadFloat: case IROp::LoadVec4: if (gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant)); } else { gpr.MapIn(inst.src1); goto doDefault; } break; case IROp::Load32Left: case IROp::Load32Right: if (gpr.IsImm(inst.src1)) { gpr.MapIn(inst.dest); out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant)); } else { gpr.MapInIn(inst.dest, inst.src1); goto doDefault; } break; case IROp::ValidateAddress8: case IROp::ValidateAddress16: case IROp::ValidateAddress32: case IROp::ValidateAddress128: if 
(gpr.IsImm(inst.src1)) { out.Write(inst.op, inst.dest, 0, out.AddConstant(gpr.GetImm(inst.src1) + inst.constant)); } else { gpr.MapIn(inst.src1); goto doDefault; } break; case IROp::Downcount: case IROp::SetPCConst: goto doDefault; case IROp::SetPC: if (gpr.IsImm(inst.src1)) { out.Write(IROp::SetPCConst, out.AddConstant(gpr.GetImm(inst.src1))); } else { gpr.MapIn(inst.src1); goto doDefault; } break; // FP-only instructions don't need to flush immediates. case IROp::FAdd: case IROp::FMul: // Regularize, to help x86 backends (add.s r0, r1, r0 -> add.s r0, r0, r1) if (inst.src2 == inst.dest && inst.src1 != inst.src2) std::swap(inst.src1, inst.src2); out.Write(inst); break; case IROp::FSub: case IROp::FDiv: case IROp::FNeg: case IROp::FAbs: case IROp::FMov: case IROp::FRound: case IROp::FTrunc: case IROp::FCeil: case IROp::FFloor: case IROp::FCvtSW: case IROp::FCvtScaledWS: case IROp::FCvtScaledSW: case IROp::FSin: case IROp::FCos: case IROp::FSqrt: case IROp::FRSqrt: case IROp::FRecip: case IROp::FAsin: out.Write(inst); break; case IROp::SetCtrlVFPU: gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest); goto doDefault; case IROp::SetCtrlVFPUReg: if (gpr.IsImm(inst.src1)) { out.Write(IROp::SetCtrlVFPU, inst.dest, out.AddConstant(gpr.GetImm(inst.src1))); } else { gpr.MapDirtyIn(IRREG_VFPU_CTRL_BASE + inst.dest, inst.src1); out.Write(inst); } break; case IROp::SetCtrlVFPUFReg: gpr.MapDirty(IRREG_VFPU_CTRL_BASE + inst.dest); goto doDefault; case IROp::FCvtWS: // TODO: Actually, this should just use the currently set rounding mode. // Move up with FCvtSW when that's implemented. gpr.MapIn(IRREG_FCR31); out.Write(inst); break; case IROp::FpCondFromReg: gpr.MapDirtyIn(IRREG_FPCOND, inst.src1); out.Write(inst); break; case IROp::FpCondToReg: if (gpr.IsImm(IRREG_FPCOND)) { gpr.SetImm(inst.dest, gpr.GetImm(IRREG_FPCOND)); } else { gpr.MapDirtyIn(inst.dest, IRREG_FPCOND); out.Write(inst); } break; case IROp::FpCtrlFromReg: gpr.MapDirtyIn(IRREG_FCR31, inst.src1); gpr.MapDirty(IRREG_FPCOND); goto doDefault; case IROp::FpCtrlToReg: gpr.MapDirtyInIn(inst.dest, IRREG_FPCOND, IRREG_FCR31); goto doDefault; case IROp::Vec4Init: case IROp::Vec4Mov: case IROp::Vec4Add: case IROp::Vec4Sub: case IROp::Vec4Mul: case IROp::Vec4Div: case IROp::Vec4Dot: case IROp::Vec4Scale: case IROp::Vec4Shuffle: case IROp::Vec4Blend: case IROp::Vec4Neg: case IROp::Vec4Abs: case IROp::Vec4Pack31To8: case IROp::Vec4Pack32To8: case IROp::Vec2Pack32To16: case IROp::Vec4Unpack8To32: case IROp::Vec2Unpack16To32: case IROp::Vec4DuplicateUpperBitsAndShift1: case IROp::Vec2ClampToZero: case IROp::Vec4ClampToZero: out.Write(inst); break; case IROp::FCmp: gpr.MapDirty(IRREG_FPCOND); goto doDefault; case IROp::RestoreRoundingMode: case IROp::ApplyRoundingMode: case IROp::UpdateRoundingMode: goto doDefault; case IROp::VfpuCtrlToReg: gpr.MapDirtyIn(inst.dest, IRREG_VFPU_CTRL_BASE + inst.src1); goto doDefault; case IROp::FCmpVfpuBit: gpr.MapDirty(IRREG_VFPU_CC); goto doDefault; case IROp::FCmovVfpuCC: gpr.MapIn(IRREG_VFPU_CC); goto doDefault; case IROp::FCmpVfpuAggregate: gpr.MapDirtyIn(IRREG_VFPU_CC, IRREG_VFPU_CC); goto doDefault; case IROp::ExitToConstIfEq: case IROp::ExitToConstIfNeq: if (gpr.IsImm(inst.src1) && gpr.IsImm(inst.src2)) { bool passed = false; switch (inst.op) { case IROp::ExitToConstIfEq: passed = gpr.GetImm(inst.src1) == gpr.GetImm(inst.src2); break; case IROp::ExitToConstIfNeq: passed = gpr.GetImm(inst.src1) != gpr.GetImm(inst.src2); break; default: _assert_(false); break; } // This is a bit common for the first cycle of loops. 
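// E.g. a beq comparing two regs that both hold known constants: on a
// known-taken branch we emit an unconditional ExitToConst and then drop
// the now-dead fall-through ExitToConst that follows it.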
// Reduce bloat by skipping on fail, and const exit on pass. if (passed) { gpr.FlushAll(); out.Write(IROp::ExitToConst, out.AddConstant(inst.constant)); skipNextExitToConst = true; } break; } gpr.FlushAll(); goto doDefault; case IROp::ExitToConstIfGtZ: case IROp::ExitToConstIfGeZ: case IROp::ExitToConstIfLtZ: case IROp::ExitToConstIfLeZ: if (gpr.IsImm(inst.src1)) { bool passed = false; switch (inst.op) { case IROp::ExitToConstIfGtZ: passed = (s32)gpr.GetImm(inst.src1) > 0; break; case IROp::ExitToConstIfGeZ: passed = (s32)gpr.GetImm(inst.src1) >= 0; break; case IROp::ExitToConstIfLtZ: passed = (s32)gpr.GetImm(inst.src1) < 0; break; case IROp::ExitToConstIfLeZ: passed = (s32)gpr.GetImm(inst.src1) <= 0; break; default: _assert_(false); break; } if (passed) { gpr.FlushAll(); out.Write(IROp::ExitToConst, out.AddConstant(inst.constant)); skipNextExitToConst = true; } break; } gpr.FlushAll(); goto doDefault; case IROp::ExitToConst: if (skipNextExitToConst) { skipNextExitToConst = false; break; } gpr.FlushAll(); goto doDefault; case IROp::ExitToReg: if (gpr.IsImm(inst.src1)) { // This happens sometimes near loops. // Prefer ExitToConst to allow block linking. u32 dest = gpr.GetImm(inst.src1); gpr.FlushAll(); out.Write(IROp::ExitToConst, out.AddConstant(dest)); break; } gpr.FlushAll(); goto doDefault; case IROp::CallReplacement: case IROp::Break: case IROp::Syscall: case IROp::Interpret: case IROp::ExitToConstIfFpFalse: case IROp::ExitToConstIfFpTrue: case IROp::Breakpoint: case IROp::MemoryCheck: default: { gpr.FlushAll(); doDefault: out.Write(inst); break; } } } gpr.FlushAll(); return logBlocks; } IRInstMeta IRReplaceSrcGPR(const IRInstMeta &inst, int fromReg, int toReg) { IRInstMeta newInst = inst; if (inst.m.types[1] == 'G' && inst.src1 == fromReg) { newInst.src1 = toReg; } if (inst.m.types[2] == 'G' && inst.src2 == fromReg) { newInst.src2 = toReg; } if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0 && inst.m.types[0] == 'G' && inst.src3 == fromReg) { newInst.src3 = toReg; } return newInst; } IRInstMeta IRReplaceDestGPR(const IRInstMeta &inst, int fromReg, int toReg) { IRInstMeta newInst = inst; if ((inst.m.flags & IRFLAG_SRC3) == 0 && inst.m.types[0] == 'G' && inst.dest == fromReg) { newInst.dest = toReg; } return newInst; } bool IRMutatesDestGPR(const IRInstMeta &inst, int reg) { return (inst.m.flags & IRFLAG_SRC3DST) != 0 && inst.m.types[0] == 'G' && inst.src3 == reg; } bool PurgeTemps(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; std::vector<IRInstMeta> insts; insts.reserve(in.GetInstructions().size()); // We track writes both to rename regs and to purge dead stores. struct Check { Check(int r, int i, bool rbx) : reg(r), index(i), readByExit(rbx) { } // Register this instruction wrote to. int reg; // Only other than -1 when it's a Mov, equivalent reg at this point. int srcReg = -1; // Index into insts for this op. int index; // Whether the dest reg is read by any Exit. bool readByExit; int8_t fplen = 0; }; std::vector<Check> checks; checks.reserve(in.GetInstructions().size() / 2); // This tracks the last index at which each reg was modified.
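// Both tables are indexed by IR reg: GPRs at 0-31, FPU/VFPU regs offset
// by +32, and the IRTEMP_* regs above those (hence the +/- 32
// adjustments below.)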
int lastWrittenTo[256]; int lastReadFrom[256]; memset(lastWrittenTo, -1, sizeof(lastWrittenTo)); memset(lastReadFrom, -1, sizeof(lastReadFrom)); auto readsFromFPRCheck = [](IRInstMeta &inst, Check &check, bool *directly) { if (check.reg < 32) return false; bool result = false; *directly = true; for (int i = 0; i < 4; ++i) { bool laneDirectly; if (check.fplen >= i + 1 && IRReadsFromFPR(inst, check.reg - 32 + i, &laneDirectly)) { result = true; if (!laneDirectly) { *directly = false; break; } } } return result; }; bool logBlocks = false; size_t firstCheck = 0; for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { IRInstMeta inst = GetIRMeta(in.GetInstructions()[i]); // It helps to skip through rechecking ones we already discarded. for (size_t ch = firstCheck; ch < checks.size(); ++ch) { Check &check = checks[ch]; if (check.reg != 0) { firstCheck = ch; break; } } // Check if we can optimize by running through all the writes we've previously found. for (size_t ch = firstCheck; ch < checks.size(); ++ch) { Check &check = checks[ch]; if (check.reg == 0) { // This means we already optimized this or a later inst depends on it. continue; } bool readsDirectly; if (IRReadsFromGPR(inst, check.reg, &readsDirectly)) { // If this reads from the reg, we either depend on it or we can fold or swap. // That's determined below. // If this reads and writes the reg (e.g. MovZ, Load32Left), we can't just swap. bool mutatesReg = IRMutatesDestGPR(inst, check.reg); // If this doesn't directly read (i.e. Interpret), we can't swap. bool cannotReplace = !readsDirectly; if (!mutatesReg && !cannotReplace && check.srcReg >= 0 && lastWrittenTo[check.srcReg] < check.index) { // Replace with the srcReg instead. This happens with non-nice delay slots. // We're changing "Mov A, B; Add C, C, A" to "Mov A, B; Add C, C, B" here. // srcReg should only be set when it was a Mov. inst = IRReplaceSrcGPR(inst, check.reg, check.srcReg); // If the Mov modified the same reg as this instruction, we can't optimize from it anymore. if (inst.dest == check.reg) { check.reg = 0; // We can also optimize it out since we've essentially moved now. insts[check.index].op = IROp::Mov; insts[check.index].dest = 0; insts[check.index].src1 = 0; } } else if (!IRMutatesDestGPR(insts[check.index], check.reg) && inst.op == IROp::Mov && i == check.index + 1) { // As long as the previous inst wasn't modifying its dest reg, and this is a Mov, we can swap. // We're changing "Add A, B, C; Mov B, A" to "Add B, B, C; Mov A, B" here. // This happens with lwl/lwr temps. Replace the original dest. insts[check.index] = IRReplaceDestGPR(insts[check.index], check.reg, inst.dest); lastWrittenTo[inst.dest] = check.index; // If it's being read from (by inst now), we can't optimize out. check.reg = 0; // Update the read by exit flag to match the new reg. check.readByExit = inst.dest < IRTEMP_0 || inst.dest > IRTEMP_LR_SHIFT; // And swap the args for this mov, since we changed the other dest. We'll optimize this out later. std::swap(inst.dest, inst.src1); } else { // Legitimately read from, so we can't optimize out. // Unless this is an exit and a temp not read directly by the exit. if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly) check.reg = 0; } } else if (check.fplen >= 1 && readsFromFPRCheck(inst, check, &readsDirectly)) { // If one or the other is a Vec, they must match. 
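// I.e. a tracked Vec4 can only be renamed through reads with the exact
// same base and width; a single-lane or partially overlapping Vec2/Vec4
// read counts as a mismatch.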
bool lenMismatch = false; auto checkMismatch = [&check, &lenMismatch](IRReg src, char type) { int srclen = 1; if (type == 'V') srclen = 4; else if (type == '2') srclen = 2; else if (type != 'F') return; if (src + 32 + srclen > check.reg && src + 32 < check.reg + check.fplen) { if (src + 32 != check.reg || srclen != check.fplen) lenMismatch = true; } }; checkMismatch(inst.src1, inst.m.types[1]); checkMismatch(inst.src2, inst.m.types[2]); if ((inst.m.flags & (IRFLAG_SRC3 | IRFLAG_SRC3DST)) != 0) checkMismatch(inst.src3, inst.m.types[3]); bool cannotReplace = !readsDirectly || lenMismatch; if (!cannotReplace && check.srcReg >= 32 && lastWrittenTo[check.srcReg] < check.index) { // This is probably not worth doing unless we can get rid of a temp. if (!check.readByExit) { if (insts[check.index].dest == inst.src1) inst.src1 = check.srcReg - 32; else if (insts[check.index].dest == inst.src2) inst.src2 = check.srcReg - 32; else _assert_msg_(false, "Unexpected src3 read of FPR"); // Check if we've clobbered it entirely. if (inst.dest == check.reg) { check.reg = 0; insts[check.index].op = IROp::Mov; insts[check.index].dest = 0; insts[check.index].src1 = 0; } } else { // Let's not bother. check.reg = 0; } } else if ((inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) && !lenMismatch) { // A swap could be profitable if this is a temp, and maybe in other cases. // These can happen a lot from mask regs, etc. // But make sure no other changes happened between. bool destNotChanged = true; for (int j = 0; j < check.fplen; ++j) destNotChanged = destNotChanged && lastWrittenTo[inst.dest + 32 + j] < check.index; bool destNotRead = true; for (int j = 0; j < check.fplen; ++j) destNotRead = destNotRead && lastReadFrom[inst.dest + 32 + j] <= check.index; if (!check.readByExit && destNotChanged && destNotRead) { _dbg_assert_(insts[check.index].dest == inst.src1); insts[check.index].dest = inst.dest; for (int j = 0; j < check.fplen; ++j) lastWrittenTo[inst.dest + 32 + j] = check.index; // If it's being read from (by inst now), we can't optimize out. check.reg = 0; // Swap the dest and src1 so we can optimize this out later, maybe. std::swap(inst.dest, inst.src1); } else { // Doesn't look like a good candidate. check.reg = 0; } } else { // Legitimately read from, so we can't optimize out. if ((inst.m.flags & IRFLAG_EXIT) == 0 || check.readByExit || readsDirectly) check.reg = 0; } } else if (check.readByExit && (inst.m.flags & IRFLAG_EXIT) != 0) { // This is an exit, and the reg is read by any exit. Clear it. check.reg = 0; } else if (IRDestGPR(inst) == check.reg) { // Clobbered, we can optimize out. // This happens sometimes with temporaries used for constant addresses. insts[check.index].op = IROp::Mov; insts[check.index].dest = 0; insts[check.index].src1 = 0; check.reg = 0; } else if (IRWritesToFPR(inst, check.reg - 32) && check.fplen >= 1) { IRReg destFPRs[4]; int numFPRs = IRDestFPRs(inst, destFPRs); if (numFPRs == check.fplen && inst.dest + 32 == check.reg) { // This means we've clobbered it, and with full overlap. // Sometimes this happens for non-temps, i.e. vmmov + vinit last row. insts[check.index].op = IROp::Mov; insts[check.index].dest = 0; insts[check.index].src1 = 0; check.reg = 0; } else { // Since there's an overlap, we simply cannot optimize. 
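// (E.g. a write that clobbers only some lanes of the tracked Vec4.)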
check.reg = 0; } } } int dest = IRDestGPR(inst); switch (dest) { case IRTEMP_0: case IRTEMP_1: case IRTEMP_2: case IRTEMP_3: case IRTEMP_LHS: case IRTEMP_RHS: case IRTEMP_LR_ADDR: case IRTEMP_LR_VALUE: case IRTEMP_LR_MASK: case IRTEMP_LR_SHIFT: // Unlike other registers, these don't need to persist between blocks. // So we consider them not read unless proven read. lastWrittenTo[dest] = i; // If this is a copy, we might be able to optimize out the copy. if (inst.op == IROp::Mov) { Check check(dest, i, false); check.srcReg = inst.src1; checks.push_back(check); } else { checks.push_back(Check(dest, i, false)); } break; default: lastWrittenTo[dest] = i; if (dest > IRTEMP_LR_SHIFT) { // These might sometimes be implicitly read/written by other instructions. break; } checks.push_back(Check(dest, i, true)); break; // Not a GPR output. case 0: case -1: break; } IRReg regs[16]; int readGPRs = IRReadsFromGPRs(inst, regs); if (readGPRs == -1) { for (int j = 0; j < 256; ++j) lastReadFrom[j] = i; } else { for (int j = 0; j < readGPRs; ++j) lastReadFrom[regs[j]] = i; } int readFPRs = IRReadsFromFPRs(inst, regs); if (readFPRs == -1) { for (int j = 0; j < 256; ++j) lastReadFrom[j] = i; } else { for (int j = 0; j < readFPRs; ++j) lastReadFrom[regs[j] + 32] = i; } int destFPRs = IRDestFPRs(inst, regs); for (int j = 0; j < destFPRs; ++j) lastWrittenTo[regs[j] + 32] = i; dest = destFPRs > 0 ? regs[0] + 32 : -1; if (dest >= 32 && dest < IRTEMP_0) { // Standard FPU or VFPU reg. Check check(dest, i, true); check.fplen = (int8_t)destFPRs; checks.push_back(check); } else if (dest >= IRVTEMP_PFX_S + 32 && dest < IRVTEMP_PFX_S + 32 + 16) { // These are temporary regs and not read by exits. Check check(dest, i, false); check.fplen = (int8_t)destFPRs; if (inst.op == IROp::FMov || inst.op == IROp::Vec4Mov) { check.srcReg = inst.src1 + 32; } checks.push_back(check); } else if (dest != -1) { _assert_msg_(false, "Unexpected FPR output %d", dest); } insts.push_back(inst); } // Since we're done with the instructions, all remaining can be nuked. for (Check &check : checks) { if (!check.readByExit && check.reg > 0) { insts[check.index].op = IROp::Mov; insts[check.index].dest = 0; insts[check.index].src1 = 0; } } for (const IRInstMeta &inst : insts) { // Simply skip any Mov 0, 0 instructions, since that's how we nuke one. if (inst.op != IROp::Mov || inst.dest != 0 || inst.src1 != 0) { out.Write(inst.i); } } return logBlocks; } bool ReduceLoads(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; // This tells us to skip an AND op that has been optimized out. // Maybe we could skip multiple, but that'd slow things down and is pretty uncommon. int nextSkip = -1; bool logBlocks = false; for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { IRInst inst = in.GetInstructions()[i]; if (inst.op == IROp::Load32 || inst.op == IROp::Load16 || inst.op == IROp::Load16Ext) { int dest = IRDestGPR(GetIRMeta(inst)); for (int j = i + 1; j < n; j++) { const IRInstMeta laterInst = GetIRMeta(in.GetInstructions()[j]); if ((laterInst.m.flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) { // Exit, so we can't do the optimization. break; } if (IRReadsFromGPR(laterInst, dest)) { if (IRDestGPR(laterInst) == dest && laterInst.op == IROp::AndConst) { const u32 mask = laterInst.constant; // Here we are, maybe we can reduce the load size based on the mask. 
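// E.g. "lw t0, 0(a0); andi t0, t0, 0xff" only needs the low byte, so
// the load can shrink to Load8; and if the mask is exactly 0xff, the
// AndConst becomes redundant and is skipped via nextSkip.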
if ((mask & 0xffffff00) == 0) { inst.op = IROp::Load8; if (mask == 0xff) { nextSkip = j; } } else if ((mask & 0xffff0000) == 0 && inst.op == IROp::Load32) { inst.op = IROp::Load16; if (mask == 0xffff) { nextSkip = j; } } } // If it was read, we can't do the optimization. break; } if (IRDestGPR(laterInst) == dest) { // Someone else wrote, so we can't do the optimization. break; } } } if (i != nextSkip) { out.Write(inst); } } return logBlocks; } static std::vector<IRInst> ReorderLoadStoreOps(std::vector<IRInst> &ops) { if (ops.size() < 2) { return ops; } bool modifiedRegs[256] = {}; for (size_t i = 0, n = ops.size(); i < n - 1; ++i) { bool modifiesReg = false; bool usesFloatReg = false; switch (ops[i].op) { case IROp::Load8: case IROp::Load8Ext: case IROp::Load16: case IROp::Load16Ext: case IROp::Load32: case IROp::Load32Left: case IROp::Load32Right: modifiesReg = true; if (ops[i].src1 == ops[i].dest) { // Can't ever reorder these, since the load changes its own address reg. continue; } break; case IROp::Store8: case IROp::Store16: case IROp::Store32: case IROp::Store32Left: case IROp::Store32Right: break; case IROp::LoadFloat: case IROp::LoadVec4: usesFloatReg = true; modifiesReg = true; break; case IROp::StoreFloat: case IROp::StoreVec4: usesFloatReg = true; break; default: continue; } memset(modifiedRegs, 0, sizeof(modifiedRegs)); size_t start = i; size_t j; for (j = i; j < n; ++j) { if (ops[start].op != ops[j].op || ops[start].src1 != ops[j].src1) { // Incompatible ops, so let's not reorder. break; } if (modifiedRegs[ops[j].dest] || (!usesFloatReg && modifiedRegs[ops[j].src1])) { // Can't reorder, this reg was modified. break; } if (modifiesReg) { // Modifies itself, can't reorder this. if (!usesFloatReg && ops[j].dest == ops[j].src1) { break; } modifiedRegs[ops[j].dest] = true; } // Keep going, these operations are compatible. } // Everything up to (but not including) j will be sorted, so skip them. i = j - 1; size_t end = j; if (start + 1 < end) { std::stable_sort(ops.begin() + start, ops.begin() + end, [&](const IRInst &a, const IRInst &b) { return a.constant < b.constant; }); } } return ops; } bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; bool logBlocks = false; enum class RegState : u8 { UNUSED = 0, READ = 1, CHANGED = 2, }; bool queuing = false; std::vector<IRInst> loadStoreQueue; std::vector<IRInst> otherQueue; RegState otherRegs[256] = {}; auto flushQueue = [&]() { if (!queuing) { return; } std::vector<IRInst> loadStoreUnsorted = loadStoreQueue; std::vector<IRInst> loadStoreSorted = ReorderLoadStoreOps(loadStoreQueue); if (memcmp(&loadStoreSorted[0], &loadStoreUnsorted[0], sizeof(IRInst) * loadStoreSorted.size()) != 0) { logBlocks = true; } queuing = false; for (IRInst queued : loadStoreSorted) { out.Write(queued); } for (IRInst queued : otherQueue) { out.Write(queued); } loadStoreQueue.clear(); otherQueue.clear(); memset(otherRegs, 0, sizeof(otherRegs)); }; for (int i = 0; i < (int)in.GetInstructions().size(); i++) { IRInst inst = in.GetInstructions()[i]; switch (inst.op) { case IROp::Load8: case IROp::Load8Ext: case IROp::Load16: case IROp::Load16Ext: case IROp::Load32: case IROp::Load32Left: case IROp::Load32Right: // To move a load up, its dest can't be changed by things we move down.
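// (A queued read of its dest blocks it too, hence checking dest against
// any non-UNUSED state while the address reg only blocks on CHANGED.)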
if (otherRegs[inst.dest] != RegState::UNUSED || otherRegs[inst.src1] == RegState::CHANGED) { flushQueue(); } queuing = true; loadStoreQueue.push_back(inst); break; case IROp::Store8: case IROp::Store16: case IROp::Store32: case IROp::Store32Left: case IROp::Store32Right: // A store can move above even if it's read, as long as it's not changed by the other ops. if (otherRegs[inst.src3] == RegState::CHANGED || otherRegs[inst.src1] == RegState::CHANGED) { flushQueue(); } queuing = true; loadStoreQueue.push_back(inst); break; case IROp::LoadVec4: case IROp::LoadFloat: case IROp::StoreVec4: case IROp::StoreFloat: // Floats can always move as long as their address is safe. if (otherRegs[inst.src1] == RegState::CHANGED) { flushQueue(); } queuing = true; loadStoreQueue.push_back(inst); break; case IROp::Sub: case IROp::Slt: case IROp::SltU: case IROp::Add: case IROp::And: case IROp::Or: case IROp::Xor: case IROp::Shl: case IROp::Shr: case IROp::Ror: case IROp::Sar: case IROp::MovZ: case IROp::MovNZ: case IROp::Max: case IROp::Min: // We'll try to move this downward. otherRegs[inst.dest] = RegState::CHANGED; if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED) otherRegs[inst.src1] = RegState::READ; if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED) otherRegs[inst.src2] = RegState::READ; otherQueue.push_back(inst); queuing = true; break; case IROp::Neg: case IROp::Not: case IROp::BSwap16: case IROp::BSwap32: case IROp::Ext8to32: case IROp::Ext16to32: case IROp::ReverseBits: case IROp::Clz: case IROp::AddConst: case IROp::SubConst: case IROp::AndConst: case IROp::OrConst: case IROp::XorConst: case IROp::SltConst: case IROp::SltUConst: case IROp::ShlImm: case IROp::ShrImm: case IROp::RorImm: case IROp::SarImm: case IROp::Mov: // We'll try to move this downward. otherRegs[inst.dest] = RegState::CHANGED; if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED) otherRegs[inst.src1] = RegState::READ; otherQueue.push_back(inst); queuing = true; break; case IROp::SetConst: // We'll try to move this downward. otherRegs[inst.dest] = RegState::CHANGED; otherQueue.push_back(inst); queuing = true; break; case IROp::Mult: case IROp::MultU: case IROp::Madd: case IROp::MaddU: case IROp::Msub: case IROp::MsubU: case IROp::Div: case IROp::DivU: if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED) otherRegs[inst.src1] = RegState::READ; if (inst.src2 && otherRegs[inst.src2] != RegState::CHANGED) otherRegs[inst.src2] = RegState::READ; otherQueue.push_back(inst); queuing = true; break; case IROp::MfHi: case IROp::MfLo: case IROp::FpCondToReg: otherRegs[inst.dest] = RegState::CHANGED; otherQueue.push_back(inst); queuing = true; break; case IROp::MtHi: case IROp::MtLo: case IROp::FpCondFromReg: if (inst.src1 && otherRegs[inst.src1] != RegState::CHANGED) otherRegs[inst.src1] = RegState::READ; otherQueue.push_back(inst); queuing = true; break; case IROp::Nop: case IROp::Downcount: if (queuing) { // These are freebies. Sometimes helps with delay slots. otherQueue.push_back(inst); } else { out.Write(inst); } break; default: flushQueue(); out.Write(inst); break; } } return logBlocks; } bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; bool logBlocks = false; auto opsCompatible = [&](const IRInst &a, const IRInst &b, int dist) { if (a.op != b.op || a.src1 != b.src1) { // Not similar enough at all. return false; } u32 off1 = a.constant; u32 off2 = b.constant; if (off1 + dist != off2) { // Not immediately sequential. 
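// (dist is in bytes: the callers pass 1-3 for Store8 runs and 2 for
// Store16 pairs.)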
return false; } return true; }; IRInst prev = { IROp::Nop }; for (int i = 0, n = (int)in.GetInstructions().size(); i < n; i++) { IRInst inst = in.GetInstructions()[i]; int c = 0; switch (inst.op) { case IROp::Store8: for (c = 1; c < 4 && i + c < n; ++c) { const IRInst &nextInst = in.GetInstructions()[i + c]; // TODO: Might be nice to check if this is an obvious constant. if (inst.src3 != nextInst.src3 || inst.src3 != 0) { break; } if (!opsCompatible(inst, nextInst, c)) { break; } } if ((c == 2 || c == 3) && opts.unalignedLoadStore) { inst.op = IROp::Store16; out.Write(inst); prev = inst; // Skip the next one (the 3rd will be separate.) ++i; continue; } if (c == 4 && opts.unalignedLoadStore) { inst.op = IROp::Store32; out.Write(inst); prev = inst; // Skip all 4. i += 3; continue; } out.Write(inst); prev = inst; break; case IROp::Store16: for (c = 1; c < 2 && i + c < n; ++c) { const IRInst &nextInst = in.GetInstructions()[i + c]; // TODO: Might be nice to check if this is an obvious constant. if (inst.src3 != nextInst.src3 || inst.src3 != 0) { break; } if (!opsCompatible(inst, nextInst, c * 2)) { break; } } if (c == 2 && opts.unalignedLoadStore) { inst.op = IROp::Store32; out.Write(inst); prev = inst; // Skip the next one. ++i; continue; } out.Write(inst); prev = inst; break; case IROp::Load32: if (prev.src1 == inst.src1 && prev.src2 == inst.src2) { // A store and then an immediate load. This is sadly common in minis. if (prev.op == IROp::Store32 && prev.src3 == inst.dest) { // Even the same reg, a volatile variable? Skip it. continue; } // Store16 and Store8 in rare cases happen... could be made AndConst, but not worth the trouble. if (prev.op == IROp::Store32) { inst.op = IROp::Mov; inst.src1 = prev.src3; inst.src2 = 0; } else if (prev.op == IROp::StoreFloat) { inst.op = IROp::FMovToGPR; inst.src1 = prev.src3; inst.src2 = 0; } // The actual op is written below. } out.Write(inst); prev = inst; break; case IROp::LoadFloat: if (prev.src1 == inst.src1 && prev.src2 == inst.src2) { // A store and then an immediate load, of a float. if (prev.op == IROp::StoreFloat && prev.src3 == inst.dest) { // Volatile float, I suppose? continue; } if (prev.op == IROp::StoreFloat) { inst.op = IROp::FMov; inst.src1 = prev.src3; inst.src2 = 0; } else if (prev.op == IROp::Store32) { inst.op = IROp::FMovFromGPR; inst.src1 = prev.src3; inst.src2 = 0; } // The actual op is written below. } out.Write(inst); prev = inst; break; default: out.Write(inst); prev = inst; break; } } return logBlocks; } struct IRMemoryOpInfo { int size; bool isWrite; bool isWordLR; }; static IRMemoryOpInfo IROpMemoryAccessSize(IROp op) { // Assumes all take src1 + constant. switch (op) { case IROp::Load8: case IROp::Load8Ext: case IROp::Store8: return { 1, op == IROp::Store8 }; case IROp::Load16: case IROp::Load16Ext: case IROp::Store16: return { 2, op == IROp::Store16 }; case IROp::Load32: case IROp::Load32Linked: case IROp::LoadFloat: case IROp::Store32: case IROp::Store32Conditional: case IROp::StoreFloat: return { 4, op == IROp::Store32 || op == IROp::Store32Conditional || op == IROp::StoreFloat }; case IROp::LoadVec4: case IROp::StoreVec4: return { 16, op == IROp::StoreVec4 }; case IROp::Load32Left: case IROp::Load32Right: case IROp::Store32Left: case IROp::Store32Right: // This explicitly does not require alignment, so validate as an 8-bit operation. 
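// (lwl/lwr/swl/swr touch 1-4 bytes but never cross the aligned word
// containing the address, so a 1-byte validation is enough.)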
return { 1, op == IROp::Store32Left || op == IROp::Store32Right, true }; default: return { 0 }; } } bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; if (g_Config.bFastMemory) DISABLE; int spLower = 0; int spUpper = -1; bool spWrite = false; bool spModified = false; for (IRInst inst : in.GetInstructions()) { IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op); // Note: we only combine word aligned accesses. if (info.size != 0 && inst.src1 == MIPS_REG_SP && info.size == 4) { if (spModified) { // No good, it was modified and then we did more accesses. Can't combine. spUpper = -1; break; } if ((int)inst.constant < 0 || (int)inst.constant >= 0x4000) { // Let's assume this might cross boundaries or something. Uncommon. spUpper = -1; break; } spLower = std::min(spLower, (int)inst.constant); spUpper = std::max(spUpper, (int)inst.constant + info.size); spWrite = spWrite || info.isWrite; } const IRMeta *m = GetIRMeta(inst.op); if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0 && inst.dest == MIPS_REG_SP) { // We only care if it changes after we start combining. spModified = spUpper != -1; } } bool skipSP = spUpper != -1; bool flushedSP = false; std::map<uint64_t, uint8_t> checks; const auto addValidate = [&](IROp validate, uint8_t sz, const IRInst &inst, bool isStore) { if (inst.src1 == MIPS_REG_SP && skipSP && validate == IROp::ValidateAddress32) { if (!flushedSP) { out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spLower); if (spUpper > spLower + 4) out.Write(IROp::ValidateAddress32, 0, MIPS_REG_SP, spWrite ? 1U : 0U, spUpper - 4); flushedSP = true; } return; } uint64_t key = ((uint64_t)inst.src1 << 32) | inst.constant; auto it = checks.find(key); if (it == checks.end() || it->second < sz) { out.Write(validate, 0, inst.src1, isStore ? 1U : 0U, inst.constant); checks[key] = sz; } }; bool logBlocks = false; for (IRInst inst : in.GetInstructions()) { IRMemoryOpInfo info = IROpMemoryAccessSize(inst.op); IROp validateOp = IROp::Nop; switch (info.size) { case 1: validateOp = IROp::ValidateAddress8; break; case 2: validateOp = IROp::ValidateAddress16; break; case 4: validateOp = IROp::ValidateAddress32; break; case 16: validateOp = IROp::ValidateAddress128; break; case 0: break; default: _assert_msg_(false, "Unexpected memory access size"); } if (validateOp != IROp::Nop) { addValidate(validateOp, info.size, inst, info.isWrite); } const IRMeta *m = GetIRMeta(inst.op); if (m->types[0] == 'G' && (m->flags & IRFLAG_SRC3) == 0) { uint64_t key = (uint64_t)inst.dest << 32; // Wipe out all the already done checks since this was modified. checks.erase(checks.lower_bound(key), checks.upper_bound(key | 0xFFFFFFFFULL)); } // Always write out the original. We're only adding. out.Write(inst); } return logBlocks; } bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) { CONDITIONAL_DISABLE; // Only do this when using a SIMD backend.
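// (Presumably without SIMD there are no vec4-wide registers to keep
// intact, so there's nothing for this pass to save.)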
if (!opts.preferVec4) { DISABLE; } bool isVec4[256]{}; bool isUsed[256]{}; bool isVec4Dirty[256]{}; auto updateVec4 = [&](char type, IRReg r) { bool downgraded = false; switch (type) { case 'F': downgraded = isVec4[r & ~3]; isVec4[r & ~3] = false; isUsed[r] = true; break; case 'V': _dbg_assert_((r & 3) == 0); isVec4[r] = true; for (int i = 0; i < 4; ++i) isUsed[r + i] = true; break; case '2': downgraded = isVec4[r & ~3]; isVec4[r & ~3] = false; for (int i = 0; i < 2; ++i) isUsed[r + i] = true; break; default: break; } return downgraded; }; auto updateVec4Dest = [&](char type, IRReg r, uint32_t flags) { if ((flags & IRFLAG_SRC3) == 0) { switch (type) { case 'F': isVec4Dirty[r & ~3] = false; break; case 'V': _dbg_assert_((r & 3) == 0); isVec4Dirty[r] = true; break; case '2': isVec4Dirty[r & ~3] = false; break; default: break; } } return updateVec4(type, r); }; // Checks overlap from r1 to other params. auto overlapped = [](IRReg r1, int l1, IRReg r2, int l2, IRReg r3 = IRREG_INVALID, int l3 = 0) { if (r1 < r2 + l2 && r1 + l1 > r2) return true; if (r1 < r3 + l3 && r1 + l1 > r3) return true; return false; }; bool logBlocks = false; int inCount = (int)in.GetInstructions().size(); for (int i = 0; i < inCount; ++i) { IRInst inst = in.GetInstructions()[i]; const IRMeta *m = GetIRMeta(inst.op); if ((m->flags & (IRFLAG_EXIT | IRFLAG_BARRIER)) != 0) { memset(isVec4, 0, sizeof(isVec4)); out.Write(inst); continue; } IRReg temp = IRREG_INVALID; auto findAvailTempVec4 = [&]() { // If it's not used yet in this block, we can use it. // Note: even if the instruction uses it to write, that should be fine. for (IRReg r = IRVTEMP_PFX_S; r < IRVTEMP_0 + 4; r += 4) { if (isUsed[r]) continue; bool usable = true; for (int j = 1; j < 4; ++j) usable = usable && !isUsed[r + j]; if (usable) { temp = r; // We don't update isUsed because our temporary doesn't need to last. return true; } } return false; }; auto usedLaterAsVec4 = [&](IRReg r) { for (int j = i + 1; j < inCount; ++j) { IRInst inst = in.GetInstructions()[j]; const IRMeta *m = GetIRMeta(inst.op); if (m->types[0] == 'V' && inst.dest == r) return true; if (m->types[1] == 'V' && inst.src1 == r) return true; if (m->types[2] == 'V' && inst.src2 == r) return true; } return false; }; bool skip = false; switch (inst.op) { case IROp::SetConstF: if (isVec4[inst.dest & ~3] && findAvailTempVec4()) { // Check if we're setting multiple in a row, this is a bit common. 
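// E.g. a game filling several lanes of one vec4 with the same constant,
// one SetConstF at a time; the run gets folded into a single
// splat-and-blend sequence below.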
u8 blendMask = 1 << (inst.dest & 3); while (i + 1 < inCount) { IRInst next = in.GetInstructions()[i + 1]; if (next.op != IROp::SetConstF || (next.dest & ~3) != (inst.dest & ~3)) break; if (next.constant != inst.constant) break; blendMask |= 1 << (next.dest & 3); i++; } if (inst.constant == 0) { out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllZERO); } else if (inst.constant == 0x3F800000) { out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllONE); } else if (inst.constant == 0xBF800000) { out.Write(IROp::Vec4Init, temp, (int)Vec4Init::AllMinusONE); } else { out.Write(IROp::SetConstF, temp, out.AddConstant(inst.constant)); out.Write(IROp::Vec4Shuffle, temp, temp, 0); } out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask); isVec4Dirty[inst.dest & ~3] = true; continue; } break; case IROp::FMovFromGPR: if (isVec4[inst.dest & ~3] && findAvailTempVec4()) { u8 blendMask = 1 << (inst.dest & 3); out.Write(IROp::FMovFromGPR, temp, inst.src1); out.Write(IROp::Vec4Shuffle, temp, temp, 0); out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask); isVec4Dirty[inst.dest & ~3] = true; continue; } break; case IROp::LoadFloat: if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3) && findAvailTempVec4()) { u8 blendMask = 1 << (inst.dest & 3); out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant); out.Write(IROp::Vec4Shuffle, temp, temp, 0); out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask); isVec4Dirty[inst.dest & ~3] = true; continue; } break; case IROp::StoreFloat: if (isVec4[inst.src3 & ~3] && isVec4Dirty[inst.src3 & ~3] && usedLaterAsVec4(inst.src3 & ~3) && findAvailTempVec4()) { out.Write(IROp::FMov, temp, inst.src3, 0); out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant); continue; } break; case IROp::FMov: if (isVec4[inst.dest & ~3] && (inst.dest & ~3) == (inst.src1 & ~3)) { // Oh, actually a shuffle? uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(0, 1, 2, 3); uint8_t destShift = (inst.dest & 3) * 2; shuffle = (shuffle & ~(3 << destShift)) | ((inst.src1 & 3) << destShift); out.Write(IROp::Vec4Shuffle, inst.dest & ~3, inst.dest & ~3, shuffle); isVec4Dirty[inst.dest & ~3] = true; continue; } else if (isVec4[inst.dest & ~3] && (inst.dest & 3) == (inst.src1 & 3)) { // We can turn this directly into a blend, since it's the same lane. out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, inst.src1 & ~3, 1 << (inst.dest & 3)); isVec4Dirty[inst.dest & ~3] = true; continue; } else if (isVec4[inst.dest & ~3] && isVec4[inst.src1 & ~3] && findAvailTempVec4()) { // For this, we'll need a temporary to move to the right lane. 
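// Broadcast src1's lane across the temp with a shuffle, then blend just
// the one target lane into dest.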
int lane = inst.src1 & 3; uint8_t shuffle = (uint8_t)VFPU_SWIZZLE(lane, lane, lane, lane); out.Write(IROp::Vec4Shuffle, temp, inst.src1 & ~3, shuffle); out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3)); isVec4Dirty[inst.dest & ~3] = true; continue; } break; case IROp::FAdd: case IROp::FSub: case IROp::FMul: case IROp::FDiv: if (isVec4[inst.dest & ~3] && isVec4Dirty[inst.dest & ~3] && usedLaterAsVec4(inst.dest & ~3)) { if (!overlapped(inst.dest & ~3, 4, inst.src1, 1, inst.src2, 1) && findAvailTempVec4()) { u8 blendMask = 1 << (inst.dest & 3); out.Write(inst.op, temp, inst.src1, inst.src2); out.Write(IROp::Vec4Shuffle, temp, temp, 0); out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, blendMask); updateVec4('F', inst.src1); updateVec4('F', inst.src2); isVec4Dirty[inst.dest & ~3] = true; continue; } } break; case IROp::Vec4Dot: if (overlapped(inst.dest, 1, inst.src1, 4, inst.src2, 4) && findAvailTempVec4()) { out.Write(inst.op, temp, inst.src1, inst.src2, inst.constant); if (usedLaterAsVec4(inst.dest & ~3)) { // Broadcast to other lanes if needed. if ((inst.dest & 3) != 0) out.Write(IROp::Vec4Shuffle, temp, temp, 0); out.Write(IROp::Vec4Blend, inst.dest & ~3, inst.dest & ~3, temp, 1 << (inst.dest & 3)); // It's overlapped, so it'll get marked as Vec4 and used anyway. isVec4Dirty[inst.dest & ~3] = true; inst.dest = IRREG_INVALID; } else { out.Write(IROp::FMov, inst.dest, temp); } skip = true; } break; case IROp::Vec4Scale: if (overlapped(inst.src2, 1, inst.src1, 4, inst.dest, 4) && findAvailTempVec4()) { out.Write(IROp::FMov, temp, inst.src2); out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant); skip = true; inst.src2 = IRREG_INVALID; } else if (isVec4[inst.src2 & ~3] && usedLaterAsVec4(inst.src2 & ~3) && findAvailTempVec4()) { out.Write(IROp::FMov, temp, inst.src2); out.Write(inst.op, inst.dest, inst.src1, temp, inst.constant); skip = true; inst.src2 = IRREG_INVALID; } break; default: break; } bool downgrade = false; if (inst.src1 != IRREG_INVALID && updateVec4(m->types[1], inst.src1)) downgrade = true; if (inst.src2 != IRREG_INVALID && updateVec4(m->types[2], inst.src2)) downgrade = true; if (inst.dest != IRREG_INVALID && updateVec4Dest(m->types[0], inst.dest, m->flags)) downgrade = true; if (downgrade) { //WARN_LOG(JIT, "Vec4 downgrade by: %s", m->name); } if (!skip) out.Write(inst); } return logBlocks; }