Mirror of https://github.com/hrydgard/ppsspp.git (synced 2025-04-02 11:01:50 -04:00)
More JIT work, not quite there yet...
This commit is contained in:
parent f732fbb885
commit e7e58591da
8 changed files with 114 additions and 166 deletions
@@ -31,7 +31,7 @@ using namespace Arm64Gen;
//static int temp32; // unused?
static const bool enableDebug = false;
static const bool enableDebug = true;
//static bool enableStatistics = false; //unused?

@@ -68,8 +68,9 @@ static const bool enableDebug = false;
// saving them when we call out of the JIT. We will perform regular dynamic register allocation in the rest (x0-x15)
// STATIC ALLOCATION ARM64 (these are all callee-save registers):
// x25 : MSR/MRS temporary (to be eliminated later)
// x26 : JIT base reg
// x27 : MIPS state
// x27 : MIPS state (Could eliminate by placing the MIPS state right at the memory base)
// x28 : Memory base pointer.
// x29 : Down counter

@@ -102,7 +103,6 @@ void Arm64Jit::GenerateFixedCode() {
ABI_PushRegisters(regs_to_save);
// Fixed registers, these are always kept when in Jit context.
// R8 is used to hold flags during delay slots. Not always needed.
// R13 cannot be used as it's the stack pointer.

@@ -119,7 +119,7 @@ void Arm64Jit::GenerateFixedCode() {
RestoreDowncount();
MovFromPC(SCRATCH1);
outerLoopPCInR0 = GetCodePtr();
outerLoopPCInSCRATCH1 = GetCodePtr();
MovToPC(SCRATCH1);
outerLoop = GetCodePtr();
SaveDowncount();

@@ -159,8 +159,7 @@ void Arm64Jit::GenerateFixedCode() {
// Debug
if (enableDebug) {
// MOV(SCRATCH1, R13);
// QuickCallFunction(R1, (void *)&ShowPC);
QuickCallFunction(SCRATCH1, (void *)&ShowPC);
}

LDR(INDEX_UNSIGNED, SCRATCH1, CTXREG, offsetof(MIPSState, pc));
@@ -168,7 +167,7 @@ void Arm64Jit::GenerateFixedCode() {
ANDI2R(SCRATCH2, SCRATCH1, 0xFF000000); // rotation is to the right, in 2-bit increments.
ANDI2R(SCRATCH1, SCRATCH1, 0x00FFFFFF); // TODO: Replace this and the next op by a bit field extract
LSR(SCRATCH2, SCRATCH2, 24);
CMP(SCRATCH2, MIPS_EMUHACK_OPCODE);
CMP(SCRATCH2, MIPS_EMUHACK_OPCODE>>24);
FixupBranch skipJump = B(CC_NEQ);
// IDEA - we have 26 bits, why not just use offsets from base of code?
// Another idea: Shift the bloc number left by two in the op, this would let us do
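A standalone sketch (not PPSSPP code) of what the emuhack check above computes: the top byte of the opcode word is compared against the emuhack tag and the lower 24 bits carry the block offset. The constant value here is a placeholder for illustration, not the real MIPS_EMUHACK_OPCODE.

#include <cstdint>
#include <cstdio>

static const uint32_t kEmuhackOpcode = 0x68000000;  // assumed placeholder value, not the real constant

int main() {
    uint32_t op = kEmuhackOpcode | 0x001234;         // pretend this was loaded from MIPSState::pc
    uint32_t tag = op >> 24;                         // mirrors ANDI2R(0xFF000000) + LSR #24
    uint32_t blockOffset = op & 0x00FFFFFF;          // mirrors ANDI2R(0x00FFFFFF)
    if (tag == (kEmuhackOpcode >> 24))
        printf("emuhack hit, block offset 0x%06x\n", blockOffset);
    else
        printf("not an emuhack op, fall back to the compiler\n");
    return 0;
}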
@@ -207,9 +206,12 @@ void Arm64Jit::GenerateFixedCode() {
ABI_PopRegisters(regs_to_save);

INFO_LOG(JIT, "THE DISASM ========================");
DisassembleArm64(enterCode, GetCodePtr() - enterCode);
INFO_LOG(JIT, "END OF THE DISASM ========================");
INFO_LOG(JIT, "THE DISASM : %p ========================", enterCode);
std::vector<std::string> lines = DisassembleArm64(enterCode, GetCodePtr() - enterCode);
for (auto s : lines) {
INFO_LOG(JIT, "%s", s.c_str());
}
INFO_LOG(JIT, "END OF THE DISASM : %p ========================", GetCodePtr());

// Don't forget to zap the instruction cache!
FlushIcache();
@@ -124,9 +124,14 @@ void Arm64Jit::BranchRSRTComp(MIPSOpcode op, CCFlags cc, bool likely)
// We might be able to flip the condition (EQ/NEQ are easy.)
const bool canFlip = cc == CC_EQ || cc == CC_NEQ;

// TODO ARM64: Optimize for immediates
gpr.MapInIn(rs, rt);
CMP(gpr.R(rs), gpr.R(rt));
// TODO ARM64: Optimize for immediates other than zero
if (rt == 0) {
gpr.MapIn(rs);
CMP(gpr.R(rs), 0);
} else {
gpr.MapInIn(rs, rt);
CMP(gpr.R(rs), gpr.R(rt));
}

Arm64Gen::FixupBranch ptr;
if (!likely) {
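The rt == 0 path above avoids mapping the always-zero MIPS register and compares against an immediate instead. A minimal sketch of that decision, with hypothetical stand-ins for the emitter calls (not the real register cache or emitter):

#include <cstdio>

// Hypothetical stand-ins for the emitter calls used in the hunk above.
static void EmitCmpImm(int hostReg, unsigned imm) { printf("CMP w%d, #%u\n", hostReg, imm); }
static void EmitCmpReg(int hostRegA, int hostRegB) { printf("CMP w%d, w%d\n", hostRegA, hostRegB); }

// mipsRt == 0 is the MIPS $zero register: compare against an immediate instead of
// mapping (and possibly spilling for) a second host register.
static void CompareForBranch(int mappedRs, int mipsRt, int mappedRt) {
    if (mipsRt == 0)
        EmitCmpImm(mappedRs, 0);
    else
        EmitCmpReg(mappedRs, mappedRt);
}

int main() {
    CompareForBranch(19, 0, -1);   // beq $a0, $zero, target  -> CMP w19, #0
    CompareForBranch(19, 5, 20);   // beq $a0, $a1, target    -> CMP w19, w20
    return 0;
}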
@@ -493,7 +498,7 @@ void Arm64Jit::Comp_JumpReg(MIPSOpcode op)
delaySlotIsNice = false;
CONDITIONAL_NICE_DELAYSLOT;

ARM64Reg destReg = X8;
ARM64Reg destReg = X18;
if (IsSyscall(delaySlotOp)) {
gpr.MapReg(rs);
MovToPC(gpr.R(rs)); // For syscall to be able to return.

@@ -502,6 +507,7 @@ void Arm64Jit::Comp_JumpReg(MIPSOpcode op)
CompileDelaySlot(DELAYSLOT_FLUSH);
return; // Syscall wrote exit code.
} else if (delaySlotIsNice) {
INFO_LOG(JIT, "jreg DelaySlotIsNice");
if (andLink)
gpr.SetImm(rd, js.compilerPC + 8);
CompileDelaySlot(DELAYSLOT_NICE);

@@ -533,7 +539,7 @@ void Arm64Jit::Comp_JumpReg(MIPSOpcode op)
} else {
// Delay slot - this case is very rare, might be able to free up R8.
gpr.MapReg(rs);
MOV(W8, gpr.R(rs));
MOV(X18, gpr.R(rs));
if (andLink)
gpr.SetImm(rd, js.compilerPC + 8);
CompileDelaySlot(DELAYSLOT_NICE);
@@ -586,15 +592,12 @@ void Arm64Jit::Comp_Syscall(MIPSOpcode op)
SaveDowncount();
// Skip the CallSyscall where possible.
void *quickFunc = GetQuickSyscallFunc(op);
if (quickFunc)
{
gpr.SetRegImm(W0, (u32)(intptr_t)GetSyscallInfo(op));
if (quickFunc) {
MOVI2R(W0, (u32)(intptr_t)GetSyscallInfo(op));
// Already flushed, so X1 is safe.
QuickCallFunction(X1, quickFunc);
}
else
{
gpr.SetRegImm(W0, op.encoding);
} else {
MOVI2R(W0, op.encoding);
QuickCallFunction(X1, (void *)&CallSyscall);
}
ApplyRoundingMode();
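A plain C++ sketch of the control flow Comp_Syscall emits above; the types and the toy lookup are hypothetical, only the quick-path/slow-path split is taken from the diff.

#include <cstdint>
#include <cstdio>

typedef void (*QuickSyscallFunc)(uintptr_t syscallInfo);

static void CallSyscallGeneric(uint32_t op) { printf("CallSyscall(0x%08x)\n", op); }
static void QuickDummyHandler(uintptr_t info) { printf("quick syscall, info=0x%llx\n", (unsigned long long)info); }

// Toy lookup standing in for GetQuickSyscallFunc().
static QuickSyscallFunc LookupQuickFunc(uint32_t op) {
    return (op & 1) ? QuickDummyHandler : nullptr;
}

static void CompSyscallSketch(uint32_t op, uintptr_t syscallInfo) {
    if (QuickSyscallFunc quick = LookupQuickFunc(op)) {
        quick(syscallInfo);          // MOVI2R(W0, info); QuickCallFunction(X1, quickFunc);
    } else {
        CallSyscallGeneric(op);      // MOVI2R(W0, op.encoding); QuickCallFunction(X1, &CallSyscall);
    }
}

int main() {
    CompSyscallSketch(0x0000000d, 0x1234);  // takes the quick path in this toy lookup
    CompSyscallSketch(0x0000000c, 0);       // takes the generic CallSyscall path
    return 0;
}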
@@ -35,13 +35,22 @@
#include "Core/MIPS/ARM64/Arm64RegCacheFPU.h"
#include "Core/MIPS/ARM64/Arm64Jit.h"
#include "ext/disarm.h"
#include "Core/MIPS/JitCommon/JitCommon.h"

using namespace Arm64JitConstants;

void DisassembleArm64Print(const u8 *data, int size) {
ILOG("ARM64 TODO");
std::vector<std::string> lines = DisassembleArm64(data, size);
for (auto s : lines) {
ILOG("%s", s.c_str());
}
ILOG("+++");
// A format friendly to Online Disassembler which gets endianness wrong
for (size_t i = 0; i < lines.size(); i++) {
uint32_t opcode = ((uint32_t *)data)[i];
ILOG("%08x", swap32(opcode));
}
ILOG("===");
}

namespace MIPSComp
@@ -124,6 +133,7 @@ void Arm64Jit::FlushPrefixV()

void Arm64Jit::ClearCache()
{
ILOG("ARM64Jit: Clearing the cache!");
blocks.Clear();
ClearCodeSpace();
GenerateFixedCode();
@@ -155,15 +165,32 @@ void Arm64Jit::EatInstruction(MIPSOpcode op) {

void Arm64Jit::CompileDelaySlot(int flags)
{
// TODO ARM64
// preserve flag around the delay slot! Maybe this is not always necessary on ARM where
// we can (mostly) control whether we set the flag or not. Of course, if someone puts an slt in to the
// delay slot, we're screwed.
if (flags & DELAYSLOT_SAFE)
MRS(FLAGTEMPREG, FIELD_NZCV); // Save flags register. X18 is preserved through function calls and is not allocated.

js.inDelaySlot = true;
MIPSOpcode op = Memory::Read_Opcode_JIT(js.compilerPC + 4);
MIPSCompileOp(op);
js.inDelaySlot = false;

if (flags & DELAYSLOT_FLUSH)
FlushAll();
if (flags & DELAYSLOT_SAFE)
_MSR(FIELD_NZCV, FLAGTEMPREG); // Restore flags register
}
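The DELAYSLOT_SAFE handling above keeps the branch's NZCV flags alive across whatever gets compiled in the delay slot (the comment mentions X18, while the FLAGTEMPREG constant in the Arm64RegCache.h hunk further down is X25). A minimal sketch of the save/compile/restore pattern with hypothetical helpers and placeholder flag values:

#include <cstdint>
#include <cstdio>

enum { DELAYSLOT_NICE = 0, DELAYSLOT_SAFE = 1, DELAYSLOT_FLUSH = 2 };  // placeholder values

struct FlagModel {
    uint32_t nzcv = 0;
    uint32_t saved = 0;
    void Save()    { saved = nzcv; }   // stands in for MRS FLAGTEMPREG, NZCV
    void Restore() { nzcv = saved; }   // stands in for MSR NZCV, FLAGTEMPREG
};

static void CompileDelaySlotSketch(FlagModel &flags, int slotFlags, void (*compileOp)(FlagModel &)) {
    if (slotFlags & DELAYSLOT_SAFE)
        flags.Save();              // keep the branch condition alive across the slot
    compileOp(flags);              // the delay-slot instruction may clobber NZCV
    if (slotFlags & DELAYSLOT_SAFE)
        flags.Restore();           // put the condition back before the conditional branch
}

int main() {
    FlagModel flags;
    flags.nzcv = 0x40000000;       // pretend a compare just set Z
    CompileDelaySlotSketch(flags, DELAYSLOT_SAFE, [](FlagModel &f) { f.nzcv = 0; });
    printf("NZCV after safe slot: 0x%08x\n", flags.nzcv);
    return 0;
}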
void Arm64Jit::Compile(u32 em_address) {
if (GetSpaceLeft() < 0x10000 || blocks.IsFull()) {
INFO_LOG(JIT, "Space left: %i", GetSpaceLeft());
ClearCache();
}

INFO_LOG(JIT, "In Compile, at %08x!", em_address);

int block_num = blocks.AllocateBlock(em_address);
JitBlock *b = blocks.GetBlock(block_num);
DoJit(em_address, b);
@@ -213,37 +240,31 @@ const u8 *Arm64Jit::DoJit(u32 em_address, JitBlock *b)
js.inDelaySlot = false;
js.PrefixStart();

logBlocks = 1;

// We add a downcount flag check before the block, used when entering from a linked block.
// The last block decremented downcounter, and the flag should still be available.
// Got three variants here of where we position the code, needs detailed benchmarking.

FixupBranch bail;
/*
if (jo.useBackJump) {
// Moves the MOVI2R and B *before* checkedEntry, and just branch backwards there.
// Speedup seems to be zero unfortunately but I guess it may vary from device to device.
// Not intrusive so keeping it around here to experiment with, may help on ARMv6 due to
// large/slow construction of 32-bit immediates?
JumpTarget backJump = GetCodePtr();
gpr.SetRegImm(R0, js.blockStart);
B((const void *)outerLoopPCInR0);
const u8 *backJump = GetCodePtr();
MOVI2R(SCRATCH1, js.blockStart);
B((const void *)outerLoopPCInSCRATCH1);
b->checkedEntry = GetCodePtr();
SetCC(CC_LT);
B(backJump);
SetCC(CC_AL);
B(CC_LT, backJump);
} else if (jo.useForwardJump) {
b->checkedEntry = GetCodePtr();
SetCC(CC_LT);
bail = B();
SetCC(CC_AL);
bail = B(CC_LT);
} else {
b->checkedEntry = GetCodePtr();
SetCC(CC_LT);
gpr.SetRegImm(R0, js.blockStart);
B((const void *)outerLoopPCInR0);
SetCC(CC_AL);
}*/
// TODO ARM64
MOVI2R(SCRATCH1, js.blockStart);
B(CC_LT, (const void *)outerLoopPCInSCRATCH1);
}

b->normalEntry = GetCodePtr();
// TODO: this needs work
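The surviving (non-commented) path above is the downcount check the comment describes: a linked block arrives with the flags from the previous block's downcount subtraction, and CC_LT bails to the dispatcher with the block's start PC in SCRATCH1. A conceptual sketch in plain C++ (hypothetical names and PC value), not the emitted code:

#include <cstdint>
#include <cstdio>

struct BlockSketch {
    uint32_t startPC;
    void (*normalEntry)();                 // stands in for the code emitted after the check
};

static void Dispatcher(uint32_t pc) { printf("bail to dispatcher, pc=0x%08x\n", pc); }

// checkedEntry: only fall into the block body if there is downcount left.
static void CheckedEntry(const BlockSketch &b, int32_t downcount) {
    if (downcount < 0) {                   // B(CC_LT, outerLoopPCInSCRATCH1), PC already in SCRATCH1
        Dispatcher(b.startPC);
        return;
    }
    b.normalEntry();
}

int main() {
    BlockSketch b{0x08804000, []() { printf("running block body\n"); }};
    CheckedEntry(b, 120);
    CheckedEntry(b, -3);
    return 0;
}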
@@ -281,9 +302,9 @@ const u8 *Arm64Jit::DoJit(u32 em_address, JitBlock *b)
}

if (jo.useForwardJump) {
//SetJumpTarget(bail);
//gpr.SetRegImm(R0, js.blockStart);
//B((const void *)outerLoopPCInR0);
SetJumpTarget(bail);
gpr.SetRegImm(SCRATCH1, js.blockStart);
B((const void *)outerLoopPCInSCRATCH1);
}

char temp[256];
@@ -298,7 +319,7 @@ const u8 *Arm64Jit::DoJit(u32 em_address, JitBlock *b)
b->codeSize = GetCodePtr() - b->normalEntry;

if (logBlocks > 0 && dontLogBlocks == 0) {
INFO_LOG(JIT, "=============== ARM ===============");
INFO_LOG(JIT, "=============== ARM (%d instructions -> %d bytes) ===============", js.numInstructions, b->codeSize);
DisassembleArm64Print(b->normalEntry, GetCodePtr() - b->normalEntry);
}
if (logBlocks > 0)

@@ -317,6 +338,7 @@ const u8 *Arm64Jit::DoJit(u32 em_address, JitBlock *b)
blocks.ProxyBlock(js.blockStart, js.lastContinuedPC, (js.compilerPC - js.lastContinuedPC) / sizeof(u32), GetCodePtr());
b->originalSize = js.initialBlockSize;
}

return b->normalEntry;
}
@@ -359,18 +381,16 @@ void Arm64Jit::Comp_Generic(MIPSOpcode op)
SaveDowncount();
// TODO: Perhaps keep the rounding mode for interp?
RestoreRoundingMode();
// gpr.SetRegImm(SCRATCHREG1, js.compilerPC);
// MovToPC(SCRATCHREG1);
//gpr.SetRegImm(R0, op.encoding);
//QuickCallFunction(R1, (void *)func);
// TODO ARM64
MOVI2R(SCRATCH1, js.compilerPC);
MovToPC(SCRATCH1);
MOVI2R(W0, op.encoding);
QuickCallFunction(SCRATCH2_64, (void *)func);
ApplyRoundingMode();
RestoreDowncount();
}

const MIPSInfo info = MIPSGetInfo(op);
if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0)
{
if ((info & IS_VFPU) != 0 && (info & VFPU_NO_PREFIX) == 0) {
// If it does eat them, it'll happen in MIPSCompileOp().
if ((info & OUT_EAT_PREFIX) == 0)
js.PrefixUnknown();
@@ -386,28 +406,19 @@ void Arm64Jit::MovToPC(ARM64Reg r) {
}

void Arm64Jit::SaveDowncount() {
if (jo.downcountInRegister)
STR(INDEX_UNSIGNED, DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
STR(INDEX_UNSIGNED, DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
}

void Arm64Jit::RestoreDowncount() {
if (jo.downcountInRegister)
LDR(INDEX_UNSIGNED, DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
LDR(INDEX_UNSIGNED, DOWNCOUNTREG, CTXREG, offsetof(MIPSState, downcount));
}

void Arm64Jit::WriteDownCount(int offset) {
// TODO ARM64
}

// Abuses R2
void Arm64Jit::WriteDownCountR(ARM64Reg reg) {
if (jo.downcountInRegister) {
SUBS(DOWNCOUNTREG, DOWNCOUNTREG, reg);
} else {
LDR(INDEX_UNSIGNED, X2, CTXREG, offsetof(MIPSState, downcount));
SUBS(X2, X2, reg);
STR(INDEX_UNSIGNED, X2, CTXREG, offsetof(MIPSState, downcount));
}
SUBS(DOWNCOUNTREG, DOWNCOUNTREG, reg);
}
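SaveDowncount/RestoreDowncount/WriteDownCountR above keep the cycle budget in DOWNCOUNTREG while inside the JIT and spill it to MIPSState::downcount around calls out. A plain C++ model of that bookkeeping (names and the state layout here are simplified stand-ins, not PPSSPP types):

#include <cstdint>
#include <cstdio>

struct MipsStateModel { int32_t downcount; };   // simplified stand-in for MIPSState

struct DowncountModel {
    MipsStateModel *mips;
    int32_t reg;                                // models DOWNCOUNTREG

    void Save()    { mips->downcount = reg; }   // STR DOWNCOUNTREG, [CTXREG, #offsetof(MIPSState, downcount)]
    void Restore() { reg = mips->downcount; }   // LDR DOWNCOUNTREG, [CTXREG, #offsetof(MIPSState, downcount)]
    bool Charge(int32_t cycles) {               // SUBS DOWNCOUNTREG, DOWNCOUNTREG, cycles; the flags tell us when to leave
        reg -= cycles;
        return reg < 0;
    }
};

int main() {
    MipsStateModel mips{1000};
    DowncountModel dc{&mips, 0};
    dc.Restore();                               // entering JIT code
    while (!dc.Charge(300)) {}                  // run blocks until the budget runs out
    dc.Save();                                  // calling out / leaving JIT code
    printf("downcount written back: %d\n", mips.downcount);
    return 0;
}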
void Arm64Jit::RestoreRoundingMode(bool force) {
@@ -441,7 +452,7 @@ void Arm64Jit::WriteExit(u32 destination, int exit_num)
B(blocks.GetBlock(block)->checkedEntry);
b->linkStatus[exit_num] = true;
} else {
gpr.SetRegImm(X0, destination);
MOVI2R(SCRATCH1, destination);
B((const void *)dispatcherPCInSCRATCH1);
}
}
@@ -37,8 +37,7 @@ namespace MIPSComp
struct Arm64JitOptions
{
Arm64JitOptions() {
enableBlocklink = true;
downcountInRegister = true;
enableBlocklink = false;
useBackJump = false;
useForwardJump = false;
cachePointers = true;

@@ -54,7 +53,6 @@ struct Arm64JitOptions
bool useNEONVFPU;
bool enableBlocklink;
bool downcountInRegister;
bool useBackJump;
bool useForwardJump;
bool cachePointers;

@@ -278,7 +276,7 @@ public:
const u8 *enterCode;

const u8 *outerLoop;
const u8 *outerLoopPCInR0;
const u8 *outerLoopPCInSCRATCH1;
const u8 *dispatcherCheckCoreState;
const u8 *dispatcherPCInSCRATCH1;
const u8 *dispatcher;
@@ -50,35 +50,18 @@ void Arm64RegCache::Start(MIPSAnalyst::AnalysisResults &stats) {
}

const ARM64Reg *Arm64RegCache::GetMIPSAllocationOrder(int &count) {
// Note that R0 is reserved as scratch for now.
// R12 is also potentially usable.
// R4-R7 are registers we could use for static allocation or downcount.
// R8 is used to preserve flags in nasty branches.
// R9 and upwards are reserved for jit basics.
// R14 (LR) is used as a scratch reg (overwritten on calls/return.)

// TODO ARM64
if (jo_->downcountInRegister) {
static const ARM64Reg allocationOrder[] = {
X1, X2, X3, X4, X5, X6, X12,
};
count = sizeof(allocationOrder) / sizeof(const int);
return allocationOrder;
} else {
static const ARM64Reg allocationOrder2[] = {
X1, X2, X3, X4, X5, X6, X7, X12,
};
count = sizeof(allocationOrder2) / sizeof(const int);
return allocationOrder2;
}
// See register alloc remarks in Arm64Asm.cpp
// TODO: Add static allocation of top MIPS registers like SP
static const ARM64Reg allocationOrder[] = {
W19, W20, W21, W22, W23, W24, W25, W27, W28, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, W0, W1,
};
count = sizeof(allocationOrder) / sizeof(const int);
return allocationOrder;
}
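Side note, not part of the commit: count = sizeof(allocationOrder) / sizeof(const int) above only yields the element count while ARM64Reg happens to have the same size as int. A sketch of the size-independent idiom, using a hypothetical narrower enum to show the difference:

#include <cstddef>
#include <cstdio>

enum class RegDemo : unsigned char { W19, W20, W21 };    // hypothetical 1-byte register enum

template <typename T, size_t N>
constexpr size_t ArraySize(const T (&)[N]) { return N; } // counts elements regardless of their size

int main() {
    static const RegDemo order[] = { RegDemo::W19, RegDemo::W20, RegDemo::W21 };
    printf("elements: %zu, sizeof-based byte math: %zu\n",
           ArraySize(order), sizeof(order) / sizeof(const int));
    return 0;
}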
void Arm64RegCache::FlushBeforeCall() {
// R4-R11 are preserved. Others need flushing.
FlushArmReg(X1);
FlushArmReg(X2);
FlushArmReg(X3);
FlushArmReg(X12);
// TODO: More optimal
FlushAll();
}

bool Arm64RegCache::IsMapped(MIPSGPReg mipsReg) {
@@ -105,9 +88,13 @@ void Arm64RegCache::MapRegTo(ARM64Reg reg, MIPSGPReg mipsReg, int mapFlags) {
} else {
switch (mr[mipsReg].loc) {
case ML_MEM:
emit_->LDR(INDEX_UNSIGNED, reg, CTXREG, GetMipsRegOffset(mipsReg));
{
int offset = GetMipsRegOffset(mipsReg);
INFO_LOG(JIT, "MapRegTo %d mips: %d offset %d", (int)reg, mipsReg, offset);
emit_->LDR(INDEX_UNSIGNED, reg, CTXREG, offset);
mr[mipsReg].loc = ML_ARMREG;
break;
}
case ML_IMM:
SetRegImm(reg, mr[mipsReg].imm);
ar[reg].isDirty = true; // IMM is always dirty.
@@ -190,20 +177,6 @@ ARM64Reg Arm64RegCache::MapReg(MIPSGPReg mipsReg, int mapFlags) {
int allocCount;
const ARM64Reg *allocOrder = GetMIPSAllocationOrder(allocCount);

ARM64Reg desiredReg = INVALID_REG;
// Try to "statically" allocate the first 6 regs after v0.
int desiredOrder = allocCount - (6 - (mipsReg - (int)MIPS_REG_V0));
if (desiredOrder >= 0 && desiredOrder < allocCount)
desiredReg = allocOrder[desiredOrder];

if (desiredReg != INVALID_REG) {
if (ar[desiredReg].mipsReg == MIPS_REG_INVALID) {
// With this placement, we may be able to optimize flush.
MapRegTo(desiredReg, mipsReg, mapFlags);
return desiredReg;
}
}

allocate:
for (int i = 0; i < allocCount; i++) {
ARM64Reg reg = allocOrder[i];
@@ -240,6 +213,10 @@ allocate:
return INVALID_REG;
}

void Arm64RegCache::MapIn(MIPSGPReg rs) {
MapReg(rs);
}

void Arm64RegCache::MapInIn(MIPSGPReg rd, MIPSGPReg rs) {
SpillLock(rd, rs);
MapReg(rd);
@@ -340,7 +317,7 @@ void Arm64RegCache::FlushR(MIPSGPReg r) {
}
if (ar[mr[r].reg].isDirty) {
if (r != MIPS_REG_ZERO) {
emit_->STR(INDEX_UNSIGNED, (ARM64Reg)mr[r].reg, CTXREG, GetMipsRegOffset(r));
emit_->STR(INDEX_UNSIGNED, mr[r].reg, CTXREG, GetMipsRegOffset(r));
}
ar[mr[r].reg].isDirty = false;
}
@@ -360,58 +337,6 @@ void Arm64RegCache::FlushR(MIPSGPReg r) {
mr[r].imm = 0;
}

// Note: if allowFlushImm is set, this also flushes imms while checking the sequence.
int Arm64RegCache::FlushGetSequential(MIPSGPReg startMipsReg, bool allowFlushImm) {
// Only start a sequence on a dirty armreg.
// TODO: Could also start with an imm?
const auto &startMipsInfo = mr[startMipsReg];
if ((startMipsInfo.loc != ML_ARMREG && startMipsInfo.loc != ML_ARMREG_IMM) || startMipsInfo.reg == INVALID_REG || !ar[startMipsInfo.reg].isDirty) {
return 0;
}

int allocCount;
const ARM64Reg *allocOrder = GetMIPSAllocationOrder(allocCount);

int c = 1;
// The sequence needs to have ascending arm regs for STMIA.
int lastArmReg = startMipsInfo.reg;
// Can't use HI/LO, only regs in the main r[] array.
for (int r = (int)startMipsReg + 1; r < 32; ++r) {
if ((mr[r].loc == ML_ARMREG || mr[r].loc == ML_ARMREG_IMM) && mr[r].reg != INVALID_REG) {
if ((int)mr[r].reg > lastArmReg && ar[mr[r].reg].isDirty) {
++c;
lastArmReg = mr[r].reg;
continue;
}
// If we're not allowed to flush imms, don't even consider them.
} else if (allowFlushImm && mr[r].loc == ML_IMM && MIPSGPReg(r) != MIPS_REG_ZERO) {
// Okay, let's search for a free (and later) reg to put this imm into.
bool found = false;
for (int j = 0; j < allocCount; ++j) {
ARM64Reg immReg = allocOrder[j];
if ((int)immReg > lastArmReg && ar[immReg].mipsReg == MIPS_REG_INVALID) {
++c;
lastArmReg = immReg;

// Even if the sequence fails, we'll need it in a reg anyway, might as well be this one.
MapRegTo(immReg, MIPSGPReg(r), 0);
found = true;
break;
}
}
if (found) {
continue;
}
}

// If it didn't hit a continue above, the chain is over.
// There's no way to skip a slot with STMIA.
break;
}

return c;
}
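FlushGetSequential above scans for MIPS registers that map to strictly ascending, dirty host registers, since only such a run can be written back with one multi-register store (the STMIA comment is ARM32 heritage; on ARM64 the equivalent would be STP pairs). A standalone sketch of the scan with a simplified, hypothetical data layout:

#include <cstdio>

struct MipsRegModel { int hostReg; bool inHostReg; };   // simplified stand-in for mr[]
struct HostRegModel { bool dirty; };                    // simplified stand-in for ar[]

static int CountSequential(const MipsRegModel *mr, const HostRegModel *ar, int start, int total) {
    if (!mr[start].inHostReg || !ar[mr[start].hostReg].dirty)
        return 0;
    int count = 1;
    int lastHost = mr[start].hostReg;
    for (int r = start + 1; r < total; ++r) {
        // Host registers must keep ascending; a gap cannot be skipped in one store sequence.
        if (mr[r].inHostReg && mr[r].hostReg > lastHost && ar[mr[r].hostReg].dirty) {
            ++count;
            lastHost = mr[r].hostReg;
        } else {
            break;
        }
    }
    return count;
}

int main() {
    MipsRegModel mr[5] = { {3, true}, {4, true}, {2, true}, {6, true}, {0, false} };
    HostRegModel ar[8] = {};
    ar[2].dirty = ar[3].dirty = ar[4].dirty = ar[6].dirty = true;
    printf("sequential run starting at reg 0: %d\n", CountSequential(mr, ar, 0, 5));  // expect 2
    return 0;
}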
void Arm64RegCache::FlushAll() {
// TODO: Flush in pairs
for (int i = 0; i < NUM_MIPSREG; i++) {
@@ -24,6 +24,7 @@
namespace Arm64JitConstants {

// Bogus mappings, TODO ARM64
const Arm64Gen::ARM64Reg FLAGTEMPREG = Arm64Gen::X25;
const Arm64Gen::ARM64Reg JITBASEREG = Arm64Gen::X26;
const Arm64Gen::ARM64Reg CTXREG = Arm64Gen::X27;
const Arm64Gen::ARM64Reg MEMBASEREG = Arm64Gen::X28;

@@ -113,6 +114,7 @@ public:
bool IsMapped(MIPSGPReg reg);
bool IsMappedAsPointer(MIPSGPReg reg);

void MapIn(MIPSGPReg rs);
void MapInIn(MIPSGPReg rd, MIPSGPReg rs);
void MapDirtyIn(MIPSGPReg rd, MIPSGPReg rs, bool avoidLoad = true);
void MapDirtyInIn(MIPSGPReg rd, MIPSGPReg rs, MIPSGPReg rt, bool avoidLoad = true);

@@ -136,7 +138,6 @@ public:
private:
const Arm64Gen::ARM64Reg *GetMIPSAllocationOrder(int &count);
void MapRegTo(Arm64Gen::ARM64Reg reg, MIPSGPReg mipsReg, int mapFlags);
int FlushGetSequential(MIPSGPReg startMipsReg, bool allowFlushImm);
Arm64Gen::ARM64Reg FindBestToSpill(bool unusedOnly, bool *clobbered);

MIPSState *mips_;

@@ -146,7 +147,7 @@ private:
u32 compilerPC_;

enum {
NUM_ARMREG = 32, // 31 actual registers, plus the zero register.
NUM_ARMREG = 32, // 31 actual registers, plus the zero/sp register which is not mappable.
NUM_MIPSREG = Arm64JitConstants::TOTAL_MAPPABLE_MIPSREGS,
};
@@ -99,6 +99,12 @@ std::vector<std::string> DisassembleArm64(const u8 *data, int size) {
lines.push_back(StringFromFormat("BKPT 1 (x%i)", bkpt_count));
bkpt_count = 0;
}
if (true) {
uint64_t addr = (intptr_t)(data + i);
char buf2[16];
snprintf(buf2, sizeof(buf2), "%04x%08x", addr >> 32, addr & 0xFFFFFFFF);
buf = std::string(buf2) + " " + buf;
}
lines.push_back(buf);
}
}
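Side note on the address prefix added above: addr is 64-bit while "%x" formally expects unsigned int, so the snprintf passes mismatched argument types (it typically still prints the intended halves on LP64 targets). A sketch of a width-matched formulation using <cinttypes>:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
    unsigned char code[16] = {0};
    uint64_t addr = (uintptr_t)(code + 4);
    char buf2[20];
    snprintf(buf2, sizeof(buf2), "%012" PRIx64, addr);   // one 64-bit conversion, zero-padded to 12 digits
    printf("%s  <disassembly text would follow here>\n", buf2);
    return 0;
}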
@@ -237,6 +237,7 @@ void MIPSState::UpdateCore(CPUCore desired) {
PSP_CoreParameter().cpuCore = desired;
switch (PSP_CoreParameter().cpuCore) {
case CPU_JIT:
INFO_LOG(CPU, "Switching to JIT");
if (!MIPSComp::jit) {
#ifdef ARM
MIPSComp::jit = new MIPSComp::ArmJit(this);

@@ -253,6 +254,7 @@ void MIPSState::UpdateCore(CPUCore desired) {
break;

case CPU_INTERPRETER:
INFO_LOG(CPU, "Switching to interpreter");
delete MIPSComp::jit;
MIPSComp::jit = 0;
break;