diff --git a/Core/HLE/HLE.cpp b/Core/HLE/HLE.cpp
index 24c5acad3c..2916cf96f2 100644
--- a/Core/HLE/HLE.cpp
+++ b/Core/HLE/HLE.cpp
@@ -59,6 +59,11 @@ enum
 	HLE_AFTER_SKIP_DEADBEEF     = 0x40,
 	// Execute pending mips calls.
 	HLE_AFTER_QUEUED_CALLS      = 0x80,
+	// Call CoreTiming::ForceCheck
+	HLE_AFTER_CORETIMING_FORCE_CHECK = 0x100,
+	// Split syscall over GE execution
+	HLE_SPLIT_SYSCALL_OVER_GE = 0x200,
+	HLE_SPLIT_SYSCALL_PART2 = 0x400,
 };
 
 static std::vector<HLEModule> moduleDB;
@@ -69,6 +74,10 @@ static const HLEFunction *latestSyscall = nullptr;
 static uint32_t latestSyscallPC = 0;
 static int idleOp;
 
+// Split syscall support. NOTE: This needs to be saved in DoState somehow!
+static int splitSyscallEatCycles = 0;
+
+
 struct HLEMipsCallInfo {
 	u32 func;
 	PSPAction *action;
@@ -354,6 +363,10 @@ void hleSkipDeadbeef()
 	hleAfterSyscall |= HLE_AFTER_SKIP_DEADBEEF;
 }
 
+void hleCoreTimingForceCheck() {
+	hleAfterSyscall |= HLE_AFTER_CORETIMING_FORCE_CHECK;
+}
+
 // Pauses execution after an HLE call.
 bool hleExecuteDebugBreak(const HLEFunction &func)
 {
@@ -399,8 +412,16 @@ u64 hleDelayResult(u64 result, const char *reason, int usec) {
 }
 
 void hleEatCycles(int cycles) {
-	// Maybe this should Idle, at least for larger delays?  Could that cause issues?
-	currentMIPS->downcount -= cycles;
+	if (hleAfterSyscall & HLE_SPLIT_SYSCALL_OVER_GE) {
+		splitSyscallEatCycles = cycles;
+	} else {
+		// Maybe this should Idle, at least for larger delays?  Could that cause issues?
+		currentMIPS->downcount -= cycles;
+	}
+}
+
+void hleSplitSyscallOverGe() {
+	hleAfterSyscall |= HLE_SPLIT_SYSCALL_OVER_GE;
 }
 
 void hleEatMicro(int usec) {
@@ -562,8 +583,27 @@ inline static void SetDeadbeefRegs()
 	currentMIPS->hi = 0xDEADBEEF;
 }
 
-inline void hleFinishSyscall(const HLEFunction &info)
-{
+static void hleFinishSyscall(const HLEFunction *info) {
+	if (hleAfterSyscall & HLE_SPLIT_SYSCALL_OVER_GE) {
+		hleAfterSyscall &= ~HLE_SPLIT_SYSCALL_OVER_GE;
+		hleAfterSyscall |= HLE_SPLIT_SYSCALL_PART2;
+		// Switch to GE execution immediately.
+		// coreState is checked after the syscall, always.
+		Core_SwitchToGe();
+		return;
+	}
+
+	if (hleAfterSyscall & HLE_SPLIT_SYSCALL_PART2) {
+		// Eat the extra cycle we added above.
+		hleEatCycles(splitSyscallEatCycles + 1);
+		// Make sure to zero it so it's not accidentally re-used.
+		splitSyscallEatCycles = 0;
+	}
+
+	if (hleAfterSyscall & HLE_AFTER_CORETIMING_FORCE_CHECK) {
+		CoreTiming::ForceCheck();
+	}
+
 	if ((hleAfterSyscall & HLE_AFTER_SKIP_DEADBEEF) == 0)
 		SetDeadbeefRegs();
 
@@ -580,9 +620,9 @@ inline void hleFinishSyscall(const HLEFunction &info)
 	else if ((hleAfterSyscall & HLE_AFTER_RESCHED) != 0)
 		__KernelReSchedule(hleAfterSyscallReschedReason);
 
-	if ((hleAfterSyscall & HLE_AFTER_DEBUG_BREAK) != 0)
-	{
-		if (!hleExecuteDebugBreak(info))
+	if ((hleAfterSyscall & HLE_AFTER_DEBUG_BREAK) != 0) {
+		_dbg_assert_(info);
+		if (!hleExecuteDebugBreak(*info))
 		{
 			// We'll do it next syscall.
 			hleAfterSyscall = HLE_AFTER_DEBUG_BREAK;
@@ -595,6 +635,10 @@ inline void hleFinishSyscall(const HLEFunction &info)
 	hleAfterSyscallReschedReason = 0;
 }
 
+void hleFinishSyscallAfterGe() {
+	hleFinishSyscall(nullptr);
+}
+
 static void updateSyscallStats(int modulenum, int funcnum, double total)
 {
 	const char *name = moduleDB[modulenum].funcTable[funcnum].name;
@@ -653,7 +697,7 @@ inline void CallSyscallWithFlags(const HLEFunction *info)
 	}
 
 	if (hleAfterSyscall != HLE_AFTER_NOTHING)
-		hleFinishSyscall(*info);
+		hleFinishSyscall(info);
 	else
 		SetDeadbeefRegs();
 }
@@ -665,7 +709,7 @@ inline void CallSyscallWithoutFlags(const HLEFunction *info)
 	info->func();
 
 	if (hleAfterSyscall != HLE_AFTER_NOTHING)
-		hleFinishSyscall(*info);
+		hleFinishSyscall(info);
 	else
 		SetDeadbeefRegs();
 }
diff --git a/Core/HLE/HLE.h b/Core/HLE/HLE.h
index 12a0590351..eff5fba1c9 100644
--- a/Core/HLE/HLE.h
+++ b/Core/HLE/HLE.h
@@ -129,6 +129,15 @@ u64 hleDelayResult(u64 result, const char *reason, int usec);
 void hleEatCycles(int cycles);
 void hleEatMicro(int usec);
 
+void hleCoreTimingForceCheck();
+
+// Causes the syscall to not fully execute immediately, instead give the Ge a chance to
+// execute display lists.
+void hleSplitSyscallOverGe();
+
+// Called after a split syscall from System.cpp
+void hleFinishSyscallAfterGe();
+
 inline int hleDelayResult(int result, const char *reason, int usec) {
 	return hleDelayResult((u32) result, reason, usec);
 }
diff --git a/Core/HLE/sceGe.cpp b/Core/HLE/sceGe.cpp
index 1269be02a3..fb2597ce8d 100644
--- a/Core/HLE/sceGe.cpp
+++ b/Core/HLE/sceGe.cpp
@@ -145,7 +145,7 @@ public:
 
 		// Hm. This might be really tricky to get to behave the same in both modes. Here we are in __KernelReschedule, CoreTiming::Advance, ProcessEvents, GeExecuteInterrupt, ... .... __RunOnePendingInterrupt
 		// But not sure how much it will matter. The test pause2 hits here.
-		gpu->RunGe();
+		gpu->ProcessDLQueue();
 		return false;
 	}
 
@@ -181,8 +181,11 @@ public:
 		}
 
 		gpu->InterruptEnd(intrdata.listid);
-		// This is the last thing done here in the syscall (__KernelReturnFromInterrupt) so switching coreState should just work.
-		gpu->RunGe();
+
+		// TODO: This is called from __KernelReturnFromInterrupt which does a bunch of stuff afterwards.
+		// Using hleSplitSyscallOverGe here breaks the gpu/signals/suspend.prx test, for that reason.
+		// So we just process inline and sacrifice debuggability a little.
+		gpu->ProcessDLQueue();
 	}
 };
 
@@ -349,12 +352,15 @@ u32 sceGeListEnQueue(u32 listAddress, u32 stallAddress, int callbackId, u32 optP
 	if ((int)listID >= 0)
 		listID = LIST_ID_MAGIC ^ listID;
 	if (runList) {
-		gpu->RunGe();
+		if (gpu->ShouldSplitOverGe()) {
+			hleSplitSyscallOverGe();
+		} else {
+			gpu->ProcessDLQueue();
+		}
 	}
-	// The stuff here below must be deferred...
 	hleEatCycles(490);
-	CoreTiming::ForceCheck();
-	return hleLogSuccessX(Log::sceGe, listID);
+	hleCoreTimingForceCheck();
+	return listID; // We already logged above, logs get confusing if we use hleLogSuccess.
 }
 
 u32 sceGeListEnQueueHead(u32 listAddress, u32 stallAddress, int callbackId, u32 optParamAddr) {
@@ -368,11 +374,15 @@ u32 sceGeListEnQueueHead(u32 listAddress, u32 stallAddress, int callbackId, u32
 	if ((int)listID >= 0)
 		listID = LIST_ID_MAGIC ^ listID;
 	if (runList) {
-		gpu->RunGe();
+		if (gpu->ShouldSplitOverGe()) {
+			hleSplitSyscallOverGe();
+		} else {
+			gpu->ProcessDLQueue();
+		}
 	}
 	hleEatCycles(480);
-	CoreTiming::ForceCheck();
-	return hleLogSuccessX(Log::sceGe, listID);
+	hleCoreTimingForceCheck();
+	return listID; // We already logged above, logs get confusing if we use hleLogSuccess.
 }
 
 static int sceGeListDeQueue(u32 listID) {
@@ -386,13 +396,17 @@ static int sceGeListUpdateStallAddr(u32 displayListID, u32 stallAddress) {
 	// Advance() might cause an interrupt, so defer the Advance but do it ASAP.
 	// Final Fantasy Type-0 has a graphical artifact without this (timing issue.)
 	hleEatCycles(190);
-	CoreTiming::ForceCheck();
+	hleCoreTimingForceCheck();
 
 	DEBUG_LOG(Log::sceGe, "sceGeListUpdateStallAddr(dlid=%i, stalladdr=%08x)", displayListID, stallAddress);
 	bool runList;
 	int retval = gpu->UpdateStall(LIST_ID_MAGIC ^ displayListID, stallAddress, &runList);
 	if (runList) {
-		gpu->RunGe();
+		if (gpu->ShouldSplitOverGe()) {
+			hleSplitSyscallOverGe();
+		} else {
+			gpu->ProcessDLQueue();
+		}
 	}
 	return retval;
 }
@@ -419,7 +433,11 @@ static int sceGeContinue() {
 	bool runList;
 	int ret = gpu->Continue(&runList);
 	if (runList) {
-		gpu->RunGe();
+		if (gpu->ShouldSplitOverGe()) {
+			hleSplitSyscallOverGe();
+		} else {
+			gpu->ProcessDLQueue();
+		}
 	}
 	hleEatCycles(220);
 	hleReSchedule("ge continue");
diff --git a/Core/MIPS/MIPS.cpp b/Core/MIPS/MIPS.cpp
index 03af89c732..e0719971bd 100644
--- a/Core/MIPS/MIPS.cpp
+++ b/Core/MIPS/MIPS.cpp
@@ -336,6 +336,7 @@ int MIPSState::RunLoopUntil(u64 globalTicks) {
 	case CPUCore::IR_INTERPRETER:
 		while (inDelaySlot) {
 			// We must get out of the delay slot before going into jit.
+			// This normally should never take more than one step...
 			SingleStep();
 		}
 		insideJit = true;
diff --git a/Core/System.cpp b/Core/System.cpp
index fceecc9206..eb9039df9f 100644
--- a/Core/System.cpp
+++ b/Core/System.cpp
@@ -642,8 +642,14 @@ void PSP_RunLoopUntil(u64 globalticks) {
 			_dbg_assert_(false);
 			break;
 		case CORE_RUNNING_GE:
-			gpu->ProcessDLQueue(true);
-			coreState = CORE_RUNNING_CPU;
+			switch (gpu->ProcessDLQueue()) {
+			case DLResult::Error:  // TODO: shouldn't return this normally
+			case DLResult::Pause:  // like updatestall.
+			case DLResult::Done:
+				hleFinishSyscallAfterGe();
+				coreState = CORE_RUNNING_CPU;
+				break;
+			}
 			break;
 		}
 	}
diff --git a/GPU/Debugger/Playback.cpp b/GPU/Debugger/Playback.cpp
index 93732ff603..42f062d1a5 100644
--- a/GPU/Debugger/Playback.cpp
+++ b/GPU/Debugger/Playback.cpp
@@ -337,7 +337,8 @@ void DumpExecute::SyncStall() {
 	bool runList;
 	gpu->UpdateStall(execListID, execListPos, &runList);
 	if (runList) {
-		gpu->RunGe();
+		DLResult result = gpu->ProcessDLQueue();
+		_dbg_assert_(result == DLResult::Done || result == DLResult::Pause);
 	}
 	s64 listTicks = gpu->GetListTicks(execListID);
 	if (listTicks != -1) {
@@ -372,7 +373,7 @@ bool DumpExecute::SubmitCmds(const void *p, u32 sz) {
 		bool runList;
 		execListID = gpu->EnqueueList(execListBuf, execListPos, -1, optParam, false, &runList);
 		if (runList) {
-			gpu->RunGe();
+			gpu->ProcessDLQueue();
 		}
 		gpu->EnableInterrupts(true);
 	}
diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
index 7ac9dbccd7..d264c524e2 100644
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@@ -552,19 +552,6 @@ u32 GPUCommon::Continue(bool *runList) {
 	return 0;
 }
 
-void GPUCommon::RunGe() {
-	// Old method, although may make sense for performance if the ImDebugger isn't active.
-#if 1
-	// Call ProcessDLQueue directly.
-	ProcessDLQueue(false);
-#else
-	// New method, will allow ImDebugger to step the GPU.
-	// ARGH, what makes this different appears to be what happens AFTER the call to
-	// EnqueueList inside sceGeListEnqueue. Like the cycle eating and CoreTiming forcecheck.
-	Core_SwitchToGe();
-#endif
-}
-
 u32 GPUCommon::Break(int mode) {
 	if (mode < 0 || mode > 1)
 		return SCE_KERNEL_ERROR_INVALID_MODE;
@@ -848,7 +835,7 @@ int GPUCommon::GetNextListIndex() {
 
 // This is now called when coreState == CORE_RUNNING_GE.
 // TODO: It should return the next action.. (break into debugger or continue running)
-DLResult GPUCommon::ProcessDLQueue(bool fromCore) {
+DLResult GPUCommon::ProcessDLQueue() {
 	startingTicks = CoreTiming::GetTicks();
 	cyclesExecuted = 0;
 
@@ -884,15 +871,15 @@ DLResult GPUCommon::ProcessDLQueue(bool fromCore) {
 
 	__GeTriggerSync(GPU_SYNC_DRAW, 1, drawCompleteTicks);
 	// Since the event is in CoreTiming, we're in sync.  Just set 0 now.
-
-	if (fromCore) {
-		// Now update the core timing like we would have previously...
-		// TODO
-	}
-
 	return DLResult::Done;
 }
 
+bool GPUCommon::ShouldSplitOverGe() const {
+	// TODO: Should check for debugger active, etc.
+	// We only need to do this if we want to step through Ge display lists using the Ge debuggers.
+	return false;
+}
+
 void GPUCommon::Execute_OffsetAddr(u32 op, u32 diff) {
 	gstate_c.offsetAddr = op << 8;
 }
diff --git a/GPU/GPUCommon.h b/GPU/GPUCommon.h
index 8601e7eeae..3ea07bf0b4 100644
--- a/GPU/GPUCommon.h
+++ b/GPU/GPUCommon.h
@@ -243,7 +243,7 @@ public:
 
 	bool InterpretList(DisplayList &list);
 
-	DLResult ProcessDLQueue(bool fromCore);
+	DLResult ProcessDLQueue();
 
 	u32 UpdateStall(int listid, u32 newstall, bool *runList);
 	u32 EnqueueList(u32 listpc, u32 stall, int subIntrBase, PSPPointer<PspGeListArgs> args, bool head, bool *runList);
@@ -266,7 +266,9 @@ public:
 	virtual void DeviceLost() = 0;
 	virtual void DeviceRestore(Draw::DrawContext *draw) = 0;
 
-	void RunGe();
+	// Returns true if we should split the call across GE execution.
+	// For example, a debugger is active.
+	bool ShouldSplitOverGe() const;
 
 	void DrawImGuiDebugger();
 
diff --git a/headless/Headless.cpp b/headless/Headless.cpp
index 9e6de56f1a..9154c21e96 100644
--- a/headless/Headless.cpp
+++ b/headless/Headless.cpp
@@ -255,7 +255,12 @@ bool RunAutoTest(HeadlessHost *headlessHost, CoreParameter &coreParameter, const
 		if (coreState == CORE_STEPPING_CPU && !coreParameter.startBreak) {
 			break;
 		}
-		if (time_now_d() > deadline) {
+		bool debugger = false;
+#ifdef _WIN32
+		if (IsDebuggerPresent())
+			debugger = true;
+#endif
+		if (time_now_d() > deadline && !debugger) {
 			// Don't compare, print the output at least up to this point, and bail.
 			if (!opt.bench) {
 				printf("%s", output.c_str());