diff --git a/Core/HLE/HLE.cpp b/Core/HLE/HLE.cpp index 24c5acad3c..2916cf96f2 100644 --- a/Core/HLE/HLE.cpp +++ b/Core/HLE/HLE.cpp @@ -59,6 +59,11 @@ enum HLE_AFTER_SKIP_DEADBEEF = 0x40, // Execute pending mips calls. HLE_AFTER_QUEUED_CALLS = 0x80, + // Call CoreTiming::ForceCheck + HLE_AFTER_CORETIMING_FORCE_CHECK = 0x100, + // Split syscall over GE execution + HLE_SPLIT_SYSCALL_OVER_GE = 0x200, + HLE_SPLIT_SYSCALL_PART2 = 0x400, }; static std::vector moduleDB; @@ -69,6 +74,10 @@ static const HLEFunction *latestSyscall = nullptr; static uint32_t latestSyscallPC = 0; static int idleOp; +// Split syscall support. NOTE: This needs to be saved in DoState somehow! +static int splitSyscallEatCycles = 0; + + struct HLEMipsCallInfo { u32 func; PSPAction *action; @@ -354,6 +363,10 @@ void hleSkipDeadbeef() hleAfterSyscall |= HLE_AFTER_SKIP_DEADBEEF; } +void hleCoreTimingForceCheck() { + hleAfterSyscall |= HLE_AFTER_CORETIMING_FORCE_CHECK; +} + // Pauses execution after an HLE call. bool hleExecuteDebugBreak(const HLEFunction &func) { @@ -399,8 +412,16 @@ u64 hleDelayResult(u64 result, const char *reason, int usec) { } void hleEatCycles(int cycles) { - // Maybe this should Idle, at least for larger delays? Could that cause issues? - currentMIPS->downcount -= cycles; + if (hleAfterSyscall & HLE_SPLIT_SYSCALL_OVER_GE) { + splitSyscallEatCycles = cycles; + } else { + // Maybe this should Idle, at least for larger delays? Could that cause issues? + currentMIPS->downcount -= cycles; + } +} + +void hleSplitSyscallOverGe() { + hleAfterSyscall |= HLE_SPLIT_SYSCALL_OVER_GE; } void hleEatMicro(int usec) { @@ -562,8 +583,27 @@ inline static void SetDeadbeefRegs() currentMIPS->hi = 0xDEADBEEF; } -inline void hleFinishSyscall(const HLEFunction &info) -{ +static void hleFinishSyscall(const HLEFunction *info) { + if (hleAfterSyscall & HLE_SPLIT_SYSCALL_OVER_GE) { + hleAfterSyscall &= ~HLE_SPLIT_SYSCALL_OVER_GE; + hleAfterSyscall |= HLE_SPLIT_SYSCALL_PART2; + // Switch to GE execution immediately. + // coreState is checked after the syscall, always. + Core_SwitchToGe(); + return; + } + + if (hleAfterSyscall & HLE_SPLIT_SYSCALL_PART2) { + // Eat the extra cycle we added above. + hleEatCycles(splitSyscallEatCycles + 1); + // Make sure to zero it so it's not accidentally re-used. + splitSyscallEatCycles = 0; + } + + if (hleAfterSyscall & HLE_AFTER_CORETIMING_FORCE_CHECK) { + CoreTiming::ForceCheck(); + } + if ((hleAfterSyscall & HLE_AFTER_SKIP_DEADBEEF) == 0) SetDeadbeefRegs(); @@ -580,9 +620,9 @@ inline void hleFinishSyscall(const HLEFunction &info) else if ((hleAfterSyscall & HLE_AFTER_RESCHED) != 0) __KernelReSchedule(hleAfterSyscallReschedReason); - if ((hleAfterSyscall & HLE_AFTER_DEBUG_BREAK) != 0) - { - if (!hleExecuteDebugBreak(info)) + if ((hleAfterSyscall & HLE_AFTER_DEBUG_BREAK) != 0) { + _dbg_assert_(info); + if (!hleExecuteDebugBreak(*info)) { // We'll do it next syscall. hleAfterSyscall = HLE_AFTER_DEBUG_BREAK; @@ -595,6 +635,10 @@ inline void hleFinishSyscall(const HLEFunction &info) hleAfterSyscallReschedReason = 0; } +void hleFinishSyscallAfterGe() { + hleFinishSyscall(nullptr); +} + static void updateSyscallStats(int modulenum, int funcnum, double total) { const char *name = moduleDB[modulenum].funcTable[funcnum].name; @@ -653,7 +697,7 @@ inline void CallSyscallWithFlags(const HLEFunction *info) } if (hleAfterSyscall != HLE_AFTER_NOTHING) - hleFinishSyscall(*info); + hleFinishSyscall(info); else SetDeadbeefRegs(); } @@ -665,7 +709,7 @@ inline void CallSyscallWithoutFlags(const HLEFunction *info) info->func(); if (hleAfterSyscall != HLE_AFTER_NOTHING) - hleFinishSyscall(*info); + hleFinishSyscall(info); else SetDeadbeefRegs(); } diff --git a/Core/HLE/HLE.h b/Core/HLE/HLE.h index 12a0590351..eff5fba1c9 100644 --- a/Core/HLE/HLE.h +++ b/Core/HLE/HLE.h @@ -129,6 +129,15 @@ u64 hleDelayResult(u64 result, const char *reason, int usec); void hleEatCycles(int cycles); void hleEatMicro(int usec); +void hleCoreTimingForceCheck(); + +// Causes the syscall to not fully execute immediately, instead give the Ge a chance to +// execute display lists. +void hleSplitSyscallOverGe(); + +// Called after a split syscall from System.cpp +void hleFinishSyscallAfterGe(); + inline int hleDelayResult(int result, const char *reason, int usec) { return hleDelayResult((u32) result, reason, usec); } diff --git a/Core/HLE/sceGe.cpp b/Core/HLE/sceGe.cpp index 1269be02a3..fb2597ce8d 100644 --- a/Core/HLE/sceGe.cpp +++ b/Core/HLE/sceGe.cpp @@ -145,7 +145,7 @@ public: // Hm. This might be really tricky to get to behave the same in both modes. Here we are in __KernelReschedule, CoreTiming::Advance, ProcessEvents, GeExecuteInterrupt, ... .... __RunOnePendingInterrupt // But not sure how much it will matter. The test pause2 hits here. - gpu->RunGe(); + gpu->ProcessDLQueue(); return false; } @@ -181,8 +181,11 @@ public: } gpu->InterruptEnd(intrdata.listid); - // This is the last thing done here in the syscall (__KernelReturnFromInterrupt) so switching coreState should just work. - gpu->RunGe(); + + // TODO: This is called from __KernelReturnFromInterrupt which does a bunch of stuff afterwards. + // Using hleSplitSyscallOverGe here breaks the gpu/signals/suspend.prx test, for that reason. + // So we just process inline and sacrifice debuggability a little. + gpu->ProcessDLQueue(); } }; @@ -349,12 +352,15 @@ u32 sceGeListEnQueue(u32 listAddress, u32 stallAddress, int callbackId, u32 optP if ((int)listID >= 0) listID = LIST_ID_MAGIC ^ listID; if (runList) { - gpu->RunGe(); + if (gpu->ShouldSplitOverGe()) { + hleSplitSyscallOverGe(); + } else { + gpu->ProcessDLQueue(); + } } - // The stuff here below must be deferred... hleEatCycles(490); - CoreTiming::ForceCheck(); - return hleLogSuccessX(Log::sceGe, listID); + hleCoreTimingForceCheck(); + return listID; // We already logged above, logs get confusing if we use hleLogSuccess. } u32 sceGeListEnQueueHead(u32 listAddress, u32 stallAddress, int callbackId, u32 optParamAddr) { @@ -368,11 +374,15 @@ u32 sceGeListEnQueueHead(u32 listAddress, u32 stallAddress, int callbackId, u32 if ((int)listID >= 0) listID = LIST_ID_MAGIC ^ listID; if (runList) { - gpu->RunGe(); + if (gpu->ShouldSplitOverGe()) { + hleSplitSyscallOverGe(); + } else { + gpu->ProcessDLQueue(); + } } hleEatCycles(480); - CoreTiming::ForceCheck(); - return hleLogSuccessX(Log::sceGe, listID); + hleCoreTimingForceCheck(); + return listID; // We already logged above, logs get confusing if we use hleLogSuccess. } static int sceGeListDeQueue(u32 listID) { @@ -386,13 +396,17 @@ static int sceGeListUpdateStallAddr(u32 displayListID, u32 stallAddress) { // Advance() might cause an interrupt, so defer the Advance but do it ASAP. // Final Fantasy Type-0 has a graphical artifact without this (timing issue.) hleEatCycles(190); - CoreTiming::ForceCheck(); + hleCoreTimingForceCheck(); DEBUG_LOG(Log::sceGe, "sceGeListUpdateStallAddr(dlid=%i, stalladdr=%08x)", displayListID, stallAddress); bool runList; int retval = gpu->UpdateStall(LIST_ID_MAGIC ^ displayListID, stallAddress, &runList); if (runList) { - gpu->RunGe(); + if (gpu->ShouldSplitOverGe()) { + hleSplitSyscallOverGe(); + } else { + gpu->ProcessDLQueue(); + } } return retval; } @@ -419,7 +433,11 @@ static int sceGeContinue() { bool runList; int ret = gpu->Continue(&runList); if (runList) { - gpu->RunGe(); + if (gpu->ShouldSplitOverGe()) { + hleSplitSyscallOverGe(); + } else { + gpu->ProcessDLQueue(); + } } hleEatCycles(220); hleReSchedule("ge continue"); diff --git a/Core/MIPS/MIPS.cpp b/Core/MIPS/MIPS.cpp index 03af89c732..e0719971bd 100644 --- a/Core/MIPS/MIPS.cpp +++ b/Core/MIPS/MIPS.cpp @@ -336,6 +336,7 @@ int MIPSState::RunLoopUntil(u64 globalTicks) { case CPUCore::IR_INTERPRETER: while (inDelaySlot) { // We must get out of the delay slot before going into jit. + // This normally should never take more than one step... SingleStep(); } insideJit = true; diff --git a/Core/System.cpp b/Core/System.cpp index fceecc9206..eb9039df9f 100644 --- a/Core/System.cpp +++ b/Core/System.cpp @@ -642,8 +642,14 @@ void PSP_RunLoopUntil(u64 globalticks) { _dbg_assert_(false); break; case CORE_RUNNING_GE: - gpu->ProcessDLQueue(true); - coreState = CORE_RUNNING_CPU; + switch (gpu->ProcessDLQueue()) { + case DLResult::Error: // TODO: shouldn't return this normally + case DLResult::Pause: // like updatestall. + case DLResult::Done: + hleFinishSyscallAfterGe(); + coreState = CORE_RUNNING_CPU; + break; + } break; } } diff --git a/GPU/Debugger/Playback.cpp b/GPU/Debugger/Playback.cpp index 93732ff603..42f062d1a5 100644 --- a/GPU/Debugger/Playback.cpp +++ b/GPU/Debugger/Playback.cpp @@ -337,7 +337,8 @@ void DumpExecute::SyncStall() { bool runList; gpu->UpdateStall(execListID, execListPos, &runList); if (runList) { - gpu->RunGe(); + DLResult result = gpu->ProcessDLQueue(); + _dbg_assert_(result == DLResult::Done || result == DLResult::Pause); } s64 listTicks = gpu->GetListTicks(execListID); if (listTicks != -1) { @@ -372,7 +373,7 @@ bool DumpExecute::SubmitCmds(const void *p, u32 sz) { bool runList; execListID = gpu->EnqueueList(execListBuf, execListPos, -1, optParam, false, &runList); if (runList) { - gpu->RunGe(); + gpu->ProcessDLQueue(); } gpu->EnableInterrupts(true); } diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index 7ac9dbccd7..d264c524e2 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -552,19 +552,6 @@ u32 GPUCommon::Continue(bool *runList) { return 0; } -void GPUCommon::RunGe() { - // Old method, although may make sense for performance if the ImDebugger isn't active. -#if 1 - // Call ProcessDLQueue directly. - ProcessDLQueue(false); -#else - // New method, will allow ImDebugger to step the GPU. - // ARGH, what makes this different appears to be what happens AFTER the call to - // EnqueueList inside sceGeListEnqueue. Like the cycle eating and CoreTiming forcecheck. - Core_SwitchToGe(); -#endif -} - u32 GPUCommon::Break(int mode) { if (mode < 0 || mode > 1) return SCE_KERNEL_ERROR_INVALID_MODE; @@ -848,7 +835,7 @@ int GPUCommon::GetNextListIndex() { // This is now called when coreState == CORE_RUNNING_GE. // TODO: It should return the next action.. (break into debugger or continue running) -DLResult GPUCommon::ProcessDLQueue(bool fromCore) { +DLResult GPUCommon::ProcessDLQueue() { startingTicks = CoreTiming::GetTicks(); cyclesExecuted = 0; @@ -884,15 +871,15 @@ DLResult GPUCommon::ProcessDLQueue(bool fromCore) { __GeTriggerSync(GPU_SYNC_DRAW, 1, drawCompleteTicks); // Since the event is in CoreTiming, we're in sync. Just set 0 now. - - if (fromCore) { - // Now update the core timing like we would have previously... - // TODO - } - return DLResult::Done; } +bool GPUCommon::ShouldSplitOverGe() const { + // TODO: Should check for debugger active, etc. + // We only need to do this if we want to step through Ge display lists using the Ge debuggers. + return false; +} + void GPUCommon::Execute_OffsetAddr(u32 op, u32 diff) { gstate_c.offsetAddr = op << 8; } diff --git a/GPU/GPUCommon.h b/GPU/GPUCommon.h index 8601e7eeae..3ea07bf0b4 100644 --- a/GPU/GPUCommon.h +++ b/GPU/GPUCommon.h @@ -243,7 +243,7 @@ public: bool InterpretList(DisplayList &list); - DLResult ProcessDLQueue(bool fromCore); + DLResult ProcessDLQueue(); u32 UpdateStall(int listid, u32 newstall, bool *runList); u32 EnqueueList(u32 listpc, u32 stall, int subIntrBase, PSPPointer args, bool head, bool *runList); @@ -266,7 +266,9 @@ public: virtual void DeviceLost() = 0; virtual void DeviceRestore(Draw::DrawContext *draw) = 0; - void RunGe(); + // Returns true if we should split the call across GE execution. + // For example, a debugger is active. + bool ShouldSplitOverGe() const; void DrawImGuiDebugger(); diff --git a/headless/Headless.cpp b/headless/Headless.cpp index 9e6de56f1a..9154c21e96 100644 --- a/headless/Headless.cpp +++ b/headless/Headless.cpp @@ -255,7 +255,12 @@ bool RunAutoTest(HeadlessHost *headlessHost, CoreParameter &coreParameter, const if (coreState == CORE_STEPPING_CPU && !coreParameter.startBreak) { break; } - if (time_now_d() > deadline) { + bool debugger = false; +#ifdef _WIN32 + if (IsDebuggerPresent()) + debugger = true; +#endif + if (time_now_d() > deadline && !debugger) { // Don't compare, print the output at least up to this point, and bail. if (!opt.bench) { printf("%s", output.c_str());