diff --git a/Common/Thread/ThreadPool.cpp b/Common/Thread/ThreadPool.cpp index e64d1c8786..2624252a25 100644 --- a/Common/Thread/ThreadPool.cpp +++ b/Common/Thread/ThreadPool.cpp @@ -1,3 +1,5 @@ +#include +#include #include "Common/Thread/ThreadPool.h" #include "Common/Thread/ThreadUtil.h" @@ -54,43 +56,27 @@ void WorkerThread::WorkFunc() { } } -void LoopWorkerThread::Process(std::function work, int start, int end) { +void LoopWorkerThread::ProcessLoop(std::function work, int start, int end) { std::lock_guard guard(mutex); - work_ = std::move(work); + loopWork_ = std::move(work); + work_ = [this]() { + loopWork_(start_, end_); + }; start_ = start; end_ = end; jobsTarget = jobsDone + 1; signal.notify_one(); } -void LoopWorkerThread::WorkFunc() { - setCurrentThreadName("LoopWorker"); - std::unique_lock guard(mutex); - while (active) { - // 'active == false' is one of the conditions for signaling, - // do not "optimize" it - while (active && jobsTarget <= jobsDone) { - signal.wait(guard); - } - if (active) { - work_(start_, end_); - - std::lock_guard doneGuard(doneMutex); - jobsDone++; - done.notify_one(); - } - } -} - ///////////////////////////// ThreadPool ThreadPool::ThreadPool(int numThreads) { if (numThreads <= 0) { numThreads_ = 1; INFO_LOG(JIT, "ThreadPool: Bad number of threads %d", numThreads); - } else if (numThreads > 8) { - INFO_LOG(JIT, "ThreadPool: Capping number of threads to 8 (was %d)", numThreads); - numThreads_ = 8; + } else if (numThreads > 16) { + INFO_LOG(JIT, "ThreadPool: Capping number of threads to 16 (was %d)", numThreads); + numThreads_ = 16; } else { numThreads_ = numThreads; } @@ -108,23 +94,32 @@ void ThreadPool::StartWorkers() { } } -void ThreadPool::ParallelLoop(const std::function &loop, int lower, int upper) { +void ThreadPool::ParallelLoop(const std::function &loop, int lower, int upper, int minSize) { + // Don't parallelize tiny loops. + if (minSize == -1) + minSize = 4; + int range = upper - lower; - if (range >= numThreads_ * 2) { // don't parallelize tiny loops (this could be better, maybe add optional parameter that estimates work per iteration) + if (range >= minSize) { std::lock_guard guard(mutex); StartWorkers(); // could do slightly better load balancing for the generic case, // but doesn't matter since all our loops are power of 2 - int chunk = range / numThreads_; + int chunk = std::max(minSize, range / numThreads_); int s = lower; - for (auto& worker : workers) { - worker->Process(loop, s, s+chunk); - s+=chunk; + for (auto &worker : workers) { + // We'll do the last chunk on the current thread. + if (s + chunk >= upper) { + break; + } + worker->ProcessLoop(loop, s, s + chunk); + s += chunk; } // This is the final chunk. - loop(s, upper); - for (auto& worker : workers) { + if (s < upper) + loop(s, upper); + for (auto &worker : workers) { worker->WaitForCompletion(); } } else { @@ -132,3 +127,16 @@ void ThreadPool::ParallelLoop(const std::function &loop, int lowe } } +void ThreadPool::ParallelMemcpy(void *dest, const void *src, int size) { + static const int MIN_SIZE = 128 * 1024; + ParallelLoop([&](int l, int h) { + memmove((uint8_t *)dest + l, (const uint8_t *)src + l, h - l); + }, 0, size, MIN_SIZE); +} + +void ThreadPool::ParallelMemset(void *dest, uint8_t val, int size) { + static const int MIN_SIZE = 128 * 1024; + ParallelLoop([&](int l, int h) { + memset((uint8_t *)dest + l, val, h - l); + }, 0, size, MIN_SIZE); +} diff --git a/Common/Thread/ThreadPool.h b/Common/Thread/ThreadPool.h index 2064b31c28..ceac3c39c6 100644 --- a/Common/Thread/ThreadPool.h +++ b/Common/Thread/ThreadPool.h @@ -23,6 +23,8 @@ public: void WaitForCompletion(); protected: + virtual void WorkFunc(); + std::thread thread; // the worker thread std::condition_variable signal; // used to signal new work std::condition_variable done; // used to signal work completion @@ -30,11 +32,10 @@ protected: bool active = true; int jobsDone = 0; int jobsTarget = 0; -private: - virtual void WorkFunc(); std::function work_; // the work to be done by this thread +private: WorkerThread(const WorkerThread& other) = delete; // prevent copies void operator =(const WorkerThread &other) = delete; }; @@ -42,14 +43,12 @@ private: class LoopWorkerThread final : public WorkerThread { public: LoopWorkerThread() = default; - void Process(std::function work, int start, int end); + void ProcessLoop(std::function work, int start, int end); private: - virtual void WorkFunc() override; - int start_; int end_; - std::function work_; // the work to be done by this thread + std::function loopWork_; // the work to be done by this thread }; // A thread pool manages a set of worker threads, and allows the execution of parallel loops on them @@ -61,7 +60,9 @@ public: // don't need a destructor, "workers" is cleared on delete, // leading to the stopping and joining of all worker threads (RAII and all that) - void ParallelLoop(const std::function &loop, int lower, int upper); + void ParallelLoop(const std::function &loop, int lower, int upper, int minSize); + void ParallelMemcpy(void *dest, const void *src, int sz); + void ParallelMemset(void *dest, uint8_t val, int sz); private: int numThreads_; diff --git a/Core/ELF/ElfReader.cpp b/Core/ELF/ElfReader.cpp index 69328d49e2..410bbb8d6d 100644 --- a/Core/ELF/ElfReader.cpp +++ b/Core/ELF/ElfReader.cpp @@ -17,6 +17,7 @@ #include "Core/MemMap.h" #include "Core/Reporting.h" +#include "Core/ThreadPools.h" #include "Core/MIPS/MIPSTables.h" #include "Core/ELF/ElfReader.h" #include "Core/Debugger/MemBlockInfo.h" @@ -57,86 +58,83 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs) { int numErrors = 0; DEBUG_LOG(LOADER, "Loading %i relocations...", numRelocs); - for (int r = 0; r < numRelocs; r++) - { - // INFO_LOG(LOADER, "Loading reloc %i (%p)...", r, rels + r); - u32 info = rels[r].r_info; - u32 addr = rels[r].r_offset; + GlobalThreadPool::Loop([&](int l, int h) { + for (int r = l; r < h; r++) { + VERBOSE_LOG(LOADER, "Loading reloc %i (%p)...", r, rels + r); + u32 info = rels[r].r_info; + u32 addr = rels[r].r_offset; - int type = info & 0xf; + int type = info & 0xf; - int readwrite = (info>>8) & 0xff; - int relative = (info>>16) & 0xff; + int readwrite = (info >> 8) & 0xff; + int relative = (info >> 16) & 0xff; - //0 = code - //1 = data + //0 = code + //1 = data - if (readwrite >= (int)ARRAY_SIZE(segmentVAddr)) { - if (numErrors < 10) { - ERROR_LOG_REPORT(LOADER, "Bad segment number %i", readwrite); + if (readwrite >= (int)ARRAY_SIZE(segmentVAddr)) { + if (numErrors < 10) { + ERROR_LOG_REPORT(LOADER, "Bad segment number %i", readwrite); + } + numErrors++; + continue; } - numErrors++; - continue; - } - addr += segmentVAddr[readwrite]; + addr += segmentVAddr[readwrite]; - // It appears that misaligned relocations are allowed. - // Will they work correctly on big-endian? + // It appears that misaligned relocations are allowed. + // Will they work correctly on big-endian? - if (((addr & 3) && type != R_MIPS_32) || !Memory::IsValidAddress(addr)) { - if (numErrors < 10) { - WARN_LOG_REPORT(LOADER, "Suspicious address %08x, skipping reloc, type = %d", addr, type); - } else if (numErrors == 10) { - WARN_LOG(LOADER, "Too many bad relocations, skipping logging"); + if (((addr & 3) && type != R_MIPS_32) || !Memory::IsValidAddress(addr)) { + if (numErrors < 10) { + WARN_LOG_REPORT(LOADER, "Suspicious address %08x, skipping reloc, type = %d", addr, type); + } else if (numErrors == 10) { + WARN_LOG(LOADER, "Too many bad relocations, skipping logging"); + } + numErrors++; + continue; } - numErrors++; - continue; - } - u32 op = Memory::Read_Instruction(addr, true).encoding; + u32 op = Memory::ReadUnchecked_Instruction(addr, true).encoding; - const bool log = false; - //log=true; - if (log) { - DEBUG_LOG(LOADER,"rel at: %08x info: %08x type: %i",addr, info, type); - } - u32 relocateTo = segmentVAddr[relative]; + const bool log = false; + //log=true; + if (log) { + DEBUG_LOG(LOADER, "rel at: %08x info: %08x type: %i", addr, info, type); + } + u32 relocateTo = segmentVAddr[relative]; - switch (type) - { - case R_MIPS_32: - if (log) - DEBUG_LOG(LOADER,"Full address reloc %08x", addr); - //full address, no problemo - op += relocateTo; - break; + switch (type) { + case R_MIPS_32: + if (log) + DEBUG_LOG(LOADER, "Full address reloc %08x", addr); + //full address, no problemo + op += relocateTo; + break; - case R_MIPS_26: //j, jal - //add on to put in correct address space - if (log) - DEBUG_LOG(LOADER,"j/jal reloc %08x", addr); - op = (op & 0xFC000000) | (((op&0x03FFFFFF)+(relocateTo>>2))&0x03FFFFFF); - break; + case R_MIPS_26: //j, jal + //add on to put in correct address space + if (log) + DEBUG_LOG(LOADER, "j/jal reloc %08x", addr); + op = (op & 0xFC000000) | (((op & 0x03FFFFFF) + (relocateTo >> 2)) & 0x03FFFFFF); + break; - case R_MIPS_HI16: //lui part of lui-addiu pairs + case R_MIPS_HI16: //lui part of lui-addiu pairs { if (log) - DEBUG_LOG(LOADER,"HI reloc %08x", addr); + DEBUG_LOG(LOADER, "HI reloc %08x", addr); u32 cur = (op & 0xFFFF) << 16; u16 hi = 0; bool found = false; - for (int t = r + 1; t= nm.data_size + nm.bss_size + nm.text_size) { DEBUG_LOG(LOADER, "Zeroing out module %s memory: %08x - %08x", nm.name, memoryBlockAddr, memoryBlockAddr + memoryBlockSize); - for (u32 i = 0; i < (u32)(nm.text_size + 3); i += 4) { - Memory::Write_U32(MIPS_MAKE_BREAK(1), nm.text_addr + i); + u32 clearSize = Memory::ValidSize(nm.text_addr, (u32)nm.text_size + 3); + for (u32 i = 0; i < clearSize; i += 4) { + Memory::WriteUnchecked_U32(MIPS_MAKE_BREAK(1), nm.text_addr + i); } + NotifyMemInfo(MemBlockFlags::WRITE, nm.text_addr, clearSize, "ModuleClear"); Memory::Memset(nm.text_addr + nm.text_size, -1, nm.data_size + nm.bss_size, "ModuleClear"); // Let's also invalidate, just to make sure it's cleared out for any future data. @@ -1268,7 +1270,7 @@ static PSPModule *__KernelLoadELFFromPtr(const u8 *ptr, size_t elfSize, u32 load ElfReader reader((void*)ptr, elfSize); int result = reader.LoadInto(loadAddress, fromTop); - if (result != SCE_KERNEL_ERROR_OK) { + if (result != SCE_KERNEL_ERROR_OK) { ERROR_LOG(SCEMODULE, "LoadInto failed with error %08x",result); if (newptr) delete [] newptr; diff --git a/Core/HW/MemoryStick.cpp b/Core/HW/MemoryStick.cpp index f2520085bd..50f46a9321 100644 --- a/Core/HW/MemoryStick.cpp +++ b/Core/HW/MemoryStick.cpp @@ -16,6 +16,9 @@ // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include +#include +#include +#include #include "Common/Serialize/Serializer.h" #include "Common/Serialize/SerializeFuncs.h" #include "Core/CoreTiming.h" @@ -31,8 +34,20 @@ static bool memStickNeedsAssign = false; static u64 memStickInsertedAt = 0; static uint64_t memstickInitialFree = 0; -const u64 normalMemstickSize = 9ULL * 1024 * 1024 * 1024; -const u64 smallMemstickSize = 1ULL * 1024 * 1024 * 1024; +enum FreeCalcStatus { + NONE, + RUNNING, + DONE, + CLEANED_UP, +}; + +static std::thread freeCalcThread; +static std::condition_variable freeCalcCond; +static std::mutex freeCalcMutex; +static FreeCalcStatus freeCalcStatus = FreeCalcStatus::NONE; + +static const u64 normalMemstickSize = 9ULL * 1024 * 1024 * 1024; +static const u64 smallMemstickSize = 1ULL * 1024 * 1024 * 1024; void MemoryStick_DoState(PointerWrap &p) { auto s = p.Section("MemoryStick", 1, 5); @@ -75,7 +90,31 @@ u64 MemoryStick_SectorSize() { return 32 * 1024; // 32KB } +static void MemoryStick_CalcInitialFree() { + std::unique_lock guard(freeCalcMutex); + freeCalcStatus = FreeCalcStatus::RUNNING; + freeCalcThread = std::thread([] { + memstickInitialFree = pspFileSystem.FreeSpace("ms0:/") + pspFileSystem.getDirSize("ms0:/PSP/SAVEDATA/"); + + std::unique_lock guard(freeCalcMutex); + freeCalcStatus = FreeCalcStatus::DONE; + freeCalcCond.notify_all(); + }); +} + +static void MemoryStick_WaitInitialFree() { + std::unique_lock guard(freeCalcMutex); + while (freeCalcStatus == FreeCalcStatus::RUNNING) { + freeCalcCond.wait(guard); + } + if (freeCalcStatus == FreeCalcStatus::DONE) + freeCalcThread.join(); + freeCalcStatus = FreeCalcStatus::CLEANED_UP; +} + u64 MemoryStick_FreeSpace() { + MemoryStick_WaitInitialFree(); + const CompatFlags &flags = PSP_CoreParameter().compat.flags(); u64 realFreeSpace = pspFileSystem.FreeSpace("ms0:/"); @@ -135,5 +174,9 @@ void MemoryStick_Init() { } memStickNeedsAssign = false; - memstickInitialFree = pspFileSystem.FreeSpace("ms0:/") + pspFileSystem.getDirSize("ms0:/PSP/SAVEDATA/"); + MemoryStick_CalcInitialFree(); +} + +void MemoryStick_Shutdown() { + MemoryStick_WaitInitialFree(); } diff --git a/Core/HW/MemoryStick.h b/Core/HW/MemoryStick.h index f8bd295aba..481acf57b8 100644 --- a/Core/HW/MemoryStick.h +++ b/Core/HW/MemoryStick.h @@ -41,6 +41,7 @@ enum MemStickDriverState { }; void MemoryStick_Init(); +void MemoryStick_Shutdown(); void MemoryStick_DoState(PointerWrap &p); MemStickState MemoryStick_State(); MemStickFatState MemoryStick_FatState(); diff --git a/Core/MemMap.cpp b/Core/MemMap.cpp index 1f50b3b810..1dda153cb3 100644 --- a/Core/MemMap.cpp +++ b/Core/MemMap.cpp @@ -324,14 +324,10 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) { switch (p.mode) { case PointerWrap::MODE_READ: - GlobalThreadPool::Loop([&](int l, int h) { - memmove(d + l, storage + l, h - l); - }, 0, size); + GlobalThreadPool::Memcpy(d, storage, size); break; case PointerWrap::MODE_WRITE: - GlobalThreadPool::Loop([&](int l, int h) { - memmove(storage + l, d + l, h - l); - }, 0, size); + GlobalThreadPool::Memcpy(storage, d, size); break; case PointerWrap::MODE_MEASURE: // Nothing to do here. @@ -395,15 +391,6 @@ void Shutdown() { DEBUG_LOG(MEMMAP, "Memory system shut down."); } -void Clear() { - if (m_pPhysicalRAM) - memset(GetPointerUnchecked(PSP_GetKernelMemoryBase()), 0, g_MemorySize); - if (m_pPhysicalScratchPad) - memset(m_pPhysicalScratchPad, 0, SCRATCHPAD_SIZE); - if (m_pPhysicalVRAM1) - memset(m_pPhysicalVRAM1, 0, VRAM_SIZE); -} - bool IsActive() { return base != nullptr; } diff --git a/Core/ThreadPools.cpp b/Core/ThreadPools.cpp index 9af420cf37..e4eb7b5b9d 100644 --- a/Core/ThreadPools.cpp +++ b/Core/ThreadPools.cpp @@ -6,9 +6,19 @@ std::unique_ptr GlobalThreadPool::pool; std::once_flag GlobalThreadPool::init_flag; -void GlobalThreadPool::Loop(const std::function& loop, int lower, int upper) { +void GlobalThreadPool::Loop(const std::function& loop, int lower, int upper, int minSize) { std::call_once(init_flag, Inititialize); - pool->ParallelLoop(loop, lower, upper); + pool->ParallelLoop(loop, lower, upper, minSize); +} + +void GlobalThreadPool::Memcpy(void *dest, const void *src, int size) { + std::call_once(init_flag, Inititialize); + pool->ParallelMemcpy(dest, src, size); +} + +void GlobalThreadPool::Memset(void *dest, uint8_t val, int size) { + std::call_once(init_flag, Inititialize); + pool->ParallelMemset(dest, val, size); } void GlobalThreadPool::Inititialize() { diff --git a/Core/ThreadPools.h b/Core/ThreadPools.h index 350d56da1e..86af0afdad 100644 --- a/Core/ThreadPools.h +++ b/Core/ThreadPools.h @@ -6,7 +6,9 @@ class GlobalThreadPool { public: // will execute slices of "loop" from "lower" to "upper" // in parallel on the global thread pool - static void Loop(const std::function& loop, int lower, int upper); + static void Loop(const std::function& loop, int lower, int upper, int minSize = -1); + static void Memcpy(void *dest, const void *src, int size); + static void Memset(void *dest, uint8_t val, int size); private: static std::unique_ptr pool; diff --git a/GPU/Debugger/Record.cpp b/GPU/Debugger/Record.cpp index eca8ec9a68..90701dd2ef 100644 --- a/GPU/Debugger/Record.cpp +++ b/GPU/Debugger/Record.cpp @@ -16,6 +16,7 @@ // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. #include +#include #include #include #include @@ -32,6 +33,7 @@ #include "Core/HLE/sceDisplay.h" #include "Core/MemMap.h" #include "Core/System.h" +#include "Core/ThreadPools.h" #include "GPU/GPUInterface.h" #include "GPU/GPUState.h" #include "GPU/ge_constants.h" @@ -166,34 +168,47 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8 } const u8 *last_possible = haystack + hlen - nlen; + const u8 *first_possible = haystack + off; int first = *needle; - const u8 *p = haystack + off; - const uintptr_t align_mask = align - 1; - auto poffset = [&]() { - return ((uintptr_t)(p - haystack) & align_mask); - }; - auto alignp = [&]() { - uintptr_t offset = poffset(); - if (offset != 0) - p += align - offset; - }; + const u8 *result = nullptr; + std::mutex resultLock; - alignp(); - while (p <= last_possible) { - p = (const u8 *)memchr(p, first, last_possible - p + 1); - if (!p) { - return nullptr; - } - if (poffset() == 0 && !memcmp(p, needle, nlen)) { - return p; - } + int range = (int)(last_possible - first_possible); + GlobalThreadPool::Loop([&](int l, int h) { + const u8 *p = haystack + off + l; + const u8 *pend = haystack + off + h; + + const uintptr_t align_mask = align - 1; + auto poffset = [&]() { + return ((uintptr_t)(p - haystack) & align_mask); + }; + auto alignp = [&]() { + uintptr_t offset = poffset(); + if (offset != 0) + p += align - offset; + }; - p++; alignp(); - } + while (p <= pend) { + p = (const u8 *)memchr(p, first, pend - p + 1); + if (!p) { + return; + } + if (poffset() == 0 && !memcmp(p, needle, nlen)) { + std::lock_guard guard(resultLock); + // Take the lowest result so we get the same file for any # of threads. + if (!result || p < result) + result = p; + return; + } - return nullptr; + p++; + alignp(); + } + }, 0, range, 128 * 1024); + + return result; } static Command EmitCommandWithRAM(CommandType t, const void *p, u32 sz, u32 align) {