diff --git a/Common/Thread/ThreadPool.cpp b/Common/Thread/ThreadPool.cpp
index e64d1c8786..2624252a25 100644
--- a/Common/Thread/ThreadPool.cpp
+++ b/Common/Thread/ThreadPool.cpp
@@ -1,3 +1,5 @@
+#include <algorithm>
+#include <cstring>
 #include "Common/Thread/ThreadPool.h"
 #include "Common/Thread/ThreadUtil.h"
 
@@ -54,43 +56,27 @@ void WorkerThread::WorkFunc() {
 	}
 }
 
-void LoopWorkerThread::Process(std::function<void(int, int)> work, int start, int end) {
+void LoopWorkerThread::ProcessLoop(std::function<void(int, int)> work, int start, int end) { // hand work(start, end) to this worker and wake it
 	std::lock_guard<std::mutex> guard(mutex);
-	work_ = std::move(work);
+	loopWork_ = std::move(work); // keep the ranged callback; work_ below only forwards to it
+	work_ = [this]() { // start_/end_ are read when the lambda runs, so they may be assigned after this
+		loopWork_(start_, end_);
+	};
 	start_ = start;
 	end_ = end;
 	jobsTarget = jobsDone + 1;
 	signal.notify_one();
 }
 
-void LoopWorkerThread::WorkFunc() {
-	setCurrentThreadName("LoopWorker");
-	std::unique_lock<std::mutex> guard(mutex);
-	while (active) {
-		// 'active == false' is one of the conditions for signaling,
-		// do not "optimize" it
-		while (active && jobsTarget <= jobsDone) {
-			signal.wait(guard);
-		}
-		if (active) {
-			work_(start_, end_);
-
-			std::lock_guard<std::mutex> doneGuard(doneMutex);
-			jobsDone++;
-			done.notify_one();
-		}
-	}
-}
-
 ///////////////////////////// ThreadPool
 
 ThreadPool::ThreadPool(int numThreads) {
 	if (numThreads <= 0) {
 		numThreads_ = 1;
 		INFO_LOG(JIT, "ThreadPool: Bad number of threads %d", numThreads);
-	} else if (numThreads > 8) {
-		INFO_LOG(JIT, "ThreadPool: Capping number of threads to 8 (was %d)", numThreads);
-		numThreads_ = 8;
+	} else if (numThreads > 16) {
+		INFO_LOG(JIT, "ThreadPool: Capping number of threads to 16 (was %d)", numThreads);
+		numThreads_ = 16;
 	} else {
 		numThreads_ = numThreads;
 	}
@@ -108,23 +94,32 @@ void ThreadPool::StartWorkers() {
 	}
 }
 
-void ThreadPool::ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper) {
+void ThreadPool::ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper, int minSize) {
+	// Don't parallelize tiny loops.
+	if (minSize == -1)
+		minSize = 4;
+
 	int range = upper - lower;
-	if (range >= numThreads_ * 2) { // don't parallelize tiny loops (this could be better, maybe add optional parameter that estimates work per iteration)
+	if (range >= minSize) {
 		std::lock_guard<std::mutex> guard(mutex);
 		StartWorkers();
 
 		// could do slightly better load balancing for the generic case, 
 		// but doesn't matter since all our loops are power of 2
-		int chunk = range / numThreads_;
+		int chunk = std::max(minSize, range / numThreads_);
 		int s = lower;
-		for (auto& worker : workers) {
-			worker->Process(loop, s, s+chunk);
-			s+=chunk;
+		for (auto &worker : workers) {
+			// We'll do the last chunk on the current thread.
+			if (s + chunk >= upper) {
+				break;
+			}
+			worker->ProcessLoop(loop, s, s + chunk);
+			s += chunk;
 		}
 		// This is the final chunk.
-		loop(s, upper);
-		for (auto& worker : workers) {
+		if (s < upper)
+			loop(s, upper);
+		for (auto &worker : workers) {
 			worker->WaitForCompletion();
 		}
 	} else {
@@ -132,3 +127,16 @@ void ThreadPool::ParallelLoop(const std::function<void(int,int)> &loop, int lowe
 	}
 }
 
+void ThreadPool::ParallelMemcpy(void *dest, const void *src, int size) { // copies size bytes, split into byte ranges via ParallelLoop
+	static const int MIN_SIZE = 128 * 1024; // passed as ParallelLoop's minSize (smallest range it will split)
+	ParallelLoop([&](int begin, int end) {
+		memmove(static_cast<uint8_t *>(dest) + begin, static_cast<const uint8_t *>(src) + begin, end - begin);
+	}, 0, size, MIN_SIZE);
+}
+
+void ThreadPool::ParallelMemset(void *dest, uint8_t val, int size) { // fills size bytes with val, split into byte ranges via ParallelLoop
+	static const int MIN_SIZE = 128 * 1024; // passed as ParallelLoop's minSize (smallest range it will split)
+	ParallelLoop([&](int begin, int end) {
+		memset(static_cast<uint8_t *>(dest) + begin, val, end - begin);
+	}, 0, size, MIN_SIZE);
+}
diff --git a/Common/Thread/ThreadPool.h b/Common/Thread/ThreadPool.h
index 2064b31c28..ceac3c39c6 100644
--- a/Common/Thread/ThreadPool.h
+++ b/Common/Thread/ThreadPool.h
@@ -23,6 +23,8 @@ public:
 	void WaitForCompletion();
 
 protected:
+	virtual void WorkFunc();
+
 	std::thread thread; // the worker thread
 	std::condition_variable signal; // used to signal new work
 	std::condition_variable done; // used to signal work completion
@@ -30,11 +32,10 @@ protected:
 	bool active = true;
 	int jobsDone = 0;
 	int jobsTarget = 0;
-private:
-	virtual void WorkFunc();
 
 	std::function<void()> work_; // the work to be done by this thread
 
+private:
 	WorkerThread(const WorkerThread& other) = delete; // prevent copies
 	void operator =(const WorkerThread &other) = delete;
 };
@@ -42,14 +43,12 @@ private:
 class LoopWorkerThread final : public WorkerThread {
 public:
 	LoopWorkerThread() = default;
-	void Process(std::function<void(int, int)> work, int start, int end);
+	void ProcessLoop(std::function<void(int, int)> work, int start, int end); // run work(start, end) on this worker's thread
 
 private:
-	virtual void WorkFunc() override;
-
 	int start_;
 	int end_;
-	std::function<void(int, int)> work_; // the work to be done by this thread
+	std::function<void(int, int)> loopWork_; // ranged loop body; ProcessLoop wraps it in work_ and calls it with (start_, end_)
 };
 
 // A thread pool manages a set of worker threads, and allows the execution of parallel loops on them
@@ -61,7 +60,9 @@ public:
 	// don't need a destructor, "workers" is cleared on delete, 
 	// leading to the stopping and joining of all worker threads (RAII and all that)
 
-	void ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper);
+	void ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper, int minSize);
+	void ParallelMemcpy(void *dest, const void *src, int sz);
+	void ParallelMemset(void *dest, uint8_t val, int sz);
 
 private:
 	int numThreads_;
diff --git a/Core/ELF/ElfReader.cpp b/Core/ELF/ElfReader.cpp
index 69328d49e2..410bbb8d6d 100644
--- a/Core/ELF/ElfReader.cpp
+++ b/Core/ELF/ElfReader.cpp
@@ -17,6 +17,7 @@
 
 #include "Core/MemMap.h"
 #include "Core/Reporting.h"
+#include "Core/ThreadPools.h"
 #include "Core/MIPS/MIPSTables.h"
 #include "Core/ELF/ElfReader.h"
 #include "Core/Debugger/MemBlockInfo.h"
@@ -57,86 +58,83 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs)
 {
 	int numErrors = 0;
 	DEBUG_LOG(LOADER, "Loading %i relocations...", numRelocs);
-	for (int r = 0; r < numRelocs; r++)
-	{
-		// INFO_LOG(LOADER, "Loading reloc %i  (%p)...", r, rels + r);
-		u32 info = rels[r].r_info;
-		u32 addr = rels[r].r_offset;
+	GlobalThreadPool::Loop([&](int l, int h) {
+		for (int r = l; r < h; r++) {
+			VERBOSE_LOG(LOADER, "Loading reloc %i  (%p)...", r, rels + r);
+			u32 info = rels[r].r_info;
+			u32 addr = rels[r].r_offset;
 
-		int type = info & 0xf;
+			int type = info & 0xf;
 
-		int readwrite = (info>>8) & 0xff; 
-		int relative  = (info>>16) & 0xff;
+			int readwrite = (info >> 8) & 0xff;
+			int relative = (info >> 16) & 0xff;
 
-		//0 = code
-		//1 = data
+			//0 = code
+			//1 = data
 
-		if (readwrite >= (int)ARRAY_SIZE(segmentVAddr)) {
-			if (numErrors < 10) {
-				ERROR_LOG_REPORT(LOADER, "Bad segment number %i", readwrite);
+			if (readwrite >= (int)ARRAY_SIZE(segmentVAddr)) {
+				if (numErrors < 10) {
+					ERROR_LOG_REPORT(LOADER, "Bad segment number %i", readwrite);
+				}
+				numErrors++;
+				continue;
 			}
-			numErrors++;
-			continue;
-		}
 
-		addr += segmentVAddr[readwrite];
+			addr += segmentVAddr[readwrite];
 
-		// It appears that misaligned relocations are allowed.
-		// Will they work correctly on big-endian?
+			// It appears that misaligned relocations are allowed.
+			// Will they work correctly on big-endian?
 
-		if (((addr & 3) && type != R_MIPS_32) || !Memory::IsValidAddress(addr)) {
-			if (numErrors < 10) {
-				WARN_LOG_REPORT(LOADER, "Suspicious address %08x, skipping reloc, type = %d", addr, type);
-			} else if (numErrors == 10) {
-				WARN_LOG(LOADER, "Too many bad relocations, skipping logging");
+			if (((addr & 3) && type != R_MIPS_32) || !Memory::IsValidAddress(addr)) {
+				if (numErrors < 10) {
+					WARN_LOG_REPORT(LOADER, "Suspicious address %08x, skipping reloc, type = %d", addr, type);
+				} else if (numErrors == 10) {
+					WARN_LOG(LOADER, "Too many bad relocations, skipping logging");
+				}
+				numErrors++;
+				continue;
 			}
-			numErrors++;
-			continue;
-		}
 
-		u32 op = Memory::Read_Instruction(addr, true).encoding;
+			u32 op = Memory::ReadUnchecked_Instruction(addr, true).encoding;
 
-		const bool log = false;
-		//log=true;
-		if (log) {
-			DEBUG_LOG(LOADER,"rel at: %08x  info: %08x   type: %i",addr, info, type);
-		}
-		u32 relocateTo = segmentVAddr[relative];
+			const bool log = false;
+			//log=true;
+			if (log) {
+				DEBUG_LOG(LOADER, "rel at: %08x  info: %08x   type: %i", addr, info, type);
+			}
+			u32 relocateTo = segmentVAddr[relative];
 
-		switch (type) 
-		{
-		case R_MIPS_32:
-			if (log)
-				DEBUG_LOG(LOADER,"Full address reloc %08x", addr);
-			//full address, no problemo
-			op += relocateTo;
-			break;
+			switch (type) {
+			case R_MIPS_32:
+				if (log)
+					DEBUG_LOG(LOADER, "Full address reloc %08x", addr);
+				//full address, no problemo
+				op += relocateTo;
+				break;
 
-		case R_MIPS_26: //j, jal
-			//add on to put in correct address space
-			if (log)
-				DEBUG_LOG(LOADER,"j/jal reloc %08x", addr);
-			op = (op & 0xFC000000) | (((op&0x03FFFFFF)+(relocateTo>>2))&0x03FFFFFF);
-			break;
+			case R_MIPS_26: //j, jal
+				//add on to put in correct address space
+				if (log)
+					DEBUG_LOG(LOADER, "j/jal reloc %08x", addr);
+				op = (op & 0xFC000000) | (((op & 0x03FFFFFF) + (relocateTo >> 2)) & 0x03FFFFFF);
+				break;
 
-		case R_MIPS_HI16: //lui part of lui-addiu pairs
+			case R_MIPS_HI16: //lui part of lui-addiu pairs
 			{
 				if (log)
-					DEBUG_LOG(LOADER,"HI reloc %08x", addr);
+					DEBUG_LOG(LOADER, "HI reloc %08x", addr);
 
 				u32 cur = (op & 0xFFFF) << 16;
 				u16 hi = 0;
 				bool found = false;
-				for (int t = r + 1; t<numRelocs; t++)
-				{
-					if ((rels[t].r_info & 0xF) == R_MIPS_LO16) 
-					{
+				for (int t = r + 1; t < numRelocs; t++) {
+					if ((rels[t].r_info & 0xF) == R_MIPS_LO16) {
 						u32 corrLoAddr = rels[t].r_offset + segmentVAddr[readwrite];
 						if (log) {
-							DEBUG_LOG(LOADER,"Corresponding lo found at %08x", corrLoAddr);
+							DEBUG_LOG(LOADER, "Corresponding lo found at %08x", corrLoAddr);
 						}
 						if (Memory::IsValidAddress(corrLoAddr)) {
-							s16 lo = (s16)Memory::ReadUnchecked_U16(corrLoAddr);
+							s16 lo = (s16)Memory::ReadUnchecked_Instruction(corrLoAddr, true).encoding;
 							cur += lo;
 							cur += relocateTo;
 							addrToHiLo(cur, hi, lo);
@@ -150,14 +148,14 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs)
 				if (!found) {
 					ERROR_LOG_REPORT(LOADER, "R_MIPS_HI16: could not find R_MIPS_LO16");
 				}
-				op = (op & 0xFFFF0000) | (hi);
+				op = (op & 0xFFFF0000) | hi;
 			}
 			break;
 
-		case R_MIPS_LO16: //addiu part of lui-addiu pairs
+			case R_MIPS_LO16: //addiu part of lui-addiu pairs
 			{
 				if (log)
-					DEBUG_LOG(LOADER,"LO reloc %08x", addr);
+					DEBUG_LOG(LOADER, "LO reloc %08x", addr);
 				u32 cur = op & 0xFFFF;
 				cur += relocateTo;
 				cur &= 0xFFFF;
@@ -165,29 +163,32 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs)
 			}
 			break;
 
-		case R_MIPS_GPREL16: //gp
-			// It seems safe to ignore this, almost a notification of a gp-relative operation?
-			break;
+			case R_MIPS_GPREL16: //gp
+				// It seems safe to ignore this, almost a notification of a gp-relative operation?
+				break;
 
-		case R_MIPS_16:
-			op = (op & 0xFFFF0000) | (((int)(op & 0xFFFF) + (int)relocateTo) & 0xFFFF);
-			break;
+			case R_MIPS_16:
+				op = (op & 0xFFFF0000) | (((int)(op & 0xFFFF) + (int)relocateTo) & 0xFFFF);
+				break;
 
-		case R_MIPS_NONE:
-			// This shouldn't matter, not sure the purpose of it.
-			break;
+			case R_MIPS_NONE:
+				// This shouldn't matter, not sure the purpose of it.
+				break;
 
-		default:
+			default:
 			{
 				char temp[256];
 				MIPSDisAsm(MIPSOpcode(op), 0, temp);
-				ERROR_LOG_REPORT(LOADER,"ARGH IT'S AN UNKNOWN RELOCATION!!!!!!!! %08x, type=%d : %s", addr, type, temp);
+				ERROR_LOG_REPORT(LOADER, "ARGH IT'S AN UNKNOWN RELOCATION!!!!!!!! %08x, type=%d : %s", addr, type, temp);
 			}
 			break;
+			}
+
+			Memory::WriteUnchecked_U32(op, addr);
+			NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
 		}
-		Memory::Write_U32(op, addr);
-		NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
-	}
+	}, 0, numRelocs, 32);
+
 	if (numErrors) {
 		WARN_LOG(LOADER, "%i bad relocations found!!!", numErrors);
 	}
diff --git a/Core/HLE/sceIo.cpp b/Core/HLE/sceIo.cpp
index 7706c08620..e292aad9aa 100644
--- a/Core/HLE/sceIo.cpp
+++ b/Core/HLE/sceIo.cpp
@@ -774,6 +774,7 @@ void __IoShutdown() {
 	delete flash0System;
 	flash0System = nullptr;
 
+	MemoryStick_Shutdown();
 	memStickCallbacks.clear();
 	memStickFatCallbacks.clear();
 }
diff --git a/Core/HLE/sceKernelMemory.cpp b/Core/HLE/sceKernelMemory.cpp
index fa8a80b61f..634eb43f58 100644
--- a/Core/HLE/sceKernelMemory.cpp
+++ b/Core/HLE/sceKernelMemory.cpp
@@ -24,10 +24,11 @@
 #include "Core/Debugger/MemBlockInfo.h"
 #include "Core/HLE/HLE.h"
 #include "Core/HLE/FunctionWrappers.h"
-#include "Core/System.h"
 #include "Core/MIPS/MIPS.h"
 #include "Core/MemMapHelpers.h"
 #include "Core/Reporting.h"
+#include "Core/System.h"
+#include "Core/ThreadPools.h"
 #include "Common/Serialize/Serializer.h"
 #include "Common/Serialize/SerializeFuncs.h"
 #include "Common/Serialize/SerializeMap.h"
@@ -430,8 +431,8 @@ void __KernelMemoryInit()
 	MemBlockInfoInit();
 	kernelMemory.Init(PSP_GetKernelMemoryBase(), PSP_GetKernelMemoryEnd() - PSP_GetKernelMemoryBase(), false);
 	userMemory.Init(PSP_GetUserMemoryBase(), PSP_GetUserMemoryEnd() - PSP_GetUserMemoryBase(), false);
-	Memory::Memset(PSP_GetKernelMemoryBase(), 0, PSP_GetKernelMemoryEnd() - PSP_GetKernelMemoryBase(), "MemInit");
-	Memory::Memset(PSP_GetUserMemoryBase(), 0, PSP_GetUserMemoryEnd() - PSP_GetUserMemoryBase(), "MemInit");
+	GlobalThreadPool::Memset(Memory::GetPointer(PSP_GetKernelMemoryBase()), 0, PSP_GetUserMemoryEnd() - PSP_GetKernelMemoryBase());
+	NotifyMemInfo(MemBlockFlags::WRITE, PSP_GetKernelMemoryBase(), PSP_GetUserMemoryEnd() - PSP_GetKernelMemoryBase(), "MemInit");
 	INFO_LOG(SCEKERNEL, "Kernel and user memory pools initialized");
 
 	vplWaitTimer = CoreTiming::RegisterEvent("VplTimeout", __KernelVplTimeout);
diff --git a/Core/HLE/sceKernelModule.cpp b/Core/HLE/sceKernelModule.cpp
index 1501681784..4607c75efd 100644
--- a/Core/HLE/sceKernelModule.cpp
+++ b/Core/HLE/sceKernelModule.cpp
@@ -863,9 +863,11 @@ void PSPModule::Cleanup() {
 
 	if (memoryBlockAddr != 0 && nm.text_addr != 0 && memoryBlockSize >= nm.data_size + nm.bss_size + nm.text_size) {
 		DEBUG_LOG(LOADER, "Zeroing out module %s memory: %08x - %08x", nm.name, memoryBlockAddr, memoryBlockAddr + memoryBlockSize);
-		for (u32 i = 0; i < (u32)(nm.text_size + 3); i += 4) {
-			Memory::Write_U32(MIPS_MAKE_BREAK(1), nm.text_addr + i);
+		u32 clearSize = Memory::ValidSize(nm.text_addr, (u32)nm.text_size + 3);
+		for (u32 i = 0; i < clearSize; i += 4) {
+			Memory::WriteUnchecked_U32(MIPS_MAKE_BREAK(1), nm.text_addr + i);
 		}
+		NotifyMemInfo(MemBlockFlags::WRITE, nm.text_addr, clearSize, "ModuleClear");
 		Memory::Memset(nm.text_addr + nm.text_size, -1, nm.data_size + nm.bss_size, "ModuleClear");
 
 		// Let's also invalidate, just to make sure it's cleared out for any future data.
@@ -1268,7 +1270,7 @@ static PSPModule *__KernelLoadELFFromPtr(const u8 *ptr, size_t elfSize, u32 load
 	ElfReader reader((void*)ptr, elfSize);
 
 	int result = reader.LoadInto(loadAddress, fromTop);
-	if (result != SCE_KERNEL_ERROR_OK) 	{
+	if (result != SCE_KERNEL_ERROR_OK) {
 		ERROR_LOG(SCEMODULE, "LoadInto failed with error %08x",result);
 		if (newptr)
 			delete [] newptr;
diff --git a/Core/HW/MemoryStick.cpp b/Core/HW/MemoryStick.cpp
index f2520085bd..50f46a9321 100644
--- a/Core/HW/MemoryStick.cpp
+++ b/Core/HW/MemoryStick.cpp
@@ -16,6 +16,9 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include <algorithm>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
 #include "Common/Serialize/Serializer.h"
 #include "Common/Serialize/SerializeFuncs.h"
 #include "Core/CoreTiming.h"
@@ -31,8 +34,20 @@ static bool memStickNeedsAssign = false;
 static u64 memStickInsertedAt = 0;
 static uint64_t memstickInitialFree = 0;
 
-const u64 normalMemstickSize = 9ULL * 1024 * 1024 * 1024;
-const u64 smallMemstickSize = 1ULL * 1024 * 1024 * 1024;
+enum class FreeCalcStatus { // scoped, so NONE/RUNNING/DONE do not leak into file scope
+	NONE, // no calculation has been started
+	RUNNING, // freeCalcThread is computing memstickInitialFree
+	DONE, // result published; the thread still has to be joined
+	CLEANED_UP, // a wait has completed; nothing is left to join
+};
+
+static std::thread freeCalcThread; // background worker for the initial free-space figure
+static std::condition_variable freeCalcCond; // notified when the worker sets DONE
+static std::mutex freeCalcMutex; // guards freeCalcStatus
+static FreeCalcStatus freeCalcStatus = FreeCalcStatus::NONE;
+
+static const u64 normalMemstickSize = 9ULL * 1024 * 1024 * 1024;
+static const u64 smallMemstickSize = 1ULL * 1024 * 1024 * 1024;
 
 void MemoryStick_DoState(PointerWrap &p) {
 	auto s = p.Section("MemoryStick", 1, 5);
@@ -75,7 +90,31 @@ u64 MemoryStick_SectorSize() {
 	return 32 * 1024; // 32KB
 }
 
+static void MemoryStick_CalcInitialFree() { // starts the background computation of memstickInitialFree
+	std::lock_guard<std::mutex> startLock(freeCalcMutex);
+	freeCalcStatus = FreeCalcStatus::RUNNING;
+	freeCalcThread = std::thread([] {
+		memstickInitialFree = pspFileSystem.FreeSpace("ms0:/") + pspFileSystem.getDirSize("ms0:/PSP/SAVEDATA/");
+		// Publish completion under the mutex so a waiter reliably observes DONE.
+		std::lock_guard<std::mutex> doneLock(freeCalcMutex);
+		freeCalcStatus = FreeCalcStatus::DONE;
+		freeCalcCond.notify_all();
+	});
+}
+
+static void MemoryStick_WaitInitialFree() { // blocks until no calculation is running, then joins the worker once
+	std::unique_lock<std::mutex> lock(freeCalcMutex);
+	// Returns immediately unless a calculation is currently in flight.
+	freeCalcCond.wait(lock, [] { return freeCalcStatus != FreeCalcStatus::RUNNING; });
+	// Only a finished, not-yet-joined worker is joined; later calls see CLEANED_UP.
+	if (freeCalcStatus == FreeCalcStatus::DONE)
+		freeCalcThread.join();
+	freeCalcStatus = FreeCalcStatus::CLEANED_UP;
+}
+
 u64 MemoryStick_FreeSpace() {
+	MemoryStick_WaitInitialFree();
+
 	const CompatFlags &flags = PSP_CoreParameter().compat.flags();
 	u64 realFreeSpace = pspFileSystem.FreeSpace("ms0:/");
 
@@ -135,5 +174,9 @@ void MemoryStick_Init() {
 	}
 
 	memStickNeedsAssign = false;
-	memstickInitialFree = pspFileSystem.FreeSpace("ms0:/") + pspFileSystem.getDirSize("ms0:/PSP/SAVEDATA/");
+	MemoryStick_CalcInitialFree();
+}
+
+void MemoryStick_Shutdown() {
+	MemoryStick_WaitInitialFree(); // let the free-space thread finish and join it before teardown
 }
diff --git a/Core/HW/MemoryStick.h b/Core/HW/MemoryStick.h
index f8bd295aba..481acf57b8 100644
--- a/Core/HW/MemoryStick.h
+++ b/Core/HW/MemoryStick.h
@@ -41,6 +41,7 @@ enum MemStickDriverState {
 };
 
 void MemoryStick_Init();
+void MemoryStick_Shutdown();
 void MemoryStick_DoState(PointerWrap &p);
 MemStickState MemoryStick_State();
 MemStickFatState MemoryStick_FatState();
diff --git a/Core/MemMap.cpp b/Core/MemMap.cpp
index 1f50b3b810..1dda153cb3 100644
--- a/Core/MemMap.cpp
+++ b/Core/MemMap.cpp
@@ -324,14 +324,10 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
 
 	switch (p.mode) {
 	case PointerWrap::MODE_READ:
-		GlobalThreadPool::Loop([&](int l, int h) {
-			memmove(d + l, storage + l, h - l);
-		}, 0, size);
+		GlobalThreadPool::Memcpy(d, storage, size);
 		break;
 	case PointerWrap::MODE_WRITE:
-		GlobalThreadPool::Loop([&](int l, int h) {
-			memmove(storage + l, d + l, h - l);
-		}, 0, size);
+		GlobalThreadPool::Memcpy(storage, d, size);
 		break;
 	case PointerWrap::MODE_MEASURE:
 		// Nothing to do here.
@@ -395,15 +391,6 @@ void Shutdown() {
 	DEBUG_LOG(MEMMAP, "Memory system shut down.");
 }
 
-void Clear() {
-	if (m_pPhysicalRAM)
-		memset(GetPointerUnchecked(PSP_GetKernelMemoryBase()), 0, g_MemorySize);
-	if (m_pPhysicalScratchPad)
-		memset(m_pPhysicalScratchPad, 0, SCRATCHPAD_SIZE);
-	if (m_pPhysicalVRAM1)
-		memset(m_pPhysicalVRAM1, 0, VRAM_SIZE);
-}
-
 bool IsActive() {
 	return base != nullptr;
 }
diff --git a/Core/ThreadPools.cpp b/Core/ThreadPools.cpp
index 9af420cf37..e4eb7b5b9d 100644
--- a/Core/ThreadPools.cpp
+++ b/Core/ThreadPools.cpp
@@ -6,9 +6,19 @@
 std::unique_ptr<ThreadPool> GlobalThreadPool::pool;
 std::once_flag GlobalThreadPool::init_flag;
 
-void GlobalThreadPool::Loop(const std::function<void(int,int)>& loop, int lower, int upper) {
+void GlobalThreadPool::Loop(const std::function<void(int,int)>& loop, int lower, int upper, int minSize) { // minSize is forwarded unchanged to ThreadPool::ParallelLoop
 	std::call_once(init_flag, Inititialize);
-	pool->ParallelLoop(loop, lower, upper);
+	pool->ParallelLoop(loop, lower, upper, minSize);
+}
+
+void GlobalThreadPool::Memcpy(void *dest, const void *src, int size) { // copy size bytes from src to dest on the shared pool
+	std::call_once(init_flag, Inititialize); // runs Inititialize once, on first use (presumably creates pool — confirm)
+	pool->ParallelMemcpy(dest, src, size);
+}
+
+void GlobalThreadPool::Memset(void *dest, uint8_t val, int size) { // fill size bytes at dest with val on the shared pool
+	std::call_once(init_flag, Inititialize); // runs Inititialize once, on first use (presumably creates pool — confirm)
+	pool->ParallelMemset(dest, val, size);
 }
 
 void GlobalThreadPool::Inititialize() {
diff --git a/Core/ThreadPools.h b/Core/ThreadPools.h
index 350d56da1e..86af0afdad 100644
--- a/Core/ThreadPools.h
+++ b/Core/ThreadPools.h
@@ -6,7 +6,9 @@ class GlobalThreadPool {
 public:
 	// will execute slices of "loop" from "lower" to "upper"
 	// in parallel on the global thread pool
-	static void Loop(const std::function<void(int,int)>& loop, int lower, int upper);
+	static void Loop(const std::function<void(int,int)>& loop, int lower, int upper, int minSize = -1);
+	static void Memcpy(void *dest, const void *src, int size);
+	static void Memset(void *dest, uint8_t val, int size);
 
 private:
 	static std::unique_ptr<ThreadPool> pool;
diff --git a/GPU/Debugger/Record.cpp b/GPU/Debugger/Record.cpp
index eca8ec9a68..90701dd2ef 100644
--- a/GPU/Debugger/Record.cpp
+++ b/GPU/Debugger/Record.cpp
@@ -16,6 +16,7 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include <algorithm>
+#include <atomic>
 #include <cstring>
 #include <functional>
 #include <set>
@@ -32,6 +33,7 @@
 #include "Core/HLE/sceDisplay.h"
 #include "Core/MemMap.h"
 #include "Core/System.h"
+#include "Core/ThreadPools.h"
 #include "GPU/GPUInterface.h"
 #include "GPU/GPUState.h"
 #include "GPU/ge_constants.h"
@@ -166,34 +168,47 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8
 	}
 
 	const u8 *last_possible = haystack + hlen - nlen;
+	const u8 *first_possible = haystack + off;
 	int first = *needle;
-	const u8 *p = haystack + off;
 
-	const uintptr_t align_mask = align - 1;
-	auto poffset = [&]() {
-		return ((uintptr_t)(p - haystack) & align_mask);
-	};
-	auto alignp = [&]() {
-		uintptr_t offset = poffset();
-		if (offset != 0)
-			p += align - offset;
-	};
+	const u8 *result = nullptr;
+	std::mutex resultLock;
 
-	alignp();
-	while (p <= last_possible) {
-		p = (const u8 *)memchr(p, first, last_possible - p + 1);
-		if (!p) {
-			return nullptr;
-		}
-		if (poffset() == 0 && !memcmp(p, needle, nlen)) {
-			return p;
-		}
+	int range = (int)(last_possible - first_possible);
+	GlobalThreadPool::Loop([&](int l, int h) {
+		const u8 *p = haystack + off + l;
+		const u8 *pend = haystack + off + h;
+
+		const uintptr_t align_mask = align - 1;
+		auto poffset = [&]() {
+			return ((uintptr_t)(p - haystack) & align_mask);
+		};
+		auto alignp = [&]() {
+			uintptr_t offset = poffset();
+			if (offset != 0)
+				p += align - offset;
+		};
 
-		p++;
 		alignp();
-	}
+		while (p <= pend) {
+			p = (const u8 *)memchr(p, first, pend - p + 1);
+			if (!p) {
+				return;
+			}
+			if (poffset() == 0 && !memcmp(p, needle, nlen)) {
+				std::lock_guard<std::mutex> guard(resultLock);
+				// Take the lowest result so we get the same file for any # of threads.
+				if (!result || p < result)
+					result = p;
+				return;
+			}
 
-	return nullptr;
+			p++;
+			alignp();
+		}
+	}, 0, range, 128 * 1024);
+
+	return result;
 }
 
 static Command EmitCommandWithRAM(CommandType t, const void *p, u32 sz, u32 align) {