Merge pull request #14383 from unknownbrackets/threadpool

Use threads more on startup to improve load time
This commit is contained in:
Henrik Rydgård 2021-04-18 09:57:33 +02:00 committed by GitHub
commit 8e953b2a97
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 233 additions and 161 deletions

View file

@ -1,3 +1,5 @@
#include <algorithm>
#include <cstring>
#include "Common/Thread/ThreadPool.h"
#include "Common/Thread/ThreadUtil.h"
@ -54,43 +56,27 @@ void WorkerThread::WorkFunc() {
}
}
void LoopWorkerThread::Process(std::function<void(int, int)> work, int start, int end) {
void LoopWorkerThread::ProcessLoop(std::function<void(int, int)> work, int start, int end) {
std::lock_guard<std::mutex> guard(mutex);
work_ = std::move(work);
loopWork_ = std::move(work);
work_ = [this]() {
loopWork_(start_, end_);
};
start_ = start;
end_ = end;
jobsTarget = jobsDone + 1;
signal.notify_one();
}
void LoopWorkerThread::WorkFunc() {
setCurrentThreadName("LoopWorker");
std::unique_lock<std::mutex> guard(mutex);
while (active) {
// 'active == false' is one of the conditions for signaling,
// do not "optimize" it
while (active && jobsTarget <= jobsDone) {
signal.wait(guard);
}
if (active) {
work_(start_, end_);
std::lock_guard<std::mutex> doneGuard(doneMutex);
jobsDone++;
done.notify_one();
}
}
}
///////////////////////////// ThreadPool
ThreadPool::ThreadPool(int numThreads) {
if (numThreads <= 0) {
numThreads_ = 1;
INFO_LOG(JIT, "ThreadPool: Bad number of threads %d", numThreads);
} else if (numThreads > 8) {
INFO_LOG(JIT, "ThreadPool: Capping number of threads to 8 (was %d)", numThreads);
numThreads_ = 8;
} else if (numThreads > 16) {
INFO_LOG(JIT, "ThreadPool: Capping number of threads to 16 (was %d)", numThreads);
numThreads_ = 16;
} else {
numThreads_ = numThreads;
}
@ -108,23 +94,32 @@ void ThreadPool::StartWorkers() {
}
}
void ThreadPool::ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper) {
void ThreadPool::ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper, int minSize) {
// Don't parallelize tiny loops.
if (minSize == -1)
minSize = 4;
int range = upper - lower;
if (range >= numThreads_ * 2) { // don't parallelize tiny loops (this could be better, maybe add optional parameter that estimates work per iteration)
if (range >= minSize) {
std::lock_guard<std::mutex> guard(mutex);
StartWorkers();
// could do slightly better load balancing for the generic case,
// but doesn't matter since all our loops are power of 2
int chunk = range / numThreads_;
int chunk = std::max(minSize, range / numThreads_);
int s = lower;
for (auto& worker : workers) {
worker->Process(loop, s, s+chunk);
s+=chunk;
for (auto &worker : workers) {
// We'll do the last chunk on the current thread.
if (s + chunk >= upper) {
break;
}
worker->ProcessLoop(loop, s, s + chunk);
s += chunk;
}
// This is the final chunk.
loop(s, upper);
for (auto& worker : workers) {
if (s < upper)
loop(s, upper);
for (auto &worker : workers) {
worker->WaitForCompletion();
}
} else {
@ -132,3 +127,16 @@ void ThreadPool::ParallelLoop(const std::function<void(int,int)> &loop, int lowe
}
}
// Copies `size` bytes from `src` to `dest` by handing byte sub-ranges to ParallelLoop.
// Each invocation copies one [begin, end) slice with memmove; 128 KiB is passed
// through as ParallelLoop's minSize argument.
void ThreadPool::ParallelMemcpy(void *dest, const void *src, int size) {
	static const int kMinParallelBytes = 128 * 1024;

	uint8_t *const out = (uint8_t *)dest;
	const uint8_t *const in = (const uint8_t *)src;

	const auto copySlice = [out, in](int begin, int end) {
		memmove(out + begin, in + begin, end - begin);
	};
	ParallelLoop(copySlice, 0, size, kMinParallelBytes);
}
// Fills `size` bytes at `dest` with `val` by handing byte sub-ranges to ParallelLoop.
// Each invocation fills one [begin, end) slice with memset; 128 KiB is passed
// through as ParallelLoop's minSize argument.
void ThreadPool::ParallelMemset(void *dest, uint8_t val, int size) {
	static const int kMinParallelBytes = 128 * 1024;

	uint8_t *const out = (uint8_t *)dest;

	const auto fillSlice = [out, val](int begin, int end) {
		memset(out + begin, val, end - begin);
	};
	ParallelLoop(fillSlice, 0, size, kMinParallelBytes);
}

View file

@ -23,6 +23,8 @@ public:
void WaitForCompletion();
protected:
virtual void WorkFunc();
std::thread thread; // the worker thread
std::condition_variable signal; // used to signal new work
std::condition_variable done; // used to signal work completion
@ -30,11 +32,10 @@ protected:
bool active = true;
int jobsDone = 0;
int jobsTarget = 0;
private:
virtual void WorkFunc();
std::function<void()> work_; // the work to be done by this thread
private:
WorkerThread(const WorkerThread& other) = delete; // prevent copies
void operator =(const WorkerThread &other) = delete;
};
@ -42,14 +43,12 @@ private:
class LoopWorkerThread final : public WorkerThread {
public:
LoopWorkerThread() = default;
void Process(std::function<void(int, int)> work, int start, int end);
void ProcessLoop(std::function<void(int, int)> work, int start, int end);
private:
virtual void WorkFunc() override;
int start_;
int end_;
std::function<void(int, int)> work_; // the work to be done by this thread
std::function<void(int, int)> loopWork_; // the work to be done by this thread
};
// A thread pool manages a set of worker threads, and allows the execution of parallel loops on them
@ -61,7 +60,9 @@ public:
// don't need a destructor, "workers" is cleared on delete,
// leading to the stopping and joining of all worker threads (RAII and all that)
void ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper);
void ParallelLoop(const std::function<void(int,int)> &loop, int lower, int upper, int minSize);
void ParallelMemcpy(void *dest, const void *src, int sz);
void ParallelMemset(void *dest, uint8_t val, int sz);
private:
int numThreads_;

View file

@ -17,6 +17,7 @@
#include "Core/MemMap.h"
#include "Core/Reporting.h"
#include "Core/ThreadPools.h"
#include "Core/MIPS/MIPSTables.h"
#include "Core/ELF/ElfReader.h"
#include "Core/Debugger/MemBlockInfo.h"
@ -57,86 +58,83 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs)
{
int numErrors = 0;
DEBUG_LOG(LOADER, "Loading %i relocations...", numRelocs);
for (int r = 0; r < numRelocs; r++)
{
// INFO_LOG(LOADER, "Loading reloc %i (%p)...", r, rels + r);
u32 info = rels[r].r_info;
u32 addr = rels[r].r_offset;
GlobalThreadPool::Loop([&](int l, int h) {
for (int r = l; r < h; r++) {
VERBOSE_LOG(LOADER, "Loading reloc %i (%p)...", r, rels + r);
u32 info = rels[r].r_info;
u32 addr = rels[r].r_offset;
int type = info & 0xf;
int type = info & 0xf;
int readwrite = (info>>8) & 0xff;
int relative = (info>>16) & 0xff;
int readwrite = (info >> 8) & 0xff;
int relative = (info >> 16) & 0xff;
//0 = code
//1 = data
//0 = code
//1 = data
if (readwrite >= (int)ARRAY_SIZE(segmentVAddr)) {
if (numErrors < 10) {
ERROR_LOG_REPORT(LOADER, "Bad segment number %i", readwrite);
if (readwrite >= (int)ARRAY_SIZE(segmentVAddr)) {
if (numErrors < 10) {
ERROR_LOG_REPORT(LOADER, "Bad segment number %i", readwrite);
}
numErrors++;
continue;
}
numErrors++;
continue;
}
addr += segmentVAddr[readwrite];
addr += segmentVAddr[readwrite];
// It appears that misaligned relocations are allowed.
// Will they work correctly on big-endian?
// It appears that misaligned relocations are allowed.
// Will they work correctly on big-endian?
if (((addr & 3) && type != R_MIPS_32) || !Memory::IsValidAddress(addr)) {
if (numErrors < 10) {
WARN_LOG_REPORT(LOADER, "Suspicious address %08x, skipping reloc, type = %d", addr, type);
} else if (numErrors == 10) {
WARN_LOG(LOADER, "Too many bad relocations, skipping logging");
if (((addr & 3) && type != R_MIPS_32) || !Memory::IsValidAddress(addr)) {
if (numErrors < 10) {
WARN_LOG_REPORT(LOADER, "Suspicious address %08x, skipping reloc, type = %d", addr, type);
} else if (numErrors == 10) {
WARN_LOG(LOADER, "Too many bad relocations, skipping logging");
}
numErrors++;
continue;
}
numErrors++;
continue;
}
u32 op = Memory::Read_Instruction(addr, true).encoding;
u32 op = Memory::ReadUnchecked_Instruction(addr, true).encoding;
const bool log = false;
//log=true;
if (log) {
DEBUG_LOG(LOADER,"rel at: %08x info: %08x type: %i",addr, info, type);
}
u32 relocateTo = segmentVAddr[relative];
const bool log = false;
//log=true;
if (log) {
DEBUG_LOG(LOADER, "rel at: %08x info: %08x type: %i", addr, info, type);
}
u32 relocateTo = segmentVAddr[relative];
switch (type)
{
case R_MIPS_32:
if (log)
DEBUG_LOG(LOADER,"Full address reloc %08x", addr);
//full address, no problemo
op += relocateTo;
break;
switch (type) {
case R_MIPS_32:
if (log)
DEBUG_LOG(LOADER, "Full address reloc %08x", addr);
//full address, no problemo
op += relocateTo;
break;
case R_MIPS_26: //j, jal
//add on to put in correct address space
if (log)
DEBUG_LOG(LOADER,"j/jal reloc %08x", addr);
op = (op & 0xFC000000) | (((op&0x03FFFFFF)+(relocateTo>>2))&0x03FFFFFF);
break;
case R_MIPS_26: //j, jal
//add on to put in correct address space
if (log)
DEBUG_LOG(LOADER, "j/jal reloc %08x", addr);
op = (op & 0xFC000000) | (((op & 0x03FFFFFF) + (relocateTo >> 2)) & 0x03FFFFFF);
break;
case R_MIPS_HI16: //lui part of lui-addiu pairs
case R_MIPS_HI16: //lui part of lui-addiu pairs
{
if (log)
DEBUG_LOG(LOADER,"HI reloc %08x", addr);
DEBUG_LOG(LOADER, "HI reloc %08x", addr);
u32 cur = (op & 0xFFFF) << 16;
u16 hi = 0;
bool found = false;
for (int t = r + 1; t<numRelocs; t++)
{
if ((rels[t].r_info & 0xF) == R_MIPS_LO16)
{
for (int t = r + 1; t < numRelocs; t++) {
if ((rels[t].r_info & 0xF) == R_MIPS_LO16) {
u32 corrLoAddr = rels[t].r_offset + segmentVAddr[readwrite];
if (log) {
DEBUG_LOG(LOADER,"Corresponding lo found at %08x", corrLoAddr);
DEBUG_LOG(LOADER, "Corresponding lo found at %08x", corrLoAddr);
}
if (Memory::IsValidAddress(corrLoAddr)) {
s16 lo = (s16)Memory::ReadUnchecked_U16(corrLoAddr);
s16 lo = (s16)Memory::ReadUnchecked_Instruction(corrLoAddr, true).encoding;
cur += lo;
cur += relocateTo;
addrToHiLo(cur, hi, lo);
@ -150,14 +148,14 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs)
if (!found) {
ERROR_LOG_REPORT(LOADER, "R_MIPS_HI16: could not find R_MIPS_LO16");
}
op = (op & 0xFFFF0000) | (hi);
op = (op & 0xFFFF0000) | hi;
}
break;
case R_MIPS_LO16: //addiu part of lui-addiu pairs
case R_MIPS_LO16: //addiu part of lui-addiu pairs
{
if (log)
DEBUG_LOG(LOADER,"LO reloc %08x", addr);
DEBUG_LOG(LOADER, "LO reloc %08x", addr);
u32 cur = op & 0xFFFF;
cur += relocateTo;
cur &= 0xFFFF;
@ -165,29 +163,32 @@ bool ElfReader::LoadRelocations(const Elf32_Rel *rels, int numRelocs)
}
break;
case R_MIPS_GPREL16: //gp
// It seems safe to ignore this, almost a notification of a gp-relative operation?
break;
case R_MIPS_GPREL16: //gp
// It seems safe to ignore this, almost a notification of a gp-relative operation?
break;
case R_MIPS_16:
op = (op & 0xFFFF0000) | (((int)(op & 0xFFFF) + (int)relocateTo) & 0xFFFF);
break;
case R_MIPS_16:
op = (op & 0xFFFF0000) | (((int)(op & 0xFFFF) + (int)relocateTo) & 0xFFFF);
break;
case R_MIPS_NONE:
// This shouldn't matter, not sure the purpose of it.
break;
case R_MIPS_NONE:
// This shouldn't matter, not sure the purpose of it.
break;
default:
default:
{
char temp[256];
MIPSDisAsm(MIPSOpcode(op), 0, temp);
ERROR_LOG_REPORT(LOADER,"ARGH IT'S AN UNKNOWN RELOCATION!!!!!!!! %08x, type=%d : %s", addr, type, temp);
ERROR_LOG_REPORT(LOADER, "ARGH IT'S AN UNKNOWN RELOCATION!!!!!!!! %08x, type=%d : %s", addr, type, temp);
}
break;
}
Memory::WriteUnchecked_U32(op, addr);
NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
}
Memory::Write_U32(op, addr);
NotifyMemInfo(MemBlockFlags::WRITE, addr, 4, "Relocation");
}
}, 0, numRelocs, 32);
if (numErrors) {
WARN_LOG(LOADER, "%i bad relocations found!!!", numErrors);
}

View file

@ -774,6 +774,7 @@ void __IoShutdown() {
delete flash0System;
flash0System = nullptr;
MemoryStick_Shutdown();
memStickCallbacks.clear();
memStickFatCallbacks.clear();
}

View file

@ -24,10 +24,11 @@
#include "Core/Debugger/MemBlockInfo.h"
#include "Core/HLE/HLE.h"
#include "Core/HLE/FunctionWrappers.h"
#include "Core/System.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MemMapHelpers.h"
#include "Core/Reporting.h"
#include "Core/System.h"
#include "Core/ThreadPools.h"
#include "Common/Serialize/Serializer.h"
#include "Common/Serialize/SerializeFuncs.h"
#include "Common/Serialize/SerializeMap.h"
@ -430,8 +431,8 @@ void __KernelMemoryInit()
MemBlockInfoInit();
kernelMemory.Init(PSP_GetKernelMemoryBase(), PSP_GetKernelMemoryEnd() - PSP_GetKernelMemoryBase(), false);
userMemory.Init(PSP_GetUserMemoryBase(), PSP_GetUserMemoryEnd() - PSP_GetUserMemoryBase(), false);
Memory::Memset(PSP_GetKernelMemoryBase(), 0, PSP_GetKernelMemoryEnd() - PSP_GetKernelMemoryBase(), "MemInit");
Memory::Memset(PSP_GetUserMemoryBase(), 0, PSP_GetUserMemoryEnd() - PSP_GetUserMemoryBase(), "MemInit");
GlobalThreadPool::Memset(Memory::GetPointer(PSP_GetKernelMemoryBase()), 0, PSP_GetUserMemoryEnd() - PSP_GetKernelMemoryBase());
NotifyMemInfo(MemBlockFlags::WRITE, PSP_GetKernelMemoryBase(), PSP_GetUserMemoryEnd() - PSP_GetKernelMemoryBase(), "MemInit");
INFO_LOG(SCEKERNEL, "Kernel and user memory pools initialized");
vplWaitTimer = CoreTiming::RegisterEvent("VplTimeout", __KernelVplTimeout);

View file

@ -863,9 +863,11 @@ void PSPModule::Cleanup() {
if (memoryBlockAddr != 0 && nm.text_addr != 0 && memoryBlockSize >= nm.data_size + nm.bss_size + nm.text_size) {
DEBUG_LOG(LOADER, "Zeroing out module %s memory: %08x - %08x", nm.name, memoryBlockAddr, memoryBlockAddr + memoryBlockSize);
for (u32 i = 0; i < (u32)(nm.text_size + 3); i += 4) {
Memory::Write_U32(MIPS_MAKE_BREAK(1), nm.text_addr + i);
u32 clearSize = Memory::ValidSize(nm.text_addr, (u32)nm.text_size + 3);
for (u32 i = 0; i < clearSize; i += 4) {
Memory::WriteUnchecked_U32(MIPS_MAKE_BREAK(1), nm.text_addr + i);
}
NotifyMemInfo(MemBlockFlags::WRITE, nm.text_addr, clearSize, "ModuleClear");
Memory::Memset(nm.text_addr + nm.text_size, -1, nm.data_size + nm.bss_size, "ModuleClear");
// Let's also invalidate, just to make sure it's cleared out for any future data.
@ -1268,7 +1270,7 @@ static PSPModule *__KernelLoadELFFromPtr(const u8 *ptr, size_t elfSize, u32 load
ElfReader reader((void*)ptr, elfSize);
int result = reader.LoadInto(loadAddress, fromTop);
if (result != SCE_KERNEL_ERROR_OK) {
if (result != SCE_KERNEL_ERROR_OK) {
ERROR_LOG(SCEMODULE, "LoadInto failed with error %08x",result);
if (newptr)
delete [] newptr;

View file

@ -16,6 +16,9 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include <algorithm>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "Common/Serialize/Serializer.h"
#include "Common/Serialize/SerializeFuncs.h"
#include "Core/CoreTiming.h"
@ -31,8 +34,20 @@ static bool memStickNeedsAssign = false;
static u64 memStickInsertedAt = 0;
static uint64_t memstickInitialFree = 0;
const u64 normalMemstickSize = 9ULL * 1024 * 1024 * 1024;
const u64 smallMemstickSize = 1ULL * 1024 * 1024 * 1024;
// Lifecycle of the background computation that fills in memstickInitialFree.
enum FreeCalcStatus {
// Initial value of freeCalcStatus; no calculation has been started.
NONE,
// Set by MemoryStick_CalcInitialFree() just before it launches the worker thread.
RUNNING,
// Set by the worker thread once memstickInitialFree has been written.
DONE,
// Set by MemoryStick_WaitInitialFree() after waiting (and joining the thread if it was DONE).
CLEANED_UP,
};
// Worker thread started by MemoryStick_CalcInitialFree(); joined in MemoryStick_WaitInitialFree().
static std::thread freeCalcThread;
// Notified by the worker thread when freeCalcStatus becomes DONE.
static std::condition_variable freeCalcCond;
// Held while reading or writing freeCalcStatus.
static std::mutex freeCalcMutex;
static FreeCalcStatus freeCalcStatus = FreeCalcStatus::NONE;
static const u64 normalMemstickSize = 9ULL * 1024 * 1024 * 1024;
static const u64 smallMemstickSize = 1ULL * 1024 * 1024 * 1024;
void MemoryStick_DoState(PointerWrap &p) {
auto s = p.Section("MemoryStick", 1, 5);
@ -75,7 +90,31 @@ u64 MemoryStick_SectorSize() {
return 32 * 1024; // 32KB
}
static void MemoryStick_CalcInitialFree() {
std::unique_lock<std::mutex> guard(freeCalcMutex);
freeCalcStatus = FreeCalcStatus::RUNNING;
freeCalcThread = std::thread([] {
memstickInitialFree = pspFileSystem.FreeSpace("ms0:/") + pspFileSystem.getDirSize("ms0:/PSP/SAVEDATA/");
std::unique_lock<std::mutex> guard(freeCalcMutex);
freeCalcStatus = FreeCalcStatus::DONE;
freeCalcCond.notify_all();
});
}
// Blocks until the initial free-space calculation is no longer RUNNING, joins the
// worker thread if it finished (DONE), and then marks the state CLEANED_UP.
static void MemoryStick_WaitInitialFree() {
	std::unique_lock<std::mutex> lock(freeCalcMutex);

	// Predicate form of wait: loops until the status leaves RUNNING.
	freeCalcCond.wait(lock, [] {
		return freeCalcStatus != FreeCalcStatus::RUNNING;
	});

	const bool threadFinished = (freeCalcStatus == FreeCalcStatus::DONE);
	if (threadFinished) {
		freeCalcThread.join();
	}
	freeCalcStatus = FreeCalcStatus::CLEANED_UP;
}
u64 MemoryStick_FreeSpace() {
MemoryStick_WaitInitialFree();
const CompatFlags &flags = PSP_CoreParameter().compat.flags();
u64 realFreeSpace = pspFileSystem.FreeSpace("ms0:/");
@ -135,5 +174,9 @@ void MemoryStick_Init() {
}
memStickNeedsAssign = false;
memstickInitialFree = pspFileSystem.FreeSpace("ms0:/") + pspFileSystem.getDirSize("ms0:/PSP/SAVEDATA/");
MemoryStick_CalcInitialFree();
}
// Shuts down the memory stick module by waiting for (and cleaning up) the
// background initial-free-space calculation via MemoryStick_WaitInitialFree().
void MemoryStick_Shutdown() {
MemoryStick_WaitInitialFree();
}

View file

@ -41,6 +41,7 @@ enum MemStickDriverState {
};
void MemoryStick_Init();
void MemoryStick_Shutdown();
void MemoryStick_DoState(PointerWrap &p);
MemStickState MemoryStick_State();
MemStickFatState MemoryStick_FatState();

View file

@ -324,14 +324,10 @@ static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
switch (p.mode) {
case PointerWrap::MODE_READ:
GlobalThreadPool::Loop([&](int l, int h) {
memmove(d + l, storage + l, h - l);
}, 0, size);
GlobalThreadPool::Memcpy(d, storage, size);
break;
case PointerWrap::MODE_WRITE:
GlobalThreadPool::Loop([&](int l, int h) {
memmove(storage + l, d + l, h - l);
}, 0, size);
GlobalThreadPool::Memcpy(storage, d, size);
break;
case PointerWrap::MODE_MEASURE:
// Nothing to do here.
@ -395,15 +391,6 @@ void Shutdown() {
DEBUG_LOG(MEMMAP, "Memory system shut down.");
}
void Clear() {
if (m_pPhysicalRAM)
memset(GetPointerUnchecked(PSP_GetKernelMemoryBase()), 0, g_MemorySize);
if (m_pPhysicalScratchPad)
memset(m_pPhysicalScratchPad, 0, SCRATCHPAD_SIZE);
if (m_pPhysicalVRAM1)
memset(m_pPhysicalVRAM1, 0, VRAM_SIZE);
}
// Returns true when `base` is non-null.
// NOTE(review): presumably `base` is the mapped memory base pointer - confirm against its declaration.
bool IsActive() {
return base != nullptr;
}

View file

@ -6,9 +6,19 @@
std::unique_ptr<ThreadPool> GlobalThreadPool::pool;
std::once_flag GlobalThreadPool::init_flag;
void GlobalThreadPool::Loop(const std::function<void(int,int)>& loop, int lower, int upper) {
void GlobalThreadPool::Loop(const std::function<void(int,int)>& loop, int lower, int upper, int minSize) {
std::call_once(init_flag, Inititialize);
pool->ParallelLoop(loop, lower, upper);
pool->ParallelLoop(loop, lower, upper, minSize);
}
// Copies `size` bytes from `src` to `dest` using the global pool's ParallelMemcpy.
// Runs Inititialize once (via std::call_once) before touching the pool.
void GlobalThreadPool::Memcpy(void *dest, const void *src, int size) {
std::call_once(init_flag, Inititialize);
pool->ParallelMemcpy(dest, src, size);
}
// Fills `size` bytes at `dest` with `val` using the global pool's ParallelMemset.
// Runs Inititialize once (via std::call_once) before touching the pool.
void GlobalThreadPool::Memset(void *dest, uint8_t val, int size) {
std::call_once(init_flag, Inititialize);
pool->ParallelMemset(dest, val, size);
}
void GlobalThreadPool::Inititialize() {

View file

@ -6,7 +6,9 @@ class GlobalThreadPool {
public:
// will execute slices of "loop" from "lower" to "upper"
// in parallel on the global thread pool
static void Loop(const std::function<void(int,int)>& loop, int lower, int upper);
static void Loop(const std::function<void(int,int)>& loop, int lower, int upper, int minSize = -1);
static void Memcpy(void *dest, const void *src, int size);
static void Memset(void *dest, uint8_t val, int size);
private:
static std::unique_ptr<ThreadPool> pool;

View file

@ -16,6 +16,7 @@
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include <algorithm>
#include <atomic>
#include <cstring>
#include <functional>
#include <set>
@ -32,6 +33,7 @@
#include "Core/HLE/sceDisplay.h"
#include "Core/MemMap.h"
#include "Core/System.h"
#include "Core/ThreadPools.h"
#include "GPU/GPUInterface.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
@ -166,34 +168,47 @@ static const u8 *mymemmem(const u8 *haystack, size_t off, size_t hlen, const u8
}
const u8 *last_possible = haystack + hlen - nlen;
const u8 *first_possible = haystack + off;
int first = *needle;
const u8 *p = haystack + off;
const uintptr_t align_mask = align - 1;
auto poffset = [&]() {
return ((uintptr_t)(p - haystack) & align_mask);
};
auto alignp = [&]() {
uintptr_t offset = poffset();
if (offset != 0)
p += align - offset;
};
const u8 *result = nullptr;
std::mutex resultLock;
alignp();
while (p <= last_possible) {
p = (const u8 *)memchr(p, first, last_possible - p + 1);
if (!p) {
return nullptr;
}
if (poffset() == 0 && !memcmp(p, needle, nlen)) {
return p;
}
int range = (int)(last_possible - first_possible);
GlobalThreadPool::Loop([&](int l, int h) {
const u8 *p = haystack + off + l;
const u8 *pend = haystack + off + h;
const uintptr_t align_mask = align - 1;
auto poffset = [&]() {
return ((uintptr_t)(p - haystack) & align_mask);
};
auto alignp = [&]() {
uintptr_t offset = poffset();
if (offset != 0)
p += align - offset;
};
p++;
alignp();
}
while (p <= pend) {
p = (const u8 *)memchr(p, first, pend - p + 1);
if (!p) {
return;
}
if (poffset() == 0 && !memcmp(p, needle, nlen)) {
std::lock_guard<std::mutex> guard(resultLock);
// Take the lowest result so we get the same file for any # of threads.
if (!result || p < result)
result = p;
return;
}
return nullptr;
p++;
alignp();
}
}, 0, range, 128 * 1024);
return result;
}
static Command EmitCommandWithRAM(CommandType t, const void *p, u32 sz, u32 align) {