mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
105 lines
2.9 KiB
C++
105 lines
2.9 KiB
C++
#include <cstring>
|
|
|
|
#include "ParallelLoop.h"
|
|
|
|
class LoopRangeTask : public Task {
|
|
public:
|
|
LoopRangeTask(WaitableCounter *counter, const std::function<void(int, int)> &loop, int lower, int upper)
|
|
: counter_(counter), loop_(loop), lower_(lower), upper_(upper) {}
|
|
|
|
void Run() override {
|
|
loop_(lower_, upper_);
|
|
counter_->Count();
|
|
}
|
|
|
|
std::function<void(int, int)> loop_;
|
|
WaitableCounter *counter_;
|
|
|
|
int lower_;
|
|
int upper_;
|
|
};
|
|
|
|
// Splits the range [lower, upper) into contiguous sub-ranges and enqueues one
// LoopRangeTask per sub-range on the thread manager, task i going to thread index i.
//
// threadMan: thread manager used to query the looper thread count and enqueue tasks.
// loop:      callback invoked as loop(begin, end) for each sub-range. begin/end are
//            absolute indices inside [lower, upper), not offsets relative to lower.
// lower:     first index of the range (inclusive).
// upper:     end of the range (exclusive).
// minSize:   -1 is replaced by 1. Not otherwise used for splitting yet (see TODO).
//
// Returns a WaitableCounter allocated with new, constructed with the number of
// tasks enqueued. For an empty or inverted range nothing is enqueued and a
// counter constructed with 0 is returned.
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize) {
	if (minSize == -1) {
		minSize = 1;
	}

	// TODO: Optimize using minSize.
	int numTasks = threadMan->GetNumLooperThreads();

	int range = upper - lower;
	if (range <= 0) {
		// Bad range. A finished counter allocated.
		return new WaitableCounter(0);
	}

	if (range <= numTasks) {
		// Fewer (or equally many) elements than threads: one single-element task per element.
		WaitableCounter *counter = new WaitableCounter(range);
		for (int i = 0; i < range; i++) {
			// Offset by lower so the callback sees indices from the requested range.
			threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(counter, loop, lower + i, lower + i + 1));
		}
		return counter;
	} else {
		WaitableCounter *counter = new WaitableCounter(numTasks);
		// Split the range between threads. The fractional step is accumulated so that
		// sub-range sizes differ by at most one element.
		double dx = (double)range / (double)numTasks;
		double d = 0.0;
		int lastEnd = lower;
		for (int i = 0; i < numTasks; i++) {
			int start = lastEnd;
			d += dx;
			// The last task always ends exactly at upper, so rounding can never drop elements.
			int end = i == numTasks - 1 ? upper : lower + (int)d;
			threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(counter, loop, start, end));
			lastEnd = end;
		}
		return counter;
	}
}
|
|
|
|
// Variant of ParallelRangeLoopWaitable that waits on the returned counter
// before returning to the caller.
//
// threadMan, loop, lower, upper are forwarded unchanged.
// minSize: -1 is replaced by 4 here before forwarding.
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize) {
	const int effectiveMinSize = (minSize == -1) ? 4 : minSize;

	WaitableCounter *pending = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, effectiveMinSize);
	// TODO: Optimize using minSize. We'll just compute whether there's a remainder, remove it from the call to ParallelRangeLoopWaitable,
	// and process the remainder right here. If there's no remainder, we'll steal a whole chunk.
	if (!pending) {
		return;
	}
	// NOTE(review): presumably blocks until all tasks have counted and then frees the counter - confirm in WaitableCounter.
	pending->WaitAndRelease();
}
|
|
|
|
// Copies `bytes` bytes from src to dst, splitting large copies into chunks
// that are dispatched through ParallelRangeLoop.
//
// threadMan: thread manager forwarded to ParallelRangeLoop.
// dst, src:  destination and source buffers, each at least `bytes` long.
// bytes:     number of bytes to copy.
//
// Buffers shorter than 512 bytes are copied directly with memcpy.
// NOTE: the range API works on int, so `bytes` is narrowed to int for the
// parallel path; sizes that do not fit in an int are not supported here.
void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t bytes) {
	// This threshold can probably be a lot bigger.
	if (bytes < 512) {
		memcpy(dst, src, bytes);
		return;
	}

	// 128 is the largest cacheline size on common CPUs.
	// Still I suspect that the optimal minSize is a lot higher.

	char *d = static_cast<char *>(dst);
	const char *s = static_cast<const char *>(src);
	// The range is [0, bytes) and 128 is the minSize hint. Previously 128 was passed
	// as the upper bound, so only the first 128 bytes were ever copied.
	ParallelRangeLoop(threadMan, [&](int l, int h) {
		memmove(d + l, s + l, h - l);
	}, 0, (int)bytes, 128);
}
|
|
|
|
|
|
// Fills `bytes` bytes at dst with `value`, splitting large fills into chunks
// that are dispatched through ParallelRangeLoop.
//
// threadMan: thread manager forwarded to ParallelRangeLoop.
// dst:       destination buffer, at least `bytes` long.
// value:     byte value to write.
// bytes:     number of bytes to fill.
//
// Buffers shorter than 512 bytes are filled directly with memset.
// NOTE: the range API works on int, so `bytes` is narrowed to int for the
// parallel path; sizes that do not fit in an int are not supported here.
void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t bytes) {
	// This threshold can probably be a lot bigger.
	if (bytes < 512) {
		// Use the requested value; this path used to always write zeros.
		memset(dst, value, bytes);
		return;
	}

	// 128 is the largest cacheline size on common CPUs.
	// Still I suspect that the optimal minSize is a lot higher.

	char *d = static_cast<char *>(dst);
	// The range is [0, bytes) and 128 is the minSize hint. Previously 128 was passed
	// as the upper bound, so only the first 128 bytes were ever filled.
	ParallelRangeLoop(threadMan, [&](int l, int h) {
		memset(d + l, value, h - l);
	}, 0, (int)bytes, 128);
}
|