ParallelLoop: A bit smarter straggler handling.

This commit is contained in:
Henrik Rydgård 2021-06-12 21:57:16 +02:00
parent 3be5c7bd9a
commit 5b64a41a97
4 changed files with 47 additions and 21 deletions

View file

@ -35,25 +35,52 @@ WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::
if (range <= numTasks) {
// Just assign one task per thread, as many as we have.
WaitableCounter *counter = new WaitableCounter(range);
WaitableCounter *waitableCounter = new WaitableCounter(range);
for (int i = 0; i < range; i++) {
threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(counter, loop, i, i + 1), TaskType::CPU_COMPUTE);
threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(waitableCounter, loop, i, i + 1), TaskType::CPU_COMPUTE);
}
return counter;
return waitableCounter;
} else {
WaitableCounter *counter = new WaitableCounter(numTasks);
// Split the range between threads.
double dx = (double)range / (double)numTasks;
double d = 0.0;
int lastEnd = 0;
// Split the range between threads. Allow for some fractional bits.
const int fractionalBits = 8;
int64_t totalFrac = (int64_t)range << fractionalBits;
int64_t delta = totalFrac / numTasks;
delta = std::max(delta, (int64_t)minSize << fractionalBits);
// Now we can compute the actual number of tasks.
// Remember that stragglers are done on the current thread
// so we don't round up.
numTasks = (int)(totalFrac / delta);
WaitableCounter *waitableCounter = new WaitableCounter(numTasks);
int64_t counter = (int64_t)lower << fractionalBits;
// Split up tasks as equitably as possible.
for (int i = 0; i < numTasks; i++) {
int start = lastEnd;
d += dx;
int end = i == numTasks - 1 ? range : (int)d;
threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(counter, loop, start, end), TaskType::CPU_COMPUTE);
lastEnd = end;
int start = (int)(counter >> fractionalBits);
int end = (int)((counter + delta) >> fractionalBits);
if (end > upper) {
// Let's do the stragglers on the current thread.
break;
}
threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(waitableCounter, loop, start, end), TaskType::CPU_COMPUTE);
counter += delta;
if ((counter >> fractionalBits) > upper) {
break;
}
}
return counter;
// Run stragglers on the calling thread directly.
// We might add a flag later to avoid this for some cases.
int stragglerStart = (int)(counter >> fractionalBits);
int stragglerEnd = upper;
if (stragglerStart < stragglerEnd) {
// printf("doing stragglers: %d-%d\n", stragglerStart, stragglerEnd);
loop(stragglerStart, stragglerEnd);
}
return waitableCounter;
}
}

View file

@ -32,11 +32,10 @@ public:
std::condition_variable cond_;
};
// Note that upper bounds are non-inclusive.
// This one never executes the remainder on the calling thread.
// Note that upper bounds are non-inclusive: range is [lower, upper)
WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize);
// This one optimizes by running the remainder on the calling thread.
// Note that upper bounds are non-inclusive: range is [lower, upper)
void ParallelRangeLoop(ThreadManager *threadMan, const std::function<void(int, int)> &loop, int lower, int upper, int minSize);
// Common utilities for large (!) memory copies.

View file

@ -627,8 +627,8 @@ handleELF:
}
done:
info_->pending = false;
info_->working = false;
info_->pending.store(false);
info_->working.store(false);
info_->readyEvent.Notify();
// INFO_LOG(SYSTEM, "Completed writing info for %s", info_->GetTitle().c_str());
}

View file

@ -46,10 +46,10 @@ bool TestParallelLoop(ThreadManager *threadMan) {
// Now it's done.
// Try a loop with stragglers.
printf("blocking test #1\n");
printf("blocking test #1 [0-65)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 65, 1);
// Try a loop with a relatively large minimum size.
printf("blocking test #2\n");
printf("blocking test #2 [0-100)\n");
ParallelRangeLoop(threadMan, rangeFunc, 0, 100, 40);
return true;
}