SaveState: Use SIMD to copy RAM faster.

We know its size and base are typically aligned, so abuse those facts.
This commit is contained in:
Unknown W. Brackets 2021-04-11 18:02:21 -07:00
parent 3400e6e517
commit c5729bbd5e

View file

@ -46,6 +46,10 @@
#include "Core/ThreadPools.h"
#include "UI/OnScreenDisplay.h"
#ifdef _M_SSE
#include <emmintrin.h>
#endif
namespace Memory {
// The base pointer to the auto-mirrored arena.
@ -314,19 +318,51 @@ void Reinit() {
Core_NotifyLifecycle(CoreLifecycle::MEMORY_REINITED);
}
static void CopyAlignedFast(uint8_t *dst8, const uint8_t *src8, int l, int h) {
#ifdef _M_SSE
// All threads round both down, so they will still align.
l &= ~0x3F;
h &= ~0x3F;
const int n = (h - l) / 16;
const __m128i *src = (__m128i *)(src8 + l);
__m128i *dst = (__m128i *)(dst8 + l);
for (int i = 0; i < n; i += 4) {
__m128i row0 = _mm_loadu_si128(src + 0);
__m128i row1 = _mm_loadu_si128(src + 1);
__m128i row2 = _mm_loadu_si128(src + 2);
__m128i row3 = _mm_loadu_si128(src + 3);
_mm_storeu_si128(dst + 0, row0);
_mm_storeu_si128(dst + 1, row1);
_mm_storeu_si128(dst + 2, row2);
_mm_storeu_si128(dst + 3, row3);
src += 4;
dst += 4;
}
#else
memcpy(dst8 + l, src8 + l, h - l);
#endif
}
static void DoMemoryVoid(PointerWrap &p, uint32_t start, uint32_t size) {
uint8_t *d = GetPointer(start);
uint8_t *&storage = *p.ptr;
// We only handle aligned data and sizes.
#ifdef _M_SSE
if ((size & 0x3F) != 0 || ((uintptr_t)d & 0x3F) != 0)
return p.DoVoid(d, size);
#endif
switch (p.mode) {
case PointerWrap::MODE_READ:
GlobalThreadPool::Loop([&](int l, int h) {
memcpy(d + l, storage + l, h - l);
CopyAlignedFast(d, storage, l, h);
}, 0, size);
break;
case PointerWrap::MODE_WRITE:
GlobalThreadPool::Loop([&](int l, int h) {
memcpy(storage + l, d + l, h - l);
CopyAlignedFast(storage, d, l, h);
}, 0, size);
break;
case PointerWrap::MODE_MEASURE: