mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
NEON-optimize CLUT loads
This commit is contained in:
parent
50d771961b
commit
d19d8a8bb8
1 changed file with 20 additions and 0 deletions
|
@ -33,6 +33,9 @@
|
|||
#if defined(_M_SSE)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#if PPSSPP_ARCH(ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
// Videos should be updated every few frames, so we forget quickly.
|
||||
#define VIDEO_DECIMATE_AGE 4
|
||||
|
@ -931,6 +934,23 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
|
|||
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
|
||||
}
|
||||
}
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
if (bytes == loadBytes) {
|
||||
const uint32_t *source = (const uint32_t *)Memory::GetPointerUnchecked(clutAddr);
|
||||
uint32_t *dest = (uint32_t *)clutBufRaw_;
|
||||
int numBlocks = bytes / 32;
|
||||
for (int i = 0; i < numBlocks; i++, source += 8, dest += 8) {
|
||||
uint32x4_t data1 = vld1q_u32(source);
|
||||
uint32x4_t data2 = vld1q_u32(source + 4);
|
||||
vst1q_u32(dest, data1);
|
||||
vst1q_u32(dest + 4, data2);
|
||||
}
|
||||
} else {
|
||||
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
|
||||
if (bytes < loadBytes) {
|
||||
memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes);
|
||||
}
|
||||
}
|
||||
#else
|
||||
Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes);
|
||||
if (bytes < loadBytes) {
|
||||
|
|
Loading…
Add table
Reference in a new issue