From 64e977db08974c2301bf02bbfbf42d389134129e Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 2 Nov 2013 08:48:46 -0700 Subject: [PATCH] Improve the non-NEON tex hash path on ARM. This generates better looking disassembly, though a small change. --- GPU/Common/TextureDecoder.cpp | 31 +++++++++++++++++++++++++++---- GPU/Common/TextureDecoderNEON.cpp | 10 +++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index 9c614e77cd..21d4a76e0d 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -17,6 +17,7 @@ #include "Common/CPUDetect.h" #include "GPU/Common/TextureDecoder.h" +// NEON is in a separate file so that it can be compiled with a runtime check. #include "GPU/Common/TextureDecoderNEON.h" // TODO: Move some common things into here. @@ -43,8 +44,9 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) { cursor = _mm_xor_si128(cursor, chunk); cursor2 = _mm_add_epi16(cursor2, update); } + cursor = _mm_add_epi32(cursor, cursor2); // Add the four parts into the low i32. - cursor = _mm_add_epi32(cursor, _mm_add_epi32(_mm_srli_si128(cursor, 8), cursor2)); + cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8)); cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4)); check = _mm_cvtsi128_si32(cursor); } else { @@ -61,13 +63,34 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) { return check; } -QuickTexHashFunc DoQuickTexHash = &QuickTexHashSSE2; +static u32 QuickTexHashBasic(const void *checkp, u32 size) { +#ifdef __GNUC__ + __builtin_prefetch(checkp, 0, 0); +#endif + u32 check = 0; + const u32 size_u32 = size / 4; + const u32 *p = (const u32 *)checkp; + for (u32 i = 0; i + 4 <= size_u32; i += 4) { // Whole 4-word groups only, so p[i + 3] never reads past size. + check += p[i + 0]; + check ^= p[i + 1]; + check += p[i + 2]; + check ^= p[i + 3]; + } + + return check; +} + +QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic; + +// This has to be done after CPUDetect has done its magic. 
void SetupQuickTexHash() { #ifdef ARMV7 - if (cpu_info.bNEON) { + if (cpu_info.bNEON) DoQuickTexHash = &QuickTexHashNEON; - } +#else + if (cpu_info.bSSE2) + DoQuickTexHash = &QuickTexHashSSE2; #endif } diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp index 34d78c3c8c..3c249de082 100644 --- a/GPU/Common/TextureDecoderNEON.cpp +++ b/GPU/Common/TextureDecoderNEON.cpp @@ -26,6 +26,7 @@ static const u16 MEMORY_ALIGNED16(QuickTexHashInitial[8]) = {0x0001U, 0x0083U, 0 u32 QuickTexHashNEON(const void *checkp, u32 size) { u32 check = 0; + __builtin_prefetch(checkp, 0, 0); if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) { uint32x4_t cursor = vdupq_n_u32(0); @@ -46,10 +47,13 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) { cursor = vaddq_u32(cursor, cursor2); check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3); } else { + const u32 size_u32 = size / 4; const u32 *p = (const u32 *)checkp; - for (u32 i = 0; i < size / 8; ++i) { - check += *p++; - check ^= *p++; + for (u32 i = 0; i + 4 <= size_u32; i += 4) { // Whole 4-word groups only, so p[i + 3] never reads past size. + check += p[i + 0]; + check ^= p[i + 1]; + check += p[i + 2]; + check ^= p[i + 3]; } }