Improve the non-NEON tex hash path on ARM.

This generates better-looking disassembly, though it is only a small change.
This commit is contained in:
Unknown W. Brackets 2013-11-02 08:48:46 -07:00
parent 5efc7fd581
commit 64e977db08
2 changed files with 34 additions and 7 deletions

View file

@ -17,6 +17,7 @@
#include "Common/CPUDetect.h"
#include "GPU/Common/TextureDecoder.h"
// NEON is in a separate file so that it can be compiled with a runtime check.
#include "GPU/Common/TextureDecoderNEON.h"
// TODO: Move some common things into here.
@ -43,8 +44,9 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
cursor = _mm_xor_si128(cursor, chunk);
cursor2 = _mm_add_epi16(cursor2, update);
}
cursor = _mm_add_epi32(cursor, cursor2);
// Add the four parts into the low i32.
cursor = _mm_add_epi32(cursor, _mm_add_epi32(_mm_srli_si128(cursor, 8), cursor2));
cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
check = _mm_cvtsi128_si32(cursor);
} else {
@ -61,13 +63,34 @@ static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
return check;
}
QuickTexHashFunc DoQuickTexHash = &QuickTexHashSSE2;
// Portable (non-SIMD) quick hash over a buffer of 32-bit words.
//
// checkp: start of the data to hash; it is read as an array of u32.
// size:   length of the data in bytes. Only whole 32-bit words are hashed
//         (size / 4); any trailing 1-3 bytes are ignored.
//
// Returns the hash, built by alternately adding and xor-ing successive words
// into an accumulator that starts at 0.
//
// The main loop consumes four words per iteration. Words left over when the
// word count is not a multiple of four are folded in afterwards with the same
// add/xor sequence, so the function never reads beyond `size` bytes.
static u32 QuickTexHashBasic(const void *checkp, u32 size) {
#ifdef __GNUC__
	// Read prefetch hint for the start of the buffer.
	__builtin_prefetch(checkp, 0, 0);
#endif

	u32 check = 0;
	const u32 size_u32 = size / 4;
	// Number of words that can be processed in complete groups of four.
	const u32 grouped_u32 = size_u32 & ~3U;
	const u32 *p = (const u32 *)checkp;

	u32 i = 0;
	for (; i < grouped_u32; i += 4) {
		check += p[i + 0];
		check ^= p[i + 1];
		check += p[i + 2];
		check ^= p[i + 3];
	}

	// Fold in the remaining 0-3 words without touching memory past the end.
	const u32 tail_u32 = size_u32 - grouped_u32;
	if (tail_u32 > 0) {
		check += p[i + 0];
	}
	if (tail_u32 > 1) {
		check ^= p[i + 1];
	}
	if (tail_u32 > 2) {
		check += p[i + 2];
	}

	return check;
}
// Pointer to the hash implementation in use. It starts out as the portable
// QuickTexHashBasic; SetupQuickTexHash() may repoint it at a NEON or SSE2
// version depending on the cpu_info flags.
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
// Repoints DoQuickTexHash at a SIMD implementation when the CPU reports
// support for it: NEON on ARMV7 builds, SSE2 otherwise. If the relevant flag
// is not set, DoQuickTexHash keeps whatever value it already had.
//
// This has to be done after CPUDetect has done its magic, because the choice
// is driven by the cpu_info flags.
void SetupQuickTexHash() {
#ifdef ARMV7
	if (cpu_info.bNEON) {
		DoQuickTexHash = &QuickTexHashNEON;
	}
#else
	if (cpu_info.bSSE2) {
		DoQuickTexHash = &QuickTexHashSSE2;
	}
#endif
}

View file

@ -26,6 +26,7 @@ static const u16 MEMORY_ALIGNED16(QuickTexHashInitial[8]) = {0x0001U, 0x0083U, 0
u32 QuickTexHashNEON(const void *checkp, u32 size) {
u32 check = 0;
__builtin_prefetch(checkp, 0, 0);
if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
uint32x4_t cursor = vdupq_n_u32(0);
@ -46,10 +47,13 @@ u32 QuickTexHashNEON(const void *checkp, u32 size) {
cursor = vaddq_u32(cursor, cursor2);
check = vgetq_lane_u32(cursor, 0) + vgetq_lane_u32(cursor, 1) + vgetq_lane_u32(cursor, 2) + vgetq_lane_u32(cursor, 3);
} else {
const u32 size_u32 = size / 4;
const u32 *p = (const u32 *)checkp;
for (u32 i = 0; i < size / 8; ++i) {
check += *p++;
check ^= *p++;
for (u32 i = 0; i < size_u32; i += 4) {
check += p[i + 0];
check ^= p[i + 1];
check += p[i + 2];
check ^= p[i + 3];
}
}