Add a NEON-optimized version of XXH32.

This takes at least 40% less time to hash on NEON/ARM devices.
This commit is contained in:
Unknown W. Brackets 2014-03-25 00:21:04 -07:00
parent 3ec61274fa
commit b800762ceb
8 changed files with 118 additions and 12 deletions

View file

@ -15,6 +15,7 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "ext/xxhash.h"
#include "Common/CPUDetect.h"
#include "GPU/Common/TextureDecoder.h"
// NEON is in a separate file so that it can be compiled with a runtime check.
@ -152,6 +153,7 @@ void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
ReliableHashFunc DoReliableHash = &XXH32;
// This has to be done after CPUDetect has done its magic.
void SetupTextureDecoder() {
@ -159,6 +161,10 @@ void SetupTextureDecoder() {
if (cpu_info.bNEON) {
DoQuickTexHash = &QuickTexHashNEON;
DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
#ifndef IOS
// Not sure if this is safe on iOS, it's had issues with xxhash.
DoReliableHash = &ReliableHashNEON;
#endif
}
#elif _M_SSE
if (cpu_info.bSSE2) {

View file

@ -30,6 +30,9 @@ extern QuickTexHashFunc DoQuickTexHash;
typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
extern UnswizzleTex16Func DoUnswizzleTex16;
typedef u32 (*ReliableHashFunc)(const void *input, int len, u32 seed);
extern ReliableHashFunc DoReliableHash;
// All these DXT structs are in the reverse order, as compared to PC.
// On PC, alpha comes before color, and interpolants are before the tile data.

View file

@ -146,3 +146,100 @@ void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 p
ydestp += (rowWidth * 8) / 4;
}
}
// NOTE: This is just a NEON version of xxhash.
// GCC sucks at making things NEON and can't seem to handle it.
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
# include <stdint.h>
typedef uint8_t BYTE;
typedef uint16_t U16;
typedef uint32_t U32;
typedef int32_t S32;
typedef uint64_t U64;
#else
typedef unsigned char BYTE;
typedef unsigned short U16;
typedef unsigned int U32;
typedef signed int S32;
typedef unsigned long long U64;
#endif
#define PRIME32_1 2654435761U
#define PRIME32_2 2246822519U
#define PRIME32_3 3266489917U
#define PRIME32_4 668265263U
#define PRIME32_5 374761393U
#if defined(_MSC_VER)
# define XXH_rotl32(x,r) _rotl(x,r)
#else
# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
#endif
u32 ReliableHashNEON(const void *input, int len, u32 seed) {
const u8 *p = (const u8 *)input;
const u8 *const bEnd = p + len;
U32 h32;
#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
#endif
if (len>=16)
{
const BYTE* const limit = bEnd - 16;
U32 v1 = seed + PRIME32_1 + PRIME32_2;
U32 v2 = seed + PRIME32_2;
U32 v3 = seed + 0;
U32 v4 = seed - PRIME32_1;
uint32x4_t prime32_1q = vdupq_n_u32(PRIME32_1);
uint32x4_t prime32_2q = vdupq_n_u32(PRIME32_2);
uint32x4_t vq = vcombine_u32(vcreate_u32(v1 | ((U64)v2 << 32)), vcreate_u32(v3 | ((U64)v4 << 32)));
do
{
__builtin_prefetch(p + 0xc0, 0, 0);
vq = vmlaq_u32(vq, vld1q_u32((const U32*)p), prime32_2q);
vq = vorrq_u32(vshlq_n_u32(vq, 13), vshrq_n_u32(vq, 32 - 13));
p += 16;
vq = vmulq_u32(vq, prime32_1q);
} while (p<=limit);
v1 = vgetq_lane_u32(vq, 0);
v2 = vgetq_lane_u32(vq, 1);
v3 = vgetq_lane_u32(vq, 2);
v4 = vgetq_lane_u32(vq, 3);
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
}
else
{
h32 = seed + PRIME32_5;
}
h32 += (U32) len;
while (p<=bEnd-4)
{
h32 += *(const U32*)p * PRIME32_3;
h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
p+=4;
}
while (p<bEnd)
{
h32 += (*p) * PRIME32_5;
h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
p++;
}
h32 ^= h32 >> 15;
h32 *= PRIME32_2;
h32 ^= h32 >> 13;
h32 *= PRIME32_3;
h32 ^= h32 >> 16;
return h32;
}

View file

@ -19,3 +19,4 @@
u32 QuickTexHashNEON(const void *checkp, u32 size);
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
u32 ReliableHashNEON(const void *input, int len, u32 seed);

View file

@ -698,7 +698,7 @@ void TextureCacheDX9::UpdateCurrentClut() {
// If not, we're going to hash random data, which hopefully doesn't cause a performance issue.
const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;
clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
/*
// Avoid a copy when we don't need to convert colors.

View file

@ -26,12 +26,12 @@
#include "Core/CoreTiming.h"
#include "helper/dx_state.h"
#include "ext/xxhash.h"
#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/Directx9/StateMappingDX9.h"
#include "GPU/Directx9/TextureCacheDX9.h"
#include "GPU/Directx9/TransformPipelineDX9.h"
@ -1042,7 +1042,7 @@ u32 TransformDrawEngineDX9::ComputeHash() {
for (int i = 0; i < numDrawCalls; i++) {
if (!drawCalls[i].inds) {
vertexCount = std::min((int)drawCalls[i].vertexCount, 500);
fullhash += XXH32((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4);
fullhash += DoReliableHash((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4);
} else {
vertexCount = std::min((int)drawCalls[i].vertexCount, 500);
@ -1050,10 +1050,10 @@ u32 TransformDrawEngineDX9::ComputeHash() {
// This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way
// we do when drawing.
fullhash += XXH32((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound,
fullhash += DoReliableHash((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound,
vertexSize * indicesCount, 0x029F3EE1);
int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1;
fullhash += XXH32((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA);
fullhash += DoReliableHash((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA);
}
}

View file

@ -734,7 +734,7 @@ void TextureCache::UpdateCurrentClut() {
// If not, we're going to hash random data, which hopefully doesn't cause a performance issue.
const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;
clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
// Avoid a copy when we don't need to convert colors.
if (clutFormat != GE_CMODE_32BIT_ABGR8888) {

View file

@ -73,12 +73,12 @@
#include "Core/CoreTiming.h"
#include "native/gfx_es2/gl_state.h"
#include "ext/xxhash.h"
#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/Common/SplineCommon.h"
#include "GPU/GLES/StateMapping.h"
#include "GPU/GLES/TextureCache.h"
@ -86,7 +86,6 @@
#include "GPU/GLES/VertexDecoder.h"
#include "GPU/GLES/ShaderManager.h"
#include "GPU/GLES/GLES_GPU.h"
#include "GPU/Common/SplineCommon.h"
extern const GLuint glprim[8] = {
GL_POINTS,
@ -454,7 +453,7 @@ u32 TransformDrawEngine::ComputeHash() {
for (int i = 0; i < numDrawCalls; i++) {
const DeferredDrawCall &dc = drawCalls[i];
if (!dc.inds) {
fullhash += XXH32((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4);
fullhash += DoReliableHash((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4);
} else {
int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound;
int j = i + 1;
@ -469,16 +468,16 @@ u32 TransformDrawEngine::ComputeHash() {
}
// This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way
// we do when drawing.
fullhash += XXH32((const char *)dc.verts + vertexSize * indexLowerBound,
fullhash += DoReliableHash((const char *)dc.verts + vertexSize * indexLowerBound,
vertexSize * (indexUpperBound - indexLowerBound), 0x029F3EE1);
int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1;
// Hm, we will miss some indices when combining above, but meh, it should be fine.
fullhash += XXH32((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA);
fullhash += DoReliableHash((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA);
i = lastMatch;
}
}
if (uvScale) {
fullhash += XXH32(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
}
return fullhash;