mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Add a NEON-optimized version of XXH32.
This takes at least 40% less time to hash on NEON/ARM devices.
This commit is contained in:
parent
3ec61274fa
commit
b800762ceb
8 changed files with 118 additions and 12 deletions
|
@ -15,6 +15,7 @@
|
|||
// Official git repository and contact information can be found at
|
||||
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
||||
|
||||
#include "ext/xxhash.h"
|
||||
#include "Common/CPUDetect.h"
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
// NEON is in a separate file so that it can be compiled with a runtime check.
|
||||
|
@ -152,6 +153,7 @@ void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32
|
|||
|
||||
QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
|
||||
UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
|
||||
ReliableHashFunc DoReliableHash = &XXH32;
|
||||
|
||||
// This has to be done after CPUDetect has done its magic.
|
||||
void SetupTextureDecoder() {
|
||||
|
@ -159,6 +161,10 @@ void SetupTextureDecoder() {
|
|||
if (cpu_info.bNEON) {
|
||||
DoQuickTexHash = &QuickTexHashNEON;
|
||||
DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
|
||||
#ifndef IOS
|
||||
// Not sure if this is safe on iOS, it's had issues with xxhash.
|
||||
DoReliableHash = &ReliableHashNEON;
|
||||
#endif
|
||||
}
|
||||
#elif _M_SSE
|
||||
if (cpu_info.bSSE2) {
|
||||
|
|
|
@ -30,6 +30,9 @@ extern QuickTexHashFunc DoQuickTexHash;
|
|||
typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
|
||||
extern UnswizzleTex16Func DoUnswizzleTex16;
|
||||
|
||||
typedef u32 (*ReliableHashFunc)(const void *input, int len, u32 seed);
|
||||
extern ReliableHashFunc DoReliableHash;
|
||||
|
||||
// All these DXT structs are in the reverse order, as compared to PC.
|
||||
// On PC, alpha comes before color, and interpolants are before the tile data.
|
||||
|
||||
|
|
|
@ -146,3 +146,100 @@ void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 p
|
|||
ydestp += (rowWidth * 8) / 4;
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: This is just a NEON version of xxhash.
|
||||
// GCC sucks at making things NEON and can't seem to handle it.
|
||||
|
||||
#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
|
||||
# include <stdint.h>
|
||||
typedef uint8_t BYTE;
|
||||
typedef uint16_t U16;
|
||||
typedef uint32_t U32;
|
||||
typedef int32_t S32;
|
||||
typedef uint64_t U64;
|
||||
#else
|
||||
typedef unsigned char BYTE;
|
||||
typedef unsigned short U16;
|
||||
typedef unsigned int U32;
|
||||
typedef signed int S32;
|
||||
typedef unsigned long long U64;
|
||||
#endif
|
||||
|
||||
#define PRIME32_1 2654435761U
|
||||
#define PRIME32_2 2246822519U
|
||||
#define PRIME32_3 3266489917U
|
||||
#define PRIME32_4 668265263U
|
||||
#define PRIME32_5 374761393U
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
# define XXH_rotl32(x,r) _rotl(x,r)
|
||||
#else
|
||||
# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
|
||||
#endif
|
||||
|
||||
u32 ReliableHashNEON(const void *input, int len, u32 seed) {
|
||||
const u8 *p = (const u8 *)input;
|
||||
const u8 *const bEnd = p + len;
|
||||
U32 h32;
|
||||
|
||||
#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
|
||||
if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
|
||||
#endif
|
||||
|
||||
if (len>=16)
|
||||
{
|
||||
const BYTE* const limit = bEnd - 16;
|
||||
U32 v1 = seed + PRIME32_1 + PRIME32_2;
|
||||
U32 v2 = seed + PRIME32_2;
|
||||
U32 v3 = seed + 0;
|
||||
U32 v4 = seed - PRIME32_1;
|
||||
|
||||
uint32x4_t prime32_1q = vdupq_n_u32(PRIME32_1);
|
||||
uint32x4_t prime32_2q = vdupq_n_u32(PRIME32_2);
|
||||
uint32x4_t vq = vcombine_u32(vcreate_u32(v1 | ((U64)v2 << 32)), vcreate_u32(v3 | ((U64)v4 << 32)));
|
||||
|
||||
do
|
||||
{
|
||||
__builtin_prefetch(p + 0xc0, 0, 0);
|
||||
vq = vmlaq_u32(vq, vld1q_u32((const U32*)p), prime32_2q);
|
||||
vq = vorrq_u32(vshlq_n_u32(vq, 13), vshrq_n_u32(vq, 32 - 13));
|
||||
p += 16;
|
||||
vq = vmulq_u32(vq, prime32_1q);
|
||||
} while (p<=limit);
|
||||
|
||||
v1 = vgetq_lane_u32(vq, 0);
|
||||
v2 = vgetq_lane_u32(vq, 1);
|
||||
v3 = vgetq_lane_u32(vq, 2);
|
||||
v4 = vgetq_lane_u32(vq, 3);
|
||||
|
||||
h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
|
||||
}
|
||||
else
|
||||
{
|
||||
h32 = seed + PRIME32_5;
|
||||
}
|
||||
|
||||
h32 += (U32) len;
|
||||
|
||||
while (p<=bEnd-4)
|
||||
{
|
||||
h32 += *(const U32*)p * PRIME32_3;
|
||||
h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
|
||||
p+=4;
|
||||
}
|
||||
|
||||
while (p<bEnd)
|
||||
{
|
||||
h32 += (*p) * PRIME32_5;
|
||||
h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
|
||||
p++;
|
||||
}
|
||||
|
||||
h32 ^= h32 >> 15;
|
||||
h32 *= PRIME32_2;
|
||||
h32 ^= h32 >> 13;
|
||||
h32 *= PRIME32_3;
|
||||
h32 ^= h32 >> 16;
|
||||
|
||||
return h32;
|
||||
}
|
||||
|
|
|
@ -19,3 +19,4 @@
|
|||
|
||||
u32 QuickTexHashNEON(const void *checkp, u32 size);
|
||||
void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
|
||||
u32 ReliableHashNEON(const void *input, int len, u32 seed);
|
||||
|
|
|
@ -698,7 +698,7 @@ void TextureCacheDX9::UpdateCurrentClut() {
|
|||
// If not, we're going to hash random data, which hopefully doesn't cause a performance issue.
|
||||
const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;
|
||||
|
||||
clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
|
||||
clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
|
||||
|
||||
/*
|
||||
// Avoid a copy when we don't need to convert colors.
|
||||
|
|
|
@ -26,12 +26,12 @@
|
|||
#include "Core/CoreTiming.h"
|
||||
|
||||
#include "helper/dx_state.h"
|
||||
#include "ext/xxhash.h"
|
||||
|
||||
#include "GPU/Math3D.h"
|
||||
#include "GPU/GPUState.h"
|
||||
#include "GPU/ge_constants.h"
|
||||
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
#include "GPU/Directx9/StateMappingDX9.h"
|
||||
#include "GPU/Directx9/TextureCacheDX9.h"
|
||||
#include "GPU/Directx9/TransformPipelineDX9.h"
|
||||
|
@ -1042,7 +1042,7 @@ u32 TransformDrawEngineDX9::ComputeHash() {
|
|||
for (int i = 0; i < numDrawCalls; i++) {
|
||||
if (!drawCalls[i].inds) {
|
||||
vertexCount = std::min((int)drawCalls[i].vertexCount, 500);
|
||||
fullhash += XXH32((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4);
|
||||
fullhash += DoReliableHash((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4);
|
||||
} else {
|
||||
|
||||
vertexCount = std::min((int)drawCalls[i].vertexCount, 500);
|
||||
|
@ -1050,10 +1050,10 @@ u32 TransformDrawEngineDX9::ComputeHash() {
|
|||
|
||||
// This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way
|
||||
// we do when drawing.
|
||||
fullhash += XXH32((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound,
|
||||
fullhash += DoReliableHash((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound,
|
||||
vertexSize * indicesCount, 0x029F3EE1);
|
||||
int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1;
|
||||
fullhash += XXH32((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA);
|
||||
fullhash += DoReliableHash((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -734,7 +734,7 @@ void TextureCache::UpdateCurrentClut() {
|
|||
// If not, we're going to hash random data, which hopefully doesn't cause a performance issue.
|
||||
const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;
|
||||
|
||||
clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
|
||||
clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
|
||||
|
||||
// Avoid a copy when we don't need to convert colors.
|
||||
if (clutFormat != GE_CMODE_32BIT_ABGR8888) {
|
||||
|
|
|
@ -73,12 +73,12 @@
|
|||
#include "Core/CoreTiming.h"
|
||||
|
||||
#include "native/gfx_es2/gl_state.h"
|
||||
#include "ext/xxhash.h"
|
||||
|
||||
#include "GPU/Math3D.h"
|
||||
#include "GPU/GPUState.h"
|
||||
#include "GPU/ge_constants.h"
|
||||
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
#include "GPU/Common/SplineCommon.h"
|
||||
#include "GPU/GLES/StateMapping.h"
|
||||
#include "GPU/GLES/TextureCache.h"
|
||||
|
@ -86,7 +86,6 @@
|
|||
#include "GPU/GLES/VertexDecoder.h"
|
||||
#include "GPU/GLES/ShaderManager.h"
|
||||
#include "GPU/GLES/GLES_GPU.h"
|
||||
#include "GPU/Common/SplineCommon.h"
|
||||
|
||||
extern const GLuint glprim[8] = {
|
||||
GL_POINTS,
|
||||
|
@ -454,7 +453,7 @@ u32 TransformDrawEngine::ComputeHash() {
|
|||
for (int i = 0; i < numDrawCalls; i++) {
|
||||
const DeferredDrawCall &dc = drawCalls[i];
|
||||
if (!dc.inds) {
|
||||
fullhash += XXH32((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4);
|
||||
fullhash += DoReliableHash((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4);
|
||||
} else {
|
||||
int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound;
|
||||
int j = i + 1;
|
||||
|
@ -469,16 +468,16 @@ u32 TransformDrawEngine::ComputeHash() {
|
|||
}
|
||||
// This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way
|
||||
// we do when drawing.
|
||||
fullhash += XXH32((const char *)dc.verts + vertexSize * indexLowerBound,
|
||||
fullhash += DoReliableHash((const char *)dc.verts + vertexSize * indexLowerBound,
|
||||
vertexSize * (indexUpperBound - indexLowerBound), 0x029F3EE1);
|
||||
int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1;
|
||||
// Hm, we will miss some indices when combining above, but meh, it should be fine.
|
||||
fullhash += XXH32((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA);
|
||||
fullhash += DoReliableHash((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA);
|
||||
i = lastMatch;
|
||||
}
|
||||
}
|
||||
if (uvScale) {
|
||||
fullhash += XXH32(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
|
||||
fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
|
||||
}
|
||||
|
||||
return fullhash;
|
||||
|
|
Loading…
Add table
Reference in a new issue