diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index fd30f65b6e..875c7c9c38 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -15,6 +15,7 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "ext/xxhash.h" #include "Common/CPUDetect.h" #include "GPU/Common/TextureDecoder.h" // NEON is in a separate file so that it can be compiled with a runtime check. @@ -152,6 +153,7 @@ void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic; UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic; +ReliableHashFunc DoReliableHash = &XXH32; // This has to be done after CPUDetect has done its magic. void SetupTextureDecoder() { @@ -159,6 +161,10 @@ void SetupTextureDecoder() { if (cpu_info.bNEON) { DoQuickTexHash = &QuickTexHashNEON; DoUnswizzleTex16 = &DoUnswizzleTex16NEON; +#ifndef IOS + // Not sure if this is safe on iOS, it's had issues with xxhash. + DoReliableHash = &ReliableHashNEON; +#endif } #elif _M_SSE if (cpu_info.bSSE2) { diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index df8756789f..669c5ff72d 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -30,6 +30,9 @@ extern QuickTexHashFunc DoQuickTexHash; typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth); extern UnswizzleTex16Func DoUnswizzleTex16; +typedef u32 (*ReliableHashFunc)(const void *input, int len, u32 seed); +extern ReliableHashFunc DoReliableHash; + // All these DXT structs are in the reverse order, as compared to PC. // On PC, alpha comes before color, and interpolants are before the tile data. diff --git a/GPU/Common/TextureDecoderNEON.cpp b/GPU/Common/TextureDecoderNEON.cpp index a8da56baa5..6c6bbf7101 100644 --- a/GPU/Common/TextureDecoderNEON.cpp +++ b/GPU/Common/TextureDecoderNEON.cpp @@ -146,3 +146,100 @@ void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 p ydestp += (rowWidth * 8) / 4; } } + +// NOTE: This is just a NEON version of xxhash. +// GCC sucks at making things NEON and can't seem to handle it. + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + +#define PRIME32_1 2654435761U +#define PRIME32_2 2246822519U +#define PRIME32_3 3266489917U +#define PRIME32_4 668265263U +#define PRIME32_5 374761393U + +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +#endif + +u32 ReliableHashNEON(const void *input, int len, u32 seed) { + const u8 *p = (const u8 *)input; + const u8 *const bEnd = p + len; + U32 h32; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; } +#endif + + if (len>=16) + { + const BYTE* const limit = bEnd - 16; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + uint32x4_t prime32_1q = vdupq_n_u32(PRIME32_1); + uint32x4_t prime32_2q = vdupq_n_u32(PRIME32_2); + uint32x4_t vq = vcombine_u32(vcreate_u32(v1 | ((U64)v2 << 32)), vcreate_u32(v3 | ((U64)v4 << 32))); + + do + { + __builtin_prefetch(p + 0xc0, 0, 0); + vq = vmlaq_u32(vq, vld1q_u32((const U32*)p), prime32_2q); + vq = vorrq_u32(vshlq_n_u32(vq, 13), vshrq_n_u32(vq, 32 - 13)); + p += 16; + vq = vmulq_u32(vq, prime32_1q); + } while (p<=limit); + + v1 = vgetq_lane_u32(vq, 0); + v2 = vgetq_lane_u32(vq, 1); + v3 = vgetq_lane_u32(vq, 2); + v4 = vgetq_lane_u32(vq, 3); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } + else + { + h32 = seed + PRIME32_5; + } + + h32 += (U32) len; + + while (p<=bEnd-4) + { + h32 += *(const U32*)p * PRIME32_3; + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + p+=4; + } + + while (p> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + + return h32; +} diff --git a/GPU/Common/TextureDecoderNEON.h b/GPU/Common/TextureDecoderNEON.h index 1f0ec94fe0..3cef806ac7 100644 --- a/GPU/Common/TextureDecoderNEON.h +++ b/GPU/Common/TextureDecoderNEON.h @@ -19,3 +19,4 @@ u32 QuickTexHashNEON(const void *checkp, u32 size); void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth); +u32 ReliableHashNEON(const void *input, int len, u32 seed); diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index dafe4c59f3..b7b5d37dfd 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -698,7 +698,7 @@ void TextureCacheDX9::UpdateCurrentClut() { // If not, we're going to hash random data, which hopefully doesn't cause a performance issue. const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes; - clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888); + clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888); /* // Avoid a copy when we don't need to convert colors. diff --git a/GPU/Directx9/TransformPipelineDX9.cpp b/GPU/Directx9/TransformPipelineDX9.cpp index d1512207be..bdb75b002a 100644 --- a/GPU/Directx9/TransformPipelineDX9.cpp +++ b/GPU/Directx9/TransformPipelineDX9.cpp @@ -26,12 +26,12 @@ #include "Core/CoreTiming.h" #include "helper/dx_state.h" -#include "ext/xxhash.h" #include "GPU/Math3D.h" #include "GPU/GPUState.h" #include "GPU/ge_constants.h" +#include "GPU/Common/TextureDecoder.h" #include "GPU/Directx9/StateMappingDX9.h" #include "GPU/Directx9/TextureCacheDX9.h" #include "GPU/Directx9/TransformPipelineDX9.h" @@ -1042,7 +1042,7 @@ u32 TransformDrawEngineDX9::ComputeHash() { for (int i = 0; i < numDrawCalls; i++) { if (!drawCalls[i].inds) { vertexCount = std::min((int)drawCalls[i].vertexCount, 500); - fullhash += XXH32((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4); + fullhash += DoReliableHash((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4); } else { vertexCount = std::min((int)drawCalls[i].vertexCount, 500); @@ -1050,10 +1050,10 @@ u32 TransformDrawEngineDX9::ComputeHash() { // This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way // we do when drawing. - fullhash += XXH32((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound, + fullhash += DoReliableHash((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound, vertexSize * indicesCount, 0x029F3EE1); int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1; - fullhash += XXH32((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA); + fullhash += DoReliableHash((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA); } } diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index 2386d22f56..2fdbcda3a9 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -734,7 +734,7 @@ void TextureCache::UpdateCurrentClut() { // If not, we're going to hash random data, which hopefully doesn't cause a performance issue. const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes; - clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888); + clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888); // Avoid a copy when we don't need to convert colors. if (clutFormat != GE_CMODE_32BIT_ABGR8888) { diff --git a/GPU/GLES/TransformPipeline.cpp b/GPU/GLES/TransformPipeline.cpp index 0b01d00a3a..691b744f2d 100644 --- a/GPU/GLES/TransformPipeline.cpp +++ b/GPU/GLES/TransformPipeline.cpp @@ -73,12 +73,12 @@ #include "Core/CoreTiming.h" #include "native/gfx_es2/gl_state.h" -#include "ext/xxhash.h" #include "GPU/Math3D.h" #include "GPU/GPUState.h" #include "GPU/ge_constants.h" +#include "GPU/Common/TextureDecoder.h" #include "GPU/Common/SplineCommon.h" #include "GPU/GLES/StateMapping.h" #include "GPU/GLES/TextureCache.h" @@ -86,7 +86,6 @@ #include "GPU/GLES/VertexDecoder.h" #include "GPU/GLES/ShaderManager.h" #include "GPU/GLES/GLES_GPU.h" -#include "GPU/Common/SplineCommon.h" extern const GLuint glprim[8] = { GL_POINTS, @@ -454,7 +453,7 @@ u32 TransformDrawEngine::ComputeHash() { for (int i = 0; i < numDrawCalls; i++) { const DeferredDrawCall &dc = drawCalls[i]; if (!dc.inds) { - fullhash += XXH32((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4); + fullhash += DoReliableHash((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4); } else { int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound; int j = i + 1; @@ -469,16 +468,16 @@ u32 TransformDrawEngine::ComputeHash() { } // This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way // we do when drawing. - fullhash += XXH32((const char *)dc.verts + vertexSize * indexLowerBound, + fullhash += DoReliableHash((const char *)dc.verts + vertexSize * indexLowerBound, vertexSize * (indexUpperBound - indexLowerBound), 0x029F3EE1); int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1; // Hm, we will miss some indices when combining above, but meh, it should be fine. - fullhash += XXH32((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA); + fullhash += DoReliableHash((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA); i = lastMatch; } } if (uvScale) { - fullhash += XXH32(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658); + fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658); } return fullhash;