Add a NEON-optimized version of XXH32.

This takes at least 40% less time to hash on NEON/ARM devices.
2025-04-02 11:01:50 -04:00 · 2014-03-25 00:21:04 -07:00 · 2014-03-25 00:21:04 -07:00 · b800762ceb
commit b800762ceb
parent 3ec61274fa
8 changed files with 118 additions and 12 deletions
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@ -15,6 +15,7 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

+#include "ext/xxhash.h"
 #include "Common/CPUDetect.h"
 #include "GPU/Common/TextureDecoder.h"
 // NEON is in a separate file so that it can be compiled with a runtime check.
@ -152,6 +153,7 @@ void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32

 QuickTexHashFunc DoQuickTexHash = &QuickTexHashBasic;
 UnswizzleTex16Func DoUnswizzleTex16 = &DoUnswizzleTex16Basic;
+ReliableHashFunc DoReliableHash = &XXH32;

 // This has to be done after CPUDetect has done its magic.
 void SetupTextureDecoder() {
@ -159,6 +161,10 @@ void SetupTextureDecoder() {
 	if (cpu_info.bNEON) {
 		DoQuickTexHash = &QuickTexHashNEON;
 		DoUnswizzleTex16 = &DoUnswizzleTex16NEON;
+#ifndef IOS
+		// Not sure if this is safe on iOS, it's had issues with xxhash.
+		DoReliableHash = &ReliableHashNEON;
+#endif
 	}
 #elif _M_SSE
 	if (cpu_info.bSSE2) {
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@ -30,6 +30,9 @@ extern QuickTexHashFunc DoQuickTexHash;
 typedef void (*UnswizzleTex16Func)(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
 extern UnswizzleTex16Func DoUnswizzleTex16;

+typedef u32 (*ReliableHashFunc)(const void *input, int len, u32 seed);
+extern ReliableHashFunc DoReliableHash;
+
 // All these DXT structs are in the reverse order, as compared to PC.
 // On PC, alpha comes before color, and interpolants are before the tile data.

--- a/GPU/Common/TextureDecoderNEON.cpp
+++ b/GPU/Common/TextureDecoderNEON.cpp
@ -146,3 +146,100 @@ void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 p
 		ydestp += (rowWidth * 8) / 4;
 	}
 }
+
+// NOTE: This is just a NEON version of xxhash.
+// GCC sucks at making things NEON and can't seem to handle it.
+
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   // C99
+# include <stdint.h>
+  typedef uint8_t  BYTE;
+  typedef uint16_t U16;
+  typedef uint32_t U32;
+  typedef  int32_t S32;
+  typedef uint64_t U64;
+#else
+  typedef unsigned char      BYTE;
+  typedef unsigned short     U16;
+  typedef unsigned int       U32;
+  typedef   signed int       S32;
+  typedef unsigned long long U64;
+#endif
+
+#define PRIME32_1   2654435761U
+#define PRIME32_2   2246822519U
+#define PRIME32_3   3266489917U
+#define PRIME32_4    668265263U
+#define PRIME32_5    374761393U
+
+#if defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#else
+#  define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+
+u32 ReliableHashNEON(const void *input, int len, u32 seed) {
+	const u8 *p = (const u8 *)input;
+	const u8 *const bEnd = p + len;
+	U32 h32;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+	if (p==NULL) { len=0; p=(const BYTE*)(size_t)16; }
+#endif
+
+	if (len>=16)
+	{
+		const BYTE* const limit = bEnd - 16;
+		U32 v1 = seed + PRIME32_1 + PRIME32_2;
+		U32 v2 = seed + PRIME32_2;
+		U32 v3 = seed + 0;
+		U32 v4 = seed - PRIME32_1;
+
+		uint32x4_t prime32_1q = vdupq_n_u32(PRIME32_1);
+		uint32x4_t prime32_2q = vdupq_n_u32(PRIME32_2);
+		uint32x4_t vq = vcombine_u32(vcreate_u32(v1 | ((U64)v2 << 32)), vcreate_u32(v3 | ((U64)v4 << 32)));
+
+		do
+		{
+			__builtin_prefetch(p + 0xc0, 0, 0);
+			vq = vmlaq_u32(vq, vld1q_u32((const U32*)p), prime32_2q);
+			vq = vorrq_u32(vshlq_n_u32(vq, 13), vshrq_n_u32(vq, 32 - 13));
+			p += 16;
+			vq = vmulq_u32(vq, prime32_1q);
+		} while (p<=limit);
+
+		v1 = vgetq_lane_u32(vq, 0);
+		v2 = vgetq_lane_u32(vq, 1);
+		v3 = vgetq_lane_u32(vq, 2);
+		v4 = vgetq_lane_u32(vq, 3);
+
+		h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+	}
+	else
+	{
+		h32  = seed + PRIME32_5;
+	}
+
+	h32 += (U32) len;
+
+	while (p<=bEnd-4)
+	{
+		h32 += *(const U32*)p * PRIME32_3;
+		h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+		p+=4;
+	}
+
+	while (p<bEnd)
+	{
+		h32 += (*p) * PRIME32_5;
+		h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+		p++;
+	}
+
+	h32 ^= h32 >> 15;
+	h32 *= PRIME32_2;
+	h32 ^= h32 >> 13;
+	h32 *= PRIME32_3;
+	h32 ^= h32 >> 16;
+
+	return h32;
+}
--- a/GPU/Common/TextureDecoderNEON.h
+++ b/GPU/Common/TextureDecoderNEON.h
@ -19,3 +19,4 @@

 u32 QuickTexHashNEON(const void *checkp, u32 size);
 void DoUnswizzleTex16NEON(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth);
+u32 ReliableHashNEON(const void *input, int len, u32 seed);
--- a/GPU/Directx9/TextureCacheDX9.cpp
+++ b/GPU/Directx9/TextureCacheDX9.cpp
@ -698,7 +698,7 @@ void TextureCacheDX9::UpdateCurrentClut() {
 	// If not, we're going to hash random data, which hopefully doesn't cause a performance issue.
 	const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;

-	clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
+	clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
 	
 	/*
 	// Avoid a copy when we don't need to convert colors.
--- a/GPU/Directx9/TransformPipelineDX9.cpp
+++ b/GPU/Directx9/TransformPipelineDX9.cpp
@ -26,12 +26,12 @@
 #include "Core/CoreTiming.h"

 #include "helper/dx_state.h"
-#include "ext/xxhash.h"

 #include "GPU/Math3D.h"
 #include "GPU/GPUState.h"
 #include "GPU/ge_constants.h"

+#include "GPU/Common/TextureDecoder.h"
 #include "GPU/Directx9/StateMappingDX9.h"
 #include "GPU/Directx9/TextureCacheDX9.h"
 #include "GPU/Directx9/TransformPipelineDX9.h"
@ -1042,7 +1042,7 @@ u32 TransformDrawEngineDX9::ComputeHash() {
 	for (int i = 0; i < numDrawCalls; i++) {
 		if (!drawCalls[i].inds) {
 			vertexCount = std::min((int)drawCalls[i].vertexCount, 500);
-			fullhash += XXH32((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4);
+			fullhash += DoReliableHash((const char *)drawCalls[i].verts, vertexSize * vertexCount, 0x1DE8CAC4);
 		} else {
 			
 			vertexCount = std::min((int)drawCalls[i].vertexCount, 500);
@ -1050,10 +1050,10 @@ u32 TransformDrawEngineDX9::ComputeHash() {

 			// This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way
 			// we do when drawing.
-			fullhash += XXH32((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound,
+			fullhash += DoReliableHash((const char *)drawCalls[i].verts + vertexSize * drawCalls[i].indexLowerBound,
 				vertexSize * indicesCount, 0x029F3EE1);
 			int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1;
-			fullhash += XXH32((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA);
+			fullhash += DoReliableHash((const char *)drawCalls[i].inds, indexSize * vertexCount, 0x955FD1CA);
 		}
 	}

--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@ -734,7 +734,7 @@ void TextureCache::UpdateCurrentClut() {
 	// If not, we're going to hash random data, which hopefully doesn't cause a performance issue.
 	const u32 clutExtendedBytes = clutTotalBytes_ + clutBaseBytes;

-	clutHash_ = XXH32((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);
+	clutHash_ = DoReliableHash((const char *)clutBufRaw_, clutExtendedBytes, 0xC0108888);

 	// Avoid a copy when we don't need to convert colors.
 	if (clutFormat != GE_CMODE_32BIT_ABGR8888) {
--- a/GPU/GLES/TransformPipeline.cpp
+++ b/GPU/GLES/TransformPipeline.cpp
@ -73,12 +73,12 @@
 #include "Core/CoreTiming.h"

 #include "native/gfx_es2/gl_state.h"
-#include "ext/xxhash.h"

 #include "GPU/Math3D.h"
 #include "GPU/GPUState.h"
 #include "GPU/ge_constants.h"

+#include "GPU/Common/TextureDecoder.h"
 #include "GPU/Common/SplineCommon.h"
 #include "GPU/GLES/StateMapping.h"
 #include "GPU/GLES/TextureCache.h"
@ -86,7 +86,6 @@
 #include "GPU/GLES/VertexDecoder.h"
 #include "GPU/GLES/ShaderManager.h"
 #include "GPU/GLES/GLES_GPU.h"
-#include "GPU/Common/SplineCommon.h"

 extern const GLuint glprim[8] = {
 	GL_POINTS,
@ -454,7 +453,7 @@ u32 TransformDrawEngine::ComputeHash() {
 	for (int i = 0; i < numDrawCalls; i++) {
 		const DeferredDrawCall &dc = drawCalls[i];
 		if (!dc.inds) {
-			fullhash += XXH32((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4);
+			fullhash += DoReliableHash((const char *)dc.verts, vertexSize * dc.vertexCount, 0x1DE8CAC4);
 		} else {
 			int indexLowerBound = dc.indexLowerBound, indexUpperBound = dc.indexUpperBound;
 			int j = i + 1;
@ -469,16 +468,16 @@ u32 TransformDrawEngine::ComputeHash() {
 			}
 			// This could get seriously expensive with sparse indices. Need to combine hashing ranges the same way
 			// we do when drawing.
-			fullhash += XXH32((const char *)dc.verts + vertexSize * indexLowerBound,
+			fullhash += DoReliableHash((const char *)dc.verts + vertexSize * indexLowerBound,
 				vertexSize * (indexUpperBound - indexLowerBound), 0x029F3EE1);
 			int indexSize = (dec_->VertexType() & GE_VTYPE_IDX_MASK) == GE_VTYPE_IDX_16BIT ? 2 : 1;
 			// Hm, we will miss some indices when combining above, but meh, it should be fine.
-			fullhash += XXH32((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA);
+			fullhash += DoReliableHash((const char *)dc.inds, indexSize * dc.vertexCount, 0x955FD1CA);
 			i = lastMatch;
 		}
 	}
 	if (uvScale) {
-		fullhash += XXH32(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
+		fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
 	}

 	return fullhash;