Move prototype cross simd wrapper structs to CrossSIMD.h

Henrik Rydgård 2024-12-20 08:48:16 +01:00
parent 72c954d8c3
commit c92b3b6521
5 changed files with 120 additions and 81 deletions

View file

@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
}
#if PPSSPP_ARCH(SSE2)
// fp64's improved version, see #19751
// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
__m128i c0 = _mm_load_si128(&srcp[i + 0]);
__m128i c1 = _mm_load_si128(&srcp[i + 1]);
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
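For reference, the same packing written as a plain scalar helper (a sketch for illustration, not part of the commit; PSP-style RGBA5551 keeps R in the low 5 bits and A in bit 15):

static inline u16 ConvertRGBA8888ToRGBA5551Scalar(u32 c) {
    u16 r = (c >> 3) & 0x1F;   // top 5 bits of the R byte
    u16 g = (c >> 11) & 0x1F;  // top 5 bits of the G byte
    u16 b = (c >> 19) & 0x1F;  // top 5 bits of the B byte
    u16 a = (c >> 31) & 0x1;   // top bit of the A byte
    return (u16)(r | (g << 5) | (b << 10) | (a << 15));
}

The SSE2 version does the same thing several pixels at a time: maskRB and maskGA isolate those top bits per 32-bit lane, and the rest of the function merges them into 16-bit lanes.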

View file

@ -5,3 +5,99 @@
#pragma once
#include "Common/Math/SIMDHeaders.h"
#if PPSSPP_ARCH(SSE2)
struct Vec4S32 {
__m128i v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ _mm_add_epi32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ _mm_sub_epi32(v, other.v) };
}
// NOTE: When not compiling with SSE4 support, this falls back to a slow SSE2 emulation.
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
}
};
struct Vec4F32 {
__m128 v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ _mm_cvtepi32_ps(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ _mm_add_ps(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ _mm_sub_ps(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ _mm_mul_ps(v, other.v) };
}
};
struct Vec4U16 {
__m128i v; // we only use the lower 64 bits.
static Vec4U16 Load(void *mem) {
return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) };
}
void Store(void *mem) {
_mm_storel_epi64((__m128i *)mem, v);
}
static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
}
static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
return Vec4U16{ _mm_min_epu16_SSE2(a.v, b.v) };
}
Vec4U16 CompareLT(Vec4U16 other) const {
return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
}
};
#elif PPSSPP_ARCH(ARM_NEON)
struct Vec4S32 {
int32x4_t v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ vaddq_s32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ vsubq_s32(v, other.v) };
}
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ vmulq_s32(v, other.v) };
}
};
struct Vec4F32 {
float32x4_t v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ vcvtq_f32_s32(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ vaddq_f32(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ vsubq_f32(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ vmulq_f32(v, other.v) };
}
};
#else
struct Vec4S32 {
s32 v[4];
};
#endif
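A quick usage sketch of the new wrappers (hypothetical code, not in the commit; construction goes through the public .v member, since this prototype only defines Load/Store for Vec4U16). Shown for the SSE2 path:

#if PPSSPP_ARCH(SSE2)
Vec4S32 a{ _mm_set_epi32(4, 3, 2, 1) };
Vec4S32 b{ _mm_set_epi32(8, 7, 6, 5) };
Vec4S32 sum = a + b;                    // no raw intrinsics at the call site
Vec4F32 f = Vec4F32::FromVec4S32(sum);  // convert lanes to float
Vec4F32 sq = f * f;                     // _mm_mul_ps under the hood
#endif

The three arithmetic lines are what the wrappers buy: they compile unchanged on NEON; only the construction here is arch-specific.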

View file

@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
}
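Vec4S32::operator* in CrossSIMD.h leans on _mm_mullo_epi32_SSE2, presumably defined earlier in this header. For context, the usual SSE2 emulation of a per-lane 32-bit multiply looks like this (a sketch of the standard technique, not necessarily this header's exact body):

static inline __m128i mullo_epi32_sse2_sketch(__m128i a, __m128i b) {
    __m128i even = _mm_mul_epu32(a, b);  // 64-bit products of lanes 0 and 2
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));  // lanes 1 and 3
    // Keep the low 32 bits of each product and re-interleave into lane order 0,1,2,3.
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                              _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0)));
}

The low 32 bits of a product are the same for signed and unsigned inputs, which is why _mm_mul_epu32 works here for s32 lanes.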
// None of the below are real SSE instructions in any generation, but they arguably should have been; they're emulated here with SSE2.
// Return 0xFFFF where x <= y, else 0x0000.
inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}
// Return 0xFFFF where x >= y, else 0x0000.
inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
return _mm_cmple_epu16(y, x);
}
// Return 0xFFFF where x > y, else 0x0000.
inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
}
// Return 0xFFFF where x < y, else 0x0000.
inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
return _mm_cmpgt_epu16(y, x);
}
#endif
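The trick behind all four compares: _mm_subs_epu16 is a saturating subtract, so x - y clamps to zero exactly when x <= y in an unsigned lane. A scalar model of one lane (illustration only, not part of the commit):

static inline u16 CmpLE_U16_Scalar(u16 x, u16 y) {
    u16 diff = (x > y) ? (u16)(x - y) : 0;  // models _mm_subs_epu16
    return (diff == 0) ? 0xFFFF : 0x0000;   // models _mm_cmpeq_epi16 against zero
}

Vec4U16::CompareLT in CrossSIMD.h is built directly on this chain via _mm_cmplt_epu16.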

View file

@ -6,8 +6,6 @@
#if PPSSPP_ARCH(SSE2)
#include "fast_matrix.h"
void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
int i;
__m128 a_col_1 = _mm_loadu_ps(a);
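For context, fast_matrix_mul_4x4_sse follows the standard column-major pattern: each output column is a linear combination of a's columns, weighted by the scalars of b's corresponding column. A self-contained sketch of that pattern (an assumption about the layout; not the verbatim function body, in which the removed int i; was evidently unused):

#include <xmmintrin.h>

static void Mat4MulSSE_Sketch(float *dest, const float *a, const float *b) {
    __m128 a_col_1 = _mm_loadu_ps(a);       // columns of a
    __m128 a_col_2 = _mm_loadu_ps(a + 4);
    __m128 a_col_3 = _mm_loadu_ps(a + 8);
    __m128 a_col_4 = _mm_loadu_ps(a + 12);
    for (int j = 0; j < 4; j++) {
        __m128 r = _mm_mul_ps(a_col_1, _mm_set1_ps(b[j * 4 + 0]));
        r = _mm_add_ps(r, _mm_mul_ps(a_col_2, _mm_set1_ps(b[j * 4 + 1])));
        r = _mm_add_ps(r, _mm_mul_ps(a_col_3, _mm_set1_ps(b[j * 4 + 2])));
        r = _mm_add_ps(r, _mm_mul_ps(a_col_4, _mm_set1_ps(b[j * 4 + 3])));
        _mm_storeu_ps(dest + j * 4, r);
    }
}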

View file

@ -8,83 +8,6 @@
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"
#if PPSSPP_ARCH(SSE2)
struct Vec4S32 {
__m128i v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ _mm_add_epi32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ _mm_sub_epi32(v, other.v) };
}
// NOTE: When not compiling with SSE4 support, this falls back to a slow SSE2 emulation.
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
}
};
struct Vec4F32 {
__m128 v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ _mm_cvtepi32_ps(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ _mm_add_ps(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ _mm_sub_ps(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ _mm_mul_ps(v, other.v) };
}
};
#elif PPSSPP_ARCH(ARM_NEON)
struct Vec4S32 {
int32x4_t v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ vaddq_s32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ vsubq_s32(v, other.v) };
}
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ vmulq_s32(v, other.v) };
}
};
struct Vec4F32 {
float32x4_t v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ vcvtq_f32_s32(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ vaddq_f32(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ vsubq_f32(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ vmulq_f32(v, other.v) };
}
};
#else
struct Vec4S32 {
s32 v[4];
};
#endif
struct ScreenVert {
int x;
int y;