From c92b3b6521cc0a6976ec2a3c0bf918cab0f5e62f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Fri, 20 Dec 2024 08:48:16 +0100
Subject: [PATCH] Move prototype cross simd wrapper structs to CrossSIMD.h

---
 Common/Data/Convert/ColorConv.cpp |  4 +-
 Common/Math/CrossSIMD.h           | 96 +++++++++++++++++++++++++++++++
 Common/Math/SIMDHeaders.h         | 22 +++++++
 Common/Math/fast/fast_matrix.c    |  2 -
 GPU/Common/DepthRaster.cpp        | 77 -------------------------
 5 files changed, 120 insertions(+), 81 deletions(-)

diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp
index 72fac52f2f..5c4df7fca8 100644
--- a/Common/Data/Convert/ColorConv.cpp
+++ b/Common/Data/Convert/ColorConv.cpp
@@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 }
 
 #if PPSSPP_ARCH(SSE2)
-// fp64's improved version, see #19751
+// fp64's improved SSE2 version, see #19751. SSE4 is no longer required here.
 static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
 		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c1 = _mm_load_si128(&srcp[i + 1]);
 
-		__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000
+		__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
 		__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
 		__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
 		__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
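For reference, the bit layout the SSE2 path above produces can be written as a scalar per-pixel sketch. This is only an illustration, not part of the patch; the helper name is hypothetical, and it assumes the same little-endian RGBA8888 input (R in bits 0-7, A in bits 24-31) and RGBA5551 output (R in bits 0-4, A in bit 15) that ConvertRGBA8888ToRGBA5551 operates on:

#include <cstdint>

// Hypothetical scalar equivalent of one 32-bit lane of the SSE2 conversion above.
static inline uint16_t PackRGBA8888ToRGBA5551(uint32_t c) {
	uint16_t r = (c >> 3) & 0x1F;            // top 5 bits of R (input bits 3-7, kept by maskRB)
	uint16_t g = ((c >> 11) & 0x1F) << 5;    // top 5 bits of G (input bits 11-15, kept by maskGA)
	uint16_t b = ((c >> 19) & 0x1F) << 10;   // top 5 bits of B (input bits 19-23, kept by maskRB)
	uint16_t a = (uint16_t)(c >> 31) << 15;  // top bit of A (input bit 31, kept by maskGA)
	return (uint16_t)(r | g | b | a);
}
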
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 94b2d3933b..11ed217024 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -5,3 +5,99 @@
 #pragma once
 
 #include "Common/Math/SIMDHeaders.h"
+
+#if PPSSPP_ARCH(SSE2)
+
+struct Vec4S32 {
+	__m128i v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ _mm_add_epi32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ _mm_sub_epi32(v, other.v) };
+	}
+	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };  // (ab3,ab2,ab1,ab0)
+	}
+};
+
+struct Vec4F32 {
+	__m128 v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ _mm_add_ps(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ _mm_sub_ps(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ _mm_mul_ps(v, other.v) };
+	}
+};
+
+struct Vec4U16 {
+	__m128i v;  // We only use the lower 64 bits.
+	static Vec4U16 Load(const void *mem) {
+		return Vec4U16{ _mm_loadl_epi64((const __m128i *)mem) };
+	}
+	void Store(void *mem) {
+		_mm_storel_epi64((__m128i *)mem, v);
+	}
+	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
+		return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
+	}
+	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
+		return Vec4U16{ _mm_min_epu16_SSE2(a.v, b.v) };
+	}
+	Vec4U16 CompareLT(Vec4U16 other) const {
+		return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
+	}
+};
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+struct Vec4S32 {
+	int32x4_t v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ vaddq_s32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ vsubq_s32(v, other.v) };
+	}
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ vmulq_s32(v, other.v) };
+	}
+};
+
+struct Vec4F32 {
+	float32x4_t v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ vcvtq_f32_s32(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ vaddq_f32(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ vsubq_f32(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ vmulq_f32(v, other.v) };
+	}
+};
+
+#else
+
+struct Vec4S32 {
+	s32 v[4];
+};
+
+#endif
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 8e812a7819..3f8500dfb2 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
 }
 
+// The below are not real SSE instructions in any SSE generation, but arguably should have been; they're emulated here.
+
+// Return 0xFFFF where x <= y, else 0x0000.
+inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
+	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
+}
+
+// Return 0xFFFF where x >= y, else 0x0000.
+inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
+	return _mm_cmple_epu16(y, x);
+}
+
+// Return 0xFFFF where x > y, else 0x0000.
+inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
+	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
+}
+
+// Return 0xFFFF where x < y, else 0x0000.
+inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
+	return _mm_cmpgt_epu16(y, x);
+}
+
 #endif
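The trick behind _mm_cmple_epu16 above is that unsigned saturating subtraction yields zero exactly when x <= y in a lane, and comparing against zero with the signed _mm_cmpeq_epi16 is pure bit-pattern equality, so signedness doesn't matter. A minimal self-contained sketch to check the emulation against scalar comparisons; the local name cmple_epu16 just repeats the wrapper above so the snippet builds on its own, and main() is illustration only:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

// Repeats the emulated compare from SIMDHeaders.h: unsigned saturating
// subtract is 0 in a 16-bit lane exactly when x <= y in that lane.
static inline __m128i cmple_epu16(__m128i x, __m128i y) {
	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}

int main() {
	const uint16_t xs[8] = { 0, 1, 0x8000, 0xFFFF, 7, 7, 0x1234, 0xFFFE };
	const uint16_t ys[8] = { 0, 0, 0x7FFF, 0xFFFF, 8, 6, 0x1234, 0xFFFF };
	__m128i x = _mm_loadu_si128((const __m128i *)xs);
	__m128i y = _mm_loadu_si128((const __m128i *)ys);
	uint16_t lanes[8];
	_mm_storeu_si128((__m128i *)lanes, cmple_epu16(x, y));
	for (int i = 0; i < 8; i++) {
		uint16_t expected = xs[i] <= ys[i] ? 0xFFFF : 0x0000;
		printf("%04X <= %04X -> %04X (expected %04X)\n", xs[i], ys[i], lanes[i], expected);
	}
	return 0;
}
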
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index 0402f36629..d23ce3b0e0 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -6,8 +6,6 @@
 
 #if PPSSPP_ARCH(SSE2)
 
-#include "fast_matrix.h"
-
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 2bb83831bb..8068df066f 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -8,83 +8,6 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
-#if PPSSPP_ARCH(SSE2)
-
-struct Vec4S32 {
-	__m128i v;
-
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ _mm_add_epi32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ _mm_sub_epi32(v, other.v) };
-	}
-	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };  // (ab3,ab2,ab1,ab0)
-	}
-};
-
-struct Vec4F32 {
-	__m128 v;
-
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
-
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ _mm_add_ps(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ _mm_sub_ps(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ _mm_mul_ps(v, other.v) };
-	}
-};
-
-#elif PPSSPP_ARCH(ARM_NEON)
-
-struct Vec4S32 {
-	uint32x4_t v;
-
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ vaddq_s32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ vsubq_s32(v, other.v) };
-	}
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ vmulq_s32(v, other.v) };
-	}
-};
-
-struct Vec4F32 {
-	float32x4_t v;
-
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
-
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ vaddq_f32(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ vsubq_f32(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ vmulq_f32(v, other.v) };
-	}
-};
-
-#else
-
-struct Vec4S32 {
-	s32 v[4];
-};
-
-#endif
-
 struct ScreenVert {
 	int x;
 	int y;
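To show what the move buys, here is a minimal sketch of code written once against the wrappers now living in CrossSIMD.h, compiling to SSE2 or NEON depending on the target. The helper names are hypothetical and not part of the patch, and the sketch assumes an SSE2 or NEON build, since the scalar #else fallback is still a stub without operators:

#include "Common/Math/CrossSIMD.h"

// Hypothetical helpers using only the operations the patch defines.
static Vec4F32 MulAdd(Vec4F32 a, Vec4F32 b, Vec4F32 c) {
	return a * b + c;  // maps to _mm_mul_ps/_mm_add_ps on SSE2, vmulq_f32/vaddq_f32 on NEON
}

// E.g. converting fixed-point integer coordinates to scaled floats.
static Vec4F32 FixedToFloat(Vec4S32 fixed, Vec4F32 scale) {
	return Vec4F32::FromVec4S32(fixed) * scale;
}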