From eec7853efe83c404d1906f5b87d24d27e98b030d Mon Sep 17 00:00:00 2001
From: Henrik Rydgård
Date: Sat, 28 Dec 2024 18:45:14 +0100
Subject: [PATCH] More SIMD: Add some matrix operations to CrossSIMD (#19773)

* More CrossSIMD functionality

* Use the new SIMD API for the matrix multiplies
---
 Common/Math/CrossSIMD.h         | 208 ++++++++++++++++++++++++++++++++
 Common/Math/SIMDHeaders.h       |   4 +
 Common/Math/fast/fast_matrix.c  |  10 --
 GPU/Common/DrawEngineCommon.cpp |  29 ++---
 4 files changed, 219 insertions(+), 32 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 4f9f49a2ea..278d8a74c1 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -11,6 +11,7 @@
 // The point of this, as opposed to a float4 array, is to almost force the compiler
 // to keep the matrix in registers, rather than loading on every access.
 struct Mat4F32 {
+	Mat4F32() {}
 	Mat4F32(const float *matrix) {
 		col0 = _mm_loadu_ps(matrix);
 		col1 = _mm_loadu_ps(matrix + 4);
@@ -23,12 +24,118 @@ struct Mat4F32 {
 		_mm_storeu_ps(m + 8, col2);
 		_mm_storeu_ps(m + 12, col3);
 	}
+
+	// Unlike the old one, this one is careful about not loading out-of-range data.
+	// The last two loads overlap.
+	static Mat4F32 Load4x3(const float *m) {
+		Mat4F32 result;
+		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
+		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+		__m128 mask1110 = _mm_loadu_ps((const float *)mask);
+		result.col0 = _mm_and_ps(_mm_loadu_ps(m), mask1110);
+		result.col1 = _mm_and_ps(_mm_loadu_ps(m + 3), mask1110);
+		result.col2 = _mm_and_ps(_mm_loadu_ps(m + 6), mask1110);
+		__m128 lastCol = _mm_loadu_ps(m + 8);
+		result.col3 = _mm_or_ps(_mm_and_ps(_mm_shuffle_ps(lastCol, lastCol, _MM_SHUFFLE(3, 3, 2, 1)), mask1110), _mm_load_ps(onelane3));
+		return result;
+	}
+
 	__m128 col0;
 	__m128 col1;
 	__m128 col2;
 	__m128 col3;
 };
 
+// The columns are spread out between the data*. This is just intermediate storage for multiplication.
+struct Mat4x3F32 {
+	Mat4x3F32(const float *matrix) {
+		data0 = _mm_loadu_ps(matrix);
+		data1 = _mm_loadu_ps(matrix + 4);
+		data2 = _mm_loadu_ps(matrix + 8);
+	}
+
+	__m128 data0;
+	__m128 data1;
+	__m128 data2;
+};
+
+// TODO: Check if loading b by 4s and shuffling is cheaper.
+inline Mat4F32 MulMem4x4By4x4(const float *a, Mat4F32 b) {
+	Mat4F32 result;
+
+	__m128 r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[0]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[1])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[2])));
+	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[3])));
+
+	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[4]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[5])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[6])));
+	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[7])));
+
+	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[8]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[9])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[10])));
+	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[11])));
+
+	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[12]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[13])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[14])));
+	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[15])));
+
+	return result;
+}
+
+inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col0, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col0, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col0, 2)));
+	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col0, 3)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col1, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col1, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col1, 2)));
+	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col1, 3)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col2, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col2, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col2, 2)));
+	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col2, 3)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col3, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col3, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col3, 2)));
+	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col3, 3)));
+
+	return result;
+}
+
+inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data0, 1)));
+	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data0, 2)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 3));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 0)));
+	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data1, 1)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data1, 2));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 3)));
+	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 0)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data2, 1));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data2, 2)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 3)));
+
+	// The last entry has an implied 1.0f.
+	result.col3 = _mm_add_ps(r_col, b.col3);
+	return result;
+}
+
 struct Vec4S32 {
 	__m128i v;
 
@@ -90,6 +197,13 @@ struct Vec4F32 {
 	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
 	void Store(float *dst) { _mm_storeu_ps(dst, v); }
 	void StoreAligned (float *dst) { _mm_store_ps(dst, v); }
+	void Store3(float *dst) {
+		// TODO: There might be better ways. (_mm_store_sd writes only the low two lanes and needs no alignment.)
+		_mm_store_sd((double *)dst, _mm_castps_pd(v));
+		_mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
+	}
+
+	static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)src)) }; }
 
 	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
 		__m128i value = _mm_loadl_epi64((const __m128i *)src);
@@ -104,6 +218,14 @@ struct Vec4F32 {
 		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
 	}
 
+	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
+		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
+		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+
+		__m128 value = _mm_castsi128_ps(_mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8));
+		return Vec4F32{ _mm_or_ps(_mm_and_ps(value, _mm_load_ps((const float *)mask)), _mm_load_ps(onelane3)) };
+	}
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }
 
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
@@ -230,6 +352,7 @@ inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
 
 #elif PPSSPP_ARCH(ARM_NEON)
 
 struct Mat4F32 {
+	Mat4F32() {}
 	Mat4F32(const float *matrix) {
 		col0 = vld1q_f32(matrix);
 		col1 = vld1q_f32(matrix + 4);
@@ -242,12 +365,86 @@ struct Mat4F32 {
 		vst1q_f32(m + 8, col2);
 		vst1q_f32(m + 12, col3);
 	}
+
+	// Unlike the old one, this one is careful about not loading out-of-range data.
+	// The last two loads overlap.
+	static Mat4F32 Load4x3(const float *m) {
+		Mat4F32 result;
+		result.col0 = vsetq_lane_f32(0.0f, vld1q_f32(m), 3);
+		result.col1 = vsetq_lane_f32(0.0f, vld1q_f32(m + 3), 3);
+		result.col2 = vsetq_lane_f32(0.0f, vld1q_f32(m + 6), 3);
+		result.col3 = vsetq_lane_f32(1.0f, vextq_f32(vld1q_f32(m + 8), vld1q_f32(m + 8), 1), 3);  // Overlapping load, stays in bounds.
+		return result;
+	}
+
 	float32x4_t col0;
 	float32x4_t col1;
 	float32x4_t col2;
 	float32x4_t col3;
 };
 
+// The columns are spread out between the data*. This is just intermediate storage for multiplication.
+struct Mat4x3F32 {
+	Mat4x3F32(const float *matrix) {
+		data0 = vld1q_f32(matrix);
+		data1 = vld1q_f32(matrix + 4);
+		data2 = vld1q_f32(matrix + 8);
+	}
+
+	float32x4_t data0;
+	float32x4_t data1;
+	float32x4_t data2;
+};
+
+inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.col0, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col0, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col0, 2);
+	result.col0 = vfmaq_laneq_f32(r_col, b.col3, a.col0, 3);
+
+	r_col = vmulq_laneq_f32(b.col0, a.col1, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col1, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col1, 2);
+	result.col1 = vfmaq_laneq_f32(r_col, b.col3, a.col1, 3);
+
+	r_col = vmulq_laneq_f32(b.col0, a.col2, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col2, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col2, 2);
+	result.col2 = vfmaq_laneq_f32(r_col, b.col3, a.col2, 3);
+
+	r_col = vmulq_laneq_f32(b.col0, a.col3, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col3, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col3, 2);
+	result.col3 = vfmaq_laneq_f32(r_col, b.col3, a.col3, 3);
+
+	return result;
+}
+
+inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.data0, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data0, 1);
+	result.col0 = vfmaq_laneq_f32(r_col, b.col2, a.data0, 2);
+
+	r_col = vmulq_laneq_f32(b.col0, a.data0, 3);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 0);
+	result.col1 = vfmaq_laneq_f32(r_col, b.col2, a.data1, 1);
+
+	r_col = vmulq_laneq_f32(b.col0, a.data1, 2);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 3);
+	result.col2 = vfmaq_laneq_f32(r_col, b.col2, a.data2, 0);
+
+	r_col = vmulq_laneq_f32(b.col0, a.data2, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data2, 2);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.data2, 3);
+
+	// The last entry has an implied 1.0f.
+	result.col3 = vaddq_f32(r_col, b.col3);
+	return result;
+}
 
 struct Vec4S32 {
 	int32x4_t v;
 
@@ -292,6 +489,13 @@ struct Vec4F32 {
 	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
 	void Store(float *dst) { vst1q_f32(dst, v); }
 	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
+	void Store3(float *dst) {
+		// TODO: There might be better ways. Try to avoid this when possible.
+		vst1_f32(dst, vget_low_f32(v));
+		dst[2] = vgetq_lane_f32(v, 2);
+	}
+
+	static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ vcombine_f32(vld1_f32(src), vdup_n_f32(0.0f)) }; }
 
 	// TODO: Feels like there should be a better way.
 	static Vec4F32 LoadConvertS16(const int16_t *src) {
 		int16x4_t value = vld1_s16(src);
@@ -304,6 +508,10 @@ struct Vec4F32 {
 		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value16)) };
 	}
 
+	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
+		return Vec4F32{ vsetq_lane_f32(1.0f, vreinterpretq_f32_u32(vshlq_n_u32(vld1q_u32(src), 8)), 3) };
+	}
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 9705dca554..cb63b89eef 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -59,6 +59,8 @@ static inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x
 	}
 }
 
+#define vfmaq_laneq_f32 vmlaq_laneq_f32
+
 static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 	return vcgeq_f32(v, vdupq_n_f32(0.0f));
 }
@@ -118,6 +120,8 @@ inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
 	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
 }
 
+#define _mm_splat_lane_ps(v, l) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(l, l, l, l))
+
 #ifdef __cplusplus
 
 alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index d23ce3b0e0..cff592e680 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -24,16 +24,6 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 
 #elif PPSSPP_ARCH(ARM_NEON)
 
-#if PPSSPP_ARCH(ARM)
-static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane) {
-	if (lane == 0) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0);
-	else if (lane == 1) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 1);
-	else if (lane == 2) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 0);
-	else if (lane == 3) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 1);
-	else return vdupq_n_f32(0.f);
-}
-#endif
-
 // From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
 void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
 	// these are the columns A
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 4cedf95e8b..97432bcf45 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -915,33 +915,18 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 	}
 }
 
-inline void ComputeFinalProjMatrix(float *worldviewproj) {
-	float world[16];
-	float view[16];
-	float worldview[16];
-	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
-	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-	Matrix4ByMatrix4(worldview, world, view);
-	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
-
-	// Heh, a bit ugly to mix two different matrix APIs here, but it works.
-
-	const float viewportScale[4] = {
-		gstate.getViewportXScale(),
-		gstate.getViewportYScale(),
-		gstate.getViewportZScale(),
-		1.0f
-	};
+Mat4F32 ComputeFinalProjMatrix() {
 	const float viewportTranslate[4] = {
 		gstate.getViewportXCenter() - gstate.getOffsetX(),
 		gstate.getViewportYCenter() - gstate.getOffsetY(),
 		gstate.getViewportZCenter(),
 	};
+	Mat4F32 wv = Mul4x3By4x4(Mat4x3F32(gstate.worldMatrix), Mat4F32::Load4x3(gstate.viewMatrix));
+	Mat4F32 m = Mul4x4By4x4(wv, Mat4F32(gstate.projMatrix));
 
 	// NOTE: Applying the translation actually works pre-divide, since W is also affected.
-	Mat4F32 m(worldviewproj);
-	TranslateAndScaleInplace(m, Vec4F32::Load(viewportScale), Vec4F32::Load(viewportTranslate));
-	m.Store(worldviewproj);
+	TranslateAndScaleInplace(m, Vec4F32::LoadF24x3_One(&gstate.viewportxscale), Vec4F32::Load(viewportTranslate));
+	return m;
 }
 
 void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
@@ -967,7 +952,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 	TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);
 
 	float worldviewproj[16];
-	ComputeFinalProjMatrix(worldviewproj);
+	ComputeFinalProjMatrix().Store(worldviewproj);
 
 	// Decode.
 	int numDec = 0;
@@ -1035,7 +1020,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
 		return;
 	}
 	float worldviewproj[16];
-	ComputeFinalProjMatrix(worldviewproj);
+	ComputeFinalProjMatrix().Store(worldviewproj);
 	TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);
 
 	switch (prim) {
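
Note (reviewer sketch, not part of the patch itself): the new helpers treat each stored group of four floats as a matrix column acting on column vectors, a 4x3 matrix as four columns of three floats with an implied (0, 0, 0, 1) bottom row, and Mul4x3By4x4(a, b) / Mul4x4By4x4(a, b) as composing so that a is applied first, then b, matching how ComputeFinalProjMatrix chains world, view and projection. The small standalone check below illustrates that convention against a scalar reference; the helper names and test values are made up for illustration, and it assumes CrossSIMD.h can be included on its own.

#include <cmath>
#include <cstdio>

#include "Common/Math/CrossSIMD.h"

// Scalar reference: y = M * x for a column-major 4x4 matrix (16 floats).
static void TransformScalar(float y[4], const float m[16], const float x[4]) {
	for (int r = 0; r < 4; r++)
		y[r] = m[r] * x[0] + m[4 + r] * x[1] + m[8 + r] * x[2] + m[12 + r] * x[3];
}

// Scalar reference: y = M * x for a column-major 4x3 matrix (12 floats, implied (0, 0, 0, 1) bottom row).
static void Transform4x3Scalar(float y[4], const float m[12], const float x[4]) {
	for (int r = 0; r < 3; r++)
		y[r] = m[r] * x[0] + m[3 + r] * x[1] + m[6 + r] * x[2] + m[9 + r] * x[3];
	y[3] = x[3];
}

int main() {
	// Arbitrary test matrices: world and view are 4x3, proj is a full 4x4.
	float world[12], view[12], proj[16];
	for (int i = 0; i < 12; i++) { world[i] = 0.1f * i - 0.4f; view[i] = 0.05f * i + 0.2f; }
	for (int i = 0; i < 16; i++) { proj[i] = (i % 5 == 0) ? 1.0f : 0.01f * i; }

	// Compose with the SIMD helpers, the same way ComputeFinalProjMatrix does.
	Mat4F32 wv = Mul4x3By4x4(Mat4x3F32(world), Mat4F32::Load4x3(view));
	Mat4F32 wvp = Mul4x4By4x4(wv, Mat4F32(proj));
	float simd[16];
	wvp.Store(simd);

	// Apply the matrices one at a time with the scalar reference, then compare.
	const float p[4] = { 1.0f, -2.0f, 3.0f, 1.0f };
	float a[4], b[4], c[4], d[4];
	Transform4x3Scalar(a, world, p);   // world
	Transform4x3Scalar(b, view, a);    // then view
	TransformScalar(c, proj, b);       // then projection
	TransformScalar(d, simd, p);       // the composed matrix, applied in one go

	for (int i = 0; i < 4; i++)
		printf("%d: scalar=%f simd=%f diff=%g\n", i, c[i], d[i], std::fabs(c[i] - d[i]));
	return 0;
}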