CrossSIMD: make the transpose function compatible with ARM32

2025-04-02 11:01:50 -04:00 · 2024-12-21 13:21:20 +01:00 · 2024-12-21 13:21:20 +01:00 · 399570e411
commit 399570e411
parent 2051d55c90
1 changed file with 10 additions and 0 deletions
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -264,6 +264,8 @@ struct Vec4F32 {

 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+#if PPSSPP_ARCH(ARM64_NEON)
+		// vzip1q_f32/vzip2q_f32 are AArch64-only intrinsics, so this path only works on ARM64.
 		float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
 		float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
 		float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
@@ -272,6 +274,14 @@ struct Vec4F32 {
 		col1.v = vzip2q_f32(temp0, temp2);
 		col2.v = vzip1q_f32(temp1, temp3);
 		col3.v = vzip2q_f32(temp1, temp3);
+#else
+   		float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v);
+        float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v);
+        col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0]));
+        col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1]));
+        col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0]));
+        col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1]));
+#endif
 	}

 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {