From 73cd690753f9109e4e92cba4eb8826d6bc90998f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Nov 2023 00:25:04 +0100 Subject: [PATCH 1/3] Optimize NEON matrix multiplication slightly --- Common/Math/fast/fast_matrix.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c index 614a552b39..7e142c84f6 100644 --- a/Common/Math/fast/fast_matrix.c +++ b/Common/Math/fast/fast_matrix.c @@ -67,36 +67,30 @@ void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) { A2 = vld1q_f32(A + 8); A3 = vld1q_f32(A + 12); - // Zero accumulators for C values - C0 = vmovq_n_f32(0); - C1 = vmovq_n_f32(0); - C2 = vmovq_n_f32(0); - C3 = vmovq_n_f32(0); - // Multiply accumulate in 4x1 blocks, i.e. each column in C B0 = vld1q_f32(B); - C0 = vfmaq_laneq_f32(C0, A0, B0, 0); + C0 = vmulq_laneq_f32(A0, B0, 0); C0 = vfmaq_laneq_f32(C0, A1, B0, 1); C0 = vfmaq_laneq_f32(C0, A2, B0, 2); C0 = vfmaq_laneq_f32(C0, A3, B0, 3); vst1q_f32(C, C0); B1 = vld1q_f32(B + 4); - C1 = vfmaq_laneq_f32(C1, A0, B1, 0); + C1 = vmulq_laneq_f32(A0, B1, 0); C1 = vfmaq_laneq_f32(C1, A1, B1, 1); C1 = vfmaq_laneq_f32(C1, A2, B1, 2); C1 = vfmaq_laneq_f32(C1, A3, B1, 3); vst1q_f32(C + 4, C1); B2 = vld1q_f32(B + 8); - C2 = vfmaq_laneq_f32(C2, A0, B2, 0); + C2 = vmulq_laneq_f32(A0, B2, 0); C2 = vfmaq_laneq_f32(C2, A1, B2, 1); C2 = vfmaq_laneq_f32(C2, A2, B2, 2); C2 = vfmaq_laneq_f32(C2, A3, B2, 3); vst1q_f32(C + 8, C2); B3 = vld1q_f32(B + 12); - C3 = vfmaq_laneq_f32(C3, A0, B3, 0); + C3 = vmulq_laneq_f32(A0, B3, 0); C3 = vfmaq_laneq_f32(C3, A1, B3, 1); C3 = vfmaq_laneq_f32(C3, A2, B3, 2); C3 = vfmaq_laneq_f32(C3, A3, B3, 3); From 0fef71348deb7c11b78afaa804afea7ba40a1b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 11 Jan 2024 12:42:05 +0100 Subject: [PATCH 2/3] ARM32 buildfix --- Common/Math/fast/fast_matrix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c index 7e142c84f6..c6f24ddf13 100644 --- a/Common/Math/fast/fast_matrix.c +++ b/Common/Math/fast/fast_matrix.c @@ -1,5 +1,7 @@ #include "ppsspp_config.h" +#include "Common/Math/CrossSIMD.h" + #include "fast_matrix.h" #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64) From aee8f62ed5f1312b01e6e87623200b0f4bf2c1bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 11 Jan 2024 14:59:54 +0100 Subject: [PATCH 3/3] Buildfix (included from c file) --- Common/Math/CrossSIMD.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 3eb8e0e75e..a543fb7378 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -8,7 +8,7 @@ #include "ppsspp_config.h" -#include <cstdint> +#include "stdint.h" #if PPSSPP_ARCH(SSE2) #include <emmintrin.h>