diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1194fec7a3..d7fd06a0c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,8 @@ if(CMAKE_SYSTEM_PROCESSOR)
 		set(RISCV64_DEVICE ON)
 	elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^loongarch64")
 		set(LOONGARCH64_DEVICE ON)
+		add_compile_options(-mlsx)
+		add_compile_options(-mlasx)
 	else()
 		message("Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 	endif()
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 790fac494e..f6e86ef0c2 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -31,6 +31,12 @@
 #endif
 #endif
 
+#if PPSSPP_ARCH(LOONGARCH64)
+#if PPSSPP_ARCH(LOONGARCH64_LSX)
+#include <lsxintrin.h>
+#endif
+#endif
+
 // Basic types
 
 #if PPSSPP_ARCH(ARM64_NEON)
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index cff592e680..6ce87e2a6a 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -22,6 +22,48 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	}
 }
 
+#elif PPSSPP_ARCH(LOONGARCH64_LSX)
+
+typedef union
+{
+	int32_t i;
+	float f;
+} FloatInt;
+
+// Broadcasts val into all four float lanes of a vector. The bits go through an
+// integer because __lsx_vreplgr2vr_w replicates a 32-bit integer value.
+// Deliberately not named __lsx_*: that prefix is reserved for toolchain intrinsics.
+static inline __m128 lsx_splat_f32(float val)
+{
+	FloatInt tmpval = {.f = val};
+	return (__m128)__lsx_vreplgr2vr_w(tmpval.i);
+}
+
+// 4x4 float matrix multiply using LSX. For each i in {0, 4, 8, 12}:
+//   dest[i..i+3] = a[0..3]*b[i] + a[4..7]*b[i+1] + a[8..11]*b[i+2] + a[12..15]*b[i+3]
+// All of a is loaded into registers before the first store to dest.
+void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b) {
+	__m128 a_col_1 = (__m128)__lsx_vld(a, 0);
+	__m128 a_col_2 = (__m128)__lsx_vld(a + 4, 0);
+	__m128 a_col_3 = (__m128)__lsx_vld(a + 8, 0);
+	__m128 a_col_4 = (__m128)__lsx_vld(a + 12, 0);
+
+	for (int i = 0; i < 16; i += 4) {
+
+		__m128 b1 = lsx_splat_f32(b[i]);
+		__m128 b2 = lsx_splat_f32(b[i + 1]);
+		__m128 b3 = lsx_splat_f32(b[i + 2]);
+		__m128 b4 = lsx_splat_f32(b[i + 3]);
+
+		__m128 result = __lsx_vfmul_s(a_col_1, b1);
+		result = __lsx_vfmadd_s(a_col_2, b2, result);
+		result = __lsx_vfmadd_s(a_col_3, b3, result);
+		result = __lsx_vfmadd_s(a_col_4, b4, result);
+
+		__lsx_vst(result, &dest[i], 0);
+	}
+}
+
 #elif PPSSPP_ARCH(ARM_NEON)
 
 // From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
diff --git a/Common/Math/fast/fast_matrix.h b/Common/Math/fast/fast_matrix.h
index 8e5fc9320f..fb4a1b7f26 100644
--- a/Common/Math/fast/fast_matrix.h
+++ b/Common/Math/fast/fast_matrix.h
@@ -11,12 +11,15 @@ extern "C" {
 extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
 extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
+extern void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b);
 
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 // Hard link to SSE implementations on x86/amd64
 #define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
 #elif PPSSPP_ARCH(ARM_NEON)
 #define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
+#elif PPSSPP_ARCH(LOONGARCH64_LSX)
+#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_lsx
 #else
 #define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
 #endif
diff --git a/ppsspp_config.h b/ppsspp_config.h
index dd9855f582..2a741dca3d 100644
--- a/ppsspp_config.h
+++ b/ppsspp_config.h
@@ -81,6 +81,7 @@
     //https://github.com/gcc-mirror/gcc/blob/master/gcc/config/loongarch/loongarch-c.cc
     #define PPSSPP_ARCH_LOONGARCH64 1
     #define PPSSPP_ARCH_64BIT 1
+    #define PPSSPP_ARCH_LOONGARCH64_LSX 1
 #endif
 
 // PLATFORM defines