mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Add fast_matrix_mul_4x4_lsx function for LoongArch64
This commit is contained in:
parent
d5bd7d24aa
commit
66f5ac9897
5 changed files with 48 additions and 0 deletions
|
@ -69,6 +69,8 @@ if(CMAKE_SYSTEM_PROCESSOR)
|
|||
set(RISCV64_DEVICE ON)
|
||||
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^loongarch64")
|
||||
set(LOONGARCH64_DEVICE ON)
|
||||
add_compile_options(-mlsx)
|
||||
add_compile_options(-mlasx)
|
||||
else()
|
||||
message("Unknown CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
|
|
|
@ -31,6 +31,12 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if PPSSPP_ARCH(LOONGARCH64)
|
||||
#if PPSSPP_ARCH(LOONGARCH64_LSX)
|
||||
#include <lsxintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Basic types
|
||||
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
|
|
|
@ -22,6 +22,42 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
|
|||
}
|
||||
}
|
||||
|
||||
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
|
||||
|
||||
typedef union
|
||||
{
|
||||
int32_t i;
|
||||
float f;
|
||||
} FloatInt;
|
||||
|
||||
static __m128 __lsx_vreplfr2vr_s(float val)
|
||||
{
|
||||
FloatInt tmpval = {.f = val};
|
||||
return (__m128)__lsx_vreplgr2vr_w(tmpval.i);
|
||||
}
|
||||
|
||||
void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b) {
|
||||
__m128 a_col_1 = (__m128)__lsx_vld(a, 0);
|
||||
__m128 a_col_2 = (__m128)__lsx_vld(a + 4, 0);
|
||||
__m128 a_col_3 = (__m128)__lsx_vld(a + 8, 0);
|
||||
__m128 a_col_4 = (__m128)__lsx_vld(a + 12, 0);
|
||||
|
||||
for (int i = 0; i < 16; i += 4) {
|
||||
|
||||
__m128 b1 = __lsx_vreplfr2vr_s(b[i]);
|
||||
__m128 b2 = __lsx_vreplfr2vr_s(b[i + 1]);
|
||||
__m128 b3 = __lsx_vreplfr2vr_s(b[i + 2]);
|
||||
__m128 b4 = __lsx_vreplfr2vr_s(b[i + 3]);
|
||||
|
||||
__m128 result = __lsx_vfmul_s(a_col_1, b1);
|
||||
result = __lsx_vfmadd_s(a_col_2, b2, result);
|
||||
result = __lsx_vfmadd_s(a_col_3, b3, result);
|
||||
result = __lsx_vfmadd_s(a_col_4, b4, result);
|
||||
|
||||
__lsx_vst(result, &dest[i], 0);
|
||||
}
|
||||
}
|
||||
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
|
||||
// From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
|
||||
|
|
|
@ -11,12 +11,15 @@ extern "C" {
|
|||
extern void fast_matrix_mul_4x4_c(float *dest, const float *a, const float *b);
|
||||
extern void fast_matrix_mul_4x4_neon(float *dest, const float *a, const float *b);
|
||||
extern void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b);
|
||||
extern void fast_matrix_mul_4x4_lsx(float *dest, const float *a, const float *b);
|
||||
|
||||
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
|
||||
// Hard link to SSE implementations on x86/amd64
|
||||
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_sse
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_neon
|
||||
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
|
||||
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_lsx
|
||||
#else
|
||||
#define fast_matrix_mul_4x4 fast_matrix_mul_4x4_c
|
||||
#endif
|
||||
|
|
|
@ -81,6 +81,7 @@
|
|||
//https://github.com/gcc-mirror/gcc/blob/master/gcc/config/loongarch/loongarch-c.cc
|
||||
#define PPSSPP_ARCH_LOONGARCH64 1
#define PPSSPP_ARCH_64BIT 1
// Only advertise LSX when the compiler is actually targeting it (e.g. -mlsx):
// the compiler predefines __loongarch_sx in that case. Without this guard a
// build that does not enable LSX would still select code paths guarded by
// PPSSPP_ARCH(LOONGARCH64_LSX) and fail on the LSX intrinsics.
#if defined(__loongarch_sx)
#define PPSSPP_ARCH_LOONGARCH64_LSX 1
#endif
|
||||
#endif
|
||||
|
||||
// PLATFORM defines
|
||||
|
|
Loading…
Add table
Reference in a new issue