From acd5b24924e65cd9f87be2b542d12bb21efaaa28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 28 Jan 2025 10:39:04 +0100 Subject: [PATCH] Complete CrossSIMD non-simd fallback (although buggy, it seems). Minor ARM64 opt. --- Common/Math/CrossSIMD.h | 72 ++++++++++++++++++++++++++++++++++++++++- Core/Config.cpp | 11 +++++++ UI/EmuScreen.cpp | 2 ++ 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 24d851cde0..66c78a51b2 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -565,7 +565,12 @@ struct Vec4F32 { void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); } void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); } void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); } +#if PPSSPP_ARCH(ARM64_NEON) + void operator /=(Vec4F32 other) { v = vdivq_f32(v, other.v); } +#else + // ARM32 doesn't have vdivq. void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); } +#endif void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } @@ -775,6 +780,15 @@ struct Mat4F32 { float m[16]; }; +// The columns are consecutive but missing the last row (implied 0,0,0,1). +// This is just intermediate storage for multiplication. +struct Mat4x3F32 { + Mat4x3F32(const float *matrix) { + memcpy(m, matrix, 12 * sizeof(float)); + } + float m[12]; +}; + struct Vec4S32 { int32_t v[4]; @@ -982,7 +996,15 @@ struct Vec4F32 { v[i] /= other.v[i]; } } - // void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); } + void operator &=(Vec4S32 other) { + // TODO: This can be done simpler, although with some ugly casts. + for (int i = 0; i < 4; i++) { + uint32_t val; + memcpy(&val, &v[i], 4); + val &= other.v[i]; + memcpy(&v[i], &val, 4); + } + } Vec4F32 operator *(float f) const { return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } }; } @@ -1177,5 +1199,53 @@ inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } }; } +// Make sure the W component of scale is 1.0f. +inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) { + for (int i = 0; i < 4; i++) { + m.m[i * 4 + 0] *= scale.v[0]; + m.m[i * 4 + 1] *= scale.v[1]; + m.m[i * 4 + 2] *= scale.v[2]; + m.m[i * 4 + 3] *= scale.v[3]; + } +} + +inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) { + for (int i = 0; i < 4; i++) { + m.m[i * 4 + 0] = m.m[i * 4 + 0] * scale.v[0] + translate.v[0] * m.m[i * 4 + 3]; + m.m[i * 4 + 1] = m.m[i * 4 + 1] * scale.v[1] + translate.v[1] * m.m[i * 4 + 3]; + m.m[i * 4 + 2] = m.m[i * 4 + 2] * scale.v[2] + translate.v[2] * m.m[i * 4 + 3]; + m.m[i * 4 + 3] = m.m[i * 4 + 3] * scale.v[3] + translate.v[3] * m.m[i * 4 + 3]; + } +} + +inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) { + Mat4F32 result; + + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + float sum = 0.0f; + for (int k = 0; k < 4; k++) { + sum += b.m[i * 4 + k] * a.m[k * 4 + j]; + } + result.m[j * 4 + i] = sum; + } + } + return result; +} + +inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) { + Mat4F32 result; + + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + float sum = 0.0f; + for (int k = 0; k < 3; k++) { + sum += b.m[i * 4 + k] * a.m[k * 3 + j]; + } + result.m[j * 4 + i] = sum + b.m[i * 4 + 3]; + } + } + return result; +} #endif diff --git a/Core/Config.cpp b/Core/Config.cpp index d6d15e22f3..bad932ae0f 100644 --- a/Core/Config.cpp +++ b/Core/Config.cpp @@ -140,11 +140,22 @@ std::string DefaultLangRegion() { } static int DefaultDepthRaster() { + +// For 64-bit ARM and x86 with SIMD, enable depth raster. +#if PPSSPP_ARCH(ARM64_NEON) || PPSSPP_ARCH(SSE2) + #if PPSSPP_PLATFORM(ANDROID) || PPSSPP_PLATFORM(IOS) return (int)DepthRasterMode::LOW_QUALITY; #else return (int)DepthRasterMode::DEFAULT; #endif + +#else + + // 32-bit ARM or no SIMD, the depth raster will be too slow. + return (int)DepthRasterMode::OFF; + +#endif } std::string CreateRandMAC() { diff --git a/UI/EmuScreen.cpp b/UI/EmuScreen.cpp index 95824d2ceb..695305a085 100644 --- a/UI/EmuScreen.cpp +++ b/UI/EmuScreen.cpp @@ -679,6 +679,8 @@ static void ShowFpsLimitNotice() { case FPSLimit::CUSTOM2: fpsLimit = g_Config.iFpsLimit2; break; + default: + break; } // Now display it.