From eec7853efe83c404d1906f5b87d24d27e98b030d Mon Sep 17 00:00:00 2001
From: Henrik Rydgård
Date: Sat, 28 Dec 2024 18:45:14 +0100
Subject: [PATCH] More SIMD: Add some matrix operations to CrossSIMD (#19773)

* More CrossSIMD functionality

* Use the new SIMD API for the matrix multiplies
---
 Common/Math/CrossSIMD.h         | 208 ++++++++++++++++++++++++++++++++
 Common/Math/SIMDHeaders.h       |   4 +
 Common/Math/fast/fast_matrix.c  |  10 --
 GPU/Common/DrawEngineCommon.cpp |  29 ++---
 4 files changed, 219 insertions(+), 32 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 4f9f49a2ea..278d8a74c1 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -11,6 +11,7 @@
 // The point of this, as opposed to a float4 array, is to almost force the compiler
 // to keep the matrix in registers, rather than loading on every access.
 struct Mat4F32 {
+	Mat4F32() {}
 	Mat4F32(const float *matrix) {
 		col0 = _mm_loadu_ps(matrix);
 		col1 = _mm_loadu_ps(matrix + 4);
@@ -23,12 +24,118 @@ struct Mat4F32 {
 		_mm_storeu_ps(m + 8, col2);
 		_mm_storeu_ps(m + 12, col3);
 	}
+
+	// Unlike the old one, this one is careful about not loading out-of-range data.
+	// The last two loads overlap.
+	static Mat4F32 Load4x3(const float *m) {
+		Mat4F32 result;
+		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
+		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+		__m128 mask1110 = _mm_loadu_ps((const float *)mask);
+		result.col0 = _mm_and_ps(_mm_loadu_ps(m), mask1110);
+		result.col1 = _mm_and_ps(_mm_loadu_ps(m + 3), mask1110);
+		result.col2 = _mm_and_ps(_mm_loadu_ps(m + 6), mask1110);
+		__m128 lastCol = _mm_loadu_ps(m + 8);
+		result.col3 = _mm_or_ps(_mm_and_ps(_mm_shuffle_ps(lastCol, lastCol, _MM_SHUFFLE(3, 3, 2, 1)), mask1110), _mm_load_ps(onelane3));
+		return result;
+	}
+
 	__m128 col0;
 	__m128 col1;
 	__m128 col2;
 	__m128 col3;
 };
 
+// The columns are spread out between the data*. This is just intermediate storage for multiplication.
+struct Mat4x3F32 {
+	Mat4x3F32(const float *matrix) {
+		data0 = _mm_loadu_ps(matrix);
+		data1 = _mm_loadu_ps(matrix + 4);
+		data2 = _mm_loadu_ps(matrix + 8);
+	}
+
+	__m128 data0;
+	__m128 data1;
+	__m128 data2;
+};
+
+// TODO: Check if loading b by 4s and shuffling is cheaper.
+inline Mat4F32 MulMem4x4By4x4(const float *a, Mat4F32 b) {
+	Mat4F32 result;
+
+	__m128 r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[0]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[1])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[2])));
+	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[3])));
+
+	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[4]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[5])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[6])));
+	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[7])));
+
+	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[8]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[9])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[10])));
+	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[11])));
+
+	r_col = _mm_mul_ps(b.col0, _mm_set1_ps(a[12]));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_set1_ps(a[13])));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_set1_ps(a[14])));
+	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_set1_ps(a[15])));
+
+	return result;
+}
+
+inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col0, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col0, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col0, 2)));
+	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col0, 3)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col1, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col1, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col1, 2)));
+	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col1, 3)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col2, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col2, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col2, 2)));
+	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col2, 3)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col3, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col3, 1)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col3, 2)));
+	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col3, 3)));
+
+	return result;
+}
+
+inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 0));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data0, 1)));
+	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data0, 2)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 3));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 0)));
+	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data1, 1)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data1, 2));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 3)));
+	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 0)));
+
+	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data2, 1));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data2, 2)));
+	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 3)));
+
+	// The last entry has an implied 1.0f.
+	result.col3 = _mm_add_ps(r_col, b.col3);
+	return result;
+}
+
 struct Vec4S32 {
 	__m128i v;
 
@@ -90,6 +197,13 @@ struct Vec4F32 {
 	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
 	void Store(float *dst) { _mm_storeu_ps(dst, v); }
 	void StoreAligned (float *dst) { _mm_store_ps(dst, v); }
+	void Store3(float *dst) {
+		// TODO: There might be better ways. (_mm_store_sd writes only the low two lanes and needs no alignment.)
+		_mm_store_sd((double *)dst, _mm_castps_pd(v));
+		_mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
+	}
+
+	static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ _mm_castsi128_ps(_mm_loadl_epi64((const __m128i *)src)) }; }
 
 	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
 		__m128i value = _mm_loadl_epi64((const __m128i *)src);
@@ -104,6 +218,14 @@ struct Vec4F32 {
 		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
 	}
 
+	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
+		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
+		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+
+		__m128 value = _mm_castsi128_ps(_mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8));
+		return Vec4F32{ _mm_or_ps(_mm_and_ps(value, _mm_load_ps((const float *)mask)), _mm_load_ps(onelane3)) };
+	}
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }
 
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
@@ -230,6 +352,7 @@ inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
 
 #elif PPSSPP_ARCH(ARM_NEON)
 
 struct Mat4F32 {
+	Mat4F32() {}
 	Mat4F32(const float *matrix) {
 		col0 = vld1q_f32(matrix);
 		col1 = vld1q_f32(matrix + 4);
@@ -242,12 +365,86 @@ struct Mat4F32 {
 		vst1q_f32(m + 8, col2);
 		vst1q_f32(m + 12, col3);
 	}
+
+	// Unlike the old one, this one is careful about not loading out-of-range data.
+	// The last two loads overlap.
+	static Mat4F32 Load4x3(const float *m) {
+		Mat4F32 result;
+		result.col0 = vsetq_lane_f32(0.0f, vld1q_f32(m), 3);
+		result.col1 = vsetq_lane_f32(0.0f, vld1q_f32(m + 3), 3);
+		result.col2 = vsetq_lane_f32(0.0f, vld1q_f32(m + 6), 3);
+		result.col3 = vsetq_lane_f32(1.0f, vextq_f32(vld1q_f32(m + 8), vld1q_f32(m + 8), 1), 3);  // Overlapping load, stays in bounds.
+		return result;
+	}
+
 	float32x4_t col0;
 	float32x4_t col1;
 	float32x4_t col2;
 	float32x4_t col3;
 };
 
+// The columns are spread out between the data*. This is just intermediate storage for multiplication.
+struct Mat4x3F32 {
+	Mat4x3F32(const float *matrix) {
+		data0 = vld1q_f32(matrix);
+		data1 = vld1q_f32(matrix + 4);
+		data2 = vld1q_f32(matrix + 8);
+	}
+
+	float32x4_t data0;
+	float32x4_t data1;
+	float32x4_t data2;
+};
+
+inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.col0, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col0, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col0, 2);
+	result.col0 = vfmaq_laneq_f32(r_col, b.col3, a.col0, 3);
+
+	r_col = vmulq_laneq_f32(b.col0, a.col1, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col1, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col1, 2);
+	result.col1 = vfmaq_laneq_f32(r_col, b.col3, a.col1, 3);
+
+	r_col = vmulq_laneq_f32(b.col0, a.col2, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col2, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col2, 2);
+	result.col2 = vfmaq_laneq_f32(r_col, b.col3, a.col2, 3);
+
+	r_col = vmulq_laneq_f32(b.col0, a.col3, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col3, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col3, 2);
+	result.col3 = vfmaq_laneq_f32(r_col, b.col3, a.col3, 3);
+
+	return result;
+}
+
+inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
+	Mat4F32 result;
+
+	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.data0, 0);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data0, 1);
+	result.col0 = vfmaq_laneq_f32(r_col, b.col2, a.data0, 2);
+
+	r_col = vmulq_laneq_f32(b.col0, a.data0, 3);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 0);
+	result.col1 = vfmaq_laneq_f32(r_col, b.col2, a.data1, 1);
+
+	r_col = vmulq_laneq_f32(b.col0, a.data1, 2);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 3);
+	result.col2 = vfmaq_laneq_f32(r_col, b.col2, a.data2, 0);
+
+	r_col = vmulq_laneq_f32(b.col0, a.data2, 1);
+	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data2, 2);
+	r_col = vfmaq_laneq_f32(r_col, b.col2, a.data2, 3);
+
+	// The last entry has an implied 1.0f.
+	result.col3 = vaddq_f32(r_col, b.col3);
+	return result;
+}
 
 struct Vec4S32 {
 	int32x4_t v;
 
@@ -292,6 +489,13 @@ struct Vec4F32 {
 	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
 	void Store(float *dst) { vst1q_f32(dst, v); }
 	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
+	void Store3(float *dst) {
+		// TODO: There might be better ways. Try to avoid this when possible.
+		vst1_f32(dst, vget_low_f32(v));
+		dst[2] = vgetq_lane_f32(v, 2);
+	}
+
+	static Vec4F32 LoadVec2(const float *src) { return Vec4F32{ vcombine_f32(vld1_f32(src), vdup_n_f32(0.0f)) }; }
 
 	// TODO: Feels like there should be a better way.
 	static Vec4F32 LoadConvertS16(const int16_t *src) {
 		int16x4_t value = vld1_s16(src);
@@ -304,6 +508,10 @@ struct Vec4F32 {
 		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value16)) };
 	}
 
+	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
+		return Vec4F32{ vsetq_lane_f32(1.0f, vreinterpretq_f32_u32(vshlq_n_u32(vld1q_u32(src), 8)), 3) };
+	}
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 9705dca554..cb63b89eef 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -59,6 +59,8 @@ static inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x
 	}
 }
 
+#define vfmaq_laneq_f32 vmlaq_laneq_f32
+
 static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 	return vcgeq_f32(v, vdupq_n_f32(0.0f));
 }
@@ -118,6 +120,8 @@ inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
 	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
 }
 
+#define _mm_splat_lane_ps(v, l) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(l, l, l, l))
+
 #ifdef __cplusplus
 
 alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index d23ce3b0e0..cff592e680 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -24,16 +24,6 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 
 #elif PPSSPP_ARCH(ARM_NEON)
 
-#if PPSSPP_ARCH(ARM)
-static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane) {
-	if (lane == 0) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0);
-	else if (lane == 1) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 1);
-	else if (lane == 2) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 0);
-	else if (lane == 3) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 1);
-	else return vdupq_n_f32(0.f);
-}
-#endif
-
 // From https://developer.arm.com/documentation/102467/0100/Matrix-multiplication-example
 void fast_matrix_mul_4x4_neon(float *C, const float *A, const float *B) {
 	// these are the columns A
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 4cedf95e8b..97432bcf45 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -915,33 +915,18 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 	}
 }
 
-inline void ComputeFinalProjMatrix(float *worldviewproj) {
-	float world[16];
-	float view[16];
-	float worldview[16];
-	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
-	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-	Matrix4ByMatrix4(worldview, world, view);
-	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
-
-	// Heh, a bit ugly to mix two different matrix APIs here, but it works.
-
-	const float viewportScale[4] = {
-		gstate.getViewportXScale(),
-		gstate.getViewportYScale(),
-		gstate.getViewportZScale(),
-		1.0f
-	};
+Mat4F32 ComputeFinalProjMatrix() {
 	const float viewportTranslate[4] = {
 		gstate.getViewportXCenter() - gstate.getOffsetX(),
 		gstate.getViewportYCenter() - gstate.getOffsetY(),
 		gstate.getViewportZCenter(),
 	};
+	Mat4F32 wv = Mul4x3By4x4(Mat4x3F32(gstate.worldMatrix), Mat4F32::Load4x3(gstate.viewMatrix));
+	Mat4F32 m = Mul4x4By4x4(wv, Mat4F32(gstate.projMatrix));
 
 	// NOTE: Applying the translation actually works pre-divide, since W is also affected.
-	Mat4F32 m(worldviewproj);
-	TranslateAndScaleInplace(m, Vec4F32::Load(viewportScale), Vec4F32::Load(viewportTranslate));
-	m.Store(worldviewproj);
+	TranslateAndScaleInplace(m, Vec4F32::LoadF24x3_One(&gstate.viewportxscale), Vec4F32::Load(viewportTranslate));
+	return m;
 }
 
 void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
@@ -967,7 +952,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 	TimeCollector collectStat(&gpuStats.msRasterizingDepth, coreCollectDebugStats);
 
 	float worldviewproj[16];
-	ComputeFinalProjMatrix(worldviewproj);
+	ComputeFinalProjMatrix().Store(worldviewproj);
 
 	// Decode.
 	int numDec = 0;
@@ -1035,7 +1020,7 @@ void DrawEngineCommon::DepthRasterPredecoded(GEPrimitiveType prim, const void *i
 		return;
 	}
 	float worldviewproj[16];
-	ComputeFinalProjMatrix(worldviewproj);
+	ComputeFinalProjMatrix().Store(worldviewproj);
 	TransformPredecodedForDepthRaster(depthTransformed_, worldviewproj, decoded_, dec, numDecoded);
 
 	switch (prim) {
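
Note (reviewer sketch, not part of the patch itself): the new helpers treat each stored group of four floats as a matrix column acting on column vectors, a 4x3 matrix as four columns of three floats with an implied (0, 0, 0, 1) bottom row, and Mul4x3By4x4(a, b) / Mul4x4By4x4(a, b) as composing so that a is applied first, then b, matching how ComputeFinalProjMatrix chains world, view and projection. The small standalone check below illustrates that convention against a scalar reference; the helper names and test values are made up for illustration, and it assumes CrossSIMD.h can be included on its own.

#include <cmath>
#include <cstdio>

#include "Common/Math/CrossSIMD.h"

// Scalar reference: y = M * x for a column-major 4x4 matrix (16 floats).
static void TransformScalar(float y[4], const float m[16], const float x[4]) {
	for (int r = 0; r < 4; r++)
		y[r] = m[r] * x[0] + m[4 + r] * x[1] + m[8 + r] * x[2] + m[12 + r] * x[3];
}

// Scalar reference: y = M * x for a column-major 4x3 matrix (12 floats, implied (0, 0, 0, 1) bottom row).
static void Transform4x3Scalar(float y[4], const float m[12], const float x[4]) {
	for (int r = 0; r < 3; r++)
		y[r] = m[r] * x[0] + m[3 + r] * x[1] + m[6 + r] * x[2] + m[9 + r] * x[3];
	y[3] = x[3];
}

int main() {
	// Arbitrary test matrices: world and view are 4x3, proj is a full 4x4.
	float world[12], view[12], proj[16];
	for (int i = 0; i < 12; i++) { world[i] = 0.1f * i - 0.4f; view[i] = 0.05f * i + 0.2f; }
	for (int i = 0; i < 16; i++) { proj[i] = (i % 5 == 0) ? 1.0f : 0.01f * i; }

	// Compose with the SIMD helpers, the same way ComputeFinalProjMatrix does.
	Mat4F32 wv = Mul4x3By4x4(Mat4x3F32(world), Mat4F32::Load4x3(view));
	Mat4F32 wvp = Mul4x4By4x4(wv, Mat4F32(proj));
	float simd[16];
	wvp.Store(simd);

	// Apply the matrices one at a time with the scalar reference, then compare.
	const float p[4] = { 1.0f, -2.0f, 3.0f, 1.0f };
	float a[4], b[4], c[4], d[4];
	Transform4x3Scalar(a, world, p);   // world
	Transform4x3Scalar(b, view, a);    // then view
	TransformScalar(c, proj, b);       // then projection
	TransformScalar(d, simd, p);       // the composed matrix, applied in one go

	for (int i = 0; i < 4; i++)
		printf("%d: scalar=%f simd=%f diff=%g\n", i, c[i], d[i], std::fabs(c[i] - d[i]));
	return 0;
}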