From c92b3b6521cc0a6976ec2a3c0bf918cab0f5e62f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?=
Date: Fri, 20 Dec 2024 08:48:16 +0100
Subject: [PATCH] Move prototype cross simd wrapper structs to CrossSIMD.h

---
 Common/Data/Convert/ColorConv.cpp |  4 +-
 Common/Math/CrossSIMD.h           | 96 +++++++++++++++++++++++++++++++
 Common/Math/SIMDHeaders.h         | 22 +++++++
 Common/Math/fast/fast_matrix.c    |  2 -
 GPU/Common/DepthRaster.cpp        | 77 -------------------------
 5 files changed, 120 insertions(+), 81 deletions(-)

diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp
index 72fac52f2f..5c4df7fca8 100644
--- a/Common/Data/Convert/ColorConv.cpp
+++ b/Common/Data/Convert/ColorConv.cpp
@@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 }
 
 #if PPSSPP_ARCH(SSE2)
-// fp64's improved version, see #19751
+// fp64's improved SSE2 version, see #19751. SSE4 is no longer required here.
 static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
 		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c1 = _mm_load_si128(&srcp[i + 1]);
 
-		__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000
+		__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
 		__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
 		__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
 		__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
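For reference, the bit layout the SSE2 path above produces can be written as a scalar per-pixel sketch. This is only an illustration, not part of the patch; the helper name is hypothetical, and it assumes the same little-endian RGBA8888 input (R in bits 0-7, A in bits 24-31) and RGBA5551 output (R in bits 0-4, A in bit 15) that ConvertRGBA8888ToRGBA5551 operates on:

#include <cstdint>

// Hypothetical scalar equivalent of one 32-bit lane of the SSE2 conversion above.
static inline uint16_t PackRGBA8888ToRGBA5551(uint32_t c) {
	uint16_t r = (c >> 3) & 0x1F;            // top 5 bits of R (input bits 3-7, kept by maskRB)
	uint16_t g = ((c >> 11) & 0x1F) << 5;    // top 5 bits of G (input bits 11-15, kept by maskGA)
	uint16_t b = ((c >> 19) & 0x1F) << 10;   // top 5 bits of B (input bits 19-23, kept by maskRB)
	uint16_t a = (uint16_t)(c >> 31) << 15;  // top bit of A (input bit 31, kept by maskGA)
	return (uint16_t)(r | g | b | a);
}
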
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 94b2d3933b..11ed217024 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -5,3 +5,99 @@
 #pragma once
 
 #include "Common/Math/SIMDHeaders.h"
+
+#if PPSSPP_ARCH(SSE2)
+
+struct Vec4S32 {
+	__m128i v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ _mm_add_epi32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ _mm_sub_epi32(v, other.v) };
+	}
+	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };  // (ab3,ab2,ab1,ab0)
+	}
+};
+
+struct Vec4F32 {
+	__m128 v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ _mm_add_ps(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ _mm_sub_ps(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ _mm_mul_ps(v, other.v) };
+	}
+};
+
+struct Vec4U16 {
+	__m128i v;  // We only use the lower 64 bits.
+	static Vec4U16 Load(const void *mem) {
+		return Vec4U16{ _mm_loadl_epi64((const __m128i *)mem) };
+	}
+	void Store(void *mem) {
+		_mm_storel_epi64((__m128i *)mem, v);
+	}
+	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
+		return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
+	}
+	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
+		return Vec4U16{ _mm_min_epu16_SSE2(a.v, b.v) };
+	}
+	Vec4U16 CompareLT(Vec4U16 other) const {
+		return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
+	}
+};
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+struct Vec4S32 {
+	int32x4_t v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ vaddq_s32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ vsubq_s32(v, other.v) };
+	}
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ vmulq_s32(v, other.v) };
+	}
+};
+
+struct Vec4F32 {
+	float32x4_t v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ vcvtq_f32_s32(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ vaddq_f32(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ vsubq_f32(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ vmulq_f32(v, other.v) };
+	}
+};
+
+#else
+
+struct Vec4S32 {
+	s32 v[4];
+};
+
+#endif
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 8e812a7819..3f8500dfb2 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
 }
 
+// The below are not real SSE instructions in any SSE generation, but arguably should have been; they're emulated here.
+
+// Return 0xFFFF where x <= y, else 0x0000.
+inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
+	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
+}
+
+// Return 0xFFFF where x >= y, else 0x0000.
+inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
+	return _mm_cmple_epu16(y, x);
+}
+
+// Return 0xFFFF where x > y, else 0x0000.
+inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
+	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
+}
+
+// Return 0xFFFF where x < y, else 0x0000.
+inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
+	return _mm_cmpgt_epu16(y, x);
+}
+
 #endif
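The trick behind _mm_cmple_epu16 above is that unsigned saturating subtraction yields zero exactly when x <= y in a lane, and comparing against zero with the signed _mm_cmpeq_epi16 is pure bit-pattern equality, so signedness doesn't matter. A minimal self-contained sketch to check the emulation against scalar comparisons; the local name cmple_epu16 just repeats the wrapper above so the snippet builds on its own, and main() is illustration only:

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

// Repeats the emulated compare from SIMDHeaders.h: unsigned saturating
// subtract is 0 in a 16-bit lane exactly when x <= y in that lane.
static inline __m128i cmple_epu16(__m128i x, __m128i y) {
	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}

int main() {
	const uint16_t xs[8] = { 0, 1, 0x8000, 0xFFFF, 7, 7, 0x1234, 0xFFFE };
	const uint16_t ys[8] = { 0, 0, 0x7FFF, 0xFFFF, 8, 6, 0x1234, 0xFFFF };
	__m128i x = _mm_loadu_si128((const __m128i *)xs);
	__m128i y = _mm_loadu_si128((const __m128i *)ys);
	uint16_t lanes[8];
	_mm_storeu_si128((__m128i *)lanes, cmple_epu16(x, y));
	for (int i = 0; i < 8; i++) {
		uint16_t expected = xs[i] <= ys[i] ? 0xFFFF : 0x0000;
		printf("%04X <= %04X -> %04X (expected %04X)\n", xs[i], ys[i], lanes[i], expected);
	}
	return 0;
}
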
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index 0402f36629..d23ce3b0e0 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -6,8 +6,6 @@
 
 #if PPSSPP_ARCH(SSE2)
 
-#include "fast_matrix.h"
-
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 2bb83831bb..8068df066f 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -8,83 +8,6 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
-#if PPSSPP_ARCH(SSE2)
-
-struct Vec4S32 {
-	__m128i v;
-
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ _mm_add_epi32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ _mm_sub_epi32(v, other.v) };
-	}
-	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };  // (ab3,ab2,ab1,ab0)
-	}
-};
-
-struct Vec4F32 {
-	__m128 v;
-
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
-
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ _mm_add_ps(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ _mm_sub_ps(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ _mm_mul_ps(v, other.v) };
-	}
-};
-
-#elif PPSSPP_ARCH(ARM_NEON)
-
-struct Vec4S32 {
-	uint32x4_t v;
-
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ vaddq_s32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ vsubq_s32(v, other.v) };
-	}
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ vmulq_s32(v, other.v) };
-	}
-};
-
-struct Vec4F32 {
-	float32x4_t v;
-
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
-
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ vaddq_f32(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ vsubq_f32(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ vmulq_f32(v, other.v) };
-	}
-};
-
-#else
-
-struct Vec4S32 {
-	s32 v[4];
-};
-
-#endif
-
 struct ScreenVert {
 	int x;
 	int y;
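To show what the move buys, here is a minimal sketch of code written once against the wrappers now living in CrossSIMD.h, compiling to SSE2 or NEON depending on the target. The helper names are hypothetical and not part of the patch, and the sketch assumes an SSE2 or NEON build, since the scalar #else fallback is still a stub without operators:

#include "Common/Math/CrossSIMD.h"

// Hypothetical helpers using only the operations the patch defines.
static Vec4F32 MulAdd(Vec4F32 a, Vec4F32 b, Vec4F32 c) {
	return a * b + c;  // maps to _mm_mul_ps/_mm_add_ps on SSE2, vmulq_f32/vaddq_f32 on NEON
}

// E.g. converting fixed-point integer coordinates to scaled floats.
static Vec4F32 FixedToFloat(Vec4S32 fixed, Vec4F32 scale) {
	return Vec4F32::FromVec4S32(fixed) * scale;
}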