diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp
index 5404e89bca..7e9009c4c2 100644
--- a/Common/Data/Convert/ColorConv.cpp
+++ b/Common/Data/Convert/ColorConv.cpp
@@ -25,9 +25,6 @@
 
 #ifdef _M_SSE
 #include <emmintrin.h>
-#endif
-
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
 #endif
 
@@ -181,19 +178,15 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 	}
 }
 
-void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline void ConvertRGBA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
-	const __m128i *srcp = (const __m128i *)src;
-	__m128i *dstp = (__m128i *)dst;
-	u32 sseChunks = (numPixels / 4) & ~1;
-	// SSE 4.1 required for _mm_packus_epi32.
-	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
-		sseChunks = 0;
-	}
 	for (u32 i = 0; i < sseChunks; i += 2) {
 		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
@@ -213,6 +206,21 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
 
 		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
 	}
+}
+#endif
+
+void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
+#if defined(_M_SSE)
+	const __m128i *srcp = (const __m128i *)src;
+	__m128i *dstp = (__m128i *)dst;
+	u32 sseChunks = (numPixels / 4) & ~1;
+	// SSE 4.1 required for _mm_packus_epi32.
+	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
+		sseChunks = 0;
+	} else {
+		ConvertRGBA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);
+	}
+	// The remainder starts right after those done via SSE.
 
 	u32 i = sseChunks * 4;
 #else
@@ -223,19 +231,15 @@ void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
 	}
 }
 
-void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline void ConvertBGRA8888ToRGBA5551_SSE4(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i mask = _mm_set1_epi32(0x0000FFFF);
 
-	const __m128i *srcp = (const __m128i *)src;
-	__m128i *dstp = (__m128i *)dst;
-	u32 sseChunks = (numPixels / 4) & ~1;
-	// SSE 4.1 required for _mm_packus_epi32.
-	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
-		sseChunks = 0;
-	}
 	for (u32 i = 0; i < sseChunks; i += 2) {
 		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
@@ -255,6 +259,21 @@ void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
 
 		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
 	}
+}
+#endif
+
+void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
+#if defined(_M_SSE)
+	const __m128i *srcp = (const __m128i *)src;
+	__m128i *dstp = (__m128i *)dst;
+	u32 sseChunks = (numPixels / 4) & ~1;
+	// SSE 4.1 required for _mm_packus_epi32.
+	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) {
+		sseChunks = 0;
+	} else {
+		ConvertBGRA8888ToRGBA5551_SSE4(dstp, srcp, sseChunks);
+	}
+	// The remainder starts right after those done via SSE.
 
 	u32 i = sseChunks * 4;
 #else
diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index ed3be14836..bf02d2ae2d 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -29,9 +29,7 @@
 
 #ifdef _M_SSE
 #include <emmintrin.h>
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
-#endif
 
 u32 QuickTexHashSSE2(const void *checkp, u32 size) {
 	u32 check = 0;
diff --git a/GPU/Common/TextureScalerCommon.cpp b/GPU/Common/TextureScalerCommon.cpp
index b9ee275732..6f69a360e4 100644
--- a/GPU/Common/TextureScalerCommon.cpp
+++ b/GPU/Common/TextureScalerCommon.cpp
@@ -31,7 +31,8 @@
 #include "Common/CPUDetect.h"
 #include "ext/xbrz/xbrz.h"
 
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
+#include <emmintrin.h>
 #include <smmintrin.h>
 #endif
 
@@ -281,9 +282,12 @@ void scaleBicubicT(u32* data, u32* out, int w, int h, int l, int u) {
 		}
 	}
 }
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 template <int f, int T>
-void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
 	int outw = w*f;
 	for (int yb = 0; yb < (u - l)*f / BLOCK_SIZE + 1; ++yb) {
 		for (int xb = 0; xb < w*f / BLOCK_SIZE + 1; ++xb) {
@@ -321,7 +325,7 @@ void scaleBicubicTSSE41(u32* data, u32* out, int w, int h, int l, int u) {
 #endif
 
 void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, int u) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	if (cpu_info.bSSE4_1) {
 		switch (factor) {
 		case 2: scaleBicubicTSSE41<2, 0>(data, out, w, h, l, u); break; // when I first tested this,
@@ -339,13 +343,13 @@ void scaleBicubicBSpline(int factor, u32* data, u32* out, int w, int h, int l, i
 		case 5: scaleBicubicT<5, 0>(data, out, w, h, l, u); break; // any of these break statements
 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 		}
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	}
 #endif
 }
 
 void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l, int u) {
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	if (cpu_info.bSSE4_1) {
 		switch (factor) {
 		case 2: scaleBicubicTSSE41<2, 1>(data, out, w, h, l, u); break;
@@ -363,7 +367,7 @@ void scaleBicubicMitchell(int factor, u32* data, u32* out, int w, int h, int l,
 		case 5: scaleBicubicT<5, 1>(data, out, w, h, l, u); break;
 		default: ERROR_LOG(G3D, "Bicubic upsampling only implemented for factors 2 to 5");
 		}
-#if _M_SSE >= 0x401
+#if defined(_M_SSE)
 	}
 #endif
 }
diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp
index 811fad4923..b968d5be15 100644
--- a/GPU/Math3D.cpp
+++ b/GPU/Math3D.cpp
@@ -114,7 +114,9 @@ __m128 SSENormalizeMultiplierSSE2(__m128 v)
 	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
 }
-#if _M_SSE >= 0x401
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
 __m128 SSENormalizeMultiplierSSE4(__m128 v)
 {
 	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
 }
@@ -126,12 +128,7 @@ __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
 		return SSENormalizeMultiplierSSE4(v);
 	return SSENormalizeMultiplierSSE2(v);
 }
-#else
-__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
-{
-	return SSENormalizeMultiplierSSE2(v);
-}
-#endif
+
 
 template<>
 Vec3<float> Vec3<float>::Normalized(bool useSSE4) const {
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index a62905fa13..adb23df20b 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -25,10 +25,8 @@
 
 #if defined(_M_SSE)
 #include <emmintrin.h>
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
 #endif
-#endif
 
 #if PPSSPP_ARCH(ARM_NEON)
 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index a8ca4ce89e..4d1df76034 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -40,9 +40,6 @@
 
 #if defined(_M_SSE)
 #include <emmintrin.h>
-#endif
-
-#if _M_SSE >= 0x401
 #include <smmintrin.h>
 #endif
 
@@ -583,6 +580,17 @@ struct TriangleEdge {
 	Vec4<int> stepY;
 };
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline __m128i SOFTRAST_CALL TriangleEdgeStartSSE4(__m128i initX, __m128i initY, int xf, int yf, int c) {
+	initX = _mm_mullo_epi32(initX, _mm_set1_epi32(xf));
+	initY = _mm_mullo_epi32(initY, _mm_set1_epi32(yf));
+	return _mm_add_epi32(_mm_add_epi32(initX, initY), _mm_set1_epi32(c));
+}
+#endif
+
 template <bool useSSE4>
 Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoords &v1, const ScreenCoords &origin) {
 	// Start at pixel centers.
@@ -597,12 +605,9 @@ Vec4<int> TriangleEdge<useSSE4>::Start(const ScreenCoords &v0, const ScreenCoord
 	stepX = Vec4<int>::AssignToAll(xf * 16 * 2);
 	stepY = Vec4<int>::AssignToAll(yf * 16 * 2);
 
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
-	if (useSSE4) {
-		initX.ivec = _mm_mullo_epi32(initX.ivec, _mm_set1_epi32(xf));
-		initY.ivec = _mm_mullo_epi32(initY.ivec, _mm_set1_epi32(yf));
-		return _mm_add_epi32(_mm_add_epi32(initX.ivec, initY.ivec), _mm_set1_epi32(c));
-	}
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+	if (useSSE4)
+		return TriangleEdgeStartSSE4(initX.ivec, initY.ivec, xf, yf, c);
 #endif
 	return Vec4<int>::AssignToAll(xf) * initX + Vec4<int>::AssignToAll(yf) * initY + Vec4<int>::AssignToAll(c);
 }
@@ -625,14 +630,23 @@ inline Vec4<int> TriangleEdge<useSSE4>::StepY(const Vec4<int> &w) {
 #endif
 }
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline int SOFTRAST_CALL MaxWeightSSE4(__m128i w) {
+	__m128i max2 = _mm_max_epi32(w, _mm_shuffle_epi32(w, _MM_SHUFFLE(3, 2, 3, 2)));
+	__m128i max1 = _mm_max_epi32(max2, _mm_shuffle_epi32(max2, _MM_SHUFFLE(1, 1, 1, 1)));
+	return _mm_cvtsi128_si32(max1);
+}
+#endif
+
 template <bool useSSE4>
 void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int64_t &rowMinX, int64_t &rowMaxX) {
 	int wmax;
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
 	if (useSSE4) {
-		__m128i max01 = _mm_max_epi32(w.ivec, _mm_shuffle_epi32(w.ivec, _MM_SHUFFLE(3, 2, 3, 2)));
-		__m128i max0 = _mm_max_epi32(max01, _mm_shuffle_epi32(max01, _MM_SHUFFLE(1, 1, 1, 1)));
-		wmax = _mm_cvtsi128_si32(max0);
+		wmax = MaxWeightSSE4(w.ivec);
 	} else {
 		wmax = std::max(std::max(w.x, w.y), std::max(w.z, w.w));
 	}
@@ -654,11 +668,20 @@ void TriangleEdge<useSSE4>::NarrowMinMaxX(const Vec4<int> &w, int64_t minX, int6
 	}
 }
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline __m128i SOFTRAST_CALL StepTimesSSE4(__m128i w, __m128i step, int c) {
+	return _mm_add_epi32(w, _mm_mullo_epi32(_mm_set1_epi32(c), step));
+}
+#endif
+
 template <bool useSSE4>
 inline Vec4<int> TriangleEdge<useSSE4>::StepXTimes(const Vec4<int> &w, int c) {
-#if defined(_M_SSE) && !PPSSPP_ARCH(X86) && _M_SSE >= 0x401
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
 	if (useSSE4)
-		return _mm_add_epi32(w.ivec, _mm_mullo_epi32(_mm_set1_epi32(c), stepX.ivec));
+		return StepTimesSSE4(w.ivec, stepX.ivec, c);
 #endif
 	return w + stepX * c;
 }
@@ -675,15 +698,22 @@ static inline Vec4<int> MakeMask(const Vec4<int> &w0, const Vec4<int> &w1, const
 #endif
 }
 
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+[[gnu::target("sse4.1")]]
+#endif
+static inline bool SOFTRAST_CALL AnyMaskSSE4(__m128i mask) {
+	__m128i sig = _mm_srai_epi32(mask, 31);
+	return _mm_test_all_ones(sig) == 0;
+}
+#endif
+
 template <bool useSSE4>
 static inline bool AnyMask(const Vec4<int> &mask) {
 #if defined(_M_SSE) && !PPSSPP_ARCH(X86)
-#if _M_SSE >= 0x401
 	if (useSSE4) {
-		__m128i sig = _mm_srai_epi32(mask.ivec, 31);
-		return _mm_test_all_ones(sig) == 0;
+		return AnyMaskSSE4(mask.ivec);
 	}
-#endif
 	// In other words: !(mask.x < 0 && mask.y < 0 && mask.z < 0 && mask.w < 0)
 	__m128i low2 = _mm_and_si128(mask.ivec, _mm_shuffle_epi32(mask.ivec, _MM_SHUFFLE(3, 2, 3, 2)));