From 587a3222070e89dc28c5f03fb2e951b1b6ec3470 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 7 Jan 2023 16:57:18 -0800 Subject: [PATCH] softgpu: Use NEON SIMD for alpha blending, etc. --- GPU/Software/DrawPixel.cpp | 59 +++++++++++++++++++++++++++- GPU/Software/RasterizerRectangle.cpp | 26 ++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp index 97e84b6807..72b6e2a91a 100644 --- a/GPU/Software/DrawPixel.cpp +++ b/GPU/Software/DrawPixel.cpp @@ -15,6 +15,7 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "ppsspp_config.h" #include #include "Common/Common.h" #include "Common/Data/Convert/ColorConv.h" @@ -416,6 +417,8 @@ static inline Vec3 GetSourceFactor(PixelBlendFactor factor, const Vec4 case PixelBlendFactor::SRCALPHA: #if defined(_M_SSE) return Vec3(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))); +#elif PPSSPP_ARCH(ARM64_NEON) + return Vec3(vdupq_laneq_s32(source.ivec, 3)); #else return Vec3::AssignToAll(source.a()); #endif @@ -423,6 +426,8 @@ static inline Vec3 GetSourceFactor(PixelBlendFactor factor, const Vec4 case PixelBlendFactor::INVSRCALPHA: #if defined(_M_SSE) return Vec3(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)))); +#elif PPSSPP_ARCH(ARM64_NEON) + return Vec3(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3))); #else return Vec3::AssignToAll(255 - source.a()); #endif @@ -469,6 +474,8 @@ static inline Vec3 GetDestFactor(PixelBlendFactor factor, const Vec4 & case PixelBlendFactor::SRCALPHA: #if defined(_M_SSE) return Vec3(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))); +#elif PPSSPP_ARCH(ARM64_NEON) + return Vec3(vdupq_laneq_s32(source.ivec, 3)); #else return Vec3::AssignToAll(source.a()); #endif @@ -476,6 +483,8 @@ static inline Vec3 GetDestFactor(PixelBlendFactor factor, const Vec4 & case PixelBlendFactor::INVSRCALPHA: #if defined(_M_SSE) return Vec3(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)))); +#elif PPSSPP_ARCH(ARM64_NEON) + return Vec3(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3))); #else return Vec3::AssignToAll(255 - source.a()); #endif @@ -533,6 +542,18 @@ static Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 const __m128i d = _mm_mulhi_epi16(drgb, df); return Vec3(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128())); +#elif PPSSPP_ARCH(ARM64_NEON) + const int32x4_t half = vdupq_n_s32(1); + + const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half); + const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half); + const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10); + + const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half); + const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half); + const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10); + + return Vec3(vaddq_s32(s, d)); #else static constexpr Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; @@ -555,6 +576,18 @@ static Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 const __m128i d = _mm_mulhi_epi16(drgb, df); return Vec3(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128())); +#elif PPSSPP_ARCH(ARM64_NEON) + const int32x4_t half = vdupq_n_s32(1); + + const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half); + const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half); + const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10); + + const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half); + const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half); + const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10); + + return Vec3(vqsubq_s32(s, d)); #else static constexpr Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; @@ -577,6 +610,18 @@ static Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 const __m128i d = _mm_mulhi_epi16(drgb, df); return Vec3(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128())); +#elif PPSSPP_ARCH(ARM64_NEON) + const int32x4_t half = vdupq_n_s32(1); + + const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half); + const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half); + const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10); + + const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half); + const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half); + const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10); + + return Vec3(vqsubq_s32(d, s)); #else static constexpr Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; @@ -586,19 +631,31 @@ static Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 } case GE_BLENDMODE_MIN: +#if PPSSPP_ARCH(ARM64_NEON) + return Vec3(vminq_s32(source.ivec, dst.ivec)); +#else return Vec3(std::min(source.r(), dst.r()), std::min(source.g(), dst.g()), std::min(source.b(), dst.b())); +#endif case GE_BLENDMODE_MAX: +#if PPSSPP_ARCH(ARM64_NEON) + return Vec3(vmaxq_s32(source.ivec, dst.ivec)); +#else return Vec3(std::max(source.r(), dst.r()), std::max(source.g(), dst.g()), std::max(source.b(), dst.b())); +#endif case GE_BLENDMODE_ABSDIFF: +#if PPSSPP_ARCH(ARM64_NEON) + return Vec3(vabdq_s32(source.ivec, dst.ivec)); +#else return Vec3(::abs(source.r() - dst.r()), ::abs(source.g() - dst.g()), ::abs(source.b() - dst.b())); +#endif default: return source.rgb(); @@ -684,7 +741,7 @@ void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg colo prim_color += Vec4::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]); } -#if defined(_M_SSE) +#if defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON) new_color = Vec3(prim_color.ivec).ToRGB(); new_color |= stencil << 24; #else diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp index acb2208c06..3ebff77cb3 100644 --- a/GPU/Software/RasterizerRectangle.cpp +++ b/GPU/Software/RasterizerRectangle.cpp @@ -1,5 +1,6 @@ // See comment in header for the purpose of the code in this file. +#include "ppsspp_config.h" #include #include @@ -25,6 +26,14 @@ #include #endif +#if PPSSPP_ARCH(ARM_NEON) +#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64) +#include +#else +#include +#endif +#endif + extern DSStretch g_DarkStalkerStretch; // For Darkstalkers hack. Ugh. extern bool currentDialogActive; @@ -57,6 +66,23 @@ static uint32_t StandardAlphaBlend(uint32_t source, uint32_t dst) { const __m128i blended16 = _mm_adds_epi16(s, d); return _mm_cvtsi128_si32(_mm_packus_epi16(blended16, blended16)); +#elif PPSSPP_ARCH(ARM64_NEON) + uint16x4_t sf = vdup_n_u16((source >> 24) * 2 + 1); + uint16x4_t df = vdup_n_u16((255 - (source >> 24)) * 2 + 1); + + // Convert both to 16-bit, double, and add the half before even going to 32 bit. + uint16x8_t sd_c16 = vmovl_u8(vcreate_u8((uint64_t)source | ((uint64_t)dst << 32))); + sd_c16 = vaddq_u16(vshlq_n_u16(sd_c16, 1), vdupq_n_u16(1)); + + uint16x4_t srgb = vget_low_u16(sd_c16); + uint16x4_t drgb = vget_high_u16(sd_c16); + + uint16x4_t s = vshrn_n_u32(vmull_u16(srgb, sf), 10); + uint16x4_t d = vshrn_n_u32(vmull_u16(drgb, df), 10); + + uint16x4_t blended = vset_lane_s16(0, vadd_u16(s, d), 3); + uint8x8_t blended8 = vqmovn_u16(vcombine_u16(blended, blended)); + return vget_lane_u32(vreinterpret_u32_u8(blended8), 0); #else Vec3 srcfactor = Vec3::AssignToAll(source >> 24); Vec3 dstfactor = Vec3::AssignToAll(255 - (source >> 24));