mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
softgpu: Use NEON SIMD for alpha blending, etc.
This commit is contained in:
parent
b55dbdab7f
commit
587a322207
2 changed files with 84 additions and 1 deletions
|
@ -15,6 +15,7 @@
|
|||
// Official git repository and contact information can be found at
|
||||
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
|
||||
|
||||
#include "ppsspp_config.h"
|
||||
#include <mutex>
|
||||
#include "Common/Common.h"
|
||||
#include "Common/Data/Convert/ColorConv.h"
|
||||
|
@ -416,6 +417,8 @@ static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int>
|
|||
case PixelBlendFactor::SRCALPHA:
|
||||
#if defined(_M_SSE)
|
||||
return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
|
||||
#else
|
||||
return Vec3<int>::AssignToAll(source.a());
|
||||
#endif
|
||||
|
@ -423,6 +426,8 @@ static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int>
|
|||
case PixelBlendFactor::INVSRCALPHA:
|
||||
#if defined(_M_SSE)
|
||||
return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
|
||||
#else
|
||||
return Vec3<int>::AssignToAll(255 - source.a());
|
||||
#endif
|
||||
|
@ -469,6 +474,8 @@ static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &
|
|||
case PixelBlendFactor::SRCALPHA:
|
||||
#if defined(_M_SSE)
|
||||
return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
|
||||
#else
|
||||
return Vec3<int>::AssignToAll(source.a());
|
||||
#endif
|
||||
|
@ -476,6 +483,8 @@ static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &
|
|||
case PixelBlendFactor::INVSRCALPHA:
|
||||
#if defined(_M_SSE)
|
||||
return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
|
||||
#else
|
||||
return Vec3<int>::AssignToAll(255 - source.a());
|
||||
#endif
|
||||
|
@ -533,6 +542,18 @@ static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int>
|
|||
const __m128i d = _mm_mulhi_epi16(drgb, df);
|
||||
|
||||
return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
const int32x4_t half = vdupq_n_s32(1);
|
||||
|
||||
const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
|
||||
const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
|
||||
const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);
|
||||
|
||||
const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
|
||||
const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
|
||||
const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);
|
||||
|
||||
return Vec3<int>(vaddq_s32(s, d));
|
||||
#else
|
||||
static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
|
||||
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
|
||||
|
@ -555,6 +576,18 @@ static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int>
|
|||
const __m128i d = _mm_mulhi_epi16(drgb, df);
|
||||
|
||||
return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
const int32x4_t half = vdupq_n_s32(1);
|
||||
|
||||
const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
|
||||
const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
|
||||
const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);
|
||||
|
||||
const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
|
||||
const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
|
||||
const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);
|
||||
|
||||
return Vec3<int>(vqsubq_s32(s, d));
|
||||
#else
|
||||
static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
|
||||
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
|
||||
|
@ -577,6 +610,18 @@ static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int>
|
|||
const __m128i d = _mm_mulhi_epi16(drgb, df);
|
||||
|
||||
return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
const int32x4_t half = vdupq_n_s32(1);
|
||||
|
||||
const int32x4_t srgb = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
|
||||
const int32x4_t sf = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
|
||||
const int32x4_t s = vshrq_n_s32(vmulq_s32(srgb, sf), 10);
|
||||
|
||||
const int32x4_t drgb = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
|
||||
const int32x4_t df = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
|
||||
const int32x4_t d = vshrq_n_s32(vmulq_s32(drgb, df), 10);
|
||||
|
||||
return Vec3<int>(vqsubq_s32(d, s));
|
||||
#else
|
||||
static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
|
||||
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
|
||||
|
@ -586,19 +631,31 @@ static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int>
|
|||
}
|
||||
|
||||
case GE_BLENDMODE_MIN:
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vminq_s32(source.ivec, dst.ivec));
|
||||
#else
|
||||
return Vec3<int>(std::min(source.r(), dst.r()),
|
||||
std::min(source.g(), dst.g()),
|
||||
std::min(source.b(), dst.b()));
|
||||
#endif
|
||||
|
||||
case GE_BLENDMODE_MAX:
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vmaxq_s32(source.ivec, dst.ivec));
|
||||
#else
|
||||
return Vec3<int>(std::max(source.r(), dst.r()),
|
||||
std::max(source.g(), dst.g()),
|
||||
std::max(source.b(), dst.b()));
|
||||
#endif
|
||||
|
||||
case GE_BLENDMODE_ABSDIFF:
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
return Vec3<int>(vabdq_s32(source.ivec, dst.ivec));
|
||||
#else
|
||||
return Vec3<int>(::abs(source.r() - dst.r()),
|
||||
::abs(source.g() - dst.g()),
|
||||
::abs(source.b() - dst.b()));
|
||||
#endif
|
||||
|
||||
default:
|
||||
return source.rgb();
|
||||
|
@ -684,7 +741,7 @@ void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg colo
|
|||
prim_color += Vec4<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
|
||||
}
|
||||
|
||||
#if defined(_M_SSE)
|
||||
#if defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)
|
||||
new_color = Vec3<int>(prim_color.ivec).ToRGB();
|
||||
new_color |= stencil << 24;
|
||||
#else
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
// See comment in header for the purpose of the code in this file.
|
||||
|
||||
#include "ppsspp_config.h"
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
|
@ -25,6 +26,14 @@
|
|||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#if PPSSPP_ARCH(ARM_NEON)
|
||||
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
|
||||
#include <arm64_neon.h>
|
||||
#else
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#endif
|
||||
|
||||
extern DSStretch g_DarkStalkerStretch;
|
||||
// For Darkstalkers hack. Ugh.
|
||||
extern bool currentDialogActive;
|
||||
|
@ -57,6 +66,23 @@ static uint32_t StandardAlphaBlend(uint32_t source, uint32_t dst) {
|
|||
|
||||
const __m128i blended16 = _mm_adds_epi16(s, d);
|
||||
return _mm_cvtsi128_si32(_mm_packus_epi16(blended16, blended16));
|
||||
#elif PPSSPP_ARCH(ARM64_NEON)
|
||||
uint16x4_t sf = vdup_n_u16((source >> 24) * 2 + 1);
|
||||
uint16x4_t df = vdup_n_u16((255 - (source >> 24)) * 2 + 1);
|
||||
|
||||
// Convert both to 16-bit, double, and add the half before even going to 32 bit.
|
||||
uint16x8_t sd_c16 = vmovl_u8(vcreate_u8((uint64_t)source | ((uint64_t)dst << 32)));
|
||||
sd_c16 = vaddq_u16(vshlq_n_u16(sd_c16, 1), vdupq_n_u16(1));
|
||||
|
||||
uint16x4_t srgb = vget_low_u16(sd_c16);
|
||||
uint16x4_t drgb = vget_high_u16(sd_c16);
|
||||
|
||||
uint16x4_t s = vshrn_n_u32(vmull_u16(srgb, sf), 10);
|
||||
uint16x4_t d = vshrn_n_u32(vmull_u16(drgb, df), 10);
|
||||
|
||||
uint16x4_t blended = vset_lane_s16(0, vadd_u16(s, d), 3);
|
||||
uint8x8_t blended8 = vqmovn_u16(vcombine_u16(blended, blended));
|
||||
return vget_lane_u32(vreinterpret_u32_u8(blended8), 0);
|
||||
#else
|
||||
Vec3<int> srcfactor = Vec3<int>::AssignToAll(source >> 24);
|
||||
Vec3<int> dstfactor = Vec3<int>::AssignToAll(255 - (source >> 24));
|
||||
|
|
Loading…
Add table
Reference in a new issue