// ppsspp/Common/Math/SIMDHeaders.h
#pragma once
// SIMD headers
// Let's include these in one consistent way across the code base.
// Here we'll also add wrappers that paper over differences between different versions
// of an instruction set, like NEON vs ASIMD (64-bit).
#include "ppsspp_config.h"
#include "stdint.h"
#ifdef __clang__
// Weird how you can't just use #pragma in a macro.
#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)")
#else
#define DO_NOT_VECTORIZE_LOOP
#endif
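
// Illustrative sketch (not part of the original header): the pragma applies to the
// loop that immediately follows the macro, e.g. to keep a trivial loop scalar when
// auto-vectorization is known to hurt.
static inline void ExampleScalarCopy_Sketch(uint32_t *dst, const uint32_t *src, int count) {
	DO_NOT_VECTORIZE_LOOP
	for (int i = 0; i < count; i++)
		dst[i] = src[i];
}
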
#if PPSSPP_ARCH(SSE2)
#include <emmintrin.h>
#endif
#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
// Basic types
#if PPSSPP_ARCH(ARM64_NEON)
// No special ones here.
#elif PPSSPP_ARCH(ARM_NEON)
// Compatibility wrappers making ARM64 NEON code run on ARM32
// With optimization on, these should compile down to the optimal code.
static inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
	switch (lane & 3) {
	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
	default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
	}
}

static inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
	switch (lane & 3) {
	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
	default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
	}
}
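
// Note: ARM32 NEON has no lane-indexed fused multiply-add intrinsic, so the alias
// below falls back to the non-fused VMLA; results can differ from true FMA in the
// last bit.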
#define vfmaq_laneq_f32 vmlaq_laneq_f32
static inline uint32x4_t vcgezq_f32(float32x4_t v) {
	return vcgeq_f32(v, vdupq_n_f32(0.0f));
}
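
// Illustrative sketch (not part of the original header): with the wrappers above,
// lane-indexed ARM64-style NEON code compiles unchanged on ARM32, e.g. a
// column-major 4x4 matrix times vector, accumulating one column per lane.
static inline float32x4_t Mat4ByVec4_ExampleSketch(const float32x4_t cols[4], float32x4_t v) {
	float32x4_t r = vmulq_laneq_f32(cols[0], v, 0);
	r = vmlaq_laneq_f32(r, cols[1], v, 1);
	r = vmlaq_laneq_f32(r, cols[2], v, 2);
	r = vmlaq_laneq_f32(r, cols[3], v, 3);
	return r;
}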
#endif
#if PPSSPP_ARCH(SSE2)
#if defined __SSE4_2__
# define _M_SSE 0x402
#elif defined __SSE4_1__
# define _M_SSE 0x401
#elif defined __SSSE3__
# define _M_SSE 0x301
#elif defined __SSE3__
# define _M_SSE 0x300
#elif defined __SSE2__
# define _M_SSE 0x200
#elif !defined(__GNUC__) && (defined(_M_X64) || defined(_M_IX86))
# define _M_SSE 0x402
#endif
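
// Code elsewhere can then gate newer instruction paths on this value, e.g.
// (illustrative, not from this file):
//   #if _M_SSE >= 0x401
//   ... use SSE4.1 intrinsics ...
//   #endif
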
// These are SSE2 versions of SSE4.1 instructions, for compatibility and ease of
// writing code.
// We may later figure out how to use the native versions automatically, depending on compile flags.
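// The 32-bit multiply below is built from _mm_mul_epu32, which only multiplies the
// even (0 and 2) lanes: shuffling the odd lanes into even positions and multiplying
// again covers lanes 1 and 3, after which the low 32 bits of each product are
// interleaved back into order. The low halves are correct for signed values too.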
inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
	__m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
	return _mm_unpacklo_epi64(prod01, prod23);
}
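
// SSE2 only has signed 16-bit min/max; XORing both inputs with 0x8000 shifts the
// unsigned range onto the signed range while preserving order, so the signed op can
// be used and the bias XORed back out of the result.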
inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
	return _mm_xor_si128(
		_mm_max_epi16(
			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
		_mm_set1_epi16((int16_t)0x8000));
}

inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
	return _mm_xor_si128(
		_mm_min_epi16(
			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
		_mm_set1_epi16((int16_t)0x8000));
}
// SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.
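// The word shuffles move the low 16 bits of each 32-bit lane into adjacent slots;
// the packed result ends up in the low 64 bits of the return value and the upper
// half is don't-care.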
inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
	__m128i temp = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 temp2 = _mm_castsi128_ps(_mm_shufflehi_epi16(temp, _MM_SHUFFLE(3, 3, 2, 0)));
	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
}
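
// Broadcasts lane l of v to all four lanes; l must be a compile-time constant.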
#define _mm_splat_lane_ps(v, l) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(l, l, l, l))
#ifdef __cplusplus
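// 0x00008000 is the 16-bit sign bit expressed as a bias per 32-bit lane, and
// 0x80008000 is the same bit per 16-bit lane; _mm_packu1_epi32_SSE2 below uses them
// to shift values into signed range for the pack and back out afterwards.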
alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
alignas(16) static const uint32_t g_sign16[4] = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 };
// Alternate solution to the above, not sure if faster or slower.
// SSE2 replacement for half of _mm_packus_epi32 but without the saturation.
// Not ideal! pshufb would make this faster but that's SSSE3.
inline __m128i _mm_packu1_epi32_SSE2(const __m128i v0) {
	// Toggle the sign bit, pack, then toggle back.
	__m128i toggled = _mm_sub_epi32(v0, _mm_load_si128((const __m128i *)g_sign32));
	__m128i temp = _mm_packs_epi32(toggled, toggled);
	__m128i restored = _mm_add_epi16(temp, _mm_load_si128((const __m128i *)g_sign16));
	return restored;
}
#endif
// SSE2 replacement for the entire _mm_packus_epi32 but without the saturation.
// Not ideal! pshufb would make this faster but that's SSSE3.
inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
	__m128i a0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 packed0 = _mm_castsi128_ps(_mm_shufflehi_epi16(a0, _MM_SHUFFLE(3, 3, 2, 0)));
	__m128i a1 = _mm_shufflelo_epi16(v1, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 packed1 = _mm_castsi128_ps(_mm_shufflehi_epi16(a1, _MM_SHUFFLE(3, 3, 2, 0)));
	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
}
// The below are not real SSE instructions in any generation, but should exist.
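// The <= form relies on _mm_subs_epu16 saturating at zero: (x - y) saturates to 0
// exactly in the lanes where x <= y, so comparing that result against zero gives
// the mask.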
// Return 0xFFFF where x <= y, else 0x0000.
inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}

// Return 0xFFFF where x >= y, else 0x0000.
inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
	return _mm_cmple_epu16(y, x);
}

// Return 0xFFFF where x > y, else 0x0000.
inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
}

// Return 0xFFFF where x < y, else 0x0000.
inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
	return _mm_cmpgt_epu16(y, x);
}
#endif