Move prototype cross simd wrapper structs to CrossSIMD.h

Henrik Rydgård 2024-12-20 08:48:16 +01:00
parent 72c954d8c3
commit c92b3b6521
5 changed files with 120 additions and 81 deletions

View file

@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
}
#if PPSSPP_ARCH(SSE2)
// fp64's improved version, see #19751
// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
__m128i c0 = _mm_load_si128(&srcp[i + 0]);
__m128i c1 = _mm_load_si128(&srcp[i + 1]);
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
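For reference, the same packing written as a plain scalar helper (a sketch for illustration, not part of the commit; PSP-style RGBA5551 keeps R in the low 5 bits and A in bit 15):

static inline u16 ConvertRGBA8888ToRGBA5551Scalar(u32 c) {
    u16 r = (c >> 3) & 0x1F;   // top 5 bits of the R byte
    u16 g = (c >> 11) & 0x1F;  // top 5 bits of the G byte
    u16 b = (c >> 19) & 0x1F;  // top 5 bits of the B byte
    u16 a = (c >> 31) & 0x1;   // top bit of the A byte
    return (u16)(r | (g << 5) | (b << 10) | (a << 15));
}

The SSE2 version does the same thing several pixels at a time: maskRB and maskGA isolate those top bits per 32-bit lane, and the rest of the function merges them into 16-bit lanes.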

View file

@ -5,3 +5,99 @@
#pragma once
#include "Common/Math/SIMDHeaders.h"
#if PPSSPP_ARCH(SSE2)
struct Vec4S32 {
__m128i v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ _mm_add_epi32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ _mm_sub_epi32(v, other.v) };
}
// NOTE: When not compiling with SSE4 support, this falls back to a slow SSE2 emulation.
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
}
};
struct Vec4F32 {
__m128 v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ _mm_cvtepi32_ps(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ _mm_add_ps(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ _mm_sub_ps(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ _mm_mul_ps(v, other.v) };
}
};
struct Vec4U16 {
__m128i v; // we only use the lower 64 bits.
static Vec4U16 Load(void *mem) {
return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) };
}
void Store(void *mem) {
_mm_storel_epi64((__m128i *)mem, v);
}
static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
}
static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
return Vec4U16{ _mm_min_epu16_SSE2(a.v, b.v) };
}
Vec4U16 CompareLT(Vec4U16 other) const {
return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
}
};
#elif PPSSPP_ARCH(ARM_NEON)
struct Vec4S32 {
int32x4_t v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ vaddq_s32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ vsubq_s32(v, other.v) };
}
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ vmulq_s32(v, other.v) };
}
};
struct Vec4F32 {
float32x4_t v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ vcvtq_f32_s32(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ vaddq_f32(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ vsubq_f32(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ vmulq_f32(v, other.v) };
}
};
#else
struct Vec4S32 {
s32 v[4];
};
#endif
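A quick usage sketch of the new wrappers (hypothetical code, not in the commit; construction goes through the public .v member, since this prototype only defines Load/Store for Vec4U16). Shown for the SSE2 path:

#if PPSSPP_ARCH(SSE2)
Vec4S32 a{ _mm_set_epi32(4, 3, 2, 1) };
Vec4S32 b{ _mm_set_epi32(8, 7, 6, 5) };
Vec4S32 sum = a + b;                    // no raw intrinsics at the call site
Vec4F32 f = Vec4F32::FromVec4S32(sum);  // convert lanes to float
Vec4F32 sq = f * f;                     // _mm_mul_ps under the hood
#endif

The three arithmetic lines are what the wrappers buy: they compile unchanged on NEON; only the construction here is arch-specific.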

View file

@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
}
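Vec4S32::operator* in CrossSIMD.h leans on _mm_mullo_epi32_SSE2, presumably defined earlier in this header. For context, the usual SSE2 emulation of a per-lane 32-bit multiply looks like this (a sketch of the standard technique, not necessarily this header's exact body):

static inline __m128i mullo_epi32_sse2_sketch(__m128i a, __m128i b) {
    __m128i even = _mm_mul_epu32(a, b);  // 64-bit products of lanes 0 and 2
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));  // lanes 1 and 3
    // Keep the low 32 bits of each product and re-interleave into lane order 0,1,2,3.
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                              _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0)));
}

The low 32 bits of a product are the same for signed and unsigned inputs, which is why _mm_mul_epu32 works here for s32 lanes.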
// None of the below are real SSE instructions in any generation, but they arguably should have been; they're emulated here with SSE2.
// Return 0xFFFF where x <= y, else 0x0000.
inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}
// Return 0xFFFF where x >= y, else 0x0000.
inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
return _mm_cmple_epu16(y, x);
}
// Return 0xFFFF where x > y, else 0x0000.
inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
}
// Return 0xFFFF where x < y, else 0x0000.
inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
return _mm_cmpgt_epu16(y, x);
}
#endif
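The trick behind all four compares: _mm_subs_epu16 is a saturating subtract, so x - y clamps to zero exactly when x <= y in an unsigned lane. A scalar model of one lane (illustration only, not part of the commit):

static inline u16 CmpLE_U16_Scalar(u16 x, u16 y) {
    u16 diff = (x > y) ? (u16)(x - y) : 0;  // models _mm_subs_epu16
    return (diff == 0) ? 0xFFFF : 0x0000;   // models _mm_cmpeq_epi16 against zero
}

Vec4U16::CompareLT in CrossSIMD.h is built directly on this chain via _mm_cmplt_epu16.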

View file

@ -6,8 +6,6 @@
#if PPSSPP_ARCH(SSE2)
#include "fast_matrix.h"
void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
int i;
__m128 a_col_1 = _mm_loadu_ps(a);
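For context, fast_matrix_mul_4x4_sse follows the standard column-major pattern: each output column is a linear combination of a's columns, weighted by the scalars of b's corresponding column. A self-contained sketch of that pattern (an assumption about the layout; not the verbatim function body, in which the removed int i; was evidently unused):

#include <xmmintrin.h>

static void Mat4MulSSE_Sketch(float *dest, const float *a, const float *b) {
    __m128 a_col_1 = _mm_loadu_ps(a);       // columns of a
    __m128 a_col_2 = _mm_loadu_ps(a + 4);
    __m128 a_col_3 = _mm_loadu_ps(a + 8);
    __m128 a_col_4 = _mm_loadu_ps(a + 12);
    for (int j = 0; j < 4; j++) {
        __m128 r = _mm_mul_ps(a_col_1, _mm_set1_ps(b[j * 4 + 0]));
        r = _mm_add_ps(r, _mm_mul_ps(a_col_2, _mm_set1_ps(b[j * 4 + 1])));
        r = _mm_add_ps(r, _mm_mul_ps(a_col_3, _mm_set1_ps(b[j * 4 + 2])));
        r = _mm_add_ps(r, _mm_mul_ps(a_col_4, _mm_set1_ps(b[j * 4 + 3])));
        _mm_storeu_ps(dest + j * 4, r);
    }
}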

View file

@ -8,83 +8,6 @@
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"
#if PPSSPP_ARCH(SSE2)
struct Vec4S32 {
__m128i v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ _mm_add_epi32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ _mm_sub_epi32(v, other.v) };
}
// NOTE: When not compiling with SSE4 support, this falls back to a slow SSE2 emulation.
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
}
};
struct Vec4F32 {
__m128 v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ _mm_cvtepi32_ps(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ _mm_add_ps(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ _mm_sub_ps(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ _mm_mul_ps(v, other.v) };
}
};
#elif PPSSPP_ARCH(ARM_NEON)
struct Vec4S32 {
int32x4_t v;
Vec4S32 operator +(Vec4S32 other) const {
return Vec4S32{ vaddq_s32(v, other.v) };
}
Vec4S32 operator -(Vec4S32 other) const {
return Vec4S32{ vsubq_s32(v, other.v) };
}
Vec4S32 operator *(Vec4S32 other) const {
return Vec4S32{ vmulq_s32(v, other.v) };
}
};
struct Vec4F32 {
float32x4_t v;
static Vec4F32 FromVec4S32(Vec4S32 other) {
return Vec4F32{ vcvtq_f32_s32(other.v) };
}
Vec4F32 operator +(Vec4F32 other) const {
return Vec4F32{ vaddq_f32(v, other.v) };
}
Vec4F32 operator -(Vec4F32 other) const {
return Vec4F32{ vsubq_f32(v, other.v) };
}
Vec4F32 operator *(Vec4F32 other) const {
return Vec4F32{ vmulq_f32(v, other.v) };
}
};
#else
struct Vec4S32 {
s32 v[4];
};
#endif
struct ScreenVert {
int x;
int y;