mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Move prototype cross simd wrapper structs to CrossSIMD.h
This commit is contained in:
parent
72c954d8c3
commit
c92b3b6521
5 changed files with 120 additions and 81 deletions
|
@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
|
|||
}
|
||||
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
// fp64's improved version, see #19751
|
||||
// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
|
||||
static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
|
||||
const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
|
||||
const __m128i maskGA = _mm_set1_epi32(0x8000F800);
|
||||
|
@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
|
|||
__m128i c0 = _mm_load_si128(&srcp[i + 0]);
|
||||
__m128i c1 = _mm_load_si128(&srcp[i + 1]);
|
||||
|
||||
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000
|
||||
__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
|
||||
__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
|
||||
__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
|
||||
__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
|
||||
|
|
|
@ -5,3 +5,99 @@
|
|||
#pragma once
|
||||
|
||||
#include "Common/Math/SIMDHeaders.h"
|
||||
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
|
||||
struct Vec4S32 {
|
||||
__m128i v;
|
||||
|
||||
Vec4S32 operator +(Vec4S32 other) const {
|
||||
return Vec4S32{ _mm_add_epi32(v, other.v) };
|
||||
}
|
||||
Vec4S32 operator -(Vec4S32 other) const {
|
||||
return Vec4S32{ _mm_sub_epi32(v, other.v) };
|
||||
}
|
||||
// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
|
||||
Vec4S32 operator *(Vec4S32 other) const {
|
||||
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
|
||||
}
|
||||
};
|
||||
|
||||
struct Vec4F32 {
|
||||
__m128 v;
|
||||
|
||||
static Vec4F32 FromVec4S32(Vec4S32 other) {
|
||||
return Vec4F32{ _mm_cvtepi32_ps(other.v) };
|
||||
}
|
||||
|
||||
Vec4F32 operator +(Vec4F32 other) const {
|
||||
return Vec4F32{ _mm_add_ps(v, other.v) };
|
||||
}
|
||||
Vec4F32 operator -(Vec4F32 other) const {
|
||||
return Vec4F32{ _mm_sub_ps(v, other.v) };
|
||||
}
|
||||
Vec4F32 operator *(Vec4F32 other) const {
|
||||
return Vec4F32{ _mm_mul_ps(v, other.v) };
|
||||
}
|
||||
};
|
||||
|
||||
struct Vec4U16 {
|
||||
__m128i v; // we only use the lower 64 bits.
|
||||
static Vec4U16 Load(void *mem) {
|
||||
return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) };
|
||||
}
|
||||
void Store(void *mem) {
|
||||
_mm_storel_epi64((__m128i *)mem, v);
|
||||
}
|
||||
static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
|
||||
return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
|
||||
}
|
||||
static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
|
||||
return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
|
||||
}
|
||||
Vec4U16 CompareLT(Vec4U16 other) {
|
||||
return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
|
||||
}
|
||||
};
|
||||
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
|
||||
// Four packed signed 32-bit integer lanes (NEON backend).
struct Vec4S32 {
	int32x4_t v;

	Vec4S32 operator +(Vec4S32 other) const {
		Vec4S32 sum;
		sum.v = vaddq_s32(v, other.v);
		return sum;
	}
	Vec4S32 operator -(Vec4S32 other) const {
		Vec4S32 diff;
		diff.v = vsubq_s32(v, other.v);
		return diff;
	}
	Vec4S32 operator *(Vec4S32 other) const {
		Vec4S32 prod;
		prod.v = vmulq_s32(v, other.v);
		return prod;
	}
};
|
||||
|
||||
// Four packed single-precision float lanes (NEON backend).
struct Vec4F32 {
	float32x4_t v;

	// Lane-wise signed int -> float conversion.
	static Vec4F32 FromVec4S32(Vec4S32 other) {
		Vec4F32 result;
		result.v = vcvtq_f32_s32(other.v);
		return result;
	}

	Vec4F32 operator +(Vec4F32 other) const {
		Vec4F32 sum;
		sum.v = vaddq_f32(v, other.v);
		return sum;
	}
	Vec4F32 operator -(Vec4F32 other) const {
		Vec4F32 diff;
		diff.v = vsubq_f32(v, other.v);
		return diff;
	}
	Vec4F32 operator *(Vec4F32 other) const {
		Vec4F32 prod;
		prod.v = vmulq_f32(v, other.v);
		return prod;
	}
};
|
||||
|
||||
#else
|
||||
|
||||
struct Vec4S32 {
|
||||
s32 v[4];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
|
@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
|
|||
return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
|
||||
}
|
||||
|
||||
// The comparisons below don't map to single SSE instructions in any SSE
// generation; they're composed from SSE2 ops but behave like the "missing"
// unsigned 16-bit compares.

// Return 0xFFFF where x <= y, else 0x0000 (per unsigned 16-bit lane).
inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
	// Saturating subtract clamps x - y to 0 exactly in the lanes where x <= y.
	const __m128i sat_diff = _mm_subs_epu16(x, y);
	return _mm_cmpeq_epi16(sat_diff, _mm_setzero_si128());
}
|
||||
|
||||
// Return 0xFFFF where x >= y, else 0x0000 (per unsigned 16-bit lane).
inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
	// x >= y is the same as y <= x: y - x saturates to 0 in exactly those lanes.
	return _mm_cmpeq_epi16(_mm_subs_epu16(y, x), _mm_setzero_si128());
}
|
||||
|
||||
// Return 0xFFFF where x > y, else 0x0000 (per unsigned 16-bit lane).
inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
	// x > y  ==  (x != y) && (x >= y). andnot computes ~a & b, so mask off the
	// equal lanes from the greater-or-equal mask.
	const __m128i eq = _mm_cmpeq_epi16(x, y);
	const __m128i ge = _mm_cmpeq_epi16(_mm_subs_epu16(y, x), _mm_setzero_si128());
	return _mm_andnot_si128(eq, ge);
}
|
||||
|
||||
// Return 0xFFFF where x < y, else 0x0000 (per unsigned 16-bit lane).
inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
	// x < y  ==  (x != y) && (x <= y); x - y saturates to 0 where x <= y.
	const __m128i eq = _mm_cmpeq_epi16(x, y);
	const __m128i le = _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
	return _mm_andnot_si128(eq, le);
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -6,8 +6,6 @@
|
|||
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
|
||||
#include "fast_matrix.h"
|
||||
|
||||
void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
|
||||
int i;
|
||||
__m128 a_col_1 = _mm_loadu_ps(a);
|
||||
|
|
|
@ -8,83 +8,6 @@
|
|||
#include "Common/Math/math_util.h"
|
||||
#include "GPU/Common/VertexDecoderCommon.h"
|
||||
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
|
||||
struct Vec4S32 {
|
||||
__m128i v;
|
||||
|
||||
Vec4S32 operator +(Vec4S32 other) const {
|
||||
return Vec4S32{ _mm_add_epi32(v, other.v) };
|
||||
}
|
||||
Vec4S32 operator -(Vec4S32 other) const {
|
||||
return Vec4S32{ _mm_sub_epi32(v, other.v) };
|
||||
}
|
||||
// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
|
||||
Vec4S32 operator *(Vec4S32 other) const {
|
||||
return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0)
|
||||
}
|
||||
};
|
||||
|
||||
struct Vec4F32 {
|
||||
__m128 v;
|
||||
|
||||
static Vec4F32 FromVec4S32(Vec4S32 other) {
|
||||
return Vec4F32{ _mm_cvtepi32_ps(other.v) };
|
||||
}
|
||||
|
||||
Vec4F32 operator +(Vec4F32 other) const {
|
||||
return Vec4F32{ _mm_add_ps(v, other.v) };
|
||||
}
|
||||
Vec4F32 operator -(Vec4F32 other) const {
|
||||
return Vec4F32{ _mm_sub_ps(v, other.v) };
|
||||
}
|
||||
Vec4F32 operator *(Vec4F32 other) const {
|
||||
return Vec4F32{ _mm_mul_ps(v, other.v) };
|
||||
}
|
||||
};
|
||||
|
||||
#elif PPSSPP_ARCH(ARM_NEON)
|
||||
|
||||
// Four signed 32-bit integer lanes (NEON backend).
struct Vec4S32 {
	// BUGFIX: was declared uint32x4_t, but the signed s32 intrinsics below
	// take int32x4_t — NEON vector types are distinct, so that mismatch
	// fails to compile under strict checking.
	int32x4_t v;

	Vec4S32 operator +(Vec4S32 other) const {
		return Vec4S32{ vaddq_s32(v, other.v) };
	}
	Vec4S32 operator -(Vec4S32 other) const {
		return Vec4S32{ vsubq_s32(v, other.v) };
	}
	Vec4S32 operator *(Vec4S32 other) const {
		return Vec4S32{ vmulq_s32(v, other.v) };
	}
};
|
||||
|
||||
// Four single-precision float lanes (NEON backend).
struct Vec4F32 {
	float32x4_t v;

	// Lane-wise signed int -> float conversion.
	static Vec4F32 FromVec4S32(Vec4S32 other) {
		// BUGFIX: was _mm_cvtepi32_ps — an SSE intrinsic that doesn't exist on
		// ARM. vcvtq_f32_s32 is the NEON equivalent (expects Vec4S32::v to be
		// int32x4_t, as in the CrossSIMD.h copy of these structs).
		return Vec4F32{ vcvtq_f32_s32(other.v) };
	}

	Vec4F32 operator +(Vec4F32 other) const {
		return Vec4F32{ vaddq_f32(v, other.v) };
	}
	Vec4F32 operator -(Vec4F32 other) const {
		return Vec4F32{ vsubq_f32(v, other.v) };
	}
	Vec4F32 operator *(Vec4F32 other) const {
		return Vec4F32{ vmulq_f32(v, other.v) };
	}
};
|
||||
|
||||
#else
|
||||
|
||||
struct Vec4S32 {
|
||||
s32 v[4];
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
struct ScreenVert {
|
||||
int x;
|
||||
int y;
|
||||
|
|
Loading…
Add table
Reference in a new issue