mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
More crosssimd
This commit is contained in:
parent
de09dec9d1
commit
c3ac798545
2 changed files with 52 additions and 31 deletions
|
@ -121,12 +121,6 @@ struct Vec4S32 {
|
|||
void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
|
||||
void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
|
||||
|
||||
// Swaps the two lower elements. Useful for reversing triangles..
|
||||
Vec4S32 SwapLowerElements() {
|
||||
return Vec4S32{
|
||||
_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
|
||||
};
|
||||
}
|
||||
Vec4S32 SignBits32ToMask() {
|
||||
return Vec4S32{
|
||||
_mm_srai_epi32(v, 31)
|
||||
|
@ -144,6 +138,12 @@ struct Vec4S32 {
|
|||
return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
|
||||
}
|
||||
|
||||
// Sign-extends the low 16 bits of every 32-bit lane; the previous upper 16 bits are discarded.
Vec4S32 SignExtend16() const {
	// Push the 16-bit value to the top of the lane, then shift back arithmetically to replicate its sign bit.
	const auto shiftedUp = _mm_slli_epi32(v, 16);
	return Vec4S32{ _mm_srai_epi32(shiftedUp, 16) };
}
|
||||
// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
|
||||
Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
|
||||
// Maximum computed with the signed 16-bit instruction (each 16-bit half is compared on its own).
// The result must go through FixupAfterMinMax before it is used as 32-bit data.
Vec4S32 Max16(Vec4S32 other) const {
	return Vec4S32{ _mm_max_epi16(v, other.v) };
}
|
||||
// Turns the output of a Min16/Max16 chain back into valid 32-bit lanes by sign-extending the low 16 bits.
Vec4S32 FixupAfterMinMax() const {
	return this->SignExtend16();
}
|
||||
|
||||
// Lane-wise 32-bit integer addition.
Vec4S32 operator +(Vec4S32 other) const {
	return Vec4S32{ _mm_add_epi32(v, other.v) };
}
|
||||
// Lane-wise 32-bit integer subtraction (this minus other).
Vec4S32 operator -(Vec4S32 other) const {
	return Vec4S32{ _mm_sub_epi32(v, other.v) };
}
|
||||
// Bitwise OR of the full 128-bit registers.
Vec4S32 operator |(Vec4S32 other) const {
	return Vec4S32{ _mm_or_si128(v, other.v) };
}
|
||||
|
@ -153,6 +153,11 @@ struct Vec4S32 {
|
|||
// In-place lane-wise 32-bit integer addition.
void operator +=(Vec4S32 other) {
	v = _mm_add_epi32(v, other.v);
}
|
||||
// In-place lane-wise 32-bit integer subtraction.
void operator -=(Vec4S32 other) {
	v = _mm_sub_epi32(v, other.v);
}
|
||||
|
||||
// Shifts every 32-bit lane left by imm bits.
Vec4S32 operator <<(int imm) const {
	return Vec4S32{ _mm_slli_epi32(v, imm) };
}
|
||||
|
||||
// NOTE: May be slow.
|
||||
int operator[](size_t index) const { return ((int *)&v)[index]; }
|
||||
|
||||
// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
|
||||
Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0)
|
||||
|
||||
|
@ -217,9 +222,12 @@ struct Vec4F32 {
|
|||
// In-place lane-wise float multiplication.
void operator *=(Vec4F32 other) {
	v = _mm_mul_ps(v, other.v);
}
|
||||
// In-place lane-wise float division (this divided by other).
void operator /=(Vec4F32 other) {
	v = _mm_div_ps(v, other.v);
}
|
||||
// Multiplies every lane by the scalar f.
Vec4F32 operator *(float f) const {
	const auto scale = _mm_set1_ps(f);  // Broadcast f to all four lanes.
	return Vec4F32{ _mm_mul_ps(v, scale) };
}
|
||||
// NOTE: May be slow.
|
||||
float operator[](size_t index) const { return ((float *)&v)[index]; }
|
||||
|
||||
// Named form of scalar multiplication: scales every lane by f.
Vec4F32 Mul(float f) const {
	const auto scale = _mm_set1_ps(f);  // Broadcast f to all four lanes.
	return Vec4F32{ _mm_mul_ps(v, scale) };
}
|
||||
// Per-lane reciprocal using the rcpps hardware estimate.
// NOTE(review): a const Recip() overload that performs a true division sits right next to this one,
// so non-const callers get the estimate while const callers get the exact result - confirm this is intended.
Vec4F32 Recip() {
	return Vec4F32{ _mm_rcp_ps(v) };
}
|
||||
// Fast per-lane reciprocal: returns the rcpps hardware estimate without any refinement.
Vec4F32 RecipApprox() const {
	return Vec4F32{ _mm_rcp_ps(v) };
}
|
||||
// Per-lane reciprocal computed as an actual division, 1.0f / v.
Vec4F32 Recip() const {
	const auto ones = _mm_set1_ps(1.0f);
	return Vec4F32{ _mm_div_ps(ones, v) };
}
|
||||
|
||||
Vec4F32 Clamp(float lower, float higher) {
|
||||
return Vec4F32{
|
||||
|
@ -238,13 +246,6 @@ struct Vec4F32 {
|
|||
return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
|
||||
}
|
||||
|
||||
// Swaps the two lower elements. Useful for reversing triangles..
|
||||
Vec4F32 SwapLowerElements() {
|
||||
return Vec4F32{
|
||||
_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 2, 0, 1))
|
||||
};
|
||||
}
|
||||
|
||||
inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
|
||||
return Vec4F32{ _mm_add_ps(
|
||||
_mm_add_ps(
|
||||
|
@ -443,17 +444,18 @@ struct Vec4S32 {
|
|||
// Writes the two lowest 32-bit lanes to dst.
void Store2(int *dst) {
	const auto lowHalf = vget_low_s32(v);
	vst1_s32(dst, lowHalf);
}
|
||||
// Writes all four lanes to dst. Implemented with the regular vst1q_s32 store.
void StoreAligned(int *dst) {
	vst1q_s32(dst, v);
}
|
||||
|
||||
// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
|
||||
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
|
||||
Vec4S32 SwapLowerElements() {
|
||||
int32x2_t upper = vget_high_s32(v);
|
||||
int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
|
||||
return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
|
||||
};
|
||||
|
||||
// Warning: Unlike on x86, this is a full 32-bit multiplication.
|
||||
Vec4S32 MulAsS16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
|
||||
|
||||
// Sign-extends the low 16 bits of every 32-bit lane; the previous upper 16 bits are discarded.
Vec4S32 SignExtend16() const {
	// Push the 16-bit value to the top of the lane, then shift back (signed shift) to replicate its sign bit.
	const auto shiftedUp = vshlq_n_s32(v, 16);
	return Vec4S32{ vshrq_n_s32(shiftedUp, 16) };
}
|
||||
// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
|
||||
Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
|
||||
// Lane-wise maximum; here it is computed on full signed 32-bit lanes.
Vec4S32 Max16(Vec4S32 other) const {
	return Vec4S32{ vmaxq_s32(v, other.v) };
}
|
||||
// No fixup is applied in this implementation: the value is returned unchanged.
Vec4S32 FixupAfterMinMax() const {
	return *this;
}
|
||||
|
||||
// NOTE: May be slow.
|
||||
int operator[](size_t index) const { return ((int *)&v)[index]; }
|
||||
|
||||
// Lane-wise 32-bit integer addition.
Vec4S32 operator +(Vec4S32 other) const {
	return Vec4S32{ vaddq_s32(v, other.v) };
}
|
||||
// Lane-wise 32-bit integer subtraction (this minus other).
Vec4S32 operator -(Vec4S32 other) const {
	return Vec4S32{ vsubq_s32(v, other.v) };
}
|
||||
// Lane-wise 32-bit integer multiplication.
Vec4S32 operator *(Vec4S32 other) const {
	return Vec4S32{ vmulq_s32(v, other.v) };
}
|
||||
|
@ -508,6 +510,9 @@ struct Vec4F32 {
|
|||
return Vec4F32{ vcvtq_f32_s32(other.v) };
|
||||
}
|
||||
|
||||
// NOTE: May be slow.
|
||||
float operator[](size_t index) const { return ((float *)&v)[index]; }
|
||||
|
||||
// Lane-wise float addition.
Vec4F32 operator +(Vec4F32 other) const {
	return Vec4F32{ vaddq_f32(v, other.v) };
}
|
||||
// Lane-wise float subtraction (this minus other).
Vec4F32 operator -(Vec4F32 other) const {
	return Vec4F32{ vsubq_f32(v, other.v) };
}
|
||||
// Lane-wise float multiplication.
Vec4F32 operator *(Vec4F32 other) const {
	return Vec4F32{ vmulq_f32(v, other.v) };
}
|
||||
|
@ -521,15 +526,22 @@ struct Vec4F32 {
|
|||
|
||||
// Scales every lane by the scalar f.
Vec4F32 Mul(float f) const {
	const auto scale = vdupq_n_f32(f);  // Broadcast f to all four lanes.
	return Vec4F32{ vmulq_f32(v, scale) };
}
|
||||
|
||||
Vec4F32 Recip() {
|
||||
// Per-lane reciprocal: starts from the vrecpeq_f32 estimate and refines it with
// two Newton-Raphson steps. To save one iteration at the expense of accuracy, use RecipApprox().
Vec4F32 Recip() const {
	float32x4_t estimate = vrecpeq_f32(v);
	// Each pass multiplies the current estimate by the vrecpsq_f32 correction factor.
	for (int step = 0; step < 2; step++) {
		estimate = vmulq_f32(vrecpsq_f32(v, estimate), estimate);
	}
	return Vec4F32{ estimate };
}
|
||||
|
||||
// Cheaper per-lane reciprocal: the vrecpeq_f32 estimate plus a single refinement step,
// to approximately match the precision of x86-64's rcpps.
Vec4F32 RecipApprox() const {
	const float32x4_t estimate = vrecpeq_f32(v);
	const float32x4_t correction = vrecpsq_f32(v, estimate);
	return Vec4F32{ vmulq_f32(correction, estimate) };
}
|
||||
|
||||
Vec4F32 Clamp(float lower, float higher) {
|
||||
return Vec4F32{
|
||||
vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
|
||||
|
@ -544,13 +556,6 @@ struct Vec4F32 {
|
|||
return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
|
||||
}
|
||||
|
||||
// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
|
||||
// This is quite awkward on ARM64 :/ Maybe there's a better solution?
|
||||
Vec4F32 SwapLowerElements() {
|
||||
float32x2_t lowerSwapped = vrev64_f32(vget_low_f32(v));
|
||||
return Vec4F32{ vcombine_f32(lowerSwapped, vget_high_f32(v)) };
|
||||
};
|
||||
|
||||
// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
|
||||
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
|
||||
#if PPSSPP_ARCH(ARM64_NEON)
|
||||
|
|
|
@ -56,6 +56,7 @@
|
|||
#include "Common/Buffer.h"
|
||||
#include "Common/File/Path.h"
|
||||
#include "Common/Math/SIMDHeaders.h"
|
||||
#include "Common/Math/CrossSIMD.h"
|
||||
// Get some more instructions for testing
|
||||
#if PPSSPP_ARCH(SSE2)
|
||||
#include <immintrin.h>
|
||||
|
@ -1124,6 +1125,21 @@ bool TestSIMD() {
|
|||
EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
|
||||
EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
|
||||
#endif
|
||||
|
||||
const int testval[2][4] = {
|
||||
{ 0x1000, 0x2000, 0x3000, 0x7000 },
|
||||
{ -0x1000, -0x2000, -0x3000, -0x7000 }
|
||||
};
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
Vec4S32 s = Vec4S32::Load(testval[i]);
|
||||
Vec4S32 square = s * s;
|
||||
Vec4S32 square16 = s.Mul16(s);
|
||||
EXPECT_EQ_INT(square[0], square16[0]);
|
||||
EXPECT_EQ_INT(square[1], square16[1]);
|
||||
EXPECT_EQ_INT(square[2], square16[2]);
|
||||
EXPECT_EQ_INT(square[3], square16[3]);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue