mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Add SSE2 version of vfpu_dot
See #18249. Speedup for this function ranges 10%..100%, depending on system. Updated verification and speed measurements: https://godbolt.org/z/W1z3sj6hz
This commit is contained in:
parent
b85f7e28a9
commit
23e2d0f797
1 changed files with 173 additions and 1 deletions
|
@ -598,7 +598,8 @@ float Float16ToFloat32(unsigned short l)
|
|||
return f;
|
||||
}
|
||||
|
||||
float vfpu_dot(const float a[4], const float b[4]) {
|
||||
// Reference C++ version.
|
||||
static float vfpu_dot_cpp(const float a[4], const float b[4]) {
|
||||
static const int EXTRA_BITS = 2;
|
||||
float2int result;
|
||||
float2int src[2];
|
||||
|
@ -711,6 +712,177 @@ float vfpu_dot(const float a[4], const float b[4]) {
|
|||
return result.f;
|
||||
}
|
||||
|
||||
#if defined(__SSE2__)
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
static inline __m128i mulhi32x4(__m128i a, __m128i b) {
|
||||
__m128i m02 = _mm_mul_epu32(a, b);
|
||||
__m128i m13 = _mm_mul_epu32(
|
||||
_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
|
||||
_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
|
||||
__m128i m=_mm_unpacklo_epi32(
|
||||
_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 3, 1)),
|
||||
_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 3, 1)));
|
||||
return m;
|
||||
}
|
||||
|
||||
// Values of rounding_mode:
|
||||
// -1 - detect at runtime
|
||||
// 0 - assume round-to-nearest-ties-to-even
|
||||
// 1 - round yourself in integer math
|
||||
template<int rounding_mode=-1>
|
||||
static float vfpu_dot_sse2(const float a[4], const float b[4])
|
||||
{
|
||||
static const int EXTRA_BITS = 2;
|
||||
|
||||
bool is_default_rounding_mode = (rounding_mode == 0);
|
||||
if(rounding_mode == -1)
|
||||
{
|
||||
volatile float test05 = 5.9604644775390625e-08f; // 0.5*2^-23
|
||||
volatile float test15 = 1.78813934326171875e-07f; // 1.5*2^-23
|
||||
const float res15 = 1.0000002384185791015625f; // 1+2^-22
|
||||
test05 += 1.0f;
|
||||
test15 += 1.0f;
|
||||
is_default_rounding_mode = (test05 == 1.0f && test15 == res15);
|
||||
}
|
||||
__m128 A = _mm_loadu_ps(a);
|
||||
__m128 B = _mm_loadu_ps(b);
|
||||
// Extract exponents.
|
||||
__m128 exp_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
|
||||
__m128 eA = _mm_and_ps(A, exp_mask);
|
||||
__m128 eB = _mm_and_ps(B, exp_mask);
|
||||
__m128i exps = _mm_srli_epi32(_mm_add_epi32(
|
||||
_mm_castps_si128(eA),
|
||||
_mm_castps_si128(eB)),23);
|
||||
// Find maximum exponent, stored as float32 in [1;2),
|
||||
// so we can use _mm_max_ps() with normal arguments.
|
||||
__m128 t = _mm_or_ps(_mm_castsi128_ps(exps), _mm_set1_ps(1.0f));
|
||||
t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(2, 3, 0, 1))));
|
||||
t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(1, 0, 3, 2))));
|
||||
t = _mm_max_ps(t, _mm_castsi128_ps(_mm_set1_epi32(0x3F80007F)));
|
||||
int32_t mexp = _mm_cvtsi128_si32(_mm_castps_si128(t)) & 511;
|
||||
// NOTE: mexp is doubly-biased, same for exps.
|
||||
int32_t max_exp = mexp - 127;
|
||||
// Fall back on anything weird.
|
||||
__m128 finiteA = _mm_sub_ps(A, A);
|
||||
__m128 finiteB = _mm_sub_ps(B, B);
|
||||
finiteA = _mm_cmpeq_ps(finiteA, finiteA);
|
||||
finiteB = _mm_cmpeq_ps(finiteB, finiteB);
|
||||
if(max_exp >= 255 || _mm_movemask_ps(_mm_and_ps(finiteA, finiteB)) != 15) return vfpu_dot_cpp(a, b);
|
||||
// Extract significands.
|
||||
__m128i mA = _mm_or_si128(_mm_and_si128(_mm_castps_si128(A),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));
|
||||
__m128i mB = _mm_or_si128(_mm_and_si128(_mm_castps_si128(B),_mm_set1_epi32(0x007FFFFF)),_mm_set1_epi32(0x00800000));
|
||||
// Multiply.
|
||||
// NOTE: vfpu_dot does multiplication as
|
||||
// ((x<<EXTRA_BITS)*(y<<EXTRA_BITS))>>(23+EXTRA_BITS),
|
||||
// here we do (x*y)>>(23-EXTRA_BITS-1),
|
||||
// which produces twice the result (neither expression
|
||||
// overflows in our case). We need that because our
|
||||
// variable-shift scheme (below) must shift by at least 1 bit.
|
||||
static const int s = 32-(23 - EXTRA_BITS - 1), s0 = s / 2,s1 = s - s0;
|
||||
// We compute ((x*y)>>shift) as
|
||||
// (((x*y)<<(32-shift))>>32), which we express as
|
||||
// (((x<<s0)*(y<<s1))>>32) (neither shift overflows).
|
||||
__m128i m = mulhi32x4(_mm_slli_epi32(mA, s0), _mm_slli_epi32(mB, s1));
|
||||
// Shift according to max_exp. Since SSE2 doesn't have
|
||||
// variable per-lane shifts, we multiply *again*,
|
||||
// specifically, x>>y turns into (x<<(1<<(32-y)))>>32.
|
||||
// We compute 1<<(32-y) using floating-point casts.
|
||||
// NOTE: the cast for 1<<31 produces the correct value,
|
||||
// since the _mm_cvttps_epi32 error code just happens
|
||||
// to be 0x80000000.
|
||||
// So (since we pre-multiplied m by 2), we need
|
||||
// (m>>1)>>(mexp-exps),
|
||||
// i.e. m>>(mexp+1-exps),
|
||||
// i.e. (m<<(32-(mexp+1-exps)))>>32,
|
||||
// i.e. (m<<(exps-(mexp-31)))>>32.
|
||||
__m128i amounts = _mm_sub_epi32(exps, _mm_set1_epi32(mexp - 31));
|
||||
// Clamp by 0. Both zero and negative amounts produce zero,
|
||||
// since they correspond to right-shifting by 32 or more bits.
|
||||
amounts = _mm_and_si128(amounts, _mm_cmpgt_epi32(amounts, _mm_set1_epi32(0)));
|
||||
// Set up multipliers.
|
||||
__m128i bits = _mm_add_epi32(_mm_set1_epi32(0x3F800000), _mm_slli_epi32(amounts, 23));
|
||||
__m128i muls = _mm_cvttps_epi32(_mm_castsi128_ps(bits));
|
||||
m = mulhi32x4(m, muls);
|
||||
// Extract signs.
|
||||
__m128i signs = _mm_cmpgt_epi32(
|
||||
_mm_set1_epi32(0),
|
||||
_mm_xor_si128(_mm_castps_si128(A), _mm_castps_si128(B)));
|
||||
// Apply signs to m.
|
||||
m = _mm_sub_epi32(_mm_xor_si128(m, signs), signs);
|
||||
// Horizontal sum.
|
||||
// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
|
||||
__m128i h64 = _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
__m128i s64 = _mm_add_epi32(h64, m);
|
||||
__m128i h32 = _mm_shufflelo_epi16(s64, _MM_SHUFFLE(1, 0, 3, 2));
|
||||
__m128i s32 = _mm_add_epi32(s64, h32);
|
||||
int32_t mant_sum = _mm_cvtsi128_si32(s32);
|
||||
|
||||
// The rest is scalar.
|
||||
uint32_t sign_sum = 0;
|
||||
if (mant_sum < 0) {
|
||||
sign_sum = 0x80000000;
|
||||
mant_sum = -mant_sum;
|
||||
}
|
||||
|
||||
// Truncate off the extra bits now. We want to zero them for rounding purposes.
|
||||
mant_sum >>= EXTRA_BITS;
|
||||
|
||||
if (mant_sum == 0 || max_exp <= 0) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
if(is_default_rounding_mode)
|
||||
{
|
||||
float2int r;
|
||||
r.f = (float)mant_sum;
|
||||
mant_sum = (r.i & 0x007FFFFF) | 0x00800000;
|
||||
max_exp += (r.i >> 23) - 0x96;
|
||||
}
|
||||
else
|
||||
{
|
||||
int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
|
||||
if (shift < 0) {
|
||||
// Round to even if we'd shift away a 0.5.
|
||||
const uint32_t round_bit = 1 << (-shift - 1);
|
||||
if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
|
||||
mant_sum += round_bit;
|
||||
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
|
||||
} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
|
||||
mant_sum += round_bit;
|
||||
shift = (int8_t)clz32_nonzero(mant_sum) - 8;
|
||||
}
|
||||
mant_sum >>= -shift;
|
||||
max_exp += -shift;
|
||||
} else {
|
||||
mant_sum <<= shift;
|
||||
max_exp -= shift;
|
||||
}
|
||||
_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);
|
||||
}
|
||||
|
||||
if (max_exp >= 255) {
|
||||
max_exp = 255;
|
||||
mant_sum = 0;
|
||||
} else if (max_exp <= 0) {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
float2int result;
|
||||
result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
|
||||
return result.f;
|
||||
}
|
||||
|
||||
#endif // defined(__SSE2__)
|
||||
|
||||
float vfpu_dot(const float a[4], const float b[4]) {
|
||||
#if defined(__SSE2__)
|
||||
return vfpu_dot_sse2(a, b);
|
||||
#endif
|
||||
return vfpu_dot_cpp(a, b);
|
||||
}
|
||||
|
||||
//==============================================================================
|
||||
// The code below attempts to exactly match behaviour of
|
||||
// PSP's vrnd instructions. See investigation starting around
|
||||
|
|
Loading…
Add table
Reference in a new issue