Don't use aligned loads in non-inlined funcs.

I want these values to stay in registers, but that isn't realistic for
function arguments, so the non-inlined functions now use unaligned loads.
Force-inline the other functions instead.  May help #5699.
This commit is contained in:
Unknown W. Brackets 2014-03-23 12:09:17 -07:00
parent a26e6ce4b6
commit 56b83af1f0
2 changed files with 13 additions and 10 deletions

View file

@ -24,10 +24,11 @@ float Vec2<float>::Length() const
{
#if defined(_M_SSE)
float ret;
__m128 sq = _mm_mul_ps(vec, vec);
__m128 xy = _mm_loadu_ps(&x);
__m128 sq = _mm_mul_ps(xy, xy);
const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
const __m128 res = _mm_add_ss(sq, r2);
_mm_store_ps(&ret, _mm_sqrt_ss(res));
_mm_store_ss(&ret, _mm_sqrt_ss(res));
return ret;
#else
return sqrtf(Length2());
@ -71,11 +72,12 @@ float Vec3<float>::Length() const
{
#if defined(_M_SSE)
float ret;
__m128 sq = _mm_mul_ps(vec, vec);
__m128 xyz = _mm_loadu_ps(&x);
__m128 sq = _mm_mul_ps(xyz, xyz);
const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3));
_mm_store_ps(&ret, _mm_sqrt_ss(res));
_mm_store_ss(&ret, _mm_sqrt_ss(res));
return ret;
#else
return sqrtf(Length2());
@ -185,10 +187,11 @@ float Vec4<float>::Length() const
{
#if defined(_M_SSE)
float ret;
__m128 sq = _mm_mul_ps(vec, vec);
__m128 xyzw = _mm_loadu_ps(&x);
__m128 sq = _mm_mul_ps(xyzw, xyzw);
const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq));
const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1)));
_mm_store_ps(&ret, _mm_sqrt_ss(res));
_mm_store_ss(&ret, _mm_sqrt_ss(res));
return ret;
#else
return sqrtf(Length2());

View file

@ -919,7 +919,7 @@ inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
}
template<>
inline unsigned int Vec3<float>::ToRGB() const
__forceinline unsigned int Vec3<float>::ToRGB() const
{
#if defined(_M_SSE)
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
@ -933,7 +933,7 @@ inline unsigned int Vec3<float>::ToRGB() const
}
template<>
inline unsigned int Vec3<int>::ToRGB() const
__forceinline unsigned int Vec3<int>::ToRGB() const
{
#if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(ivec, ivec);
@ -973,7 +973,7 @@ inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
}
template<>
inline unsigned int Vec4<float>::ToRGBA() const
__forceinline unsigned int Vec4<float>::ToRGBA() const
{
#if defined(_M_SSE)
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
@ -988,7 +988,7 @@ inline unsigned int Vec4<float>::ToRGBA() const
}
template<>
inline unsigned int Vec4<int>::ToRGBA() const
__forceinline unsigned int Vec4<int>::ToRGBA() const
{
#if defined(_M_SSE)
__m128i c16 = _mm_packs_epi32(ivec, ivec);