From 56b83af1f06f75adb52efe75f19dc14f21b6a186 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 23 Mar 2014 12:09:17 -0700 Subject: [PATCH] Don't use aligned loads in non-inlined funcs. I'm wanting things to stay in registers, but that's not realistic for arguments. Force inline the others. May help #5699. --- GPU/Math3D.cpp | 15 +++++++++------ GPU/Math3D.h | 8 ++++---- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp index a6dd8780fc..f43f41ade6 100644 --- a/GPU/Math3D.cpp +++ b/GPU/Math3D.cpp @@ -24,10 +24,11 @@ float Vec2::Length() const { #if defined(_M_SSE) float ret; - __m128 sq = _mm_mul_ps(vec, vec); + __m128 xy = _mm_loadu_ps(&x); + __m128 sq = _mm_mul_ps(xy, xy); const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); const __m128 res = _mm_add_ss(sq, r2); - _mm_store_ps(&ret, _mm_sqrt_ss(res)); + _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; #else return sqrtf(Length2()); @@ -71,11 +72,12 @@ float Vec3::Length() const { #if defined(_M_SSE) float ret; - __m128 sq = _mm_mul_ps(vec, vec); + __m128 xyz = _mm_loadu_ps(&x); + __m128 sq = _mm_mul_ps(xyz, xyz); const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1)); const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2)); const __m128 res = _mm_add_ss(sq, _mm_add_ss(r2, r3)); - _mm_store_ps(&ret, _mm_sqrt_ss(res)); + _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; #else return sqrtf(Length2()); @@ -185,10 +187,11 @@ float Vec4::Length() const { #if defined(_M_SSE) float ret; - __m128 sq = _mm_mul_ps(vec, vec); + __m128 xyzw = _mm_loadu_ps(&x); + __m128 sq = _mm_mul_ps(xyzw, xyzw); const __m128 r2 = _mm_add_ps(sq, _mm_movehl_ps(sq, sq)); const __m128 res = _mm_add_ss(r2, _mm_shuffle_ps(r2, r2, _MM_SHUFFLE(0, 0, 0, 1))); - _mm_store_ps(&ret, _mm_sqrt_ss(res)); + _mm_store_ss(&ret, _mm_sqrt_ss(res)); return ret; #else return sqrtf(Length2()); diff --git a/GPU/Math3D.h b/GPU/Math3D.h index deeaae35ba..705794e5b7 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -919,7 +919,7 @@ inline Vec3 Vec3::FromRGB(unsigned int rgb) } template<> -inline unsigned int Vec3::ToRGB() const +__forceinline unsigned int Vec3::ToRGB() const { #if defined(_M_SSE) __m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f))); @@ -933,7 +933,7 @@ inline unsigned int Vec3::ToRGB() const } template<> -inline unsigned int Vec3::ToRGB() const +__forceinline unsigned int Vec3::ToRGB() const { #if defined(_M_SSE) __m128i c16 = _mm_packs_epi32(ivec, ivec); @@ -973,7 +973,7 @@ inline Vec4 Vec4::FromRGBA(unsigned int rgba) } template<> -inline unsigned int Vec4::ToRGBA() const +__forceinline unsigned int Vec4::ToRGBA() const { #if defined(_M_SSE) __m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f))); @@ -988,7 +988,7 @@ inline unsigned int Vec4::ToRGBA() const } template<> -inline unsigned int Vec4::ToRGBA() const +__forceinline unsigned int Vec4::ToRGBA() const { #if defined(_M_SSE) __m128i c16 = _mm_packs_epi32(ivec, ivec);