diff --git a/GPU/Math3D.h b/GPU/Math3D.h index f5502c304c..50f4f15ee1 100644 --- a/GPU/Math3D.h +++ b/GPU/Math3D.h @@ -18,6 +18,11 @@ #pragma once #include +#include "Common/Common.h" + +#if defined(_M_SSE) +#include +#endif namespace Math3D { @@ -36,9 +41,16 @@ template class Vec2 { public: - struct + union { - T x,y; + struct + { + T x,y; + }; +#if defined(_M_SSE) + __m128i ivec; + __m128 vec; +#endif }; T* AsArray() { return &x; } @@ -47,6 +59,10 @@ public: Vec2() {} Vec2(const T a[2]) : x(a[0]), y(a[1]) {} Vec2(const T& _x, const T& _y) : x(_x), y(_y) {} +#if defined(_M_SSE) + Vec2(const __m128 &_vec) : vec(_vec) {} + Vec2(const __m128i &_ivec) : ivec(_ivec) {} +#endif template Vec2 Cast() const @@ -164,9 +180,16 @@ template class Vec3 { public: - struct + union { - T x,y,z; + struct + { + T x,y,z; + }; +#if defined(_M_SSE) + __m128i ivec; + __m128 vec; +#endif }; T* AsArray() { return &x; } @@ -176,6 +199,10 @@ public: Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {} Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {} Vec3(const Vec2& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {} +#if defined(_M_SSE) + Vec3(const __m128 &_vec) : vec(_vec) {} + Vec3(const __m128i &_ivec) : ivec(_ivec) {} +#endif template Vec3 Cast() const @@ -324,9 +351,16 @@ template class Vec4 { public: - struct + union { - T x,y,z,w; + struct + { + T x,y,z,w; + }; +#if defined(_M_SSE) + __m128i ivec; + __m128 vec; +#endif }; T* AsArray() { return &x; } @@ -337,6 +371,10 @@ public: Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {} Vec4(const Vec2& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {} Vec4(const Vec3& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {} +#if defined(_M_SSE) + Vec4(const __m128 &_vec) : vec(_vec) {} + Vec4(const __m128i &_ivec) : ivec(_ivec) {} +#endif template Vec4 Cast() const diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index e517d88aba..7357692372 
100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -30,6 +30,10 @@ #include +#if defined(_M_SSE) +#include +#endif + extern FormatBuffer fb; extern FormatBuffer depthbuf; @@ -978,6 +982,46 @@ inline void ApplyTexturing(Vec3 &prim_color_rgb, int &prim_color_a, float s prim_color_a = out.a(); } +#if defined(_M_SSE) +static inline __m128 Interpolate(const __m128 &c0, const __m128 &c1, const __m128 &c2, int w0, int w1, int w2, float wsum) { /* Per-lane (c0*w0 + c1*w1 + c2*w2) * wsum; each integer weight is converted to float and broadcast to all four lanes. */ + __m128 v = _mm_mul_ps(c0, _mm_cvtepi32_ps(_mm_set1_epi32(w0))); + v = _mm_add_ps(v, _mm_mul_ps(c1, _mm_cvtepi32_ps(_mm_set1_epi32(w1)))); + v = _mm_add_ps(v, _mm_mul_ps(c2, _mm_cvtepi32_ps(_mm_set1_epi32(w2)))); + return _mm_mul_ps(v, _mm_set_ps1(wsum)); +} + +static inline __m128i Interpolate(const __m128i &c0, const __m128i &c1, const __m128i &c2, int w0, int w1, int w2, float wsum) { /* Integer lanes: convert to float, blend with the __m128 overload, convert back. */ + return _mm_cvttps_epi32(Interpolate(_mm_cvtepi32_ps(c0), _mm_cvtepi32_ps(c1), _mm_cvtepi32_ps(c2), w0, w1, w2, wsum)); /* cvtt truncates toward zero like the (int) cast of the scalar code this replaces; _mm_cvtps_epi32 would round to nearest instead. */ +} +#endif + +// NOTE: When not casting color0 and color1 to float vectors, this code suffers from severe overflow issues. +// Not sure if that should be regarded as a bug or if casting to float is a valid fix. 
+ +static inline Vec4 Interpolate(const Vec4 &c0, const Vec4 &c1, const Vec4 &c2, int w0, int w1, int w2, float wsum) { /* Weighted blend (c0*w0 + c1*w1 + c2*w2) * wsum of three 4-component vectors; the SSE path operates on the ivec lanes, the fallback goes through Cast(). */ +#if defined(_M_SSE) + return Vec4(Interpolate(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum)); +#else + return ((c0.Cast() * w0 + c1.Cast() * w1 + c2.Cast() * w2) * wsum).Cast(); +#endif +} + +static inline Vec3 Interpolate(const Vec3 &c0, const Vec3 &c1, const Vec3 &c2, int w0, int w1, int w2, float wsum) { /* Same blend for 3-component vectors; the SSE path passes the whole 128-bit ivec member, so the lane beyond x,y,z is computed as well. */ +#if defined(_M_SSE) + return Vec3(Interpolate(c0.ivec, c1.ivec, c2.ivec, w0, w1, w2, wsum)); +#else + return ((c0.Cast() * w0 + c1.Cast() * w1 + c2.Cast() * w2) * wsum).Cast(); +#endif +} + +static inline Vec2 Interpolate(const Vec2 &c0, const Vec2 &c1, const Vec2 &c2, int w0, int w1, int w2, float wsum) { /* Same blend for 2-component vectors using the __m128 vec member; neither path applies Cast() to the operands or the result. */ +#if defined(_M_SSE) + return Vec2(Interpolate(c0.vec, c1.vec, c2.vec, w0, w1, w2, wsum)); +#else + return (c0 * w0 + c1 * w1 + c2 * w2) * wsum; +#endif +} + template void DrawTriangleSlice( const VertexData& v0, const VertexData& v1, const VertexData& v2, @@ -1067,16 +1111,11 @@ void DrawTriangleSlice( int prim_color_a = 0; Vec3 sec_color(0, 0, 0); if (gstate.getShadeMode() == GE_SHADE_GOURAUD && !clearMode) { - // NOTE: When not casting color0 and color1 to float vectors, this code suffers from severe overflow issues. - // Not sure if that should be regarded as a bug or if casting to float is a valid fix. // TODO: Is that the correct way to interpolate? 
- prim_color_rgb = ((v0.color0.rgb().Cast() * w0 + - v1.color0.rgb().Cast() * w1 + - v2.color0.rgb().Cast() * w2) * wsum).Cast(); - prim_color_a = (int)(((float)v0.color0.a() * w0 + (float)v1.color0.a() * w1 + (float)v2.color0.a() * w2) * wsum); - sec_color = ((v0.color1.Cast() * w0 + - v1.color1.Cast() * w1 + - v2.color1.Cast() * w2) * wsum).Cast(); + const Vec4 prim_color = Interpolate(v0.color0, v1.color0, v2.color0, w0, w1, w2, wsum); + prim_color_rgb = prim_color.rgb(); + prim_color_a = prim_color.a(); + sec_color = Interpolate(v0.color1, v1.color1, v2.color1, w0, w1, w2, wsum); } else { prim_color_rgb = v2.color0.rgb(); prim_color_a = v2.color0.a();