From e6f0f84a45168323a4c878e812aa06b74c9e51c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 1 Dec 2022 16:21:27 +0100 Subject: [PATCH] SSE optimize Float4ToUint8x4, some uses --- Common/Data/Convert/SmallDataConvert.cpp | 1 + Common/Data/Convert/SmallDataConvert.h | 24 ++++++++++++++++++++++++ GPU/Common/DrawEngineCommon.cpp | 4 ++-- GPU/Common/SoftwareTransformCommon.cpp | 4 ++-- GPU/Common/VertexDecoderCommon.h | 14 ++++++-------- GPU/Software/TransformUnit.cpp | 2 +- 6 files changed, 36 insertions(+), 13 deletions(-) diff --git a/Common/Data/Convert/SmallDataConvert.cpp b/Common/Data/Convert/SmallDataConvert.cpp index 4f0d65ea11..ee2673aba1 100644 --- a/Common/Data/Convert/SmallDataConvert.cpp +++ b/Common/Data/Convert/SmallDataConvert.cpp @@ -1,3 +1,4 @@ #include "Common/Data/Convert/SmallDataConvert.h" alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, }; +alignas(16) const float exactly_255_x4[4] = { 255.0f, 255.0f, 255.0f, 255.0f, }; diff --git a/Common/Data/Convert/SmallDataConvert.h b/Common/Data/Convert/SmallDataConvert.h index 740193e739..1bc6540907 100644 --- a/Common/Data/Convert/SmallDataConvert.h +++ b/Common/Data/Convert/SmallDataConvert.h @@ -18,6 +18,7 @@ #endif extern const float one_over_255_x4[4]; +extern const float exactly_255_x4[4]; // Utilities useful for filling in std140-layout uniform buffers, and similar. // NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html @@ -46,6 +47,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) { // Could be SSE optimized. inline uint32_t Float4ToUint8x4(const float f[4]) { +#ifdef _M_SSE + __m128i zero = _mm_setzero_si128(); + __m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4)); + __m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero); + return _mm_cvtsi128_si32(ivalue); +#else int i4[4]; for (int i = 0; i < 4; i++) { if (f[i] > 1.0f) { @@ -57,6 +64,23 @@ inline uint32_t Float4ToUint8x4(const float f[4]) { } } return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24); +#endif +} + +inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) { +#ifdef _M_SSE + // Does actually clamp, no way to avoid it with the pack ops! + __m128i zero = _mm_setzero_si128(); + __m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4)); + __m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero); + return _mm_cvtsi128_si32(ivalue); +#else + u32 i4[4]; + for (int i = 0; i < 4; i++) { + i4[i] = (int)(f[i] * 255.0f); + } + return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24); +#endif } inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) { diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 664b90823e..f5579de741 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -524,7 +524,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, } if (vertType & GE_VTYPE_COL_MASK) { - reader.ReadColor0_8888(sv.color); + sv.color_32 = reader.ReadColor0_8888(); } else { memcpy(sv.color, defaultColor, 4); } @@ -573,7 +573,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, sv.uv[1] = 0.0f; } if (vertType & GE_VTYPE_COL_MASK) { - reader.ReadColor0_8888(sv.color); + sv.color_32 = reader.ReadColor0_8888(); } else { memcpy(sv.color, defaultColor, 4); } diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp index 88dd7c9d3c..b85dc342d7 100644 --- a/GPU/Common/SoftwareTransformCommon.cpp +++ b/GPU/Common/SoftwareTransformCommon.cpp @@ -219,10 +219,10 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt if (hasColor) { if (provokeIndOffset != 0 && index + provokeIndOffset < maxIndex) { reader.Goto(index + provokeIndOffset); - reader.ReadColor0_8888(vert.color0); + vert.color0_32 = reader.ReadColor0_8888(); reader.Goto(index); } else { - reader.ReadColor0_8888(vert.color0); + vert.color0_32 = reader.ReadColor0_8888(); } } else { vert.color0_32 = materialAmbientRGBA; diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 355343a879..6ebad221d3 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -192,28 +192,27 @@ public: } } - void ReadColor0_8888(u8 color[4]) const { + u32 ReadColor0_8888() const { switch (decFmt_.c0fmt) { case DEC_U8_4: { const u8 *b = (const u8 *)(data_ + decFmt_.c0off); - memcpy(color, b, 4); + u32 value; + memcpy(&value, b, 4); + return value; } break; case DEC_FLOAT_4: { const float *f = (const float *)(data_ + decFmt_.c0off); - for (int i = 0; i < 4; i++) - color[i] = f[i] * 255.0f; + return Float4ToUint8x4_NoClamp(f); } break; default: - memset(color, 0, sizeof(u8) * 4); - break; + return 0; } } - void ReadColor1(float color[3]) const { switch (decFmt_.c1fmt) { case DEC_U8_4: @@ -281,7 +280,6 @@ public: case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break; case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i] * (1.f / 32768.f); break; default: - ERROR_LOG_REPORT_ONCE(fmtw1, G3D, "Reader: Unsupported W1 Format %d", decFmt_.w1fmt); memset(weights + 4, 0, sizeof(float) * 4); break; } diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp index ac8ea921a8..0119250316 100644 --- a/GPU/Software/TransformUnit.cpp +++ b/GPU/Software/TransformUnit.cpp @@ -344,7 +344,7 @@ ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformS normal = -normal; if (vreader.hasColor0()) { - vreader.ReadColor0_8888((u8 *)&vertex.v.color0); + vertex.v.color0 = vreader.ReadColor0_8888(); } else { vertex.v.color0 = gstate.getMaterialAmbientRGBA(); }