From e6f0f84a45168323a4c878e812aa06b74c9e51c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 1 Dec 2022 16:21:27 +0100
Subject: [PATCH] SSE optimize Float4ToUint8x4, some uses

---
 Common/Data/Convert/SmallDataConvert.cpp |  1 +
 Common/Data/Convert/SmallDataConvert.h   | 24 ++++++++++++++++++++++++
 GPU/Common/DrawEngineCommon.cpp          |  4 ++--
 GPU/Common/SoftwareTransformCommon.cpp   |  4 ++--
 GPU/Common/VertexDecoderCommon.h         | 14 ++++++--------
 GPU/Software/TransformUnit.cpp           |  2 +-
 6 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/Common/Data/Convert/SmallDataConvert.cpp b/Common/Data/Convert/SmallDataConvert.cpp
index 4f0d65ea11..ee2673aba1 100644
--- a/Common/Data/Convert/SmallDataConvert.cpp
+++ b/Common/Data/Convert/SmallDataConvert.cpp
@@ -1,3 +1,4 @@
 #include "Common/Data/Convert/SmallDataConvert.h"
 
 alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, };
+alignas(16) const float exactly_255_x4[4] = { 255.0f, 255.0f, 255.0f, 255.0f, };  // 255 in all four lanes: multiplier used when converting floats to 8-bit channels.
diff --git a/Common/Data/Convert/SmallDataConvert.h b/Common/Data/Convert/SmallDataConvert.h
index 740193e739..1bc6540907 100644
--- a/Common/Data/Convert/SmallDataConvert.h
+++ b/Common/Data/Convert/SmallDataConvert.h
@@ -18,6 +18,7 @@
 #endif
 
 extern const float one_over_255_x4[4];
+extern const float exactly_255_x4[4];  // Defined with alignas(16) in SmallDataConvert.cpp, which is what permits the aligned _mm_load_ps on it.
 
 // Utilities useful for filling in std140-layout uniform buffers, and similar.
 // NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html
@@ -46,6 +47,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
 
 // Could be SSE optimized.
 inline uint32_t Float4ToUint8x4(const float f[4]) {
+#ifdef _M_SSE
+	__m128i zero = _mm_setzero_si128();
+	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
+	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
+	return _mm_cvtsi128_si32(ivalue);
+#else
 	int i4[4];
 	for (int i = 0; i < 4; i++) {
 		if (f[i] > 1.0f) {
@@ -57,6 +64,23 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
 		}
 	}
 	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
+#endif
+}
+
+// Scales f[0..3] by 255 and packs the results into one 32-bit word, f[0] in the low byte. Meant for inputs in 0..1.
+inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
+#ifdef _M_SSE
+	// Does actually clamp, no way to avoid it with the pack ops (they saturate). cvtps also rounds rather than truncating.
+	__m128i zero = _mm_setzero_si128();
+	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
+	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
+	return _mm_cvtsi128_si32(ivalue);
+#else
+	uint32_t packed = 0;
+	for (int i = 0; i < 4; i++)
+		packed |= ((uint32_t)(int)(f[i] * 255.0f) & 0xFF) << (i * 8);  // Mask to 8 bits so an out-of-range input cannot spill into neighbouring channels.
+	return packed;
+#endif
 }
 
 inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 664b90823e..f5579de741 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -524,7 +524,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
 			}
 
 			if (vertType & GE_VTYPE_COL_MASK) {
-				reader.ReadColor0_8888(sv.color);
+				sv.color_32 = reader.ReadColor0_8888();
 			} else {
 				memcpy(sv.color, defaultColor, 4);
 			}
@@ -573,7 +573,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
 				sv.uv[1] = 0.0f;
 			}
 			if (vertType & GE_VTYPE_COL_MASK) {
-				reader.ReadColor0_8888(sv.color);
+				sv.color_32 = reader.ReadColor0_8888();
 			} else {
 				memcpy(sv.color, defaultColor, 4);
 			}
diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
index 88dd7c9d3c..b85dc342d7 100644
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@@ -219,10 +219,10 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
 			if (hasColor) {
 				if (provokeIndOffset != 0 && index + provokeIndOffset < maxIndex) {
 					reader.Goto(index + provokeIndOffset);
-					reader.ReadColor0_8888(vert.color0);
+					vert.color0_32 = reader.ReadColor0_8888();
 					reader.Goto(index);
 				} else {
-					reader.ReadColor0_8888(vert.color0);
+					vert.color0_32 = reader.ReadColor0_8888();
 				}
 			} else {
 				vert.color0_32 = materialAmbientRGBA;
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 355343a879..6ebad221d3 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -192,28 +192,27 @@ public:
 		}
 	}
 
-	void ReadColor0_8888(u8 color[4]) const {
+	u32 ReadColor0_8888() const {  // Returns vertex color 0 as four packed 8-bit channels; 0 for any format not handled below.
 		switch (decFmt_.c0fmt) {
 		case DEC_U8_4:
 			{
 				const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
-				memcpy(color, b, 4);
+				u32 value;
+				memcpy(&value, b, 4);  // Byte-wise copy: keeps the source byte order and assumes nothing about the alignment of b.
+				return value;
 			}
 			break;
 		case DEC_FLOAT_4:
 			{
 				const float *f = (const float *)(data_ + decFmt_.c0off);
-				for (int i = 0; i < 4; i++)
-					color[i] = f[i] * 255.0f;
+				return Float4ToUint8x4_NoClamp(f);  // Each float is scaled by 255 and packed, f[0] in the low byte.
 			}
 			break;
 		default:
-			memset(color, 0, sizeof(u8) * 4);
-			break;
+			return 0;  // Any other c0fmt: no conversion is attempted.
 		}
 	}
 
-
 	void ReadColor1(float color[3]) const {
 		switch (decFmt_.c1fmt) {
 		case DEC_U8_4:
@@ -281,7 +280,6 @@ public:
 		case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
 		case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i]  * (1.f / 32768.f); break;
 		default:
-			ERROR_LOG_REPORT_ONCE(fmtw1, G3D, "Reader: Unsupported W1 Format %d", decFmt_.w1fmt);
 			memset(weights + 4, 0, sizeof(float) * 4);
 			break;
 		}
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index ac8ea921a8..0119250316 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -344,7 +344,7 @@ ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformS
 		normal = -normal;
 
 	if (vreader.hasColor0()) {
-		vreader.ReadColor0_8888((u8 *)&vertex.v.color0);
+		vertex.v.color0 = vreader.ReadColor0_8888();
 	} else {
 		vertex.v.color0 = gstate.getMaterialAmbientRGBA();
 	}