SSE optimize Float4ToUint8x4, some uses

2025-04-02 11:01:50 -04:00 · 2022-12-01 16:21:27 +01:00 · 2022-12-01 16:21:27 +01:00 · e6f0f84a45
commit e6f0f84a45
parent d02f46cb27
6 changed files with 36 additions and 13 deletions
--- a/Common/Data/Convert/SmallDataConvert.cpp
+++ b/Common/Data/Convert/SmallDataConvert.cpp
@ -1,3 +1,4 @@
 #include "Common/Data/Convert/SmallDataConvert.h"

 alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, };
+alignas(16) const float exactly_255_x4[4] = { 255.0f, 255.0f, 255.0f, 255.0f, };
--- a/Common/Data/Convert/SmallDataConvert.h
+++ b/Common/Data/Convert/SmallDataConvert.h
@ -18,6 +18,7 @@
 #endif

 extern const float one_over_255_x4[4];
+extern const float exactly_255_x4[4];

 // Utilities useful for filling in std140-layout uniform buffers, and similar.
 // NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html
@ -46,6 +47,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) {

-// Could be SSE optimized.
+// The SSE path is used when _M_SSE is defined; otherwise falls back to the scalar loop.
 inline uint32_t Float4ToUint8x4(const float f[4]) {
+#ifdef _M_SSE
+	__m128i zero = _mm_setzero_si128();
+	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
+	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
+	return _mm_cvtsi128_si32(ivalue);
+#else
 	int i4[4];
 	for (int i = 0; i < 4; i++) {
 		if (f[i] > 1.0f) {
@ -57,6 +64,23 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
 		}
 	}
 	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
+#endif
+}
+
+// Scales four floats by 255 and packs them into one 32-bit value, 8 bits per channel,
+// with f[0] in the lowest byte and f[3] in the highest.
+//
+// Out-of-range inputs are handled differently by the two paths, but neither lets one
+// channel spill into another:
+//  * SSE:    each channel is saturated to [0, 255] by the pack instructions.
+//  * Scalar: only the low 8 bits of each converted channel are kept.
+// The SSE path also rounds (per the current MXCSR rounding mode) while the scalar path
+// truncates toward zero, so results may differ by one between the two.
+inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
+#ifdef _M_SSE
+	// Does actually clamp, no way to avoid it with the pack ops!
+	__m128i zero = _mm_setzero_si128();
+	// f may be unaligned (loadu); exactly_255_x4 is defined with alignas(16), so an aligned load is fine.
+	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
+	// 32-bit -> 16-bit (signed saturate) -> 8-bit (unsigned saturate); the four bytes end up in the low 32 bits.
+	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
+	return _mm_cvtsi128_si32(ivalue);
+#else
+	uint32_t i4[4];
+	for (int i = 0; i < 4; i++) {
+		// Keep only the low byte. Without the mask, a negative or >1.0 input would set bits
+		// above bit 7 and corrupt the neighbouring channels when the values are OR'd together.
+		i4[i] = ((uint32_t)(int)(f[i] * 255.0f)) & 0xFF;
+	}
+	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
+#endif
 }

 inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -524,7 +524,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
 			}

 			if (vertType & GE_VTYPE_COL_MASK) {
-				reader.ReadColor0_8888(sv.color);
+				sv.color_32 = reader.ReadColor0_8888();
 			} else {
 				memcpy(sv.color, defaultColor, 4);
 			}
@ -573,7 +573,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
 				sv.uv[1] = 0.0f;
 			}
 			if (vertType & GE_VTYPE_COL_MASK) {
-				reader.ReadColor0_8888(sv.color);
+				sv.color_32 = reader.ReadColor0_8888();
 			} else {
 				memcpy(sv.color, defaultColor, 4);
 			}
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@ -219,10 +219,10 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
 			if (hasColor) {
 				if (provokeIndOffset != 0 && index + provokeIndOffset < maxIndex) {
 					reader.Goto(index + provokeIndOffset);
-					reader.ReadColor0_8888(vert.color0);
+					vert.color0_32 = reader.ReadColor0_8888();
 					reader.Goto(index);
 				} else {
-					reader.ReadColor0_8888(vert.color0);
+					vert.color0_32 = reader.ReadColor0_8888();
 				}
 			} else {
 				vert.color0_32 = materialAmbientRGBA;
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@ -192,28 +192,27 @@ public:
 		}
 	}

-	void ReadColor0_8888(u8 color[4]) const {
+	u32 ReadColor0_8888() const {
 		switch (decFmt_.c0fmt) {
 		case DEC_U8_4:
 			{
 				const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
-				memcpy(color, b, 4);
+				u32 value;
+				memcpy(&value, b, 4);
+				return value;
 			}
 			break;
 		case DEC_FLOAT_4:
 			{
 				const float *f = (const float *)(data_ + decFmt_.c0off);
-				for (int i = 0; i < 4; i++)
-					color[i] = f[i] * 255.0f;
+				return Float4ToUint8x4_NoClamp(f);
 			}
 			break;
 		default:
-			memset(color, 0, sizeof(u8) * 4);
-			break;
+			return 0;
 		}
 	}

-
 	void ReadColor1(float color[3]) const {
 		switch (decFmt_.c1fmt) {
 		case DEC_U8_4:
@ -281,7 +280,6 @@ public:
 		case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
 		case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i]  * (1.f / 32768.f); break;
 		default:
-			ERROR_LOG_REPORT_ONCE(fmtw1, G3D, "Reader: Unsupported W1 Format %d", decFmt_.w1fmt);
 			memset(weights + 4, 0, sizeof(float) * 4);
 			break;
 		}
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@ -344,7 +344,7 @@ ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformS
 		normal = -normal;

 	if (vreader.hasColor0()) {
-		vreader.ReadColor0_8888((u8 *)&vertex.v.color0);
+		vertex.v.color0 = vreader.ReadColor0_8888();
 	} else {
 		vertex.v.color0 = gstate.getMaterialAmbientRGBA();
 	}