diff --git a/Common/Data/Convert/SmallDataConvert.h b/Common/Data/Convert/SmallDataConvert.h
index 95558a17df..740193e739 100644
--- a/Common/Data/Convert/SmallDataConvert.h
+++ b/Common/Data/Convert/SmallDataConvert.h
@@ -23,7 +23,6 @@ extern const float one_over_255_x4[4];
 // NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html
 
 // LSBs in f[0], etc.
-// Could be SSE optimized.
 inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
 #ifdef _M_SSE
 	__m128i zero = _mm_setzero_si128();
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 7c2dcb2cfc..f23db789cc 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -23,6 +23,7 @@
 
 #include "Common/CommonTypes.h"
 #include "Common/Data/Collections/Hashmaps.h"
+#include "Common/Data/Convert/SmallDataConvert.h"
 #include "Common/Log.h"
 #include "Core/Reporting.h"
 #include "GPU/ge_constants.h"
@@ -181,11 +182,7 @@ public:
 	void ReadColor0(float color[4]) const {
 		switch (decFmt_.c0fmt) {
 		case DEC_U8_4:
-			{
-				const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
-				for (int i = 0; i < 4; i++)
-					color[i] = b[i] * (1.f / 255.f);
-			}
+			Uint8x4ToFloat4(color, *(const u32 *)(data_ + decFmt_.c0off));
 			break;
 		case DEC_FLOAT_4:
 			memcpy(color, data_ + decFmt_.c0off, 16);