SSE optimize Float4ToUint8x4, some uses

This commit is contained in:
Henrik Rydgård 2022-12-01 16:21:27 +01:00
parent d02f46cb27
commit e6f0f84a45
6 changed files with 36 additions and 13 deletions

View file

@ -1,3 +1,4 @@
#include "Common/Data/Convert/SmallDataConvert.h"
// Four-lane float constants for SIMD color conversion. alignas(16) allows them to be
// read with aligned SSE loads such as _mm_load_ps, which require 16-byte alignment.
// 1/255 in every lane (presumably for scaling 8-bit channels to floats; usage not shown here).
alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, };
// 255 in every lane; multiplied with float channels before they are packed to bytes.
alignas(16) const float exactly_255_x4[4] = { 255.0f, 255.0f, 255.0f, 255.0f, };

View file

@ -18,6 +18,7 @@
#endif
// Four-lane float constants used by the conversion helpers in this header.
// NOTE(review): the definitions must be 16-byte aligned, since exactly_255_x4 is read with _mm_load_ps.
extern const float one_over_255_x4[4];
extern const float exactly_255_x4[4];
// Utilities useful for filling in std140-layout uniform buffers, and similar.
// NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html
@ -46,6 +47,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
// Uses SSE when _M_SSE is defined, with a scalar fallback otherwise.
inline uint32_t Float4ToUint8x4(const float f[4]) {
#ifdef _M_SSE
__m128i zero = _mm_setzero_si128();
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
return _mm_cvtsi128_si32(ivalue);
#else
int i4[4];
for (int i = 0; i < 4; i++) {
if (f[i] > 1.0f) {
@ -57,6 +64,23 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
}
}
return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}
// Converts four float channels to a packed 8-bit-per-channel value, with f[0] in
// the low byte and f[3] in the high byte. Callers are expected to pass values in
// [0.0, 1.0]; what happens to out-of-range inputs differs per path (see below),
// but a bad channel never disturbs the bytes of the other channels.
inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
#ifdef _M_SSE
	// Does actually clamp, no way to avoid it with the pack ops!
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#else
	uint32_t packed = 0;
	for (int i = 0; i < 4; i++) {
		// Keep only the low 8 bits: without the mask, a value above 255 or a
		// sign-extended negative would spill into the neighboring channels.
		const uint32_t channel = static_cast<uint32_t>(static_cast<int>(f[i] * 255.0f)) & 0xFFu;
		packed |= channel << (8 * i);
	}
	return packed;
#endif
}
inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {

View file

@ -524,7 +524,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
}
if (vertType & GE_VTYPE_COL_MASK) {
reader.ReadColor0_8888(sv.color);
sv.color_32 = reader.ReadColor0_8888();
} else {
memcpy(sv.color, defaultColor, 4);
}
@ -573,7 +573,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
sv.uv[1] = 0.0f;
}
if (vertType & GE_VTYPE_COL_MASK) {
reader.ReadColor0_8888(sv.color);
sv.color_32 = reader.ReadColor0_8888();
} else {
memcpy(sv.color, defaultColor, 4);
}

View file

@ -219,10 +219,10 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
if (hasColor) {
if (provokeIndOffset != 0 && index + provokeIndOffset < maxIndex) {
reader.Goto(index + provokeIndOffset);
reader.ReadColor0_8888(vert.color0);
vert.color0_32 = reader.ReadColor0_8888();
reader.Goto(index);
} else {
reader.ReadColor0_8888(vert.color0);
vert.color0_32 = reader.ReadColor0_8888();
}
} else {
vert.color0_32 = materialAmbientRGBA;

View file

@ -192,28 +192,27 @@ public:
}
}
void ReadColor0_8888(u8 color[4]) const {
u32 ReadColor0_8888() const {
switch (decFmt_.c0fmt) {
case DEC_U8_4:
{
const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
memcpy(color, b, 4);
u32 value;
memcpy(&value, b, 4);
return value;
}
break;
case DEC_FLOAT_4:
{
const float *f = (const float *)(data_ + decFmt_.c0off);
for (int i = 0; i < 4; i++)
color[i] = f[i] * 255.0f;
return Float4ToUint8x4_NoClamp(f);
}
break;
default:
memset(color, 0, sizeof(u8) * 4);
break;
return 0;
}
}
void ReadColor1(float color[3]) const {
switch (decFmt_.c1fmt) {
case DEC_U8_4:
@ -281,7 +280,6 @@ public:
case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
default:
ERROR_LOG_REPORT_ONCE(fmtw1, G3D, "Reader: Unsupported W1 Format %d", decFmt_.w1fmt);
memset(weights + 4, 0, sizeof(float) * 4);
break;
}

View file

@ -344,7 +344,7 @@ ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformS
normal = -normal;
if (vreader.hasColor0()) {
vreader.ReadColor0_8888((u8 *)&vertex.v.color0);
vertex.v.color0 = vreader.ReadColor0_8888();
} else {
vertex.v.color0 = gstate.getMaterialAmbientRGBA();
}