mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
SSE optimize Float4ToUint8x4, some uses
This commit is contained in:
parent
d02f46cb27
commit
e6f0f84a45
6 changed files with 36 additions and 13 deletions
|
@ -1,3 +1,4 @@
|
|||
#include "Common/Data/Convert/SmallDataConvert.h"
|
||||
|
||||
// 1/255 replicated in all four lanes; 16-byte aligned.
alignas(16) const float one_over_255_x4[4] = { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, };
|
||||
// 255.0 replicated in all four lanes; 16-byte aligned, as required by aligned SSE loads (_mm_load_ps).
alignas(16) const float exactly_255_x4[4] = { 255.0f, 255.0f, 255.0f, 255.0f, };
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#endif
|
||||
|
||||
extern const float one_over_255_x4[4];
|
||||
extern const float exactly_255_x4[4];
|
||||
|
||||
// Utilities useful for filling in std140-layout uniform buffers, and similar.
|
||||
// NEON intrinsics: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491f/BABDCGGF.html
|
||||
|
@ -46,6 +47,12 @@ inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
|
|||
|
||||
// Uses SSE when _M_SSE is defined; otherwise falls back to the scalar loop.
|
||||
inline uint32_t Float4ToUint8x4(const float f[4]) {
|
||||
#ifdef _M_SSE
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
|
||||
__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
|
||||
return _mm_cvtsi128_si32(ivalue);
|
||||
#else
|
||||
int i4[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
if (f[i] > 1.0f) {
|
||||
|
@ -57,6 +64,23 @@ inline uint32_t Float4ToUint8x4(const float f[4]) {
|
|||
}
|
||||
}
|
||||
return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Packs four floats (nominally in [0, 1]) into one 32-bit value, scaling each
// by 255. f[0] lands in the low byte and f[3] in the high byte.
//
// Out-of-range inputs are handled differently per build:
//  - SSE path: the pack instructions saturate, so each channel is clamped to 0..255.
//  - Scalar path: each channel is truncated toward zero and only its low 8 bits
//    are kept, so a bad channel never corrupts the others.
inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
#ifdef _M_SSE
	// Does actually clamp, no way to avoid it with the pack ops!
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#else
	uint32_t packed = 0;
	for (int i = 0; i < 4; i++) {
		// Mask to one byte so an out-of-range channel can't spill into its neighbours.
		const uint32_t channel = static_cast<uint32_t>(static_cast<int>(f[i] * 255.0f)) & 0xFFu;
		packed |= channel << (8 * i);
	}
	return packed;
#endif
}
|
||||
|
||||
inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
|
||||
|
|
|
@ -524,7 +524,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
|
|||
}
|
||||
|
||||
if (vertType & GE_VTYPE_COL_MASK) {
|
||||
reader.ReadColor0_8888(sv.color);
|
||||
sv.color_32 = reader.ReadColor0_8888();
|
||||
} else {
|
||||
memcpy(sv.color, defaultColor, 4);
|
||||
}
|
||||
|
@ -573,7 +573,7 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr,
|
|||
sv.uv[1] = 0.0f;
|
||||
}
|
||||
if (vertType & GE_VTYPE_COL_MASK) {
|
||||
reader.ReadColor0_8888(sv.color);
|
||||
sv.color_32 = reader.ReadColor0_8888();
|
||||
} else {
|
||||
memcpy(sv.color, defaultColor, 4);
|
||||
}
|
||||
|
|
|
@ -219,10 +219,10 @@ void SoftwareTransform::Decode(int prim, u32 vertType, const DecVtxFormat &decVt
|
|||
if (hasColor) {
|
||||
if (provokeIndOffset != 0 && index + provokeIndOffset < maxIndex) {
|
||||
reader.Goto(index + provokeIndOffset);
|
||||
reader.ReadColor0_8888(vert.color0);
|
||||
vert.color0_32 = reader.ReadColor0_8888();
|
||||
reader.Goto(index);
|
||||
} else {
|
||||
reader.ReadColor0_8888(vert.color0);
|
||||
vert.color0_32 = reader.ReadColor0_8888();
|
||||
}
|
||||
} else {
|
||||
vert.color0_32 = materialAmbientRGBA;
|
||||
|
|
|
@ -192,28 +192,27 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
void ReadColor0_8888(u8 color[4]) const {
|
||||
u32 ReadColor0_8888() const {
|
||||
switch (decFmt_.c0fmt) {
|
||||
case DEC_U8_4:
|
||||
{
|
||||
const u8 *b = (const u8 *)(data_ + decFmt_.c0off);
|
||||
memcpy(color, b, 4);
|
||||
u32 value;
|
||||
memcpy(&value, b, 4);
|
||||
return value;
|
||||
}
|
||||
break;
|
||||
case DEC_FLOAT_4:
|
||||
{
|
||||
const float *f = (const float *)(data_ + decFmt_.c0off);
|
||||
for (int i = 0; i < 4; i++)
|
||||
color[i] = f[i] * 255.0f;
|
||||
return Float4ToUint8x4_NoClamp(f);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
memset(color, 0, sizeof(u8) * 4);
|
||||
break;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ReadColor1(float color[3]) const {
|
||||
switch (decFmt_.c1fmt) {
|
||||
case DEC_U8_4:
|
||||
|
@ -281,7 +280,6 @@ public:
|
|||
case DEC_U16_3: for (int i = 0; i < 3; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
|
||||
case DEC_U16_4: for (int i = 0; i < 4; i++) weights[i+4] = s[i] * (1.f / 32768.f); break;
|
||||
default:
|
||||
ERROR_LOG_REPORT_ONCE(fmtw1, G3D, "Reader: Unsupported W1 Format %d", decFmt_.w1fmt);
|
||||
memset(weights + 4, 0, sizeof(float) * 4);
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -344,7 +344,7 @@ ClipVertexData TransformUnit::ReadVertex(VertexReader &vreader, const TransformS
|
|||
normal = -normal;
|
||||
|
||||
if (vreader.hasColor0()) {
|
||||
vreader.ReadColor0_8888((u8 *)&vertex.v.color0);
|
||||
vertex.v.color0 = vreader.ReadColor0_8888();
|
||||
} else {
|
||||
vertex.v.color0 = gstate.getMaterialAmbientRGBA();
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue