texcache: SSE optimized version of the most common case of Unswizzle

(didn't put this in fast_math because it's pretty specific to PSP)
This commit is contained in:
Henrik Rydgard 2014-03-23 01:50:50 +01:00
parent dac51b9c1b
commit 8bf015fe16

View file

@ -285,6 +285,51 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe
}
}
static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
#ifdef _M_SSE
const __m128i *src = (const __m128i *)texptr;
for (int by = 0; by < byc; by++) {
__m128i *xdest = (__m128i *)ydestp;
for (int bx = 0; bx < bxc; bx++) {
__m128i *dest = xdest;
for (int n = 0; n < 2; n++) {
// Textures are always 16-byte aligned so this is fine.
__m128i temp1 = _mm_load_si128(src);
__m128i temp2 = _mm_load_si128(src + 1);
__m128i temp3 = _mm_load_si128(src + 2);
__m128i temp4 = _mm_load_si128(src + 3);
_mm_store_si128(dest, temp1);
dest += pitch >> 2;
_mm_store_si128(dest, temp2);
dest += pitch >> 2;
_mm_store_si128(dest, temp3);
dest += pitch >> 2;
_mm_store_si128(dest, temp4);
dest += pitch >> 2;
src += 4;
}
xdest ++;
}
ydestp += (rowWidth * 8) / 4;
}
#else
const u32 *src = (const u32 *)texptr;
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
#endif
}
void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) {
const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
const u32 pitch = rowWidth / 4;
@ -295,21 +340,9 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix
u32 ydest = 0;
if (rowWidth >= 16) {
const u32 *src = (const u32 *) texptr;
u32 *ydestp = tmpTexBuf32.data();
for (int by = 0; by < byc; by++) {
u32 *xdest = ydestp;
for (int bx = 0; bx < bxc; bx++) {
u32 *dest = xdest;
for (int n = 0; n < 8; n++) {
memcpy(dest, src, 16);
dest += pitch;
src += 4;
}
xdest += 4;
}
ydestp += (rowWidth * 8) / 4;
}
// The most common one, so it gets an optimized implementation.
Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth);
} else if (rowWidth == 8) {
const u32 *src = (const u32 *) texptr;
for (int by = 0; by < byc; by++) {