diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index 633f0dbcda..c780ee363d 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -285,6 +285,51 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe } } +static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) { +#ifdef _M_SSE + const __m128i *src = (const __m128i *)texptr; + for (int by = 0; by < byc; by++) { + __m128i *xdest = (__m128i *)ydestp; + for (int bx = 0; bx < bxc; bx++) { + __m128i *dest = xdest; + for (int n = 0; n < 2; n++) { + // Textures are always 16-byte aligned so this is fine. + __m128i temp1 = _mm_load_si128(src); + __m128i temp2 = _mm_load_si128(src + 1); + __m128i temp3 = _mm_load_si128(src + 2); + __m128i temp4 = _mm_load_si128(src + 3); + _mm_store_si128(dest, temp1); + dest += pitch >> 2; + _mm_store_si128(dest, temp2); + dest += pitch >> 2; + _mm_store_si128(dest, temp3); + dest += pitch >> 2; + _mm_store_si128(dest, temp4); + dest += pitch >> 2; + src += 4; + } + xdest ++; + } + ydestp += (rowWidth * 8) / 4; + } +#else + const u32 *src = (const u32 *)texptr; + for (int by = 0; by < byc; by++) { + u32 *xdest = ydestp; + for (int bx = 0; bx < bxc; bx++) { + u32 *dest = xdest; + for (int n = 0; n < 8; n++) { + memcpy(dest, src, 16); + dest += pitch; + src += 4; + } + xdest += 4; + } + ydestp += (rowWidth * 8) / 4; + } +#endif +} + void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) { const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2); const u32 pitch = rowWidth / 4; @@ -295,21 +340,9 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix u32 ydest = 0; if (rowWidth >= 16) { - const u32 *src = (const u32 *) texptr; u32 *ydestp = tmpTexBuf32.data(); - for (int by = 0; by < byc; by++) { - u32 *xdest = ydestp; - for (int bx = 0; bx < bxc; bx++) { - u32 *dest = xdest; - for (int n = 0; n < 8; n++) { - memcpy(dest, src, 16); - dest += pitch; - src += 4; - } - xdest += 4; - } - ydestp += (rowWidth * 8) / 4; - } + // The most common one, so it gets an optimized implementation. + Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth); } else if (rowWidth == 8) { const u32 *src = (const u32 *) texptr; for (int by = 0; by < byc; by++) {