diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index 6a7d92a35a..f10a3a6421 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -251,51 +251,60 @@ void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
 void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
+	// Copies bxc * byc blocks, read linearly from texptr, into ydestp.  Each
+	// block is 8 rows of 16 bytes; destination rows are pitch bytes apart.
 	// ydestp is in 32-bits, so this is convenient.
 	const u32 pitchBy32 = pitch >> 2;
+
 #ifdef _M_SSE
-	const __m128i *src = (const __m128i *)texptr;
-	// The pitch parameter is in bytes, so shift down for 128-bit.
-	// Note: it's always aligned to 16 bytes, so this is safe.
-	const u32 pitchBy128 = pitch >> 4;
-	for (int by = 0; by < byc; by++) {
-		__m128i *xdest = (__m128i *)ydestp;
-		for (int bx = 0; bx < bxc; bx++) {
-			__m128i *dest = xdest;
-			for (int n = 0; n < 2; n++) {
-				// Textures are always 16-byte aligned so this is fine.
-				__m128i temp1 = _mm_load_si128(src);
-				__m128i temp2 = _mm_load_si128(src + 1);
-				__m128i temp3 = _mm_load_si128(src + 2);
-				__m128i temp4 = _mm_load_si128(src + 3);
-				_mm_store_si128(dest, temp1);
-				dest += pitchBy128;
-				_mm_store_si128(dest, temp2);
-				dest += pitchBy128;
-				_mm_store_si128(dest, temp3);
-				dest += pitchBy128;
-				_mm_store_si128(dest, temp4);
-				dest += pitchBy128;
-				src += 4;
+	// The aligned stores below need a 16-byte aligned destination AND a pitch
+	// that is a multiple of 16; otherwise (pitch >> 4) truncates the stride and
+	// every row after the first is misaligned.  Anything else uses memcpy.
+	if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
+		const __m128i *src = (const __m128i *)texptr;
+		// The pitch parameter is in bytes, so shift down for 128-bit.
+		const u32 pitchBy128 = pitch >> 4;
+		for (int by = 0; by < byc; by++) {
+			__m128i *xdest = (__m128i *)ydestp;
+			for (int bx = 0; bx < bxc; bx++) {
+				__m128i *dest = xdest;
+				for (int n = 0; n < 2; n++) {
+					// Aligned loads: relies on texptr being 16-byte aligned, which is not checked here.
+					__m128i temp1 = _mm_load_si128(src);
+					__m128i temp2 = _mm_load_si128(src + 1);
+					__m128i temp3 = _mm_load_si128(src + 2);
+					__m128i temp4 = _mm_load_si128(src + 3);
+					_mm_store_si128(dest, temp1);
+					dest += pitchBy128;
+					_mm_store_si128(dest, temp2);
+					dest += pitchBy128;
+					_mm_store_si128(dest, temp3);
+					dest += pitchBy128;
+					_mm_store_si128(dest, temp4);
+					dest += pitchBy128;
+					src += 4;
+				}
+				xdest++;
 			}
-			xdest++;
+			ydestp += pitchBy32 * 8;
 		}
-		ydestp += pitchBy32 * 8;
-	}
-#else
-	const u32 *src = (const u32 *)texptr;
-	for (int by = 0; by < byc; by++) {
-		u32 *xdest = ydestp;
-		for (int bx = 0; bx < bxc; bx++) {
-			u32 *dest = xdest;
-			for (int n = 0; n < 8; n++) {
-				memcpy(dest, src, 16);
-				dest += pitchBy32;
-				src += 4;
-			}
-			xdest += 4;
-		}
-		ydestp += pitchBy32 * 8;
-	}
+	} else
 #endif
+	{
+		// Fallback for an unaligned destination or pitch, and for non-SSE builds.
+		const u32 *src = (const u32 *)texptr;
+		for (int by = 0; by < byc; by++) {
+			u32 *xdest = ydestp;
+			for (int bx = 0; bx < bxc; bx++) {
+				u32 *dest = xdest;
+				for (int n = 0; n < 8; n++) {
+					memcpy(dest, src, 16);
+					dest += pitchBy32;
+					src += 4;
+				}
+				xdest += 4;
+			}
+			ydestp += pitchBy32 * 8;
+		}
+	}
 }
 
 #ifndef _M_SSE