diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index 6a7d92a35a..f10a3a6421 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -251,51 +251,55 @@ void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
 void DoUnswizzleTex16Basic(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
 	// ydestp is in 32-bits, so this is convenient.
 	const u32 pitchBy32 = pitch >> 2;
+	// Copies byc x bxc swizzled blocks (8 rows of 16 bytes each) from texptr into linear rows at ydestp.
 #ifdef _M_SSE
-	const __m128i *src = (const __m128i *)texptr;
-	// The pitch parameter is in bytes, so shift down for 128-bit.
-	// Note: it's always aligned to 16 bytes, so this is safe.
-	const u32 pitchBy128 = pitch >> 4;
-	for (int by = 0; by < byc; by++) {
-		__m128i *xdest = (__m128i *)ydestp;
-		for (int bx = 0; bx < bxc; bx++) {
-			__m128i *dest = xdest;
-			for (int n = 0; n < 2; n++) {
-				// Textures are always 16-byte aligned so this is fine.
-				__m128i temp1 = _mm_load_si128(src);
-				__m128i temp2 = _mm_load_si128(src + 1);
-				__m128i temp3 = _mm_load_si128(src + 2);
-				__m128i temp4 = _mm_load_si128(src + 3);
-				_mm_store_si128(dest, temp1);
-				dest += pitchBy128;
-				_mm_store_si128(dest, temp2);
-				dest += pitchBy128;
-				_mm_store_si128(dest, temp3);
-				dest += pitchBy128;
-				_mm_store_si128(dest, temp4);
-				dest += pitchBy128;
-				src += 4;
+	if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
+		const __m128i *src = (const __m128i *)texptr;
+		// The pitch parameter is in bytes, so shift down for 128-bit.
+		// Note: the check above guarantees a multiple of 16, so the shift is exact and every row store stays aligned.
+		const u32 pitchBy128 = pitch >> 4;
+		for (int by = 0; by < byc; by++) {
+			__m128i *xdest = (__m128i *)ydestp;
+			for (int bx = 0; bx < bxc; bx++) {
+				__m128i *dest = xdest;
+				for (int n = 0; n < 2; n++) {
+					// Aligned loads: assumes texptr is 16-byte aligned (textures are expected to always be).
+					__m128i temp1 = _mm_load_si128(src);
+					__m128i temp2 = _mm_load_si128(src + 1);
+					__m128i temp3 = _mm_load_si128(src + 2);
+					__m128i temp4 = _mm_load_si128(src + 3);
+					_mm_store_si128(dest, temp1);
+					dest += pitchBy128;
+					_mm_store_si128(dest, temp2);
+					dest += pitchBy128;
+					_mm_store_si128(dest, temp3);
+					dest += pitchBy128;
+					_mm_store_si128(dest, temp4);
+					dest += pitchBy128;
+					src += 4;
+				}
+				xdest++;
 			}
-			xdest++;
+			ydestp += pitchBy32 * 8;
 		}
-		ydestp += pitchBy32 * 8;
-	}
-#else
-	const u32 *src = (const u32 *)texptr;
-	for (int by = 0; by < byc; by++) {
-		u32 *xdest = ydestp;
-		for (int bx = 0; bx < bxc; bx++) {
-			u32 *dest = xdest;
-			for (int n = 0; n < 8; n++) {
-				memcpy(dest, src, 16);
-				dest += pitchBy32;
-				src += 4;
-			}
-			xdest += 4;
-		}
-		ydestp += pitchBy32 * 8;
-	}
+	} else
 #endif
+	{
+		const u32 *src = (const u32 *)texptr;
+		for (int by = 0; by < byc; by++) {
+			u32 *xdest = ydestp;
+			for (int bx = 0; bx < bxc; bx++) {
+				u32 *dest = xdest;
+				for (int n = 0; n < 8; n++) {
+					memcpy(dest, src, 16);  // One 16-byte block row; memcpy has no 16-byte alignment requirement.
+					dest += pitchBy32;
+					src += 4;
+				}
+				xdest += 4;
+			}
+			ydestp += pitchBy32 * 8;
+		}
+	}
 }
 
 #ifndef _M_SSE