diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp
index 633f0dbcda..c780ee363d 100644
--- a/GPU/GLES/TextureCache.cpp
+++ b/GPU/GLES/TextureCache.cpp
@@ -285,6 +285,51 @@ void TextureCache::NotifyFramebuffer(u32 address, VirtualFramebuffer *framebuffe
 	}
 }
 
+static void Unswizzle16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch, u32 rowWidth) {
+#ifdef _M_SSE
+	const __m128i *src = (const __m128i *)texptr;
+	for (int by = 0; by < byc; by++) {
+		__m128i *xdest = (__m128i *)ydestp;
+		for (int bx = 0; bx < bxc; bx++) {
+			__m128i *dest = xdest;
+			for (int n = 0; n < 2; n++) {
+				// Textures are always 16-byte aligned so this is fine.
+				__m128i temp1 = _mm_load_si128(src);
+				__m128i temp2 = _mm_load_si128(src + 1);
+				__m128i temp3 = _mm_load_si128(src + 2);
+				__m128i temp4 = _mm_load_si128(src + 3);
+				_mm_store_si128(dest, temp1);
+				dest += pitch >> 2;
+				_mm_store_si128(dest, temp2);
+				dest += pitch >> 2;
+				_mm_store_si128(dest, temp3);
+				dest += pitch >> 2;
+				_mm_store_si128(dest, temp4);
+				dest += pitch >> 2;
+				src += 4;
+			}
+			xdest ++;
+		}
+		ydestp += (rowWidth * 8) / 4;
+	}
+#else
+	const u32 *src = (const u32 *)texptr;
+	for (int by = 0; by < byc; by++) {
+		u32 *xdest = ydestp;
+		for (int bx = 0; bx < bxc; bx++) {
+			u32 *dest = xdest;
+			for (int n = 0; n < 8; n++) {
+				memcpy(dest, src, 16);
+				dest += pitch;
+				src += 4;
+			}
+			xdest += 4;
+		}
+		ydestp += (rowWidth * 8) / 4;
+	}
+#endif
+}
+
 void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPixel, u32 level) {
 	const u32 rowWidth = (bytesPerPixel > 0) ? (bufw * bytesPerPixel) : (bufw / 2);
 	const u32 pitch = rowWidth / 4;
@@ -295,21 +340,9 @@ void *TextureCache::UnswizzleFromMem(const u8 *texptr, u32 bufw, u32 bytesPerPix
 
 	u32 ydest = 0;
 	if (rowWidth >= 16) {
-		const u32 *src = (const u32 *) texptr;
 		u32 *ydestp = tmpTexBuf32.data();
-		for (int by = 0; by < byc; by++) {
-			u32 *xdest = ydestp;
-			for (int bx = 0; bx < bxc; bx++) {
-				u32 *dest = xdest;
-				for (int n = 0; n < 8; n++) {
-					memcpy(dest, src, 16);
-					dest += pitch;
-					src += 4;
-				}
-				xdest += 4;
-			}
-			ydestp += (rowWidth * 8) / 4;
-		}
+		// The most common one, so it gets an optimized implementation.
+		Unswizzle16(texptr, ydestp, bxc, byc, pitch, rowWidth);
 	} else if (rowWidth == 8) {
 		const u32 *src = (const u32 *) texptr;
 		for (int by = 0; by < byc; by++) {