diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp
index 0f1e467957..d5cce9c5a1 100644
--- a/GPU/Common/TextureDecoder.cpp
+++ b/GPU/Common/TextureDecoder.cpp
@@ -25,6 +25,9 @@
 
 #ifdef _M_SSE
 #include <xmmintrin.h>
+#if _M_SSE >= 0x401
+#include <smmintrin.h>
+#endif
 
 u32 QuickTexHashSSE2(const void *checkp, u32 size) {
 	u32 check = 0;
@@ -272,3 +275,128 @@ void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch) {
 		dst += pitch;
 	}
 }
+
+void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels) {
+	// Exchanges bits 0-7 with bits 16-23 of every 32-bit pixel; bits 8-15 and 24-31 stay where they are.
+	u32 done = 0;
+#ifdef _M_SSE
+	// Aligned loads/stores are used below, so the SIMD loop only runs when both pointers are 16-byte aligned.
+	const bool aligned = (((intptr_t)src | (intptr_t)dst) & 0xF) == 0;
+	const u32 vecCount = aligned ? numPixels / 4 : 0;
+
+	const __m128i keepMask = _mm_set1_epi32(0xFF00FF00);
+	const __m128i *in = (const __m128i *)src;
+	__m128i *out = (__m128i *)dst;
+
+	for (u32 v = 0; v < vecCount; ++v) {
+		const __m128i px = _mm_load_si128(&in[v]);
+		const __m128i kept = _mm_and_si128(px, keepMask);
+		const __m128i moving = _mm_andnot_si128(keepMask, px);
+		// A 16-bit shift each way swaps the two selected bytes; whatever crosses the lane edge is discarded.
+		const __m128i up = _mm_slli_epi32(moving, 16);
+		const __m128i down = _mm_srli_epi32(moving, 16);
+		_mm_store_si128(&out[v], _mm_or_si128(_mm_or_si128(kept, up), down));
+	}
+	// Pixels beyond the last full group of four fall through to the scalar loop.
+	done = vecCount * 4;
+#endif
+	for (u32 i = done; i < numPixels; i++) {
+		const u32 px = src[i];
+		const u32 kept = px & 0xFF00FF00;
+		const u32 down = (px >> 16) & 0x000000FF;
+		const u32 up = (px << 16) & 0x00FF0000;
+		dst[i] = kept | up | down;
+	}
+}
+
+inline u16 RGBA8888toRGBA5551(u32 px) {  // Top 5 bits of bytes 0/1/2 -> bits 0-4/5-9/10-14; bit 31 -> bit 15.
+	return ((px >> 16) & 0x8000) | ((px >> 9) & 0x7C00) | ((px >> 6) & 0x03E0) | ((px >> 3) & 0x001F);
+}
+
+void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels) {
+	// Narrows each 32-bit pixel to 16 bits: top 5 bits of bytes 0/1/2 go to bits 0-4/5-9/10-14, bit 31 to bit 15.
+	u32 done = 0;
+#if _M_SSE >= 0x401
+	// _mm_packus_epi32 needs SSE 4.1 (checked at runtime); the aligned loads/stores need 16-byte aligned pointers.
+	const bool usable = cpu_info.bSSE4_1 && (((intptr_t)src | (intptr_t)dst) & 0xF) == 0;
+	// Each iteration narrows eight pixels: two 16-byte loads feed one 16-byte store.
+	const u32 pairCount = usable ? numPixels / 8 : 0;
+
+	const __m128i alphaGreen = _mm_set1_epi32(0x8000F800);
+	const __m128i redBlue = _mm_set1_epi32(0x00F800F8);
+	const __m128i low16 = _mm_set1_epi32(0x0000FFFF);
+	const __m128i *in = (const __m128i *)src;
+	__m128i *out = (__m128i *)dst;
+
+	for (u32 p = 0; p < pairCount; ++p) {
+		const __m128i pxA = _mm_load_si128(&in[p * 2]);
+		const __m128i pxB = _mm_load_si128(&in[p * 2 + 1]);
+		const __m128i agA = _mm_and_si128(pxA, alphaGreen);
+		const __m128i rbA = _mm_and_si128(pxA, redBlue);
+		const __m128i agB = _mm_and_si128(pxB, alphaGreen);
+		const __m128i rbB = _mm_and_si128(pxB, redBlue);
+
+		// Each masked pair is shifted twice; copies that land in the wrong place sit above bit 15 (or fall off).
+		__m128i loA = _mm_or_si128(_mm_srli_epi32(agA, 16), _mm_srli_epi32(agA, 6));
+		loA = _mm_or_si128(loA, _mm_or_si128(_mm_srli_epi32(rbA, 3), _mm_srli_epi32(rbA, 9)));
+		__m128i loB = _mm_or_si128(_mm_srli_epi32(agB, 16), _mm_srli_epi32(agB, 6));
+		loB = _mm_or_si128(loB, _mm_or_si128(_mm_srli_epi32(rbB, 3), _mm_srli_epi32(rbB, 9)));
+
+		// Clear the upper halves so every lane fits in 16 bits before the saturating pack.
+		loA = _mm_and_si128(loA, low16);
+		loB = _mm_and_si128(loB, low16);
+		_mm_store_si128(&out[p], _mm_packus_epi32(loA, loB));
+	}
+	// Leftover pixels (or all of them, when the SIMD loop was skipped) go through the scalar helper.
+	done = pairCount * 8;
+#endif
+	for (u32 i = done; i < numPixels; i++) {
+		dst[i] = RGBA8888toRGBA5551(src[i]);
+	}
+}
+
+inline u16 BGRA8888toRGBA5551(u32 px) {  // Top 5 bits of bytes 2/1/0 -> bits 0-4/5-9/10-14; bit 31 -> bit 15.
+	return ((px >> 16) & 0x8000) | ((px << 7) & 0x7C00) | ((px >> 6) & 0x03E0) | ((px >> 19) & 0x001F);
+}
+
+void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels) {
+	// Narrows each 32-bit pixel to 16 bits: top 5 bits of bytes 2/1/0 go to bits 0-4/5-9/10-14, bit 31 to bit 15.
+	u32 done = 0;
+#if _M_SSE >= 0x401
+	// _mm_packus_epi32 needs SSE 4.1 (checked at runtime); the aligned loads/stores need 16-byte aligned pointers.
+	const bool usable = cpu_info.bSSE4_1 && (((intptr_t)src | (intptr_t)dst) & 0xF) == 0;
+	// Each iteration narrows eight pixels: two 16-byte loads feed one 16-byte store.
+	const u32 pairCount = usable ? numPixels / 8 : 0;
+
+	const __m128i alphaGreen = _mm_set1_epi32(0x8000F800);
+	const __m128i redBlue = _mm_set1_epi32(0x00F800F8);
+	const __m128i low16 = _mm_set1_epi32(0x0000FFFF);
+	const __m128i *in = (const __m128i *)src;
+	__m128i *out = (__m128i *)dst;
+
+	for (u32 p = 0; p < pairCount; ++p) {
+		const __m128i pxA = _mm_load_si128(&in[p * 2]);
+		const __m128i pxB = _mm_load_si128(&in[p * 2 + 1]);
+		const __m128i agA = _mm_and_si128(pxA, alphaGreen);
+		const __m128i rbA = _mm_and_si128(pxA, redBlue);
+		const __m128i agB = _mm_and_si128(pxB, alphaGreen);
+		const __m128i rbB = _mm_and_si128(pxB, redBlue);
+
+		// Each masked pair is shifted twice; copies that land in the wrong place sit above bit 15 (or fall off).
+		__m128i loA = _mm_or_si128(_mm_srli_epi32(agA, 16), _mm_srli_epi32(agA, 6));
+		loA = _mm_or_si128(loA, _mm_or_si128(_mm_srli_epi32(rbA, 19), _mm_slli_epi32(rbA, 7)));
+		__m128i loB = _mm_or_si128(_mm_srli_epi32(agB, 16), _mm_srli_epi32(agB, 6));
+		loB = _mm_or_si128(loB, _mm_or_si128(_mm_srli_epi32(rbB, 19), _mm_slli_epi32(rbB, 7)));
+
+		// Clear the upper halves so every lane fits in 16 bits before the saturating pack.
+		loA = _mm_and_si128(loA, low16);
+		loB = _mm_and_si128(loB, low16);
+		_mm_store_si128(&out[p], _mm_packus_epi32(loA, loB));
+	}
+	// Leftover pixels (or all of them, when the SIMD loop was skipped) go through the scalar helper.
+	done = pairCount * 8;
+#endif
+	for (u32 i = done; i < numPixels; i++) {
+		dst[i] = BGRA8888toRGBA5551(src[i]);
+	}
+}
diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h
index e179c67785..9f1ba46675 100644
--- a/GPU/Common/TextureDecoder.h
+++ b/GPU/Common/TextureDecoder.h
@@ -201,3 +201,7 @@ inline void DeIndexTexture4Optimal(ClutT *dest, const u32 texaddr, int length, C
 	const u8 *indexed = (const u8 *) Memory::GetPointer(texaddr);
 	DeIndexTexture4Optimal(dest, indexed, length, color);
 }
+
+void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels);  // Swaps bytes 0 and 2 of each 32-bit pixel.
+void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels);  // One 16-bit 5551 value per 32-bit input pixel.
+void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels);  // As above, with bytes 0 and 2 of the input exchanged.
diff --git a/GPU/GLES/Framebuffer.cpp b/GPU/GLES/Framebuffer.cpp
index 1ed8242e42..fefd6d845e 100644
--- a/GPU/GLES/Framebuffer.cpp
+++ b/GPU/GLES/Framebuffer.cpp
@@ -35,6 +35,7 @@
 #include "GPU/GPUState.h"
 
 #include "GPU/Common/PostShader.h"
+#include "GPU/Common/TextureDecoder.h"
 #include "GPU/GLES/Framebuffer.h"
 #include "GPU/GLES/TextureCache.h"
 #include "GPU/GLES/ShaderManager.h"
@@ -111,11 +112,15 @@ inline u16 RGBA8888toRGBA4444(u32 px) {
 	return ((px >> 4) & 0x000F) | ((px >> 8) & 0x00F0) | ((px >> 12) & 0x0F00) | ((px >> 16) & 0xF000);
 }
 
-inline u16 RGBA8888toRGBA5551(u32 px) {
-	return ((px >> 3) & 0x001F) | ((px >> 6) & 0x03E0) | ((px >> 9) & 0x7C00) | ((px >> 16) & 0x8000);
+inline u16 BGRA8888toRGB565(u32 px) {  // Byte 0 top 5 bits -> bits 11-15, byte 1 top 6 -> bits 5-10, byte 2 top 5 -> bits 0-4.
+	return ((px << 8) & 0xF800) | ((px >> 5) & 0x07E0) | ((px >> 19) & 0x001F);
+}
 
-void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format);
+inline u16 BGRA8888toRGBA4444(u32 px) {  // Top 4 bits of bytes 3/0/1/2 -> bits 12-15/8-11/4-7/0-3.
+	return ((px >> 16) & 0xF000) | ((px << 4) & 0x0F00) | ((px >> 8) & 0x00F0) | ((px >> 20) & 0x000F);
+}
+
+void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 stride, u32 height, GEBufferFormat format);
 
 void CenterRect(float *x, float *y, float *w, float *h,
                 float origW, float origH, float frameW, float frameH) {
@@ -1256,12 +1261,23 @@ void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *src, VirtualFrameb
 	fbo_unbind();
 }
 
+static inline bool UseBGRA8888() {
+#ifdef _WIN32
+	return gl_extensions.EXT_bgra;  // Read back as BGRA only when gl_extensions reports EXT_bgra.
+#else
+	return false;  // TODO: Other platforms?  May depend on vendor which is faster?
+#endif
+}
+
 // TODO: SSE/NEON
 // Could also make C fake-simd for 64-bit, two 8888 pixels fit in a register :)
-void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format) {
+void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 stride, u32 height, GEBufferFormat format) {
 	if (format == GE_FORMAT_8888) {
 		if (src == dst) {
 			return;
+		} else if (UseBGRA8888()) {
+			u32 numPixels = height * stride;
+			ConvertBGRA8888ToRGBA8888((u32 *)dst, (const u32 *)src, numPixels);
 		} else { // Here lets assume they don't intersect
 			memcpy(dst, src, stride * height * 4);
 		}
@@ -1271,25 +1287,38 @@ void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferForma
 		u16 *dst16 = (u16 *)dst;
 		switch (format) {
 			case GE_FORMAT_565: // BGR 565
-				for (int i = 0; i < size; i++) {
-					dst16[i] = RGBA8888toRGB565(src32[i]);
+				if (UseBGRA8888()) {
+					for (int i = 0; i < size; i++) {
+						dst16[i] = BGRA8888toRGB565(src32[i]);
+					}
+				} else {
+					for (int i = 0; i < size; i++) {
+						dst16[i] = RGBA8888toRGB565(src32[i]);
+					}
 				}
 				break;
 			case GE_FORMAT_5551: // ABGR 1555
-				for (int i = 0; i < size; i++) {
-					dst16[i] = RGBA8888toRGBA5551(src32[i]);
+				if (UseBGRA8888()) {
+					ConvertBGRA8888ToRGBA5551(dst16, src32, size);
+				} else {
+					ConvertRGBA8888ToRGBA5551(dst16, src32, size);
 				}
 				break;
 			case GE_FORMAT_4444: // ABGR 4444
-				for (int i = 0; i < size; i++) {
-					dst16[i] = RGBA8888toRGBA4444(src32[i]);
+				if (UseBGRA8888()) {
+					for (int i = 0; i < size; i++) {
+						dst16[i] = BGRA8888toRGBA4444(src32[i]);
+					}
+				} else {
+					for (int i = 0; i < size; i++) {
+						dst16[i] = RGBA8888toRGBA4444(src32[i]);
+					}
 				}
 				break;
 			case GE_FORMAT_8888:
+			case GE_FORMAT_INVALID:
 				// Not possible.
 				break;
-			default:
-				break;
 		}
 	}
 }
@@ -1317,24 +1346,24 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) {
 	}
 
 	// Receive previously requested data from a PBO
-	if (pixelBufObj_[nextPBO].reading) {
-		glBindBuffer(GL_PIXEL_PACK_BUFFER, pixelBufObj_[nextPBO].handle);
+	AsyncPBO &pbo = pixelBufObj_[nextPBO];
+	if (pbo.reading) {
+		glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo.handle);
 		packed = (GLubyte *)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
 
 		if (packed) {
 			DEBUG_LOG(SCEGE, "Reading PBO to memory , bufSize = %u, packed = %p, fb_address = %08x, stride = %u, pbo = %u",
-			pixelBufObj_[nextPBO].size, packed, pixelBufObj_[nextPBO].fb_address, pixelBufObj_[nextPBO].stride, nextPBO);
+			pbo.size, packed, pbo.fb_address, pbo.stride, nextPBO);
 
-			if (useCPU) {
-				ConvertFromRGBA8888(Memory::GetPointer(pixelBufObj_[nextPBO].fb_address), packed,
-								pixelBufObj_[nextPBO].stride, pixelBufObj_[nextPBO].height,
-								pixelBufObj_[nextPBO].format);
+			if (useCPU || (UseBGRA8888() && pbo.format == GE_FORMAT_8888)) {
+				u8 *dst = Memory::GetPointer(pbo.fb_address);
+				ConvertFromRGBA8888(dst, packed, pbo.stride, pbo.height, pbo.format);
 			} else {
 				// We don't need to convert, GPU already did (or should have)
-				Memory::Memcpy(pixelBufObj_[nextPBO].fb_address, packed, pixelBufObj_[nextPBO].size);
+				Memory::Memcpy(pbo.fb_address, packed, pbo.size);
 			}
 
-			pixelBufObj_[nextPBO].reading = false;
+			pbo.reading = false;
 		}
 
 		glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
@@ -1371,13 +1400,14 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) {
 			case GE_FORMAT_8888: // 32 bit RGBA
 			default:
 				pixelType = GL_UNSIGNED_BYTE;
-				pixelFormat = GL_RGBA;
+				pixelFormat = UseBGRA8888() ? GL_BGRA_EXT : GL_RGBA;
 				pixelSize = 4;
 				align = 4;
 				break;
 		}
 
-		u32 bufSize = vfb->fb_stride * vfb->height * pixelSize;
+		// If using the CPU, we need 4 bytes per pixel always.
+		u32 bufSize = vfb->fb_stride * vfb->height * (useCPU ? 4 : pixelSize);
 		u32 fb_address = (0x04000000) | vfb->fb_address;
 
 		if (vfb->fbo) {
@@ -1404,19 +1434,14 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) {
 
 		if (pixelBufObj_[currentPBO_].maxSize < bufSize) {
 			// We reserve a buffer big enough to fit all those pixels
-			if (useCPU && pixelType != GL_UNSIGNED_BYTE) {
-				// Wnd result may be 16-bit but we are reading 32-bit, so we need double the space on the buffer
-				glBufferData(GL_PIXEL_PACK_BUFFER, bufSize*2, NULL, GL_DYNAMIC_READ);
-			} else {
-				glBufferData(GL_PIXEL_PACK_BUFFER, bufSize, NULL, GL_DYNAMIC_READ);
-			}
+			glBufferData(GL_PIXEL_PACK_BUFFER, bufSize, NULL, GL_DYNAMIC_READ);
 			pixelBufObj_[currentPBO_].maxSize = bufSize;
 		}
 
 		if (useCPU) {
 			// If converting pixel formats on the CPU we'll always request RGBA8888
 			glPixelStorei(GL_PACK_ALIGNMENT, 4);
-			glReadPixels(0, 0, vfb->fb_stride, vfb->height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
+			glReadPixels(0, 0, vfb->fb_stride, vfb->height, UseBGRA8888() ? GL_BGRA_EXT : GL_RGBA, GL_UNSIGNED_BYTE, 0);
 		} else {
 			// Otherwise we'll directly request the format we need and let the GPU sort it out
 			glPixelStorei(GL_PACK_ALIGNMENT, align);
diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
index c575e4f2ef..be0ceac43c 100644
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@@ -663,8 +663,12 @@ void GPUCommon::ProcessDLQueueInternal() {
 			return;
 		} else {
 			easy_guard guard(listLock);
-			// At the end, we can remove it from the queue and continue.
-			dlQueue.erase(std::remove(dlQueue.begin(), dlQueue.end(), listIndex), dlQueue.end());
+
+			// Some other list could've taken the spot while we dilly-dallied around.
+			if (l.state != PSP_GE_DL_STATE_QUEUED) {
+				// At the end, we can remove it from the queue and continue.
+				dlQueue.erase(std::remove(dlQueue.begin(), dlQueue.end(), listIndex), dlQueue.end());
+			}
 			UpdateTickEstimate(std::max(busyTicks, startingTicks + cyclesExecuted));
 		}
 	}