diff --git a/GPU/Common/TextureDecoder.cpp b/GPU/Common/TextureDecoder.cpp index 0f1e467957..d5cce9c5a1 100644 --- a/GPU/Common/TextureDecoder.cpp +++ b/GPU/Common/TextureDecoder.cpp @@ -25,6 +25,9 @@ #ifdef _M_SSE #include +#if _M_SSE >= 0x401 +#include +#endif u32 QuickTexHashSSE2(const void *checkp, u32 size) { u32 check = 0; @@ -272,3 +275,128 @@ void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch) { dst += pitch; } } + +void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels) { +#ifdef _M_SSE + const __m128i maskGA = _mm_set1_epi32(0xFF00FF00); + + const __m128i *srcp = (const __m128i *)src; + __m128i *dstp = (__m128i *)dst; + u32 sseChunks = numPixels / 4; + if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) { + sseChunks = 0; + } + for (u32 i = 0; i < sseChunks; ++i) { + __m128i c = _mm_load_si128(&srcp[i]); + __m128i rb = _mm_andnot_si128(maskGA, c); + c = _mm_and_si128(c, maskGA); + + __m128i b = _mm_srli_epi32(rb, 16); + __m128i r = _mm_slli_epi32(rb, 16); + c = _mm_or_si128(_mm_or_si128(c, r), b); + _mm_store_si128(&dstp[i], c); + } + // The remainder starts right after those done via SSE. + u32 i = sseChunks * 4; +#else + u32 i = 0; +#endif + for (; i < numPixels; i++) { + const u32 c = src[i]; + dst[i] = ((c >> 16) & 0x000000FF) | + ((c >> 0) & 0xFF00FF00) | + ((c << 16) & 0x00FF0000); + } +} + +inline u16 RGBA8888toRGBA5551(u32 px) { + return ((px >> 3) & 0x001F) | ((px >> 6) & 0x03E0) | ((px >> 9) & 0x7C00) | ((px >> 16) & 0x8000); +} + +void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels) { +#if _M_SSE >= 0x401 + const __m128i maskAG = _mm_set1_epi32(0x8000F800); + const __m128i maskRB = _mm_set1_epi32(0x00F800F8); + const __m128i mask = _mm_set1_epi32(0x0000FFFF); + + const __m128i *srcp = (const __m128i *)src; + __m128i *dstp = (__m128i *)dst; + u32 sseChunks = (numPixels / 4) & ~1; + // SSE 4.1 required for _mm_packus_epi32. + if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) { + sseChunks = 0; + } + for (u32 i = 0; i < sseChunks; i += 2) { + __m128i c1 = _mm_load_si128(&srcp[i + 0]); + __m128i c2 = _mm_load_si128(&srcp[i + 1]); + __m128i ag, rb; + + ag = _mm_and_si128(c1, maskAG); + ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6)); + rb = _mm_and_si128(c1, maskRB); + rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9)); + c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask); + + ag = _mm_and_si128(c2, maskAG); + ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6)); + rb = _mm_and_si128(c2, maskRB); + rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9)); + c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask); + + _mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2)); + } + // The remainder starts right after those done via SSE. + u32 i = sseChunks * 4; +#else + u32 i = 0; +#endif + for (; i < numPixels; i++) { + dst[i] = RGBA8888toRGBA5551(src[i]); + } +} + +inline u16 BGRA8888toRGBA5551(u32 px) { + return ((px >> 19) & 0x001F) | ((px >> 6) & 0x03E0) | ((px << 7) & 0x7C00) | ((px >> 16) & 0x8000); +} + +void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels) { +#if _M_SSE >= 0x401 + const __m128i maskAG = _mm_set1_epi32(0x8000F800); + const __m128i maskRB = _mm_set1_epi32(0x00F800F8); + const __m128i mask = _mm_set1_epi32(0x0000FFFF); + + const __m128i *srcp = (const __m128i *)src; + __m128i *dstp = (__m128i *)dst; + u32 sseChunks = (numPixels / 4) & ~1; + // SSE 4.1 required for _mm_packus_epi32. + if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF) || !cpu_info.bSSE4_1) { + sseChunks = 0; + } + for (u32 i = 0; i < sseChunks; i += 2) { + __m128i c1 = _mm_load_si128(&srcp[i + 0]); + __m128i c2 = _mm_load_si128(&srcp[i + 1]); + __m128i ag, rb; + + ag = _mm_and_si128(c1, maskAG); + ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6)); + rb = _mm_and_si128(c1, maskRB); + rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7)); + c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask); + + ag = _mm_and_si128(c2, maskAG); + ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6)); + rb = _mm_and_si128(c2, maskRB); + rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7)); + c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask); + + _mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2)); + } + // The remainder starts right after those done via SSE. + u32 i = sseChunks * 4; +#else + u32 i = 0; +#endif + for (; i < numPixels; i++) { + dst[i] = BGRA8888toRGBA5551(src[i]); + } +} diff --git a/GPU/Common/TextureDecoder.h b/GPU/Common/TextureDecoder.h index e179c67785..9f1ba46675 100644 --- a/GPU/Common/TextureDecoder.h +++ b/GPU/Common/TextureDecoder.h @@ -201,3 +201,7 @@ inline void DeIndexTexture4Optimal(ClutT *dest, const u32 texaddr, int length, C const u8 *indexed = (const u8 *) Memory::GetPointer(texaddr); DeIndexTexture4Optimal(dest, indexed, length, color); } + +void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels); +void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels); +void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels); diff --git a/GPU/GLES/Framebuffer.cpp b/GPU/GLES/Framebuffer.cpp index 1ed8242e42..fefd6d845e 100644 --- a/GPU/GLES/Framebuffer.cpp +++ b/GPU/GLES/Framebuffer.cpp @@ -35,6 +35,7 @@ #include "GPU/GPUState.h" #include "GPU/Common/PostShader.h" +#include "GPU/Common/TextureDecoder.h" #include "GPU/GLES/Framebuffer.h" #include "GPU/GLES/TextureCache.h" #include "GPU/GLES/ShaderManager.h" @@ -111,11 +112,15 @@ inline u16 RGBA8888toRGBA4444(u32 px) { return ((px >> 4) & 0x000F) | ((px >> 8) & 0x00F0) | ((px >> 12) & 0x0F00) | ((px >> 16) & 0xF000); } -inline u16 RGBA8888toRGBA5551(u32 px) { - return ((px >> 3) & 0x001F) | ((px >> 6) & 0x03E0) | ((px >> 9) & 0x7C00) | ((px >> 16) & 0x8000); +inline u16 BGRA8888toRGB565(u32 px) { + return ((px >> 19) & 0x001F) | ((px >> 5) & 0x07E0) | ((px << 8) & 0xF800); } -void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format); +inline u16 BGRA8888toRGBA4444(u32 px) { + return ((px >> 20) & 0x000F) | ((px >> 8) & 0x00F0) | ((px << 4) & 0x0F00) | ((px >> 16) & 0xF000); +} + +void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 stride, u32 height, GEBufferFormat format); void CenterRect(float *x, float *y, float *w, float *h, float origW, float origH, float frameW, float frameH) { @@ -1256,12 +1261,23 @@ void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *src, VirtualFrameb fbo_unbind(); } +static inline bool UseBGRA8888() { + // TODO: Other platforms? May depend on vendor which is faster? +#ifdef _WIN32 + return gl_extensions.EXT_bgra; +#endif + return false; +} + // TODO: SSE/NEON // Could also make C fake-simd for 64-bit, two 8888 pixels fit in a register :) -void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format) { +void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 stride, u32 height, GEBufferFormat format) { if (format == GE_FORMAT_8888) { if (src == dst) { return; + } else if (UseBGRA8888()) { + u32 numPixels = height * stride; + ConvertBGRA8888ToRGBA8888((u32 *)dst, (const u32 *)src, numPixels); } else { // Here lets assume they don't intersect memcpy(dst, src, stride * height * 4); } @@ -1271,25 +1287,38 @@ void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferForma u16 *dst16 = (u16 *)dst; switch (format) { case GE_FORMAT_565: // BGR 565 - for (int i = 0; i < size; i++) { - dst16[i] = RGBA8888toRGB565(src32[i]); + if (UseBGRA8888()) { + for (int i = 0; i < size; i++) { + dst16[i] = BGRA8888toRGB565(src32[i]); + } + } else { + for (int i = 0; i < size; i++) { + dst16[i] = RGBA8888toRGB565(src32[i]); + } } break; case GE_FORMAT_5551: // ABGR 1555 - for (int i = 0; i < size; i++) { - dst16[i] = RGBA8888toRGBA5551(src32[i]); + if (UseBGRA8888()) { + ConvertBGRA8888ToRGBA5551(dst16, src32, size); + } else { + ConvertRGBA8888ToRGBA5551(dst16, src32, size); } break; case GE_FORMAT_4444: // ABGR 4444 - for (int i = 0; i < size; i++) { - dst16[i] = RGBA8888toRGBA4444(src32[i]); + if (UseBGRA8888()) { + for (int i = 0; i < size; i++) { + dst16[i] = BGRA8888toRGBA4444(src32[i]); + } + } else { + for (int i = 0; i < size; i++) { + dst16[i] = RGBA8888toRGBA4444(src32[i]); + } } break; case GE_FORMAT_8888: + case GE_FORMAT_INVALID: // Not possible. break; - default: - break; } } } @@ -1317,24 +1346,24 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) { } // Receive previously requested data from a PBO - if (pixelBufObj_[nextPBO].reading) { - glBindBuffer(GL_PIXEL_PACK_BUFFER, pixelBufObj_[nextPBO].handle); + AsyncPBO &pbo = pixelBufObj_[nextPBO]; + if (pbo.reading) { + glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo.handle); packed = (GLubyte *)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); if (packed) { DEBUG_LOG(SCEGE, "Reading PBO to memory , bufSize = %u, packed = %p, fb_address = %08x, stride = %u, pbo = %u", - pixelBufObj_[nextPBO].size, packed, pixelBufObj_[nextPBO].fb_address, pixelBufObj_[nextPBO].stride, nextPBO); + pbo.size, packed, pbo.fb_address, pbo.stride, nextPBO); - if (useCPU) { - ConvertFromRGBA8888(Memory::GetPointer(pixelBufObj_[nextPBO].fb_address), packed, - pixelBufObj_[nextPBO].stride, pixelBufObj_[nextPBO].height, - pixelBufObj_[nextPBO].format); + if (useCPU || (UseBGRA8888() && pbo.format == GE_FORMAT_8888)) { + u8 *dst = Memory::GetPointer(pbo.fb_address); + ConvertFromRGBA8888(dst, packed, pbo.stride, pbo.height, pbo.format); } else { // We don't need to convert, GPU already did (or should have) - Memory::Memcpy(pixelBufObj_[nextPBO].fb_address, packed, pixelBufObj_[nextPBO].size); + Memory::Memcpy(pbo.fb_address, packed, pbo.size); } - pixelBufObj_[nextPBO].reading = false; + pbo.reading = false; } glUnmapBuffer(GL_PIXEL_PACK_BUFFER); @@ -1371,13 +1400,14 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) { case GE_FORMAT_8888: // 32 bit RGBA default: pixelType = GL_UNSIGNED_BYTE; - pixelFormat = GL_RGBA; + pixelFormat = UseBGRA8888() ? GL_BGRA_EXT : GL_RGBA; pixelSize = 4; align = 4; break; } - u32 bufSize = vfb->fb_stride * vfb->height * pixelSize; + // If using the CPU, we need 4 bytes per pixel always. + u32 bufSize = vfb->fb_stride * vfb->height * (useCPU ? 4 : pixelSize); u32 fb_address = (0x04000000) | vfb->fb_address; if (vfb->fbo) { @@ -1404,19 +1434,14 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) { if (pixelBufObj_[currentPBO_].maxSize < bufSize) { // We reserve a buffer big enough to fit all those pixels - if (useCPU && pixelType != GL_UNSIGNED_BYTE) { - // Wnd result may be 16-bit but we are reading 32-bit, so we need double the space on the buffer - glBufferData(GL_PIXEL_PACK_BUFFER, bufSize*2, NULL, GL_DYNAMIC_READ); - } else { - glBufferData(GL_PIXEL_PACK_BUFFER, bufSize, NULL, GL_DYNAMIC_READ); - } + glBufferData(GL_PIXEL_PACK_BUFFER, bufSize, NULL, GL_DYNAMIC_READ); pixelBufObj_[currentPBO_].maxSize = bufSize; } if (useCPU) { // If converting pixel formats on the CPU we'll always request RGBA8888 glPixelStorei(GL_PACK_ALIGNMENT, 4); - glReadPixels(0, 0, vfb->fb_stride, vfb->height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glReadPixels(0, 0, vfb->fb_stride, vfb->height, UseBGRA8888() ? GL_BGRA_EXT : GL_RGBA, GL_UNSIGNED_BYTE, 0); } else { // Otherwise we'll directly request the format we need and let the GPU sort it out glPixelStorei(GL_PACK_ALIGNMENT, align); diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index c575e4f2ef..be0ceac43c 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -663,8 +663,12 @@ void GPUCommon::ProcessDLQueueInternal() { return; } else { easy_guard guard(listLock); - // At the end, we can remove it from the queue and continue. - dlQueue.erase(std::remove(dlQueue.begin(), dlQueue.end(), listIndex), dlQueue.end()); + + // Some other list could've taken the spot while we dilly-dallied around. + if (l.state != PSP_GE_DL_STATE_QUEUED) { + // At the end, we can remove it from the queue and continue. + dlQueue.erase(std::remove(dlQueue.begin(), dlQueue.end(), listIndex), dlQueue.end()); + } UpdateTickEstimate(std::max(busyTicks, startingTicks + cyclesExecuted)); } }