mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Merge pull request #6013 from unknownbrackets/gpu-minor
Fix another race, fix bug in CPU framebuf download, speed up
This commit is contained in:
commit
e5c830e3d7
4 changed files with 193 additions and 32 deletions
|
@ -25,6 +25,9 @@
|
|||
|
||||
#ifdef _M_SSE
|
||||
#include <xmmintrin.h>
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
#endif
|
||||
|
||||
u32 QuickTexHashSSE2(const void *checkp, u32 size) {
|
||||
u32 check = 0;
|
||||
|
@ -272,3 +275,128 @@ void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch) {
|
|||
dst += pitch;
|
||||
}
|
||||
}
|
||||
|
||||
// Converts numPixels 32-bit pixels from BGRA8888 to RGBA8888.
//
// For each pixel the bytes held in bits 0-7 and bits 16-23 trade places,
// while the bytes in bits 8-15 and bits 24-31 are copied through unchanged.
//
// dst: receives numPixels converted pixels.
// src: supplies numPixels source pixels.
//
// Each pixel (or group of four pixels) is fully read before its result is
// written, so calling this with dst == src converts the buffer in place.
//
// When _M_SSE is defined and both pointers are 16-byte aligned, pixels are
// processed four at a time; the scalar loop handles the trailing
// numPixels % 4 pixels, or everything when the fast path is not usable.
void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels) {
	u32 i = 0;

#ifdef _M_SSE
	// Aligned loads/stores are used below, so both buffers must sit on a 16-byte boundary.
	const bool aligned = ((((uintptr_t)src) | ((uintptr_t)dst)) & 0xF) == 0;
	if (aligned) {
		// Selects the two bytes per pixel that stay where they are.
		// The cast makes the unsigned -> int conversion of the bit pattern explicit.
		const __m128i maskGA = _mm_set1_epi32((int)0xFF00FF00);

		const __m128i *srcp = (const __m128i *)src;
		__m128i *dstp = (__m128i *)dst;
		const u32 sseChunks = numPixels / 4;

		for (u32 chunk = 0; chunk < sseChunks; ++chunk) {
			const __m128i px = _mm_load_si128(&srcp[chunk]);
			const __m128i ga = _mm_and_si128(px, maskGA);
			// Everything outside the mask: the two bytes to be swapped.
			const __m128i rb = _mm_andnot_si128(maskGA, px);
			// Shifting by 16 in both directions moves each byte to the other's slot;
			// the bits shifted out of the 32-bit lane are simply discarded.
			const __m128i swapped = _mm_or_si128(_mm_slli_epi32(rb, 16), _mm_srli_epi32(rb, 16));
			_mm_store_si128(&dstp[chunk], _mm_or_si128(ga, swapped));
		}

		// The remainder starts right after those done via SSE.
		i = sseChunks * 4;
	}
#endif

	for (; i < numPixels; ++i) {
		const u32 c = src[i];
		dst[i] = ((c >> 16) & 0x000000FF) |
		         ((c >> 0) & 0xFF00FF00) |
		         ((c << 16) & 0x00FF0000);
	}
}
|
||||
|
||||
// Packs one RGBA8888 pixel into a 16-bit RGBA5551 pixel.
//
// The top five bits of the bytes at bits 0-7, 8-15 and 16-23 are placed in
// bits 0-4, 5-9 and 10-14 of the result, and the top bit of the byte at
// bits 24-31 becomes bit 15.
inline u16 RGBA8888toRGBA5551(u32 px) {
	const u32 r = (px >> 3) & 0x001F;
	const u32 g = (px >> 6) & 0x03E0;
	const u32 b = (px >> 9) & 0x7C00;
	const u32 a = (px >> 16) & 0x8000;
	// Only the low 16 bits can be set; the cast makes the narrowing intentional.
	return (u16)(r | g | b | a);
}

#if _M_SSE >= 0x401
// Converts the four RGBA8888 pixels in c to RGBA5551. Each result is left in
// the low 16 bits of its 32-bit lane with the upper 16 bits cleared, ready to
// be narrowed with _mm_packus_epi32.
static inline __m128i RGBA8888toRGBA5551x4SSE(__m128i c) {
	// Top bit of the byte at bits 24-31 plus top five bits of the byte at bits 8-15.
	const __m128i maskAG = _mm_set1_epi32((int)0x8000F800);
	// Top five bits of the bytes at bits 16-23 and bits 0-7.
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	__m128i ag = _mm_and_si128(c, maskAG);
	// >> 16 drops bit 31 onto bit 15, >> 6 drops bits 11-15 onto bits 5-9.
	ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
	__m128i rb = _mm_and_si128(c, maskRB);
	// >> 3 drops bits 3-7 onto bits 0-4, >> 9 drops bits 19-23 onto bits 10-14.
	rb = _mm_or_si128(_mm_srli_epi32(rb, 3), _mm_srli_epi32(rb, 9));
	// Each shift also leaves stray copies above bit 15; the final mask removes them.
	return _mm_and_si128(_mm_or_si128(ag, rb), mask);
}
#endif

// Converts numPixels RGBA8888 pixels from src into RGBA5551 pixels in dst.
//
// dst: receives numPixels 16-bit pixels.
// src: supplies numPixels 32-bit pixels.
//
// When built with _M_SSE >= 0x401, eight pixels are converted per iteration,
// provided both pointers are 16-byte aligned and cpu_info reports SSE 4.1.
// Any pixels left over (or all of them, if the fast path is unavailable) go
// through RGBA8888toRGBA5551 one at a time.
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels) {
	u32 i = 0;

#if _M_SSE >= 0x401
	const bool aligned = ((((uintptr_t)src) | ((uintptr_t)dst)) & 0xF) == 0;
	// SSE 4.1 required for _mm_packus_epi32.
	if (aligned && cpu_info.bSSE4_1) {
		const __m128i *srcp = (const __m128i *)src;
		__m128i *dstp = (__m128i *)dst;
		// Two source registers (8 pixels) fill one destination register,
		// so only an even number of 4-pixel chunks can be consumed here.
		const u32 sseChunks = (numPixels / 4) & ~1u;

		for (u32 chunk = 0; chunk < sseChunks; chunk += 2) {
			const __m128i c1 = RGBA8888toRGBA5551x4SSE(_mm_load_si128(&srcp[chunk + 0]));
			const __m128i c2 = RGBA8888toRGBA5551x4SSE(_mm_load_si128(&srcp[chunk + 1]));
			_mm_store_si128(&dstp[chunk / 2], _mm_packus_epi32(c1, c2));
		}

		// The remainder starts right after those done via SSE.
		i = sseChunks * 4;
	}
#endif

	for (; i < numPixels; ++i) {
		dst[i] = RGBA8888toRGBA5551(src[i]);
	}
}
|
||||
|
||||
// Packs one BGRA8888 pixel into a 16-bit RGBA5551 pixel.
//
// The top five bits of the byte at bits 16-23 go to bits 0-4 of the result,
// those of the byte at bits 8-15 go to bits 5-9, those of the byte at
// bits 0-7 go to bits 10-14, and the top bit of the byte at bits 24-31
// becomes bit 15.
inline u16 BGRA8888toRGBA5551(u32 px) {
	const u32 r = (px >> 19) & 0x001F;
	const u32 g = (px >> 6) & 0x03E0;
	const u32 b = (px << 7) & 0x7C00;
	const u32 a = (px >> 16) & 0x8000;
	// Only the low 16 bits can be set; the cast makes the narrowing intentional.
	return (u16)(r | g | b | a);
}

#if _M_SSE >= 0x401
// Converts the four BGRA8888 pixels in c to RGBA5551. Each result is left in
// the low 16 bits of its 32-bit lane with the upper 16 bits cleared, ready to
// be narrowed with _mm_packus_epi32.
static inline __m128i BGRA8888toRGBA5551x4SSE(__m128i c) {
	// Top bit of the byte at bits 24-31 plus top five bits of the byte at bits 8-15.
	const __m128i maskAG = _mm_set1_epi32((int)0x8000F800);
	// Top five bits of the bytes at bits 16-23 and bits 0-7.
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	__m128i ag = _mm_and_si128(c, maskAG);
	// >> 16 drops bit 31 onto bit 15, >> 6 drops bits 11-15 onto bits 5-9.
	ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
	__m128i rb = _mm_and_si128(c, maskRB);
	// >> 19 drops bits 19-23 onto bits 0-4, << 7 lifts bits 3-7 onto bits 10-14.
	rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
	// Each shift also leaves stray copies above bit 15; the final mask removes them.
	return _mm_and_si128(_mm_or_si128(ag, rb), mask);
}
#endif

// Converts numPixels BGRA8888 pixels from src into RGBA5551 pixels in dst.
//
// dst: receives numPixels 16-bit pixels.
// src: supplies numPixels 32-bit pixels.
//
// When built with _M_SSE >= 0x401, eight pixels are converted per iteration,
// provided both pointers are 16-byte aligned and cpu_info reports SSE 4.1.
// Any pixels left over (or all of them, if the fast path is unavailable) go
// through BGRA8888toRGBA5551 one at a time.
void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels) {
	u32 i = 0;

#if _M_SSE >= 0x401
	const bool aligned = ((((uintptr_t)src) | ((uintptr_t)dst)) & 0xF) == 0;
	// SSE 4.1 required for _mm_packus_epi32.
	if (aligned && cpu_info.bSSE4_1) {
		const __m128i *srcp = (const __m128i *)src;
		__m128i *dstp = (__m128i *)dst;
		// Two source registers (8 pixels) fill one destination register,
		// so only an even number of 4-pixel chunks can be consumed here.
		const u32 sseChunks = (numPixels / 4) & ~1u;

		for (u32 chunk = 0; chunk < sseChunks; chunk += 2) {
			const __m128i c1 = BGRA8888toRGBA5551x4SSE(_mm_load_si128(&srcp[chunk + 0]));
			const __m128i c2 = BGRA8888toRGBA5551x4SSE(_mm_load_si128(&srcp[chunk + 1]));
			_mm_store_si128(&dstp[chunk / 2], _mm_packus_epi32(c1, c2));
		}

		// The remainder starts right after those done via SSE.
		i = sseChunks * 4;
	}
#endif

	for (; i < numPixels; ++i) {
		dst[i] = BGRA8888toRGBA5551(src[i]);
	}
}
|
||||
|
|
|
@ -201,3 +201,7 @@ inline void DeIndexTexture4Optimal(ClutT *dest, const u32 texaddr, int length, C
|
|||
const u8 *indexed = (const u8 *) Memory::GetPointer(texaddr);
|
||||
DeIndexTexture4Optimal(dest, indexed, length, color);
|
||||
}
|
||||
|
||||
// Copies numPixels 32-bit pixels from src to dst, exchanging the bytes held
// in bits 0-7 and bits 16-23 of every pixel (BGRA8888 -> RGBA8888).
void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, const u32 numPixels);
|
||||
// Converts numPixels 32-bit RGBA8888 pixels from src into 16-bit RGBA5551
// pixels in dst, keeping the top five bits of each color byte and the top
// bit of the byte at bits 24-31.
void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels);
|
||||
// Converts numPixels 32-bit BGRA8888 pixels from src into 16-bit RGBA5551
// pixels in dst; compared with the RGBA8888 variant, the source bytes at
// bits 0-7 and bits 16-23 are taken in the opposite order.
void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, const u32 numPixels);
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
#include "GPU/GPUState.h"
|
||||
|
||||
#include "GPU/Common/PostShader.h"
|
||||
#include "GPU/Common/TextureDecoder.h"
|
||||
#include "GPU/GLES/Framebuffer.h"
|
||||
#include "GPU/GLES/TextureCache.h"
|
||||
#include "GPU/GLES/ShaderManager.h"
|
||||
|
@ -111,11 +112,15 @@ inline u16 RGBA8888toRGBA4444(u32 px) {
|
|||
return ((px >> 4) & 0x000F) | ((px >> 8) & 0x00F0) | ((px >> 12) & 0x0F00) | ((px >> 16) & 0xF000);
|
||||
}
|
||||
|
||||
inline u16 RGBA8888toRGBA5551(u32 px) {
|
||||
return ((px >> 3) & 0x001F) | ((px >> 6) & 0x03E0) | ((px >> 9) & 0x7C00) | ((px >> 16) & 0x8000);
|
||||
// Packs one BGRA8888 pixel into a 16-bit RGB565 pixel.
//
// The top five bits of the byte at bits 16-23 go to bits 0-4 of the result,
// the top six bits of the byte at bits 8-15 go to bits 5-10, and the top
// five bits of the byte at bits 0-7 go to bits 11-15. The byte at
// bits 24-31 is ignored.
inline u16 BGRA8888toRGB565(u32 px) {
	const u32 r = (px >> 19) & 0x001F;
	const u32 g = (px >> 5) & 0x07E0;
	const u32 b = (px << 8) & 0xF800;
	// Only the low 16 bits can be set; the cast makes the narrowing intentional.
	return (u16)(r | g | b);
}
|
||||
|
||||
void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format);
|
||||
// Packs one BGRA8888 pixel into a 16-bit RGBA4444 pixel.
//
// The top four bits of each source byte are kept: the byte at bits 16-23
// lands in bits 0-3 of the result, the byte at bits 8-15 in bits 4-7, the
// byte at bits 0-7 in bits 8-11, and the byte at bits 24-31 in bits 12-15.
inline u16 BGRA8888toRGBA4444(u32 px) {
	const u32 r = (px >> 20) & 0x000F;
	const u32 g = (px >> 8) & 0x00F0;
	const u32 b = (px << 4) & 0x0F00;
	const u32 a = (px >> 16) & 0xF000;
	// Only the low 16 bits can be set; the cast makes the narrowing intentional.
	return (u16)(r | g | b | a);
}
|
||||
|
||||
void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 stride, u32 height, GEBufferFormat format);
|
||||
|
||||
void CenterRect(float *x, float *y, float *w, float *h,
|
||||
float origW, float origH, float frameW, float frameH) {
|
||||
|
@ -1256,12 +1261,23 @@ void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *src, VirtualFrameb
|
|||
fbo_unbind();
|
||||
}
|
||||
|
||||
// Decides whether 32-bit framebuffer readback uses BGRA byte order instead of
// RGBA. On Windows builds this follows gl_extensions.EXT_bgra; every other
// platform always gets false.
static inline bool UseBGRA8888() {
	// TODO: Other platforms? May depend on vendor which is faster?
#ifdef _WIN32
	return gl_extensions.EXT_bgra;
#else
	// Using #else (rather than a trailing return) keeps Windows builds free
	// of an unreachable statement after the return above.
	return false;
#endif
}
|
||||
|
||||
// TODO: SSE/NEON
|
||||
// Could also make C fake-simd for 64-bit, two 8888 pixels fit in a register :)
|
||||
void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferFormat format) {
|
||||
void ConvertFromRGBA8888(u8 *dst, const u8 *src, u32 stride, u32 height, GEBufferFormat format) {
|
||||
if (format == GE_FORMAT_8888) {
|
||||
if (src == dst) {
|
||||
return;
|
||||
} else if (UseBGRA8888()) {
|
||||
u32 numPixels = height * stride;
|
||||
ConvertBGRA8888ToRGBA8888((u32 *)dst, (const u32 *)src, numPixels);
|
||||
} else { // Here lets assume they don't intersect
|
||||
memcpy(dst, src, stride * height * 4);
|
||||
}
|
||||
|
@ -1271,25 +1287,38 @@ void ConvertFromRGBA8888(u8 *dst, u8 *src, u32 stride, u32 height, GEBufferForma
|
|||
u16 *dst16 = (u16 *)dst;
|
||||
switch (format) {
|
||||
case GE_FORMAT_565: // BGR 565
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = RGBA8888toRGB565(src32[i]);
|
||||
if (UseBGRA8888()) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = BGRA8888toRGB565(src32[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = RGBA8888toRGB565(src32[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GE_FORMAT_5551: // ABGR 1555
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = RGBA8888toRGBA5551(src32[i]);
|
||||
if (UseBGRA8888()) {
|
||||
ConvertBGRA8888ToRGBA5551(dst16, src32, size);
|
||||
} else {
|
||||
ConvertRGBA8888ToRGBA5551(dst16, src32, size);
|
||||
}
|
||||
break;
|
||||
case GE_FORMAT_4444: // ABGR 4444
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = RGBA8888toRGBA4444(src32[i]);
|
||||
if (UseBGRA8888()) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = BGRA8888toRGBA4444(src32[i]);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < size; i++) {
|
||||
dst16[i] = RGBA8888toRGBA4444(src32[i]);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case GE_FORMAT_8888:
|
||||
case GE_FORMAT_INVALID:
|
||||
// Not possible.
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1317,24 +1346,24 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) {
|
|||
}
|
||||
|
||||
// Receive previously requested data from a PBO
|
||||
if (pixelBufObj_[nextPBO].reading) {
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, pixelBufObj_[nextPBO].handle);
|
||||
AsyncPBO &pbo = pixelBufObj_[nextPBO];
|
||||
if (pbo.reading) {
|
||||
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo.handle);
|
||||
packed = (GLubyte *)glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY);
|
||||
|
||||
if (packed) {
|
||||
DEBUG_LOG(SCEGE, "Reading PBO to memory , bufSize = %u, packed = %p, fb_address = %08x, stride = %u, pbo = %u",
|
||||
pixelBufObj_[nextPBO].size, packed, pixelBufObj_[nextPBO].fb_address, pixelBufObj_[nextPBO].stride, nextPBO);
|
||||
pbo.size, packed, pbo.fb_address, pbo.stride, nextPBO);
|
||||
|
||||
if (useCPU) {
|
||||
ConvertFromRGBA8888(Memory::GetPointer(pixelBufObj_[nextPBO].fb_address), packed,
|
||||
pixelBufObj_[nextPBO].stride, pixelBufObj_[nextPBO].height,
|
||||
pixelBufObj_[nextPBO].format);
|
||||
if (useCPU || (UseBGRA8888() && pbo.format == GE_FORMAT_8888)) {
|
||||
u8 *dst = Memory::GetPointer(pbo.fb_address);
|
||||
ConvertFromRGBA8888(dst, packed, pbo.stride, pbo.height, pbo.format);
|
||||
} else {
|
||||
// We don't need to convert, GPU already did (or should have)
|
||||
Memory::Memcpy(pixelBufObj_[nextPBO].fb_address, packed, pixelBufObj_[nextPBO].size);
|
||||
Memory::Memcpy(pbo.fb_address, packed, pbo.size);
|
||||
}
|
||||
|
||||
pixelBufObj_[nextPBO].reading = false;
|
||||
pbo.reading = false;
|
||||
}
|
||||
|
||||
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
|
||||
|
@ -1371,13 +1400,14 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) {
|
|||
case GE_FORMAT_8888: // 32 bit RGBA
|
||||
default:
|
||||
pixelType = GL_UNSIGNED_BYTE;
|
||||
pixelFormat = GL_RGBA;
|
||||
pixelFormat = UseBGRA8888() ? GL_BGRA_EXT : GL_RGBA;
|
||||
pixelSize = 4;
|
||||
align = 4;
|
||||
break;
|
||||
}
|
||||
|
||||
u32 bufSize = vfb->fb_stride * vfb->height * pixelSize;
|
||||
// If using the CPU, we need 4 bytes per pixel always.
|
||||
u32 bufSize = vfb->fb_stride * vfb->height * (useCPU ? 4 : pixelSize);
|
||||
u32 fb_address = (0x04000000) | vfb->fb_address;
|
||||
|
||||
if (vfb->fbo) {
|
||||
|
@ -1404,19 +1434,14 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) {
|
|||
|
||||
if (pixelBufObj_[currentPBO_].maxSize < bufSize) {
|
||||
// We reserve a buffer big enough to fit all those pixels
|
||||
if (useCPU && pixelType != GL_UNSIGNED_BYTE) {
|
||||
// End result may be 16-bit but we are reading 32-bit, so we need double the space on the buffer
|
||||
glBufferData(GL_PIXEL_PACK_BUFFER, bufSize*2, NULL, GL_DYNAMIC_READ);
|
||||
} else {
|
||||
glBufferData(GL_PIXEL_PACK_BUFFER, bufSize, NULL, GL_DYNAMIC_READ);
|
||||
}
|
||||
glBufferData(GL_PIXEL_PACK_BUFFER, bufSize, NULL, GL_DYNAMIC_READ);
|
||||
pixelBufObj_[currentPBO_].maxSize = bufSize;
|
||||
}
|
||||
|
||||
if (useCPU) {
|
||||
// If converting pixel formats on the CPU we'll always request RGBA8888
|
||||
glPixelStorei(GL_PACK_ALIGNMENT, 4);
|
||||
glReadPixels(0, 0, vfb->fb_stride, vfb->height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
|
||||
glReadPixels(0, 0, vfb->fb_stride, vfb->height, UseBGRA8888() ? GL_BGRA_EXT : GL_RGBA, GL_UNSIGNED_BYTE, 0);
|
||||
} else {
|
||||
// Otherwise we'll directly request the format we need and let the GPU sort it out
|
||||
glPixelStorei(GL_PACK_ALIGNMENT, align);
|
||||
|
|
|
@ -663,8 +663,12 @@ void GPUCommon::ProcessDLQueueInternal() {
|
|||
return;
|
||||
} else {
|
||||
easy_guard guard(listLock);
|
||||
// At the end, we can remove it from the queue and continue.
|
||||
dlQueue.erase(std::remove(dlQueue.begin(), dlQueue.end(), listIndex), dlQueue.end());
|
||||
|
||||
// Some other list could've taken the spot while we dilly-dallied around.
|
||||
if (l.state != PSP_GE_DL_STATE_QUEUED) {
|
||||
// At the end, we can remove it from the queue and continue.
|
||||
dlQueue.erase(std::remove(dlQueue.begin(), dlQueue.end(), listIndex), dlQueue.end());
|
||||
}
|
||||
UpdateTickEstimate(std::max(busyTicks, startingTicks + cyclesExecuted));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue