diff --git a/GPU/Common/FramebufferCommon.h b/GPU/Common/FramebufferCommon.h index 6e34906b2c..d2b854a08f 100644 --- a/GPU/Common/FramebufferCommon.h +++ b/GPU/Common/FramebufferCommon.h @@ -168,6 +168,7 @@ public: void NotifyBlockTransferAfter(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int w, int h, int bpp, u32 skipDrawReason); virtual void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) = 0; + virtual void DownloadFramebufferForClut(void *clut, u32 fb_address, u32 loadBytes) = 0; /* Backend hook for CLUT loads that read from a rendered framebuffer: the DX9 and GLES overrides in this patch copy loadBytes bytes from fb_address | 0x04000000 into clut. */ virtual void MakePixelTexture(const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height) = 0; virtual void DrawPixels(VirtualFramebuffer *vfb, int dstX, int dstY, const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height) = 0; virtual void DrawFramebufferToOutput(const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, bool applyPostShader) = 0; diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 215730f6c6..e8837d55b9 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -227,32 +227,35 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) { // It's possible for a game to (successfully) access outside valid memory.
u32 bytes = Memory::ValidSize(clutAddr, loadBytes); if (clutRenderAddress_ != 0xFFFFFFFF && !g_Config.bDisableSlowFramebufEffects) { - gpu->PerformMemoryDownload(clutAddr, bytes); - } - -#ifdef _M_SSE - int numBlocks = bytes / 16; - if (bytes == loadBytes) { - const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); - __m128i *dest = (__m128i *)clutBufRaw_; - for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { - __m128i data1 = _mm_loadu_si128(source); - __m128i data2 = _mm_loadu_si128(source + 1); - _mm_store_si128(dest, data1); - _mm_store_si128(dest + 1, data2); + DownloadFramebufferForClut(clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); } } else { +#ifdef _M_SSE + int numBlocks = bytes / 32; /* Each loop iteration copies two 16-byte vectors (32 bytes), so count 32-byte blocks; bytes / 16 made the loop copy 2 * bytes, past the size validated by ValidSize. NOTE(review): assumes loadBytes is a multiple of 32, confirm against the caller. */ + if (bytes == loadBytes) { + const __m128i *source = (const __m128i *)Memory::GetPointerUnchecked(clutAddr); + __m128i *dest = (__m128i *)clutBufRaw_; + for (int i = 0; i < numBlocks; i++, source += 2, dest += 2) { + __m128i data1 = _mm_loadu_si128(source); + __m128i data2 = _mm_loadu_si128(source + 1); + _mm_store_si128(dest, data1); + _mm_store_si128(dest + 1, data2); + } + } else { + Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); + if (bytes < loadBytes) { + memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); + } + } +#else Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); if (bytes < loadBytes) { memset((u8 *)clutBufRaw_ + bytes, 0x00, loadBytes - bytes); } - } -#else - Memory::MemcpyUnchecked(clutBufRaw_, clutAddr, bytes); - if (bytes < clutTotalBytes_) { - memset((u8 *)clutBufRaw_ + bytes, 0x00, clutTotalBytes_ - bytes); - } #endif + } } else { memset(clutBufRaw_, 0x00, loadBytes); } diff --git a/GPU/Common/TextureCacheCommon.h b/GPU/Common/TextureCacheCommon.h index 8376093561..d369b49d96 100644 --- a/GPU/Common/TextureCacheCommon.h +++ b/GPU/Common/TextureCacheCommon.h @@ -139,6 +139,8 @@ protected: virtual bool AttachFramebuffer(TexCacheEntry *entry,
u32 address, VirtualFramebuffer *framebuffer, u32 texaddrOffset = 0) = 0; virtual void DetachFramebuffer(TexCacheEntry *entry, u32 address, VirtualFramebuffer *framebuffer) = 0; + virtual void DownloadFramebufferForClut(u32 clutAddr, u32 bytes) = 0; + TexCache cache; std::vector fbCache_; diff --git a/GPU/Directx9/FramebufferDX9.cpp b/GPU/Directx9/FramebufferDX9.cpp index b3741bbf35..4b7c545657 100644 --- a/GPU/Directx9/FramebufferDX9.cpp +++ b/GPU/Directx9/FramebufferDX9.cpp @@ -859,6 +859,43 @@ namespace DX9 { } } + // Blits and packs the framebuffer rows at fb_address that cover loadBytes (when such a framebuffer exists), then copies loadBytes bytes from fb_address | 0x04000000 into clut if that address is valid. + void FramebufferManagerDX9::DownloadFramebufferForClut(void *clut, u32 fb_address, u32 loadBytes) { + VirtualFramebuffer *vfb = GetVFBAt(fb_address); + if (vfb && vfb->fb_stride != 0) { + const u32 bpp = vfb->drawnFormat == GE_FORMAT_8888 ? 4 : 2; + int x = 0; + int y = 0; + int pixels = loadBytes / bpp; + // The height will be 1 for each stride or part thereof. + int w = std::min(pixels % vfb->fb_stride, (int)vfb->width); + int h = std::min((pixels + vfb->fb_stride - 1) / vfb->fb_stride, (int)vfb->height); + if (w == 0 || h > 1) { + // Exactly stride-aligned, or more than one row: pixels % stride is only the width of the last partial row, so take full rows. + w = std::min((int)vfb->fb_stride, (int)vfb->width); + } + + // We intentionally don't call OptimizeDownloadRange() here - we don't want to over download. + // CLUT framebuffers are often incorrectly estimated in size. + if (x == 0 && y == 0 && w == vfb->width && h == vfb->height) { + vfb->memoryUpdated = true; + } + + // We'll pseudo-blit framebuffers here to get a resized version of vfb.
+ VirtualFramebuffer *nvfb = FindDownloadTempBuffer(vfb); + BlitFramebuffer(nvfb, x, y, vfb, x, y, w, h, 0); + + PackFramebufferDirectx9_(nvfb, x, y, w, h); + + textureCache_->ForgetLastTexture(); + RebindFramebuffer(); + } + + if (Memory::IsValidAddress(fb_address | 0x04000000)) { /* When this address is not valid, clut is left unmodified by this function. */ + Memory::MemcpyUnchecked(clut, fb_address | 0x04000000, loadBytes); + } + } + bool FramebufferManagerDX9::CreateDownloadTempBuffer(VirtualFramebuffer *nvfb) { nvfb->colorDepth = FBO_8888; diff --git a/GPU/Directx9/FramebufferDX9.h b/GPU/Directx9/FramebufferDX9.h index 26612bd15e..89b88c73e9 100644 --- a/GPU/Directx9/FramebufferDX9.h +++ b/GPU/Directx9/FramebufferDX9.h @@ -73,7 +73,8 @@ public: void BindFramebufferColor(int stage, VirtualFramebuffer *framebuffer, int flags); - virtual void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) override; + void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) override; + void DownloadFramebufferForClut(void *clut, u32 fb_address, u32 loadBytes) override; std::vector GetFramebufferList(); diff --git a/GPU/Directx9/TextureCacheDX9.cpp b/GPU/Directx9/TextureCacheDX9.cpp index a5f5933ddb..0dff2b6638 100644 --- a/GPU/Directx9/TextureCacheDX9.cpp +++ b/GPU/Directx9/TextureCacheDX9.cpp @@ -804,6 +804,10 @@ void TextureCacheDX9::ApplyTexture() { nextTexture_ = nullptr; } +void TextureCacheDX9::DownloadFramebufferForClut(u32 clutAddr, u32 bytes) { /* Forwards the request to the framebuffer manager, passing clutBufRaw_ as the destination buffer. */ + framebufferManager_->DownloadFramebufferForClut(clutBufRaw_, clutAddr, bytes); +} + class TextureShaderApplierDX9 { public: struct Pos { diff --git a/GPU/Directx9/TextureCacheDX9.h b/GPU/Directx9/TextureCacheDX9.h index c3ce5d7bc0..8a3aefca13 100644 --- a/GPU/Directx9/TextureCacheDX9.h +++ b/GPU/Directx9/TextureCacheDX9.h @@ -72,6 +72,9 @@ public: void ApplyTexture(); +protected: + void DownloadFramebufferForClut(u32 clutAddr, u32 bytes) override; + private: void Decimate(); // Run this once per frame to get rid of old textures.
void DeleteTexture(TexCache::iterator it); diff --git a/GPU/GLES/Framebuffer.cpp b/GPU/GLES/Framebuffer.cpp index 93e03802bd..aa424b482b 100644 --- a/GPU/GLES/Framebuffer.cpp +++ b/GPU/GLES/Framebuffer.cpp @@ -1241,6 +1241,47 @@ void FramebufferManager::ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool s } } +// Blits and packs the framebuffer rows at fb_address that cover loadBytes (when such a framebuffer exists), then copies loadBytes bytes from fb_address | 0x04000000 into clut if that address is valid. +void FramebufferManager::DownloadFramebufferForClut(void *clut, u32 fb_address, u32 loadBytes) { + PROFILE_THIS_SCOPE("gpu-readback"); + // Flush async just in case. + PackFramebufferAsync_(nullptr); + + VirtualFramebuffer *vfb = GetVFBAt(fb_address); + if (vfb && vfb->fb_stride != 0) { + const u32 bpp = vfb->drawnFormat == GE_FORMAT_8888 ? 4 : 2; + int x = 0; + int y = 0; + int pixels = loadBytes / bpp; + // The height will be 1 for each stride or part thereof. + int w = std::min(pixels % vfb->fb_stride, (int)vfb->width); + int h = std::min((pixels + vfb->fb_stride - 1) / vfb->fb_stride, (int)vfb->height); + if (w == 0 || h > 1) { + // Exactly stride-aligned, or more than one row: pixels % stride is only the width of the last partial row, so take full rows. + w = std::min((int)vfb->fb_stride, (int)vfb->width); + } + + // We intentionally don't call OptimizeDownloadRange() here - we don't want to over download. + // CLUT framebuffers are often incorrectly estimated in size. + if (x == 0 && y == 0 && w == vfb->width && h == vfb->height) { + vfb->memoryUpdated = true; + } + + // We'll pseudo-blit framebuffers here to get a resized version of vfb. + VirtualFramebuffer *nvfb = FindDownloadTempBuffer(vfb); + BlitFramebuffer(nvfb, x, y, vfb, x, y, w, h, 0); + + PackFramebufferSync_(nvfb, x, y, w, h); + + textureCache_->ForgetLastTexture(); + RebindFramebuffer(); + } + + if (Memory::IsValidAddress(fb_address | 0x04000000)) { + Memory::MemcpyUnchecked(clut, fb_address | 0x04000000, loadBytes); + } +} + bool FramebufferManager::CreateDownloadTempBuffer(VirtualFramebuffer *nvfb) { // When updating VRAM, it need to be exact format.
if (!gstate_c.Supports(GPU_PREFER_CPU_DOWNLOAD)) { diff --git a/GPU/GLES/Framebuffer.h b/GPU/GLES/Framebuffer.h index bd494d11be..aa9eb5de15 100644 --- a/GPU/GLES/Framebuffer.h +++ b/GPU/GLES/Framebuffer.h @@ -101,7 +101,8 @@ public: void BindFramebufferColor(int stage, u32 fbRawAddress, VirtualFramebuffer *framebuffer, int flags); // Reads a rectangular subregion of a framebuffer to the right position in its backing memory. - virtual void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) override; + void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h) override; + void DownloadFramebufferForClut(void *clut, u32 fb_address, u32 loadBytes) override; std::vector GetFramebufferList(); diff --git a/GPU/GLES/TextureCache.cpp b/GPU/GLES/TextureCache.cpp index 4d7e77ab52..bbe70a5d25 100644 --- a/GPU/GLES/TextureCache.cpp +++ b/GPU/GLES/TextureCache.cpp @@ -879,6 +879,10 @@ void TextureCache::ApplyTexture() { nextTexture_ = nullptr; } +void TextureCache::DownloadFramebufferForClut(u32 clutAddr, u32 bytes) { /* Forwards the request to the framebuffer manager, passing clutBufRaw_ as the destination buffer. */ + framebufferManager_->DownloadFramebufferForClut(clutBufRaw_, clutAddr, bytes); +} + class TextureShaderApplier { public: struct Pos { diff --git a/GPU/GLES/TextureCache.h b/GPU/GLES/TextureCache.h index 903edb184a..5d04585afe 100644 --- a/GPU/GLES/TextureCache.h +++ b/GPU/GLES/TextureCache.h @@ -87,6 +87,9 @@ public: void ApplyTexture(); +protected: + void DownloadFramebufferForClut(u32 clutAddr, u32 bytes) override; + private: void Decimate(); // Run this once per frame to get rid of old textures. void DeleteTexture(TexCache::iterator it);