diff --git a/Core/HLE/ReplaceTables.cpp b/Core/HLE/ReplaceTables.cpp index efccd43dee..e4f6d5bb96 100644 --- a/Core/HLE/ReplaceTables.cpp +++ b/Core/HLE/ReplaceTables.cpp @@ -104,7 +104,11 @@ static int Replace_memcpy() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2); - if (bytes != 0) { + bool skip = false; + if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) { + skip = gpu->UpdateMemory(destPtr, srcPtr, bytes); + } + if (!skip && bytes != 0) { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 *src = Memory::GetPointerUnchecked(srcPtr); memmove(dst, src, bytes); @@ -114,9 +118,6 @@ static int Replace_memcpy() { CBreakPoints::ExecMemCheck(srcPtr, false, bytes, currentMIPS->pc); CBreakPoints::ExecMemCheck(destPtr, true, bytes, currentMIPS->pc); #endif - if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) { - gpu->UpdateMemory(destPtr, srcPtr, bytes); - } return 10 + bytes / 4; // approximation } @@ -124,7 +125,11 @@ static int Replace_memcpy16() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2) * 16; - if (bytes != 0) { + bool skip = false; + if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) { + skip = gpu->UpdateMemory(destPtr, srcPtr, bytes); + } + if (!skip && bytes != 0) { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 *src = Memory::GetPointerUnchecked(srcPtr); memmove(dst, src, bytes); @@ -134,9 +139,6 @@ static int Replace_memcpy16() { CBreakPoints::ExecMemCheck(srcPtr, false, bytes, currentMIPS->pc); CBreakPoints::ExecMemCheck(destPtr, true, bytes, currentMIPS->pc); #endif - if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) { - gpu->UpdateMemory(destPtr, srcPtr, bytes); - } return 10 + bytes / 4; // approximation } @@ -144,7 +146,11 @@ static int Replace_memmove() { u32 destPtr = PARAM(0); u32 srcPtr = PARAM(1); u32 bytes = PARAM(2); - if (bytes != 0) { + bool skip = false; + if (Memory::IsVRAMAddress(destPtr) || 
Memory::IsVRAMAddress(srcPtr)) { + skip = gpu->UpdateMemory(destPtr, srcPtr, bytes); + } + if (!skip && bytes != 0) { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 *src = Memory::GetPointerUnchecked(srcPtr); memmove(dst, src, bytes); @@ -154,9 +160,6 @@ static int Replace_memmove() { CBreakPoints::ExecMemCheck(srcPtr, false, bytes, currentMIPS->pc); CBreakPoints::ExecMemCheck(destPtr, true, bytes, currentMIPS->pc); #endif - if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(srcPtr)) { - gpu->UpdateMemory(destPtr, srcPtr, bytes); - } return 10 + bytes / 4; // approximation } @@ -165,14 +168,17 @@ static int Replace_memset() { u8 *dst = Memory::GetPointerUnchecked(destPtr); u8 value = PARAM(1); u32 bytes = PARAM(2); - memset(dst, value, bytes); + // Always perform the fill on the CPU: gpu->UpdateMemory() only copies src to dest and + // never writes `value`, so its return value must not be used to skip the memset. + memset(dst, value, bytes); + if (Memory::IsVRAMAddress(destPtr)) { + // Let the GPU react to the changed VRAM range; the returned "copied" flag is ignored. + gpu->UpdateMemory(destPtr, destPtr, bytes); + } RETURN(destPtr); #ifndef MOBILE_DEVICE CBreakPoints::ExecMemCheck(destPtr, true, bytes, currentMIPS->pc); #endif - if (Memory::IsVRAMAddress(destPtr) || Memory::IsVRAMAddress(destPtr)) { - gpu->UpdateMemory(destPtr, destPtr, bytes); - } return 10 + bytes / 4; // approximation } diff --git a/Core/HLE/sceDmac.cpp b/Core/HLE/sceDmac.cpp index 7c6d1dafd8..4fd89dffbb 100644 --- a/Core/HLE/sceDmac.cpp +++ b/Core/HLE/sceDmac.cpp @@ -43,16 +43,17 @@ void __DmacDoState(PointerWrap &p) { } int __DmacMemcpy(u32 dst, u32 src, u32 size) { - Memory::Memcpy(dst, Memory::GetPointer(src), size); #ifndef MOBILE_DEVICE CBreakPoints::ExecMemCheck(src, false, size, currentMIPS->pc); CBreakPoints::ExecMemCheck(dst, true, size, currentMIPS->pc); #endif - src &= ~0x40000000; - dst &= ~0x40000000; + bool skip = false; if (Memory::IsVRAMAddress(src) || Memory::IsVRAMAddress(dst)) { - gpu->UpdateMemory(dst, src, size); + skip = gpu->UpdateMemory(dst, src, size); + } + if (!skip) { + Memory::Memcpy(dst, Memory::GetPointer(src), size); } // This 
number seems strangely reproducible. diff --git a/Core/HLE/sceKernelInterrupt.cpp b/Core/HLE/sceKernelInterrupt.cpp index 0a23d33097..36e57208e3 100644 --- a/Core/HLE/sceKernelInterrupt.cpp +++ b/Core/HLE/sceKernelInterrupt.cpp @@ -559,11 +559,14 @@ u32 sceKernelMemset(u32 addr, u32 fillc, u32 n) u32 sceKernelMemcpy(u32 dst, u32 src, u32 size) { DEBUG_LOG(SCEKERNEL, "sceKernelMemcpy(dest=%08x, src=%08x, size=%i)", dst, src, size); - // Hm, sceDmacMemcpy seems to be the popular one for this. Ignoring for now. - // gpu->UpdateMemory(dst, src, size); + + bool skip = false; + if (Memory::IsVRAMAddress(src) || Memory::IsVRAMAddress(dst)) { + skip = gpu->UpdateMemory(dst, src, size); + } // Technically should crash if these are invalid and size > 0... - if (Memory::IsValidAddress(dst) && Memory::IsValidAddress(src) && Memory::IsValidAddress(dst + size - 1) && Memory::IsValidAddress(src + size - 1)) + if (!skip && Memory::IsValidAddress(dst) && Memory::IsValidAddress(src) && Memory::IsValidAddress(dst + size - 1) && Memory::IsValidAddress(src + size - 1)) { u8 *dstp = Memory::GetPointer(dst); u8 *srcp = Memory::GetPointer(src); diff --git a/GPU/Directx9/GPU_DX9.cpp b/GPU/Directx9/GPU_DX9.cpp index 1e26d074ce..1749b11def 100644 --- a/GPU/Directx9/GPU_DX9.cpp +++ b/GPU/Directx9/GPU_DX9.cpp @@ -1315,8 +1315,9 @@ void DIRECTX9_GPU::InvalidateCacheInternal(u32 addr, int size, GPUInvalidationTy framebufferManager_.UpdateFromMemory(addr, size); } -void DIRECTX9_GPU::UpdateMemory(u32 dest, u32 src, int size) { +bool DIRECTX9_GPU::UpdateMemory(u32 dest, u32 src, int size) { InvalidateCache(dest, size, GPU_INVALIDATE_HINT); + return false; } void DIRECTX9_GPU::ClearCacheNextFrame() { diff --git a/GPU/Directx9/GPU_DX9.h b/GPU/Directx9/GPU_DX9.h index e0de3b4318..9f6cb86253 100644 --- a/GPU/Directx9/GPU_DX9.h +++ b/GPU/Directx9/GPU_DX9.h @@ -46,7 +46,7 @@ public: virtual void BeginFrame(); virtual void UpdateStats(); virtual void InvalidateCache(u32 addr, int size, 
GPUInvalidationType type); - virtual void UpdateMemory(u32 dest, u32 src, int size); + virtual bool UpdateMemory(u32 dest, u32 src, int size); virtual void ClearCacheNextFrame(); virtual void DeviceLost(); // Only happens on Android. Drop all textures and shaders. diff --git a/GPU/GLES/Framebuffer.cpp b/GPU/GLES/Framebuffer.cpp index 51b6aa2f86..dd31e3fd84 100644 --- a/GPU/GLES/Framebuffer.cpp +++ b/GPU/GLES/Framebuffer.cpp @@ -318,7 +318,7 @@ FramebufferManager::FramebufferManager() : currentRenderVfb_(0), drawPixelsTex_(0), drawPixelsTexFormat_(GE_FORMAT_INVALID), - convBuf(0), + convBuf_(0), draw2dprogram_(0), postShaderProgram_(0), plainColorLoc_(-1), @@ -327,7 +327,9 @@ FramebufferManager::FramebufferManager() : shaderManager_(0), usePostShader_(false), postShaderAtOutputResolution_(false), - resized_(false) + resized_(false), + gameUsesSequentialCopies_(false), + framebufRangeEnd_(0) #ifndef USING_GLES2 , pixelBufObj_(0), @@ -361,7 +363,7 @@ FramebufferManager::~FramebufferManager() { #ifndef USING_GLES2 delete [] pixelBufObj_; #endif - delete [] convBuf; + delete [] convBuf_; } void FramebufferManager::MakePixelTexture(const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height) { @@ -394,15 +396,18 @@ void FramebufferManager::MakePixelTexture(const u8 *srcPixels, GEBufferFormat sr bool useConvBuf = false; if (srcPixelFormat != GE_FORMAT_8888 || srcStride != width) { useConvBuf = true; - if (!convBuf) { - convBuf = new u8[width * height * 4]; + u32 neededSize = width * height * 4; + if (!convBuf_ || convBufSize_ < neededSize) { + delete [] convBuf_; + convBuf_ = new u8[neededSize]; + convBufSize_ = neededSize; } for (int y = 0; y < height; y++) { switch (srcPixelFormat) { case GE_FORMAT_565: { const u16 *src = (const u16 *)srcPixels + srcStride * y; - u8 *dst = convBuf + 4 * width * y; + u8 *dst = convBuf_ + 4 * width * y; for (int x = 0; x < width; x++) { u16 col = src[x]; @@ -417,7 +422,7 @@ void 
FramebufferManager::MakePixelTexture(const u8 *srcPixels, GEBufferFormat sr case GE_FORMAT_5551: { const u16 *src = (const u16 *)srcPixels + srcStride * y; - u8 *dst = convBuf + 4 * width * y; + u8 *dst = convBuf_ + 4 * width * y; for (int x = 0; x < width; x++) { u16 col = src[x]; @@ -432,7 +437,7 @@ void FramebufferManager::MakePixelTexture(const u8 *srcPixels, GEBufferFormat sr case GE_FORMAT_4444: { const u16 *src = (const u16 *)srcPixels + srcStride * y; - u8 *dst = convBuf + 4 * width * y; + u8 *dst = convBuf_ + 4 * width * y; for (int x = 0; x < width; x++) { u16 col = src[x]; @@ -447,7 +452,7 @@ void FramebufferManager::MakePixelTexture(const u8 *srcPixels, GEBufferFormat sr case GE_FORMAT_8888: { const u8 *src = srcPixels + srcStride * 4 * y; - u8 *dst = convBuf + 4 * width * y; + u8 *dst = convBuf_ + 4 * width * y; memcpy(dst, src, 4 * width); } break; @@ -458,7 +463,7 @@ void FramebufferManager::MakePixelTexture(const u8 *srcPixels, GEBufferFormat sr } } } - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, useConvBuf ? convBuf : srcPixels); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, useConvBuf ? convBuf_ : srcPixels); } void FramebufferManager::DrawPixels(VirtualFramebuffer *vfb, int dstX, int dstY, const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height) { @@ -833,6 +838,12 @@ void FramebufferManager::DoSetRenderFrameBuffer() { glEnable(GL_DITHER); // why? currentRenderVfb_ = vfb; + u32 byteSize = FramebufferByteSize(vfb); + u32 fb_address_mem = (fb_address & 0x3FFFFFFF) | 0x04000000; + if (fb_address_mem + byteSize > framebufRangeEnd_) { + framebufRangeEnd_ = fb_address_mem + byteSize; + } + INFO_LOG(SCEGE, "Creating FBO for %08x : %i x %i x %i", vfb->fb_address, vfb->width, vfb->height, vfb->format); // Let's check for depth buffer overlap. Might be interesting. 
@@ -1230,20 +1241,42 @@ void FramebufferManager::ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool s #endif } - vfb->memoryUpdated = true; - BlitFramebuffer_(nvfb, x, y, vfb, x, y, w, h, 0); + if (gameUsesSequentialCopies_) { + // Ignore the x/y/etc., read the entire thing. + x = 0; + y = 0; + w = vfb->width; + h = vfb->height; + } + if (x == 0 && y == 0 && w == vfb->width && h == vfb->height) { + vfb->memoryUpdated = true; + } else { + const static int FREQUENT_SEQUENTIAL_COPIES = 3; + static int frameLastCopy = 0; + static u32 bufferLastCopy = 0; + static int copiesThisFrame = 0; + if (frameLastCopy != gpuStats.numFlips || bufferLastCopy != vfb->fb_address) { + frameLastCopy = gpuStats.numFlips; + bufferLastCopy = vfb->fb_address; + copiesThisFrame = 0; + } + if (++copiesThisFrame > FREQUENT_SEQUENTIAL_COPIES) { + gameUsesSequentialCopies_ = true; + } + } + BlitFramebuffer_(nvfb, x, y, vfb, x, y, w, h, 0, true); // PackFramebufferSync_() - Synchronous pixel data transfer using glReadPixels // PackFramebufferAsync_() - Asynchronous pixel data transfer using glReadPixels with PBOs #ifdef USING_GLES2 - PackFramebufferSync_(nvfb); + PackFramebufferSync_(nvfb, x, y, w, h); #else if (gl_extensions.PBO_ARB && gl_extensions.OES_texture_npot) { if (!sync) { PackFramebufferAsync_(nvfb); } else { - PackFramebufferSync_(nvfb); + PackFramebufferSync_(nvfb, x, y, w, h); } } #endif @@ -1254,7 +1287,7 @@ void FramebufferManager::ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool s } // TODO: If dimensions are the same, we can use glCopyImageSubData. 
-void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp) { +void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp, bool flip) { if (!dst->fbo) { ERROR_LOG_REPORT_ONCE(dstfbozero, SCEGE, "BlitFramebuffer_: dst->fbo == 0"); fbo_unbind(); @@ -1300,6 +1333,11 @@ void FramebufferManager::BlitFramebuffer_(VirtualFramebuffer *dst, int dstX, int int dstY2 = dst->renderHeight - (h + dstY) * dstYFactor; int dstY1 = dstY2 + h * dstYFactor; + if (flip) { + dstY1 = dst->renderHeight - dstY1; + dstY2 = dst->renderHeight - dstY2; + } + #ifdef MAY_HAVE_GLES3 fbo_bind_for_read(src->fbo); if (!useNV) { @@ -1577,7 +1615,7 @@ void FramebufferManager::PackFramebufferAsync_(VirtualFramebuffer *vfb) { #endif -void FramebufferManager::PackFramebufferSync_(VirtualFramebuffer *vfb) { +void FramebufferManager::PackFramebufferSync_(VirtualFramebuffer *vfb, int x, int y, int w, int h) { if (vfb->fbo) { fbo_bind_for_read(vfb->fbo); } else { @@ -1600,7 +1638,13 @@ void FramebufferManager::PackFramebufferSync_(VirtualFramebuffer *vfb) { if (!convert) { packed = (GLubyte *)Memory::GetPointer(fb_address); } else { // End result may be 16-bit but we are reading 32-bit, so there may not be enough space at fb_address - packed = (GLubyte *)malloc(bufSize * sizeof(GLubyte)); + u32 neededSize = (u32)bufSize * sizeof(GLubyte); + if (!convBuf_ || convBufSize_ < neededSize) { + delete [] convBuf_; + convBuf_ = new u8[neededSize]; + convBufSize_ = neededSize; + } + packed = convBuf_; } if (packed) { @@ -1614,12 +1658,12 @@ void FramebufferManager::PackFramebufferSync_(VirtualFramebuffer *vfb) { glfmt = GL_BGRA_EXT; } #endif - glReadPixels(0, 0, vfb->fb_stride, vfb->height, glfmt, GL_UNSIGNED_BYTE, packed); + int byteOffset = y * vfb->fb_stride * 4; + glReadPixels(0, y, vfb->fb_stride, h, glfmt, GL_UNSIGNED_BYTE, 
packed + byteOffset); // LogReadPixelsError(glGetError()); if (convert) { - ConvertFromRGBA8888(Memory::GetPointer(fb_address), packed, vfb->fb_stride, vfb->height, vfb->format); - free(packed); + ConvertFromRGBA8888(Memory::GetPointer(fb_address + byteOffset), packed + byteOffset, vfb->fb_stride, h, vfb->format); } } @@ -1790,14 +1834,20 @@ void FramebufferManager::UpdateFromMemory(u32 addr, int size, bool safe) { } } -void FramebufferManager::NotifyFramebufferCopy(u32 src, u32 dst, int size) { +bool FramebufferManager::NotifyFramebufferCopy(u32 src, u32 dst, int size) { + if (!(g_Config.iRenderingMode == FB_BUFFERED_MODE)) { + return false; + } + // MotoGP workaround - for (size_t i = 0; i < vfbs_.size(); i++) { - int bpp = vfbs_[i]->format == GE_FORMAT_8888 ? 4 : 2; - int fsize = vfbs_[i]->fb_stride * vfbs_[i]->height * (vfbs_[i]->format == GE_FORMAT_8888 ? 4 : 2); - if ((vfbs_[i]->fb_address | 0x04000000) == src && size == fsize) { - // A framebuffer matched! - knownFramebufferRAMCopies_.insert(std::pair(src, dst)); + if (Memory::IsVRAMAddress(src) && Memory::IsRAMAddress(dst)) { + for (size_t i = 0; i < vfbs_.size(); i++) { + int bpp = vfbs_[i]->format == GE_FORMAT_8888 ? 4 : 2; + int fsize = FramebufferByteSize(vfbs_[i]); + if (MaskedEqual(vfbs_[i]->fb_address, src) && size == fsize) { + // A framebuffer matched! + knownFramebufferRAMCopies_.insert(std::pair(src, dst)); + } } } @@ -1819,37 +1869,126 @@ void FramebufferManager::NotifyFramebufferCopy(u32 src, u32 dst, int size) { if (srcBuffer == dstBuffer) { WARN_LOG_REPORT_ONCE(dstsrccpy, G3D, "Intra-buffer memcpy (not supported) %08x -> %08x", src, dst); } else { - WARN_LOG_ONCE(dstnotsrccpy, G3D, "Inter-buffer memcpy %08x -> %08x", src, dst); + WARN_LOG_REPORT_ONCE(dstnotsrccpy, G3D, "Inter-buffer memcpy (not supported) %08x -> %08x", src, dst); // Just do the blit! - // TODO: Possibly take bpp into account somehow if games are doing really crazy things? 
// if (g_Config.bBlockTransferGPU) { // BlitFramebuffer_(dstBuffer, 0, 0, srcBuffer, 0, 0, srcBuffer->width, srcBuffer->height, 0); // } } + Memory::Memcpy(dst, Memory::GetPointer(src), size); + return true; } else if (dstBuffer) { - WARN_LOG_REPORT_ONCE(btucpy, G3D, "Memcpy fbo upload (not supported) %08x -> %08x", src, dst); - // Here we should just draw the pixels into the buffer. - // if (g_Config.bBlockTransferGPU) { - // } - } else if (srcBuffer && g_Config.iRenderingMode == FB_BUFFERED_MODE) { - WARN_LOG_ONCE(btdcpy, G3D, "Memcpy fbo download %08x -> %08x", src, dst); - // if (g_Config.bBlockTransferGPU) { - // ReadFramebufferToMemory(srcBuffer, true, 0, 0, srcBuffer->width, srcBuffer->height); - // } + WARN_LOG_REPORT_ONCE(btucpy, G3D, "Memcpy fbo upload %08x -> %08x", src, dst); + if (g_Config.bBlockTransferGPU) { + const u8 *srcBase = Memory::GetPointerUnchecked(src); + fbo_bind_as_render_target(dstBuffer->fbo); + glViewport(0, 0, dstBuffer->renderWidth, dstBuffer->renderHeight); + // TODO: Validate x/y/w/h based on size and offset? + DrawPixels(dstBuffer, 0, 0, srcBase, dstBuffer->format, dstBuffer->fb_stride, dstBuffer->width, dstBuffer->height); + dstBuffer->dirtyAfterDisplay = true; + if ((gstate_c.skipDrawReason & SKIPDRAW_SKIPFRAME) == 0) + dstBuffer->reallyDirtyAfterDisplay = true; + if (currentRenderVfb_) { + fbo_bind_as_render_target(currentRenderVfb_->fbo); + } else { + fbo_unbind(); + } + glstate.viewport.restore(); + gstate_c.textureChanged = TEXCHANGE_PARAMSONLY; + // This is a memcpy, let's still copy just in case. + return false; + } + return false; + } else if (srcBuffer) { + WARN_LOG_REPORT_ONCE(btdcpy, G3D, "Memcpy fbo download %08x -> %08x", src, dst); + if (g_Config.bBlockTransferGPU) { + // TODO: Validate x/y/w/h based on size and offset? 
+ ReadFramebufferToMemory(srcBuffer, true, 0, 0, srcBuffer->width, srcBuffer->height); + } + return false; + } else { + return false; } } -bool FramebufferManager::NotifyBlockTransfer(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int width, int height, int bpp) { +u32 FramebufferManager::FramebufferByteSize(const VirtualFramebuffer *vfb) const { + return vfb->fb_stride * vfb->height * (vfb->format == GE_FORMAT_8888 ? 4 : 2); +} + +void FramebufferManager::FindTransferFramebuffers(VirtualFramebuffer *&dstBuffer, VirtualFramebuffer *&srcBuffer, u32 dstBasePtr, int dstStride, int &dstX, int &dstY, u32 srcBasePtr, int srcStride, int &srcX, int &srcY, int bpp) const { + u32 dstYOffset = -1; // (u32)-1 is the sentinel for "no containing framebuffer found". + u32 srcYOffset = -1; + for (size_t i = 0; i < vfbs_.size(); ++i) { + VirtualFramebuffer *vfb = vfbs_[i]; + const u32 vfb_address = 0x04000000 | vfb->fb_address; + const u32 vfb_size = FramebufferByteSize(vfb); + if (vfb_address <= dstBasePtr && dstBasePtr < vfb_address + vfb_size) { + const u32 yOffset = (dstBasePtr - vfb_address) / (dstStride * bpp); + if (yOffset < dstYOffset) { + dstYOffset = yOffset; + dstBuffer = vfb; + } + } + if (vfb_address <= srcBasePtr && srcBasePtr < vfb_address + vfb_size) { + const u32 yOffset = (srcBasePtr - vfb_address) / (srcStride * bpp); + if (yOffset < srcYOffset) { + srcYOffset = yOffset; + srcBuffer = vfb; + } + } + } + + if (dstYOffset != (u32)-1) { + dstY += dstYOffset; + } + if (srcYOffset != (u32)-1) { // Only shift srcY when a source framebuffer was actually matched. + srcY += srcYOffset; + } +} + +bool FramebufferManager::NotifyBlockTransferBefore(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int width, int height, int bpp) { if (!(g_Config.iRenderingMode == FB_BUFFERED_MODE)) { return false; } - - if (Memory::IsRAMAddress(srcBasePtr) && Memory::IsVRAMAddress(dstBasePtr)) { - // TODO: This causes glitches in Tactics Ogre if we don't implement both ways (which will probably be slow...) 
- // The main thing this helps is videos, which will have a matching stride, and zero x/y. - if (dstStride == srcStride && dstY == 0 && dstX == 0 && srcX == 0 && srcY == 0) { - UpdateFromMemory(dstBasePtr, (dstY + height) * dstStride * bpp, true); + + // Skip checking if there's no framebuffers in that area. + if (!MayIntersectFramebuffer(srcBasePtr) && !MayIntersectFramebuffer(dstBasePtr)) { + return false; + } + + VirtualFramebuffer *dstBuffer = 0; + VirtualFramebuffer *srcBuffer = 0; + FindTransferFramebuffers(dstBuffer, srcBuffer, dstBasePtr, dstStride, dstX, dstY, srcBasePtr, srcStride, srcX, srcY, bpp); + + if (dstBuffer && srcBuffer) { + if (srcBuffer == dstBuffer) { + WARN_LOG_REPORT_ONCE(dstsrc, G3D, "Intra-buffer block transfer (not supported) %08x -> %08x", srcBasePtr, dstBasePtr); + } else { + WARN_LOG_ONCE(dstnotsrc, G3D, "Inter-buffer block transfer %08x -> %08x", srcBasePtr, dstBasePtr); + // Just do the blit! + if (g_Config.bBlockTransferGPU) { + BlitFramebuffer_(dstBuffer, dstX, dstY, srcBuffer, srcX, srcY, width, height, bpp); + return true; // No need to actually do the memory copy behind, probably. + } } + return false; + } else if (dstBuffer) { + // Here we should just draw the pixels into the buffer. Copy first. 
+ return false; + } else if (srcBuffer) { + WARN_LOG_ONCE(btd, G3D, "Block transfer download %08x -> %08x", srcBasePtr, dstBasePtr); + if (g_Config.bBlockTransferGPU && (srcBuffer == currentRenderVfb_ || !srcBuffer->memoryUpdated)) { + ReadFramebufferToMemory(srcBuffer, true, srcX, srcY, width, height); + } + return false; // Let the bit copy happen + } else { + return false; + } +} + +void FramebufferManager::NotifyBlockTransferAfter(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int width, int height, int bpp) { + if (!(g_Config.iRenderingMode == FB_BUFFERED_MODE)) { + return; } // A few games use this INSTEAD of actually drawing the video image to the screen, they just blast it to @@ -1858,6 +1997,7 @@ bool FramebufferManager::NotifyBlockTransfer(u32 dstBasePtr, int dstStride, int u32 backBuffer = PrevDisplayFramebufAddr(); u32 displayBuffer = DisplayFramebufAddr(); + // TODO: Is this not handled by upload? Should we check !dstBuffer to avoid a double copy? if (((backBuffer != 0 && dstBasePtr == backBuffer) || (displayBuffer != 0 && dstBasePtr == displayBuffer)) && dstStride == 512 && height == 272) { @@ -1865,51 +2005,32 @@ bool FramebufferManager::NotifyBlockTransfer(u32 dstBasePtr, int dstStride, int DrawFramebuffer(Memory::GetPointerUnchecked(dstBasePtr), GE_FORMAT_8888, 512, false); } - VirtualFramebuffer *dstBuffer = 0; - VirtualFramebuffer *srcBuffer = 0; - for (size_t i = 0; i < vfbs_.size(); ++i) { - VirtualFramebuffer *vfb = vfbs_[i]; - const u32 vfb_address = 0x04000000 | vfb->fb_address; - const u32 vfb_size = vfb->fb_stride * vfb->height * (vfb->format == GE_FORMAT_8888 ? 
4 : 2); - if (vfb_address <= dstBasePtr && dstBasePtr < vfb_address + vfb_size) { - dstY += (dstBasePtr - vfb_address) / (dstStride * bpp); - dstBuffer = vfb; - } - if (vfb_address <= srcBasePtr && srcBasePtr < vfb_address + vfb_size) { - srcY += (srcBasePtr - vfb_address) / (srcStride * bpp); - srcBuffer = vfb; - } - } + if (MayIntersectFramebuffer(srcBasePtr) || MayIntersectFramebuffer(dstBasePtr)) { + VirtualFramebuffer *dstBuffer = 0; + VirtualFramebuffer *srcBuffer = 0; + FindTransferFramebuffers(dstBuffer, srcBuffer, dstBasePtr, dstStride, dstX, dstY, srcBasePtr, srcStride, srcX, srcY, bpp); - if (dstBuffer && srcBuffer) { - if (srcBuffer == dstBuffer) { - WARN_LOG_REPORT_ONCE(dstsrc, G3D, "Intra-buffer block transfer (not supported) %08x -> %08x", srcBasePtr, dstBasePtr); - } else { - WARN_LOG_ONCE(dstnotsrc, G3D, "Inter-buffer block transfer %08x -> %08x", srcBasePtr, dstBasePtr); - // Just do the blit! - // TODO: Possibly take bpp into account somehow if games are doing really crazy things? + if (dstBuffer && !srcBuffer) { + WARN_LOG_REPORT_ONCE(btu, G3D, "Block transfer upload %08x -> %08x", srcBasePtr, dstBasePtr); if (g_Config.bBlockTransferGPU) { - BlitFramebuffer_(dstBuffer, dstX, dstY, srcBuffer, srcX, srcY, width, height, bpp); - return true; // No need to actually do the memory copy behind, probably. + const u8 *srcBase = Memory::GetPointerUnchecked(srcBasePtr) + (srcX + srcY * srcStride) * bpp; + fbo_bind_as_render_target(dstBuffer->fbo); + int dstBpp = dstBuffer->format == GE_FORMAT_8888 ? 
4 : 2; + float dstXFactor = (float)bpp / dstBpp; + glViewport(0, 0, dstBuffer->renderWidth, dstBuffer->renderHeight); + DrawPixels(dstBuffer, dstX * dstXFactor, dstY, srcBase, dstBuffer->format, srcStride * dstXFactor, width * dstXFactor, height); + dstBuffer->dirtyAfterDisplay = true; + if ((gstate_c.skipDrawReason & SKIPDRAW_SKIPFRAME) == 0) + dstBuffer->reallyDirtyAfterDisplay = true; + if (currentRenderVfb_) { + fbo_bind_as_render_target(currentRenderVfb_->fbo); + } else { + fbo_unbind(); + } + glstate.viewport.restore(); + gstate_c.textureChanged = TEXCHANGE_PARAMSONLY; } } - return false; - } else if (dstBuffer) { - WARN_LOG_REPORT_ONCE(btu, G3D, "Block transfer upload (not supported) %08x -> %08x", srcBasePtr, dstBasePtr); - if (g_Config.bBlockTransferGPU) { - u8 *srcBase = Memory::GetPointerUnchecked(srcBasePtr) + (srcX + srcY * srcStride) * bpp; - DrawPixels(dstBuffer, dstX, dstY, srcBase, dstBuffer->format, srcStride * bpp, width, height); - } - // Here we should just draw the pixels into the buffer. - return false; - } else if (srcBuffer && g_Config.iRenderingMode == FB_BUFFERED_MODE) { - WARN_LOG_ONCE(btd, G3D, "Block transfer download %08x -> %08x", srcBasePtr, dstBasePtr); - if (g_Config.bBlockTransferGPU) { - ReadFramebufferToMemory(srcBuffer, true, srcX, srcY, width, height); - } - return false; // Let the bit copy happen - } else { - return false; } } diff --git a/GPU/GLES/Framebuffer.h b/GPU/GLES/Framebuffer.h index a3ebf726c8..e23ae84cd1 100644 --- a/GPU/GLES/Framebuffer.h +++ b/GPU/GLES/Framebuffer.h @@ -166,7 +166,8 @@ public: // Returns true if it's sure this is a direct FBO->FBO transfer and it has already handle it. // In that case we hardly need to actually copy the bytes in VRAM, they will be wrong anyway (unless // read framebuffers is on, in which case this should always return false). 
- bool NotifyBlockTransfer(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int w, int h, int bpp); + bool NotifyBlockTransferBefore(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int w, int h, int bpp); + void NotifyBlockTransferAfter(u32 dstBasePtr, int dstStride, int dstX, int dstY, u32 srcBasePtr, int srcStride, int srcX, int srcY, int w, int h, int bpp); // Reads a rectangular subregion of a framebuffer to the right position in its backing memory. void ReadFramebufferToMemory(VirtualFramebuffer *vfb, bool sync, int x, int y, int w, int h); @@ -199,7 +200,17 @@ public: } } - void NotifyFramebufferCopy(u32 src, u32 dest, int size); + bool MayIntersectFramebuffer(u32 start) { + // Clear the cache/kernel bits. + start = start & 0x3FFFFFFF; + // Most games only have two framebuffers at the start. + if (start >= framebufRangeEnd_ || start < PSP_GetVidMemBase()) { + return false; + } + return true; + } + + bool NotifyFramebufferCopy(u32 src, u32 dest, int size); void DestroyFramebuf(VirtualFramebuffer *vfb); @@ -211,6 +222,9 @@ private: void CompileDraw2DProgram(); void DestroyDraw2DProgram(); + void FindTransferFramebuffers(VirtualFramebuffer *&dstBuffer, VirtualFramebuffer *&srcBuffer, u32 dstBasePtr, int dstStride, int &dstX, int &dstY, u32 srcBasePtr, int srcStride, int &srcX, int &srcY, int bpp) const; + u32 FramebufferByteSize(const VirtualFramebuffer *vfb) const; + void SetNumExtraFBOs(int num); u32 displayFramebufPtr_; @@ -227,11 +241,11 @@ private: VirtualFramebuffer *currentRenderVfb_; // Used by ReadFramebufferToMemory and later framebuffer block copies - void BlitFramebuffer_(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp); + void BlitFramebuffer_(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp, bool flip = false); #ifndef 
USING_GLES2 void PackFramebufferAsync_(VirtualFramebuffer *vfb); #endif - void PackFramebufferSync_(VirtualFramebuffer *vfb); + void PackFramebufferSync_(VirtualFramebuffer *vfb, int x, int y, int w, int h); // Used by DrawPixels unsigned int drawPixelsTex_; @@ -239,7 +253,8 @@ private: int drawPixelsTexW_; int drawPixelsTexH_; - u8 *convBuf; + u8 *convBuf_; + u32 convBufSize_; GLSLProgram *draw2dprogram_; GLSLProgram *plainColorProgram_; GLSLProgram *postShaderProgram_; @@ -257,7 +272,11 @@ private: bool resized_; bool useBufferedRendering_; bool updateVRAM_; - + bool gameUsesSequentialCopies_; + + // The range of PSP memory that may contain FBOs. So we can skip iterating. + u32 framebufRangeEnd_; + std::vector bvfbs_; // blitting FBOs std::map, FBO *> renderCopies_; diff --git a/GPU/GLES/GLES_GPU.cpp b/GPU/GLES/GLES_GPU.cpp index 679eb5496e..dbda6f590b 100644 --- a/GPU/GLES/GLES_GPU.cpp +++ b/GPU/GLES/GLES_GPU.cpp @@ -661,6 +661,10 @@ void GLES_GPU::ProcessEvent(GPUEvent ev) { InvalidateCacheInternal(ev.invalidate_cache.addr, ev.invalidate_cache.size, ev.invalidate_cache.type); break; + case GPU_EVENT_FB_MEMCPY: + UpdateMemoryInternal(ev.fb_memcpy.dst, ev.fb_memcpy.src, ev.fb_memcpy.size); + break; + default: GPUCommon::ProcessEvent(ev); } @@ -1908,30 +1912,31 @@ void GLES_GPU::DoBlockTransfer() { return; } - // Do the copy! (Hm, if we detect a drawn video frame (see below) then we could maybe skip this?) - // Can use GetPointerUnchecked because we checked the addresses above. We could also avoid them - // entirely by walking a couple of pointers... - if (srcStride == dstStride && width == srcStride) { - // Common case in God of War, let's do it all in one chunk. 
- u32 srcLineStartAddr = srcBasePtr + (srcY * srcStride + srcX) * bpp; - u32 dstLineStartAddr = dstBasePtr + (dstY * dstStride + dstX) * bpp; - const u8 *src = Memory::GetPointerUnchecked(srcLineStartAddr); - u8 *dst = Memory::GetPointerUnchecked(dstLineStartAddr); - memcpy(dst, src, width * height * bpp); - } else { - for (int y = 0; y < height; y++) { - u32 srcLineStartAddr = srcBasePtr + ((y + srcY) * srcStride + srcX) * bpp; - u32 dstLineStartAddr = dstBasePtr + ((y + dstY) * dstStride + dstX) * bpp; - + // Tell the framebuffer manager to take action if possible. If it does the entire thing, let's just return. + if (!framebufferManager_.NotifyBlockTransferBefore(dstBasePtr, dstStride, dstX, dstY, srcBasePtr, srcStride, srcX, srcY, width, height, bpp)) { + // Do the copy! (Hm, if we detect a drawn video frame (see below) then we could maybe skip this?) + // Can use GetPointerUnchecked because we checked the addresses above. We could also avoid them + // entirely by walking a couple of pointers... + if (srcStride == dstStride && width == srcStride) { + // Common case in God of War, let's do it all in one chunk. + u32 srcLineStartAddr = srcBasePtr + (srcY * srcStride + srcX) * bpp; + u32 dstLineStartAddr = dstBasePtr + (dstY * dstStride + dstX) * bpp; const u8 *src = Memory::GetPointerUnchecked(srcLineStartAddr); u8 *dst = Memory::GetPointerUnchecked(dstLineStartAddr); - memcpy(dst, src, width * bpp); - } - } + memcpy(dst, src, width * height * bpp); + } else { + for (int y = 0; y < height; y++) { + u32 srcLineStartAddr = srcBasePtr + ((y + srcY) * srcStride + srcX) * bpp; + u32 dstLineStartAddr = dstBasePtr + ((y + dstY) * dstStride + dstX) * bpp; + + const u8 *src = Memory::GetPointerUnchecked(srcLineStartAddr); + u8 *dst = Memory::GetPointerUnchecked(dstLineStartAddr); + memcpy(dst, src, width * bpp); + } + } - // Tell the framebuffer manager to take action if possible. If it does the entire thing, let's just return. 
- if (!framebufferManager_.NotifyBlockTransfer(dstBasePtr, dstStride, dstX, dstY, srcBasePtr, srcStride, srcX, srcY, width, height, bpp)) { textureCache_.Invalidate(dstBasePtr + (dstY * dstStride + dstX) * bpp, height * dstStride * bpp, GPU_INVALIDATE_HINT); + framebufferManager_.NotifyBlockTransferAfter(dstBasePtr, dstStride, dstX, dstY, srcBasePtr, srcStride, srcX, srcY, width, height, bpp); } #ifndef MOBILE_DEVICE @@ -1954,19 +1959,46 @@ void GLES_GPU::InvalidateCacheInternal(u32 addr, int size, GPUInvalidationType t else textureCache_.InvalidateAll(type); - if (type != GPU_INVALIDATE_ALL) - framebufferManager_.UpdateFromMemory(addr, size, type == GPU_INVALIDATE_SAFE); + if (type != GPU_INVALIDATE_ALL && framebufferManager_.MayIntersectFramebuffer(addr)) { + // If we're doing block transfers, we shouldn't need this, and it'll only confuse us. + // Vempire invalidates (with writeback) after drawing, but before blitting. + if (!g_Config.bBlockTransferGPU || type == GPU_INVALIDATE_SAFE) { + framebufferManager_.UpdateFromMemory(addr, size, type == GPU_INVALIDATE_SAFE); + } + } } -void GLES_GPU::UpdateMemory(u32 dest, u32 src, int size) { - InvalidateCache(dest, size, GPU_INVALIDATE_HINT); - - // Track stray copies of a framebuffer in RAM. MotoGP does this. - if (Memory::IsVRAMAddress(src) && Memory::IsRAMAddress(dest)) { - framebufferManager_.NotifyFramebufferCopy(src, dest, size); +void GLES_GPU::UpdateMemoryInternal(u32 dest, u32 src, int size) { + if (!framebufferManager_.NotifyFramebufferCopy(src, dest, size)) { + Memory::Memcpy(dest, Memory::GetPointer(src), size); + InvalidateCache(dest, size, GPU_INVALIDATE_HINT); + } else { + InvalidateCache(dest, size, GPU_INVALIDATE_HINT); } } +bool GLES_GPU::UpdateMemory(u32 dest, u32 src, int size) { + // Track stray copies of a framebuffer in RAM. MotoGP does this. 
+ if (framebufferManager_.MayIntersectFramebuffer(src) || framebufferManager_.MayIntersectFramebuffer(dest)) { + if (IsOnSeparateCPUThread()) { + GPUEvent ev(GPU_EVENT_FB_MEMCPY); + ev.fb_memcpy.dst = dest; + ev.fb_memcpy.src = src; + ev.fb_memcpy.size = size; + ScheduleEvent(ev); + + // This is a memcpy, so we need to wait for it to complete. + SyncThread(); + } else { + UpdateMemoryInternal(dest, src, size); + } + return true; + } + + InvalidateCache(dest, size, GPU_INVALIDATE_HINT); + return false; +} + void GLES_GPU::ClearCacheNextFrame() { textureCache_.ClearNextFrame(); } diff --git a/GPU/GLES/GLES_GPU.h b/GPU/GLES/GLES_GPU.h index 930b5153c6..872811c913 100644 --- a/GPU/GLES/GLES_GPU.h +++ b/GPU/GLES/GLES_GPU.h @@ -44,7 +44,7 @@ public: virtual void BeginFrame(); virtual void UpdateStats(); virtual void InvalidateCache(u32 addr, int size, GPUInvalidationType type); - virtual void UpdateMemory(u32 dest, u32 src, int size); + virtual bool UpdateMemory(u32 dest, u32 src, int size); virtual void ClearCacheNextFrame(); virtual void DeviceLost(); // Only happens on Android. Drop all textures and shaders. @@ -151,6 +151,7 @@ private: void InitClearInternal(); void BeginFrameInternal(); void CopyDisplayToOutputInternal(); + void UpdateMemoryInternal(u32 dest, u32 src, int size); void InvalidateCacheInternal(u32 addr, int size, GPUInvalidationType type); static CommandInfo cmdInfo_[256]; diff --git a/GPU/GPUInterface.h b/GPU/GPUInterface.h index 62d9bae30a..65f9d8a351 100644 --- a/GPU/GPUInterface.h +++ b/GPU/GPUInterface.h @@ -163,6 +163,7 @@ enum GPUEventType { GPU_EVENT_INVALIDATE_CACHE, GPU_EVENT_FINISH_EVENT_LOOP, GPU_EVENT_SYNC_THREAD, + GPU_EVENT_FB_MEMCPY, }; struct GPUEvent { @@ -175,6 +176,12 @@ struct GPUEvent { int size; GPUInvalidationType type; } invalidate_cache; + // GPU_EVENT_FB_MEMCPY + struct { + u32 dst; + u32 src; + int size; + } fb_memcpy; }; operator GPUEventType() const { @@ -227,7 +234,7 @@ public: // If size = -1, invalidate everything. 
virtual void InvalidateCache(u32 addr, int size, GPUInvalidationType type) = 0; // Update either RAM from VRAM, or VRAM from RAM... or even VRAM from VRAM. - virtual void UpdateMemory(u32 dest, u32 src, int size) = 0; + virtual bool UpdateMemory(u32 dest, u32 src, int size) = 0; // Will cause the texture cache to be cleared at the start of the next frame. virtual void ClearCacheNextFrame() = 0; diff --git a/GPU/Null/NullGpu.cpp b/GPU/Null/NullGpu.cpp index bfb5f2167a..5b881e86e4 100644 --- a/GPU/Null/NullGpu.cpp +++ b/GPU/Null/NullGpu.cpp @@ -657,7 +657,8 @@ void NullGPU::InvalidateCache(u32 addr, int size, GPUInvalidationType type) { // Nothing to invalidate. } -void NullGPU::UpdateMemory(u32 dest, u32 src, int size) { +bool NullGPU::UpdateMemory(u32 dest, u32 src, int size) { // Nothing to update. InvalidateCache(dest, size, GPU_INVALIDATE_HINT); + return false; } diff --git a/GPU/Null/NullGpu.h b/GPU/Null/NullGpu.h index 050588cdd8..db236f002c 100644 --- a/GPU/Null/NullGpu.h +++ b/GPU/Null/NullGpu.h @@ -34,7 +34,7 @@ public: virtual void CopyDisplayToOutput() {} virtual void UpdateStats(); virtual void InvalidateCache(u32 addr, int size, GPUInvalidationType type); - virtual void UpdateMemory(u32 dest, u32 src, int size); + virtual bool UpdateMemory(u32 dest, u32 src, int size); virtual void ClearCacheNextFrame() {}; virtual void DeviceLost() {} diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index 5e13362ff1..994a9dc307 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -851,12 +851,13 @@ void SoftGPU::InvalidateCache(u32 addr, int size, GPUInvalidationType type) // Nothing to invalidate. } -void SoftGPU::UpdateMemory(u32 dest, u32 src, int size) +bool SoftGPU::UpdateMemory(u32 dest, u32 src, int size) { // Nothing to update. InvalidateCache(dest, size, GPU_INVALIDATE_HINT); // Let's just be safe. 
framebufferDirty_ = true; + return false; } bool SoftGPU::FramebufferDirty() { diff --git a/GPU/Software/SoftGpu.h b/GPU/Software/SoftGpu.h index 940fd1669c..038d79e9bd 100644 --- a/GPU/Software/SoftGpu.h +++ b/GPU/Software/SoftGpu.h @@ -59,7 +59,7 @@ public: virtual void CopyDisplayToOutput(); virtual void UpdateStats(); virtual void InvalidateCache(u32 addr, int size, GPUInvalidationType type); - virtual void UpdateMemory(u32 dest, u32 src, int size); + virtual bool UpdateMemory(u32 dest, u32 src, int size); virtual void ClearCacheNextFrame() {}; virtual void DeviceLost() {}