From e3bdf1a70bd8053f99356d154b3032beba6f2d87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 29 Aug 2023 11:46:24 +0200 Subject: [PATCH 1/2] Add heuristic, fixing video flicker in Naruto UNH 2 caused by copy to wrong target. --- GPU/Common/FramebufferManagerCommon.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp index 0a0b1633b8..8a34b03146 100644 --- a/GPU/Common/FramebufferManagerCommon.cpp +++ b/GPU/Common/FramebufferManagerCommon.cpp @@ -1868,10 +1868,19 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size, if (!ignoreDstBuffer && vfb_address == dst && ((size == 0x44000 && vfb_size == 0x88000) || (size == 0x88000 && vfb_size == 0x44000))) { // Not likely to be a correct color format copy for this buffer. Ignore it, there will either be RAM // that can be displayed from, or another matching buffer with the right format if rendering is going on. + // If we had scoring here, we should strongly penalize this target instead of ignoring it. WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x conspicuously not matching copy size %08x in NotifyFramebufferCopy. Ignoring.", size, vfb_size); continue; } + if ((u32)size > vfb_size + 0x1000 && vfb->fb_format != GE_FORMAT_8888 && vfb->last_frame_render < gpuStats.numFlips) { + // Seems likely we are looking at a potential copy of 32-bit pixels (like video) to an old 16-bit buffer, + // which is very likely simply the wrong target, so skip it. See issue #17740 where this happens in Naruto Ultimate Ninja Heroes 2. + // If we had scoring here, we should strongly penalize this target instead of ignoring it. + WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x too small for %08x bytes of data and also 16-bit (%s), and not rendered to this frame. Ignoring.", vfb_size, size, GeBufferFormatToString(vfb->fb_format)); + continue; + } + if (!ignoreDstBuffer && dst >= vfb_address && (dst + size <= vfb_address + vfb_size || dst == vfb_address)) { const u32 offset = dst - vfb_address; const u32 yOffset = offset / vfb_byteStride; From c563d4e57d9e79b39b9fd0d3ac3e782da00a9f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 29 Aug 2023 12:53:18 +0200 Subject: [PATCH 2/2] NotifyFramebufferCopy: Pick the target framebuffer by scoring. --- GPU/Common/FramebufferManagerCommon.cpp | 156 ++++++++++++++---------- 1 file changed, 93 insertions(+), 63 deletions(-) diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp index 8a34b03146..5cd77dd8d3 100644 --- a/GPU/Common/FramebufferManagerCommon.cpp +++ b/GPU/Common/FramebufferManagerCommon.cpp @@ -1842,21 +1842,31 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size, // Or at least this should be like the other ones, gathering possible candidates // with the ability to list them out for debugging. - VirtualFramebuffer *dstBuffer = nullptr; - VirtualFramebuffer *srcBuffer = nullptr; bool ignoreDstBuffer = flags & GPUCopyFlag::FORCE_DST_MATCH_MEM; bool ignoreSrcBuffer = flags & (GPUCopyFlag::FORCE_SRC_MATCH_MEM | GPUCopyFlag::MEMSET); RasterChannel channel = flags & GPUCopyFlag::DEPTH_REQUESTED ? RASTER_DEPTH : RASTER_COLOR; - u32 dstY = (u32)-1; - u32 dstH = 0; - u32 srcY = (u32)-1; - u32 srcH = 0; + struct CopyCandidate { + VirtualFramebuffer *vfb; + int score; + + VirtualFramebuffer *dstBuffer = nullptr; + VirtualFramebuffer *srcBuffer = nullptr; + + u32 dstY = (u32)-1; + u32 dstH = 0; + u32 srcY = (u32)-1; + u32 srcH = 0; + }; + + TinySet candidates; for (auto vfb : vfbs_) { if (vfb->fb_stride == 0 || channel != RASTER_COLOR) { continue; } + CopyCandidate candidate{ vfb, 10 }; + // We only remove the kernel and uncached bits when comparing. const u32 vfb_address = vfb->fb_address; const u32 vfb_size = vfb->BufferByteSize(RASTER_COLOR); @@ -1869,129 +1879,149 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size, // Not likely to be a correct color format copy for this buffer. Ignore it, there will either be RAM // that can be displayed from, or another matching buffer with the right format if rendering is going on. // If we had scoring here, we should strongly penalize this target instead of ignoring it. - WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x conspicuously not matching copy size %08x in NotifyFramebufferCopy. Ignoring.", size, vfb_size); - continue; + WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x conspicuously not matching copy size %08x in NotifyFramebufferCopy. Downscoring.", size, vfb_size); + candidate.score = 2; } if ((u32)size > vfb_size + 0x1000 && vfb->fb_format != GE_FORMAT_8888 && vfb->last_frame_render < gpuStats.numFlips) { // Seems likely we are looking at a potential copy of 32-bit pixels (like video) to an old 16-bit buffer, // which is very likely simply the wrong target, so skip it. See issue #17740 where this happens in Naruto Ultimate Ninja Heroes 2. // If we had scoring here, we should strongly penalize this target instead of ignoring it. - WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x too small for %08x bytes of data and also 16-bit (%s), and not rendered to this frame. Ignoring.", vfb_size, size, GeBufferFormatToString(vfb->fb_format)); - continue; + WARN_LOG_N_TIMES(notify_copy_2x, 5, G3D, "Framebuffer size %08x too small for %08x bytes of data and also 16-bit (%s), and not rendered to this frame. Downscoring.", vfb_size, size, GeBufferFormatToString(vfb->fb_format)); + candidate.score = 1; } if (!ignoreDstBuffer && dst >= vfb_address && (dst + size <= vfb_address + vfb_size || dst == vfb_address)) { const u32 offset = dst - vfb_address; const u32 yOffset = offset / vfb_byteStride; - if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0) && yOffset < dstY) { - dstBuffer = vfb; - dstY = yOffset; - dstH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height); + if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0)) { + candidate.dstBuffer = vfb; + candidate.dstY = yOffset; + candidate.dstH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height); } } if (!ignoreSrcBuffer && src >= vfb_address && (src + size <= vfb_address + vfb_size || src == vfb_address)) { const u32 offset = src - vfb_address; const u32 yOffset = offset / vfb_byteStride; - if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0) && yOffset < srcY) { - srcBuffer = vfb; - srcY = yOffset; - srcH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height); - } else if ((offset % vfb_byteStride) == 0 && size == vfb->fb_stride && yOffset < srcY) { + if ((offset % vfb_byteStride) == 0 && (size == vfb_byteWidth || (size % vfb_byteStride) == 0)) { + candidate.srcBuffer = vfb; + candidate.srcY = yOffset; + candidate.srcH = size == vfb_byteWidth ? 1 : std::min((u32)size / vfb_byteStride, (u32)vfb->height); + } else if ((offset % vfb_byteStride) == 0 && size == vfb->fb_stride) { // Valkyrie Profile reads 512 bytes at a time, rather than 2048. So, let's whitelist fb_stride also. - srcBuffer = vfb; - srcY = yOffset; - srcH = 1; - } else if (yOffset == 0 && yOffset < srcY) { + candidate.srcBuffer = vfb; + candidate.srcY = yOffset; + candidate.srcH = 1; + } else if (yOffset == 0) { // Okay, last try - it might be a clut. if (vfb->usageFlags & FB_USAGE_CLUT) { - srcBuffer = vfb; - srcY = yOffset; - srcH = 1; + candidate.srcBuffer = vfb; + candidate.srcY = yOffset; + candidate.srcH = 1; } } } + candidates.push_back(candidate); } if (channel == RASTER_DEPTH) { - srcBuffer = nullptr; - dstBuffer = nullptr; - // Let's assume exact matches only for simplicity. for (auto vfb : vfbs_) { + CopyCandidate candidate{ vfb, 10 }; + candidate.srcBuffer = nullptr; + candidate.dstBuffer = nullptr; if (!ignoreDstBuffer && dst == vfb->z_address && size == vfb->z_stride * 2 * vfb->height) { - if (!dstBuffer || dstBuffer->depthBindSeq < vfb->depthBindSeq) { - dstBuffer = vfb; - dstY = 0; - dstH = vfb->height; + if (!candidate.dstBuffer || candidate.dstBuffer->depthBindSeq < vfb->depthBindSeq) { + candidate.dstBuffer = vfb; + candidate.dstY = 0; + candidate.dstH = vfb->height; } } if (!ignoreSrcBuffer && src == vfb->z_address && size == vfb->z_stride * 2 * vfb->height) { - if (!srcBuffer || srcBuffer->depthBindSeq < vfb->depthBindSeq) { - srcBuffer = vfb; - srcY = 0; - srcH = vfb->height; + if (!candidate.srcBuffer || candidate.srcBuffer->depthBindSeq < vfb->depthBindSeq) { + candidate.srcBuffer = vfb; + candidate.srcY = 0; + candidate.srcH = vfb->height; } } + candidates.push_back(candidate); } } + // Currently we find the first or best one, but maybe we should just copy to all? + CopyCandidate *best = nullptr; + for (size_t i = 0; i < candidates.size(); i++) { + if (!best) { + best = &candidates[i]; + } + if (candidates[i].score > best->score) { + best = &candidates[i]; + } + // In the old code, lower Y won, so let's replicate that. + if (candidates[i].score == best->score && candidates[i].dstY < best->dstY) { + best = &candidates[i]; + } + } + if (!best) { + return false; + } + if (!useBufferedRendering_) { // If we're copying into a recently used display buf, it's probably destined for the screen. - if (channel == RASTER_DEPTH || srcBuffer || (dstBuffer != displayFramebuf_ && dstBuffer != prevDisplayFramebuf_)) { + if (channel == RASTER_DEPTH || best->srcBuffer || (best->dstBuffer != displayFramebuf_ && best->dstBuffer != prevDisplayFramebuf_)) { return false; } } - if (!dstBuffer && srcBuffer && channel != RASTER_DEPTH) { + if (!best->dstBuffer && best->srcBuffer && channel != RASTER_DEPTH) { // Note - if we're here, we're in a memcpy, not a block transfer. Not allowing IntraVRAMBlockTransferAllowCreateFB. // Technically, that makes BlockTransferAllowCreateFB a bit of a misnomer. if (PSP_CoreParameter().compat.flags().BlockTransferAllowCreateFB && !(flags & GPUCopyFlag::DISALLOW_CREATE_VFB)) { - dstBuffer = CreateRAMFramebuffer(dst, srcBuffer->width, srcBuffer->height, srcBuffer->fb_stride, srcBuffer->fb_format); - dstY = 0; + best->dstBuffer = CreateRAMFramebuffer(dst, best->srcBuffer->width, best->srcBuffer->height, best->srcBuffer->fb_stride, best->srcBuffer->fb_format); + best->dstY = 0; } } - if (dstBuffer) { - dstBuffer->last_frame_used = gpuStats.numFlips; - if (channel == RASTER_DEPTH && !srcBuffer) - dstBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH; + if (best->dstBuffer) { + best->dstBuffer->last_frame_used = gpuStats.numFlips; + if (channel == RASTER_DEPTH && !best->srcBuffer) + best->dstBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH; } - if (srcBuffer && channel == RASTER_DEPTH && !dstBuffer) - srcBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH; + if (best->srcBuffer && channel == RASTER_DEPTH && !best->dstBuffer) + best->srcBuffer->usageFlags |= FB_USAGE_COLOR_MIXED_DEPTH; - if (dstBuffer && srcBuffer) { - if (srcBuffer == dstBuffer) { + if (best->dstBuffer && best->srcBuffer) { + if (best->srcBuffer == best->dstBuffer) { WARN_LOG_ONCE(dstsrccpy, G3D, "Intra-buffer memcpy (not supported) %08x -> %08x (size: %x)", src, dst, size); } else { WARN_LOG_ONCE(dstnotsrccpy, G3D, "Inter-buffer memcpy %08x -> %08x (size: %x)", src, dst, size); // Just do the blit! - BlitFramebuffer(dstBuffer, 0, dstY, srcBuffer, 0, srcY, srcBuffer->width, srcH, 0, channel, "Blit_InterBufferMemcpy"); - SetColorUpdated(dstBuffer, skipDrawReason); + BlitFramebuffer(best->dstBuffer, 0, best->dstY, best->srcBuffer, 0, best->srcY, best->srcBuffer->width, best->srcH, 0, channel, "Blit_InterBufferMemcpy"); + SetColorUpdated(best->dstBuffer, skipDrawReason); RebindFramebuffer("RebindFramebuffer - Inter-buffer memcpy"); } return false; - } else if (dstBuffer) { + } else if (best->dstBuffer) { if (flags & GPUCopyFlag::MEMSET) { gpuStats.numClears++; } WARN_LOG_ONCE(btucpy, G3D, "Memcpy fbo upload %08x -> %08x (size: %x)", src, dst, size); FlushBeforeCopy(); const u8 *srcBase = Memory::GetPointerUnchecked(src); - GEBufferFormat srcFormat = channel == RASTER_DEPTH ? GE_FORMAT_DEPTH16 : dstBuffer->fb_format; - int srcStride = channel == RASTER_DEPTH ? dstBuffer->z_stride : dstBuffer->fb_stride; - DrawPixels(dstBuffer, 0, dstY, srcBase, srcFormat, srcStride, dstBuffer->width, dstH, channel, "MemcpyFboUpload_DrawPixels"); - SetColorUpdated(dstBuffer, skipDrawReason); + GEBufferFormat srcFormat = channel == RASTER_DEPTH ? GE_FORMAT_DEPTH16 : best->dstBuffer->fb_format; + int srcStride = channel == RASTER_DEPTH ? best->dstBuffer->z_stride : best->dstBuffer->fb_stride; + DrawPixels(best->dstBuffer, 0, best->dstY, srcBase, srcFormat, srcStride, best->dstBuffer->width, best->dstH, channel, "MemcpyFboUpload_DrawPixels"); + SetColorUpdated(best->dstBuffer, skipDrawReason); RebindFramebuffer("RebindFramebuffer - Memcpy fbo upload"); // This is a memcpy, let's still copy just in case. return false; - } else if (srcBuffer) { + } else if (best->srcBuffer) { WARN_LOG_ONCE(btdcpy, G3D, "Memcpy fbo download %08x -> %08x", src, dst); FlushBeforeCopy(); - if (srcH == 0 || srcY + srcH > srcBuffer->bufferHeight) { - WARN_LOG_ONCE(btdcpyheight, G3D, "Memcpy fbo download %08x -> %08x skipped, %d+%d is taller than %d", src, dst, srcY, srcH, srcBuffer->bufferHeight); - } else if (!g_Config.bSkipGPUReadbacks && (!srcBuffer->memoryUpdated || channel == RASTER_DEPTH)) { - ReadFramebufferToMemory(srcBuffer, 0, srcY, srcBuffer->width, srcH, channel, Draw::ReadbackMode::BLOCK); - srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR; + if (best->srcH == 0 || best->srcY + best->srcH > best->srcBuffer->bufferHeight) { + WARN_LOG_ONCE(btdcpyheight, G3D, "Memcpy fbo download %08x -> %08x skipped, %d+%d is taller than %d", src, dst, best->srcY, best->srcH, best->srcBuffer->bufferHeight); + } else if (!g_Config.bSkipGPUReadbacks && (!best->srcBuffer->memoryUpdated || channel == RASTER_DEPTH)) { + ReadFramebufferToMemory(best->srcBuffer, 0, best->srcY, best->srcBuffer->width, best->srcH, channel, Draw::ReadbackMode::BLOCK); + best->srcBuffer->usageFlags = (best->srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR; } return false; } else { @@ -2115,7 +2145,7 @@ bool FramebufferManagerCommon::FindTransferFramebuffer(u32 basePtr, int stride_p } const BlockTransferRect *best = nullptr; - // Sort candidates by just recency for now, we might add other. + // Use some heuristics to find the best candidate. for (size_t i = 0; i < candidates.size(); i++) { const BlockTransferRect *candidate = &candidates[i];