From d243d45741114e029ec5a2977bf289abefbaba6f Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 24 Nov 2022 16:33:31 -0800 Subject: [PATCH 1/3] GPU: Upload stencil to latest buffer. Just being safe. --- GPU/Common/StencilCommon.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/GPU/Common/StencilCommon.cpp b/GPU/Common/StencilCommon.cpp index 682d8a1f0c..0f454bd076 100644 --- a/GPU/Common/StencilCommon.cpp +++ b/GPU/Common/StencilCommon.cpp @@ -135,10 +135,11 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size, return false; } - VirtualFramebuffer *dstBuffer = 0; + VirtualFramebuffer *dstBuffer = nullptr; for (size_t i = 0; i < vfbs_.size(); ++i) { VirtualFramebuffer *vfb = vfbs_[i]; - if (vfb->fb_address == addr) { + // TODO: Maybe we should broadcast to all? Most of the time, there's only one. + if (vfb->fb_address == addr && (!dstBuffer || dstBuffer->colorBindSeq < vfb->colorBindSeq)) { dstBuffer = vfb; } } From b33662550f0b792edd2e34219de7d18c98cd7bbf Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 24 Nov 2022 18:48:46 -0800 Subject: [PATCH 2/3] Vulkan: Use stencil export when available. This prevents multiple passes to upload stencil buffers. 
--- Common/GPU/D3D11/thin3d_d3d11.cpp | 1 + Common/GPU/D3D9/thin3d_d3d9.cpp | 1 + Common/GPU/OpenGL/GLFeatures.cpp | 1 + Common/GPU/OpenGL/GLFeatures.h | 1 + Common/GPU/OpenGL/thin3d_gl.cpp | 1 + Common/GPU/Vulkan/thin3d_vulkan.cpp | 1 + Common/GPU/thin3d.h | 1 + GPU/Common/StencilCommon.cpp | 111 +++++++++++++++++----------- GPU/Common/StencilCommon.h | 2 +- unittest/TestShaderGenerators.cpp | 26 ++++--- 10 files changed, 91 insertions(+), 55 deletions(-) diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp index 46713b53f3..8b1d5c00e7 100644 --- a/Common/GPU/D3D11/thin3d_d3d11.cpp +++ b/Common/GPU/D3D11/thin3d_d3d11.cpp @@ -268,6 +268,7 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de caps_.anisoSupported = true; caps_.textureNPOTFullySupported = true; caps_.fragmentShaderDepthWriteSupported = true; + caps_.fragmentShaderStencilWriteSupported = false; caps_.blendMinMaxSupported = true; D3D11_FEATURE_DATA_D3D11_OPTIONS options{}; diff --git a/Common/GPU/D3D9/thin3d_d3d9.cpp b/Common/GPU/D3D9/thin3d_d3d9.cpp index 8de75b09d0..207a09e676 100644 --- a/Common/GPU/D3D9/thin3d_d3d9.cpp +++ b/Common/GPU/D3D9/thin3d_d3d9.cpp @@ -758,6 +758,7 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID caps_.framebufferSeparateDepthCopySupported = false; caps_.texture3DSupported = true; caps_.fragmentShaderDepthWriteSupported = true; + caps_.fragmentShaderStencilWriteSupported = false; caps_.blendMinMaxSupported = true; if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1) { diff --git a/Common/GPU/OpenGL/GLFeatures.cpp b/Common/GPU/OpenGL/GLFeatures.cpp index ecec49c82e..06ef1ead3b 100644 --- a/Common/GPU/OpenGL/GLFeatures.cpp +++ b/Common/GPU/OpenGL/GLFeatures.cpp @@ -371,6 +371,7 @@ void CheckGLExtensions() { gl_extensions.ARB_uniform_buffer_object = g_set_gl_extensions.count("GL_ARB_uniform_buffer_object") != 0; gl_extensions.ARB_explicit_attrib_location 
= g_set_gl_extensions.count("GL_ARB_explicit_attrib_location") != 0; gl_extensions.ARB_texture_non_power_of_two = g_set_gl_extensions.count("GL_ARB_texture_non_power_of_two") != 0; + gl_extensions.ARB_shader_stencil_export = g_set_gl_extensions.count("GL_ARB_shader_stencil_export") != 0; if (gl_extensions.IsGLES) { gl_extensions.EXT_blend_func_extended = g_set_gl_extensions.count("GL_EXT_blend_func_extended") != 0; gl_extensions.OES_texture_npot = g_set_gl_extensions.count("GL_OES_texture_npot") != 0; diff --git a/Common/GPU/OpenGL/GLFeatures.h b/Common/GPU/OpenGL/GLFeatures.h index 0e07fd266f..a2d20b178f 100644 --- a/Common/GPU/OpenGL/GLFeatures.h +++ b/Common/GPU/OpenGL/GLFeatures.h @@ -72,6 +72,7 @@ struct GLExtensions { bool ARB_uniform_buffer_object; bool ARB_texture_non_power_of_two; bool ARB_stencil_texturing; + bool ARB_shader_stencil_export; // EXT bool EXT_swap_control_tear; diff --git a/Common/GPU/OpenGL/thin3d_gl.cpp b/Common/GPU/OpenGL/thin3d_gl.cpp index 9878886fd0..ba55d642a5 100644 --- a/Common/GPU/OpenGL/thin3d_gl.cpp +++ b/Common/GPU/OpenGL/thin3d_gl.cpp @@ -575,6 +575,7 @@ OpenGLContext::OpenGLContext() { } else { caps_.fragmentShaderDepthWriteSupported = true; } + caps_.fragmentShaderStencilWriteSupported = gl_extensions.ARB_shader_stencil_export; // GLES has no support for logic framebuffer operations. There doesn't even seem to exist any such extensions. 
caps_.logicOpSupported = !gl_extensions.IsGLES; diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp index f055ad9499..819b154905 100644 --- a/Common/GPU/Vulkan/thin3d_vulkan.cpp +++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp @@ -809,6 +809,7 @@ VKContext::VKContext(VulkanContext *vulkan) caps_.fragmentShaderInt32Supported = true; caps_.textureNPOTFullySupported = true; caps_.fragmentShaderDepthWriteSupported = true; + caps_.fragmentShaderStencilWriteSupported = vulkan->Extensions().EXT_shader_stencil_export; caps_.blendMinMaxSupported = true; caps_.logicOpSupported = vulkan->GetDeviceFeatures().enabled.standard.logicOp != 0; caps_.multiViewSupported = vulkan->GetDeviceFeatures().enabled.multiview.multiview != 0; diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h index 53d35250ce..106e6ec5dc 100644 --- a/Common/GPU/thin3d.h +++ b/Common/GPU/thin3d.h @@ -570,6 +570,7 @@ struct DeviceCaps { bool fragmentShaderInt32Supported; bool textureNPOTFullySupported; bool fragmentShaderDepthWriteSupported; + bool fragmentShaderStencilWriteSupported; bool textureDepthSupported; bool blendMinMaxSupported; bool multiViewSupported; diff --git a/GPU/Common/StencilCommon.cpp b/GPU/Common/StencilCommon.cpp index 0f454bd076..c879154e46 100644 --- a/GPU/Common/StencilCommon.cpp +++ b/GPU/Common/StencilCommon.cpp @@ -58,6 +58,34 @@ static u8 StencilBits8888(const u8 *ptr8, u32 numPixels) { return bits >> 24; } +static bool CheckStencilBits(const u8 *src, const VirtualFramebuffer *dstBuffer, int &values, u8 &usedBits) { + switch (dstBuffer->fb_format) { + case GE_FORMAT_565: + // Well, this doesn't make much sense. 
+ return false; + case GE_FORMAT_5551: + usedBits = StencilBits5551(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); + values = 2; + break; + case GE_FORMAT_4444: + usedBits = StencilBits4444(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); + values = 16; + break; + case GE_FORMAT_8888: + usedBits = StencilBits8888(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); + values = 256; + break; + case GE_FORMAT_INVALID: + case GE_FORMAT_DEPTH16: + case GE_FORMAT_CLUT8: + // Inconceivable. + _assert_(false); + return false; + } + + return true; +} + struct StencilUB { float stencilValue; }; @@ -83,8 +111,12 @@ static const SamplerDef samplers[1] = { { 0, "tex" }, }; -void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs) { - ShaderWriter writer(buffer, lang, ShaderStage::Fragment); +void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs, bool useExport) { + std::vector<const char *> extensions; + if (useExport) + extensions.push_back("#extension GL_ARB_shader_stencil_export : require"); + + ShaderWriter writer(buffer, lang, ShaderStage::Fragment, extensions); writer.HighPrecisionFloat(); writer.DeclareSamplers(samplers); @@ -98,9 +130,13 @@ void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw: writer.C(" vec4 index = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n"); writer.C(" vec4 outColor = index.aaaa;\n"); // Only care about a. - writer.C(" float shifted = roundAndScaleTo255f(index.a) / roundAndScaleTo255f(stencilValue);\n"); - // Bitwise operations on floats, ugh. - writer.C(" if (mod(floor(shifted), 2.0) < 0.99) DISCARD;\n"); + if (useExport) { + writer.C(" gl_FragStencilRefARB = int(roundAndScaleTo255f(index.a));\n"); + } else { + writer.C(" float shifted = roundAndScaleTo255f(index.a) / roundAndScaleTo255f(stencilValue);\n"); + // Bitwise operations on floats, ugh. 
+ writer.C(" if (mod(floor(shifted), 2.0) < 0.99) DISCARD;\n"); + } if (bugs.Has(Draw::Bugs::NO_DEPTH_CANNOT_DISCARD_STENCIL)) { writer.C(" gl_FragDepth = gl_FragCoord.z;\n"); @@ -149,34 +185,15 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size, int values = 0; u8 usedBits = 0; + bool useExportShader = draw_->GetDeviceCaps().fragmentShaderStencilWriteSupported; const u8 *src = Memory::GetPointer(addr); if (!src) return false; - switch (dstBuffer->fb_format) { - case GE_FORMAT_565: - // Well, this doesn't make much sense. + // Could skip this when doing useExportShader, but then we couldn't optimize usedBits == 0. + if (!CheckStencilBits(src, dstBuffer, values, usedBits)) return false; - case GE_FORMAT_5551: - usedBits = StencilBits5551(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); - values = 2; - break; - case GE_FORMAT_4444: - usedBits = StencilBits4444(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); - values = 16; - break; - case GE_FORMAT_8888: - usedBits = StencilBits8888(src, dstBuffer->fb_stride * dstBuffer->bufferHeight); - values = 256; - break; - case GE_FORMAT_INVALID: - case GE_FORMAT_DEPTH16: - case GE_FORMAT_CLUT8: - // Inconceivable. 
- _assert_(false); - break; - } if (usedBits == 0) { if (flags & WriteStencil::STENCIL_IS_ZERO) { @@ -202,7 +219,7 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size, char *fsCode = new char[8192]; char *vsCode = new char[8192]; - GenerateStencilFs(fsCode, shaderLanguageDesc, draw_->GetBugs()); + GenerateStencilFs(fsCode, shaderLanguageDesc, draw_->GetBugs(), useExportShader); GenerateStencilVs(vsCode, shaderLanguageDesc); _assert_msg_(strlen(fsCode) < 8192, "StenFS length error: %d", (int)strlen(fsCode)); @@ -304,24 +321,32 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size, draw_->SetScissorRect(0, 0, w, h); draw_->BindPipeline(stencilWritePipeline_); - for (int i = 1; i < values; i += i) { - if (!(usedBits & i)) { - // It's already zero, let's skip it. - continue; - } + if (useExportShader) { + // We only need to do one pass if using an export shader. StencilUB ub{}; - if (dstBuffer->fb_format == GE_FORMAT_4444) { - draw_->SetStencilParams(0xFF, (i << 4) | i, 0xFF); - ub.stencilValue = i * (16.0f / 255.0f); - } else if (dstBuffer->fb_format == GE_FORMAT_5551) { - draw_->SetStencilParams(0xFF, 0xFF, 0xFF); - ub.stencilValue = i * (128.0f / 255.0f); - } else { - draw_->SetStencilParams(0xFF, i, 0xFF); - ub.stencilValue = i * (1.0f / 255.0f); - } + draw_->SetStencilParams(0xFF, 0xFF, 0xFF); draw_->UpdateDynamicUniformBuffer(&ub, sizeof(ub)); draw_->DrawUP(positions, 3); + } else { + for (int i = 1; i < values; i += i) { + if (!(usedBits & i)) { + // It's already zero, let's skip it. 
+ continue; + } + StencilUB ub{}; + if (dstBuffer->fb_format == GE_FORMAT_4444) { + draw_->SetStencilParams(0xFF, (i << 4) | i, 0xFF); + ub.stencilValue = i * (16.0f / 255.0f); + } else if (dstBuffer->fb_format == GE_FORMAT_5551) { + draw_->SetStencilParams(0xFF, 0xFF, 0xFF); + ub.stencilValue = i * (128.0f / 255.0f); + } else { + draw_->SetStencilParams(0xFF, i, 0xFF); + ub.stencilValue = i * (1.0f / 255.0f); + } + draw_->UpdateDynamicUniformBuffer(&ub, sizeof(ub)); + draw_->DrawUP(positions, 3); + } } if (useBlit) { diff --git a/GPU/Common/StencilCommon.h b/GPU/Common/StencilCommon.h index e9746012a6..a8c16732e0 100644 --- a/GPU/Common/StencilCommon.h +++ b/GPU/Common/StencilCommon.h @@ -5,5 +5,5 @@ #include "Common/GPU/thin3d.h" // Exposed for automated tests -void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs); +void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs, bool useExport); void GenerateStencilVs(char *buffer, const ShaderLanguageDesc &lang); diff --git a/unittest/TestShaderGenerators.cpp b/unittest/TestShaderGenerators.cpp index 6701e162fd..7722c9f947 100644 --- a/unittest/TestShaderGenerators.cpp +++ b/unittest/TestShaderGenerators.cpp @@ -291,17 +291,21 @@ bool TestStencilShaders() { ShaderLanguageDesc desc(languages[k]); std::string errorMessage; - // Generate all despite failures - it's only 6. - GenerateStencilFs(buffer, desc, bugs); - if (strlen(buffer) >= 8192) { - printf("Stencil fragment shader exceeded buffer:\n\n%s\n", LineNumberString(buffer).c_str()); - failed = true; - } - if (!TestCompileShader(buffer, languages[k], ShaderStage::Fragment, &errorMessage)) { - printf("Error compiling stencil shader:\n\n%s\n\n%s\n", LineNumberString(buffer).c_str(), errorMessage.c_str()); - failed = true; - } else { - printf("===\n%s\n===\n", buffer); + // Generate all despite failures - it's only a few. 
+ // Only use export on Vulkan, because GLSL_3xx is ES which doesn't support stencil export. + bool allowUseExport = languages[k] == ShaderLanguage::GLSL_VULKAN; + for (int useExport = 0; useExport <= (allowUseExport ? 1 : 0); ++useExport) { + GenerateStencilFs(buffer, desc, bugs, useExport == 1); + if (strlen(buffer) >= 8192) { + printf("Stencil fragment shader (useExport=%d) exceeded buffer:\n\n%s\n", useExport, LineNumberString(buffer).c_str()); + failed = true; + } + if (!TestCompileShader(buffer, languages[k], ShaderStage::Fragment, &errorMessage)) { + printf("Error compiling stencil shader (useExport=%d):\n\n%s\n\n%s\n", useExport, LineNumberString(buffer).c_str(), errorMessage.c_str()); + failed = true; + } else { + printf("===\n%s\n===\n", buffer); + } } GenerateStencilVs(buffer, desc); From 9fcccd789a40b3d73c98e13183f9f5df8f1ecaf2 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 24 Nov 2022 19:09:42 -0800 Subject: [PATCH 3/3] Vulkan: Correct displayed depth/stencil format. Also corrects handling of non-24 bit depth when reducing the range. 
--- Common/GPU/DataFormat.h | 1 + Common/GPU/Vulkan/thin3d_vulkan.cpp | 24 +++++++++++++++++++++++- Common/GPU/thin3d.cpp | 3 +++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/Common/GPU/DataFormat.h b/Common/GPU/DataFormat.h index b116ac9ab1..32463d3a16 100644 --- a/Common/GPU/DataFormat.h +++ b/Common/GPU/DataFormat.h @@ -65,6 +65,7 @@ enum class DataFormat : uint8_t { S8, D16, + D16_S8, D24_S8, D32F, D32F_S8, diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp index 819b154905..1626958e91 100644 --- a/Common/GPU/Vulkan/thin3d_vulkan.cpp +++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp @@ -592,6 +592,8 @@ static int GetBpp(VkFormat format) { static VkFormat DataFormatToVulkan(DataFormat format) { switch (format) { case DataFormat::D16: return VK_FORMAT_D16_UNORM; + case DataFormat::D16_S8: return VK_FORMAT_D16_UNORM_S8_UINT; + case DataFormat::D24_S8: return VK_FORMAT_D24_UNORM_S8_UINT; case DataFormat::D32F: return VK_FORMAT_D32_SFLOAT; case DataFormat::D32F_S8: return VK_FORMAT_D32_SFLOAT_S8_UINT; case DataFormat::S8: return VK_FORMAT_S8_UINT; @@ -784,6 +786,25 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushBuffer *push, const Textur return true; } +static DataFormat DataFormatFromVulkanDepth(VkFormat fmt) { + switch (fmt) { + case VK_FORMAT_D24_UNORM_S8_UINT: + return DataFormat::D24_S8; + case VK_FORMAT_D16_UNORM: + return DataFormat::D16; + case VK_FORMAT_D32_SFLOAT: + return DataFormat::D32F; + case VK_FORMAT_D32_SFLOAT_S8_UINT: + return DataFormat::D32F_S8; + case VK_FORMAT_D16_UNORM_S8_UINT: + return DataFormat::D16_S8; + default: + break; + } + + return DataFormat::UNDEFINED; +} + VKContext::VKContext(VulkanContext *vulkan) : vulkan_(vulkan), renderManager_(vulkan) { shaderLanguageDesc_.Init(GLSL_VULKAN); @@ -803,7 +824,8 @@ VKContext::VKContext(VulkanContext *vulkan) caps_.framebufferStencilBlitSupported = caps_.framebufferDepthBlitSupported; caps_.framebufferDepthCopySupported = true; // Will pretty 
much always be the case. caps_.framebufferSeparateDepthCopySupported = true; // Will pretty much always be the case. - caps_.preferredDepthBufferFormat = DataFormat::D24_S8; // TODO: Ask vulkan. + // This doesn't affect what depth/stencil format is actually used, see VulkanQueueRunner. + caps_.preferredDepthBufferFormat = DataFormatFromVulkanDepth(vulkan->GetDeviceInfo().preferredDepthStencilFormat); caps_.texture3DSupported = true; caps_.textureDepthSupported = true; caps_.fragmentShaderInt32Supported = true; diff --git a/Common/GPU/thin3d.cpp b/Common/GPU/thin3d.cpp index fab4e665d1..a93a876766 100644 --- a/Common/GPU/thin3d.cpp +++ b/Common/GPU/thin3d.cpp @@ -46,6 +46,7 @@ size_t DataFormatSizeInBytes(DataFormat fmt) { case DataFormat::S8: return 1; case DataFormat::D16: return 2; + case DataFormat::D16_S8: return 3; case DataFormat::D24_S8: return 4; case DataFormat::D32F: return 4; // Or maybe 8... @@ -68,6 +69,7 @@ const char *DataFormatToString(DataFormat fmt) { case DataFormat::S8: return "S8"; case DataFormat::D16: return "D16"; + case DataFormat::D16_S8: return "D16_S8"; case DataFormat::D24_S8: return "D24_S8"; case DataFormat::D32F: return "D32F"; case DataFormat::D32F_S8: return "D32F_S8"; @@ -80,6 +82,7 @@ const char *DataFormatToString(DataFormat fmt) { bool DataFormatIsDepthStencil(DataFormat fmt) { switch (fmt) { case DataFormat::D16: + case DataFormat::D16_S8: case DataFormat::D24_S8: case DataFormat::S8: case DataFormat::D32F: