Merge pull request #16434 from unknownbrackets/stencil-opt

Vulkan: Use stencil export when available
This commit is contained in:
Henrik Rydgård 2022-11-25 10:06:41 +01:00 committed by GitHub
commit d97035fffc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 121 additions and 58 deletions

View file

@ -269,6 +269,7 @@ D3D11DrawContext::D3D11DrawContext(ID3D11Device *device, ID3D11DeviceContext *de
caps_.anisoSupported = true;
caps_.textureNPOTFullySupported = true;
caps_.fragmentShaderDepthWriteSupported = true;
caps_.fragmentShaderStencilWriteSupported = false;
caps_.blendMinMaxSupported = true;
D3D11_FEATURE_DATA_D3D11_OPTIONS options{};

View file

@ -759,6 +759,7 @@ D3D9Context::D3D9Context(IDirect3D9 *d3d, IDirect3D9Ex *d3dEx, int adapterId, ID
caps_.framebufferSeparateDepthCopySupported = false;
caps_.texture3DSupported = true;
caps_.fragmentShaderDepthWriteSupported = true;
caps_.fragmentShaderStencilWriteSupported = false;
caps_.blendMinMaxSupported = true;
if ((caps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && caps.MaxAnisotropy > 1) {

View file

@ -65,6 +65,7 @@ enum class DataFormat : uint8_t {
S8,
D16,
D16_S8,
D24_S8,
D32F,
D32F_S8,

View file

@ -371,6 +371,7 @@ void CheckGLExtensions() {
gl_extensions.ARB_uniform_buffer_object = g_set_gl_extensions.count("GL_ARB_uniform_buffer_object") != 0;
gl_extensions.ARB_explicit_attrib_location = g_set_gl_extensions.count("GL_ARB_explicit_attrib_location") != 0;
gl_extensions.ARB_texture_non_power_of_two = g_set_gl_extensions.count("GL_ARB_texture_non_power_of_two") != 0;
gl_extensions.ARB_shader_stencil_export = g_set_gl_extensions.count("GL_ARB_shader_stencil_export") != 0;
if (gl_extensions.IsGLES) {
gl_extensions.EXT_blend_func_extended = g_set_gl_extensions.count("GL_EXT_blend_func_extended") != 0;
gl_extensions.OES_texture_npot = g_set_gl_extensions.count("GL_OES_texture_npot") != 0;

View file

@ -72,6 +72,7 @@ struct GLExtensions {
bool ARB_uniform_buffer_object;
bool ARB_texture_non_power_of_two;
bool ARB_stencil_texturing;
bool ARB_shader_stencil_export;
// EXT
bool EXT_swap_control_tear;

View file

@ -575,6 +575,7 @@ OpenGLContext::OpenGLContext() {
} else {
caps_.fragmentShaderDepthWriteSupported = true;
}
caps_.fragmentShaderStencilWriteSupported = gl_extensions.ARB_shader_stencil_export;
// GLES has no support for framebuffer logic operations, and no extension appears to add it either.
caps_.logicOpSupported = !gl_extensions.IsGLES;

View file

@ -592,6 +592,8 @@ static int GetBpp(VkFormat format) {
static VkFormat DataFormatToVulkan(DataFormat format) {
switch (format) {
case DataFormat::D16: return VK_FORMAT_D16_UNORM;
case DataFormat::D16_S8: return VK_FORMAT_D16_UNORM_S8_UINT;
case DataFormat::D24_S8: return VK_FORMAT_D24_UNORM_S8_UINT;
case DataFormat::D32F: return VK_FORMAT_D32_SFLOAT;
case DataFormat::D32F_S8: return VK_FORMAT_D32_SFLOAT_S8_UINT;
case DataFormat::S8: return VK_FORMAT_S8_UINT;
@ -784,6 +786,25 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanPushBuffer *push, const Textur
return true;
}
// Translates a Vulkan depth/stencil VkFormat into the matching Draw::DataFormat.
// Any format without a counterpart listed here maps to DataFormat::UNDEFINED.
static DataFormat DataFormatFromVulkanDepth(VkFormat fmt) {
	switch (fmt) {
	// 16-bit depth variants.
	case VK_FORMAT_D16_UNORM:          return DataFormat::D16;
	case VK_FORMAT_D16_UNORM_S8_UINT:  return DataFormat::D16_S8;
	// 24-bit depth with stencil.
	case VK_FORMAT_D24_UNORM_S8_UINT:  return DataFormat::D24_S8;
	// 32-bit float depth variants.
	case VK_FORMAT_D32_SFLOAT:         return DataFormat::D32F;
	case VK_FORMAT_D32_SFLOAT_S8_UINT: return DataFormat::D32F_S8;
	default:                           return DataFormat::UNDEFINED;
	}
}
VKContext::VKContext(VulkanContext *vulkan)
: vulkan_(vulkan), renderManager_(vulkan) {
shaderLanguageDesc_.Init(GLSL_VULKAN);
@ -803,12 +824,14 @@ VKContext::VKContext(VulkanContext *vulkan)
caps_.framebufferStencilBlitSupported = caps_.framebufferDepthBlitSupported;
caps_.framebufferDepthCopySupported = true; // Will pretty much always be the case.
caps_.framebufferSeparateDepthCopySupported = true; // Will pretty much always be the case.
caps_.preferredDepthBufferFormat = DataFormat::D24_S8; // TODO: Ask vulkan.
// This doesn't affect what depth/stencil format is actually used, see VulkanQueueRunner.
caps_.preferredDepthBufferFormat = DataFormatFromVulkanDepth(vulkan->GetDeviceInfo().preferredDepthStencilFormat);
caps_.texture3DSupported = true;
caps_.textureDepthSupported = true;
caps_.fragmentShaderInt32Supported = true;
caps_.textureNPOTFullySupported = true;
caps_.fragmentShaderDepthWriteSupported = true;
caps_.fragmentShaderStencilWriteSupported = vulkan->Extensions().EXT_shader_stencil_export;
caps_.blendMinMaxSupported = true;
caps_.logicOpSupported = vulkan->GetDeviceFeatures().enabled.standard.logicOp != 0;
caps_.multiViewSupported = vulkan->GetDeviceFeatures().enabled.multiview.multiview != 0;

View file

@ -46,6 +46,7 @@ size_t DataFormatSizeInBytes(DataFormat fmt) {
case DataFormat::S8: return 1;
case DataFormat::D16: return 2;
case DataFormat::D16_S8: return 3;
case DataFormat::D24_S8: return 4;
case DataFormat::D32F: return 4;
// Or maybe 8...
@ -68,6 +69,7 @@ const char *DataFormatToString(DataFormat fmt) {
case DataFormat::S8: return "S8";
case DataFormat::D16: return "D16";
case DataFormat::D16_S8: return "D16_S8";
case DataFormat::D24_S8: return "D24_S8";
case DataFormat::D32F: return "D32F";
case DataFormat::D32F_S8: return "D32F_S8";
@ -80,6 +82,7 @@ const char *DataFormatToString(DataFormat fmt) {
bool DataFormatIsDepthStencil(DataFormat fmt) {
switch (fmt) {
case DataFormat::D16:
case DataFormat::D16_S8:
case DataFormat::D24_S8:
case DataFormat::S8:
case DataFormat::D32F:

View file

@ -572,6 +572,7 @@ struct DeviceCaps {
bool fragmentShaderInt32Supported;
bool textureNPOTFullySupported;
bool fragmentShaderDepthWriteSupported;
bool fragmentShaderStencilWriteSupported;
bool textureDepthSupported;
bool blendMinMaxSupported;
bool multiViewSupported;

View file

@ -58,6 +58,34 @@ static u8 StencilBits8888(const u8 *ptr8, u32 numPixels) {
return bits >> 24;
}
// Inspects the stencil data in src according to the color format of dstBuffer.
//
// On success (return true), usedBits receives the result of the format-specific
// StencilBits* scan over fb_stride * bufferHeight pixels, and values receives the
// number of distinct stencil values the format can hold (2, 16 or 256).
// Returns false, leaving values/usedBits untouched, for formats that carry no
// stencil (565) or that should never be seen here (asserts in that case).
static bool CheckStencilBits(const u8 *src, const VirtualFramebuffer *dstBuffer, int &values, u8 &usedBits) {
	// The scan covers the full stride, not just the visible width.
	const auto pixelCount = dstBuffer->fb_stride * dstBuffer->bufferHeight;

	switch (dstBuffer->fb_format) {
	case GE_FORMAT_8888:
		usedBits = StencilBits8888(src, pixelCount);
		values = 256;
		return true;

	case GE_FORMAT_4444:
		usedBits = StencilBits4444(src, pixelCount);
		values = 16;
		return true;

	case GE_FORMAT_5551:
		usedBits = StencilBits5551(src, pixelCount);
		values = 2;
		return true;

	case GE_FORMAT_565:
		// Well, this doesn't make much sense.
		return false;

	case GE_FORMAT_INVALID:
	case GE_FORMAT_DEPTH16:
	case GE_FORMAT_CLUT8:
		// Inconceivable.
		_assert_(false);
		return false;
	}

	// Matches the original fall-through result for any value not listed above.
	return true;
}
// Uniform block contents for the stencil upload draw. An instance is filled in
// and handed to UpdateDynamicUniformBuffer() before each stencil draw call.
struct StencilUB {
// Stencil bit being written in the current pass, scaled by 1/255 (per format),
// which the fragment shader converts back with roundAndScaleTo255f().
float stencilValue;
};
@ -83,8 +111,12 @@ static const SamplerDef samplers[1] = {
{ 0, "tex" },
};
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs) {
ShaderWriter writer(buffer, lang, ShaderStage::Fragment);
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs, bool useExport) {
std::vector<const char *> extensions;
if (useExport)
extensions.push_back("#extension GL_ARB_shader_stencil_export : require");
ShaderWriter writer(buffer, lang, ShaderStage::Fragment, extensions);
writer.HighPrecisionFloat();
writer.DeclareSamplers(samplers);
@ -98,9 +130,13 @@ void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw:
writer.C(" vec4 index = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n");
writer.C(" vec4 outColor = index.aaaa;\n"); // Only care about a.
writer.C(" float shifted = roundAndScaleTo255f(index.a) / roundAndScaleTo255f(stencilValue);\n");
// Bitwise operations on floats, ugh.
writer.C(" if (mod(floor(shifted), 2.0) < 0.99) DISCARD;\n");
if (useExport) {
writer.C(" gl_FragStencilRefARB = int(roundAndScaleTo255f(index.a));\n");
} else {
writer.C(" float shifted = roundAndScaleTo255f(index.a) / roundAndScaleTo255f(stencilValue);\n");
// Bitwise operations on floats, ugh.
writer.C(" if (mod(floor(shifted), 2.0) < 0.99) DISCARD;\n");
}
if (bugs.Has(Draw::Bugs::NO_DEPTH_CANNOT_DISCARD_STENCIL)) {
writer.C(" gl_FragDepth = gl_FragCoord.z;\n");
@ -135,10 +171,11 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
return false;
}
VirtualFramebuffer *dstBuffer = 0;
VirtualFramebuffer *dstBuffer = nullptr;
for (size_t i = 0; i < vfbs_.size(); ++i) {
VirtualFramebuffer *vfb = vfbs_[i];
if (vfb->fb_address == addr) {
// TODO: Maybe we should broadcast to all? Most of the time, there's only one.
if (vfb->fb_address == addr && (!dstBuffer || dstBuffer->colorBindSeq < vfb->colorBindSeq)) {
dstBuffer = vfb;
}
}
@ -148,34 +185,15 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
int values = 0;
u8 usedBits = 0;
bool useExportShader = draw_->GetDeviceCaps().fragmentShaderStencilWriteSupported;
const u8 *src = Memory::GetPointer(addr);
if (!src)
return false;
switch (dstBuffer->fb_format) {
case GE_FORMAT_565:
// Well, this doesn't make much sense.
// Could skip this when doing useExportShader, but then we couldn't optimize usedBits == 0.
if (!CheckStencilBits(src, dstBuffer, values, usedBits))
return false;
case GE_FORMAT_5551:
usedBits = StencilBits5551(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
values = 2;
break;
case GE_FORMAT_4444:
usedBits = StencilBits4444(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
values = 16;
break;
case GE_FORMAT_8888:
usedBits = StencilBits8888(src, dstBuffer->fb_stride * dstBuffer->bufferHeight);
values = 256;
break;
case GE_FORMAT_INVALID:
case GE_FORMAT_DEPTH16:
case GE_FORMAT_CLUT8:
// Inconceivable.
_assert_(false);
break;
}
if (usedBits == 0) {
if (flags & WriteStencil::STENCIL_IS_ZERO) {
@ -201,7 +219,7 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
char *fsCode = new char[8192];
char *vsCode = new char[8192];
GenerateStencilFs(fsCode, shaderLanguageDesc, draw_->GetBugs());
GenerateStencilFs(fsCode, shaderLanguageDesc, draw_->GetBugs(), useExportShader);
GenerateStencilVs(vsCode, shaderLanguageDesc);
_assert_msg_(strlen(fsCode) < 8192, "StenFS length error: %d", (int)strlen(fsCode));
@ -303,24 +321,32 @@ bool FramebufferManagerCommon::PerformWriteStencilFromMemory(u32 addr, int size,
draw_->SetScissorRect(0, 0, w, h);
draw_->BindPipeline(stencilWritePipeline_);
for (int i = 1; i < values; i += i) {
if (!(usedBits & i)) {
// It's already zero, let's skip it.
continue;
}
if (useExportShader) {
// We only need to do one pass if using an export shader.
StencilUB ub{};
if (dstBuffer->fb_format == GE_FORMAT_4444) {
draw_->SetStencilParams(0xFF, (i << 4) | i, 0xFF);
ub.stencilValue = i * (16.0f / 255.0f);
} else if (dstBuffer->fb_format == GE_FORMAT_5551) {
draw_->SetStencilParams(0xFF, 0xFF, 0xFF);
ub.stencilValue = i * (128.0f / 255.0f);
} else {
draw_->SetStencilParams(0xFF, i, 0xFF);
ub.stencilValue = i * (1.0f / 255.0f);
}
draw_->SetStencilParams(0xFF, 0xFF, 0xFF);
draw_->UpdateDynamicUniformBuffer(&ub, sizeof(ub));
draw_->DrawUP(positions, 3);
} else {
for (int i = 1; i < values; i += i) {
if (!(usedBits & i)) {
// It's already zero, let's skip it.
continue;
}
StencilUB ub{};
if (dstBuffer->fb_format == GE_FORMAT_4444) {
draw_->SetStencilParams(0xFF, (i << 4) | i, 0xFF);
ub.stencilValue = i * (16.0f / 255.0f);
} else if (dstBuffer->fb_format == GE_FORMAT_5551) {
draw_->SetStencilParams(0xFF, 0xFF, 0xFF);
ub.stencilValue = i * (128.0f / 255.0f);
} else {
draw_->SetStencilParams(0xFF, i, 0xFF);
ub.stencilValue = i * (1.0f / 255.0f);
}
draw_->UpdateDynamicUniformBuffer(&ub, sizeof(ub));
draw_->DrawUP(positions, 3);
}
}
if (useBlit) {

View file

@ -5,5 +5,5 @@
#include "Common/GPU/thin3d.h"
// Exposed for automated tests
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs);
void GenerateStencilFs(char *buffer, const ShaderLanguageDesc &lang, const Draw::Bugs &bugs, bool useExport);
void GenerateStencilVs(char *buffer, const ShaderLanguageDesc &lang);

View file

@ -291,17 +291,21 @@ bool TestStencilShaders() {
ShaderLanguageDesc desc(languages[k]);
std::string errorMessage;
// Generate all despite failures - it's only 6.
GenerateStencilFs(buffer, desc, bugs);
if (strlen(buffer) >= 8192) {
printf("Stencil fragment shader exceeded buffer:\n\n%s\n", LineNumberString(buffer).c_str());
failed = true;
}
if (!TestCompileShader(buffer, languages[k], ShaderStage::Fragment, &errorMessage)) {
printf("Error compiling stencil shader:\n\n%s\n\n%s\n", LineNumberString(buffer).c_str(), errorMessage.c_str());
failed = true;
} else {
printf("===\n%s\n===\n", buffer);
// Generate all despite failures - it's only a few.
// Only use export on Vulkan, because GLSL_3xx is ES which doesn't support stencil export.
bool allowUseExport = languages[k] == ShaderLanguage::GLSL_VULKAN;
for (int useExport = 0; useExport <= (allowUseExport ? 1 : 0); ++useExport) {
GenerateStencilFs(buffer, desc, bugs, useExport == 1);
if (strlen(buffer) >= 8192) {
printf("Stencil fragment shader (useExport=%d) exceeded buffer:\n\n%s\n", useExport, LineNumberString(buffer).c_str());
failed = true;
}
if (!TestCompileShader(buffer, languages[k], ShaderStage::Fragment, &errorMessage)) {
printf("Error compiling stencil shader (useExport=%d):\n\n%s\n\n%s\n", useExport, LineNumberString(buffer).c_str(), errorMessage.c_str());
failed = true;
} else {
printf("===\n%s\n===\n", buffer);
}
}
GenerateStencilVs(buffer, desc);