#include <unordered_map>

#include "DataFormat.h"
#include "Common/Log.h"
#include "Common/TimeUtil.h"
#include "VulkanQueueRunner.h"
#include "VulkanRenderManager.h"

// Debug help: adb logcat -s DEBUG PPSSPPNativeActivity PPSSPP NativeGLView NativeRenderer NativeSurfaceView PowerSaveModeReceiver InputDeviceState

void VulkanQueueRunner::CreateDeviceObjects() {
	INFO_LOG(G3D, "VulkanQueueRunner::CreateDeviceObjects");
	InitBackbufferRenderPass();

	framebufferRenderPass_ = GetRenderPass(VKRRenderPassAction::CLEAR, VKRRenderPassAction::CLEAR, VKRRenderPassAction::CLEAR,
		VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL,
		VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL);

#if 0
	// Just to check whether it makes sense to split some of these. drawidx is way bigger than the others...
	// We should probably just move to variable-size data in a raw buffer anyway...
	VkRenderData rd;
	INFO_LOG(G3D, "sizeof(pipeline): %d", (int)sizeof(rd.pipeline));
	INFO_LOG(G3D, "sizeof(draw): %d", (int)sizeof(rd.draw));
	INFO_LOG(G3D, "sizeof(drawidx): %d", (int)sizeof(rd.drawIndexed));
	INFO_LOG(G3D, "sizeof(clear): %d", (int)sizeof(rd.clear));
	INFO_LOG(G3D, "sizeof(viewport): %d", (int)sizeof(rd.viewport));
	INFO_LOG(G3D, "sizeof(scissor): %d", (int)sizeof(rd.scissor));
	INFO_LOG(G3D, "sizeof(blendColor): %d", (int)sizeof(rd.blendColor));
	INFO_LOG(G3D, "sizeof(push): %d", (int)sizeof(rd.push));
#endif
}

void VulkanQueueRunner::ResizeReadbackBuffer(VkDeviceSize requiredSize) {
	if (readbackBuffer_ && requiredSize <= readbackBufferSize_) {
		return;
	}
	if (readbackMemory_) {
		vulkan_->Delete().QueueDeleteDeviceMemory(readbackMemory_);
	}
	if (readbackBuffer_) {
		vulkan_->Delete().QueueDeleteBuffer(readbackBuffer_);
	}

	readbackBufferSize_ = requiredSize;

	VkDevice device = vulkan_->GetDevice();

	VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
	buf.size = readbackBufferSize_;
	buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
	vkCreateBuffer(device, &buf, nullptr, &readbackBuffer_);

	VkMemoryRequirements reqs{};
	vkGetBufferMemoryRequirements(device, readbackBuffer_, &reqs);

	VkMemoryAllocateInfo allocInfo{ VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
	allocInfo.allocationSize = reqs.size;

	// For speedy readbacks, we want the CPU cache to be enabled. However on most hardware we then have to
	// sacrifice coherency, which means manual flushing. But try to find such memory first! If no cached
	// memory type is available we fall back to just coherent.
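	// Note: if we end up with a cached but non-coherent type (readbackBufferIsCoherent_ == false),
	// whoever maps readbackMemory_ to read the results back must invalidate the mapped range first,
	// roughly like this (sketch only - the actual readback path is outside this function):
	//
	//   VkMappedMemoryRange range{ VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE };
	//   range.memory = readbackMemory_;
	//   range.offset = 0;
	//   range.size = VK_WHOLE_SIZE;
	//   vkInvalidateMappedMemoryRanges(device, 1, &range);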
const VkFlags desiredTypes[] = { VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, }; VkFlags successTypeReqs = 0; for (VkFlags typeReqs : desiredTypes) { if (vulkan_->MemoryTypeFromProperties(reqs.memoryTypeBits, typeReqs, &allocInfo.memoryTypeIndex)) { successTypeReqs = typeReqs; break; } } _assert_(successTypeReqs != 0); readbackBufferIsCoherent_ = (successTypeReqs & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0; VkResult res = vkAllocateMemory(device, &allocInfo, nullptr, &readbackMemory_); if (res != VK_SUCCESS) { readbackMemory_ = VK_NULL_HANDLE; vkDestroyBuffer(device, readbackBuffer_, nullptr); readbackBuffer_ = VK_NULL_HANDLE; return; } uint32_t offset = 0; vkBindBufferMemory(device, readbackBuffer_, readbackMemory_, offset); } void VulkanQueueRunner::DestroyDeviceObjects() { INFO_LOG(G3D, "VulkanQueueRunner::DestroyDeviceObjects"); vulkan_->Delete().QueueDeleteDeviceMemory(readbackMemory_); vulkan_->Delete().QueueDeleteBuffer(readbackBuffer_); readbackBufferSize_ = 0; renderPasses_.Iterate([&](const RPKey &rpkey, VkRenderPass rp) { _assert_(rp != VK_NULL_HANDLE); vulkan_->Delete().QueueDeleteRenderPass(rp); }); renderPasses_.Clear(); _assert_(backbufferRenderPass_ != VK_NULL_HANDLE); vulkan_->Delete().QueueDeleteRenderPass(backbufferRenderPass_); backbufferRenderPass_ = VK_NULL_HANDLE; } void VulkanQueueRunner::InitBackbufferRenderPass() { VkAttachmentDescription attachments[2]; attachments[0].format = vulkan_->GetSwapchainFormat(); attachments[0].samples = VK_SAMPLE_COUNT_1_BIT; attachments[0].loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; attachments[0].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[0].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; attachments[0].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; attachments[0].initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; // We don't want to preserve the backbuffer between frames so we really don't care. attachments[0].finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; // We only render once to the backbuffer per frame so we can do this here. attachments[0].flags = 0; attachments[1].format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat; // must use this same format later for the back depth buffer. attachments[1].samples = VK_SAMPLE_COUNT_1_BIT; attachments[1].loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; attachments[1].storeOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; // Don't care about storing backbuffer Z - we clear it anyway. 
attachments[1].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; attachments[1].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; #ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_DEPTH_STENCIL attachments[1].initialLayout = VK_IMAGE_LAYOUT_GENERAL; attachments[1].finalLayout = VK_IMAGE_LAYOUT_GENERAL; #else attachments[1].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; attachments[1].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; #endif attachments[1].flags = 0; VkAttachmentReference color_reference{}; color_reference.attachment = 0; #ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR color_reference.layout = VK_IMAGE_LAYOUT_GENERAL; #else color_reference.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; #endif VkAttachmentReference depth_reference{}; depth_reference.attachment = 1; depth_reference.layout = attachments[1].finalLayout; VkSubpassDescription subpass{}; subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; subpass.flags = 0; subpass.inputAttachmentCount = 0; subpass.pInputAttachments = nullptr; subpass.colorAttachmentCount = 1; subpass.pColorAttachments = &color_reference; subpass.pResolveAttachments = nullptr; subpass.pDepthStencilAttachment = &depth_reference; subpass.preserveAttachmentCount = 0; subpass.pPreserveAttachments = nullptr; // For the built-in layout transitions. VkSubpassDependency dep{}; dep.srcSubpass = VK_SUBPASS_EXTERNAL; dep.dstSubpass = 0; dep.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; dep.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; dep.srcAccessMask = 0; dep.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; VkRenderPassCreateInfo rp_info{ VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO }; rp_info.attachmentCount = 2; rp_info.pAttachments = attachments; rp_info.subpassCount = 1; rp_info.pSubpasses = &subpass; rp_info.dependencyCount = 1; rp_info.pDependencies = &dep; VkResult res = vkCreateRenderPass(vulkan_->GetDevice(), &rp_info, nullptr, &backbufferRenderPass_); _assert_(res == VK_SUCCESS); } VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { auto pass = renderPasses_.Get(key); if (pass) { return pass; } VkAttachmentDescription attachments[2] = {}; attachments[0].format = VK_FORMAT_R8G8B8A8_UNORM; attachments[0].samples = VK_SAMPLE_COUNT_1_BIT; switch (key.colorLoadAction) { case VKRRenderPassAction::CLEAR: attachments[0].loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; break; case VKRRenderPassAction::KEEP: attachments[0].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; break; case VKRRenderPassAction::DONT_CARE: default: attachments[0].loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; break; } attachments[0].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[0].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; attachments[0].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; #ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR attachments[0].initialLayout = VK_IMAGE_LAYOUT_GENERAL; attachments[0].finalLayout = VK_IMAGE_LAYOUT_GENERAL; #else attachments[0].initialLayout = key.prevColorLayout; attachments[0].finalLayout = key.finalColorLayout; #endif attachments[0].flags = 0; attachments[1].format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat; attachments[1].samples = VK_SAMPLE_COUNT_1_BIT; switch (key.depthLoadAction) { case VKRRenderPassAction::CLEAR: attachments[1].loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; break; case VKRRenderPassAction::KEEP: attachments[1].loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; break; case VKRRenderPassAction::DONT_CARE: attachments[1].loadOp = 
VK_ATTACHMENT_LOAD_OP_DONT_CARE; break; } switch (key.stencilLoadAction) { case VKRRenderPassAction::CLEAR: attachments[1].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; break; case VKRRenderPassAction::KEEP: attachments[1].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD; break; case VKRRenderPassAction::DONT_CARE: attachments[1].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; break; } attachments[1].storeOp = VK_ATTACHMENT_STORE_OP_STORE; attachments[1].stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE; #ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_DEPTH_STENCIL attachments[1].initialLayout = VK_IMAGE_LAYOUT_GENERAL; attachments[1].finalLayout = VK_IMAGE_LAYOUT_GENERAL; #else attachments[1].initialLayout = key.prevDepthStencilLayout; attachments[1].finalLayout = key.finalDepthStencilLayout; #endif attachments[1].flags = 0; VkAttachmentReference color_reference{}; color_reference.attachment = 0; color_reference.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; VkAttachmentReference depth_reference{}; depth_reference.attachment = 1; depth_reference.layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; VkSubpassDescription subpass{}; subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; subpass.flags = 0; subpass.inputAttachmentCount = 0; subpass.pInputAttachments = nullptr; subpass.colorAttachmentCount = 1; subpass.pColorAttachments = &color_reference; subpass.pResolveAttachments = nullptr; subpass.pDepthStencilAttachment = &depth_reference; subpass.preserveAttachmentCount = 0; subpass.pPreserveAttachments = nullptr; VkSubpassDependency deps[2]{}; int numDeps = 0; switch (key.prevColorLayout) { case VK_IMAGE_LAYOUT_UNDEFINED: // No need to specify stage or access. break; case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: // Already the right color layout. Unclear that we need to do a lot here.. break; case VK_IMAGE_LAYOUT_GENERAL: // We came from the Mali workaround, and are transitioning back to COLOR_ATTACHMENT_OPTIMAL. deps[numDeps].srcAccessMask |= VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; break; case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: deps[numDeps].srcAccessMask |= VK_ACCESS_SHADER_READ_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: deps[numDeps].srcAccessMask |= VK_ACCESS_TRANSFER_WRITE_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: deps[numDeps].srcAccessMask |= VK_ACCESS_TRANSFER_READ_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; default: _dbg_assert_msg_(false, "GetRenderPass: Unexpected color layout %d", (int)key.prevColorLayout); break; } switch (key.prevDepthStencilLayout) { case VK_IMAGE_LAYOUT_UNDEFINED: // No need to specify stage or access. break; case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: // Already the right depth layout. Unclear that we need to do a lot here.. 
break; case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: deps[numDeps].srcAccessMask |= VK_ACCESS_SHADER_READ_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: deps[numDeps].srcAccessMask |= VK_ACCESS_TRANSFER_READ_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: deps[numDeps].srcAccessMask |= VK_ACCESS_TRANSFER_WRITE_BIT; deps[numDeps].srcStageMask |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; default: _dbg_assert_msg_(false, "GetRenderPass: Unexpected depth layout %d", (int)key.prevDepthStencilLayout); break; } if (deps[numDeps].srcAccessMask) { deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL; deps[numDeps].dstSubpass = 0; deps[numDeps].dependencyFlags = 0; deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; deps[numDeps].dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; numDeps++; } // And the final transition. // Don't need to transition it if VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL. switch (key.finalColorLayout) { case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: deps[numDeps].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: deps[numDeps].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: deps[numDeps].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; deps[numDeps].dstStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_UNDEFINED: case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: // Nothing to do. break; default: _dbg_assert_msg_(false, "GetRenderPass: Unexpected final color layout %d", (int)key.finalColorLayout); break; } switch (key.finalDepthStencilLayout) { case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: deps[numDeps].dstAccessMask |= VK_ACCESS_SHADER_READ_BIT; deps[numDeps].dstStageMask |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: deps[numDeps].dstAccessMask |= VK_ACCESS_TRANSFER_READ_BIT; deps[numDeps].dstStageMask |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: deps[numDeps].dstAccessMask |= VK_ACCESS_TRANSFER_READ_BIT; deps[numDeps].dstStageMask |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_UNDEFINED: case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: // Nothing to do. 
		break;
	default:
		_dbg_assert_msg_(false, "GetRenderPass: Unexpected final depth layout %d", (int)key.finalDepthStencilLayout);
		break;
	}

	if (deps[numDeps].dstAccessMask) {
		deps[numDeps].srcSubpass = 0;
		deps[numDeps].dstSubpass = VK_SUBPASS_EXTERNAL;
		deps[numDeps].dependencyFlags = 0;
		deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
		deps[numDeps].srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
		numDeps++;
	}

	VkRenderPassCreateInfo rp{ VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO };
	rp.attachmentCount = 2;
	rp.pAttachments = attachments;
	rp.subpassCount = 1;
	rp.pSubpasses = &subpass;
	if (numDeps) {
		rp.dependencyCount = numDeps;
		rp.pDependencies = deps;
	}

	VkResult res = vkCreateRenderPass(vulkan_->GetDevice(), &rp, nullptr, &pass);
	_assert_(res == VK_SUCCESS);
	_assert_(pass != VK_NULL_HANDLE);
	renderPasses_.Insert(key, pass);
	return pass;
}

void VulkanQueueRunner::PreprocessSteps(std::vector<VKRStep *> &steps) {
	// Optimizes renderpasses, then sequences them.
	// Planned optimizations:
	// * Create copies of render target that are rendered to multiple times and textured from in sequence, and push those render passes
	//   as early as possible in the frame (Wipeout billboards).

	for (int j = 0; j < (int)steps.size(); j++) {
		if (steps[j]->stepType == VKRStepType::RENDER && steps[j]->render.framebuffer) {
			if (steps[j]->render.finalColorLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
				steps[j]->render.finalColorLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
			}
			if (steps[j]->render.finalDepthStencilLayout == VK_IMAGE_LAYOUT_UNDEFINED) {
				steps[j]->render.finalDepthStencilLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
			}
		}
	}

	for (int j = 0; j < (int)steps.size() - 1; j++) {
		// Push down empty "Clear/Store" renderpasses, and merge them with the first "Load/Store" to the same framebuffer.
		if (steps.size() > 1 && steps[j]->stepType == VKRStepType::RENDER &&
			steps[j]->render.numDraws == 0 &&
			steps[j]->render.numReads == 0 &&
			steps[j]->render.color == VKRRenderPassAction::CLEAR &&
			steps[j]->render.stencil == VKRRenderPassAction::CLEAR &&
			steps[j]->render.depth == VKRRenderPassAction::CLEAR) {
			// Drop the clear step, and merge it into the next step that touches the same framebuffer.
			for (int i = j + 1; i < (int)steps.size(); i++) {
				if (steps[i]->stepType == VKRStepType::RENDER && steps[i]->render.framebuffer == steps[j]->render.framebuffer) {
					if (steps[i]->render.color != VKRRenderPassAction::CLEAR) {
						steps[i]->render.color = VKRRenderPassAction::CLEAR;
						steps[i]->render.clearColor = steps[j]->render.clearColor;
					}
					if (steps[i]->render.depth != VKRRenderPassAction::CLEAR) {
						steps[i]->render.depth = VKRRenderPassAction::CLEAR;
						steps[i]->render.clearDepth = steps[j]->render.clearDepth;
					}
					if (steps[i]->render.stencil != VKRRenderPassAction::CLEAR) {
						steps[i]->render.stencil = VKRRenderPassAction::CLEAR;
						steps[i]->render.clearStencil = steps[j]->render.clearStencil;
					}
					// Cheaply skip the first step.
					steps[j]->stepType = VKRStepType::RENDER_SKIP;
					break;
				} else if (steps[i]->stepType == VKRStepType::COPY &&
					steps[i]->copy.src == steps[j]->render.framebuffer) {
					// Can't eliminate the clear if a game copies from it before it's
					// rendered to. However this should be rare.
					// TODO: This should never happen when we check numReads now.
					break;
				}
			}
		}
	}

	// Queue hacks.
	if (hacksEnabled_) {
		if (hacksEnabled_ & QUEUE_HACK_MGS2_ACID) {
			// Massive speedup.
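			// Turns the game's copy, render(1), copy, render(1), ... ping-pong into a batch of
			// copies followed by a single merged render pass - see ApplyMGSHack below.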
			ApplyMGSHack(steps);
		}
		if (hacksEnabled_ & QUEUE_HACK_SONIC) {
			ApplySonicHack(steps);
		}
		if (hacksEnabled_ & QUEUE_HACK_RENDERPASS_MERGE) {
			ApplyRenderPassMerge(steps);
		}
	}
}

void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, QueueProfileContext *profile) {
	if (profile)
		profile->cpuStartTime = real_time_now();

	bool emitLabels = vulkan_->Extensions().EXT_debug_utils;
	for (size_t i = 0; i < steps.size(); i++) {
		const VKRStep &step = *steps[i];

		if (emitLabels) {
			VkDebugUtilsLabelEXT labelInfo{ VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT };
			labelInfo.pLabelName = step.tag;
			vkCmdBeginDebugUtilsLabelEXT(cmd, &labelInfo);
		}

		switch (step.stepType) {
		case VKRStepType::RENDER:
			PerformRenderPass(step, cmd);
			break;
		case VKRStepType::COPY:
			PerformCopy(step, cmd);
			break;
		case VKRStepType::BLIT:
			PerformBlit(step, cmd);
			break;
		case VKRStepType::READBACK:
			PerformReadback(step, cmd);
			break;
		case VKRStepType::READBACK_IMAGE:
			PerformReadbackImage(step, cmd);
			break;
		case VKRStepType::RENDER_SKIP:
			break;
		}

		if (profile && profile->timestampDescriptions.size() + 1 < MAX_TIMESTAMP_QUERIES) {
			vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile->queryPool, (uint32_t)profile->timestampDescriptions.size());
			profile->timestampDescriptions.push_back(StepToString(step));
		}

		if (emitLabels) {
			vkCmdEndDebugUtilsLabelEXT(cmd);
		}
	}

	// Deleting all in one go should be easier on the instruction cache than deleting
	// them as we go - and easier to debug because we can look backwards in the frame.
	for (size_t i = 0; i < steps.size(); i++) {
		delete steps[i];
	}

	if (profile)
		profile->cpuEndTime = real_time_now();
}

void VulkanQueueRunner::ApplyMGSHack(std::vector<VKRStep *> &steps) {
	// Really need a sane way to express transforms of steps.

	// We want to turn a sequence of copy,render(1),copy,render(1),copy,render(1) to copy,copy,copy,render(n).

	for (int i = 0; i < (int)steps.size() - 3; i++) {
		int last = -1;
		if (!(steps[i]->stepType == VKRStepType::COPY &&
			steps[i + 1]->stepType == VKRStepType::RENDER &&
			steps[i + 2]->stepType == VKRStepType::COPY &&
			steps[i + 1]->render.numDraws == 1 &&
			steps[i]->copy.dst == steps[i + 2]->copy.dst))
			continue;
		// Looks promising! Let's start by finding the last one.
		for (int j = i; j < (int)steps.size(); j++) {
			switch (steps[j]->stepType) {
			case VKRStepType::RENDER:
				if (steps[j]->render.numDraws > 1)
					last = j - 1;
				// should really also check descriptor sets...
				if (steps[j]->commands.size()) {
					VkRenderData &cmd = steps[j]->commands.back();
					if (cmd.cmd == VKRRenderCommand::DRAW_INDEXED && cmd.draw.count != 6)
						last = j - 1;
				}
				break;
			case VKRStepType::COPY:
				if (steps[j]->copy.dst != steps[i]->copy.dst)
					last = j - 1;
				break;
			default:
				break;
			}
			if (last != -1)
				break;
		}

		if (last != -1) {
			// We've got a sequence from i to last that needs reordering.
			// First, let's sort it, keeping the same length.
			std::vector<VKRStep *> copies;
			std::vector<VKRStep *> renders;
			copies.reserve((last - i) / 2);
			renders.reserve((last - i) / 2);
			for (int n = i; n <= last; n++) {
				if (steps[n]->stepType == VKRStepType::COPY)
					copies.push_back(steps[n]);
				else if (steps[n]->stepType == VKRStepType::RENDER)
					renders.push_back(steps[n]);
			}
			// Write the copies back. TODO: Combine them too.
			for (int j = 0; j < (int)copies.size(); j++) {
				steps[i + j] = copies[j];
			}
			// Write the renders back (so they will be deleted properly).
			for (int j = 0; j < (int)renders.size(); j++) {
				steps[i + j + copies.size()] = renders[j];
			}
			_assert_(steps[i + copies.size()]->stepType == VKRStepType::RENDER);
			// Combine the renders.
			for (int j = 1; j < (int)renders.size(); j++) {
				for (int k = 0; k < (int)renders[j]->commands.size(); k++) {
					steps[i + copies.size()]->commands.push_back(renders[j]->commands[k]);
				}
				steps[i + copies.size() + j]->stepType = VKRStepType::RENDER_SKIP;
			}
			// We're done.
			break;
		}
	}

	// There's also a post processing effect using depals that's just brutal in some parts
	// of the game.
	for (int i = 0; i < (int)steps.size() - 3; i++) {
		int last = -1;
		if (!(steps[i]->stepType == VKRStepType::RENDER &&
			steps[i + 1]->stepType == VKRStepType::RENDER &&
			steps[i + 2]->stepType == VKRStepType::RENDER &&
			steps[i]->render.numDraws == 1 &&
			steps[i + 1]->render.numDraws == 1 &&
			steps[i + 2]->render.numDraws == 1 &&
			steps[i]->render.color == VKRRenderPassAction::DONT_CARE &&
			steps[i + 1]->render.color == VKRRenderPassAction::KEEP &&
			steps[i + 2]->render.color == VKRRenderPassAction::DONT_CARE))
			continue;

		VKRFramebuffer *depalFramebuffer = steps[i]->render.framebuffer;
		VKRFramebuffer *targetFramebuffer = steps[i + 1]->render.framebuffer;
		// OK, found the start of a post-process sequence. Let's scan until we find the end.
		for (int j = i; j < (int)steps.size() - 3; j++) {
			if (((j - i) & 1) == 0) {
				// This should be a depal draw.
				if (steps[j]->render.numDraws != 1)
					break;
				if (steps[j]->render.color != VKRRenderPassAction::DONT_CARE)
					break;
				if (steps[j]->render.framebuffer != depalFramebuffer)
					break;
				last = j;
			} else {
				// This should be a target draw.
				if (steps[j]->render.numDraws != 1)
					break;
				if (steps[j]->render.color != VKRRenderPassAction::KEEP)
					break;
				if (steps[j]->render.framebuffer != targetFramebuffer)
					break;
				last = j;
			}
		}

		if (last == -1)
			continue;

		// Combine the depal renders.
		for (int j = i + 2; j <= last + 1; j += 2) {
			for (int k = 0; k < (int)steps[j]->commands.size(); k++) {
				switch (steps[j]->commands[k].cmd) {
				case VKRRenderCommand::DRAW:
				case VKRRenderCommand::DRAW_INDEXED:
					steps[i]->commands.push_back(steps[j]->commands[k]);
					break;
				default:
					break;
				}
			}
			steps[j]->stepType = VKRStepType::RENDER_SKIP;
		}

		// Combine the target renders.
		for (int j = i + 3; j <= last; j += 2) {
			for (int k = 0; k < (int)steps[j]->commands.size(); k++) {
				switch (steps[j]->commands[k].cmd) {
				case VKRRenderCommand::DRAW:
				case VKRRenderCommand::DRAW_INDEXED:
					steps[i + 1]->commands.push_back(steps[j]->commands[k]);
					break;
				default:
					break;
				}
			}
			steps[j]->stepType = VKRStepType::RENDER_SKIP;
		}

		// We're done - we only expect one of these sequences per frame.
		break;
	}
}

void VulkanQueueRunner::ApplySonicHack(std::vector<VKRStep *> &steps) {
	// We want to turn a sequence of render(3),render(1),render(6),render(1),render(6),render(1),render(3) to
	// render(1), render(1), render(1), render(6), render(6), render(6)
	for (int i = 0; i < (int)steps.size() - 4; i++) {
		int last = -1;
		if (!(steps[i]->stepType == VKRStepType::RENDER &&
			steps[i + 1]->stepType == VKRStepType::RENDER &&
			steps[i + 2]->stepType == VKRStepType::RENDER &&
			steps[i + 3]->stepType == VKRStepType::RENDER &&
			steps[i]->render.numDraws == 3 &&
			steps[i + 1]->render.numDraws == 1 &&
			steps[i + 2]->render.numDraws == 6 &&
			steps[i + 3]->render.numDraws == 1 &&
			steps[i]->render.framebuffer == steps[i + 2]->render.framebuffer &&
			steps[i + 1]->render.framebuffer == steps[i + 3]->render.framebuffer))
			continue;
		// Looks promising! Let's start by finding the last one.
		for (int j = i; j < (int)steps.size(); j++) {
			switch (steps[j]->stepType) {
			case VKRStepType::RENDER:
				if ((j - i) & 1) {
					if (steps[j]->render.framebuffer != steps[i + 1]->render.framebuffer)
						last = j - 1;
					if (steps[j]->render.numDraws != 1)
						last = j - 1;
				} else {
					if (steps[j]->render.framebuffer != steps[i]->render.framebuffer)
						last = j - 1;
					if (steps[j]->render.numDraws != 3 && steps[j]->render.numDraws != 6)
						last = j - 1;
				}
				break;
			default:
				break;
			}
			if (last != -1)
				break;
		}

		if (last != -1) {
			// We've got a sequence from i to last that needs reordering.
			// First, let's sort it, keeping the same length.
			std::vector<VKRStep *> type1;
			std::vector<VKRStep *> type2;
			type1.reserve((last - i) / 2);
			type2.reserve((last - i) / 2);
			for (int n = i; n <= last; n++) {
				if (steps[n]->render.framebuffer == steps[i]->render.framebuffer)
					type1.push_back(steps[n]);
				else
					type2.push_back(steps[n]);
			}

			// Write the renders back in order. Same amount, so deletion will work fine.
			for (int j = 0; j < (int)type1.size(); j++) {
				steps[i + j] = type1[j];
			}
			for (int j = 0; j < (int)type2.size(); j++) {
				steps[i + j + type1.size()] = type2[j];
			}

			// Combine the renders.
			for (int j = 1; j < (int)type1.size(); j++) {
				for (int k = 0; k < (int)type1[j]->commands.size(); k++) {
					steps[i]->commands.push_back(type1[j]->commands[k]);
				}
				steps[i + j]->stepType = VKRStepType::RENDER_SKIP;
			}
			for (int j = 1; j < (int)type2.size(); j++) {
				for (int k = 0; k < (int)type2[j]->commands.size(); k++) {
					steps[i + type1.size()]->commands.push_back(type2[j]->commands[k]);
				}
				steps[i + j + type1.size()]->stepType = VKRStepType::RENDER_SKIP;
			}

			// We're done.
			break;
		}
	}
}

const char *AspectToString(VkImageAspectFlags aspect) {
	switch (aspect) {
	case VK_IMAGE_ASPECT_COLOR_BIT: return "COLOR";
	case VK_IMAGE_ASPECT_DEPTH_BIT: return "DEPTH";
	case VK_IMAGE_ASPECT_STENCIL_BIT: return "STENCIL";
	case VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT: return "DEPTHSTENCIL";
	default: return "UNUSUAL";
	}
}

std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
	char buffer[256];
	switch (step.stepType) {
	case VKRStepType::RENDER: {
		int w = step.render.framebuffer ? step.render.framebuffer->width : vulkan_->GetBackbufferWidth();
		int h = step.render.framebuffer ?
			step.render.framebuffer->height : vulkan_->GetBackbufferHeight();
		snprintf(buffer, sizeof(buffer), "RENDER %s (draws: %d, %dx%d, fb: %p, )", step.tag, step.render.numDraws, w, h, step.render.framebuffer);
		break;
	}
	case VKRStepType::COPY:
		snprintf(buffer, sizeof(buffer), "COPY '%s' %s -> %s (%dx%d, %s)", step.tag, step.copy.src->tag.c_str(), step.copy.dst->tag.c_str(),
			step.copy.srcRect.extent.width, step.copy.srcRect.extent.height, AspectToString(step.copy.aspectMask));
		break;
	case VKRStepType::BLIT:
		snprintf(buffer, sizeof(buffer), "BLIT '%s' %s -> %s (%dx%d->%dx%d, %s)", step.tag, step.copy.src->tag.c_str(), step.copy.dst->tag.c_str(),
			step.blit.srcRect.extent.width, step.blit.srcRect.extent.height, step.blit.dstRect.extent.width, step.blit.dstRect.extent.height, AspectToString(step.blit.aspectMask));
		break;
	case VKRStepType::READBACK:
		snprintf(buffer, sizeof(buffer), "READBACK '%s' %s (%dx%d, %s)", step.tag, step.readback.src->tag.c_str(),
			step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, AspectToString(step.readback.aspectMask));
		break;
	case VKRStepType::READBACK_IMAGE:
		snprintf(buffer, sizeof(buffer), "READBACK_IMAGE '%s' (%dx%d)", step.tag, step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height);
		break;
	case VKRStepType::RENDER_SKIP:
		snprintf(buffer, sizeof(buffer), "(RENDER_SKIP) %s", step.tag);
		break;
	default:
		buffer[0] = 0;
		break;
	}
	return std::string(buffer);
}

// Ideally, this should be cheap enough to be applied to all games. At least on mobile, it's pretty
// much a guaranteed neutral or win in terms of GPU power. However, dependency calculation really
// must be perfect!
void VulkanQueueRunner::ApplyRenderPassMerge(std::vector<VKRStep *> &steps) {
	// First let's count how many times each framebuffer is rendered to.
	// If it's more than one, let's do our best to merge them. This can help God of War quite a bit.
	std::unordered_map<VKRFramebuffer *, int> counts;
	for (int i = 0; i < (int)steps.size(); i++) {
		if (steps[i]->stepType == VKRStepType::RENDER) {
			counts[steps[i]->render.framebuffer]++;
		}
	}

	auto mergeRenderSteps = [](VKRStep *dst, VKRStep *src) {
		// OK. Now, if it's a render, slurp up all the commands and kill the step.
		// Also slurp up any pretransitions.
		dst->preTransitions.insert(dst->preTransitions.end(), src->preTransitions.begin(), src->preTransitions.end());
		dst->commands.insert(dst->commands.end(), src->commands.begin(), src->commands.end());
		// So we don't consider it for other things, maybe doesn't matter.
		src->dependencies.clear();
		src->stepType = VKRStepType::RENDER_SKIP;
	};
	auto renderHasClear = [](const VKRStep *step) {
		const auto &r = step->render;
		return r.color == VKRRenderPassAction::CLEAR || r.depth == VKRRenderPassAction::CLEAR || r.stencil == VKRRenderPassAction::CLEAR;
	};

	// Now, let's go through the steps. If we find one that is rendered to more than once,
	// we'll scan forward and slurp up any rendering that can be merged across.
	for (int i = 0; i < (int)steps.size(); i++) {
		if (steps[i]->stepType == VKRStepType::RENDER && counts[steps[i]->render.framebuffer] > 1) {
			auto fb = steps[i]->render.framebuffer;
			TinySet<VKRFramebuffer *, 8> touchedFramebuffers;  // must be the same fast-size as the dependencies TinySet for annoying reasons.
			for (int j = i + 1; j < (int)steps.size(); j++) {
				// If any other passes are reading from this framebuffer as-is, we cancel the scan.
				if (steps[j]->dependencies.contains(fb)) {
					// Reading from itself means a KEEP, which is okay.
					if (steps[j]->stepType != VKRStepType::RENDER || steps[j]->render.framebuffer != fb)
						break;
				}
				switch (steps[j]->stepType) {
				case VKRStepType::RENDER:
					if (steps[j]->render.framebuffer == fb) {
						// Prevent Unknown's example case from https://github.com/hrydgard/ppsspp/pull/12242
						if (renderHasClear(steps[j]) || steps[j]->dependencies.contains(touchedFramebuffers)) {
							goto done_fb;
						} else {
							// Safe to merge, great.
							mergeRenderSteps(steps[i], steps[j]);
						}
					} else {
						// Remember the framebuffer this wrote to. We can't merge with later passes that depend on these.
						touchedFramebuffers.insert(steps[j]->render.framebuffer);
					}
					break;
				case VKRStepType::COPY:
					if (steps[j]->copy.dst == fb) {
						// Without framebuffer "renaming", we can't merge past a clobbered fb.
						goto done_fb;
					}
					touchedFramebuffers.insert(steps[j]->copy.dst);
					break;
				case VKRStepType::BLIT:
					if (steps[j]->blit.dst == fb) {
						// Without framebuffer "renaming", we can't merge past a clobbered fb.
						goto done_fb;
					}
					touchedFramebuffers.insert(steps[j]->blit.dst);
					break;
				case VKRStepType::READBACK:
					// Not sure this has much effect, when executed READBACK is always the last step
					// since we stall the GPU and wait immediately after.
					break;
				case VKRStepType::RENDER_SKIP:
				case VKRStepType::READBACK_IMAGE:
					break;
				default:
					// We added a new step? Might be unsafe.
					goto done_fb;
				}
			}
		done_fb:
			;
		}
	}
}

void VulkanQueueRunner::LogSteps(const std::vector<VKRStep *> &steps, bool verbose) {
	INFO_LOG(G3D, "=================== FRAME ====================");
	for (size_t i = 0; i < steps.size(); i++) {
		const VKRStep &step = *steps[i];
		switch (step.stepType) {
		case VKRStepType::RENDER:
			LogRenderPass(step, verbose);
			break;
		case VKRStepType::COPY:
			LogCopy(step);
			break;
		case VKRStepType::BLIT:
			LogBlit(step);
			break;
		case VKRStepType::READBACK:
			LogReadback(step);
			break;
		case VKRStepType::READBACK_IMAGE:
			LogReadbackImage(step);
			break;
		case VKRStepType::RENDER_SKIP:
			INFO_LOG(G3D, "(skipped render pass)");
			break;
		}
	}
	INFO_LOG(G3D, "------------------- SUBMIT ------------------");
}

const char *RenderPassActionName(VKRRenderPassAction a) {
	switch (a) {
	case VKRRenderPassAction::CLEAR: return "CLEAR";
	case VKRRenderPassAction::DONT_CARE: return "DONT_CARE";
	case VKRRenderPassAction::KEEP: return "KEEP";
	}
	return "?";
}

const char *ImageLayoutToString(VkImageLayout layout) {
	switch (layout) {
	case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: return "COLOR_ATTACHMENT";
	case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: return "DEPTH_STENCIL_ATTACHMENT";
	case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: return "SHADER_READ_ONLY";
	case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: return "TRANSFER_SRC";
	case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: return "TRANSFER_DST";
	case VK_IMAGE_LAYOUT_GENERAL: return "GENERAL";
	case VK_IMAGE_LAYOUT_PRESENT_SRC_KHR: return "PRESENT_SRC_KHR";
	case VK_IMAGE_LAYOUT_UNDEFINED: return "UNDEFINED";
	default: return "(unknown)";
	}
}

void VulkanQueueRunner::LogRenderPass(const VKRStep &pass, bool verbose) {
	const auto &r = pass.render;
	const char *framebuf = r.framebuffer ? r.framebuffer->tag.c_str() : "backbuffer";
	int w = r.framebuffer ? r.framebuffer->width : vulkan_->GetBackbufferWidth();
	int h = r.framebuffer ? r.framebuffer->height : vulkan_->GetBackbufferHeight();
	INFO_LOG(G3D, "RENDER %s Begin(%s, draws: %d, %dx%d, %s, %s, %s)", pass.tag, framebuf, r.numDraws, w, h, RenderPassActionName(r.color), RenderPassActionName(r.depth), RenderPassActionName(r.stencil));
	// TODO: Log these in detail.
for (int i = 0; i < pass.preTransitions.size(); i++) { INFO_LOG(G3D, " PRETRANSITION: %s %s -> %s", pass.preTransitions[i].fb->tag.c_str(), AspectToString(pass.preTransitions[i].aspect), ImageLayoutToString(pass.preTransitions[i].targetLayout)); } if (verbose) { for (auto &cmd : pass.commands) { switch (cmd.cmd) { case VKRRenderCommand::REMOVED: INFO_LOG(G3D, " (Removed)"); break; case VKRRenderCommand::BIND_PIPELINE: INFO_LOG(G3D, " BindPipeline(%x)", (int)(intptr_t)cmd.pipeline.pipeline); break; case VKRRenderCommand::BLEND: INFO_LOG(G3D, " BlendColor(%08x)", cmd.blendColor.color); break; case VKRRenderCommand::CLEAR: INFO_LOG(G3D, " Clear"); break; case VKRRenderCommand::DRAW: INFO_LOG(G3D, " Draw(%d)", cmd.draw.count); break; case VKRRenderCommand::DRAW_INDEXED: INFO_LOG(G3D, " DrawIndexed(%d)", cmd.drawIndexed.count); break; case VKRRenderCommand::SCISSOR: INFO_LOG(G3D, " Scissor(%d, %d, %d, %d)", (int)cmd.scissor.scissor.offset.x, (int)cmd.scissor.scissor.offset.y, (int)cmd.scissor.scissor.extent.width, (int)cmd.scissor.scissor.extent.height); break; case VKRRenderCommand::STENCIL: INFO_LOG(G3D, " Stencil(ref=%d, compare=%d, write=%d)", cmd.stencil.stencilRef, cmd.stencil.stencilCompareMask, cmd.stencil.stencilWriteMask); break; case VKRRenderCommand::VIEWPORT: INFO_LOG(G3D, " Viewport(%f, %f, %f, %f, %f, %f)", cmd.viewport.vp.x, cmd.viewport.vp.y, cmd.viewport.vp.width, cmd.viewport.vp.height, cmd.viewport.vp.minDepth, cmd.viewport.vp.maxDepth); break; case VKRRenderCommand::PUSH_CONSTANTS: INFO_LOG(G3D, " PushConstants(%d)", cmd.push.size); break; case VKRRenderCommand::NUM_RENDER_COMMANDS: break; } } } INFO_LOG(G3D, " Final: %s %s", ImageLayoutToString(pass.render.finalColorLayout), ImageLayoutToString(pass.render.finalDepthStencilLayout)); INFO_LOG(G3D, "RENDER End(%s) - %d commands executed", framebuf, (int)pass.commands.size()); } void VulkanQueueRunner::LogCopy(const VKRStep &step) { INFO_LOG(G3D, "%s", StepToString(step).c_str()); } void VulkanQueueRunner::LogBlit(const VKRStep &step) { INFO_LOG(G3D, "%s", StepToString(step).c_str()); } void VulkanQueueRunner::LogReadback(const VKRStep &step) { INFO_LOG(G3D, "%s", StepToString(step).c_str()); } void VulkanQueueRunner::LogReadbackImage(const VKRStep &step) { INFO_LOG(G3D, "%s", StepToString(step).c_str()); } void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer cmd) { // TODO: If there are multiple, we can transition them together. 
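	// Pre-transitions are explicit barriers for images belonging to *other* framebuffers that this
	// pass will sample from; the target framebuffer's own attachments are transitioned by the
	// render pass itself (see PerformBindFramebufferAsRenderTarget below).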
for (const auto &iter : step.preTransitions) { if (iter.aspect == VK_IMAGE_ASPECT_COLOR_BIT && iter.fb->color.layout != iter.targetLayout) { VkImageMemoryBarrier barrier{ VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER }; barrier.oldLayout = iter.fb->color.layout; barrier.subresourceRange.layerCount = 1; barrier.subresourceRange.levelCount = 1; barrier.image = iter.fb->color.image; VkPipelineStageFlags srcStage{}; VkPipelineStageFlags dstStage{}; switch (barrier.oldLayout) { case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; srcStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; break; case VK_IMAGE_LAYOUT_UNDEFINED: barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; srcStage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; srcStage = VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; srcStage = VK_PIPELINE_STAGE_TRANSFER_BIT; break; default: _assert_msg_(false, "PerformRenderPass: Unexpected oldLayout: %d", (int)barrier.oldLayout); break; } barrier.newLayout = iter.targetLayout; switch (barrier.newLayout) { case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; default: _assert_msg_(false, "PerformRenderPass: Unexpected newLayout: %d", (int)barrier.newLayout); break; } barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; vkCmdPipelineBarrier(cmd, srcStage, dstStage, 0, 0, nullptr, 0, nullptr, 1, &barrier); iter.fb->color.layout = barrier.newLayout; } else if ((iter.aspect & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) && iter.fb->depth.layout != iter.targetLayout) { VkImageMemoryBarrier barrier{ VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER }; barrier.oldLayout = iter.fb->depth.layout; barrier.subresourceRange.layerCount = 1; barrier.subresourceRange.levelCount = 1; barrier.image = iter.fb->depth.image; barrier.srcAccessMask = 0; VkPipelineStageFlags srcStage; VkPipelineStageFlags dstStage; switch (barrier.oldLayout) { case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; srcStage = VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; srcStage = VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; srcStage = VK_PIPELINE_STAGE_TRANSFER_BIT; break; default: Crash(); break; } barrier.newLayout = iter.targetLayout; switch (barrier.newLayout) { case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; dstStage = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; default: Crash(); break; } barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; vkCmdPipelineBarrier(cmd, srcStage, dstStage, 0, 0, nullptr, 0, nullptr, 1, &barrier); 
iter.fb->depth.layout = barrier.newLayout; } } // Don't execute empty renderpasses that keep the contents. if (step.commands.empty() && step.render.color == VKRRenderPassAction::KEEP && step.render.depth == VKRRenderPassAction::KEEP && step.render.stencil == VKRRenderPassAction::KEEP) { // Nothing to do. // TODO: Though - a later step might have used this step's finalColorLayout etc to get things in a layout it expects. // Should we just do a barrier? Or just let the later step deal with not having things in its preferred layout, like now? return; } // Write-after-write hazards. Fixed flicker in God of War on ARM (before we added another fix that removed these). if (step.render.framebuffer) { int n = 0; int stage = 0; VkImageMemoryBarrier barriers[2]{}; if (step.render.framebuffer->color.layout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL) { barriers[n].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[n].oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; barriers[n].newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; barriers[n].subresourceRange.layerCount = 1; barriers[n].subresourceRange.levelCount = 1; barriers[n].image = step.render.framebuffer->color.image; barriers[n].srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; barriers[n].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; barriers[n].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; barriers[n].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[n].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; n++; } if (step.render.framebuffer->depth.layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL) { barriers[n].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barriers[n].oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; barriers[n].newLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; barriers[n].subresourceRange.layerCount = 1; barriers[n].subresourceRange.levelCount = 1; barriers[n].image = step.render.framebuffer->depth.image; barriers[n].srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; barriers[n].dstAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; barriers[n].subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; barriers[n].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; barriers[n].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; stage |= VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; n++; } if (stage) { vkCmdPipelineBarrier(cmd, stage, stage, 0, 0, nullptr, 0, nullptr, n, barriers); // No need to modify the image layouts here - it's just an execution barrier. } } // This reads the layout of the color and depth images, and chooses a render pass using them that // will transition to the desired final layout. PerformBindFramebufferAsRenderTarget(step, cmd); int curWidth = step.render.framebuffer ? step.render.framebuffer->width : vulkan_->GetBackbufferWidth(); int curHeight = step.render.framebuffer ? step.render.framebuffer->height : vulkan_->GetBackbufferHeight(); VKRFramebuffer *fb = step.render.framebuffer; VkPipeline lastPipeline = VK_NULL_HANDLE; auto &commands = step.commands; // We can do a little bit of state tracking here to eliminate some calls into the driver. // The stencil ones are very commonly mostly redundant so let's eliminate them where possible. 
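	// -1 means "not set yet", so the first STENCIL command after a pipeline bind always reaches the driver.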
int lastStencilWriteMask = -1; int lastStencilCompareMask = -1; int lastStencilReference = -1; for (const auto &c : commands) { switch (c.cmd) { case VKRRenderCommand::REMOVED: break; case VKRRenderCommand::BIND_PIPELINE: if (c.pipeline.pipeline != lastPipeline) { vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, c.pipeline.pipeline); lastPipeline = c.pipeline.pipeline; // Reset dynamic state so it gets refreshed with the new pipeline. lastStencilWriteMask = -1; lastStencilCompareMask = -1; lastStencilReference = -1; } break; case VKRRenderCommand::VIEWPORT: if (fb != nullptr) { vkCmdSetViewport(cmd, 0, 1, &c.viewport.vp); } else { const VkViewport &vp = c.viewport.vp; DisplayRect rc{ vp.x, vp.y, vp.width, vp.height }; RotateRectToDisplay(rc, (float)vulkan_->GetBackbufferWidth(), (float)vulkan_->GetBackbufferHeight()); VkViewport final_vp; final_vp.x = rc.x; final_vp.y = rc.y; final_vp.width = rc.w; final_vp.height = rc.h; final_vp.maxDepth = vp.maxDepth; final_vp.minDepth = vp.minDepth; vkCmdSetViewport(cmd, 0, 1, &final_vp); } break; case VKRRenderCommand::SCISSOR: { if (fb != nullptr) { vkCmdSetScissor(cmd, 0, 1, &c.scissor.scissor); } else { // Rendering to backbuffer. Might need to rotate. const VkRect2D &rc = c.scissor.scissor; DisplayRect rotated_rc{ rc.offset.x, rc.offset.y, (int)rc.extent.width, (int)rc.extent.height }; RotateRectToDisplay(rotated_rc, vulkan_->GetBackbufferWidth(), vulkan_->GetBackbufferHeight()); _dbg_assert_(rotated_rc.x >= 0); _dbg_assert_(rotated_rc.y >= 0); VkRect2D finalRect = VkRect2D{ { rotated_rc.x, rotated_rc.y }, { (uint32_t)rotated_rc.w, (uint32_t)rotated_rc.h} }; vkCmdSetScissor(cmd, 0, 1, &finalRect); } break; } case VKRRenderCommand::BLEND: { float bc[4]; Uint8x4ToFloat4(bc, c.blendColor.color); vkCmdSetBlendConstants(cmd, bc); break; } case VKRRenderCommand::PUSH_CONSTANTS: vkCmdPushConstants(cmd, c.push.pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data); break; case VKRRenderCommand::STENCIL: if (lastStencilWriteMask != c.stencil.stencilWriteMask) { lastStencilWriteMask = (int)c.stencil.stencilWriteMask; vkCmdSetStencilWriteMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilWriteMask); } if (lastStencilCompareMask != c.stencil.stencilCompareMask) { lastStencilCompareMask = c.stencil.stencilCompareMask; vkCmdSetStencilCompareMask(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilCompareMask); } if (lastStencilReference != c.stencil.stencilRef) { lastStencilReference = c.stencil.stencilRef; vkCmdSetStencilReference(cmd, VK_STENCIL_FRONT_AND_BACK, c.stencil.stencilRef); } break; case VKRRenderCommand::DRAW_INDEXED: vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, c.drawIndexed.pipelineLayout, 0, 1, &c.drawIndexed.ds, c.drawIndexed.numUboOffsets, c.drawIndexed.uboOffsets); vkCmdBindIndexBuffer(cmd, c.drawIndexed.ibuffer, c.drawIndexed.ioffset, c.drawIndexed.indexType); vkCmdBindVertexBuffers(cmd, 0, 1, &c.drawIndexed.vbuffer, &c.drawIndexed.voffset); vkCmdDrawIndexed(cmd, c.drawIndexed.count, c.drawIndexed.instances, 0, 0, 0); break; case VKRRenderCommand::DRAW: vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, c.draw.pipelineLayout, 0, 1, &c.draw.ds, c.draw.numUboOffsets, c.draw.uboOffsets); if (c.draw.vbuffer) { vkCmdBindVertexBuffers(cmd, 0, 1, &c.draw.vbuffer, &c.draw.voffset); } vkCmdDraw(cmd, c.draw.count, 1, c.draw.offset, 0); break; case VKRRenderCommand::CLEAR: { // If we get here, we failed to merge a clear into a render pass load op. This is bad for perf. 
int numAttachments = 0; VkClearRect rc{}; rc.baseArrayLayer = 0; rc.layerCount = 1; rc.rect.extent.width = (uint32_t)curWidth; rc.rect.extent.height = (uint32_t)curHeight; VkClearAttachment attachments[2]{}; if (c.clear.clearMask & VK_IMAGE_ASPECT_COLOR_BIT) { VkClearAttachment &attachment = attachments[numAttachments++]; attachment.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; attachment.colorAttachment = 0; Uint8x4ToFloat4(attachment.clearValue.color.float32, c.clear.clearColor); } if (c.clear.clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { VkClearAttachment &attachment = attachments[numAttachments++]; attachment.aspectMask = 0; if (c.clear.clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) { attachment.clearValue.depthStencil.depth = c.clear.clearZ; attachment.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; } if (c.clear.clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) { attachment.clearValue.depthStencil.stencil = (uint32_t)c.clear.clearStencil; attachment.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; } } if (numAttachments) { vkCmdClearAttachments(cmd, numAttachments, attachments, 1, &rc); } break; } default: ERROR_LOG(G3D, "Unimpl queue command"); ; } } vkCmdEndRenderPass(cmd); // The renderpass handles the layout transition. if (fb) { fb->color.layout = step.render.finalColorLayout; fb->depth.layout = step.render.finalDepthStencilLayout; } } void VulkanQueueRunner::PerformBindFramebufferAsRenderTarget(const VKRStep &step, VkCommandBuffer cmd) { VkRenderPass renderPass; int numClearVals = 0; VkClearValue clearVal[2]{}; VkFramebuffer framebuf; int w; int h; if (step.render.framebuffer) { _dbg_assert_(step.render.finalColorLayout != VK_IMAGE_LAYOUT_UNDEFINED); _dbg_assert_(step.render.finalDepthStencilLayout != VK_IMAGE_LAYOUT_UNDEFINED); VKRFramebuffer *fb = step.render.framebuffer; framebuf = fb->framebuf; w = fb->width; h = fb->height; // Mali driver on S8 (Android O) and S9 mishandles renderpasses that do just a clear // and then no draw calls. Memory transaction elimination gets mis-flagged or something. // To avoid this, we transition to GENERAL and back in this case (ARM-approved workaround). // See pull request #10723. bool maliBugWorkaround = step.render.numDraws == 0 && step.render.color == VKRRenderPassAction::CLEAR && vulkan_->GetPhysicalDeviceProperties().properties.driverVersion == 0xaa9c4b29; if (maliBugWorkaround) { TransitionImageLayout2(cmd, step.render.framebuffer->color.image, 0, 1, VK_IMAGE_ASPECT_COLOR_BIT, fb->color.layout, VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT); fb->color.layout = VK_IMAGE_LAYOUT_GENERAL; } renderPass = GetRenderPass( step.render.color, step.render.depth, step.render.stencil, fb->color.layout, fb->depth.layout, step.render.finalColorLayout, step.render.finalDepthStencilLayout); // We now do any layout pretransitions as part of the render pass. 
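		// The render pass's finalLayout fields take care of the actual transition when the pass ends,
		// so we can record the new layouts on the framebuffer right away.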
fb->color.layout = step.render.finalColorLayout; fb->depth.layout = step.render.finalDepthStencilLayout; if (step.render.color == VKRRenderPassAction::CLEAR) { Uint8x4ToFloat4(clearVal[0].color.float32, step.render.clearColor); numClearVals = 1; } if (step.render.depth == VKRRenderPassAction::CLEAR || step.render.stencil == VKRRenderPassAction::CLEAR) { clearVal[1].depthStencil.depth = step.render.clearDepth; clearVal[1].depthStencil.stencil = step.render.clearStencil; numClearVals = 2; } } else { framebuf = backbuffer_; w = vulkan_->GetBackbufferWidth(); h = vulkan_->GetBackbufferHeight(); renderPass = GetBackbufferRenderPass(); Uint8x4ToFloat4(clearVal[0].color.float32, step.render.clearColor); numClearVals = 2; // We don't bother with a depth buffer here. clearVal[1].depthStencil.depth = 0.0f; clearVal[1].depthStencil.stencil = 0; } VkRenderPassBeginInfo rp_begin = { VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO }; rp_begin.renderPass = renderPass; rp_begin.framebuffer = framebuf; rp_begin.renderArea.offset.x = 0; rp_begin.renderArea.offset.y = 0; rp_begin.renderArea.extent.width = w; rp_begin.renderArea.extent.height = h; rp_begin.clearValueCount = numClearVals; rp_begin.pClearValues = numClearVals ? clearVal : nullptr; vkCmdBeginRenderPass(cmd, &rp_begin, VK_SUBPASS_CONTENTS_INLINE); } void VulkanQueueRunner::PerformCopy(const VKRStep &step, VkCommandBuffer cmd) { VKRFramebuffer *src = step.copy.src; VKRFramebuffer *dst = step.copy.dst; VkImageCopy copy{}; copy.srcOffset.x = step.copy.srcRect.offset.x; copy.srcOffset.y = step.copy.srcRect.offset.y; copy.srcOffset.z = 0; copy.srcSubresource.mipLevel = 0; copy.srcSubresource.layerCount = 1; copy.dstOffset.x = step.copy.dstPos.x; copy.dstOffset.y = step.copy.dstPos.y; copy.dstOffset.z = 0; copy.dstSubresource.mipLevel = 0; copy.dstSubresource.layerCount = 1; copy.extent.width = step.copy.srcRect.extent.width; copy.extent.height = step.copy.srcRect.extent.height; copy.extent.depth = 1; VkImageMemoryBarrier srcBarriers[2]{}; VkImageMemoryBarrier dstBarriers[2]{}; int srcCount = 0; int dstCount = 0; VkPipelineStageFlags srcStage = 0; VkPipelineStageFlags dstStage = 0; // First source barriers. if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { if (src->color.layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) { SetupTransitionToTransferSrc(src->color, srcBarriers[srcCount++], srcStage, VK_IMAGE_ASPECT_COLOR_BIT); } if (dst->color.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { SetupTransitionToTransferDst(dst->color, dstBarriers[dstCount++], dstStage, VK_IMAGE_ASPECT_COLOR_BIT); } } // We can't copy only depth or only stencil unfortunately - or can we?. 
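	// Either way, the barriers below always cover both aspects of a packed depth/stencil image
	// (SetupTransitionToTransferSrc/Dst force the combined mask for such formats), so we pass both bits here.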
	if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		if (src->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) {
			SetupTransitionToTransferSrc(src->depth, srcBarriers[srcCount++], srcStage, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		}
		if (dst->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
			SetupTransitionToTransferDst(dst->depth, dstBarriers[dstCount++], dstStage, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
			_dbg_assert_(dst->depth.layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
		}
	}

	if (srcCount) {
		vkCmdPipelineBarrier(cmd, srcStage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, srcCount, srcBarriers);
	}
	if (dstCount) {
		vkCmdPipelineBarrier(cmd, dstStage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, dstCount, dstBarriers);
	}

	if (step.copy.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		copy.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
		vkCmdCopyImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &copy);
	}
	if (step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
		copy.srcSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		copy.dstSubresource.aspectMask = step.copy.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
		vkCmdCopyImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, &copy);
	}
}

void VulkanQueueRunner::PerformBlit(const VKRStep &step, VkCommandBuffer cmd) {
	VkImageMemoryBarrier srcBarriers[2]{};
	VkImageMemoryBarrier dstBarriers[2]{};

	VKRFramebuffer *src = step.blit.src;
	VKRFramebuffer *dst = step.blit.dst;

	// If any validation needs to be performed here, it should probably have been done
	// already when the blit was queued. So don't validate here.
	VkImageBlit blit{};
	blit.srcOffsets[0].x = step.blit.srcRect.offset.x;
	blit.srcOffsets[0].y = step.blit.srcRect.offset.y;
	blit.srcOffsets[0].z = 0;
	blit.srcOffsets[1].x = step.blit.srcRect.offset.x + step.blit.srcRect.extent.width;
	blit.srcOffsets[1].y = step.blit.srcRect.offset.y + step.blit.srcRect.extent.height;
	blit.srcOffsets[1].z = 1;
	blit.srcSubresource.mipLevel = 0;
	blit.srcSubresource.layerCount = 1;
	blit.dstOffsets[0].x = step.blit.dstRect.offset.x;
	blit.dstOffsets[0].y = step.blit.dstRect.offset.y;
	blit.dstOffsets[0].z = 0;
	blit.dstOffsets[1].x = step.blit.dstRect.offset.x + step.blit.dstRect.extent.width;
	blit.dstOffsets[1].y = step.blit.dstRect.offset.y + step.blit.dstRect.extent.height;
	blit.dstOffsets[1].z = 1;
	blit.dstSubresource.mipLevel = 0;
	blit.dstSubresource.layerCount = 1;

	VkPipelineStageFlags srcStage = 0;
	VkPipelineStageFlags dstStage = 0;

	int srcCount = 0;
	int dstCount = 0;

	// First source barriers.
	if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
		if (src->color.layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) {
			SetupTransitionToTransferSrc(src->color, srcBarriers[srcCount++], srcStage, VK_IMAGE_ASPECT_COLOR_BIT);
		}
		if (dst->color.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) {
			SetupTransitionToTransferDst(dst->color, dstBarriers[dstCount++], dstStage, VK_IMAGE_ASPECT_COLOR_BIT);
		}
	}

	// We can't copy only depth or only stencil unfortunately.
if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { if (src->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) { SetupTransitionToTransferSrc(src->depth, srcBarriers[srcCount++], srcStage, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT); } if (dst->depth.layout != VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL) { SetupTransitionToTransferDst(dst->depth, dstBarriers[dstCount++], dstStage, VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT); } } if (srcCount) { vkCmdPipelineBarrier(cmd, srcStage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, srcCount, srcBarriers); } if (dstCount) { vkCmdPipelineBarrier(cmd, dstStage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, dstCount, dstBarriers); } if (step.blit.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) { blit.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; blit.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; vkCmdBlitImage(cmd, src->color.image, src->color.layout, dst->color.image, dst->color.layout, 1, &blit, step.blit.filter); } // TODO: Need to check if the depth format is blittable. // Actually, we should probably almost always use copies rather than blits for depth buffers. if (step.blit.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { blit.srcSubresource.aspectMask = 0; blit.dstSubresource.aspectMask = 0; if (step.blit.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) { blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT; } if (step.blit.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) { blit.srcSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; blit.dstSubresource.aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT; } vkCmdBlitImage(cmd, src->depth.image, src->depth.layout, dst->depth.image, dst->depth.layout, 1, &blit, step.blit.filter); } } void VulkanQueueRunner::SetupTransitionToTransferSrc(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect) { barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; barrier.oldLayout = img.layout; barrier.subresourceRange.layerCount = 1; barrier.subresourceRange.levelCount = 1; barrier.image = img.image; barrier.srcAccessMask = 0; switch (img.layout) { case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT; stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; break; case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; stage |= VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; break; case VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; stage |= VK_PIPELINE_STAGE_TRANSFER_BIT; break; case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL: barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT; stage |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; break; default: _dbg_assert_msg_(false, "Transition from this layout to transfer src not supported (%d)", (int)img.layout); break; } barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; if (img.format == VK_FORMAT_D16_UNORM_S8_UINT || img.format == VK_FORMAT_D24_UNORM_S8_UINT || img.format == VK_FORMAT_D32_SFLOAT_S8_UINT) { // Barrier must specify both for combined depth/stencil buffers. 
		barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
	} else {
		barrier.subresourceRange.aspectMask = aspect;
	}
	barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	img.layout = barrier.newLayout;

	// NOTE: Must do this AFTER updating img.layout to avoid behaviour differences.
#ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR
	if (aspect == VK_IMAGE_ASPECT_COLOR_BIT) {
		if (barrier.oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL || barrier.oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
		if (barrier.newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL || barrier.newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
	}
#endif
#ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_DEPTH_STENCIL
	if (aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
		if (barrier.oldLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || barrier.oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
		if (barrier.newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || barrier.newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
	}
#endif
}

void VulkanQueueRunner::SetupTransitionToTransferDst(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect) {
	barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
	barrier.oldLayout = img.layout;
	barrier.subresourceRange.layerCount = 1;
	barrier.subresourceRange.levelCount = 1;
	barrier.image = img.image;
	barrier.srcAccessMask = 0;
	switch (img.layout) {
	case VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL:
		barrier.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
		stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
		break;
	case VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL:
		barrier.srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
		stage |= VK_PIPELINE_STAGE_TRANSFER_BIT;
		break;
	case VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL:
		barrier.srcAccessMask = VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
		stage |= VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
		break;
	case VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL:
		barrier.srcAccessMask = VK_ACCESS_SHADER_READ_BIT;
		stage |= VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT;
		break;
	default:
		_dbg_assert_msg_(false, "Transition from this layout to transfer dst not supported (%d)", (int)img.layout);
		break;
	}
	barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
	barrier.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;

	if (img.format == VK_FORMAT_D16_UNORM_S8_UINT || img.format == VK_FORMAT_D24_UNORM_S8_UINT || img.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
		// Barrier must specify both for combined depth/stencil buffers.
		barrier.subresourceRange.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
	} else {
		barrier.subresourceRange.aspectMask = aspect;
	}
	barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
	img.layout = barrier.newLayout;

	// NOTE: Must do this AFTER updating img.layout to avoid behaviour differences.
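	// Same pattern as in SetupTransitionToTransferSrc above: the VULKAN_USE_GENERAL_LAYOUT_FOR_* defines
	// (set elsewhere in the codebase) presumably exist as debugging/driver-workaround switches, forcing
	// VK_IMAGE_LAYOUT_GENERAL in place of the optimal attachment and shader-read layouts.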
#ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR
	if (aspect == VK_IMAGE_ASPECT_COLOR_BIT) {
		if (barrier.oldLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL || barrier.oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
		if (barrier.newLayout == VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL || barrier.newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
	}
#endif
#ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_DEPTH_STENCIL
	if (aspect != VK_IMAGE_ASPECT_COLOR_BIT) {
		if (barrier.oldLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || barrier.oldLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
		if (barrier.newLayout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL || barrier.newLayout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL)
			barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
	}
#endif
}

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd) {
	ResizeReadbackBuffer(sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height);

	VkBufferImageCopy region{};
	region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
	region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
	region.imageSubresource.aspectMask = step.readback.aspectMask;
	region.imageSubresource.layerCount = 1;
	region.bufferOffset = 0;
	region.bufferRowLength = step.readback.srcRect.extent.width;
	region.bufferImageHeight = step.readback.srcRect.extent.height;

	VkImage image;
	VkImageLayout copyLayout;
	// Special case for backbuffer readbacks.
	if (step.readback.src == nullptr) {
		// We only take screenshots after the main render pass (anything else would be stupid) so we need to
		// transition out of PRESENT, and then back into it.
		TransitionImageLayout2(cmd, backbufferImage_, 0, 1, VK_IMAGE_ASPECT_COLOR_BIT,
			VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
			VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
			0, VK_ACCESS_TRANSFER_READ_BIT);
		copyLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
		image = backbufferImage_;
	} else {
		VKRImage *srcImage;
		if (step.readback.aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
			srcImage = &step.readback.src->color;
		} else if (step.readback.aspectMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
			srcImage = &step.readback.src->depth;
		} else {
			_dbg_assert_msg_(false, "No image aspect to readback?");
			return;
		}

		VkImageMemoryBarrier barrier{ VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
		VkPipelineStageFlags stage = 0;
		if (srcImage->layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL) {
			SetupTransitionToTransferSrc(*srcImage, barrier, stage, step.readback.aspectMask);
			vkCmdPipelineBarrier(cmd, stage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier);
		}
		image = srcImage->image;
		copyLayout = srcImage->layout;
	}

	vkCmdCopyImageToBuffer(cmd, image, copyLayout, readbackBuffer_, 1, &region);

	// NOTE: Can't read the buffer using the CPU here - need to sync first.

	// If we copied from the backbuffer, transition it back.
	if (step.readback.src == nullptr) {
		// We only take screenshots after the main render pass (anything else would be stupid) so we need to
		// transition out of PRESENT, and then back into it.
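		// (The copy has already been recorded above, so the transition back here is what the present at the
		// end of the frame relies on to find the swapchain image in PRESENT_SRC_KHR again.)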
		TransitionImageLayout2(cmd, backbufferImage_, 0, 1, VK_IMAGE_ASPECT_COLOR_BIT,
			VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
			VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
			VK_ACCESS_TRANSFER_READ_BIT, 0);
	}
}

void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffer cmd) {
	// TODO: Clean this up - just reusing SetupTransitionToTransferSrc.
	VKRImage srcImage{};
	srcImage.image = step.readback_image.image;
	srcImage.layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;

	VkImageMemoryBarrier barrier{ VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER };
	VkPipelineStageFlags stage = 0;
	SetupTransitionToTransferSrc(srcImage, barrier, stage, VK_IMAGE_ASPECT_COLOR_BIT);
	vkCmdPipelineBarrier(cmd, stage, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 0, nullptr, 1, &barrier);

	ResizeReadbackBuffer(sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);

	VkBufferImageCopy region{};
	region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };
	region.imageExtent = { step.readback_image.srcRect.extent.width, step.readback_image.srcRect.extent.height, 1 };
	region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
	region.imageSubresource.layerCount = 1;
	region.imageSubresource.mipLevel = step.readback_image.mipLevel;
	region.bufferOffset = 0;
	region.bufferRowLength = step.readback_image.srcRect.extent.width;
	region.bufferImageHeight = step.readback_image.srcRect.extent.height;
	vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, readbackBuffer_, 1, &region);

	// Now transfer it back to a texture.
	TransitionImageLayout2(cmd, step.readback_image.image, 0, 1, VK_IMAGE_ASPECT_COLOR_BIT,
		VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
		VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
		VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT);

	// NOTE: Can't read the buffer using the CPU here - need to sync first.
	// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.
}

void VulkanQueueRunner::CopyReadbackBuffer(int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
	if (!readbackMemory_)
		return;  // Something has gone really wrong.

	// Read back to the requested address in ram from buffer.
	void *mappedData;
	const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);

	VkResult res = vkMapMemory(vulkan_->GetDevice(), readbackMemory_, 0, width * height * srcPixelSize, 0, &mappedData);
	if (res != VK_SUCCESS) {
		ERROR_LOG(G3D, "CopyReadbackBuffer: vkMapMemory failed! result=%d", (int)res);
		return;
	}
	if (!readbackBufferIsCoherent_) {
		// Cached but non-coherent memory: invalidate the mapped range before the CPU reads from it.
		VkMappedMemoryRange range{ VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE };
		range.memory = readbackMemory_;
		range.offset = 0;
		range.size = width * height * srcPixelSize;
		vkInvalidateMappedMemoryRanges(vulkan_->GetDevice(), 1, &range);
	}

	// TODO: Perform these conversions in a compute shader on the GPU.
	if (srcFormat == Draw::DataFormat::R8G8B8A8_UNORM) {
		ConvertFromRGBA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);
	} else if (srcFormat == Draw::DataFormat::B8G8R8A8_UNORM) {
		ConvertFromBGRA8888(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, destFormat);
	} else if (srcFormat == destFormat) {
		// Can just memcpy when it matches no matter the format!
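		// Copy row by row since the destination row stride (pixelStride, in pixels) can differ from the
		// source width.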
		uint8_t *dst = pixels;
		const uint8_t *src = (const uint8_t *)mappedData;
		for (int y = 0; y < height; ++y) {
			memcpy(dst, src, width * srcPixelSize);
			src += width * srcPixelSize;
			dst += pixelStride * srcPixelSize;
		}
	} else if (destFormat == Draw::DataFormat::D32F) {
		ConvertToD32F(pixels, (const uint8_t *)mappedData, pixelStride, width, width, height, srcFormat);
	} else {
		// TODO: Maybe a depth conversion or something?
		ERROR_LOG(G3D, "CopyReadbackBuffer: Unknown format");
		_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);
	}

	vkUnmapMemory(vulkan_->GetDevice(), readbackMemory_);
}
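// Rough order of operations for a readback, as the NOTEs above imply (the actual scheduling and fence wait
// live in VulkanRenderManager, so treat this as a sketch rather than a guarantee):
//   1. PerformReadback() / PerformReadbackImage() record the image-to-buffer copy into readbackBuffer_.
//   2. The command buffer is submitted and the CPU waits for it to complete.
//   3. CopyReadbackBuffer() maps readbackMemory_, invalidates it if the memory type isn't coherent, and
//      converts the pixels into the caller's buffer.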