From 2467fddc01adcd0748d23d8a9b2ae6e249f09f43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 18 Sep 2019 00:16:08 +0200
Subject: [PATCH] Vulkan GPU profiler: Measure the CPU time spent on the render
 thread recording command buffers.

---
 ext/native/thin3d/VulkanQueueRunner.cpp   | 15 ++++++---
 ext/native/thin3d/VulkanQueueRunner.h     | 10 +++++-
 ext/native/thin3d/VulkanRenderManager.cpp | 39 ++++++++++++-----------
 ext/native/thin3d/VulkanRenderManager.h   |  6 ++--
 4 files changed, 42 insertions(+), 28 deletions(-)
diff --git a/ext/native/thin3d/VulkanQueueRunner.cpp b/ext/native/thin3d/VulkanQueueRunner.cpp
index 8be1ca7120..19b9e1c6a3 100644
--- a/ext/native/thin3d/VulkanQueueRunner.cpp
+++ b/ext/native/thin3d/VulkanQueueRunner.cpp
@@ -1,4 +1,6 @@
 #include <map>
+
+#include "base/timeutil.h"
 #include "DataFormat.h"
 #include "VulkanQueueRunner.h"
 #include "VulkanRenderManager.h"
@@ -375,7 +377,9 @@ VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) {
 	return pass;
 }
 
-void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, VkQueryPool queryPool, std::vector<std::string> *timestampDescriptions) {
+void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, QueueProfileContext *profile) {
+	if (profile)
+		profile->cpuStartTime = real_time_now();
 	// Optimizes renderpasses, then sequences them.
 	// Planned optimizations: 
 	//  * Create copies of render target that are rendered to multiple times and textured from in sequence, and push those render passes
@@ -465,9 +469,9 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &st
 			break;
 		}
 
-		if (queryPool) {
-			vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, queryPool, (uint32_t)timestampDescriptions->size());
-			timestampDescriptions->push_back(StepToString(step));
+		if (profile) {
+			vkCmdWriteTimestamp(cmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, profile->queryPool, (uint32_t)profile->timestampDescriptions.size());
+			profile->timestampDescriptions.push_back(StepToString(step));
 		}
 	}
 
@@ -476,6 +480,9 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &st
 	for (size_t i = 0; i < steps.size(); i++) {
 		delete steps[i];
 	}
+
+	if (profile)
+		profile->cpuEndTime = real_time_now();
 }
 
 void VulkanQueueRunner::ApplyMGSHack(std::vector<VKRStep *> &steps) {
diff --git a/ext/native/thin3d/VulkanQueueRunner.h b/ext/native/thin3d/VulkanQueueRunner.h
index 3c840856c6..8bb1adfbdf 100644
--- a/ext/native/thin3d/VulkanQueueRunner.h
+++ b/ext/native/thin3d/VulkanQueueRunner.h
@@ -173,6 +173,14 @@ struct TransitionRequest {
 	VkImageLayout targetLayout;
 };
 
+struct QueueProfileContext {
+	VkQueryPool queryPool;
+	std::vector<std::string> timestampDescriptions;
+	std::string profileSummary;
+	double cpuStartTime;
+	double cpuEndTime;
+};
+
 struct VKRStep {
 	VKRStep(VKRStepType _type) : stepType(_type) {}
 	~VKRStep() {}
@@ -232,7 +240,7 @@ public:
 	}
 
 	// RunSteps can modify steps but will leave it in a valid state.
-	void RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, VkQueryPool queryPool, std::vector<std::string> *timestampDescriptions);
+	void RunSteps(VkCommandBuffer cmd, std::vector<VKRStep *> &steps, QueueProfileContext *profile);
 	void LogSteps(const std::vector<VKRStep *> &steps);
 
 	std::string StepToString(const VKRStep &step) const;
diff --git a/ext/native/thin3d/VulkanRenderManager.cpp b/ext/native/thin3d/VulkanRenderManager.cpp
index f9428cc462..f6c4778c62 100644
--- a/ext/native/thin3d/VulkanRenderManager.cpp
+++ b/ext/native/thin3d/VulkanRenderManager.cpp
@@ -137,7 +137,7 @@ VulkanRenderManager::VulkanRenderManager(VulkanContext *vulkan) : vulkan_(vulkan
 		VkQueryPoolCreateInfo query_ci{ VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
 		query_ci.queryCount = MAX_TIMESTAMP_QUERIES;
 		query_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
-		res = vkCreateQueryPool(vulkan_->GetDevice(), &query_ci, nullptr, &frameData_[i].timestampQueryPool_);
+		res = vkCreateQueryPool(vulkan_->GetDevice(), &query_ci, nullptr, &frameData_[i].profile.queryPool);
 	}
 
 	queueRunner_.CreateDeviceObjects();
@@ -227,7 +227,7 @@ void VulkanRenderManager::StopThread() {
 				frameData.pull_condVar.notify_all();
 			}
 			// Zero the queries so we don't try to pull them later.
-			frameData.timestampDescriptions.clear();
+			frameData.profile.timestampDescriptions.clear();
 		}
 		thread_.join();
 		ILOG("Vulkan submission thread joined. Frame=%d", vulkan_->GetCurFrame());
@@ -298,7 +298,7 @@ VulkanRenderManager::~VulkanRenderManager() {
 		vkDestroyCommandPool(device, frameData_[i].cmdPoolInit, nullptr);
 		vkDestroyCommandPool(device, frameData_[i].cmdPoolMain, nullptr);
 		vkDestroyFence(device, frameData_[i].fence, nullptr);
-		vkDestroyQueryPool(device, frameData_[i].timestampQueryPool_, nullptr);
+		vkDestroyQueryPool(device, frameData_[i].profile.queryPool, nullptr);
 	}
 	queueRunner_.DestroyDeviceObjects();
 }
@@ -375,11 +375,11 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling) {
 
 	if (frameData.profilingEnabled_) {
 		// Pull the profiling results from last time and produce a summary!
-		if (!frameData.timestampDescriptions.empty()) {
-			int numQueries = (int)frameData.timestampDescriptions.size();
+		if (!frameData.profile.timestampDescriptions.empty()) {
+			int numQueries = (int)frameData.profile.timestampDescriptions.size();
 			VkResult res = vkGetQueryPoolResults(
 				vulkan_->GetDevice(),
-				frameData.timestampQueryPool_, 0, numQueries, sizeof(uint64_t) * numQueries, &queryResults[0], sizeof(uint64_t),
+				frameData.profile.queryPool, 0, numQueries, sizeof(uint64_t) * numQueries, &queryResults[0], sizeof(uint64_t),
 				VK_QUERY_RESULT_64_BIT);
 			if (res == VK_SUCCESS) {
 				double timestampConversionFactor = (double)vulkan_->GetPhysicalDeviceProperties().properties.limits.timestampPeriod * (1.0 / 1000000.0);
@@ -390,19 +390,20 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling) {
 				char line[256];
 				snprintf(line, sizeof(line), "Total GPU time: %0.3f ms\n", ((double)((queryResults[numQueries - 1] - queryResults[0]) & timestampDiffMask) * timestampConversionFactor));
 				str << line;
+				snprintf(line, sizeof(line), "Render CPU time: %0.3f ms\n", (frameData.profile.cpuEndTime - frameData.profile.cpuStartTime) * 1000.0);
+				str << line;
 				for (int i = 0; i < numQueries - 1; i++) {
 					uint64_t diff = (queryResults[i + 1] - queryResults[i]) & timestampDiffMask;
 					double milliseconds = (double)diff * timestampConversionFactor;
-					snprintf(line, sizeof(line), "%s: %0.3f ms\n", frameData.timestampDescriptions[i + 1].c_str(), milliseconds);
+					snprintf(line, sizeof(line), "%s: %0.3f ms\n", frameData.profile.timestampDescriptions[i + 1].c_str(), milliseconds);
 					str << line;
 				}
-				frameData.profileSummary = str.str();
+				frameData.profile.profileSummary = str.str();
 			} else {
-				frameData.profileSummary = "(error getting GPU profile - not ready?)";
+				frameData.profile.profileSummary = "(error getting GPU profile - not ready?)";
 			}
-		}
-		else {
-			frameData.profileSummary = "(no GPU profile data collected)";
+		} else {
+			frameData.profile.profileSummary = "(no GPU profile data collected)";
 		}
 	}
 
@@ -415,16 +416,16 @@ void VulkanRenderManager::BeginFrame(bool enableProfiling) {
 
 	insideFrame_ = true;
 
-	frameData.timestampDescriptions.clear();
+	frameData.profile.timestampDescriptions.clear();
 	if (frameData_->profilingEnabled_) {
 		// For various reasons, we need to always use an init cmd buffer in this case to perform the vkCmdResetQueryPool,
 		// unless we want to limit ourselves to only measure the main cmd buffer.
 		// Reserve the first two queries for initCmd.
-		frameData.timestampDescriptions.push_back("initCmd Begin");
-		frameData.timestampDescriptions.push_back("initCmd");
+		frameData.profile.timestampDescriptions.push_back("initCmd Begin");
+		frameData.profile.timestampDescriptions.push_back("initCmd");
 		VkCommandBuffer initCmd = GetInitCmd();
-		vkCmdResetQueryPool(initCmd, frameData.timestampQueryPool_, 0, MAX_TIMESTAMP_QUERIES);
-		vkCmdWriteTimestamp(initCmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, frameData.timestampQueryPool_, 0);
+		vkCmdResetQueryPool(initCmd, frameData.profile.queryPool, 0, MAX_TIMESTAMP_QUERIES);
+		vkCmdWriteTimestamp(initCmd, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, frameData.profile.queryPool, 0);
 	}
 }
 
@@ -960,7 +961,7 @@ void VulkanRenderManager::Submit(int frame, bool triggerFence) {
 	if (frameData.hasInitCommands) {
 		if (frameData.profilingEnabled_ && triggerFence) {
 			// Pre-allocated query ID 1.
-			vkCmdWriteTimestamp(frameData.initCmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, frameData.timestampQueryPool_, 1);
+			vkCmdWriteTimestamp(frameData.initCmd, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, frameData.profile.queryPool, 1);
 		}
 		VkResult res = vkEndCommandBuffer(frameData.initCmd);
 		_assert_msg_(G3D, res == VK_SUCCESS, "vkEndCommandBuffer failed (init)! result=%s", VulkanResultToString(res));
@@ -1061,7 +1062,7 @@ void VulkanRenderManager::Run(int frame) {
 	auto &stepsOnThread = frameData_[frame].steps;
 	VkCommandBuffer cmd = frameData.mainCmd;
 	// queueRunner_.LogSteps(stepsOnThread);
-	queueRunner_.RunSteps(cmd, stepsOnThread, frameData.profilingEnabled_ ? frameData.timestampQueryPool_ : VK_NULL_HANDLE, &frameData.timestampDescriptions);
+	queueRunner_.RunSteps(cmd, stepsOnThread, frameData.profilingEnabled_ ? &frameData.profile : nullptr);
 	stepsOnThread.clear();
 
 	switch (frameData.type) {
diff --git a/ext/native/thin3d/VulkanRenderManager.h b/ext/native/thin3d/VulkanRenderManager.h
index 56a3fa1cf3..d0b0d74793 100644
--- a/ext/native/thin3d/VulkanRenderManager.h
+++ b/ext/native/thin3d/VulkanRenderManager.h
@@ -258,7 +258,7 @@ public:
 	}
 
 	std::string GetGpuProfileString() const {
-		return frameData_[vulkan_->GetCurFrame()].profileSummary;
+		return frameData_[vulkan_->GetCurFrame()].profile.profileSummary;
 	}
 
 private:
@@ -305,10 +305,8 @@ private:
 		uint32_t curSwapchainImage = -1;
 
 		// Profiling.
-		VkQueryPool timestampQueryPool_ = VK_NULL_HANDLE;
+		QueueProfileContext profile;
 		bool profilingEnabled_;
-		std::vector<std::string> timestampDescriptions;
-		std::string profileSummary;
 	};
 
 	FrameData frameData_[VulkanContext::MAX_INFLIGHT_FRAMES];