diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 2cf904cbea..ddbdd02aee 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -88,7 +88,7 @@ protected:
 	// Cached vertex decoders
 	u32 lastVType_ = -1;
 	std::unordered_map<u32, VertexDecoder *> decoderMap_;
-	VertexDecoder *dec_;
+	VertexDecoder *dec_ = nullptr;
 	VertexDecoderJitCache *decJitCache_;
 	VertexDecoderOptions decOptions_;
 
diff --git a/GPU/D3D11/DrawEngineD3D11.cpp b/GPU/D3D11/DrawEngineD3D11.cpp
index 46fbe4c473..e6c3cd419a 100644
--- a/GPU/D3D11/DrawEngineD3D11.cpp
+++ b/GPU/D3D11/DrawEngineD3D11.cpp
@@ -83,7 +83,7 @@ DrawEngineD3D11::DrawEngineD3D11(Draw::DrawContext *draw, ID3D11Device *device,
 		textureCache_(0),
 		framebufferManager_(0),
 		numDrawCalls(0),
-		vertexCountInDrawCalls(0),
+		vertexCountInDrawCalls_(0),
 		decodeCounter_(0),
 		dcid_(0) {
 	device1_ = (ID3D11Device1 *)draw->GetNativeObject(Draw::NativeObject::DEVICE_EX);
@@ -284,7 +284,7 @@ inline void DrawEngineD3D11::SetupVertexDecoderInternal(u32 vertType) {
 }
 
 void DrawEngineD3D11::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
-	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls + vertexCount > VERTEX_BUFFER_MAX)
+	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX)
 		Flush();
 
 	// TODO: Is this the right thing to do?
@@ -309,17 +309,19 @@ void DrawEngineD3D11::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
 	dc.prim = prim;
 	dc.vertexCount = vertexCount;
 
-	u32 dhash = dcid_;
-	dhash ^= (u32)(uintptr_t)verts;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)(uintptr_t)inds;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)vertType;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)vertexCount;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)prim;
-	dcid_ = dhash;
+	if (g_Config.bVertexCache) {
+		u32 dhash = dcid_;
+		dhash ^= (u32)(uintptr_t)verts;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)(uintptr_t)inds;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)vertType;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)vertexCount;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)prim;
+		dcid_ = dhash;
+	}
 
 	if (inds) {
 		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
@@ -331,7 +333,7 @@ void DrawEngineD3D11::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
 	uvScale[numDrawCalls] = gstate_c.uv;
 
 	numDrawCalls++;
-	vertexCountInDrawCalls += vertexCount;
+	vertexCountInDrawCalls_ += vertexCount;
 
 	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
 		DecodeVertsStep();
@@ -941,12 +943,12 @@ rotateVBO:
 	}
 
 	gpuStats.numDrawCalls += numDrawCalls;
-	gpuStats.numVertsSubmitted += vertexCountInDrawCalls;
+	gpuStats.numVertsSubmitted += vertexCountInDrawCalls_;
 
 	indexGen.Reset();
 	decodedVerts_ = 0;
 	numDrawCalls = 0;
-	vertexCountInDrawCalls = 0;
+	vertexCountInDrawCalls_ = 0;
 	decodeCounter_ = 0;
 	dcid_ = 0;
 	prevPrim_ = GE_PRIM_INVALID;
@@ -959,7 +961,10 @@ rotateVBO:
 	gstate_c.vertBounds.maxU = 0;
 	gstate_c.vertBounds.maxV = 0;
 
+#if PPSSPP_PLATFORM(WINDOWS) && !PPSSPP_PLATFORM(UWP)
+	// We only support GPU debugging on Windows, and that's the only use case for this.
 	host->GPUNotifyDraw();
+#endif
 }
 
 bool DrawEngineD3D11::IsCodePtrVertexDecoder(const u8 *ptr) const {
diff --git a/GPU/D3D11/DrawEngineD3D11.h b/GPU/D3D11/DrawEngineD3D11.h
index d8077a711c..64867b69a2 100644
--- a/GPU/D3D11/DrawEngineD3D11.h
+++ b/GPU/D3D11/DrawEngineD3D11.h
@@ -229,7 +229,7 @@ private:
 
 	DeferredDrawCall drawCalls[MAX_DEFERRED_DRAW_CALLS];
 	int numDrawCalls;
-	int vertexCountInDrawCalls;
+	int vertexCountInDrawCalls_;
 
 	int decimationCounter_;
 	int decodeCounter_;
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index 4368f72e8c..84d0bd5fc7 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -69,16 +69,12 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra
 		draw_(draw),
 		prevPrim_(GE_PRIM_INVALID),
 		numDrawCalls(0),
-		vertexCountInDrawCalls(0),
 		curFrame_(0),
-		nullTexture_(nullptr),
 		stats_{} {
-
 	decOptions_.expandAllWeightsToFloat = false;
 	decOptions_.expand8BitNormalsToFloat = false;
 
-	// Allocate nicely aligned memory. Maybe graphics drivers will
-	// appreciate it.
+	// Allocate nicely aligned memory. Maybe graphics drivers will appreciate it.
 	// All this is a LOT of memory, need to see if we can cut down somehow.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
@@ -336,7 +332,7 @@ inline void DrawEngineVulkan::SetupVertexDecoderInternal(u32 vertType) {
 }
 
 void DrawEngineVulkan::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
-	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls + vertexCount > VERTEX_BUFFER_MAX)
+	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX)
 		Flush();
 
 	// TODO: Is this the right thing to do?
@@ -360,6 +356,20 @@ void DrawEngineVulkan::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
 	dc.prim = prim;
 	dc.vertexCount = vertexCount;
 
+	if (g_Config.bVertexCache) {
+		u32 dhash = dcid_;
+		dhash ^= (u32)(uintptr_t)verts;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)(uintptr_t)inds;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)vertType;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)vertexCount;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)prim;
+		dcid_ = dhash;
+	}
+
 	if (inds) {
 		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
 	} else {
@@ -370,7 +380,12 @@ void DrawEngineVulkan::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim,
 	uvScale[numDrawCalls] = gstate_c.uv;
 
 	numDrawCalls++;
-	vertexCountInDrawCalls += vertexCount;
+	vertexCountInDrawCalls_ += vertexCount;
+
+	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
+		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
+		decodeCounter_++;
+	}
 
 	if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) {
 		// Rendertarget == texture?
@@ -452,8 +467,6 @@ void DrawEngineVulkan::DecodeVertsStep(u8 *dest, int &i, int &decodedVerts) {
 }
 
 void DrawEngineVulkan::DecodeVerts(VulkanPushBuffer *push, uint32_t *bindOffset, VkBuffer *vkbuf) {
-	int decodedVerts = 0;
-
 	u8 *dest = decoded;
 
 	// Figure out how much pushbuffer space we need to allocate.
@@ -488,9 +501,9 @@ void DrawEngineVulkan::DecodeVerts(VulkanPushBuffer *push, uint32_t *bindOffset,
 	}
 
 	const UVScale origUV = gstate_c.uv;
-	for (int i = 0; i < numDrawCalls; i++) {
-		gstate_c.uv = uvScale[i];
-		DecodeVertsStep(dest, i, decodedVerts);  // Note that this can modify i
+	for (; decodeCounter_ < numDrawCalls; decodeCounter_++) {
+		gstate_c.uv = uvScale[decodeCounter_];
+		DecodeVertsStep(dest, decodeCounter_, decodedVerts_);  // NOTE! DecodeVertsStep takes decodeCounter_ by reference and can modify it!
 	}
 	gstate_c.uv = origUV;
 
@@ -685,9 +698,26 @@ void DrawEngineVulkan::DoFlush() {
 		int vertexCount = 0;
 		bool useElements = true;
 
-		// Decode directly into the pushbuffer
+		// Cannot cache vertex data with morph enabled.
+		bool useCache = g_Config.bVertexCache && !(lastVType_ & GE_VTYPE_MORPHCOUNT_MASK);
+		// Also avoid caching when software skinning.
 		VkBuffer vbuf;
-		DecodeVerts(frame->pushVertex, &vbOffset, &vbuf);
+		if (g_Config.bSoftwareSkinning && (lastVType_ & GE_VTYPE_WEIGHT_MASK)) {
+			// If software skinning, we've already predecoded into "decoded". So push that content.
+			VkDeviceSize size = decodedVerts_ * dec_->GetDecVtxFmt().stride;
+			u8 *dest = (u8 *)frame->pushVertex->Push(size, &vbOffset, &vbuf);
+			memcpy(dest, decoded, size);
+		} else {
+			// Decode directly into the pushbuffer
+			DecodeVerts(frame->pushVertex, &vbOffset, &vbuf);
+		}
+
+		useCache = false;
+		if (useCache) {
+			u32 id = dcid_ ^ gstate.getUVGenMode();  // This can have an effect on which UV decoder we need to use! And hence what the decoded data will look like. See #9263
+			// TODO: Actually support vertex caching
+		}
+
 		gpuStats.numUncachedVertsDrawn += indexGen.VertexCount();
 		useElements = !indexGen.SeenOnlyPurePrims();
 		vertexCount = indexGen.VertexCount();
@@ -752,7 +782,8 @@ void DrawEngineVulkan::DoFlush() {
 		if (useElements) {
 			VkBuffer ibuf;
 			ibOffset = (uint32_t)frame->pushIndex->Push(decIndex, 2 * indexGen.VertexCount(), &ibuf);
-			// TODO: Avoid rebinding vertex/index buffers if the vertex size stays the same by using the offset arguments
+			// TODO (maybe): Avoid rebinding vertex/index buffers if the vertex size stays the same by using the offset arguments.
+			// Not sure if actually worth it, binding buffers should be fast.
 			vkCmdBindVertexBuffers(cmd, 0, 1, &vbuf, offsets);
 			vkCmdBindIndexBuffer(cmd, ibuf, ibOffset, VK_INDEX_TYPE_UINT16);
 			int numInstances = (gstate_c.bezier || gstate_c.spline) ? numPatches : 1;
@@ -888,11 +919,14 @@ void DrawEngineVulkan::DoFlush() {
 	}
 
 	gpuStats.numDrawCalls += numDrawCalls;
-	gpuStats.numVertsSubmitted += vertexCountInDrawCalls;
+	gpuStats.numVertsSubmitted += vertexCountInDrawCalls_;
 
 	indexGen.Reset();
+	decodedVerts_ = 0;
 	numDrawCalls = 0;
-	vertexCountInDrawCalls = 0;
+	vertexCountInDrawCalls_ = 0;
+	decodeCounter_ = 0;
+	dcid_ = 0;
 	prevPrim_ = GE_PRIM_INVALID;
 	gstate_c.vertexFullAlpha = true;
 	framebufferManager_->SetColorUpdated(gstate_c.skipDrawReason);
diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h
index a2d809d47a..8081ae3387 100644
--- a/GPU/Vulkan/DrawEngineVulkan.h
+++ b/GPU/Vulkan/DrawEngineVulkan.h
@@ -195,6 +195,7 @@ private:
 
 	// Vertex collector state
 	IndexGenerator indexGen;
+	int decodedVerts_ = 0;
 	GEPrimitiveType prevPrim_;
 
 	TransformedVertex *transformed = nullptr;
@@ -220,14 +221,18 @@ private:
 	VkSampler sampler;
 
 	// Null texture
-	VulkanTexture *nullTexture_;
-	VkSampler nullSampler_;
+	VulkanTexture *nullTexture_ = nullptr;
+	VkSampler nullSampler_ = VK_NULL_HANDLE;
 
 	DeferredDrawCall drawCalls[MAX_DEFERRED_DRAW_CALLS];
-	int numDrawCalls;
-	int vertexCountInDrawCalls;
+	int numDrawCalls = 0;
+	int vertexCountInDrawCalls_ = 0;
 	UVScale uvScale[MAX_DEFERRED_DRAW_CALLS];
 
+	int decimationCounter_ = 0;
+	int decodeCounter_ = 0;
+	u32 dcid_ = 0;  // Running hash of the queued draw calls (updated in SubmitPrim, zeroed again after each flush).
+
 	DrawEngineVulkanStats stats_;
 
 	VulkanPipelineRasterStateKey pipelineKey_{};
diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp
index 7fdbcbc694..bada57a1e8 100644
--- a/GPU/Vulkan/GPU_Vulkan.cpp
+++ b/GPU/Vulkan/GPU_Vulkan.cpp
@@ -198,12 +198,12 @@ void GPU_Vulkan::CheckGPUFeatures() {
 
 void GPU_Vulkan::BeginHostFrame() {
 	drawEngine_.BeginFrame();
+	UpdateCmdInfo();
 
 	if (resized_) {
 		CheckGPUFeatures();
 		// In case the GPU changed.
 		BuildReportingInfo();
-		UpdateCmdInfo();
 		framebufferManager_->Resized();
 		drawEngine_.Resized();
 		textureCacheVulkan_->NotifyConfigChanged();
@@ -342,14 +342,13 @@ void GPU_Vulkan::UpdateVsyncInterval(bool force) {
 }
 
 void GPU_Vulkan::UpdateCmdInfo() {
-	/*
 	if (g_Config.bSoftwareSkinning) {
 		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
 		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPU_Vulkan::Execute_VertexTypeSkinning;
-	} else {*/
+	} else {
 		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
 		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPU_Vulkan::Execute_VertexType;
-	// }
+	}
 }
 
 void GPU_Vulkan::BeginFrameInternal() {
@@ -539,6 +538,24 @@ void GPU_Vulkan::Execute_VertexType(u32 op, u32 diff) {
 	}
 }
 
+void GPU_Vulkan::Execute_VertexTypeSkinning(u32 op, u32 diff) {
+	const bool morphActive = (op & GE_VTYPE_MORPHCOUNT_MASK) != 0;
+	// A change limited to the weight count needs no flush, unless morph is enabled.
+	if (!(diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) && !morphActive)
+		return;
+	// Restore the previous vertex type for the flush, then reapply the change.
+	gstate.vertType ^= diff;
+	Flush();
+	gstate.vertType ^= diff;
+	if (diff & (GE_VTYPE_TC_MASK | GE_VTYPE_THROUGH_MASK))
+		gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
+	// Weights and morphs may both be in use here, so apply the deferred dirty flags (bone matrix uniforms).
+	if (morphActive) {
+		gstate_c.Dirty(gstate_c.deferredVertTypeDirty);
+		gstate_c.deferredVertTypeDirty = 0;
+	}
+}
+
 void GPU_Vulkan::Execute_Bezier(u32 op, u32 diff) {
 	// We don't dirty on normal changes anymore as we prescale, but it's needed for splines/bezier.
 	gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
diff --git a/GPU/Vulkan/GPU_Vulkan.h b/GPU/Vulkan/GPU_Vulkan.h
index 7cdb665054..b056cb02e2 100644
--- a/GPU/Vulkan/GPU_Vulkan.h
+++ b/GPU/Vulkan/GPU_Vulkan.h
@@ -77,6 +77,7 @@ public:
 	void Execute_Bezier(u32 op, u32 diff);
 	void Execute_Spline(u32 op, u32 diff);
 	void Execute_VertexType(u32 op, u32 diff);
+	void Execute_VertexTypeSkinning(u32 op, u32 diff);
 	void Execute_TexSize0(u32 op, u32 diff);
 	void Execute_LoadClut(u32 op, u32 diff);
 	void Execute_BoneMtxNum(u32 op, u32 diff);