Merge pull request #10653 from hrydgard/submitprim-unify

Unify the SubmitPrim function, and some others, between the HW backends
2025-04-02 11:01:50 -04:00 · 2018-02-26 16:17:16 +01:00 · 2018-02-26 16:17:16 +01:00 · 4b784e6035
commit 4b784e6035
parent 10be0cad96 a78365e73d
23 changed files with 296 additions and 1198 deletions
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@ -668,3 +668,69 @@ ReliableHashType DrawEngineCommon::ComputeHash() {
 	fullhash += DoReliableHash(&uvScale[0], sizeof(uvScale[0]) * numDrawCalls, 0x0123e658);
 	return fullhash;
 }
+
+void DrawEngineCommon::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
+	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
+		DispatchFlush();
+	}
+
+	// TODO: Is this the right thing to do?
+	if (prim == GE_PRIM_KEEP_PREVIOUS) {
+		prim = prevPrim_ != GE_PRIM_INVALID ? prevPrim_ : GE_PRIM_POINTS;
+	} else {
+		prevPrim_ = prim;
+	}
+
+	SetupVertexDecoder(vertType);
+
+	*bytesRead = vertexCount * dec_->VertexSize();
+	if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > 2 && prim != GE_PRIM_RECTANGLES))
+		return;
+
+	DeferredDrawCall &dc = drawCalls[numDrawCalls];
+	dc.verts = verts;
+	dc.inds = inds;
+	dc.vertType = vertType;
+	dc.indexType = (vertType & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
+	dc.prim = prim;
+	dc.vertexCount = vertexCount;
+
+	if (g_Config.bVertexCache) {
+		u32 dhash = dcid_;
+		dhash ^= (u32)(uintptr_t)verts;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)(uintptr_t)inds;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)vertType;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)vertexCount;
+		dhash = __rotl(dhash, 13);
+		dhash ^= (u32)prim;
+		dcid_ = dhash;
+	}
+
+	if (inds) {
+		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
+	} else {
+		dc.indexLowerBound = 0;
+		dc.indexUpperBound = vertexCount - 1;
+	}
+
+	uvScale[numDrawCalls] = gstate_c.uv;
+
+	numDrawCalls++;
+	vertexCountInDrawCalls_ += vertexCount;
+
+	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
+		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
+		decodeCounter_++;
+	}
+
+	if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) {
+		// Rendertarget == texture?
+		if (!g_Config.bDisableSlowFramebufEffects) {
+			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
+			DispatchFlush();
+		}
+	}
+}
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@ -56,10 +56,16 @@ public:
 	// Flush is normally non-virtual but here's a virtual way to call it, used by the shared spline code, which is expensive anyway.
 	// Not really sure if these wrappers are worth it...
 	virtual void DispatchFlush() = 0;
-	// Same for SubmitPrim
-	virtual void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) = 0;
+
+	// This would seem to be unnecessary now, but is still required for splines/beziers to work in the software backend since SubmitPrim
+	// is different. Should probably refactor that.
+	virtual void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
+		SubmitPrim(verts, inds, prim, vertexCount, vertType, bytesRead);
+	}

 	bool TestBoundingBox(void* control_points, int vertexCount, u32 vertType, int *bytesRead);
+
+	void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead);
 	void SubmitSpline(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, int type_u, int type_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead);
 	void SubmitBezier(const void *control_points, const void *indices, int tess_u, int tess_v, int count_u, int count_v, GEPatchPrimType prim_type, bool computeNormals, bool patchFacing, u32 vertType, int *bytesRead);

@ -73,6 +79,9 @@ public:
 	bool IsCodePtrVertexDecoder(const u8 *ptr) const {
 		return decJitCache_->IsInSpace(ptr);
 	}
+	int GetNumDrawCalls() const {
+		return numDrawCalls;
+	}

 protected:
 	virtual void ClearTrackedVertexArrays() {}
--- a/GPU/D3D11/DrawEngineD3D11.cpp
+++ b/GPU/D3D11/DrawEngineD3D11.cpp
@ -257,72 +257,6 @@ ID3D11InputLayout *DrawEngineD3D11::SetupDecFmtForDraw(D3D11VertexShader *vshade
 	}
 }

-void DrawEngineD3D11::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
-	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX)
-		Flush();
-
-	// TODO: Is this the right thing to do?
-	if (prim == GE_PRIM_KEEP_PREVIOUS) {
-		prim = prevPrim_ != GE_PRIM_INVALID ? prevPrim_ : GE_PRIM_POINTS;
-	} else {
-		prevPrim_ = prim;
-	}
-
-	SetupVertexDecoder(vertType);
-
-	*bytesRead = vertexCount * dec_->VertexSize();
-
-	if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > 2 && prim != GE_PRIM_RECTANGLES))
-		return;
-
-	DeferredDrawCall &dc = drawCalls[numDrawCalls];
-	dc.verts = verts;
-	dc.inds = inds;
-	dc.vertType = vertType;
-	dc.indexType = (vertType & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
-	dc.prim = prim;
-	dc.vertexCount = vertexCount;
-
-	if (g_Config.bVertexCache) {
-		u32 dhash = dcid_;
-		dhash ^= (u32)(uintptr_t)verts;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)(uintptr_t)inds;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)vertType;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)vertexCount;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)prim;
-		dcid_ = dhash;
-	}
-
-	if (inds) {
-		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
-	} else {
-		dc.indexLowerBound = 0;
-		dc.indexUpperBound = vertexCount - 1;
-	}
-
-	uvScale[numDrawCalls] = gstate_c.uv;
-
-	numDrawCalls++;
-	vertexCountInDrawCalls_ += vertexCount;
-
-	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
-		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
-		decodeCounter_++;
-	}
-
-	if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) {
-		// Rendertarget == texture?
-		if (!g_Config.bDisableSlowFramebufEffects) {
-			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-			Flush();
-		}
-	}
-}
-
 void DrawEngineD3D11::MarkUnreliable(VertexArrayInfoD3D11 *vai) {
 	vai->status = VertexArrayInfoD3D11::VAI_UNRELIABLE;
 	if (vai->vbo) {
--- a/GPU/D3D11/DrawEngineD3D11.h
+++ b/GPU/D3D11/DrawEngineD3D11.h
@ -105,8 +105,6 @@ public:
 	DrawEngineD3D11(Draw::DrawContext *draw, ID3D11Device *device, ID3D11DeviceContext *context);
 	virtual ~DrawEngineD3D11();

-	void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead);
-
 	void SetShaderManager(ShaderManagerD3D11 *shaderManager) {
 		shaderManager_ = shaderManager;
 	}
@ -135,9 +133,6 @@ public:
 	}

 	void DispatchFlush() override { Flush(); }
-	void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) override {
-		SubmitPrim(verts, inds, prim, vertexCount, vertType, bytesRead);
-	}

 	void ClearTrackedVertexArrays() override;

--- a/GPU/D3D11/GPU_D3D11.cpp
+++ b/GPU/D3D11/GPU_D3D11.cpp
@ -66,31 +66,6 @@
 #include "Core/HLE/sceKernelInterrupt.h"
 #include "Core/HLE/sceGe.h"

-struct D3D11CommandTableEntry {
-	uint8_t cmd;
-	uint8_t flags;
-	uint64_t dirty;
-	GPU_D3D11::CmdFunc func;
-};
-
-// This table gets crunched into a faster form by init.
-static const D3D11CommandTableEntry commandTable[] = {
-	// Changes that dirty the current texture.
-	{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_TexSize0 },
-
-	// Changing the vertex type requires us to flush.
-	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },
-
-	{ GE_CMD_PRIM, FLAG_EXECUTE, 0, &GPU_D3D11::Execute_Prim },
-	{ GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier },
-	{ GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline },
-
-	// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
-	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_D3D11::Execute_LoadClut },
-};
-
-GPU_D3D11::CommandInfo GPU_D3D11::cmdInfo_[256]{};
-
 GPU_D3D11::GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 	: GPUCommon(gfxCtx, draw), drawEngine_(draw,
 	(ID3D11Device *)draw->GetNativeObject(Draw::NativeObject::DEVICE),
@ -126,45 +101,6 @@ GPU_D3D11::GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 		ERROR_LOG(G3D, "gstate has drifted out of sync!");
 	}

-	memset(cmdInfo_, 0, sizeof(cmdInfo_));
-
-	// Import both the global and local command tables, and check for dupes
-	std::set<u8> dupeCheck;
-	for (size_t i = 0; i < commonCommandTableSize; i++) {
-		const u8 cmd = commonCommandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commonCommandTable[i].flags | (commonCommandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commonCommandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-
-	for (size_t i = 0; i < ARRAY_SIZE(commandTable); i++) {
-		const u8 cmd = commandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commandTable[i].flags | (commandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-
-	// Find commands missing from the table.
-	for (int i = 0; i < 0xEF; i++) {
-		if (dupeCheck.find((u8)i) == dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i);
-		}
-	}
-
 	// No need to flush before the tex scale/offset commands if we are baking
 	// the tex scale/offset into the vertices anyway.
 	UpdateCmdInfo();
@ -187,18 +123,6 @@ GPU_D3D11::~GPU_D3D11() {
 	stockD3D11.Destroy();
 }

-void GPU_D3D11::UpdateCmdInfo() {
-	if (g_Config.bSoftwareSkinning) {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning;
-	} else {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType;
-	}
-
-	CheckGPUFeatures();
-}
-
 void GPU_D3D11::CheckGPUFeatures() {
 	u32 features = 0;

@ -330,26 +254,6 @@ void GPU_D3D11::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat f
 	framebufferManagerD3D11_->SetDisplayFramebuffer(framebuf, stride, format);
 }

-bool GPU_D3D11::FramebufferDirty() {
-	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->dirtyAfterDisplay;
-		vfb->dirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
-bool GPU_D3D11::FramebufferReallyDirty() {
-	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->reallyDirtyAfterDisplay;
-		vfb->reallyDirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
 void GPU_D3D11::CopyDisplayToOutput() {
 	float blendColor[4]{};
 	context_->OMSetBlendState(stockD3D11.blendStateDisabledWithColorMask[0xF], blendColor, 0xFFFFFFFF);
@ -365,44 +269,6 @@ void GPU_D3D11::CopyDisplayToOutput() {
 	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
 }

-// Maybe should write this in ASM...
-void GPU_D3D11::FastRunLoop(DisplayList &list) {
-	PROFILE_THIS_SCOPE("gpuloop");
-	const CommandInfo *cmdInfo = cmdInfo_;
-	int dc = downcount;
-	for (; dc > 0; --dc) {
-		// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
-		const u32 op = *(const u32 *)(Memory::base + list.pc);
-		const u32 cmd = op >> 24;
-		const CommandInfo &info = cmdInfo[cmd];
-		const u32 diff = op ^ gstate.cmdmem[cmd];
-		if (diff == 0) {
-			if (info.flags & FLAG_EXECUTE) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			}
-		} else {
-			uint64_t flags = info.flags;
-			if (flags & FLAG_FLUSHBEFOREONCHANGE) {
-				drawEngine_.Flush();
-			}
-			gstate.cmdmem[cmd] = op;
-			if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			} else {
-				uint64_t dirty = flags >> 8;
-				if (dirty)
-					gstate_c.Dirty(dirty);
-			}
-		}
-		list.pc += 4;
-	}
-	downcount = 0;
-}
-
 void GPU_D3D11::FinishDeferred() {
 	// This finishes reading any vertex data that is pending.
 	drawEngine_.FinishDeferred();
@ -435,85 +301,6 @@ void GPU_D3D11::ExecuteOp(u32 op, u32 diff) {
 	}
 }

-void GPU_D3D11::Execute_Prim(u32 op, u32 diff) {
-	// This drives all drawing. All other state we just buffer up, then we apply it only
-	// when it's time to draw. As most PSP games set state redundantly ALL THE TIME, this is a huge optimization.
-
-	u32 data = op & 0xFFFFFF;
-	u32 count = data & 0xFFFF;
-	if (count == 0)
-		return;
-
-	// Upper bits are ignored.
-	GEPrimitiveType prim = static_cast<GEPrimitiveType>((data >> 16) & 7);
-	SetDrawType(DRAW_PRIM, prim);
-
-	// Discard AA lines as we can't do anything that makes sense with these anyway. The SW plugin might, though.
-
-	if (gstate.isAntiAliasEnabled()) {
-		// Discard AA lines in DOA
-		if (prim == GE_PRIM_LINE_STRIP)
-			return;
-		// Discard AA lines in Summon Night 5
-		if ((prim == GE_PRIM_LINES) && gstate.isSkinningEnabled())
-			return;
-	}
-
-	// This also make skipping drawing very effective.
-	framebufferManagerD3D11_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
-	if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
-		drawEngine_.SetupVertexDecoder(gstate.vertType);
-		// Rough estimate, not sure what's correct.
-		cyclesExecuted += EstimatePerVertexCost() * count;
-		return;
-	}
-
-	u32 vertexAddr = gstate_c.vertexAddr;
-	if (!Memory::IsValidAddress(vertexAddr)) {
-		ERROR_LOG_REPORT(G3D, "Bad vertex address %08x!", vertexAddr);
-		return;
-	}
-
-	void *verts = Memory::GetPointerUnchecked(vertexAddr);
-	void *inds = 0;
-	u32 vertexType = gstate.vertType;
-	if ((vertexType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
-		u32 indexAddr = gstate_c.indexAddr;
-		if (!Memory::IsValidAddress(indexAddr)) {
-			ERROR_LOG_REPORT(G3D, "Bad index address %08x!", indexAddr);
-			return;
-		}
-		inds = Memory::GetPointerUnchecked(indexAddr);
-	}
-
-#ifndef MOBILE_DEVICE
-	if (prim > GE_PRIM_RECTANGLES) {
-		ERROR_LOG_REPORT_ONCE(reportPrim, G3D, "Unexpected prim type: %d", prim);
-	}
-#endif
-
-	if (gstate_c.dirty & DIRTY_VERTEXSHADER_STATE) {
-		vertexCost_ = EstimatePerVertexCost();
-	}
-	gpuStats.vertexGPUCycles += vertexCost_ * count;
-	cyclesExecuted += vertexCost_* count;
-
-	int bytesRead = 0;
-	UpdateUVScaleOffset();
-	drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
-
-	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
-	// Some games rely on this, they don't bother reloading VADDR and IADDR.
-	// The VADDR/IADDR registers are NOT updated.
-	AdvanceVerts(vertexType, count, bytesRead);
-}
-
-void GPU_D3D11::Execute_LoadClut(u32 op, u32 diff) {
-	gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-	textureCacheD3D11_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes());
-	// This could be used to "dirty" textures with clut.
-}
-
 void GPU_D3D11::GetStats(char *buffer, size_t bufsize) {
 	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
 	snprintf(buffer, bufsize - 1,
--- a/GPU/D3D11/GPU_D3D11.h
+++ b/GPU/D3D11/GPU_D3D11.h
@ -36,7 +36,7 @@ public:
 	GPU_D3D11(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
 	~GPU_D3D11();

-	void CheckGPUFeatures();
+	void CheckGPUFeatures() override;
 	void PreExecuteOp(u32 op, u32 diff) override;
 	void ExecuteOp(u32 op, u32 diff) override;

@ -50,23 +50,12 @@ public:
 	void DoState(PointerWrap &p) override;

 	void ClearShaderCache() override;
-	bool FramebufferDirty() override;
-	bool FramebufferReallyDirty() override;

 	void GetReportingInfo(std::string &primaryInfo, std::string &fullInfo) override {
 		primaryInfo = reportingPrimaryInfo_;
 		fullInfo = reportingFullInfo_;
 	}

-	typedef void (GPU_D3D11::*CmdFunc)(u32 op, u32 diff);
-	struct CommandInfo {
-		uint64_t flags;
-		GPU_D3D11::CmdFunc func;
-	};
-
-	void Execute_Prim(u32 op, u32 diff);
-	void Execute_LoadClut(u32 op, u32 diff);
-
 	// Using string because it's generic - makes no assumptions on the size of the shader IDs of this backend.
 	std::vector<std::string> DebugGetShaderIDs(DebugShaderType shader) override;
 	std::string DebugGetShaderString(std::string id, DebugShaderType shader, DebugShaderStringType stringType) override;
@ -75,12 +64,9 @@ public:
 	void EndHostFrame() override;

 protected:
-	void FastRunLoop(DisplayList &list) override;
 	void FinishDeferred() override;

 private:
-	void UpdateCmdInfo();
-
 	void Flush() {
 		drawEngine_.Flush();
 	}
@ -101,8 +87,6 @@ private:
 	DrawEngineD3D11 drawEngine_;
 	ShaderManagerD3D11 *shaderManagerD3D11_;

-	static CommandInfo cmdInfo_[256];
-
 	int lastVsync_;
 	int vertexCost_ = 0;

--- a/GPU/Directx9/DrawEngineDX9.cpp
+++ b/GPU/Directx9/DrawEngineDX9.cpp
@ -232,70 +232,6 @@ IDirect3DVertexDeclaration9 *DrawEngineDX9::SetupDecFmtForDraw(VSShader *vshader
 	}
 }

-void DrawEngineDX9::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
-	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX)
-		Flush();
-
-	// TODO: Is this the right thing to do?
-	if (prim == GE_PRIM_KEEP_PREVIOUS) {
-		prim = prevPrim_ != GE_PRIM_INVALID ? prevPrim_ : GE_PRIM_POINTS;
-	} else {
-		prevPrim_ = prim;
-	}
-
-	SetupVertexDecoder(vertType);
-
-	*bytesRead = vertexCount * dec_->VertexSize();
-
-	if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > 2 && prim != GE_PRIM_RECTANGLES))
-		return;
-
-	DeferredDrawCall &dc = drawCalls[numDrawCalls];
-	dc.verts = verts;
-	dc.inds = inds;
-	dc.vertType = vertType;
-	dc.indexType = (vertType & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
-	dc.prim = prim;
-	dc.vertexCount = vertexCount;
-
-	u32 dhash = dcid_;
-	dhash ^= (u32)(uintptr_t)verts;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)(uintptr_t)inds;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)vertType;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)vertexCount;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)prim;
-	dcid_ = dhash;
-
-	if (inds) {
-		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
-	} else {
-		dc.indexLowerBound = 0;
-		dc.indexUpperBound = vertexCount - 1;
-	}
-
-	uvScale[numDrawCalls] = gstate_c.uv;
-
-	numDrawCalls++;
-	vertexCountInDrawCalls_ += vertexCount;
-
-	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
-		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
-		decodeCounter_++;
-	}
-
-	if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) {
-		// Rendertarget == texture?
-		if (!g_Config.bDisableSlowFramebufEffects) {
-			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-			Flush();
-		}
-	}
-}
-
 void DrawEngineDX9::MarkUnreliable(VertexArrayInfoDX9 *vai) {
 	vai->status = VertexArrayInfoDX9::VAI_UNRELIABLE;
 	if (vai->vbo) {
--- a/GPU/Directx9/DrawEngineDX9.h
+++ b/GPU/Directx9/DrawEngineDX9.h
@ -103,8 +103,6 @@ public:
 	DrawEngineDX9(Draw::DrawContext *draw);
 	virtual ~DrawEngineDX9();

-	void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead);
-
 	void SetShaderManager(ShaderManagerDX9 *shaderManager) {
 		shaderManager_ = shaderManager;
 	}
@ -134,9 +132,6 @@ public:
 	}

 	void DispatchFlush() override { Flush(); }
-	void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) override {
-		SubmitPrim(verts, inds, prim, vertexCount, vertType, bytesRead);
-	}

 private:
 	void DoFlush();
--- a/GPU/Directx9/GPU_DX9.cpp
+++ b/GPU/Directx9/GPU_DX9.cpp
@ -50,31 +50,6 @@

 namespace DX9 {

-struct D3D9CommandTableEntry {
-	uint8_t cmd;
-	uint8_t flags;
-	uint64_t dirty;
-	GPU_DX9::CmdFunc func;
-};
-
-// This table gets crunched into a faster form by init.
-static const D3D9CommandTableEntry commandTable[] = {
-	// Changes that dirty the current texture.
-	{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_TexSize0 },
-
-	// Changing the vertex type requires us to flush.
-	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },
-
-	{ GE_CMD_PRIM, FLAG_EXECUTE, 0, &GPU_DX9::Execute_Prim },
-	{ GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier },
-	{ GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline },
-
-	// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
-	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_DX9::Execute_LoadClut },
-};
-
-GPU_DX9::CommandInfo GPU_DX9::cmdInfo_[256];
-
 GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 	: GPUCommon(gfxCtx, draw),
 		depalShaderCache_(draw),
@ -108,44 +83,6 @@ GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 		ERROR_LOG(G3D, "gstate has drifted out of sync!");
 	}

-	memset(cmdInfo_, 0, sizeof(cmdInfo_));
-
-	// Import both the global and local command tables, and check for dupes
-	std::set<u8> dupeCheck;
-	for (size_t i = 0; i < commonCommandTableSize; i++) {
-		const u8 cmd = commonCommandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commonCommandTable[i].flags | (commonCommandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commonCommandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-
-	for (size_t i = 0; i < ARRAY_SIZE(commandTable); i++) {
-		const u8 cmd = commandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commandTable[i].flags | (commandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-	// Find commands missing from the table.
-	for (int i = 0; i < 0xEF; i++) {
-		if (dupeCheck.find((u8)i) == dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i);
-		}
-	}
-
 	// No need to flush before the tex scale/offset commands if we are baking
 	// the tex scale/offset into the vertices anyway.
 	UpdateCmdInfo();
@ -166,18 +103,6 @@ GPU_DX9::GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 	}
 }

-void GPU_DX9::UpdateCmdInfo() {
-	if (g_Config.bSoftwareSkinning) {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning;
-	} else {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType;
-	}
-
-	CheckGPUFeatures();
-}
-
 void GPU_DX9::CheckGPUFeatures() {
 	u32 features = 0;

@ -305,26 +230,6 @@ void GPU_DX9::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat for
 	framebufferManagerDX9_->SetDisplayFramebuffer(framebuf, stride, format);
 }

-bool GPU_DX9::FramebufferDirty() {
-	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->dirtyAfterDisplay;
-		vfb->dirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
-bool GPU_DX9::FramebufferReallyDirty() {
-	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->reallyDirtyAfterDisplay;
-		vfb->reallyDirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
 void GPU_DX9::CopyDisplayToOutput() {
 	dxstate.depthWrite.set(true);
 	dxstate.colorMask.set(true, true, true, true);
@ -340,44 +245,6 @@ void GPU_DX9::CopyDisplayToOutput() {
 	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
 }

-// Maybe should write this in ASM...
-void GPU_DX9::FastRunLoop(DisplayList &list) {
-	PROFILE_THIS_SCOPE("gpuloop");
-	const CommandInfo *cmdInfo = cmdInfo_;
-	int dc = downcount;
-	for (; dc > 0; --dc) {
-		// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
-		const u32 op = *(const u32 *)(Memory::base + list.pc);
-		const u32 cmd = op >> 24;
-		const CommandInfo &info = cmdInfo[cmd];
-		const u32 diff = op ^ gstate.cmdmem[cmd];
-		if (diff == 0) {
-			if (info.flags & FLAG_EXECUTE) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			}
-		} else {
-			uint64_t flags = info.flags;
-			if (flags & FLAG_FLUSHBEFOREONCHANGE) {
-				drawEngine_.Flush();
-			}
-			gstate.cmdmem[cmd] = op;
-			if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			} else {
-				uint64_t dirty = flags >> 8;
-				if (dirty)
-					gstate_c.Dirty(dirty);
-			}
-		}
-		list.pc += 4;
-	}
-	downcount = 0;
-}
-
 void GPU_DX9::FinishDeferred() {
 	// This finishes reading any vertex data that is pending.
 	drawEngine_.FinishDeferred();
@ -410,84 +277,6 @@ void GPU_DX9::ExecuteOp(u32 op, u32 diff) {
 	}
 }

-void GPU_DX9::Execute_Prim(u32 op, u32 diff) {
-	// This drives all drawing. All other state we just buffer up, then we apply it only
-	// when it's time to draw. As most PSP games set state redundantly ALL THE TIME, this is a huge optimization.
-
-	u32 data = op & 0xFFFFFF;
-	u32 count = data & 0xFFFF;
-	if (count == 0)
-		return;
-	// Upper bits are ignored.
-	GEPrimitiveType prim = static_cast<GEPrimitiveType>((data >> 16) & 7);
-	SetDrawType(DRAW_PRIM, prim);
-
-	// Discard AA lines as we can't do anything that makes sense with these anyway. The SW plugin might, though.
-
-	if (gstate.isAntiAliasEnabled()) {
-		// Discard AA lines in DOA
-		if (prim == GE_PRIM_LINE_STRIP)
-			return;
-		// Discard AA lines in Summon Night 5
-		if ((prim == GE_PRIM_LINES) && gstate.isSkinningEnabled())
-			return;
-	}
-
-	// This also make skipping drawing very effective.
-	framebufferManagerDX9_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
-	if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
-		drawEngine_.SetupVertexDecoder(gstate.vertType);
-		// Rough estimate, not sure what's correct.
-		cyclesExecuted += EstimatePerVertexCost() * count;
-		return;
-	}
-
-	u32 vertexAddr = gstate_c.vertexAddr;
-	if (!Memory::IsValidAddress(vertexAddr)) {
-		ERROR_LOG_REPORT(G3D, "Bad vertex address %08x!", vertexAddr);
-		return;
-	}
-
-	void *verts = Memory::GetPointerUnchecked(vertexAddr);
-	void *inds = 0;
-	u32 vertexType = gstate.vertType;
-	if ((vertexType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
-		u32 indexAddr = gstate_c.indexAddr;
-		if (!Memory::IsValidAddress(indexAddr)) {
-			ERROR_LOG_REPORT(G3D, "Bad index address %08x!", indexAddr);
-			return;
-		}
-		inds = Memory::GetPointerUnchecked(indexAddr);
-	}
-
-#ifndef MOBILE_DEVICE
-	if (prim > GE_PRIM_RECTANGLES) {
-		ERROR_LOG_REPORT_ONCE(reportPrim, G3D, "Unexpected prim type: %d", prim);
-	}
-#endif
-
-	if (gstate_c.dirty & DIRTY_VERTEXSHADER_STATE) {
-		vertexCost_ = EstimatePerVertexCost();
-	}
-	gpuStats.vertexGPUCycles += vertexCost_ * count;
-	cyclesExecuted += vertexCost_* count;
-
-	int bytesRead = 0;
-	UpdateUVScaleOffset();
-	drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
-
-	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
-	// Some games rely on this, they don't bother reloading VADDR and IADDR.
-	// The VADDR/IADDR registers are NOT updated.
-	AdvanceVerts(vertexType, count, bytesRead);
-}
-
-void GPU_DX9::Execute_LoadClut(u32 op, u32 diff) {
-	gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-	textureCacheDX9_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes());
-	// This could be used to "dirty" textures with clut.
-}
-
 void GPU_DX9::GetStats(char *buffer, size_t bufsize) {
 	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
 	snprintf(buffer, bufsize - 1,
--- a/GPU/Directx9/GPU_DX9.h
+++ b/GPU/Directx9/GPU_DX9.h
@ -37,7 +37,7 @@ public:
 	GPU_DX9(GraphicsContext *gfxCtx, Draw::DrawContext *draw);
 	~GPU_DX9();

-	void CheckGPUFeatures();
+	void CheckGPUFeatures() override;
 	void PreExecuteOp(u32 op, u32 diff) override;
 	void ExecuteOp(u32 op, u32 diff) override;

@ -51,24 +51,12 @@ public:
 	void DoState(PointerWrap &p) override;

 	void ClearShaderCache() override;
-	bool FramebufferDirty() override;
-	bool FramebufferReallyDirty() override;

 	void GetReportingInfo(std::string &primaryInfo, std::string &fullInfo) override {
 		primaryInfo = reportingPrimaryInfo_;
 		fullInfo = reportingFullInfo_;
 	}

-	typedef void (GPU_DX9::*CmdFunc)(u32 op, u32 diff);
-	struct CommandInfo {
-		uint64_t flags;
-		GPU_DX9::CmdFunc func;
-	};
-
-	void Execute_Prim(u32 op, u32 diff);
-	void Execute_TexSize0(u32 op, u32 diff);
-	void Execute_LoadClut(u32 op, u32 diff);
-
 	// Using string because it's generic - makes no assumptions on the size of the shader IDs of this backend.
 	std::vector<std::string> DebugGetShaderIDs(DebugShaderType shader) override;
 	std::string DebugGetShaderString(std::string id, DebugShaderType shader, DebugShaderStringType stringType) override;
@ -76,12 +64,9 @@ public:
 	void BeginHostFrame() override;

 protected:
-	void FastRunLoop(DisplayList &list) override;
 	void FinishDeferred() override;

 private:
-	void UpdateCmdInfo();
-
 	void Flush() {
 		drawEngine_.Flush();
 	}
@ -102,8 +87,6 @@ private:
 	DrawEngineDX9 drawEngine_;
 	ShaderManagerDX9 *shaderManagerDX9_;

-	static CommandInfo cmdInfo_[256];
-
 	int lastVsync_;
 	int vertexCost_ = 0;

--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@ -276,70 +276,6 @@ GLRInputLayout *DrawEngineGLES::SetupDecFmtForDraw(LinkedShader *program, const
 	return inputLayout;
 }

-void DrawEngineGLES::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
-	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX)
-		Flush();
-
-	// TODO: Is this the right thing to do?
-	if (prim == GE_PRIM_KEEP_PREVIOUS) {
-		prim = prevPrim_ != GE_PRIM_INVALID ? prevPrim_ : GE_PRIM_POINTS;
-	} else {
-		prevPrim_ = prim;
-	}
-
-	SetupVertexDecoder(vertType);
-
-	*bytesRead = vertexCount * dec_->VertexSize();
-
-	if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > 2 && prim != GE_PRIM_RECTANGLES))
-		return;
-
-	DeferredDrawCall &dc = drawCalls[numDrawCalls];
-	dc.verts = verts;
-	dc.inds = inds;
-	dc.vertType = vertType;
-	dc.indexType = (vertType & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
-	dc.prim = prim;
-	dc.vertexCount = vertexCount;
-
-	u32 dhash = dcid_;
-	dhash ^= (u32)(uintptr_t)verts;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)(uintptr_t)inds;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)vertType;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)vertexCount;
-	dhash = __rotl(dhash, 13);
-	dhash ^= (u32)prim;
-	dcid_ = dhash;
-
-	if (inds) {
-		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
-	} else {
-		dc.indexLowerBound = 0;
-		dc.indexUpperBound = vertexCount - 1;
-	}
-
-	uvScale[numDrawCalls] = gstate_c.uv;
-
-	numDrawCalls++;
-	vertexCountInDrawCalls_ += vertexCount;
-
-	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
-		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
-		decodeCounter_++;
-	}
-
-	if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) {
-		// Rendertarget == texture?
-		if (!g_Config.bDisableSlowFramebufEffects) {
-			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-			Flush();
-		}
-	}
-}
-
 void DrawEngineGLES::DecodeVertsToPushBuffer(GLPushBuffer *push, uint32_t *bindOffset, GLRBuffer **buf) {
 	u8 *dest = decoded;

--- a/GPU/GLES/DrawEngineGLES.h
+++ b/GPU/GLES/DrawEngineGLES.h
@ -105,8 +105,6 @@ public:
 	DrawEngineGLES(Draw::DrawContext *draw);
 	virtual ~DrawEngineGLES();

-	void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead);
-
 	void SetShaderManager(ShaderManagerGLES *shaderManager) {
 		shaderManager_ = shaderManager;
 	}
@ -146,9 +144,6 @@ public:
 	bool IsCodePtrVertexDecoder(const u8 *ptr) const;

 	void DispatchFlush() override { Flush(); }
-	void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) override {
-		SubmitPrim(verts, inds, prim, vertexCount, vertType, bytesRead);
-	}

 	GLPushBuffer *GetPushVertexBuffer() {
 		return frameData_[render_->GetCurFrame()].pushVertex;
--- a/GPU/GLES/GPU_GLES.cpp
+++ b/GPU/GLES/GPU_GLES.cpp
@ -51,32 +51,6 @@
 #include "Windows/GPU/WindowsGLContext.h"
 #endif

-struct GLESCommandTableEntry {
-	uint8_t cmd;
-	uint8_t flags;
-	uint64_t dirty;
-	GPU_GLES::CmdFunc func;
-};
-
-// This table gets crunched into a faster form by init.
-// TODO: Share this table between the backends. Will have to make another indirection for the function pointers though..
-static const GLESCommandTableEntry commandTable[] = {
-	// Changes that dirty the current texture.
-	{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_TexSize0 },
-
-	// Changing the vertex type requires us to flush.
-	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },
-
-	{ GE_CMD_PRIM, FLAG_EXECUTE, 0, &GPU_GLES::Execute_Prim },
-	{ GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier },
-	{ GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline },
-
-	// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
-	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_GLES::Execute_LoadClut },
-};
-
-GPU_GLES::CommandInfo GPU_GLES::cmdInfo_[256];
-
 GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 : GPUCommon(gfxCtx, draw), drawEngine_(draw), fragmentTestCache_(draw), depalShaderCache_(draw) {
 	UpdateVsyncInterval(true);
@ -112,44 +86,6 @@ GPU_GLES::GPU_GLES(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 		ERROR_LOG(G3D, "gstate has drifted out of sync!");
 	}

-	memset(cmdInfo_, 0, sizeof(cmdInfo_));
-
-	// Import both the global and local command tables, and check for dupes
-	std::set<u8> dupeCheck;
-	for (size_t i = 0; i < commonCommandTableSize; i++) {
-		const u8 cmd = commonCommandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commonCommandTable[i].flags | (commonCommandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commonCommandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-
-	for (size_t i = 0; i < ARRAY_SIZE(commandTable); i++) {
-		const u8 cmd = commandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commandTable[i].flags | (commandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-	// Find commands missing from the table.
-	for (int i = 0; i < 0xEF; i++) {
-		if (dupeCheck.find((u8)i) == dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i);
-		}
-	}
-
 	// No need to flush before the tex scale/offset commands if we are baking
 	// the tex scale/offset into the vertices anyway.

@ -467,16 +403,6 @@ inline void GPU_GLES::UpdateVsyncInterval(bool force) {
 #endif
 }

-void GPU_GLES::UpdateCmdInfo() {
-	if (g_Config.bSoftwareSkinning) {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning;
-	} else {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType;
-	}
-}
-
 void GPU_GLES::ReapplyGfxState() {
 	GPUCommon::ReapplyGfxState();
 }
@ -510,26 +436,6 @@ void GPU_GLES::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat fo
 	framebufferManagerGL_->SetDisplayFramebuffer(framebuf, stride, format);
 }

-bool GPU_GLES::FramebufferDirty() {
-	VirtualFramebuffer *vfb = framebufferManagerGL_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->dirtyAfterDisplay;
-		vfb->dirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
-bool GPU_GLES::FramebufferReallyDirty() {
-	VirtualFramebuffer *vfb = framebufferManagerGL_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->reallyDirtyAfterDisplay;
-		vfb->reallyDirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
 void GPU_GLES::CopyDisplayToOutput() {
 	// Flush anything left over.
 	framebufferManagerGL_->RebindFramebuffer();
@ -551,44 +457,6 @@ void GPU_GLES::CopyDisplayToOutput() {
 #endif
 }

-// Maybe should write this in ASM...
-void GPU_GLES::FastRunLoop(DisplayList &list) {
-	PROFILE_THIS_SCOPE("gpuloop");
-	const CommandInfo *cmdInfo = cmdInfo_;
-	int dc = downcount;
-	for (; dc > 0; --dc) {
-		// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
-		const u32 op = *(const u32 *)(Memory::base + list.pc);
-		const u32 cmd = op >> 24;
-		const CommandInfo &info = cmdInfo[cmd];
-		const u32 diff = op ^ gstate.cmdmem[cmd];
-		if (diff == 0) {
-			if (info.flags & FLAG_EXECUTE) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			}
-		} else {
-			uint64_t flags = info.flags;
-			if (flags & FLAG_FLUSHBEFOREONCHANGE) {
-				drawEngine_.Flush();
-			}
-			gstate.cmdmem[cmd] = op;
-			if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			} else {
-				uint64_t dirty = flags >> 8;
-				if (dirty)
-					gstate_c.Dirty(dirty);
-			}
-		}
-		list.pc += 4;
-	}
-	downcount = 0;
-}
-
 void GPU_GLES::FinishDeferred() {
 	// This finishes reading any vertex data that is pending.
 	drawEngine_.FinishDeferred();
@ -621,81 +489,6 @@ void GPU_GLES::ExecuteOp(u32 op, u32 diff) {
 	}
 }

-void GPU_GLES::Execute_Prim(u32 op, u32 diff) {
-	// This drives all drawing. All other state we just buffer up, then we apply it only
-	// when it's time to draw. As most PSP games set state redundantly ALL THE TIME, this is a huge optimization.
-
-	u32 data = op & 0xFFFFFF;
-	u32 count = data & 0xFFFF;
-	if (count == 0)
-		return;
-
-	// Upper bits are ignored.
-	GEPrimitiveType prim = static_cast<GEPrimitiveType>((data >> 16) & 7);
-	SetDrawType(DRAW_PRIM, prim);
-
-	// Discard AA lines as we can't do anything that makes sense with these anyway. The SW plugin might, though.
-
-	if (gstate.isAntiAliasEnabled()) {
-		// Discard AA lines in DOA
-		if (prim == GE_PRIM_LINE_STRIP)
-			return;
-		// Discard AA lines in Summon Night 5
-		if ((prim == GE_PRIM_LINES) && gstate.isSkinningEnabled())
-			return;
-	}
-
-	// This also makes skipping drawing very effective. This function can change the framebuffer.
-	framebufferManagerGL_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
-	if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB))	{
-		drawEngine_.SetupVertexDecoder(gstate.vertType);
-		// Rough estimate, not sure what's correct.
-		cyclesExecuted += EstimatePerVertexCost() * count;
-		return;
-	}
-
-	if (!Memory::IsValidAddress(gstate_c.vertexAddr)) {
-		ERROR_LOG_REPORT(G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
-		return;
-	}
-
-	void *verts = Memory::GetPointerUnchecked(gstate_c.vertexAddr);
-	void *inds = 0;
-	if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
-		if (!Memory::IsValidAddress(gstate_c.indexAddr)) {
-			ERROR_LOG_REPORT(G3D, "Bad index address %08x!", gstate_c.indexAddr);
-			return;
-		}
-		inds = Memory::GetPointerUnchecked(gstate_c.indexAddr);
-	}
-
-#ifndef MOBILE_DEVICE
-	if (prim > GE_PRIM_RECTANGLES) {
-		ERROR_LOG_REPORT_ONCE(reportPrim, G3D, "Unexpected prim type: %d", prim);
-	}
-#endif
-
-	if (gstate_c.dirty & DIRTY_VERTEXSHADER_STATE) {
-		vertexCost_ = EstimatePerVertexCost();
-	}
-	gpuStats.vertexGPUCycles += vertexCost_ * count;
-	cyclesExecuted += vertexCost_* count;
-
-	int bytesRead = 0;
-	UpdateUVScaleOffset();
-	drawEngine_.SubmitPrim(verts, inds, prim, count, gstate.vertType, &bytesRead);
-
-	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
-	// Some games rely on this, they don't bother reloading VADDR and IADDR.
-	// The VADDR/IADDR registers are NOT updated.
-	AdvanceVerts(gstate.vertType, count, bytesRead);
-}
-
-void GPU_GLES::Execute_LoadClut(u32 op, u32 diff) {
-	gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-	textureCacheGL_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes());
-}
-
 void GPU_GLES::GetStats(char *buffer, size_t bufsize) {
 	float vertexAverageCycles = gpuStats.numVertsSubmitted > 0 ? (float)gpuStats.vertexGPUCycles / (float)gpuStats.numVertsSubmitted : 0.0f;
 	snprintf(buffer, bufsize - 1,
--- a/GPU/GLES/GPU_GLES.h
+++ b/GPU/GLES/GPU_GLES.h
@ -36,7 +36,7 @@ public:
 	~GPU_GLES();

 	// This gets called on startup and when we get back from settings.
-	void CheckGPUFeatures();
+	void CheckGPUFeatures() override;

 	bool IsReady() override;

@ -55,23 +55,12 @@ public:

 	void ClearShaderCache() override;
 	void CleanupBeforeUI() override;
-	bool FramebufferDirty() override;
-	bool FramebufferReallyDirty() override;

 	void GetReportingInfo(std::string &primaryInfo, std::string &fullInfo) override {
 		primaryInfo = reportingPrimaryInfo_;
 		fullInfo = reportingFullInfo_;
 	}

-	typedef void (GPU_GLES::*CmdFunc)(u32 op, u32 diff);
-	struct CommandInfo {
-		uint64_t flags;
-		GPU_GLES::CmdFunc func;
-	};
-
-	void Execute_Prim(u32 op, u32 diff);
-	void Execute_LoadClut(u32 op, u32 diff);
-
 	// Using string because it's generic - makes no assumptions on the size of the shader IDs of this backend.
 	std::vector<std::string> DebugGetShaderIDs(DebugShaderType shader) override;
 	std::string DebugGetShaderString(std::string id, DebugShaderType shader, DebugShaderStringType stringType) override;
@ -80,7 +69,6 @@ public:
 	void EndHostFrame() override;

 protected:
-	void FastRunLoop(DisplayList &list) override;
 	void FinishDeferred() override;

 private:
@ -96,9 +84,6 @@ private:
 	void Reinitialize() override;

 	inline void UpdateVsyncInterval(bool force);
-	void UpdateCmdInfo();
-
-	static CommandInfo cmdInfo_[256];

 	FramebufferManagerGLES *framebufferManagerGL_;
 	TextureCacheGLES *textureCacheGL_;
--- a/GPU/GPU.cpp
+++ b/GPU/GPU.cpp
@ -110,6 +110,6 @@ bool GPU_Init(GraphicsContext *ctx, Draw::DrawContext *draw) {

 void GPU_Shutdown() {
 	delete gpu;
-	gpu = 0;
+	gpu = nullptr;
 	gpuDebug = 0;
 }
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@ -42,6 +42,15 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	{ GE_CMD_BJUMP, FLAG_EXECUTE | FLAG_READS_PC | FLAG_WRITES_PC, 0, &GPUCommon::Execute_BJump },  // EXECUTE
 	{ GE_CMD_BOUNDINGBOX, FLAG_EXECUTE, 0, &GPUCommon::Execute_BoundingBox }, // + FLUSHBEFORE when we implement... or not, do we need to?

+	{ GE_CMD_PRIM, FLAG_EXECUTE, 0, &GPUCommon::Execute_Prim },
+	{ GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier },
+	{ GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline },
+
+	// Changing the vertex type requires us to flush.
+	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },
+
+	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_LoadClut },
+
 	// These two are actually processed in CMD_END. Not sure if FLAG_FLUSHBEFORE matters.
 	{ GE_CMD_SIGNAL, FLAG_FLUSHBEFORE },
 	{ GE_CMD_FINISH, FLAG_FLUSHBEFORE },
@ -121,7 +130,7 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	{ GE_CMD_TEXOFFSETU },
 	{ GE_CMD_TEXOFFSETV },

-	// TEXSIZE0 is handled by each backend.
+	{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_TexSize0 },
 	{ GE_CMD_TEXSIZE1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS },
 	{ GE_CMD_TEXSIZE2, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS },
 	{ GE_CMD_TEXSIZE3, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS },
@ -343,6 +352,9 @@ const CommonCommandTableEntry commonCommandTable[] = {
 };
 size_t commonCommandTableSize = ARRAY_SIZE(commonCommandTable);

+// TODO: Make class member?
+GPUCommon::CommandInfo GPUCommon::cmdInfo_[256];
+
 void GPUCommon::Flush() {
 	drawEngineCommon_->DispatchFlush();
 }
@ -366,11 +378,47 @@ GPUCommon::GPUCommon(GraphicsContext *gfxCtx, Draw::DrawContext *draw) :
 	gstate.Reset();
 	gstate_c.Reset();
 	gpuStats.Reset();
+
+	memset(cmdInfo_, 0, sizeof(cmdInfo_));
+
+	// Import both the global and local command tables, and check for dupes
+	std::set<u8> dupeCheck;
+	for (size_t i = 0; i < commonCommandTableSize; i++) {
+		const u8 cmd = commonCommandTable[i].cmd;
+		if (dupeCheck.find(cmd) != dupeCheck.end()) {
+			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
+		} else {
+			dupeCheck.insert(cmd);
+		}
+		cmdInfo_[cmd].flags |= (uint64_t)commonCommandTable[i].flags | (commonCommandTable[i].dirty << 8);
+		cmdInfo_[cmd].func = commonCommandTable[i].func;
+		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
+			Crash();
+		}
+	}
+	// Find commands missing from the table.
+	for (int i = 0; i < 0xEF; i++) {
+		if (dupeCheck.find((u8)i) == dupeCheck.end()) {
+			ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i);
+		}
+	}
+
+	UpdateCmdInfo();
 }

 GPUCommon::~GPUCommon() {
 }

+void GPUCommon::UpdateCmdInfo() {
+	if (g_Config.bSoftwareSkinning) {
+		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
+		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning;
+	} else {
+		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
+		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType;
+	}
+}
+
 void GPUCommon::BeginHostFrame() {
 	ReapplyGfxState();

@ -917,6 +965,46 @@ bool GPUCommon::InterpretList(DisplayList &list) {
 	return gpuState == GPUSTATE_DONE || gpuState == GPUSTATE_ERROR;
 }

+// Maybe should write this in ASM...
+void GPUCommon::FastRunLoop(DisplayList &list) {
+	PROFILE_THIS_SCOPE("gpuloop");
+	const CommandInfo *cmdInfo = cmdInfo_;
+	int dc = downcount;
+	for (; dc > 0; --dc) {
+		// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
+		const u32 op = *(const u32 *)(Memory::base + list.pc);
+		const u32 cmd = op >> 24;
+		const CommandInfo &info = cmdInfo[cmd];
+		const u32 diff = op ^ gstate.cmdmem[cmd];
+		if (diff == 0) {
+			if (info.flags & FLAG_EXECUTE) {
+				downcount = dc;
+				(this->*info.func)(op, diff);
+				dc = downcount;
+			}
+		} else {
+			uint64_t flags = info.flags;
+			if (flags & FLAG_FLUSHBEFOREONCHANGE) {
+				if (drawEngineCommon_->GetNumDrawCalls()) {
+					drawEngineCommon_->DispatchFlush();
+				}
+			}
+			gstate.cmdmem[cmd] = op;
+			if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
+				downcount = dc;
+				(this->*info.func)(op, diff);
+				dc = downcount;
+			} else {
+				uint64_t dirty = flags >> 8;
+				if (dirty)
+					gstate_c.Dirty(dirty);
+			}
+		}
+		list.pc += 4;
+	}
+	downcount = 0;
+}
+
 void GPUCommon::BeginFrame() {
 	immCount_ = 0;
 	if (dumpNextFrame_) {
@ -1347,6 +1435,11 @@ void GPUCommon::Execute_VertexType(u32 op, u32 diff) {
 	}
 }

+void GPUCommon::Execute_LoadClut(u32 op, u32 diff) {
+	gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
+	textureCache_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes());
+}
+
 void GPUCommon::Execute_VertexTypeSkinning(u32 op, u32 diff) {
 	// Don't flush when weight count changes, unless morph is enabled.
 	if ((diff & ~GE_VTYPE_WEIGHTCOUNT_MASK) || (op & GE_VTYPE_MORPHCOUNT_MASK) != 0) {
@ -1368,6 +1461,81 @@ void GPUCommon::Execute_VertexTypeSkinning(u32 op, u32 diff) {
 		gstate_c.Dirty(DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_FRAGMENTSHADER_STATE);
 }

+
+void GPUCommon::Execute_Prim(u32 op, u32 diff) {
+	// This drives all drawing. All other state we just buffer up, then we apply it only
+	// when it's time to draw. As most PSP games set state redundantly ALL THE TIME, this is a huge optimization.
+
+	PROFILE_THIS_SCOPE("execprim");
+
+	u32 data = op & 0xFFFFFF;
+	u32 count = data & 0xFFFF;
+	if (count == 0)
+		return;
+
+	// Upper bits are ignored.
+	GEPrimitiveType prim = static_cast<GEPrimitiveType>((data >> 16) & 7);
+	SetDrawType(DRAW_PRIM, prim);
+
+	// Discard AA lines as we can't do anything that makes sense with these anyway. The SW plugin might, though.
+	if (gstate.isAntiAliasEnabled()) {
+		// Discard AA lines in DOA
+		if (prim == GE_PRIM_LINE_STRIP)
+			return;
+		// Discard AA lines in Summon Night 5
+		if ((prim == GE_PRIM_LINES) && gstate.isSkinningEnabled())
+			return;
+	}
+
+	// This also makes skipping drawing very effective.
+	framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
+
+	if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
+		drawEngineCommon_->SetupVertexDecoder(gstate.vertType);  // Do we still need to do this?
+																											// Rough estimate, not sure what's correct.
+		cyclesExecuted += EstimatePerVertexCost() * count;
+		return;
+	}
+
+	if (!Memory::IsValidAddress(gstate_c.vertexAddr)) {
+		ERROR_LOG_REPORT(G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
+		return;
+	}
+
+	void *verts = Memory::GetPointerUnchecked(gstate_c.vertexAddr);
+	void *inds = 0;
+	u32 vertexType = gstate.vertType;
+	if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
+		u32 indexAddr = gstate_c.indexAddr;
+		if (!Memory::IsValidAddress(indexAddr)) {
+			ERROR_LOG_REPORT(G3D, "Bad index address %08x!", indexAddr);
+			return;
+		}
+		inds = Memory::GetPointerUnchecked(indexAddr);
+	}
+
+#ifndef MOBILE_DEVICE
+	if (prim > GE_PRIM_RECTANGLES) {
+		ERROR_LOG_REPORT_ONCE(reportPrim, G3D, "Unexpected prim type: %d", prim);
+	}
+#endif
+
+	if (gstate_c.dirty & DIRTY_VERTEXSHADER_STATE) {
+		vertexCost_ = EstimatePerVertexCost();
+	}
+	gpuStats.vertexGPUCycles += vertexCost_ * count;
+	cyclesExecuted += vertexCost_* count;
+
+	int bytesRead = 0;
+	UpdateUVScaleOffset();
+	drawEngineCommon_->SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
+
+	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
+	// Some games rely on this, they don't bother reloading VADDR and IADDR.
+	// The VADDR/IADDR registers are NOT updated.
+	AdvanceVerts(vertexType, count, bytesRead);
+}
+
 void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
 	drawEngineCommon_->DispatchFlush();

@ -2432,3 +2600,23 @@ bool GPUCommon::DescribeCodePtr(const u8 *ptr, std::string &name) {
 	}
 	return false;
 }
+
+bool GPUCommon::FramebufferDirty() {
+	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
+	if (vfb) {
+		bool dirty = vfb->dirtyAfterDisplay;
+		vfb->dirtyAfterDisplay = false;
+		return dirty;
+	}
+	return true;
+}
+
+bool GPUCommon::FramebufferReallyDirty() {
+	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
+	if (vfb) {
+		bool dirty = vfb->reallyDirtyAfterDisplay;
+		vfb->reallyDirtyAfterDisplay = false;
+		return dirty;
+	}
+	return true;
+}
--- a/GPU/GPUCommon.h
+++ b/GPU/GPUCommon.h
@ -70,6 +70,10 @@ public:
 	Draw::DrawContext *GetDrawContext() override {
 		return draw_;
 	}
+	virtual void CheckGPUFeatures() = 0;
+
+	void UpdateCmdInfo();
+
 	bool IsReady() override {
 		return true;
 	}
@ -129,11 +133,14 @@ public:
 	void Execute_VertexType(u32 op, u32 diff);
 	void Execute_VertexTypeSkinning(u32 op, u32 diff);

+	void Execute_Prim(u32 op, u32 diff);
 	void Execute_Bezier(u32 op, u32 diff);
 	void Execute_Spline(u32 op, u32 diff);
 	void Execute_BoundingBox(u32 op, u32 diff);
 	void Execute_BlockTransferStart(u32 op, u32 diff);

+	void Execute_LoadClut(u32 op, u32 diff);
+
 	void Execute_TexSize0(u32 op, u32 diff);
 	void Execute_TexLevel(u32 op, u32 diff);

@ -238,6 +245,9 @@ public:
 		return -1;
 	}

+	bool FramebufferDirty() override;
+	bool FramebufferReallyDirty() override;
+
 	typedef void (GPUCommon::*CmdFunc)(u32 op, u32 diff);

 protected:
@ -258,8 +268,8 @@ protected:

 	void BeginFrame() override;

-	// To avoid virtual calls to PreExecuteOp().
-	virtual void FastRunLoop(DisplayList &list) = 0;
+	virtual void FastRunLoop(DisplayList &list);
+
 	void SlowRunLoop(DisplayList &list);
 	void UpdatePC(u32 currentPC, u32 newPC);
 	void UpdateState(GPURunState state);
@ -290,6 +300,13 @@ protected:
 	GraphicsContext *gfxCtx_;
 	Draw::DrawContext *draw_;

+	struct CommandInfo {
+		uint64_t flags;
+		GPUCommon::CmdFunc func;
+	};
+
+	static CommandInfo cmdInfo_[256];
+
 	typedef std::list<int> DisplayListQueue;

 	int nextListID;
@ -316,6 +333,8 @@ protected:
 	DrawType lastDraw_;
 	GEPrimitiveType lastPrim_;

+	int vertexCost_ = 0;
+
 	// No idea how big this buffer needs to be.
 	enum {
 		MAX_IMMBUFFER_SIZE = 32,
--- a/GPU/Null/NullGpu.h
+++ b/GPU/Null/NullGpu.h
@ -26,6 +26,8 @@ class NullGPU : public GPUCommon {
 public:
 	NullGPU();
 	~NullGPU();
+
+	void CheckGPUFeatures() override {}
 	void InitClear() override {}
 	void ExecuteOp(u32 op, u32 diff) override;

--- a/GPU/Software/SoftGpu.h
+++ b/GPU/Software/SoftGpu.h
@ -52,6 +52,8 @@ class SoftGPU : public GPUCommon {
 public:
 	SoftGPU(GraphicsContext *gfxCtx, Draw::DrawContext *_thin3D);
 	~SoftGPU();
+
+	void CheckGPUFeatures() override {}
 	void InitClear() override {}
 	void ExecuteOp(u32 op, u32 diff) override;

--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@ -337,72 +337,6 @@ void DrawEngineVulkan::EndFrame() {
 	vertexCache_->End();
 }

-void DrawEngineVulkan::SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) {
-	if (!indexGen.PrimCompatible(prevPrim_, prim) || numDrawCalls >= MAX_DEFERRED_DRAW_CALLS || vertexCountInDrawCalls_ + vertexCount > VERTEX_BUFFER_MAX) {
-		Flush();
-	}
-
-	// TODO: Is this the right thing to do?
-	if (prim == GE_PRIM_KEEP_PREVIOUS) {
-		prim = prevPrim_ != GE_PRIM_INVALID ? prevPrim_ : GE_PRIM_POINTS;
-	} else {
-		prevPrim_ = prim;
-	}
-
-	SetupVertexDecoder(vertType);
-
-	*bytesRead = vertexCount * dec_->VertexSize();
-	if ((vertexCount < 2 && prim > 0) || (vertexCount < 3 && prim > 2 && prim != GE_PRIM_RECTANGLES))
-		return;
-
-	DeferredDrawCall &dc = drawCalls[numDrawCalls];
-	dc.verts = verts;
-	dc.inds = inds;
-	dc.vertType = vertType;
-	dc.indexType = (vertType & GE_VTYPE_IDX_MASK) >> GE_VTYPE_IDX_SHIFT;
-	dc.prim = prim;
-	dc.vertexCount = vertexCount;
-
-	if (g_Config.bVertexCache) {
-		u32 dhash = dcid_;
-		dhash ^= (u32)(uintptr_t)verts;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)(uintptr_t)inds;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)vertType;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)vertexCount;
-		dhash = __rotl(dhash, 13);
-		dhash ^= (u32)prim;
-		dcid_ = dhash;
-	}
-
-	if (inds) {
-		GetIndexBounds(inds, vertexCount, vertType, &dc.indexLowerBound, &dc.indexUpperBound);
-	} else {
-		dc.indexLowerBound = 0;
-		dc.indexUpperBound = vertexCount - 1;
-	}
-
-	uvScale[numDrawCalls] = gstate_c.uv;
-
-	numDrawCalls++;
-	vertexCountInDrawCalls_ += vertexCount;
-
-	if (g_Config.bSoftwareSkinning && (vertType & GE_VTYPE_WEIGHT_MASK)) {
-		DecodeVertsStep(decoded, decodeCounter_, decodedVerts_);
-		decodeCounter_++;
-	}
-
-	if (prim == GE_PRIM_RECTANGLES && (gstate.getTextureAddress(0) & 0x3FFFFFFF) == (gstate.getFrameBufAddress() & 0x3FFFFFFF)) {
-		// Rendertarget == texture?
-		if (!g_Config.bDisableSlowFramebufEffects) {
-			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-			Flush();
-		}
-	}
-}
-
 void DrawEngineVulkan::DecodeVertsToPushBuffer(VulkanPushBuffer *push, uint32_t *bindOffset, VkBuffer *vkbuf) {
 	u8 *dest = decoded;

--- a/GPU/Vulkan/DrawEngineVulkan.h
+++ b/GPU/Vulkan/DrawEngineVulkan.h
@ -122,8 +122,6 @@ public:
 	DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *draw);
 	virtual ~DrawEngineVulkan();

-	void SubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead);
-
 	void SetShaderManager(ShaderManagerVulkan *shaderManager) {
 		shaderManager_ = shaderManager;
 	}
@ -157,9 +155,6 @@ public:
 	}

 	void DispatchFlush() override { Flush(); }
-	void DispatchSubmitPrim(void *verts, void *inds, GEPrimitiveType prim, int vertexCount, u32 vertType, int *bytesRead) override {
-		SubmitPrim(verts, inds, prim, vertexCount, vertType, bytesRead);
-	}

 	VkPipelineLayout GetPipelineLayout() const {
 		return pipelineLayout_;
--- a/GPU/Vulkan/GPU_Vulkan.cpp
+++ b/GPU/Vulkan/GPU_Vulkan.cpp
@ -46,31 +46,6 @@
 #include "Core/HLE/sceKernelInterrupt.h"
 #include "Core/HLE/sceGe.h"

-struct VulkanCommandTableEntry {
-	uint8_t cmd;
-	uint8_t flags;
-	uint64_t dirty;
-	GPU_Vulkan::CmdFunc func;
-};
-
-GPU_Vulkan::CommandInfo GPU_Vulkan::cmdInfo_[256];
-
-// This table gets crunched into a faster form by init.
-static const VulkanCommandTableEntry commandTable[] = {
-	// Changes that dirty the current texture.
-	{ GE_CMD_TEXSIZE0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPUCommon::Execute_TexSize0 },
-
-	// Changing the vertex type requires us to flush.
-	{ GE_CMD_VERTEXTYPE, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_VertexType },
-
-	{ GE_CMD_PRIM, FLAG_EXECUTE, 0, &GPU_Vulkan::Execute_Prim },
-	{ GE_CMD_BEZIER, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Bezier },
-	{ GE_CMD_SPLINE, FLAG_FLUSHBEFORE | FLAG_EXECUTE, 0, &GPUCommon::Execute_Spline },
-
-	// Changes that trigger data copies. Only flushing on change for LOADCLUT must be a bit of a hack...
-	{ GE_CMD_LOADCLUT, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTE, 0, &GPU_Vulkan::Execute_LoadClut },
-};
-
 GPU_Vulkan::GPU_Vulkan(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 	: GPUCommon(gfxCtx, draw),
 		vulkan_((VulkanContext *)gfxCtx->GetAPIContext()),
@ -111,46 +86,6 @@ GPU_Vulkan::GPU_Vulkan(GraphicsContext *gfxCtx, Draw::DrawContext *draw)
 		ERROR_LOG(G3D, "gstate has drifted out of sync!");
 	}

-	memset(cmdInfo_, 0, sizeof(cmdInfo_));
-
-	// Import both the global and local command tables, and check for dupes
-	std::set<u8> dupeCheck;
-	for (size_t i = 0; i < commonCommandTableSize; i++) {
-		const u8 cmd = commonCommandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commonCommandTable[i].flags | (commonCommandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commonCommandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-
-	for (size_t i = 0; i < ARRAY_SIZE(commandTable); i++) {
-		const u8 cmd = commandTable[i].cmd;
-		if (dupeCheck.find(cmd) != dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command table Dupe: %02x (%i)", (int)cmd, (int)cmd);
-		} else {
-			dupeCheck.insert(cmd);
-		}
-		cmdInfo_[cmd].flags |= (uint64_t)commandTable[i].flags | (commandTable[i].dirty << 8);
-		cmdInfo_[cmd].func = commandTable[i].func;
-		if ((cmdInfo_[cmd].flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) && !cmdInfo_[cmd].func) {
-			Crash();
-		}
-	}
-	// Find commands missing from the table.
-	for (int i = 0; i < 0xEF; i++) {
-		if (dupeCheck.find((u8)i) == dupeCheck.end()) {
-			ERROR_LOG(G3D, "Command missing from table: %02x (%i)", i, i);
-		}
-	}
-
-	UpdateCmdInfo();
-
 	BuildReportingInfo();
 	// Update again after init to be sure of any silly driver problems.
 	UpdateVsyncInterval(true);
@ -405,41 +340,11 @@ void GPU_Vulkan::UpdateVsyncInterval(bool force) {
 	// TODO
 }

-void GPU_Vulkan::UpdateCmdInfo() {
-	if (g_Config.bSoftwareSkinning) {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags &= ~FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexTypeSkinning;
-	} else {
-		cmdInfo_[GE_CMD_VERTEXTYPE].flags |= FLAG_FLUSHBEFOREONCHANGE;
-		cmdInfo_[GE_CMD_VERTEXTYPE].func = &GPUCommon::Execute_VertexType;
-	}
-}
-
 void GPU_Vulkan::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat format) {
 	host->GPUNotifyDisplay(framebuf, stride, format);
 	framebufferManager_->SetDisplayFramebuffer(framebuf, stride, format);
 }

-bool GPU_Vulkan::FramebufferDirty() {
-	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->dirtyAfterDisplay;
-		vfb->dirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
-bool GPU_Vulkan::FramebufferReallyDirty() {
-	VirtualFramebuffer *vfb = framebufferManager_->GetDisplayVFB();
-	if (vfb) {
-		bool dirty = vfb->reallyDirtyAfterDisplay;
-		vfb->reallyDirtyAfterDisplay = false;
-		return dirty;
-	}
-	return true;
-}
-
 void GPU_Vulkan::CopyDisplayToOutput() {
 	// Flush anything left over.
 	drawEngine_.Flush();
@ -451,44 +356,6 @@ void GPU_Vulkan::CopyDisplayToOutput() {
 	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE);
 }

-// Maybe should write this in ASM...
-void GPU_Vulkan::FastRunLoop(DisplayList &list) {
-	PROFILE_THIS_SCOPE("gpuloop");
-	const CommandInfo *cmdInfo = cmdInfo_;
-	int dc = downcount;
-	for (; dc > 0; --dc) {
-		// We know that display list PCs have the upper nibble == 0 - no need to mask the pointer
-		const u32 op = *(const u32 *)(Memory::base + list.pc);
-		const u32 cmd = op >> 24;
-		const CommandInfo &info = cmdInfo[cmd];
-		const u32 diff = op ^ gstate.cmdmem[cmd];
-		if (diff == 0) {
-			if (info.flags & FLAG_EXECUTE) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			}
-		} else {
-			uint64_t flags = info.flags;
-			if (flags & FLAG_FLUSHBEFOREONCHANGE) {
-				drawEngine_.Flush();
-			}
-			gstate.cmdmem[cmd] = op;
-			if (flags & (FLAG_EXECUTE | FLAG_EXECUTEONCHANGE)) {
-				downcount = dc;
-				(this->*info.func)(op, diff);
-				dc = downcount;
-			} else {
-				uint64_t dirty = flags >> 8;
-				if (dirty)
-					gstate_c.Dirty(dirty);
-			}
-		}
-		list.pc += 4;
-	}
-	downcount = 0;
-}
-
 void GPU_Vulkan::FinishDeferred() {
 	drawEngine_.FinishDeferred();
 }
@ -520,85 +387,6 @@ void GPU_Vulkan::ExecuteOp(u32 op, u32 diff) {
 	}
 }

-void GPU_Vulkan::Execute_Prim(u32 op, u32 diff) {
-	// This drives all drawing. All other state we just buffer up, then we apply it only
-	// when it's time to draw. As most PSP games set state redundantly ALL THE TIME, this is a huge optimization.
-
-	PROFILE_THIS_SCOPE("execprim");
-
-	u32 data = op & 0xFFFFFF;
-	u32 count = data & 0xFFFF;
-	if (count == 0)
-		return;
-
-	// Upper bits are ignored.
-	GEPrimitiveType prim = static_cast<GEPrimitiveType>((data >> 16) & 7);
-	SetDrawType(DRAW_PRIM, prim);
-
-	// Discard AA lines as we can't do anything that makes sense with these anyway. The SW plugin might, though.
-	if (gstate.isAntiAliasEnabled()) {
-		// Discard AA lines in DOA
-		if (prim == GE_PRIM_LINE_STRIP)
-			return;
-		// Discard AA lines in Summon Night 5
-		if ((prim == GE_PRIM_LINES) && gstate.isSkinningEnabled())
-			return;
-	}
-
-	// This also makes skipping drawing very effective.
-	framebufferManager_->SetRenderFrameBuffer(gstate_c.IsDirty(DIRTY_FRAMEBUF), gstate_c.skipDrawReason);
-
-	if (gstate_c.skipDrawReason & (SKIPDRAW_SKIPFRAME | SKIPDRAW_NON_DISPLAYED_FB)) {
-		drawEngine_.SetupVertexDecoder(gstate.vertType);  // Do we still need to do this?
-		// Rough estimate, not sure what's correct.
-		cyclesExecuted += EstimatePerVertexCost() * count;
-		return;
-	}
-
-	if (!Memory::IsValidAddress(gstate_c.vertexAddr)) {
-		ERROR_LOG_REPORT(G3D, "Bad vertex address %08x!", gstate_c.vertexAddr);
-		return;
-	}
-
-	void *verts = Memory::GetPointerUnchecked(gstate_c.vertexAddr);
-	void *inds = 0;
-	u32 vertexType = gstate.vertType;
-	if ((gstate.vertType & GE_VTYPE_IDX_MASK) != GE_VTYPE_IDX_NONE) {
-		u32 indexAddr = gstate_c.indexAddr;
-		if (!Memory::IsValidAddress(indexAddr)) {
-			ERROR_LOG_REPORT(G3D, "Bad index address %08x!", indexAddr);
-			return;
-		}
-		inds = Memory::GetPointerUnchecked(indexAddr);
-	}
-
-#ifndef MOBILE_DEVICE
-	if (prim > GE_PRIM_RECTANGLES) {
-		ERROR_LOG_REPORT_ONCE(reportPrim, G3D, "Unexpected prim type: %d", prim);
-	}
-#endif
-
-	if (gstate_c.dirty & DIRTY_VERTEXSHADER_STATE) {
-		vertexCost_ = EstimatePerVertexCost();
-	}
-	gpuStats.vertexGPUCycles += vertexCost_ * count;
-	cyclesExecuted += vertexCost_* count;
-
-	int bytesRead = 0;
-	UpdateUVScaleOffset();
-	drawEngine_.SubmitPrim(verts, inds, prim, count, vertexType, &bytesRead);
-
-	// After drawing, we advance the vertexAddr (when non indexed) or indexAddr (when indexed).
-	// Some games rely on this, they don't bother reloading VADDR and IADDR.
-	// The VADDR/IADDR registers are NOT updated.
-	AdvanceVerts(vertexType, count, bytesRead);
-}
-
-void GPU_Vulkan::Execute_LoadClut(u32 op, u32 diff) {
-	gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-	textureCacheVulkan_->LoadClut(gstate.getClutAddress(), gstate.getClutLoadBytes());
-}
-
 void GPU_Vulkan::InitDeviceObjects() {
 	ILOG("GPU_Vulkan::InitDeviceObjects");
 	// Initialize framedata
--- a/GPU/Vulkan/GPU_Vulkan.h
+++ b/GPU/Vulkan/GPU_Vulkan.h
@ -36,7 +36,7 @@ public:
 	~GPU_Vulkan();

 	// This gets called on startup and when we get back from settings.
-	void CheckGPUFeatures();
+	void CheckGPUFeatures() override;

 	// These are where we can reset command buffers etc.
 	void BeginHostFrame() override;
@ -54,23 +54,12 @@ public:
 	void DoState(PointerWrap &p) override;

 	void ClearShaderCache() override;
-	bool FramebufferDirty() override;
-	bool FramebufferReallyDirty() override;

 	void GetReportingInfo(std::string &primaryInfo, std::string &fullInfo) override {
 		primaryInfo = reportingPrimaryInfo_;
 		fullInfo = reportingFullInfo_;
 	}

-	typedef void (GPU_Vulkan::*CmdFunc)(u32 op, u32 diff);
-	struct CommandInfo {
-		uint64_t flags;
-		GPU_Vulkan::CmdFunc func;
-	};
-	
-	void Execute_Prim(u32 op, u32 diff);
-	void Execute_LoadClut(u32 op, u32 diff);
-
 	// Using string because it's generic - makes no assumptions on the size of the shader IDs of this backend.
 	std::vector<std::string> DebugGetShaderIDs(DebugShaderType shader) override;
 	std::string DebugGetShaderString(std::string id, DebugShaderType shader, DebugShaderStringType stringType) override;
@ -80,7 +69,6 @@ public:
 	}

 protected:
-	void FastRunLoop(DisplayList &list) override;
 	void FinishDeferred() override;

 private:
@ -93,13 +81,10 @@ private:
 	void CopyDisplayToOutput() override;
 	void Reinitialize() override;
 	inline void UpdateVsyncInterval(bool force);
-	void UpdateCmdInfo();

 	void InitDeviceObjects();
 	void DestroyDeviceObjects();

-	static CommandInfo cmdInfo_[256];
-
 	VulkanContext *vulkan_;
 	FramebufferManagerVulkan *framebufferManagerVulkan_;
 	TextureCacheVulkan *textureCacheVulkan_;
@ -112,8 +97,6 @@ private:
 	// Manages state and pipeline objects
 	PipelineManagerVulkan *pipelineManager_;

-	int vertexCost_ = 0;
-
 	std::string reportingPrimaryInfo_;
 	std::string reportingFullInfo_;