From 7483923d073cb32207353d122b3226f340b7c011 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 12:52:06 -0700
Subject: [PATCH 01/35] softgpu: Correct clear rect off by one issues.

---
 GPU/Software/Rasterizer.cpp | 15 +++++++++++----
 test.py                     |  6 +++---
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index 2336ff6bef..b4d13fa6f6 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -1136,13 +1136,20 @@ void DrawPoint(const VertexData &v0, const BinCoords &range, const RasterizerSta
 }
 
 void ClearRectangle(const VertexData &v0, const VertexData &v1, const BinCoords &range, const RasterizerState &state) {
-	DrawingCoords pprime = TransformUnit::ScreenToDrawing(range.x1, range.y1);
-	DrawingCoords pend = TransformUnit::ScreenToDrawing(range.x2, range.y2);
+	int entireX1 = std::min(v0.screenpos.x, v1.screenpos.x);
+	int entireY1 = std::min(v0.screenpos.y, v1.screenpos.y);
+	int entireX2 = std::max(v0.screenpos.x, v1.screenpos.x) - 1;
+	int entireY2 = std::max(v0.screenpos.y, v1.screenpos.y) - 1;
+	int minX = std::max(entireX1, range.x1) | (SCREEN_SCALE_FACTOR / 2 - 1);
+	int minY = std::max(entireY1, range.y1) | (SCREEN_SCALE_FACTOR / 2 - 1);
+	int maxX = std::min(entireX2, range.x2);
+	int maxY = std::min(entireY2, range.y2);
+	const DrawingCoords pprime = TransformUnit::ScreenToDrawing(minX, minY);
+	const DrawingCoords pend = TransformUnit::ScreenToDrawing(maxX, maxY);
 	auto &pixelID = state.pixelID;
 	auto &samplerID = state.samplerID;
 
-	// Min and max are in PSP fixed point screen coordinates, 16 here is for the 4 subpixel bits.
-	const int w = (range.x2 - range.x1 + 1) / SCREEN_SCALE_FACTOR;
+	const int w = pend.x - pprime.x + 1;
 	if (w <= 0)
 		return;
 
diff --git a/test.py b/test.py
index 47f5c36034..d8585b50dd 100755
--- a/test.py
+++ b/test.py
@@ -147,6 +147,7 @@ tests_good = [
   "gpu/commands/blend",
   "gpu/commands/blend565",
   "gpu/commands/blocktransfer",
+  "gpu/commands/cull",
   "gpu/commands/fog",
   "gpu/commands/material",
   "gpu/displaylist/alignment",
@@ -159,6 +160,7 @@ tests_good = [
   "gpu/ge/queue",
   "gpu/primitives/indices",
   "gpu/primitives/invalidprim",
+  "gpu/primitives/points",
   "gpu/primitives/trianglefan",
   "gpu/primitives/trianglestrip",
   "gpu/primitives/triangles",
@@ -181,6 +183,7 @@ tests_good = [
   "gpu/texfunc/replace",
   "gpu/textures/mipmap",
   "gpu/textures/rotate",
+  "gpu/vertices/colors",
   "hash/hash",
   "hle/check_not_used_uids",
   "intr/intr",
@@ -387,7 +390,6 @@ tests_next = [
   "font/shadowglyphimageclip",
   "font/shadowinfo",
   "gpu/clipping/guardband",
-  "gpu/commands/cull",
   "gpu/commands/light",
   "gpu/complex/complex",
   "gpu/depth/precision",
@@ -404,7 +406,6 @@ tests_next = [
   "gpu/primitives/immediate",
   "gpu/primitives/lines",
   "gpu/primitives/linestrip",
-  "gpu/primitives/points",
   "gpu/primitives/rectangles",
   "gpu/primitives/spline",
   "gpu/reflection/reflection",
@@ -415,7 +416,6 @@ tests_next = [
   "gpu/simple/simple",
   "gpu/textures/size",
   "gpu/triangle/triangle",
-  "gpu/vertices/colors",
   "gpu/vertices/texcoords",
   "intr/registersub",
   "intr/releasesub",

From 355c18512fa9354f12b73549c8e5fc4ccbb4a14e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Tue, 20 Sep 2022 22:31:54 +0200
Subject: [PATCH 02/35] Fix BGRA issue on D3D with GPU CLUT textures

---
 GPU/Common/TextureCacheCommon.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 127d491010..97efd42cbe 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -418,10 +418,13 @@ TexCacheEntry *TextureCacheCommon::SetTexture() {
 	// Should probably revisit how this works..
 	gstate_c.SetNeedShaderTexclamp(false);
 	gstate_c.skipDrawReason &= ~SKIPDRAW_BAD_FB_TEXTURE;
-	if (gstate_c.bgraTexture != isBgraBackend_) {
+
+	bool isBgraTexture = isBgraBackend_ && !hasClutGPU;
+
+	if (gstate_c.bgraTexture != isBgraTexture) {
 		gstate_c.Dirty(DIRTY_FRAGMENTSHADER_STATE);
 	}
-	gstate_c.bgraTexture = isBgraBackend_;
+	gstate_c.bgraTexture = isBgraTexture;
 
 	if (entryIter != cache_.end()) {
 		entry = entryIter->second.get();

From 78a3925198bcefd9c6577cd33671240381fd03f3 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 13:43:19 -0700
Subject: [PATCH 03/35] softgpu: Fix display framebuffer read.

---
 GPU/Software/SoftGpu.cpp | 21 ++++++++++++---------
 GPU/Software/SoftGpu.h   |  2 +-
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp
index 0f2d3bcda1..ecef1ab647 100644
--- a/GPU/Software/SoftGpu.cpp
+++ b/GPU/Software/SoftGpu.cpp
@@ -490,14 +490,16 @@ void SoftGPU::SetDisplayFramebuffer(u32 framebuf, u32 stride, GEBufferFormat for
 
 DSStretch g_DarkStalkerStretch;
 
-void SoftGPU::ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, u8 *overrideData) {
+void SoftGPU::ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, const uint16_t *overrideData) {
 	// TODO: This should probably be converted in a shader instead..
 	fbTexBuffer_.resize(srcwidth * srcheight);
-	FormatBuffer displayBuffer;
-	displayBuffer.data = overrideData ? overrideData : Memory::GetPointerWrite(displayFramebuf_);
+	const uint16_t *displayBuffer = overrideData;
+	if (!displayBuffer)
+		displayBuffer = (const uint16_t *)Memory::GetPointer(displayFramebuf_);
+
 	for (int y = 0; y < srcheight; ++y) {
 		u32 *buf_line = &fbTexBuffer_[y * srcwidth];
-		const u16 *fb_line = &displayBuffer.as16[y * displayStride_];
+		const u16 *fb_line = &displayBuffer[y * displayStride_];
 
 		switch (displayFormat_) {
 		case GE_FORMAT_565:
@@ -557,7 +559,7 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
 	bool hasPostShader = presentation_ && presentation_->HasPostShader();
 
 	if (PSP_CoreParameter().compat.flags().DarkStalkersPresentHack && displayFormat_ == GE_FORMAT_5551 && g_DarkStalkerStretch != DSStretch::Off) {
-		u8 *data = Memory::GetPointerWrite(0x04088000);
+		const u8 *data = Memory::GetPointerWrite(0x04088000);
 		bool fillDesc = true;
 		if (draw_->GetDataFormatSupport(Draw::DataFormat::A1B5G5R5_UNORM_PACK16) & Draw::FMT_TEXTURE) {
 			// The perfect one.
@@ -567,7 +569,7 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
 			desc.format = Draw::DataFormat::A1R5G5B5_UNORM_PACK16;
 			outputFlags |= OutputFlags::RB_SWIZZLE;
 		} else {
-			ConvertTextureDescFrom16(desc, srcwidth, srcheight, data);
+			ConvertTextureDescFrom16(desc, srcwidth, srcheight, (const uint16_t *)data);
 			fillDesc = false;
 		}
 		if (fillDesc) {
@@ -586,13 +588,13 @@ void SoftGPU::CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight) {
 		hasImage = false;
 		u1 = 1.0f;
 	} else if (displayFormat_ == GE_FORMAT_8888) {
-		u8 *data = Memory::GetPointerWrite(displayFramebuf_);
+		const u8 *data = Memory::GetPointer(displayFramebuf_);
 		desc.width = displayStride_ == 0 ? srcwidth : displayStride_;
 		desc.height = srcheight;
 		desc.initData.push_back(data);
 		desc.format = Draw::DataFormat::R8G8B8A8_UNORM;
 	} else if (displayFormat_ == GE_FORMAT_5551) {
-		const u8 *data = Memory::GetPointerWrite(displayFramebuf_);
+		const u8 *data = Memory::GetPointer(displayFramebuf_);
 		bool fillDesc = true;
 		if (draw_->GetDataFormatSupport(Draw::DataFormat::A1B5G5R5_UNORM_PACK16) & Draw::FMT_TEXTURE) {
 			// The perfect one.
@@ -1247,18 +1249,19 @@ bool SoftGPU::GetCurrentFramebuffer(GPUDebugBuffer &buffer, GPUDebugFramebufferT
 	int stride = gstate.FrameBufStride();
 	DrawingCoords size = GetTargetSize(stride);
 	GEBufferFormat fmt = gstate.FrameBufFormat();
+	const u8 *src = fb.data;
 
 	if (type == GPU_DBG_FRAMEBUF_DISPLAY) {
 		size.x = 480;
 		size.y = 272;
 		stride = displayStride_;
 		fmt = displayFormat_;
+		src = Memory::GetPointer(displayFramebuf_);
 	}
 
 	buffer.Allocate(size.x, size.y, fmt);
 
 	const int depth = fmt == GE_FORMAT_8888 ? 4 : 2;
-	const u8 *src = fb.data;
 	u8 *dst = buffer.GetData();
 	const int byteWidth = size.x * depth;
 	for (int16_t y = 0; y < size.y; ++y) {
diff --git a/GPU/Software/SoftGpu.h b/GPU/Software/SoftGpu.h
index e90f7c0fb8..18fa23118d 100644
--- a/GPU/Software/SoftGpu.h
+++ b/GPU/Software/SoftGpu.h
@@ -194,7 +194,7 @@ public:
 protected:
 	void FastRunLoop(DisplayList &list) override;
 	void CopyToCurrentFboFromDisplayRam(int srcwidth, int srcheight);
-	void ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, u8 *overrideData = nullptr);
+	void ConvertTextureDescFrom16(Draw::TextureDesc &desc, int srcwidth, int srcheight, const uint16_t *overrideData = nullptr);
 
 private:
 	void MarkDirty(uint32_t addr, uint32_t stride, uint32_t height, GEBufferFormat fmt, SoftGPUVRAMDirty value);

From bf86f00df87c16aea79317fa036a4da399bb252f Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 14:01:36 -0700
Subject: [PATCH 04/35] GPU: Correct display framebuffer reading.

The displayFramebuf_ might not be set yet, but that doesn't mean we want
an INVALID format and zero stride.  We might also be rendering to a
different target, but still want the display.
---
 GPU/Common/FramebufferManagerCommon.cpp | 8 ++++----
 GPU/Common/FramebufferManagerCommon.h   | 9 ++++++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index ed43996676..dd01c3c499 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -1021,7 +1021,7 @@ void FramebufferManagerCommon::UpdateFromMemory(u32 addr, int size) {
 	// TODO: Could go through all FBOs, but probably not important?
 	// TODO: Could also check for inner changes, but video is most important.
 	// TODO: This shouldn't care if it's a display framebuf or not, should work exactly the same.
-	bool isDisplayBuf = addr == DisplayFramebufAddr() || addr == PrevDisplayFramebufAddr();
+	bool isDisplayBuf = addr == CurrentDisplayFramebufAddr() || addr == PrevDisplayFramebufAddr();
 	// TODO: Deleting the FBO is a heavy hammer solution, so let's only do it if it'd help.
 	if (!Memory::IsValidAddress(displayFramebufPtr_))
 		return;
@@ -2182,7 +2182,7 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS
 	// We may still do a partial block draw below if this doesn't pass.
 	if (!useBufferedRendering_ && dstStride >= 480 && width >= 480 && height == 272) {
 		bool isPrevDisplayBuffer = PrevDisplayFramebufAddr() == dstBasePtr;
-		bool isDisplayBuffer = DisplayFramebufAddr() == dstBasePtr;
+		bool isDisplayBuffer = CurrentDisplayFramebufAddr() == dstBasePtr;
 		if (isPrevDisplayBuffer || isDisplayBuffer) {
 			FlushBeforeCopy();
 			DrawFramebufferToOutput(Memory::GetPointerUnchecked(dstBasePtr), dstStride, displayFormat_);
@@ -2357,8 +2357,8 @@ void FramebufferManagerCommon::ShowScreenResolution() {
 // * Save state screenshots(could probably be async but need to manage the stall.)
 bool FramebufferManagerCommon::GetFramebuffer(u32 fb_address, int fb_stride, GEBufferFormat format, GPUDebugBuffer &buffer, int maxScaleFactor) {
 	VirtualFramebuffer *vfb = currentRenderVfb_;
-	if (!vfb) {
-		vfb = GetVFBAt(fb_address);
+	if (!vfb || vfb->fb_address != fb_address) {
+		vfb = ResolveVFB(fb_address, fb_stride, format);
 	}
 
 	if (!vfb) {
diff --git a/GPU/Common/FramebufferManagerCommon.h b/GPU/Common/FramebufferManagerCommon.h
index 5ef9159868..a38cc328d5 100644
--- a/GPU/Common/FramebufferManagerCommon.h
+++ b/GPU/Common/FramebufferManagerCommon.h
@@ -335,15 +335,18 @@ public:
 	u32 PrevDisplayFramebufAddr() const {
 		return prevDisplayFramebuf_ ? prevDisplayFramebuf_->fb_address : 0;
 	}
-	u32 DisplayFramebufAddr() const {
+	u32 CurrentDisplayFramebufAddr() const {
 		return displayFramebuf_ ? displayFramebuf_->fb_address : 0;
 	}
 
+	u32 DisplayFramebufAddr() const {
+		return displayFramebufPtr_;
+	}
 	u32 DisplayFramebufStride() const {
-		return displayFramebuf_ ? displayStride_ : 0;
+		return displayStride_;
 	}
 	GEBufferFormat DisplayFramebufFormat() const {
-		return displayFramebuf_ ? displayFormat_ : GE_FORMAT_INVALID;
+		return displayFormat_;
 	}
 
 	bool UseBufferedRendering() const {

From 1dc1b2c35b2ad3dd5bee02013ad3b772ea13b794 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 14:05:50 -0700
Subject: [PATCH 05/35] headless: Use display buf for compare screenshot.

This is what the test actually uses too.
---
 headless/StubHost.cpp | 2 +-
 test.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/headless/StubHost.cpp b/headless/StubHost.cpp
index b3896df478..03a27ab1bf 100644
--- a/headless/StubHost.cpp
+++ b/headless/StubHost.cpp
@@ -46,7 +46,7 @@ void HeadlessHost::SendDebugScreenshot(const u8 *pixbuf, u32 w, u32 h) {
 	const static u32 FRAME_HEIGHT = 272;
 
 	GPUDebugBuffer buffer;
-	gpuDebug->GetCurrentFramebuffer(buffer, GPU_DBG_FRAMEBUF_RENDER);
+	gpuDebug->GetCurrentFramebuffer(buffer, GPU_DBG_FRAMEBUF_DISPLAY);
 	const std::vector<u32> pixels = TranslateDebugBufferToCompare(&buffer, 512, 272);
 
 	ScreenshotComparer comparer(pixels, FRAME_STRIDE, FRAME_WIDTH, FRAME_HEIGHT);
diff --git a/test.py b/test.py
index d8585b50dd..e2967f06a2 100755
--- a/test.py
+++ b/test.py
@@ -150,6 +150,7 @@ tests_good = [
   "gpu/commands/cull",
   "gpu/commands/fog",
   "gpu/commands/material",
+  "gpu/complex/complex",
   "gpu/displaylist/alignment",
   "gpu/dither/dither",
   "gpu/filtering/mipmaplinear",
@@ -391,7 +392,6 @@ tests_next = [
   "font/shadowinfo",
   "gpu/clipping/guardband",
   "gpu/commands/light",
-  "gpu/complex/complex",
   "gpu/depth/precision",
   "gpu/displaylist/state",
   "gpu/filtering/linear",

From cefef3b4f91b195a227449d12a1f04a0d5cdbd72 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 14:29:33 -0700
Subject: [PATCH 06/35] softgpu: Narrow blend check for fast path further.

See #15756, frame was largely black because of a full screen blend
rectangle intended to brighten the screen slightly (I assume.)
---
 GPU/Software/RasterizerRectangle.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/GPU/Software/RasterizerRectangle.cpp b/GPU/Software/RasterizerRectangle.cpp
index ff5b9c85d9..8c7b4b171f 100644
--- a/GPU/Software/RasterizerRectangle.cpp
+++ b/GPU/Software/RasterizerRectangle.cpp
@@ -93,7 +93,10 @@ static inline bool AlphaTestIsNeedless(const PixelFuncID &pixelID) {
 	case GE_COMP_NOTEQUAL:
 	case GE_COMP_GREATER:
 	case GE_COMP_GEQUAL:
-		return pixelID.alphaBlend && pixelID.alphaTestRef == 0 && !pixelID.hasAlphaTestMask;
+		if (pixelID.alphaTestRef != 0 || pixelID.hasAlphaTestMask)
+			return false;
+		// DrawSinglePixel5551 assumes it can take the src color directly if full alpha.
+		return pixelID.alphaBlend && pixelID.AlphaBlendSrc() == PixelBlendFactor::SRCALPHA && pixelID.AlphaBlendDst() == PixelBlendFactor::INVSRCALPHA;
 	}
 
 	return false;

From 0a24004eac95272bf7d2bf647a20518d6a14b80d Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 15:12:03 -0700
Subject: [PATCH 07/35] GPU: Account for w properly in lines, fixing width.

See #15756.
---
 GPU/Common/SoftwareTransformCommon.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/GPU/Common/SoftwareTransformCommon.cpp b/GPU/Common/SoftwareTransformCommon.cpp
index 98b5adc079..562769d660 100644
--- a/GPU/Common/SoftwareTransformCommon.cpp
+++ b/GPU/Common/SoftwareTransformCommon.cpp
@@ -790,13 +790,13 @@ void SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *&inds,
 			float yoff = addWidth.y * dy;
 
 			// bottom right
-			trans[0].CopyFromWithOffset(transVtx2, xoff, yoff);
+			trans[0].CopyFromWithOffset(transVtx2, xoff * transVtx2.pos_w, yoff * transVtx2.pos_w);
 			// top right
-			trans[1].CopyFromWithOffset(transVtx1, xoff, yoff);
+			trans[1].CopyFromWithOffset(transVtx1, xoff * transVtx1.pos_w, yoff * transVtx1.pos_w);
 			// top left
-			trans[2].CopyFromWithOffset(transVtx1, -xoff, -yoff);
+			trans[2].CopyFromWithOffset(transVtx1, -xoff * transVtx1.pos_w, -yoff * transVtx1.pos_w);
 			// bottom left
-			trans[3].CopyFromWithOffset(transVtx2, -xoff, -yoff);
+			trans[3].CopyFromWithOffset(transVtx2, -xoff * transVtx2.pos_w, -yoff * transVtx2.pos_w);
 
 			// Triangle: BR-TR-TL
 			indsOut[0] = i * 2 + 0;
@@ -835,17 +835,17 @@ void SoftwareTransform::ExpandLines(int vertexCount, int &maxIndex, u16 *&inds,
 
 			// bottom right
 			trans[0] = transVtxBL;
-			trans[0].x += addWidth.x * dx;
-			trans[0].y += addWidth.y * dy;
-			trans[0].u += addWidth.x * du;
-			trans[0].v += addWidth.y * dv;
+			trans[0].x += addWidth.x * dx * trans[0].pos_w;
+			trans[0].y += addWidth.y * dy * trans[0].pos_w;
+			trans[0].u += addWidth.x * du * trans[0].uv_w;
+			trans[0].v += addWidth.y * dv * trans[0].uv_w;
 
 			// top right
 			trans[1] = transVtxTL;
-			trans[1].x += addWidth.x * dx;
-			trans[1].y += addWidth.y * dy;
-			trans[1].u += addWidth.x * du;
-			trans[1].v += addWidth.y * dv;
+			trans[1].x += addWidth.x * dx * trans[1].pos_w;
+			trans[1].y += addWidth.y * dy * trans[1].pos_w;
+			trans[1].u += addWidth.x * du * trans[1].uv_w;
+			trans[1].v += addWidth.y * dv * trans[1].uv_w;
 
 			// top left
 			trans[2] = transVtxTL;

From 2fc7f72d72c783e1737c18d9707ba0463d7340bc Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 15:55:39 -0700
Subject: [PATCH 08/35] GPU: Clip clamped depth accounting for perspective.

---
 GPU/Common/VertexShaderGenerator.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 16cce87d0f..adce008676 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -1206,14 +1206,14 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			// Everywhere else, it's 0 -> 1, simpler.
 			WRITE(p, "  if (u_depthRange.y >= 1.0) {\n");
 		}
-		WRITE(p, "    %sgl_ClipDistance%s = integerZ;\n", compat.vsOutPrefix, clip0);
+		WRITE(p, "    %sgl_ClipDistance%s = integerZ * outPos.w;\n", compat.vsOutPrefix, clip0);
 		WRITE(p, "  } else {\n");
 		WRITE(p, "    %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip0);
 		WRITE(p, "  }\n");
 
 		// This is similar, but for maxz when it's below 65535.0.  -1/0 don't matter here.
 		WRITE(p, "  if (u_depthRange.x + u_depthRange.y <= 65534.0) {\n");
-		WRITE(p, "    %sgl_ClipDistance%s = 65535.0 - integerZ;\n", compat.vsOutPrefix, clip1);
+		WRITE(p, "    %sgl_ClipDistance%s = (65535.0 - integerZ) * outPos.w;\n", compat.vsOutPrefix, clip1);
 		WRITE(p, "  } else {\n");
 		WRITE(p, "    %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip1);
 		WRITE(p, "  }\n");

From f8d29fdc1a8b4b703ee9466e1127b9b571e4c64a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 16:21:17 -0700
Subject: [PATCH 09/35] GPU: Simplify depth clamped clip planes.

There's no need to think about the scaled Z if we're using w anyway, just
use the existing Z clipping.
---
 GPU/Common/VertexShaderGenerator.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index adce008676..6e3f3185b3 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -1189,37 +1189,34 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 			WRITE(p, "  %sv_fogdepth = (viewPos.z + u_fogcoef.x) * u_fogcoef.y;\n", compat.vsOutPrefix);
 	}
 
-	if (clipClampedDepth || (vertexRangeCulling && !IsVRBuild())) {
-		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
-	}
-
 	if (clipClampedDepth) {
 		const char *clip0 = compat.shaderLanguage == HLSL_D3D11 ? ".x" : "[0]";
 		const char *clip1 = compat.shaderLanguage == HLSL_D3D11 ? ".y" : "[1]";
-		WRITE(p, "  mediump float integerZ = projPos.z * u_depthRange.x + u_depthRange.y;\n");
 
 		// This should clip against minz, but only when it's above zero.
 		if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
 			// On OpenGL/GLES, these values account for the -1 -> 1 range.
 			WRITE(p, "  if (u_depthRange.y - u_depthRange.x >= 1.0) {\n");
+			WRITE(p, "    %sgl_ClipDistance%s = outPos.w + outPos.z;\n", compat.vsOutPrefix, clip0);
 		} else {
 			// Everywhere else, it's 0 -> 1, simpler.
 			WRITE(p, "  if (u_depthRange.y >= 1.0) {\n");
+			WRITE(p, "    %sgl_ClipDistance%s = outPos.z;\n", compat.vsOutPrefix, clip0);
 		}
-		WRITE(p, "    %sgl_ClipDistance%s = integerZ * outPos.w;\n", compat.vsOutPrefix, clip0);
 		WRITE(p, "  } else {\n");
 		WRITE(p, "    %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip0);
 		WRITE(p, "  }\n");
 
 		// This is similar, but for maxz when it's below 65535.0.  -1/0 don't matter here.
 		WRITE(p, "  if (u_depthRange.x + u_depthRange.y <= 65534.0) {\n");
-		WRITE(p, "    %sgl_ClipDistance%s = (65535.0 - integerZ) * outPos.w;\n", compat.vsOutPrefix, clip1);
+		WRITE(p, "    %sgl_ClipDistance%s = outPos.w - outPos.z;\n", compat.vsOutPrefix, clip1);
 		WRITE(p, "  } else {\n");
 		WRITE(p, "    %sgl_ClipDistance%s = 0.0;\n", compat.vsOutPrefix, clip1);
 		WRITE(p, "  }\n");
 	}
 
 	if (vertexRangeCulling && !IsVRBuild()) {
+		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
 		WRITE(p, "  float projZ = (projPos.z - u_depthRange.z) * u_depthRange.w;\n");
 		// Vertex range culling doesn't happen when Z clips, note sign of w is important.
 		WRITE(p, "  if (u_cullRangeMin.w <= 0.0 || projZ * outPos.w > -outPos.w) {\n");

From 79c5c93d351dd2eed359147f0e546e7ad0b1bb4d Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 20 Sep 2022 19:25:32 -0700
Subject: [PATCH 10/35] Kernel: Match index lookup behavior for tls.

It might be that the uids it returns always follow this format.
This makes the test almost pass, aside from psplink using more memory (test
should be adjusted to compensate.)
---
 Core/HLE/sceKernelMemory.cpp | 119 ++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 49 deletions(-)

diff --git a/Core/HLE/sceKernelMemory.cpp b/Core/HLE/sceKernelMemory.cpp
index 935cbc8482..318cae7d0c 100644
--- a/Core/HLE/sceKernelMemory.cpp
+++ b/Core/HLE/sceKernelMemory.cpp
@@ -2174,68 +2174,89 @@ int sceKernelDeleteTlspl(SceUID uid)
 	return error;
 }
 
-int sceKernelGetTlsAddr(SceUID uid)
-{
-	// TODO: Allocate downward if PSP_TLSPL_ATTR_HIGHMEM?
-	DEBUG_LOG(SCEKERNEL, "sceKernelGetTlsAddr(%08x)", uid);
+struct FindTLSByIndexArg {
+	int index;
+	TLSPL *result = nullptr;
+};
 
+static bool FindTLSByIndex(TLSPL *possible, FindTLSByIndexArg *state) {
+	if (possible->ntls.index == state->index) {
+		state->result = possible;
+		return false;
+	}
+	return true;
+}
+
+int sceKernelGetTlsAddr(SceUID uid) {
 	if (!__KernelIsDispatchEnabled() || __IsInInterrupt())
-		return 0;
+		return hleLogWarning(SCEKERNEL, 0, "dispatch disabled");
 
 	u32 error;
 	TLSPL *tls = kernelObjects.Get<TLSPL>(uid, error);
-	if (tls)
-	{
-		SceUID threadID = __KernelGetCurThread();
-		int allocBlock = -1;
-		bool needsClear = false;
+	if (!tls) {
+		if (uid < 0)
+			return hleLogError(SCEKERNEL, 0, "tlspl not found");
 
-		// If the thread already has one, return it.
+		// There's this weird behavior where it looks up by index.  Maybe we shouldn't use uids...
+		if (!tlsplUsedIndexes[(uid >> 3) & 15])
+			return hleLogError(SCEKERNEL, 0, "tlspl not found");
+
+		FindTLSByIndexArg state;
+		state.index = (uid >> 3) & 15;
+		kernelObjects.Iterate<TLSPL>(&FindTLSByIndex, &state);
+		if (!state.result)
+			return hleLogError(SCEKERNEL, 0, "tlspl not found");
+
+		tls = state.result;
+	}
+
+	SceUID threadID = __KernelGetCurThread();
+	int allocBlock = -1;
+	bool needsClear = false;
+
+	// If the thread already has one, return it.
+	for (size_t i = 0; i < tls->ntls.totalBlocks && allocBlock == -1; ++i)
+	{
+		if (tls->usage[i] == threadID)
+			allocBlock = (int) i;
+	}
+
+	if (allocBlock == -1)
+	{
 		for (size_t i = 0; i < tls->ntls.totalBlocks && allocBlock == -1; ++i)
 		{
-			if (tls->usage[i] == threadID)
-				allocBlock = (int) i;
+			// The PSP doesn't give the same block out twice in a row, even if freed.
+			if (tls->usage[tls->next] == 0)
+				allocBlock = tls->next;
+			tls->next = (tls->next + 1) % tls->ntls.totalBlocks;
 		}
 
-		if (allocBlock == -1)
+		if (allocBlock != -1)
 		{
-			for (size_t i = 0; i < tls->ntls.totalBlocks && allocBlock == -1; ++i)
-			{
-				// The PSP doesn't give the same block out twice in a row, even if freed.
-				if (tls->usage[tls->next] == 0)
-					allocBlock = tls->next;
-				tls->next = (tls->next + 1) % tls->ntls.totalBlocks;
-			}
-
-			if (allocBlock != -1)
-			{
-				tls->usage[allocBlock] = threadID;
-				tlsplThreadEndChecks.insert(std::make_pair(threadID, uid));
-				--tls->ntls.freeBlocks;
-				needsClear = true;
-			}
+			tls->usage[allocBlock] = threadID;
+			tlsplThreadEndChecks.insert(std::make_pair(threadID, uid));
+			--tls->ntls.freeBlocks;
+			needsClear = true;
 		}
-
-		if (allocBlock == -1)
-		{
-			tls->waitingThreads.push_back(threadID);
-			__KernelWaitCurThread(WAITTYPE_TLSPL, uid, 1, 0, false, "allocate tls");
-			return 0;
-		}
-
-		u32 alignedSize = (tls->ntls.blockSize + tls->alignment - 1) & ~(tls->alignment - 1);
-		u32 allocAddress = tls->address + allocBlock * alignedSize;
-		NotifyMemInfo(MemBlockFlags::SUB_ALLOC, allocAddress, tls->ntls.blockSize, "TlsAddr");
-
-		// We clear the blocks upon first allocation (and also when they are freed, both are necessary.)
-		if (needsClear) {
-			Memory::Memset(allocAddress, 0, tls->ntls.blockSize, "TlsAddr");
-		}
-
-		return allocAddress;
 	}
-	else
-		return 0;
+
+	if (allocBlock == -1)
+	{
+		tls->waitingThreads.push_back(threadID);
+		__KernelWaitCurThread(WAITTYPE_TLSPL, uid, 1, 0, false, "allocate tls");
+		return hleLogDebug(SCEKERNEL, 0, "waiting for tls alloc");
+	}
+
+	u32 alignedSize = (tls->ntls.blockSize + tls->alignment - 1) & ~(tls->alignment - 1);
+	u32 allocAddress = tls->address + allocBlock * alignedSize;
+	NotifyMemInfo(MemBlockFlags::SUB_ALLOC, allocAddress, tls->ntls.blockSize, "TlsAddr");
+
+	// We clear the blocks upon first allocation (and also when they are freed, both are necessary.)
+	if (needsClear) {
+		Memory::Memset(allocAddress, 0, tls->ntls.blockSize, "TlsAddr");
+	}
+
+	return hleLogDebug(SCEKERNEL, allocAddress);
 }
 
 // Parameters are an educated guess.

From 30454f8dc747ea1fcf70deac6774cc18778cfadd Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 21 Sep 2022 00:09:34 -0700
Subject: [PATCH 11/35] Vulkan: Avoid crash in headless on finish.

---
 Common/GPU/Vulkan/VulkanRenderManager.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp
index 3a0dde0312..4d1363e49c 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@@ -1269,7 +1269,10 @@ void VulkanRenderManager::Run(int frame) {
 	BeginSubmitFrame(frame);
 
 	FrameData &frameData = frameData_[frame];
-	queueRunner_.PreprocessSteps(frameData_[frame].steps);
+	queueRunner_.PreprocessSteps(frameData.steps);
+	// Likely during shutdown, happens in headless.
+	if (frameData.steps.empty() && !frameData.hasAcquired)
+		frameData.skipSwap = true;
 	//queueRunner_.LogSteps(stepsOnThread, false);
 	queueRunner_.RunSteps(frameData, frameDataShared_);
 

From bbc5a7cf4b5af932e9a75e5a3e1f8ff2aca456ce Mon Sep 17 00:00:00 2001
From: Lubos <tridosm@gmail.com>
Date: Wed, 21 Sep 2022 16:44:31 +0200
Subject: [PATCH 12/35] OpenXR - Stereo mirroring fixed, disable stereo when no
 world scale defined

---
 Common/VR/PPSSPPVR.cpp   |  3 ++-
 Common/VR/VRRenderer.cpp | 23 +++++++++++++++++------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/Common/VR/PPSSPPVR.cpp b/Common/VR/PPSSPPVR.cpp
index 1e0931ebfe..13b424832c 100644
--- a/Common/VR/PPSSPPVR.cpp
+++ b/Common/VR/PPSSPPVR.cpp
@@ -230,7 +230,8 @@ bool StartVRRender() {
 
 		// Decide if the scene is 3D or not
 		if (g_Config.bEnableVR && !VR_GetConfig(VR_CONFIG_FORCE_2D) && (VR_GetConfig(VR_CONFIG_3D_GEOMETRY_COUNT) > 15)) {
-			VR_SetConfig(VR_CONFIG_MODE, g_Config.bEnableStereo ? VR_MODE_STEREO_6DOF : VR_MODE_MONO_6DOF);
+			bool stereo = VR_GetConfig(VR_CONFIG_6DOF_PRECISE) && g_Config.bEnableStereo;
+			VR_SetConfig(VR_CONFIG_MODE, stereo ? VR_MODE_STEREO_6DOF : VR_MODE_MONO_6DOF);
 		} else {
 			VR_SetConfig(VR_CONFIG_MODE, VR_MODE_FLAT_SCREEN);
 		}
diff --git a/Common/VR/VRRenderer.cpp b/Common/VR/VRRenderer.cpp
index 248b2b1487..2ad0014c10 100644
--- a/Common/VR/VRRenderer.cpp
+++ b/Common/VR/VRRenderer.cpp
@@ -359,13 +359,17 @@ void VR_FinishFrame( engine_t* engine ) {
 		for (int eye = 0; eye < ovrMaxNumEyes; eye++) {
 			int imageLayer = engine->appState.Renderer.Multiview ? eye : 0;
 			ovrFramebuffer* frameBuffer = &engine->appState.Renderer.FrameBuffer[0];
-			if ((vrMode != VR_MODE_MONO_6DOF) && !engine->appState.Renderer.Multiview) {
-				frameBuffer = &engine->appState.Renderer.FrameBuffer[eye];
+			XrPosef pose = invViewTransform[0];
+			if (vrMode != VR_MODE_MONO_6DOF) {
+				if (!engine->appState.Renderer.Multiview) {
+					frameBuffer = &engine->appState.Renderer.FrameBuffer[eye];
+				}
+				pose = invViewTransform[eye];
 			}
 
 			memset(&projection_layer_elements[eye], 0, sizeof(XrCompositionLayerProjectionView));
 			projection_layer_elements[eye].type = XR_TYPE_COMPOSITION_LAYER_PROJECTION_VIEW;
-			projection_layer_elements[eye].pose = invViewTransform[eye];
+			projection_layer_elements[eye].pose = pose;
 			projection_layer_elements[eye].fov = fov;
 
 			memset(&projection_layer_elements[eye].subImage, 0, sizeof(XrSwapchainSubImage));
@@ -502,9 +506,16 @@ ovrMatrix4f VR_GetMatrix( VRMatrix matrix ) {
 			output.M[2][3] -= hmdposition.z * (vrConfig[VR_CONFIG_MIRROR_AXIS_Z] ? -1.0f : 1.0f) * scale;
 		}
 		if (vrConfig[VR_CONFIG_6DOF_PRECISE] && (matrix == VR_VIEW_MATRIX_RIGHT_EYE)) {
-			output.M[0][3] += (invViewTransform[1].position.x - invViewTransform[0].position.x) * scale;
-			output.M[1][3] += (invViewTransform[1].position.y - invViewTransform[0].position.y) * scale;
-			output.M[2][3] += (invViewTransform[1].position.z - invViewTransform[0].position.z) * scale;
+			float dx = fabs(invViewTransform[1].position.x - invViewTransform[0].position.x);
+			float dy = fabs(invViewTransform[1].position.y - invViewTransform[0].position.y);
+			float dz = fabs(invViewTransform[1].position.z - invViewTransform[0].position.z);
+			float ipd = sqrt(dx * dx + dy * dy + dz * dz);
+			XrVector3f separation = {ipd * scale, 0.0f, 0.0f};
+			separation = XrQuaternionf_Rotate(invView.orientation, separation);
+			separation = XrVector3f_ScalarMultiply(separation, vrConfig[VR_CONFIG_MIRROR_AXIS_Z] ? -1.0f : 1.0f);
+			output.M[0][3] -= separation.x;
+			output.M[1][3] -= separation.y;
+			output.M[2][3] -= separation.z;
 		}
 	} else {
 		assert(false);

From 94ae0fabfafbf28a6fd116a4f8d9a72deb380950 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 21 Sep 2022 18:33:15 +0200
Subject: [PATCH 13/35] CLUTs can be loaded from small rectangular textures.
 Need to linearize.

Fixes #8406, although technically, we should wrap by bufw, not the
texture width.
---
 GPU/Common/Draw2D.cpp                   | 24 ++++++++++++++++++++++++
 GPU/Common/Draw2D.h                     |  1 +
 GPU/Common/FramebufferManagerCommon.cpp |  1 +
 GPU/Common/FramebufferManagerCommon.h   |  1 +
 GPU/Common/TextureCacheCommon.cpp       |  5 +++--
 5 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/Draw2D.cpp b/GPU/Common/Draw2D.cpp
index ff7aab3169..49aaa0bd5b 100644
--- a/GPU/Common/Draw2D.cpp
+++ b/GPU/Common/Draw2D.cpp
@@ -70,6 +70,23 @@ Draw2DPipelineInfo GenerateDraw2DCopyColorFs(ShaderWriter &writer) {
 	};
 }
 
+Draw2DPipelineInfo GenerateDraw2DCopyColorRect2LinFs(ShaderWriter &writer) {
+	writer.DeclareSamplers(samplers);
+	writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_NONE);
+	writer.C("  vec2 tSize = texSize / scaleFactor;\n");
+	writer.C("  vec2 pixels = v_texcoord * tSize;\n");
+	writer.C("  float u = mod(floor(pixels.x), tSize.x);\n");
+	writer.C("  float v = floor(pixels.x / tSize.x);\n");
+	writer.C("  vec4 outColor = ").SampleTexture2D("tex", "vec2(u, v) / tSize").C(";\n");
+	writer.EndFSMain("outColor", FSFLAG_NONE);
+
+	return Draw2DPipelineInfo{
+		"draw2d_copy_color_rect2lin",
+		RASTER_COLOR,
+		RASTER_COLOR,
+	};
+}
+
 Draw2DPipelineInfo GenerateDraw2DCopyDepthFs(ShaderWriter &writer) {
 	writer.DeclareSamplers(samplers);
 	writer.BeginFSMain(Slice<UniformDef>::empty(), varyings, FSFLAG_WRITEDEPTH);
@@ -318,6 +335,13 @@ Draw2DPipeline *FramebufferManagerCommon::Get2DPipeline(Draw2DShader shader) {
 		pipeline = draw2DPipelineColor_;
 		break;
 
+	case DRAW2D_COPY_COLOR_RECT2LIN:
+		if (!draw2DPipelineColorRect2Lin_) {
+			draw2DPipelineColorRect2Lin_ = draw2D_.Create2DPipeline(&GenerateDraw2DCopyColorRect2LinFs);
+		}
+		pipeline = draw2DPipelineColorRect2Lin_;
+		break;
+
 	case DRAW2D_COPY_DEPTH:
 		if (!draw_->GetDeviceCaps().fragmentShaderDepthWriteSupported) {
 			// Can't do it
diff --git a/GPU/Common/Draw2D.h b/GPU/Common/Draw2D.h
index d44f7ff613..bf156e6d39 100644
--- a/GPU/Common/Draw2D.h
+++ b/GPU/Common/Draw2D.h
@@ -16,6 +16,7 @@ enum Draw2DShader {
 	DRAW2D_COPY_DEPTH,
 	DRAW2D_565_TO_DEPTH,
 	DRAW2D_565_TO_DEPTH_DESWIZZLE,
+	DRAW2D_COPY_COLOR_RECT2LIN,
 };
 
 inline RasterChannel Draw2DSourceChannel(Draw2DShader shader) {
diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index dd01c3c499..8d17f2d9ff 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -2701,6 +2701,7 @@ void FramebufferManagerCommon::DeviceLost() {
 	DoRelease(stencilUploadSampler_);
 	DoRelease(stencilUploadPipeline_);
 	DoRelease(draw2DPipelineColor_);
+	DoRelease(draw2DPipelineColorRect2Lin_);
 	DoRelease(draw2DPipelineDepth_);
 	DoRelease(draw2DPipeline565ToDepth_);
 	DoRelease(draw2DPipeline565ToDepthDeswizzle_);
diff --git a/GPU/Common/FramebufferManagerCommon.h b/GPU/Common/FramebufferManagerCommon.h
index a38cc328d5..20fd15f87a 100644
--- a/GPU/Common/FramebufferManagerCommon.h
+++ b/GPU/Common/FramebufferManagerCommon.h
@@ -569,6 +569,7 @@ protected:
 
 	// Draw2D pipelines
 	Draw2DPipeline *draw2DPipelineColor_ = nullptr;
+	Draw2DPipeline *draw2DPipelineColorRect2Lin_ = nullptr;
 	Draw2DPipeline *draw2DPipelineDepth_ = nullptr;
 	Draw2DPipeline *draw2DPipeline565ToDepth_ = nullptr;
 	Draw2DPipeline *draw2DPipeline565ToDepthDeswizzle_ = nullptr;
diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 97efd42cbe..8bc072114b 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -1259,11 +1259,12 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 					dynamicClutTemp_ = draw_->CreateFramebuffer(desc);
 				}
 
-				// Download the pixels to our temp clut, scaling down if needed.
+				// Copy the pixels to our temp clut, scaling down if needed and wrapping.
+				// TODO: Take the clutRenderOffset_ into account here.
 				framebufferManager_->BlitUsingRaster(
 					chosenFramebuffer->fbo, 0.0f, 0.0f, 512.0f * chosenFramebuffer->renderScaleFactor, 1.0f, 
 					dynamicClutTemp_, 0.0f, 0.0f, 512.0f, 1.0f, 
-					false, 1.0f, framebufferManager_->Get2DPipeline(DRAW2D_COPY_COLOR), "copy_clut_to_temp");
+					false, chosenFramebuffer->renderScaleFactor, framebufferManager_->Get2DPipeline(DRAW2D_COPY_COLOR_RECT2LIN), "copy_clut_to_temp");
 				clutRenderFormat_ = chosenFramebuffer->fb_format;
 			}
 			NotifyMemInfo(MemBlockFlags::ALLOC, clutAddr, loadBytes, "CLUT");

From 78ab0139144df5fb3bfc5f29460650b1159945bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 21 Sep 2022 18:37:40 +0200
Subject: [PATCH 14/35] Shouldn't 'floor' there

---
 GPU/Common/Draw2D.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPU/Common/Draw2D.cpp b/GPU/Common/Draw2D.cpp
index 49aaa0bd5b..83df3ee34d 100644
--- a/GPU/Common/Draw2D.cpp
+++ b/GPU/Common/Draw2D.cpp
@@ -75,7 +75,7 @@ Draw2DPipelineInfo GenerateDraw2DCopyColorRect2LinFs(ShaderWriter &writer) {
 	writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_NONE);
 	writer.C("  vec2 tSize = texSize / scaleFactor;\n");
 	writer.C("  vec2 pixels = v_texcoord * tSize;\n");
-	writer.C("  float u = mod(floor(pixels.x), tSize.x);\n");
+	writer.C("  float u = mod(pixels.x, tSize.x);\n");
 	writer.C("  float v = floor(pixels.x / tSize.x);\n");
 	writer.C("  vec4 outColor = ").SampleTexture2D("tex", "vec2(u, v) / tSize").C(";\n");
 	writer.EndFSMain("outColor", FSFLAG_NONE);

From 8ed1694a2ff5bc80c075b42883e021f483a86df8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 21 Sep 2022 23:49:50 +0200
Subject: [PATCH 15/35] Don't try to replace or scale CLUT8-on-GPU textures.

See #8509
---
 GPU/Common/TextureCacheCommon.cpp | 41 +++++++++++++++++--------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 8bc072114b..17624f1e02 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -2634,14 +2634,32 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		}
 	}
 
-	if (isPPGETexture) {
-		plan.replaced = &replacer_.FindNone();
-		plan.replaceValid = false;
+	bool canReplace = !isPPGETexture;
+	if (entry->status & TexCacheEntry::TexStatus::STATUS_CLUT_GPU) {
+		_dbg_assert_(entry->format == GE_TFMT_CLUT4 || entry->format == GE_TFMT_CLUT8);
+		plan.decodeToClut8 = true;
+		// We only support 1 mip level when doing CLUT on GPU for now.
+		// Supporting more would be possible, just not very interesting until we need it.
+		plan.levelsToCreate = 1;
+		plan.levelsToLoad = 1;
+		plan.maxPossibleLevels = 1;
+		plan.scaleFactor = 1;
+		plan.saveTexture = false;  // Can't yet save these properly.
+		canReplace = false;
 	} else {
+		plan.decodeToClut8 = false;
+	}
+
+	if (canReplace) {
 		plan.replaced = &FindReplacement(entry, plan.w, plan.h, plan.depth);
 		plan.replaceValid = plan.replaced->Valid();
+	} else {
+		plan.replaced = &replacer_.FindNone();
+		plan.replaceValid = false;
 	}
 
+	// NOTE! Last chance to change scale factor here!
+
 	plan.saveTexture = false;
 	if (plan.replaceValid) {
 		// We're replacing, so we won't scale.
@@ -2652,7 +2670,7 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		// But, we still need to create the texture at a larger size.
 		plan.replaced->GetSize(0, plan.createW, plan.createH);
 	} else {
-		if (replacer_.Enabled() && !plan.replaceValid && plan.depth == 1) {
+		if (replacer_.Enabled() && !plan.replaceValid && plan.depth == 1 && canReplace) {
 			ReplacedTextureDecodeInfo replacedInfo;
 			// TODO: Do we handle the race where a replacement becomes valid AFTER this but before we save?
 			replacedInfo.cachekey = entry->CacheKey();
@@ -2683,21 +2701,6 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		plan.maxPossibleLevels = log2i(std::min(plan.createW, plan.createH)) + 1;
 	}
 
-	if (entry->status & TexCacheEntry::TexStatus::STATUS_CLUT_GPU) {
-		_dbg_assert_(entry->format == GE_TFMT_CLUT4 || entry->format == GE_TFMT_CLUT8);
-		plan.decodeToClut8 = true;
-		// We only support 1 mip level when doing CLUT on GPU for now.
-		// Supporting more would be possible, just not very interesting until we need it.
-		plan.levelsToCreate = 1;
-		plan.levelsToLoad = 1;
-		plan.maxPossibleLevels = 1;
-		plan.scaleFactor = 1;
-		plan.saveTexture = false;  // Can't yet save these properly.
-		// TODO: Also forcibly disable replacement, or check that the replacement is a 8-bit paletted texture.
-	} else {
-		plan.decodeToClut8 = false;
-	}
-
 	if (plan.levelsToCreate == 1) {
 		entry->status |= TexCacheEntry::STATUS_NO_MIPS;
 	} else {

From a92aaf9311994bee1a79ed436226a69b2084035c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 21 Sep 2022 23:57:19 +0200
Subject: [PATCH 16/35] Dirty more state after depal

---
 GPU/Common/TextureCacheCommon.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 17624f1e02..cfb6d1640d 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -2194,7 +2194,7 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 	ApplySamplingParams(samplerKey);
 
 	// Since we started/ended render passes, might need these.
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
+	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_FRAGMENTSHADER_STATE | DIRTY_VERTEXSHADER_STATE);
 }
 
 // Applies depal to a normal (non-framebuffer) texture, pre-decoded to CLUT8 format.
@@ -2286,7 +2286,7 @@ void TextureCacheCommon::ApplyTextureDepal(TexCacheEntry *entry) {
 	ApplySamplingParams(samplerKey);
 
 	// Since we started/ended render passes, might need these.
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE);
+	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_FRAGMENTSHADER_STATE | DIRTY_VERTEXSHADER_STATE);
 }
 
 void TextureCacheCommon::Clear(bool delete_them) {

From a681d149dba06c438ce3f06eb66de131edcb608a Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 21 Sep 2022 21:15:58 -0700
Subject: [PATCH 17/35] GPU: Use hardware tess params for texgen.

Fixes field in Test Drive Unlimited with hardware tessellation.
---
 GPU/Common/VertexShaderGenerator.cpp | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index 6e3f3185b3..d6837422d4 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -1128,7 +1128,10 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 					}
 				} else {
 					if (hasTexcoord) {
-						WRITE(p, "  %sv_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
+						if (doBezier || doSpline)
+							WRITE(p, "  %sv_texcoord = vec3(tess.tex.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
+						else
+							WRITE(p, "  %sv_texcoord = vec3(texcoord.xy * u_uvscaleoffset.xy + u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
 					} else {
 						WRITE(p, "  %sv_texcoord = vec3(u_uvscaleoffset.zw, 0.0);\n", compat.vsOutPrefix);
 					}
@@ -1140,26 +1143,36 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 					std::string temp_tc;
 					switch (uvProjMode) {
 					case GE_PROJMAP_POSITION:  // Use model space XYZ as source
-						temp_tc = "vec4(position, 1.0)";
+						if (doBezier || doSpline)
+							temp_tc = "vec4(tess.pos, 1.0)";
+						else
+							temp_tc = "vec4(position, 1.0)";
 						break;
 					case GE_PROJMAP_UV:  // Use unscaled UV as source
 						{
 							// prescale is false here.
 							if (hasTexcoord) {
-								temp_tc = "vec4(texcoord.xy, 0.0, 1.0)";
+								if (doBezier || doSpline)
+									temp_tc = "vec4(tess.tex.xy, 0.0, 1.0)";
+								else
+									temp_tc = "vec4(texcoord.xy, 0.0, 1.0)";
 							} else {
 								temp_tc = "vec4(0.0, 0.0, 0.0, 1.0)";
 							}
 						}
 						break;
 					case GE_PROJMAP_NORMALIZED_NORMAL:  // Use normalized transformed normal as source
-						if (hasNormal)
+						if (hasNormalTess)
+							temp_tc = StringFromFormat("length(tess.nrm) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%stess.nrm), 1.0)", flipNormal ? "-" : "");
+						else if (hasNormal)
 							temp_tc = StringFromFormat("length(normal) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%snormal), 1.0)", flipNormal ? "-" : "");
 						else
 							temp_tc = "vec4(0.0, 0.0, 1.0, 1.0)";
 						break;
 					case GE_PROJMAP_NORMAL:  // Use non-normalized transformed normal as source
-						if (hasNormal)
+						if (hasNormalTess)
+							temp_tc = flipNormal ? "vec4(-tess.nrm, 1.0)" : "vec4(tess.nrm, 1.0)";
+						else if (hasNormal)
 							temp_tc = flipNormal ? "vec4(-normal, 1.0)" : "vec4(normal, 1.0)";
 						else
 							temp_tc = "vec4(0.0, 0.0, 1.0, 1.0)";

From 0e6c90bc28c30ef5dd5d25af431b5300dbfa21a6 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 21 Sep 2022 21:16:56 -0700
Subject: [PATCH 18/35] GE Debugger: Default auto flush enabled.

It seems like it's usually what one would want, you can disable for
debugging state issues.
---
 Windows/GEDebugger/GEDebugger.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Windows/GEDebugger/GEDebugger.h b/Windows/GEDebugger/GEDebugger.h
index c7d70dc3ee..beae98a04a 100644
--- a/Windows/GEDebugger/GEDebugger.h
+++ b/Windows/GEDebugger/GEDebugger.h
@@ -134,7 +134,7 @@ private:
 	int textureLevel_ = 0;
 	bool showClut_ = false;
 	bool forceOpaque_ = false;
-	bool autoFlush_ = false;
+	bool autoFlush_ = true;
 	// The most recent primary/framebuffer and texture buffers.
 	const GPUDebugBuffer *primaryBuffer_ = nullptr;
 	const GPUDebugBuffer *secondBuffer_ = nullptr;

From 1e78f3aa573bc7d2781bee8ad1500b74f322ea55 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 21 Sep 2022 22:06:32 -0700
Subject: [PATCH 19/35] GPU: Correct neg normal for hwtess texgen.

---
 GPU/Common/VertexShaderGenerator.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/GPU/Common/VertexShaderGenerator.cpp b/GPU/Common/VertexShaderGenerator.cpp
index d6837422d4..126d188500 100644
--- a/GPU/Common/VertexShaderGenerator.cpp
+++ b/GPU/Common/VertexShaderGenerator.cpp
@@ -1162,16 +1162,16 @@ bool GenerateVertexShader(const VShaderID &id, char *buffer, const ShaderLanguag
 						}
 						break;
 					case GE_PROJMAP_NORMALIZED_NORMAL:  // Use normalized transformed normal as source
-						if (hasNormalTess)
-							temp_tc = StringFromFormat("length(tess.nrm) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%stess.nrm), 1.0)", flipNormal ? "-" : "");
+						if ((doBezier || doSpline) && hasNormalTess)
+							temp_tc = StringFromFormat("length(tess.nrm) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%stess.nrm), 1.0)", flipNormalTess ? "-" : "");
 						else if (hasNormal)
 							temp_tc = StringFromFormat("length(normal) == 0.0 ? vec4(0.0, 0.0, 1.0, 1.0) : vec4(normalize(%snormal), 1.0)", flipNormal ? "-" : "");
 						else
 							temp_tc = "vec4(0.0, 0.0, 1.0, 1.0)";
 						break;
 					case GE_PROJMAP_NORMAL:  // Use non-normalized transformed normal as source
-						if (hasNormalTess)
-							temp_tc = flipNormal ? "vec4(-tess.nrm, 1.0)" : "vec4(tess.nrm, 1.0)";
+						if ((doBezier || doSpline) && hasNormalTess)
+							temp_tc = flipNormalTess ? "vec4(-tess.nrm, 1.0)" : "vec4(tess.nrm, 1.0)";
 						else if (hasNormal)
 							temp_tc = flipNormal ? "vec4(-normal, 1.0)" : "vec4(normal, 1.0)";
 						else

From a8eced47736d427d229f06859ccac51a2023ef8c Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Wed, 21 Sep 2022 23:30:00 -0700
Subject: [PATCH 20/35] GLES: Avoid resizing tessellation data textures.

Just recreate when it needs to be larger.  Fixes Test Drive Unlimited
issues noted in #16069.
---
 GPU/GLES/DrawEngineGLES.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp
index 4eb23cd4d4..47b10722ef 100644
--- a/GPU/GLES/DrawEngineGLES.cpp
+++ b/GPU/GLES/DrawEngineGLES.cpp
@@ -500,7 +500,8 @@ void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *p
 		prevSizeU = size_u;
 		prevSizeV = size_v;
-		if (!data_tex[0])
-			data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D, size_u * 3, size_v, 1, 1);
+		if (data_tex[0])
+			renderManager_->DeleteTexture(data_tex[0]);
+		data_tex[0] = renderManager_->CreateTexture(GL_TEXTURE_2D, size_u * 3, size_v, 1, 1);
 		renderManager_->TextureImage(data_tex[0], 0, size_u * 3, size_v, 1, Draw::DataFormat::R32G32B32A32_FLOAT, nullptr, GLRAllocType::NONE, false);
 		renderManager_->FinalizeTexture(data_tex[0], 0, false);
 	}
@@ -518,7 +519,8 @@ void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *p
 	if (prevSizeWU < weights.size_u) {
 		prevSizeWU = weights.size_u;
-		if (!data_tex[1])
-			data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_u * 2, 1, 1, 1);
+		if (data_tex[1])
+			renderManager_->DeleteTexture(data_tex[1]);
+		data_tex[1] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_u * 2, 1, 1, 1);
 		renderManager_->TextureImage(data_tex[1], 0, weights.size_u * 2, 1, 1, Draw::DataFormat::R32G32B32A32_FLOAT, nullptr, GLRAllocType::NONE, false);
 		renderManager_->FinalizeTexture(data_tex[1], 0, false);
 	}
@@ -529,7 +531,8 @@ void TessellationDataTransferGLES::SendDataToShader(const SimpleVertex *const *p
 	if (prevSizeWV < weights.size_v) {
 		prevSizeWV = weights.size_v;
-		if (!data_tex[2])
-			data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_v * 2, 1, 1, 1);
+		if (data_tex[2])
+			renderManager_->DeleteTexture(data_tex[2]);
+		data_tex[2] = renderManager_->CreateTexture(GL_TEXTURE_2D, weights.size_v * 2, 1, 1, 1);
 		renderManager_->TextureImage(data_tex[2], 0, weights.size_v * 2, 1, 1, Draw::DataFormat::R32G32B32A32_FLOAT, nullptr, GLRAllocType::NONE, false);
 		renderManager_->FinalizeTexture(data_tex[2], 0, false);
 	}

From fc39f042ae2bc7704c8fa59853e82d3407d0c6fb Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Thu, 22 Sep 2022 00:08:38 -0700
Subject: [PATCH 21/35] softgpu: Avoid unnecessary flushing for curves.

We don't need to flush all drawing between curves in softgpu, let them
queue up.
---
 GPU/Common/DrawEngineCommon.h  | 2 ++
 GPU/Common/SplineCommon.cpp    | 3 ++-
 GPU/GPUCommon.cpp              | 6 ++++--
 GPU/Software/TransformUnit.cpp | 1 +
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 40c397649b..8fc0382a29 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -147,6 +147,8 @@ protected:
 
 	bool useHWTransform_ = false;
 	bool useHWTessellation_ = false;
+	// Used to prevent unnecessary flushing in softgpu.
+	bool flushOnParams_ = true;
 
 	// Vertex collector buffers
 	u8 *decoded = nullptr;
diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp
index d3f864a4d4..b2358f401b 100644
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@@ -577,7 +577,8 @@ void DrawEngineCommon::SubmitCurve(const void *control_points, const void *indic
 	if (output.count)
 		DispatchSubmitPrim(output.vertices, output.indices, PatchPrimToPrim(surface.primType), output.count, vertTypeID, gstate.getCullMode(), &generatedBytesRead);
 
-	DispatchFlush();
+	if (flushOnParams_)
+		DispatchFlush();
 
 	if (origVertType & GE_VTYPE_TC_MASK) {
 		gstate_c.uv = prevUVScale;
diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
index 3d64bdb1ab..a5fd84e696 100644
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@@ -1942,7 +1942,8 @@ void GPUCommon::Execute_Bezier(u32 op, u32 diff) {
 	}
 
 	// Can't flush after setting gstate_c.submitType below since it'll be a mess - it must be done already.
-	drawEngineCommon_->DispatchFlush();
+	if (flushOnParams_)
+		drawEngineCommon_->DispatchFlush();
 
 	Spline::BezierSurface surface;
 	surface.tess_u = gstate.getPatchDivisionU();
@@ -2014,7 +2015,8 @@ void GPUCommon::Execute_Spline(u32 op, u32 diff) {
 	}
 
 	// Can't flush after setting gstate_c.submitType below since it'll be a mess - it must be done already.
-	drawEngineCommon_->DispatchFlush();
+	if (flushOnParams_)
+		drawEngineCommon_->DispatchFlush();
 
 	Spline::SplineSurface surface;
 	surface.tess_u = gstate.getPatchDivisionU();
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index c1471f7172..660e7a82e1 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -54,6 +54,7 @@ SoftwareDrawEngine::SoftwareDrawEngine() {
 	// All this is a LOT of memory, need to see if we can cut down somehow.  Used for splines.
 	decoded = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+	flushOnParams_ = false;
 }
 
 SoftwareDrawEngine::~SoftwareDrawEngine() {

From 287e025978607666f17c9ffe9aac1159942c3b89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 09:12:20 +0200
Subject: [PATCH 22/35] Minor cleanups around dirtying of render state

---
 GPU/Common/FramebufferManagerCommon.cpp | 12 ++++++------
 GPU/Common/ShaderCommon.h               |  4 ++++
 GPU/Common/StencilCommon.cpp            |  6 +-----
 GPU/Common/TextureCacheCommon.cpp       |  9 ++++-----
 GPU/GPUState.h                          |  7 +++----
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index 8d17f2d9ff..c4655cf02e 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -647,7 +647,7 @@ void FramebufferManagerCommon::CopyToDepthFromOverlappingFramebuffers(VirtualFra
 		}
 	}
 
-	gstate_c.Dirty(DIRTY_TEXTURE_IMAGE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_BLEND_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }
 
 // Can't easily dynamically create these strings, we just pass along the pointer.
@@ -915,7 +915,7 @@ void FramebufferManagerCommon::BlitFramebufferDepth(VirtualFramebuffer *src, Vir
 
 	// Some GPUs can copy depth but only if stencil gets to come along for the ride. We only want to use this if there is no blit functionality.
 	if (useCopy) {
-		draw_->CopyFramebufferImage(src->fbo, 0, 0, 0, 0, dst->fbo, 0, 0, 0, 0, w, h, 1, Draw::FB_DEPTH_BIT, "BlitFramebufferDepth");
+		draw_->CopyFramebufferImage(src->fbo, 0, 0, 0, 0, dst->fbo, 0, 0, 0, 0, w, h, 1, Draw::FB_DEPTH_BIT, "CopyFramebufferDepth");
 		RebindFramebuffer("After BlitFramebufferDepth");
 	} else if (useBlit) {
 		// We'll accept whether we get a separate depth blit or not...
@@ -1097,7 +1097,7 @@ void FramebufferManagerCommon::DrawPixels(VirtualFramebuffer *vfb, int dstX, int
 		pixelsTex->Release();
 		draw_->InvalidateCachedState();
 
-		gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
+		gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 	}
 }
 
@@ -2767,7 +2767,7 @@ void FramebufferManagerCommon::DrawActiveTexture(float x, float y, float w, floa
 
 	draw2D_.DrawStrip2D(nullptr, coord, 4, (flags & DRAWTEX_LINEAR) != 0, Get2DPipeline((flags & DRAWTEX_DEPTH) ? DRAW2D_COPY_DEPTH : DRAW2D_COPY_COLOR));
 
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }
 
 void FramebufferManagerCommon::BlitFramebuffer(VirtualFramebuffer *dst, int dstX, int dstY, VirtualFramebuffer *src, int srcX, int srcY, int w, int h, int bpp, RasterChannel channel, const char *tag) {
@@ -2871,7 +2871,7 @@ void FramebufferManagerCommon::BlitFramebuffer(VirtualFramebuffer *dst, int dstX
 
 	draw_->InvalidateCachedState();
 
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }
 
 // The input is raw pixel coordinates, scale not taken into account.
@@ -2907,7 +2907,7 @@ void FramebufferManagerCommon::BlitUsingRaster(
 
 	draw2D_.Blit(pipeline, srcX1, srcY1, srcX2, srcY2, destX1, destY1, destX2, destY2, (float)srcW, (float)srcH, (float)destW, (float)destH, linearFilter, scaleFactor);
 
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }
 
 VirtualFramebuffer *FramebufferManagerCommon::ResolveFramebufferColorToFormat(VirtualFramebuffer *src, GEBufferFormat newFormat) {
diff --git a/GPU/Common/ShaderCommon.h b/GPU/Common/ShaderCommon.h
index 1a69e5c387..a4adfb70ea 100644
--- a/GPU/Common/ShaderCommon.h
+++ b/GPU/Common/ShaderCommon.h
@@ -109,6 +109,10 @@ enum : uint64_t {
 	DIRTY_VERTEXSHADER_STATE = 1ULL << 47,
 	DIRTY_FRAGMENTSHADER_STATE = 1ULL << 48,
 
+	// Everything that's not uniforms. Use this after using thin3d.
+	// TODO: Should we also add DIRTY_FRAMEBUF here? It kinda generally takes care of itself.
+	DIRTY_ALL_RENDER_STATE = DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS,
+
 	DIRTY_ALL = 0xFFFFFFFFFFFFFFFF
 };
 
diff --git a/GPU/Common/StencilCommon.cpp b/GPU/Common/StencilCommon.cpp
index 561c6a2857..86a5508b80 100644
--- a/GPU/Common/StencilCommon.cpp
+++ b/GPU/Common/StencilCommon.cpp
@@ -186,13 +186,9 @@ bool FramebufferManagerCommon::PerformStencilUpload(u32 addr, int size, StencilU
 
 		// Otherwise, we can skip alpha in many cases, in which case we don't even use a shader.
 		if (flags & StencilUpload::IGNORE_ALPHA) {
-			shaderManager_->DirtyLastShader();
-
 			if (dstBuffer->fbo) {
 				draw_->BindFramebufferAsRenderTarget(dstBuffer->fbo, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "PerformStencilUpload_Clear");
 			}
-
-			gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE);
 			return true;
 		}
 	}
@@ -333,6 +329,6 @@ bool FramebufferManagerCommon::PerformStencilUpload(u32 addr, int size, StencilU
 	tex->Release();
 
 	draw_->InvalidateCachedState();
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 	return true;
 }
diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index cfb6d1640d..3b0f662002 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -2095,7 +2095,6 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 				mode = ShaderDepalMode::SMOOTHED;
 			}
 
-			// Since we started/ended render passes, might need these.
 			gstate_c.Dirty(DIRTY_DEPAL);
 			gstate_c.SetUseShaderDepal(mode);
 			gstate_c.depalFramebufferFormat = framebuffer->fb_format;
@@ -2193,8 +2192,8 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 	SamplerCacheKey samplerKey = GetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight);
 	ApplySamplingParams(samplerKey);
 
-	// Since we started/ended render passes, might need these.
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_FRAGMENTSHADER_STATE | DIRTY_VERTEXSHADER_STATE);
+	// Since we've drawn using thin3d, might need these.
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }
 
 // Applies depal to a normal (non-framebuffer) texture, pre-decoded to CLUT8 format.
@@ -2285,8 +2284,8 @@ void TextureCacheCommon::ApplyTextureDepal(TexCacheEntry *entry) {
 	SamplerCacheKey samplerKey = GetFramebufferSamplingParams(texWidth, texHeight);
 	ApplySamplingParams(samplerKey);
 
-	// Since we started/ended render passes, might need these.
-	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_FRAGMENTSHADER_STATE | DIRTY_VERTEXSHADER_STATE);
+	// Since we've drawn using thin3d, might need these.
+	gstate_c.Dirty(DIRTY_ALL_RENDER_STATE);
 }
 
 void TextureCacheCommon::Clear(bool delete_them) {
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index 191a670376..db4a04d3cd 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -469,12 +469,12 @@ struct UVScale {
 // Might want to move this mechanism into the backend later.
 enum {
 	GPU_SUPPORTS_DUALSOURCE_BLEND = FLAG_BIT(0),
-	// Free bit: 1
-	GPU_SUPPORTS_GLSL_330 = FLAG_BIT(2),
+	// Free bits: 1-2
 	GPU_SUPPORTS_VS_RANGE_CULLING = FLAG_BIT(3),
 	GPU_SUPPORTS_BLEND_MINMAX = FLAG_BIT(4),
 	GPU_SUPPORTS_LOGIC_OP = FLAG_BIT(5),
 	GPU_USE_DEPTH_RANGE_HACK = FLAG_BIT(6),
+	// Free bit: 7
 	GPU_SUPPORTS_ANISOTROPY = FLAG_BIT(8),
 	GPU_USE_CLEAR_RAM_HACK = FLAG_BIT(9),
 	GPU_SUPPORTS_INSTANCE_RENDERING = FLAG_BIT(10),
@@ -485,8 +485,7 @@ enum {
 	// Free bit: 15
 	GPU_SUPPORTS_DEPTH_TEXTURE = FLAG_BIT(16),
 	GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17),
-	GPU_SUPPORTS_FRAGMENT_SHADER_INTERLOCK = FLAG_BIT(18),
-	// Free bits: 19
+	// Free bits: 18-19
 	GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH = FLAG_BIT(20),
 	GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = FLAG_BIT(21),
 	GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT = FLAG_BIT(22),

From 188ab67d6ab993184e9b0291b0451f11d156faaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 09:28:32 +0200
Subject: [PATCH 23/35] More lenient heuristic for uploading depth buffers.
 Still behind compat flag. See #11100

---
 GPU/Common/FramebufferManagerCommon.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index 8d17f2d9ff..f421b3458a 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -553,10 +553,10 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);
 
 		// Special compatibility trick for Burnout Dominator lens flares. Not sure how to best generalize this. See issue #11100
-		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && (currentRenderVfb_->usageFlags & FB_USAGE_CLUT) != 0) {
+		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && currentRenderVfb_->z_address > 0x04110000) {
 			// Set the flag, then upload memory contents to depth channel.
 			// Sanity check the depth buffer pointer.
-			if (currentRenderVfb_->z_address != 0 && currentRenderVfb_->z_address != currentRenderVfb_->fb_address) {
+			if (currentRenderVfb_->z_address > 0x04110000 && currentRenderVfb_->z_address != 0 && currentRenderVfb_->z_address != currentRenderVfb_->fb_address) {
 				if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
 					const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
 					DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");

From bd196f7a5053dea37bbeed70ebcba79ed4b50123 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 09:57:53 +0200
Subject: [PATCH 24/35] Preserve depth buffer on framebuffer resize, if it has
 been used.

---
 GPU/Common/FramebufferManagerCommon.cpp | 26 ++++++++++++++++---------
 GPU/Common/TextureCacheCommon.cpp       |  8 ++++++--
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index f421b3458a..cdfe1fd677 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -547,16 +547,19 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 		return;
 	}
 
+	// First time use of this framebuffer's depth buffer.
+	currentRenderVfb_->usageFlags |= FB_USAGE_RENDER_DEPTH;
+
 	// If this first draw call is anything other than a clear, "resolve" the depth buffer,
 	// by copying from any overlapping buffers with fresher content.
-	if (!isClearingDepth) {
+	if (!isClearingDepth && useBufferedRendering_) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);
 
 		// Special compatibility trick for Burnout Dominator lens flares. Not sure how to best generalize this. See issue #11100
 		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && currentRenderVfb_->z_address > 0x04110000) {
 			// Set the flag, then upload memory contents to depth channel.
 			// Sanity check the depth buffer pointer.
-			if (currentRenderVfb_->z_address > 0x04110000 && currentRenderVfb_->z_address != 0 && currentRenderVfb_->z_address != currentRenderVfb_->fb_address) {
+			if (currentRenderVfb_->z_address > 0x04110000) {
 				if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
 					const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
 					DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");
@@ -565,9 +568,6 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 		}
 	}
 
-	// First time use of this framebuffer's depth buffer.
-	currentRenderVfb_->usageFlags |= FB_USAGE_RENDER_DEPTH;
-
 	currentRenderVfb_->depthBindSeq = GetBindSeqCount();
 }
 
@@ -1540,7 +1540,7 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
 	if (creating) {
 		WARN_LOG(FRAMEBUF, "Creating %s FBO at %08x/%d %dx%d (force=%d)", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, vfb->bufferWidth, vfb->bufferHeight, (int)force);
 	} else {
-		WARN_LOG(FRAMEBUF, "Resizing %s FBO at %08x/%d from %dx%d to %dx%d (force=%d)", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, old.bufferWidth, old.bufferHeight, vfb->bufferWidth, vfb->bufferHeight, (int)force);
+		WARN_LOG(FRAMEBUF, "Resizing %s FBO at %08x/%d from %dx%d to %dx%d (force=%d, skipCopy=%d)", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, old.bufferWidth, old.bufferHeight, vfb->bufferWidth, vfb->bufferHeight, (int)force, (int)skipCopy);
 	}
 
 	// During hardware rendering, we always render at full color depth even if the game wouldn't on real hardware.
@@ -1578,8 +1578,10 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
 		if (vfb->fbo) {
 			draw_->BindFramebufferAsRenderTarget(vfb->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::CLEAR, Draw::RPAction::CLEAR }, "ResizeFramebufFBO");
 			if (!skipCopy) {
-				BlitFramebuffer(vfb, 0, 0, &old, 0, 0, std::min((u16)oldWidth, std::min(vfb->bufferWidth, vfb->width)), std::min((u16)oldHeight, std::min(vfb->height, vfb->bufferHeight)), 0, RASTER_COLOR, "Blit_ResizeFramebufFBO");
-				// Depth copying is handled by deferred copies later.
+				BlitFramebuffer(vfb, 0, 0, &old, 0, 0, std::min((u16)oldWidth, std::min(vfb->bufferWidth, vfb->width)), std::min((u16)oldHeight, std::min(vfb->height, vfb->bufferHeight)), 0, RASTER_COLOR, "BlitColor_ResizeFramebufFBO");
+			}
+			if (vfb->usageFlags & FB_USAGE_RENDER_DEPTH) {
+				BlitFramebuffer(vfb, 0, 0, &old, 0, 0, std::min((u16)oldWidth, std::min(vfb->bufferWidth, vfb->width)), std::min((u16)oldHeight, std::min(vfb->height, vfb->bufferHeight)), 0, RASTER_DEPTH, "BlitDepth_ResizeFramebufFBO");
 			}
 		}
 		fbosToDelete_.push_back(old.fbo);
@@ -2214,8 +2216,9 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS
 			int dstBpp = BufferFormatBytesPerPixel(dstRect.vfb->fb_format);
 			float dstXFactor = (float)bpp / dstBpp;
 			if (dstRect.w_bytes / bpp > dstRect.vfb->width || dstRect.h > dstRect.vfb->height) {
-				// The buffer isn't big enough, and we have a clear hint of size.  Resize.
+				// The buffer isn't big enough, and we have a clear hint of size. Resize.
 				// This happens in Valkyrie Profile when uploading video at the ending.
+				// Also happens to the CLUT framebuffer in the Burnout Dominator lens flare effect. See #16075
 				ResizeFramebufFBO(dstRect.vfb, dstRect.w_bytes / bpp, dstRect.h, false, true);
 				// Make sure we don't flop back and forth.
 				dstRect.vfb->newWidth = std::max(dstRect.w_bytes / bpp, (int)dstRect.vfb->width);
@@ -2780,6 +2783,11 @@ void FramebufferManagerCommon::BlitFramebuffer(VirtualFramebuffer *dst, int dstX
 		return;
 	}
 
+	if (channel == RASTER_DEPTH && !draw_->GetDeviceCaps().fragmentShaderDepthWriteSupported) {
+		// Can't do anything :(
+		return;
+	}
+
 	// Perform a little bit of clipping first.
 	// Block transfer coords are unsigned so I don't think we need to clip on the left side.. Although there are
 	// other uses for BlitFramebuffer.
diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index cfb6d1640d..686bb226ab 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -1208,6 +1208,8 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 			clutRenderOffset_ = MAX_CLUT_OFFSET;
 			const std::vector<VirtualFramebuffer *> &framebuffers = framebufferManager_->Framebuffers();
 
+			u32 bestClutAddress = 0xFFFFFFFF;
+
 			VirtualFramebuffer *chosenFramebuffer = nullptr;
 			for (VirtualFramebuffer *framebuffer : framebuffers) {
 				const u32 fb_address = framebuffer->fb_address & 0x3FFFFFFF;
@@ -1234,7 +1236,7 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 						WARN_LOG_N_TIMES(clutfb, 5, G3D, "Detected LoadCLUT(%d bytes) from framebuffer %08x (%s), byte offset %d", loadBytes, fb_address, GeBufferFormatToString(framebuffer->fb_format), offset);
 						framebuffer->last_frame_clut = gpuStats.numFlips;
 						framebuffer->usageFlags |= FB_USAGE_CLUT;
-						clutRenderAddress_ = framebuffer->fb_address;
+						bestClutAddress = framebuffer->fb_address;
 						clutRenderOffset_ = (u32)offset;
 						chosenFramebuffer = framebuffer;
 						if (offset == 0) {
@@ -1245,7 +1247,9 @@ void TextureCacheCommon::LoadClut(u32 clutAddr, u32 loadBytes) {
 				}
 			}
 
-			if (chosenFramebuffer) {
+			if (chosenFramebuffer && chosenFramebuffer->fbo) {
+				clutRenderAddress_ = bestClutAddress;
+
 				if (!dynamicClutTemp_) {
 					Draw::FramebufferDesc desc{};
 					desc.width = 512;

From 8e30a7ccfcce8028a97b8e84615e1a5a30be3466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 21 Sep 2022 13:12:58 +0200
Subject: [PATCH 25/35] Vulkan: Don't have renderpasses store/load depth
 buffers when we don't use them

---
 Common/GPU/Vulkan/VulkanQueueRunner.cpp   | 56 +++++++++++++----------
 Common/GPU/Vulkan/VulkanQueueRunner.h     |  7 +--
 Common/GPU/Vulkan/VulkanRenderManager.cpp | 29 +++++++-----
 GPU/Vulkan/PipelineManagerVulkan.cpp      | 15 +++---
 GPU/Vulkan/PipelineManagerVulkan.h        |  1 -
 5 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.cpp b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
index 8127114148..0db9618e76 100644
--- a/Common/GPU/Vulkan/VulkanQueueRunner.cpp
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
@@ -326,29 +326,33 @@ static VkAttachmentStoreOp ConvertStoreAction(VKRRenderPassStoreAction action) {
 // Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827
 // Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies
 
-VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rpType) {
-	bool selfDependency = rpType == RP_TYPE_COLOR_DEPTH_INPUT;
+VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPassType rpType) {
+	bool selfDependency = rpType == RP_TYPE_COLOR_INPUT || rpType == RP_TYPE_COLOR_DEPTH_INPUT;
+	bool isBackbuffer = rpType == RP_TYPE_BACKBUFFER;
+	bool hasDepth = rpType == RP_TYPE_BACKBUFFER || rpType == RP_TYPE_COLOR_DEPTH || rpType == RP_TYPE_COLOR_DEPTH_INPUT;
 
 	VkAttachmentDescription attachments[2] = {};
-	attachments[0].format = rpType == RP_TYPE_BACKBUFFER ? vulkan->GetSwapchainFormat() : VK_FORMAT_R8G8B8A8_UNORM;
+	attachments[0].format = isBackbuffer ? vulkan->GetSwapchainFormat() : VK_FORMAT_R8G8B8A8_UNORM;
 	attachments[0].samples = VK_SAMPLE_COUNT_1_BIT;
 	attachments[0].loadOp = ConvertLoadAction(key.colorLoadAction);
 	attachments[0].storeOp = ConvertStoreAction(key.colorStoreAction);
 	attachments[0].stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
 	attachments[0].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE;
-	attachments[0].initialLayout = rpType == RP_TYPE_BACKBUFFER ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
-	attachments[0].finalLayout = rpType == RP_TYPE_BACKBUFFER ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+	attachments[0].initialLayout = isBackbuffer ? VK_IMAGE_LAYOUT_UNDEFINED : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+	attachments[0].finalLayout = isBackbuffer ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
 	attachments[0].flags = 0;
 
-	attachments[1].format = vulkan->GetDeviceInfo().preferredDepthStencilFormat;
-	attachments[1].samples = VK_SAMPLE_COUNT_1_BIT;
-	attachments[1].loadOp = ConvertLoadAction(key.depthLoadAction);
-	attachments[1].storeOp = ConvertStoreAction(key.depthStoreAction);
-	attachments[1].stencilLoadOp = ConvertLoadAction(key.stencilLoadAction);
-	attachments[1].stencilStoreOp = ConvertStoreAction(key.stencilStoreAction);
-	attachments[1].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
-	attachments[1].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
-	attachments[1].flags = 0;
+	if (hasDepth) {
+		attachments[1].format = vulkan->GetDeviceInfo().preferredDepthStencilFormat;
+		attachments[1].samples = VK_SAMPLE_COUNT_1_BIT;
+		attachments[1].loadOp = ConvertLoadAction(key.depthLoadAction);
+		attachments[1].storeOp = ConvertStoreAction(key.depthStoreAction);
+		attachments[1].stencilLoadOp = ConvertLoadAction(key.stencilLoadAction);
+		attachments[1].stencilStoreOp = ConvertStoreAction(key.stencilStoreAction);
+		attachments[1].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
+		attachments[1].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
+		attachments[1].flags = 0;
+	}
 
 	VkAttachmentReference color_reference{};
 	color_reference.attachment = 0;
@@ -371,7 +375,9 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
 	subpass.colorAttachmentCount = 1;
 	subpass.pColorAttachments = &color_reference;
 	subpass.pResolveAttachments = nullptr;
-	subpass.pDepthStencilAttachment = &depth_reference;
+	if (hasDepth) {
+		subpass.pDepthStencilAttachment = &depth_reference;
+	}
 	subpass.preserveAttachmentCount = 0;
 	subpass.pPreserveAttachments = nullptr;
 
@@ -380,12 +386,12 @@ VkRenderPass CreateRP(VulkanContext *vulkan, const RPKey &key, RenderPassType rp
 	size_t numDeps = 0;
 
 	VkRenderPassCreateInfo rp{ VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO };
-	rp.attachmentCount = 2;
+	rp.attachmentCount = hasDepth ? 2 : 1;
 	rp.pAttachments = attachments;
 	rp.subpassCount = 1;
 	rp.pSubpasses = &subpass;
 
-	if (rpType == RP_TYPE_BACKBUFFER) {
+	if (isBackbuffer) {
 		deps[numDeps].srcSubpass = VK_SUBPASS_EXTERNAL;
 		deps[numDeps].dstSubpass = 0;
 		deps[numDeps].srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
@@ -424,7 +430,7 @@ VkRenderPass VKRRenderPass::Get(VulkanContext *vulkan, RenderPassType rpType) {
 	// practical later when referring to it. Could change to on-demand if it feels motivated
 	// but I think the render pass objects are cheap.
 	if (!pass[(int)rpType]) {
-		pass[(int)rpType] = CreateRP(vulkan, key_, (RenderPassType)rpType);
+		pass[(int)rpType] = CreateRenderPass(vulkan, key_, (RenderPassType)rpType);
 	}
 	return pass[(int)rpType];
 }
@@ -873,8 +879,10 @@ std::string VulkanQueueRunner::StepToString(const VKRStep &step) const {
 		const char *renderCmd;
 		switch (step.render.renderPassType) {
 		case RP_TYPE_BACKBUFFER: renderCmd = "BACKBUF"; break;
-		case RP_TYPE_COLOR_DEPTH: renderCmd = "RENDER"; break;
-		case RP_TYPE_COLOR_DEPTH_INPUT: renderCmd = "RENDER_INPUT"; break;
+		case RP_TYPE_COLOR: renderCmd = "RENDER"; break;
+		case RP_TYPE_COLOR_DEPTH: renderCmd = "RENDER_DEPTH"; break;
+		case RP_TYPE_COLOR_INPUT: renderCmd = "RENDER_INPUT"; break;
+		case RP_TYPE_COLOR_DEPTH_INPUT: renderCmd = "RENDER_DEPTH_INPUT"; break;
 		default: renderCmd = "N/A";
 		}
 		snprintf(buffer, sizeof(buffer), "%s %s (draws: %d, %dx%d/%dx%d, fb: %p, )", renderCmd, step.tag, step.render.numDraws, actual_w, actual_h, w, h, step.render.framebuffer);
@@ -1153,7 +1161,7 @@ void TransitionToOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayout
 			srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected color layout %d", (int)colorLayout);
+			_dbg_assert_msg_(false, "TransitionToOptimal: Unexpected color layout %d", (int)colorLayout);
 			break;
 		}
 		recordBarrier->TransitionImage(
@@ -1189,7 +1197,7 @@ void TransitionToOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayout
 			srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected depth layout %d", (int)depthStencilLayout);
+			_dbg_assert_msg_(false, "TransitionToOptimal: Unexpected depth layout %d", (int)depthStencilLayout);
 			break;
 		}
 		recordBarrier->TransitionImage(
@@ -1236,7 +1244,7 @@ void TransitionFromOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayou
 			// Nothing to do.
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected final color layout %d", (int)colorLayout);
+			_dbg_assert_msg_(false, "TransitionFromOptimal: Unexpected final color layout %d", (int)colorLayout);
 			break;
 		}
 		barrier[0].oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
@@ -1275,7 +1283,7 @@ void TransitionFromOptimal(VkCommandBuffer cmd, VkImage colorImage, VkImageLayou
 			// Nothing to do.
 			break;
 		default:
-			_dbg_assert_msg_(false, "GetRenderPass: Unexpected final depth layout %d", (int)depthStencilLayout);
+			_dbg_assert_msg_(false, "TransitionFromOptimal: Unexpected final depth layout %d", (int)depthStencilLayout);
 			break;
 		}
 		barrier[barrierCount].oldLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.h b/Common/GPU/Vulkan/VulkanQueueRunner.h
index 2c76262a6a..d6d07ec6ce 100644
--- a/Common/GPU/Vulkan/VulkanQueueRunner.h
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.h
@@ -43,18 +43,19 @@ enum class VKRRenderCommand : uint8_t {
 
 enum class PipelineFlags {
 	NONE = 0,
-	USES_LINES = (1 << 2),
 	USES_BLEND_CONSTANT = (1 << 3),
-	USES_DEPTH_STENCIL = (1 << 4),  // Reads or writes the depth buffer.
+	USES_DEPTH_STENCIL = (1 << 4),  // Reads or writes the depth or stencil buffers.
 	USES_INPUT_ATTACHMENT = (1 << 5),
 };
 ENUM_CLASS_BITOPS(PipelineFlags);
 
 // Pipelines need to be created for the right type of render pass.
 enum RenderPassType {
-	RP_TYPE_BACKBUFFER,
+	RP_TYPE_BACKBUFFER,  // For the backbuffer we can always use CLEAR/DONT_CARE, so bandwidth cost for a depth channel is negligible.
 	RP_TYPE_COLOR_DEPTH,
 	RP_TYPE_COLOR_DEPTH_INPUT,
+	RP_TYPE_COLOR,
+	RP_TYPE_COLOR_INPUT,
 	// Later will add pure-color render passes.
 	RP_TYPE_COUNT,
 };
diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp
index 4d1363e49c..045ebf7a8a 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@@ -158,33 +158,37 @@ VKRFramebuffer::VKRFramebuffer(VulkanContext *vk, VkCommandBuffer initCmd, VKRRe
 	// We create the actual framebuffer objects on demand, because some combinations might not make sense.
 }
 
-VkFramebuffer VKRFramebuffer::Get(VKRRenderPass *compatibleRenderPass, RenderPassType renderPassType) {
-	if (framebuf[(int)renderPassType]) {
-		return framebuf[(int)renderPassType];
+VkFramebuffer VKRFramebuffer::Get(VKRRenderPass *compatibleRenderPass, RenderPassType rpType) {
+	if (framebuf[(int)rpType]) {
+		return framebuf[(int)rpType];
 	}
 
 	VkFramebufferCreateInfo fbci{ VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO };
 	VkImageView views[2]{};
 
-	fbci.renderPass = compatibleRenderPass->Get(vulkan_, renderPassType);
-	fbci.attachmentCount = 2;
-	fbci.pAttachments = views;
+	bool hasDepth = rpType == RP_TYPE_BACKBUFFER || rpType == RP_TYPE_COLOR_DEPTH || rpType == RP_TYPE_COLOR_DEPTH_INPUT;
+
 	views[0] = color.imageView;
-	views[1] = depth.imageView;
+	if (hasDepth) {
+		views[1] = depth.imageView;
+	}
+	fbci.renderPass = compatibleRenderPass->Get(vulkan_, rpType);
+	fbci.attachmentCount = hasDepth ? 2 : 1;
+	fbci.pAttachments = views;
 	fbci.width = width;
 	fbci.height = height;
 	fbci.layers = 1;
 
-	VkResult res = vkCreateFramebuffer(vulkan_->GetDevice(), &fbci, nullptr, &framebuf[(int)renderPassType]);
+	VkResult res = vkCreateFramebuffer(vulkan_->GetDevice(), &fbci, nullptr, &framebuf[(int)rpType]);
 	_assert_(res == VK_SUCCESS);
 
 	if (!tag_.empty() && vulkan_->Extensions().EXT_debug_utils) {
 		vulkan_->SetDebugName(color.image, VK_OBJECT_TYPE_IMAGE, StringFromFormat("fb_color_%s", tag_.c_str()).c_str());
 		vulkan_->SetDebugName(depth.image, VK_OBJECT_TYPE_IMAGE, StringFromFormat("fb_depth_%s", tag_.c_str()).c_str());
-		vulkan_->SetDebugName(framebuf[(int)renderPassType], VK_OBJECT_TYPE_FRAMEBUFFER, StringFromFormat("fb_%s", tag_.c_str()).c_str());
+		vulkan_->SetDebugName(framebuf[(int)rpType], VK_OBJECT_TYPE_FRAMEBUFFER, StringFromFormat("fb_%s", tag_.c_str()).c_str());
 	}
 
-	return framebuf[(int)renderPassType];
+	return framebuf[(int)rpType];
 }
 
 VKRFramebuffer::~VKRFramebuffer() {
@@ -656,15 +660,16 @@ void VulkanRenderManager::EndCurRenderStep() {
 		curRenderStep_->render.colorLoad, curRenderStep_->render.depthLoad, curRenderStep_->render.stencilLoad,
 		curRenderStep_->render.colorStore, curRenderStep_->render.depthStore, curRenderStep_->render.stencilStore,
 	};
-	RenderPassType rpType = RP_TYPE_COLOR_DEPTH;
 	// Save the accumulated pipeline flags so we can use that to configure the render pass.
 	// We'll often be able to avoid loading/saving the depth/stencil buffer.
 	curRenderStep_->render.pipelineFlags = curPipelineFlags_;
+	bool depthStencil = (curPipelineFlags_ & PipelineFlags::USES_DEPTH_STENCIL) != 0;
+	RenderPassType rpType = depthStencil ? RP_TYPE_COLOR_DEPTH : RP_TYPE_COLOR;
 	if (!curRenderStep_->render.framebuffer) {
 		rpType = RP_TYPE_BACKBUFFER;
 	} else if (curPipelineFlags_ & PipelineFlags::USES_INPUT_ATTACHMENT) {
 		// Not allowed on backbuffers.
-		rpType = RP_TYPE_COLOR_DEPTH_INPUT;
+		rpType = depthStencil ? RP_TYPE_COLOR_INPUT : RP_TYPE_COLOR_DEPTH_INPUT;
 	}
 	// TODO: Also add render pass types for depth/stencil-less.
 
diff --git a/GPU/Vulkan/PipelineManagerVulkan.cpp b/GPU/Vulkan/PipelineManagerVulkan.cpp
index e950dfc457..21ce7df61d 100644
--- a/GPU/Vulkan/PipelineManagerVulkan.cpp
+++ b/GPU/Vulkan/PipelineManagerVulkan.cpp
@@ -170,8 +170,8 @@ static std::string CutFromMain(std::string str) {
 }
 
 static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager, VkPipelineCache pipelineCache,
-		VkPipelineLayout layout, PipelineFlags pipelineFlags, const VulkanPipelineRasterStateKey &key,
-		const DecVtxFormat *decFmt, VulkanVertexShader *vs, VulkanFragmentShader *fs, bool useHwTransform, u32 variantBitmask) {
+	VkPipelineLayout layout, PipelineFlags pipelineFlags, const VulkanPipelineRasterStateKey &key,
+	const DecVtxFormat *decFmt, VulkanVertexShader *vs, VulkanFragmentShader *fs, bool useHwTransform, u32 variantBitmask) {
 	VulkanPipeline *vulkanPipeline = new VulkanPipeline();
 	VKRGraphicsPipelineDesc *desc = &vulkanPipeline->desc;
 	desc->pipelineCache = pipelineCache;
@@ -221,7 +221,7 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
 	VkDynamicState *dynamicStates = &desc->dynamicStates[0];
 	int numDyn = 0;
 	if (key.blendEnable &&
-		  (UsesBlendConstant(key.srcAlpha) || UsesBlendConstant(key.srcColor) || UsesBlendConstant(key.destAlpha) || UsesBlendConstant(key.destColor))) {
+		(UsesBlendConstant(key.srcAlpha) || UsesBlendConstant(key.srcColor) || UsesBlendConstant(key.destAlpha) || UsesBlendConstant(key.destColor))) {
 		dynamicStates[numDyn++] = VK_DYNAMIC_STATE_BLEND_CONSTANTS;
 		useBlendConstant = true;
 	}
@@ -232,12 +232,12 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
 		dynamicStates[numDyn++] = VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK;
 		dynamicStates[numDyn++] = VK_DYNAMIC_STATE_STENCIL_REFERENCE;
 	}
-	
+
 	VkPipelineDynamicStateCreateInfo &ds = desc->ds;
 	ds.flags = 0;
 	ds.pDynamicStates = dynamicStates;
 	ds.dynamicStateCount = numDyn;
-	
+
 	VkPipelineRasterizationStateCreateInfo &rs = desc->rs;
 	rs.flags = 0;
 	rs.depthBiasEnable = false;
@@ -299,10 +299,9 @@ static VulkanPipeline *CreateVulkanPipeline(VulkanRenderManager *renderManager,
 	VKRGraphicsPipeline *pipeline = renderManager->CreateGraphicsPipeline(desc, variantBitmask, "game");
 
 	vulkanPipeline->pipeline = pipeline;
-	if (useBlendConstant)
+	if (useBlendConstant) {
 		pipelineFlags |= PipelineFlags::USES_BLEND_CONSTANT;
-	if (key.topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST || key.topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP)
-		pipelineFlags |= PipelineFlags::USES_LINES;
+	}
 	if (dss.depthTestEnable || dss.stencilTestEnable) {
 		pipelineFlags |= PipelineFlags::USES_DEPTH_STENCIL;
 	}
diff --git a/GPU/Vulkan/PipelineManagerVulkan.h b/GPU/Vulkan/PipelineManagerVulkan.h
index 08907e3b3e..a7dac0c30f 100644
--- a/GPU/Vulkan/PipelineManagerVulkan.h
+++ b/GPU/Vulkan/PipelineManagerVulkan.h
@@ -58,7 +58,6 @@ struct VulkanPipeline {
 	PipelineFlags pipelineFlags;  // PipelineFlags enum above.
 
 	bool UsesBlendConstant() const { return (pipelineFlags & PipelineFlags::USES_BLEND_CONSTANT) != 0; }
-	bool UsesLines() const { return (pipelineFlags & PipelineFlags::USES_LINES) != 0; }
 	bool UsesDepthStencil() const { return (pipelineFlags & PipelineFlags::USES_DEPTH_STENCIL) != 0; }
 	bool UsesInputAttachment() const { return (pipelineFlags & PipelineFlags::USES_INPUT_ATTACHMENT) != 0; }
 

From 6d2f29e7eb6a9eb8a28058dd54ddf05342a64f6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 21 Sep 2022 13:24:39 +0200
Subject: [PATCH 26/35] If depth or stencil is cleared in a renderpass, set
 the pipeline flag.

---
 Common/GPU/Vulkan/VulkanRenderManager.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp
index 045ebf7a8a..d8c16e331a 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@@ -719,9 +719,11 @@ void VulkanRenderManager::BindFramebufferAsRenderTarget(VKRFramebuffer *fb, VKRR
 		}
 		if (depth == VKRRenderPassLoadAction::CLEAR) {
 			clearMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
+			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
 		}
 		if (stencil == VKRRenderPassLoadAction::CLEAR) {
 			clearMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
+			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
 		}
 
 		// If we need a clear and the previous step has commands already, it's best to just add a clear and keep going.
@@ -1002,6 +1004,10 @@ void VulkanRenderManager::Clear(uint32_t clearColor, float clearZ, int clearSten
 		curRenderStep_->render.depthLoad = (clearMask & VK_IMAGE_ASPECT_DEPTH_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;
 		curRenderStep_->render.stencilLoad = (clearMask & VK_IMAGE_ASPECT_STENCIL_BIT) ? VKRRenderPassLoadAction::CLEAR : VKRRenderPassLoadAction::KEEP;
 
+		if (clearMask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
+			curPipelineFlags_ |= PipelineFlags::USES_DEPTH_STENCIL;
+		}
+
 		// In case there were commands already.
 		curRenderStep_->render.numDraws = 0;
 		RemoveDrawCommands(&curRenderStep_->commands);

From 11b807828fd4259102477270a425806a2b52938c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 10:22:29 +0200
Subject: [PATCH 27/35] Fix for render pass merge

---
 Common/GPU/Vulkan/VulkanQueueRunner.cpp | 15 ++++++---------
 Common/GPU/Vulkan/VulkanQueueRunner.h   | 11 ++++++++---
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.cpp b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
index 0db9618e76..8ad509afbb 100644
--- a/Common/GPU/Vulkan/VulkanQueueRunner.cpp
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.cpp
@@ -30,16 +30,14 @@ static void MergeRenderAreaRectInto(VkRect2D *dest, VkRect2D &src) {
 // We need to take the "max" of the features used in the two render passes.
 RenderPassType MergeRPTypes(RenderPassType a, RenderPassType b) {
 	// Either both are backbuffer type, or neither are.
-	_dbg_assert_((a == RP_TYPE_BACKBUFFER) == (b == RP_TYPE_BACKBUFFER));
-	if (a == b) {
-		// Trivial merging case.
+	// These can't merge with other renderpasses
+	if (a == RP_TYPE_BACKBUFFER || b == RP_TYPE_BACKBUFFER) {
+		_dbg_assert_(a == b);
 		return a;
-	} else if (a == RP_TYPE_COLOR_DEPTH && b == RP_TYPE_COLOR_DEPTH_INPUT) {
-		return RP_TYPE_COLOR_DEPTH_INPUT;
-	} else if (a == RP_TYPE_COLOR_DEPTH_INPUT && b == RP_TYPE_COLOR_DEPTH) {
-		return RP_TYPE_COLOR_DEPTH_INPUT;
 	}
-	return a;
+
+	// The rest we can just OR together to get the maximum feature set.
+	return (RenderPassType)((u32)a | (u32)b);
 }
 
 void VulkanQueueRunner::CreateDeviceObjects() {
@@ -399,7 +397,6 @@ VkRenderPass CreateRenderPass(VulkanContext *vulkan, const RPKey &key, RenderPas
 		deps[numDeps].srcAccessMask = 0;
 		deps[numDeps].dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
 		numDeps++;
-		rp.dependencyCount = 1;
 	}
 
 	if (selfDependency) {
diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.h b/Common/GPU/Vulkan/VulkanQueueRunner.h
index d6d07ec6ce..9c1fee77d9 100644
--- a/Common/GPU/Vulkan/VulkanQueueRunner.h
+++ b/Common/GPU/Vulkan/VulkanQueueRunner.h
@@ -51,11 +51,16 @@ ENUM_CLASS_BITOPS(PipelineFlags);
 
 // Pipelines need to be created for the right type of render pass.
 enum RenderPassType {
-	RP_TYPE_BACKBUFFER,  // For the backbuffer we can always use CLEAR/DONT_CARE, so bandwidth cost for a depth channel is negligible.
-	RP_TYPE_COLOR_DEPTH,
-	RP_TYPE_COLOR_DEPTH_INPUT,
+	// These four are organized so that bit 0 is DEPTH and bit 1 is INPUT, so
+	// they can be OR-ed together in MergeRPTypes.
 	RP_TYPE_COLOR,
+	RP_TYPE_COLOR_DEPTH,
 	RP_TYPE_COLOR_INPUT,
+	RP_TYPE_COLOR_DEPTH_INPUT,
+
+	// This is the odd one out, and gets special handling in MergeRPTypes.
+	RP_TYPE_BACKBUFFER,  // For the backbuffer we can always use CLEAR/DONT_CARE, so bandwidth cost for a depth channel is negligible.
+
 	// Later will add pure-color render passes.
 	RP_TYPE_COUNT,
 };

From a31c5c8239e6a58abaf69db58e9d180212420c64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 10:48:05 +0200
Subject: [PATCH 28/35] Cleanup logic

---
 GPU/Common/FramebufferManagerCommon.cpp | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index cdfe1fd677..140a0387bc 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -555,15 +555,12 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 	if (!isClearingDepth && useBufferedRendering_) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);
 
-		// Special compatibility trick for Burnout Dominator lens flares. Not sure how to best generalize this. See issue #11100
+		// Special compatibility trick for Burnout Dominator lens flares. See issue #11100
 		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && currentRenderVfb_->z_address > 0x04110000) {
-			// Set the flag, then upload memory contents to depth channel.
 			// Sanity check the depth buffer pointer.
-			if (currentRenderVfb_->z_address > 0x04110000) {
-				if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
-					const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
-					DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");
-				}
+			if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
+				const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
+				DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");
 			}
 		}
 	}

From 078fa9beb2ac8a22340e772d80799d40f6596721 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 15:27:17 +0200
Subject: [PATCH 29/35] Fix corruption of Ridge Racer speedometers with
 AutoMaxQuality enabled.

See #8509
---
 GPU/Common/TextureCacheCommon.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index e47f768b35..dddd2d2edd 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -2698,7 +2698,7 @@ bool TextureCacheCommon::PrepareBuildTexture(BuildTexturePlan &plan, TexCacheEnt
 		plan.levelsToLoad = 1;
 	}
 
-	if (plan.isVideo || plan.depth != 1) {
+	if (plan.isVideo || plan.depth != 1 || plan.decodeToClut8) {
 		plan.maxPossibleLevels = 1;
 	} else {
 		plan.maxPossibleLevels = log2i(std::min(plan.createW, plan.createH)) + 1;

From c76d7e844c36fd6ea9f65d6eb83a598933654ab4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 19:37:46 +0200
Subject: [PATCH 30/35] Fix Vulkan regression of #16075 due to silly typo.

---
 Common/GPU/Vulkan/VulkanRenderManager.cpp | 2 +-
 Common/GPU/Vulkan/thin3d_vulkan.cpp       | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp
index d8c16e331a..82c99e89ce 100644
--- a/Common/GPU/Vulkan/VulkanRenderManager.cpp
+++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp
@@ -669,7 +669,7 @@ void VulkanRenderManager::EndCurRenderStep() {
 		rpType = RP_TYPE_BACKBUFFER;
 	} else if (curPipelineFlags_ & PipelineFlags::USES_INPUT_ATTACHMENT) {
 		// Not allowed on backbuffers.
-		rpType = depthStencil ? RP_TYPE_COLOR_INPUT : RP_TYPE_COLOR_DEPTH_INPUT;
+		rpType = depthStencil ? RP_TYPE_COLOR_DEPTH_INPUT : RP_TYPE_COLOR_INPUT;
 	}
 	// TODO: Also add render pass types for depth/stencil-less.
 
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index 34bbdd5efb..6a5af31453 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -1056,6 +1056,7 @@ Pipeline *VKContext::CreateGraphicsPipeline(const PipelineDesc &desc, const char
 	if (depth->info.depthTestEnable || depth->info.stencilTestEnable) {
 		pipelineFlags |= PipelineFlags::USES_DEPTH_STENCIL;
 	}
+	// TODO: We need code to set USES_BLEND_CONSTANT here too, if we're ever gonna use those in thin3d code.
 
 	VKPipeline *pipeline = new VKPipeline(vulkan_, desc.uniformDesc ? desc.uniformDesc->uniformBufferSize : 16 * sizeof(float), pipelineFlags, tag);
 

From 0ab5d4be7f2a08b8f3a63e43576ae346d4858e21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 20:00:35 +0200
Subject: [PATCH 31/35] Use Unknown's idea from #16081 instead of the
 UploadDepthForCLUTTextures compat.ini flag.

---
 Core/Compatibility.cpp                  |  1 -
 Core/Compatibility.h                    |  1 -
 GPU/Common/FramebufferManagerCommon.cpp |  5 +++--
 assets/compat.ini                       | 13 -------------
 4 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp
index 62ba3ec21b..75c0623ead 100644
--- a/Core/Compatibility.cpp
+++ b/Core/Compatibility.cpp
@@ -108,7 +108,6 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "SplitFramebufferMargin", &flags_.SplitFramebufferMargin);
 	CheckSetting(iniFile, gameID, "ForceLowerResolutionForEffectsOn", &flags_.ForceLowerResolutionForEffectsOn);
 	CheckSetting(iniFile, gameID, "AllowDownloadCLUT", &flags_.AllowDownloadCLUT);
-	CheckSetting(iniFile, gameID, "UploadDepthForCLUTTextures", &flags_.UploadDepthForCLUTTextures);
 }
 
 void Compatibility::CheckSetting(IniFile &iniFile, const std::string &gameID, const char *option, bool *flag) {
diff --git a/Core/Compatibility.h b/Core/Compatibility.h
index 3a84bfb88e..04f3ab22b7 100644
--- a/Core/Compatibility.h
+++ b/Core/Compatibility.h
@@ -89,7 +89,6 @@ struct CompatFlags {
 	bool SplitFramebufferMargin;
 	bool ForceLowerResolutionForEffectsOn;
 	bool AllowDownloadCLUT;
-	bool UploadDepthForCLUTTextures;
 };
 
 struct VRCompat {
diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index af924216dd..99ceb7ec36 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -555,8 +555,9 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 	if (!isClearingDepth && useBufferedRendering_) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);
 
-		// Special compatibility trick for Burnout Dominator lens flares. See issue #11100
-		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && currentRenderVfb_->z_address > 0x04110000) {
+		// Need to upload the first line of depth buffers, for Burnout Dominator lens flares. See issue #11100 and comments to #16081.
+		// Might make this more generic and upload the whole depth buffer if we find it's needed for something.
+		if (currentRenderVfb_->lastFrameNewSize == gpuStats.numFlips) {
 			// Sanity check the depth buffer pointer.
 			if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
 				const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
diff --git a/assets/compat.ini b/assets/compat.ini
index f153b93933..5ce6077091 100644
--- a/assets/compat.ini
+++ b/assets/compat.ini
@@ -1278,16 +1278,3 @@ ULJM05738 = true
 [AllowDownloadCLUT]
 # Temporary compatibility option, while working on the GPU CLUT-from-framebuffer path.
 # Not required for any games now that it works, but might be useful for development.
-
-[UploadDepthForCLUTTextures]
-# Burnout Dominator - lens flare effect (issue #11100)
-# We need a preinitialized depth buffer
-ULUS10236 = true
-ULES00703 = true
-
-# Need for Speed - Shift (same as Burnout Dominator)
-ULUS10462 = true
-ULES01275 = true
-ULJM05494 = true
-NPJH50143 = true
-ULJM05738 = true

From a6d6e0a3cce3005ead5cafed2e306225cbb6436f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 22 Sep 2022 22:11:16 +0200
Subject: [PATCH 32/35] Texture/Framebuffer match: Ignore stride if texHeight
 == 1. Fixes Ridge Racer lens flares.

---
 GPU/Common/FramebufferManagerCommon.cpp | 5 +++--
 GPU/Common/TextureCacheCommon.cpp       | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index af924216dd..99ceb7ec36 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -555,8 +555,9 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 	if (!isClearingDepth && useBufferedRendering_) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);
 
-		// Special compatibility trick for Burnout Dominator lens flares. See issue #11100
-		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && currentRenderVfb_->z_address > 0x04110000) {
+		// Need to upload the first line of depth buffers, for Burnout Dominator lens flares. See issue #11100 and comments to #16081.
+		// Might make this more generic and upload the whole depth buffer if we find it's needed for something.
+		if (currentRenderVfb_->lastFrameNewSize == gpuStats.numFlips) {
 			// Sanity check the depth buffer pointer.
 			if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
 				const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index dddd2d2edd..205dcbc602 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -1018,7 +1018,8 @@ bool TextureCacheCommon::MatchFramebuffer(
 			return false;
 		}
 
-		if (fb_stride_in_bytes != tex_stride_in_bytes) {
+		// Note the check for texHeight - we really don't care about a stride mismatch if texHeight == 1.
+		if (fb_stride_in_bytes != tex_stride_in_bytes && texHeight > 1) {
 			// Probably irrelevant. Although, as we shall see soon, there are exceptions.
 			// Burnout Dominator lens flare trick special case.
 			if (fb_format == GE_FORMAT_8888 && entry.format == GE_TFMT_CLUT8 && texWidth == 4 && texHeight == 1) {

From 66b6dfd0a5618a4cd11f1c05dca26fbaecef9882 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Thu, 22 Sep 2022 20:21:44 -0700
Subject: [PATCH 33/35] softgpu: Fix self-render detect in Ridge Racer.

When we flush, we reset all pending writes to zero, but we rely on them
being set to detect self-render.

TRANSFORM_ALL was wrong as well, sometimes clearing BINNER_RANGE.
---
 GPU/Software/BinManager.cpp | 23 +++++++++++++++++------
 GPU/Software/BinManager.h   |  1 +
 GPU/Software/SoftGpu.h      | 10 +++++-----
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/GPU/Software/BinManager.cpp b/GPU/Software/BinManager.cpp
index 42eef12eda..fa7c78f21e 100644
--- a/GPU/Software/BinManager.cpp
+++ b/GPU/Software/BinManager.cpp
@@ -197,16 +197,16 @@ void BinManager::UpdateState(bool throughMode) {
 			Flush("tex");
 
 		// Okay, now update what's pending.
-		constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;
-		const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;
-		pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);
-		if (state.pixelID.depthWrite)
-			pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
+		MarkPendingWrites(state);
 
 		ClearDirty(SoftDirty::BINNER_RANGE);
 	} else if (pendingOverlap_) {
-		if (HasTextureWrite(state))
+		if (HasTextureWrite(state)) {
 			Flush("tex");
+
+			// We need the pending writes set, which flushing cleared.  Set them again.
+			MarkPendingWrites(state);
+		}
 	}
 
 	if (HasDirty(SoftDirty::BINNER_OVERLAP)) {
@@ -282,6 +282,17 @@ void BinManager::MarkPendingReads(const Rasterizer::RasterizerState &state) {
 	}
 }
 
+void BinManager::MarkPendingWrites(const Rasterizer::RasterizerState &state) {  // Expand()s pendingWrites_ for the current color/depth targets.
+	DrawingCoords scissorTL(gstate.getScissorX1(), gstate.getScissorY1());  // Top-left comes straight from the scissor registers.
+	DrawingCoords scissorBR(std::min(gstate.getScissorX2(), gstate.getRegionX2()), std::min(gstate.getScissorY2(), gstate.getRegionY2()));  // Bottom-right is the smaller of scissor and region on each axis.
+
+	constexpr uint32_t mirrorMask = 0x0FFFFFFF & ~0x00600000;  // Applied to both buffer addresses below; presumably strips address mirror bits - TODO confirm.
+	const uint32_t bpp = state.pixelID.FBFormat() == GE_FORMAT_8888 ? 4 : 2;  // Bytes per pixel: 4 for 8888, 2 for every other format.
+	pendingWrites_[0].Expand(gstate.getFrameBufAddress() & mirrorMask, bpp, gstate.FrameBufStride(), scissorTL, scissorBR);  // Slot 0 tracks the framebuffer.
+	if (state.pixelID.depthWrite)  // Slot 1 (depth buffer, 2 bytes per pixel) is only expanded when depth writes are enabled.
+		pendingWrites_[1].Expand(gstate.getDepthBufAddress() & mirrorMask, 2, gstate.DepthBufStride(), scissorTL, scissorBR);
+}
+
 inline void BinDirtyRange::Expand(uint32_t newBase, uint32_t bpp, uint32_t stride, DrawingCoords &tl, DrawingCoords &br) {
 	const uint32_t w = br.x - tl.x + 1;
 	const uint32_t h = br.y - tl.y + 1;
diff --git a/GPU/Software/BinManager.h b/GPU/Software/BinManager.h
index 79146eae1d..e74f6faac2 100644
--- a/GPU/Software/BinManager.h
+++ b/GPU/Software/BinManager.h
@@ -267,6 +267,7 @@ private:
 	int mostThreads_ = 0;
 
 	void MarkPendingReads(const Rasterizer::RasterizerState &state);
+	void MarkPendingWrites(const Rasterizer::RasterizerState &state);
 	bool HasTextureWrite(const Rasterizer::RasterizerState &state);
 	BinCoords Scissor(BinCoords range);
 	BinCoords Range(const VertexData &v0, const VertexData &v1, const VertexData &v2);
diff --git a/GPU/Software/SoftGpu.h b/GPU/Software/SoftGpu.h
index 18fa23118d..9944f2b789 100644
--- a/GPU/Software/SoftGpu.h
+++ b/GPU/Software/SoftGpu.h
@@ -64,17 +64,17 @@ enum class SoftDirty : uint64_t {
 	PIXEL_DITHER = 1ULL << 3,
 	PIXEL_WRITEMASK = 1ULL << 4,
 	PIXEL_CACHED = 1ULL << 5,
-	PIXEL_ALL = 63ULL << 0,
+	PIXEL_ALL = 0b111111ULL << 0,
 
 	SAMPLER_BASIC = 1ULL << 6,
 	SAMPLER_TEXLIST = 1ULL << 7,
 	SAMPLER_CLUT = 1ULL << 8,
-	SAMPLER_ALL = 7ULL << 6,
+	SAMPLER_ALL = 0b111ULL << 6,
 
 	RAST_BASIC = 1ULL << 9,
 	RAST_TEX = 1ULL << 10,
 	RAST_OFFSET = 1ULL << 11,
-	RAST_ALL = 7ULL << 9,
+	RAST_ALL = 0b111ULL << 9,
 
 	LIGHT_BASIC = 1ULL << 12,
 	LIGHT_MATERIAL = 1ULL << 13,
@@ -82,13 +82,13 @@ enum class SoftDirty : uint64_t {
 	LIGHT_1 = 1ULL << 15,
 	LIGHT_2 = 1ULL << 16,
 	LIGHT_3 = 1ULL << 17,
-	LIGHT_ALL = 63ULL << 12,
+	LIGHT_ALL = 0b111111ULL << 12,
 
 	TRANSFORM_BASIC = 1ULL << 18,
 	TRANSFORM_MATRIX = 1ULL << 19,
 	TRANSFORM_VIEWPORT = 1ULL << 20,
 	TRANSFORM_FOG = 1ULL << 21,
-	TRANSFORM_ALL = 31ULL << 18,
+	TRANSFORM_ALL = 0b1111ULL << 18,
 
 	BINNER_RANGE = 1ULL << 22,
 	BINNER_OVERLAP = 1ULL << 23,

From 93c909a88e7b2fa49ebd0c7b7d2dd9e26093e21f Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Fri, 23 Sep 2022 00:04:14 -0700
Subject: [PATCH 34/35] GPU: Upload depth only on first usage.

Fixes various glitches in Kingdom Hearts, etc.
---
 GPU/Common/FramebufferManagerCommon.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index 99ceb7ec36..330eda4796 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -548,6 +548,7 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 	}
 
 	// First time use of this framebuffer's depth buffer.
+	bool newlyUsingDepth = (currentRenderVfb_->usageFlags & FB_USAGE_RENDER_DEPTH) == 0;
 	currentRenderVfb_->usageFlags |= FB_USAGE_RENDER_DEPTH;
 
 	// If this first draw call is anything other than a clear, "resolve" the depth buffer,
@@ -557,7 +558,7 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 
 		// Need to upload the first line of depth buffers, for Burnout Dominator lens flares. See issue #11100 and comments to #16081.
 		// Might make this more generic and upload the whole depth buffer if we find it's needed for something.
-		if (currentRenderVfb_->lastFrameNewSize == gpuStats.numFlips) {
+		if (newlyUsingDepth) {
 			// Sanity check the depth buffer pointer.
 			if (Memory::IsValidRange(currentRenderVfb_->z_address, currentRenderVfb_->width * 2)) {
 				const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);

From d79828270ae6287a897bd475f2d29c98956d1a68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 23 Sep 2022 09:13:34 +0200
Subject: [PATCH 35/35] Add Burnout Dominator to list of games that require
 buffered rendering

It does run without buffered rendering, but it looks really bad, and we
don't want reports about this.
---
 assets/compat.ini | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/assets/compat.ini b/assets/compat.ini
index 5ce6077091..8457a6a36f 100644
--- a/assets/compat.ini
+++ b/assets/compat.ini
@@ -520,6 +520,14 @@ ULES01086 = true
 # LEGO Batman: The Videogame
 ULUS10380 = true
 ULES01151 = true
+# Burnout Dominator
+ULUS10236 = true
+ULES00750 = true
+ULJM05242 = true
+ULJM05371 = true
+NPJH50304 = true
+ULES00703 = true
+
 # TODO: There are many more.
 
 [RequireBlockTransfer]