From 6b55d328e518ecab9f1a093cfd2a78f7ab16e90b Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 28 Dec 2021 15:37:25 -0800
Subject: [PATCH] samplerjit: Use regcache for linear filtering.

This makes it easier to reuse for mipmap filtering.
---
 GPU/Software/Rasterizer.cpp |   2 +-
 GPU/Software/Sampler.h      |   5 +
 GPU/Software/SamplerX86.cpp | 258 +++++++++++++++++++++++-------------
 3 files changed, 174 insertions(+), 91 deletions(-)
diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index b1ebc7ca78..c0a8c1e9e7 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -540,7 +540,7 @@ static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(Sampler::Funcs sampler,
 	} else {
 		texcolor0 = Vec4<int>(sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0));
 		if (mayHaveMipLevels && frac_texlevel) {
-			texcolor1 = Vec4<int>(sampler.linear(s, t, x, y, prim_color, tptr0 + 1, bufw0 + 1, texlevel + 1, frac_texlevel));
+			texcolor1 = Vec4<int>(sampler.linear(s, t, x, y, prim_color, tptr0 + 1, bufw0 + 1, texlevel + 1, 0));
 		}
 	}
 
diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h
index 1f641fc50f..f9500e4999 100644
--- a/GPU/Software/Sampler.h
+++ b/GPU/Software/Sampler.h
@@ -73,6 +73,8 @@ private:
 	NearestFunc Compile(const SamplerID &id);
 	LinearFunc CompileLinear(const SamplerID &id);
 
+	Rasterizer::RegCache::Reg GetZeroVec();
+
 	bool Jit_ReadTextureFormat(const SamplerID &id);
 	bool Jit_GetTexData(const SamplerID &id, int bitsPerTexel);
 	bool Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel);
@@ -89,6 +91,7 @@ private:
 	bool Jit_PrepareDataOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg);
 	bool Jit_PrepareDataDirectOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel);
 	bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel);
+	bool Jit_BlendQuad(const SamplerID &id, Rasterizer::RegCache::Reg destReg, bool level1);
 
 #if PPSSPP_ARCH(ARM64)
 	Arm64Gen::ARM64FloatEmitter fp;
@@ -104,6 +107,8 @@ private:
 	const u8 *constUNext_ = nullptr;
 	const u8 *constVNext_ = nullptr;
 	const u8 *constOnes_ = nullptr;
+	const u8 *const10Low_ = nullptr;
+	const u8 *const10All_ = nullptr;
 
 	std::unordered_map<SamplerID, NearestFunc> cache_;
 	std::unordered_map<SamplerID, const u8 *> addresses_;
diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp
index 028d69cab8..7d378b2036 100644
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -162,11 +162,11 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 	regCache_.Reset(true);
 
 	// Let's drop some helpful constants here.
-	const u8 *const10All = AlignCode16();
+	const10All_ = AlignCode16();
 	Write16(0x10); Write16(0x10); Write16(0x10); Write16(0x10);
 	Write16(0x10); Write16(0x10); Write16(0x10); Write16(0x10);
 
-	const u8 *const10Low = AlignCode16();
+	const10Low_ = AlignCode16();
 	Write16(0x10); Write16(0x10); Write16(0x10); Write16(0x10);
 	Write16(0); Write16(0); Write16(0); Write16(0);
 
@@ -346,7 +346,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 
 	// This stores the result in an XMM for later processing.
 	// We map lookups to nearest CALLs, with arg order: u, v, src, bufw, level
-	auto doNearestCall = [&](int off) {
+	auto doNearestCall = [&](int off, bool level1) {
 #if PPSSPP_PLATFORM(WINDOWS)
 		static const X64Reg uArgReg = RCX;
 		static const X64Reg vArgReg = RDX;
@@ -360,30 +360,30 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 #endif
 		static const X64Reg resultReg = RAX;
 
-		X64Reg uReg = regCache_.Find(RegCache::VEC_ARG_U);
-		X64Reg vReg = regCache_.Find(RegCache::VEC_ARG_V);
+		X64Reg uReg = regCache_.Find(level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
+		X64Reg vReg = regCache_.Find(level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
 		// Otherwise, we'll overwrite them...
-		_assert_(uReg == XMM0 && vReg == XMM1);
+		_assert_(level1 || (uReg == XMM0 && vReg == XMM1));
 
 		MOVD_xmm(R(uArgReg), uReg);
 		MOVD_xmm(R(vArgReg), vReg);
 
 		X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR);
 		X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW);
-		MOV(64, R(srcArgReg), MDisp(srcReg, 0));
-		MOV(32, R(bufwArgReg), MDisp(bufwReg, 0));
+		MOV(64, R(srcArgReg), MDisp(srcReg, level1 ? 8 : 0));
+		MOV(32, R(bufwArgReg), MDisp(bufwReg, level1 ? 4 : 0));
 		// Leave level/levelFrac, we just always load from RAM on Windows and lock on POSIX.
 		regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR);
 		regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
 
 		PSRLDQ(uReg, 4);
 		PSRLDQ(vReg, 4);
-		regCache_.Unlock(uReg, RegCache::VEC_ARG_U);
-		regCache_.Unlock(vReg, RegCache::VEC_ARG_V);
+		regCache_.Unlock(uReg, level1 ? RegCache::VEC_U1 : RegCache::VEC_ARG_U);
+		regCache_.Unlock(vReg, level1 ? RegCache::VEC_V1 : RegCache::VEC_ARG_V);
 
 		CALL(nearest);
 
-		X64Reg vecResultReg = regCache_.Find(RegCache::VEC_RESULT);
+		X64Reg vecResultReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
 		if (off == 0) {
 			MOVD_xmm(vecResultReg, R(resultReg));
 		} else {
@@ -393,13 +393,41 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 			POR(vecResultReg, R(tempReg));
 			regCache_.Release(tempReg, RegCache::VEC_TEMP0);
 		}
-		regCache_.Unlock(vecResultReg, RegCache::VEC_RESULT);
+		regCache_.Unlock(vecResultReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
 	};
 
-	doNearestCall(0);
-	doNearestCall(4);
-	doNearestCall(8);
-	doNearestCall(12);
+	doNearestCall(0, false);
+	doNearestCall(4, false);
+	doNearestCall(8, false);
+	doNearestCall(12, false);
+
+	if (id.hasAnyMips) {
+		if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC)) {
+			X64Reg levelFracReg = regCache_.Find(RegCache::GEN_ARG_LEVELFRAC);
+			CMP(8, R(levelFracReg), Imm8(0));
+			regCache_.Unlock(levelFracReg, RegCache::GEN_ARG_LEVELFRAC);
+		} else {
+			CMP(8, MDisp(RSP, stackArgPos_ + 24), Imm8(0));
+		}
+		FixupBranch skip = J_CC(CC_Z, true);
+
+		// Modify the level, so the new level value is used.  We don't need the old.
+		if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) {
+			X64Reg levelReg = regCache_.Find(RegCache::GEN_ARG_LEVEL);
+			ADD(32, R(levelReg), Imm8(1));
+			regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
+		} else {
+			// It's fine to just modify this in place.
+			ADD(32, MDisp(RSP, stackArgPos_ + 16), Imm8(1));
+		}
+
+		doNearestCall(0, true);
+		doNearestCall(4, true);
+		doNearestCall(8, true);
+		doNearestCall(12, true);
+
+		SetJumpTarget(skip);
+	}
 
 	// We're done with these now.
 	regCache_.ForceRelease(RegCache::VEC_ARG_U);
@@ -408,12 +436,27 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		regCache_.ForceRelease(RegCache::VEC_U1);
 	if (regCache_.Has(RegCache::VEC_V1))
 		regCache_.ForceRelease(RegCache::VEC_V1);
-	regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
 	regCache_.ForceRelease(RegCache::GEN_ARG_TEXPTR);
 	regCache_.ForceRelease(RegCache::GEN_ARG_BUFW);
 	if (regCache_.Has(RegCache::GEN_ARG_LEVEL))
 		regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL);
 
+	// TODO: Convert to reg cache.
+	success = success && Jit_BlendQuad(id, XMM0, false);
+
+	// Last of all, convert to 32-bit channels.
+	if (cpu_info.bSSE4_1) {
+		PMOVZXWD(XMM0, R(XMM0));
+	} else {
+		X64Reg zeroReg = GetZeroVec();
+		PUNPCKLWD(XMM0, R(zeroReg));
+		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
+	}
+
+	regCache_.ForceRelease(RegCache::VEC_RESULT);
+	if (regCache_.Has(RegCache::VEC_RESULT1))
+		regCache_.ForceRelease(RegCache::VEC_RESULT1);
+
 	if (!success) {
 		regCache_.Reset(false);
 		EndWrite();
@@ -421,79 +464,8 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		return nullptr;
 	}
 
-	// TODO: Convert to reg cache.
-	regCache_.ForceRelease(RegCache::VEC_RESULT);
-	if (regCache_.Has(RegCache::VEC_RESULT1))
-		regCache_.ForceRelease(RegCache::VEC_RESULT1);
-	static const X64Reg fpScratchReg1 = XMM1;
-	static const X64Reg fpScratchReg2 = XMM2;
-	static const X64Reg fpScratchReg3 = XMM3;
-	static const X64Reg fpScratchReg4 = XMM4;
-	static const X64Reg fpScratchReg5 = XMM5;
-
-	// First put the top RRRRRRRR LLLLLLLL into fpScratchReg1, bottom into fpScratchReg2.
-	// Start with XXXX XXXX RRRR LLLL, and then expand 8 bits to 16 bits.
-	if (!cpu_info.bSSE4_1) {
-		PXOR(fpScratchReg3, R(fpScratchReg3));
-		PSHUFD(fpScratchReg1, R(XMM5), _MM_SHUFFLE(0, 0, 1, 0));
-		PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(0, 0, 3, 2));
-		PUNPCKLBW(fpScratchReg1, R(fpScratchReg3));
-		PUNPCKLBW(fpScratchReg2, R(fpScratchReg3));
-	} else {
-		PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(0, 0, 3, 2));
-		PMOVZXBW(fpScratchReg1, R(XMM5));
-		PMOVZXBW(fpScratchReg2, R(fpScratchReg2));
-	}
-
-	// Grab frac_u and spread to lower (L) lanes.
-	X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
-	MOVD_xmm(fpScratchReg5, R(fracUReg));
-	regCache_.Unlock(fracUReg, RegCache::GEN_ARG_FRAC_U);
-	regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_U);
-	PSHUFLW(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
-	MOVDQA(fpScratchReg3, M(const10Low));
-	PSUBW(fpScratchReg3, R(fpScratchReg5));
-	// Then we just shift and OR in the original frac_u.
-	PSLLDQ(fpScratchReg5, 8);
-	POR(fpScratchReg3, R(fpScratchReg5));
-
-	// Okay, we have 8-bits in the top and bottom rows for the color.
-	// Multiply by frac to get 12, which we keep for the next stage.
-	PMULLW(fpScratchReg1, R(fpScratchReg3));
-	PMULLW(fpScratchReg2, R(fpScratchReg3));
-
-	// Time for frac_v.  This time, we want it in all 8 lanes.
-	X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
-	MOVD_xmm(fpScratchReg5, R(fracVReg));
-	regCache_.Unlock(fracVReg, RegCache::GEN_ARG_FRAC_V);
-	regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_V);
-	PSHUFLW(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-	PSHUFD(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
-
-	// Now, inverse fpScratchReg5 into fpScratchReg3 for the top row.
-	MOVDQA(fpScratchReg3, M(const10All));
-	PSUBW(fpScratchReg3, R(fpScratchReg5));
-
-	// We had 12, plus 4 frac, that gives us 16.
-	PMULLW(fpScratchReg2, R(fpScratchReg5));
-	PMULLW(fpScratchReg1, R(fpScratchReg3));
-
-	// Finally, time to sum them all up and divide by 256 to get back to 8 bits.
-	PADDUSW(fpScratchReg2, R(fpScratchReg1));
-	PSHUFD(XMM0, R(fpScratchReg2), _MM_SHUFFLE(3, 2, 3, 2));
-	PADDUSW(XMM0, R(fpScratchReg2));
-	PSRLW(XMM0, 8);
-
-	// Last of all, convert to 32-bit channels.
-	if (cpu_info.bSSE4_1) {
-		PMOVZXWD(XMM0, R(XMM0));
-	} else {
-		PXOR(fpScratchReg1, R(fpScratchReg1));
-		PUNPCKLWD(XMM0, R(fpScratchReg1));
-	}
-
-	// TODO: Actually use this (and color) at some point.
+	// TODO: Actually use these at some point.
+	regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
 	if (regCache_.Has(RegCache::GEN_ARG_LEVELFRAC))
 		regCache_.ForceRelease(RegCache::GEN_ARG_LEVELFRAC);
 
@@ -521,6 +493,112 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 	return (LinearFunc)start;
 }
 
+RegCache::Reg SamplerJitCache::GetZeroVec() {
+	if (!regCache_.Has(RegCache::VEC_ZERO)) {
+		X64Reg r = regCache_.Alloc(RegCache::VEC_ZERO);
+		PXOR(r, R(r));
+		return r;
+	}
+	return regCache_.Find(RegCache::VEC_ZERO);
+}
+
+bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, Rasterizer::RegCache::Reg destReg, bool level1) {
+	// First put the top RRRRRRRR LLLLLLLL into topReg, bottom into bottomReg.
+	// Start with XXXX XXXX RRRR LLLL, and then expand 8 bits to 16 bits.
+	X64Reg topReg = regCache_.Alloc(RegCache::VEC_TEMP0);
+	X64Reg bottomReg = regCache_.Alloc(RegCache::VEC_TEMP1);
+
+	X64Reg quadReg = regCache_.Find(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+	if (!cpu_info.bSSE4_1) {
+		X64Reg zeroReg = GetZeroVec();
+		PSHUFD(topReg, R(quadReg), _MM_SHUFFLE(0, 0, 1, 0));
+		PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
+		PUNPCKLBW(topReg, R(zeroReg));
+		PUNPCKLBW(bottomReg, R(zeroReg));
+		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
+	} else {
+		PSHUFD(bottomReg, R(quadReg), _MM_SHUFFLE(0, 0, 3, 2));
+		PMOVZXBW(topReg, R(quadReg));
+		PMOVZXBW(bottomReg, R(bottomReg));
+	}
+	regCache_.Unlock(quadReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+	regCache_.ForceRelease(level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+
+	// Grab frac_u and spread to lower (L) lanes.
+	X64Reg fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
+	X64Reg fracMulReg = regCache_.Alloc(RegCache::VEC_TEMP3);
+	if (level1) {
+		MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_));
+	} else {
+		X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
+		MOVD_xmm(fracReg, R(fracUReg));
+		regCache_.Unlock(fracUReg, RegCache::GEN_ARG_FRAC_U);
+		regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_U);
+	}
+	PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
+	// Now subtract 0x10 - frac_u in the L lanes only: 00000000 LLLLLLLL.
+	MOVDQA(fracMulReg, M(const10Low_));
+	PSUBW(fracMulReg, R(fracReg));
+	// Then we just shift and OR in the original frac_u.
+	PSLLDQ(fracReg, 8);
+	POR(fracMulReg, R(fracReg));
+	regCache_.Release(fracReg, RegCache::VEC_TEMP2);
+
+	// Okay, we have 8-bits in the top and bottom rows for the color.
+	// Multiply by frac to get 12, which we keep for the next stage.
+	PMULLW(topReg, R(fracMulReg));
+	PMULLW(bottomReg, R(fracMulReg));
+	regCache_.Release(fracMulReg, RegCache::VEC_TEMP3);
+
+	// Time for frac_v.  This time, we want it in all 8 lanes.
+	fracReg = regCache_.Alloc(RegCache::VEC_TEMP2);
+	X64Reg fracTopReg = regCache_.Alloc(RegCache::VEC_TEMP3);
+	if (level1) {
+		MOVD_xmm(fracReg, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_ + 4));
+	} else {
+		X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
+		MOVD_xmm(fracReg, R(fracVReg));
+		regCache_.Unlock(fracVReg, RegCache::GEN_ARG_FRAC_V);
+		regCache_.ForceRelease(RegCache::GEN_ARG_FRAC_V);
+	}
+	PSHUFLW(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
+	PSHUFD(fracReg, R(fracReg), _MM_SHUFFLE(0, 0, 0, 0));
+
+	// Now, inverse fracReg into fracTopReg for the top row.
+	MOVDQA(fracTopReg, M(const10All_));
+	PSUBW(fracTopReg, R(fracReg));
+
+	// We had 12, plus 4 frac, that gives us 16.
+	PMULLW(bottomReg, R(fracReg));
+	PMULLW(topReg, R(fracTopReg));
+	regCache_.Release(fracReg, RegCache::VEC_TEMP2);
+	regCache_.Release(fracTopReg, RegCache::VEC_TEMP3);
+
+	// Finally, time to sum them all up and divide by 256 to get back to 8 bits.
+	PADDUSW(bottomReg, R(topReg));
+	regCache_.Release(topReg, RegCache::VEC_TEMP0);
+	bool success = regCache_.ChangeReg(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+	if (!success) {
+		_assert_msg_(destReg == bottomReg, "Unexpected other reg locked as destReg");
+		X64Reg otherReg = regCache_.Alloc(RegCache::VEC_TEMP0);
+		PSHUFD(otherReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
+		PADDUSW(bottomReg, R(otherReg));
+		regCache_.Release(otherReg, RegCache::VEC_TEMP0);
+		regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
+
+		// Okay, now it can be changed.
+		regCache_.ChangeReg(destReg, level1 ? RegCache::VEC_RESULT1 : RegCache::VEC_RESULT);
+	} else {
+		PSHUFD(destReg, R(bottomReg), _MM_SHUFFLE(3, 2, 3, 2));
+		PADDUSW(destReg, R(bottomReg));
+		regCache_.Release(bottomReg, RegCache::VEC_TEMP1);
+	}
+
+	PSRLW(destReg, 8);
+
+	return true;
+}
+
 bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
 	GETextureFormat fmt = id.TexFmt();
 	bool success = true;