From 74eb450e76027a324f0ef4dbe74b523033660e51 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Tue, 28 Dec 2021 17:52:17 -0800
Subject: [PATCH] samplerjit: Move texture function into jit.

The same could be done for nearest sampling, but that might end up needing
a third set of functions for a direct sample lookup (for debug funcs.)
---
 GPU/Software/Rasterizer.cpp |  10 +-
 GPU/Software/Sampler.cpp    |   3 +-
 GPU/Software/Sampler.h      |   6 +-
 GPU/Software/SamplerX86.cpp | 207 ++++++++++++++++++++++++++++++++++--
 4 files changed, 209 insertions(+), 17 deletions(-)
diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index 70a255ffa8..156dace489 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -268,8 +268,8 @@ Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, V
 	case GE_TEXFUNC_DECAL:
 	{
 		if (rgba) {
-			int t = (rgba) ? texcolor.a() : 255;
-			int invt = (rgba) ? 255 - t : 0;
+			int t = texcolor.a();
+			int invt = 255 - t;
 			// Both colors are boosted here, making the alpha have more weight.
 			Vec3<int> one = Vec3<int>::AssignToAll(1);
 			out_rgb = ((prim_color.rgb() + one) * invt + (texcolor.rgb() + one) * t);
@@ -537,11 +537,11 @@ static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(Sampler::Funcs sampler,
 			Vec4<int> texcolor1 = Vec4<int>(sampler.nearest(u[1], v[1], tptr0[1], bufw0[1], texlevel + 1));
 			texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (16 - frac_texlevel)) / 16;
 		}
-	} else {
-		texcolor0 = Vec4<int>(sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0));
+
+		return GetTextureFunctionOutput(prim_color, ToVec4IntArg(texcolor0));
 	}
 
-	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(texcolor0));
+	return sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0);
 }
 
 template <bool mayHaveMipLevels>
diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp
index 9c73728b3b..8cea0cc557 100644
--- a/GPU/Software/Sampler.cpp
+++ b/GPU/Software/Sampler.cpp
@@ -24,6 +24,7 @@
 #include "Core/Reporting.h"
 #include "GPU/Common/TextureDecoder.h"
 #include "GPU/GPUState.h"
+#include "GPU/Software/Rasterizer.h"
 #include "GPU/Software/RasterizerRegCache.h"
 #include "GPU/Software/Sampler.h"
 
@@ -578,7 +579,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y,
 		const Vec4<int> c1 = SampleLinearLevel(s, t, x, y, tptr + 1, bufw + 1, texlevel + 1);
 		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
 	}
-	return ToVec4IntResult(c0);
+	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0));
 }
 
 };
diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h
index 00830b38a5..ad75d79f23 100644
--- a/GPU/Software/Sampler.h
+++ b/GPU/Software/Sampler.h
@@ -74,6 +74,7 @@ private:
 	LinearFunc CompileLinear(const SamplerID &id);
 
 	Rasterizer::RegCache::Reg GetZeroVec();
+	Rasterizer::RegCache::Reg GetGState();
 
 	bool Jit_ReadTextureFormat(const SamplerID &id);
 	bool Jit_GetTexData(const SamplerID &id, int bitsPerTexel);
@@ -93,6 +94,8 @@ private:
 	bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel);
 	bool Jit_BlendQuad(const SamplerID &id, bool level1);
 
+	bool Jit_ApplyTextureFunc(const SamplerID &id);
+
 #if PPSSPP_ARCH(ARM64)
 	Arm64Gen::ARM64FloatEmitter fp;
 #elif PPSSPP_ARCH(AMD64) || PPSSPP_ARCH(X86)
@@ -106,7 +109,8 @@ private:
 	const u8 *constHeightMinus1i_ = nullptr;
 	const u8 *constUNext_ = nullptr;
 	const u8 *constVNext_ = nullptr;
-	const u8 *constOnes_ = nullptr;
+	const u8 *constOnes32_ = nullptr;
+	const u8 *constOnes16_ = nullptr;
 	const u8 *const10Low_ = nullptr;
 	const u8 *const10All_ = nullptr;
 
diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp
index c88cbbf012..cf183fa5a3 100644
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -188,18 +188,20 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		constHeightMinus1i_ = AlignCode16();
 		Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1);
 		Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1);
-
-		constOnes_ = nullptr;
 	} else {
 		constWidth256f_ = nullptr;
 		constHeight256f_ = nullptr;
 		constWidthMinus1i_ = nullptr;
 		constHeightMinus1i_ = nullptr;
-
-		constOnes_ = AlignCode16();
-		Write32(1); Write32(1); Write32(1); Write32(1);
 	}
 
+	constOnes32_ = AlignCode16();
+	Write32(1); Write32(1); Write32(1); Write32(1);
+
+	constOnes16_ = AlignCode16();
+	Write16(1); Write16(1); Write16(1); Write16(1);
+	Write16(1); Write16(1); Write16(1); Write16(1);
+
 	constUNext_ = AlignCode16();
 	Write32(0); Write32(1); Write32(0); Write32(1);
 
@@ -490,6 +492,9 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		SetJumpTarget(skip);
 	}
 
+	// Finally, it's time to apply the texture function.
+	success = success && Jit_ApplyTextureFunc(id);
+
 	// Last of all, convert to 32-bit channels.
 	if (cpu_info.bSSE4_1) {
 		PMOVZXWD(XMM0, R(XMM0));
@@ -499,9 +504,6 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
 		regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
 	}
 
-	// TODO: Actually use these at some point.
-	regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
-
 	regCache_.ForceRelease(RegCache::VEC_RESULT);
 
 	if (!success) {
@@ -544,6 +546,15 @@ RegCache::Reg SamplerJitCache::GetZeroVec() {
 	return regCache_.Find(RegCache::VEC_ZERO);
 }
 
+RegCache::Reg SamplerJitCache::GetGState() {
+	if (!regCache_.Has(RegCache::GEN_GSTATE)) {
+		X64Reg r = regCache_.Alloc(RegCache::GEN_GSTATE);
+		MOV(PTRBITS, R(r), ImmPtr(&gstate.nop));
+		return r;
+	}
+	return regCache_.Find(RegCache::GEN_GSTATE);
+}
+
 bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
 	// First put the top RRRRRRRR LLLLLLLL into topReg, bottom into bottomReg.
 	// Start with XXXX XXXX RRRR LLLL, and then expand 8 bits to 16 bits.
@@ -653,6 +664,182 @@ bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
 	return true;
 }
 
+bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
+	X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT);
+	X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
+	X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
+
+	auto useAlphaFrom = [&](X64Reg alphaColorReg) {
+		PSRLDQ(alphaColorReg, 6);
+		PSLLDQ(alphaColorReg, 6);
+		// Zero out the result alpha and OR them together.
+		PSLLDQ(resultReg, 10);
+		PSRLDQ(resultReg, 10);
+		POR(resultReg, R(alphaColorReg));
+	};
+
+	// Note: color is in DWORDs, but result is in WORDs.
+	switch (id.TexFunc()) {
+	case GE_TEXFUNC_MODULATE:
+		PACKSSDW(primColorReg, R(primColorReg));
+		MOVDQA(tempReg, M(constOnes16_));
+		PADDW(tempReg, R(primColorReg));
+
+		// Okay, time to multiply.  This produces 16 bits, neatly.
+		PMULLW(resultReg, R(tempReg));
+		if (id.useColorDoubling)
+			PSRLW(resultReg, 7);
+		else
+			PSRLW(resultReg, 8);
+
+		if (!id.useTextureAlpha) {
+			useAlphaFrom(primColorReg);
+		} else if (id.useColorDoubling) {
+			// We still need to finish dividing alpha, it's currently doubled (from the 7 above.)
+			MOVDQA(primColorReg, R(resultReg));
+			PSRLW(primColorReg, 1);
+			useAlphaFrom(primColorReg);
+		}
+		break;
+
+	case GE_TEXFUNC_DECAL:
+		PACKSSDW(primColorReg, R(primColorReg));
+		if (id.useTextureAlpha) {
+			// Get alpha into the tempReg.
+			PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3));
+			PADDW(resultReg, M(constOnes16_));
+			PMULLW(resultReg, R(tempReg));
+
+			X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1);
+			// Materialize some 255s, and subtract out alpha.
+			PCMPEQD(invAlphaReg, R(invAlphaReg));
+			PSRLW(invAlphaReg, 8);
+			PSUBW(invAlphaReg, R(tempReg));
+
+			MOVDQA(tempReg, R(primColorReg));
+			PADDW(tempReg, M(constOnes16_));
+			PMULLW(tempReg, R(invAlphaReg));
+			regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1);
+
+			// Now sum, and divide.
+			PADDW(resultReg, R(tempReg));
+			if (id.useColorDoubling)
+				PSRLW(resultReg, 7);
+			else
+				PSRLW(resultReg, 8);
+		}
+		useAlphaFrom(primColorReg);
+		break;
+
+	case GE_TEXFUNC_BLEND:
+	{
+		PACKSSDW(primColorReg, R(primColorReg));
+
+		// Start out with the prim color side.  Materialize a 255 to invert resultReg and round.
+		PCMPEQD(tempReg, R(tempReg));
+		PSRLW(tempReg, 8);
+
+		// We're going to lose tempReg, so save the 255s.
+		X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP1);
+		MOVDQA(roundValueReg, R(tempReg));
+
+		PSUBW(tempReg, R(resultReg));
+		PMULLW(tempReg, R(primColorReg));
+		// Okay, now add the rounding value.
+		PADDW(tempReg, R(roundValueReg));
+		regCache_.Release(roundValueReg, RegCache::VEC_TEMP1);
+
+		if (id.useTextureAlpha) {
+			// Before we modify the texture color, let's calculate alpha.
+			PADDW(primColorReg, M(constOnes16_));
+			PMULLW(primColorReg, R(resultReg));
+			// We divide later.
+		}
+
+		X64Reg gstateReg = GetGState();
+		X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1);
+		if (cpu_info.bSSE4_1) {
+			PMOVZXBW(texEnvReg, MDisp(gstateReg, offsetof(GPUgstate, texenvcolor)));
+		} else {
+			MOVD_xmm(texEnvReg, MDisp(gstateReg, offsetof(GPUgstate, texenvcolor)));
+			X64Reg zeroReg = GetZeroVec();
+			PUNPCKLBW(texEnvReg, R(zeroReg));
+			regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
+		}
+		PMULLW(resultReg, R(texEnvReg));
+		regCache_.Release(texEnvReg, RegCache::VEC_TEMP1);
+		regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
+
+		// Add in the prim color side and divide.
+		PADDW(resultReg, R(tempReg));
+		if (id.useColorDoubling)
+			PSRLW(resultReg, 7);
+		else
+			PSRLW(resultReg, 8);
+
+		if (id.useTextureAlpha) {
+			// We put the alpha in here, just need to divide it after that multiply.
+			PSRLW(primColorReg, 8);
+		}
+		useAlphaFrom(primColorReg);
+		break;
+	}
+
+	case GE_TEXFUNC_REPLACE:
+		if (id.useColorDoubling && id.useTextureAlpha) {
+			// We can abuse primColorReg as a temp.
+			MOVDQA(primColorReg, R(resultReg));
+			// Shift to zero out alpha in resultReg.
+			PSLLDQ(resultReg, 10);
+			PSRLDQ(resultReg, 10);
+			// Now simply add them together, restoring alpha and doubling the colors.
+			PADDW(resultReg, R(primColorReg));
+		} else if (!id.useTextureAlpha) {
+			if (id.useColorDoubling) {
+				// Let's just double using shifting.  Ignore alpha.
+				PSLLW(resultReg, 1);
+			}
+			// Now we want prim_color in WORDs, so convert, then shift-mask away the color.
+			PACKSSDW(primColorReg, R(primColorReg));
+			useAlphaFrom(primColorReg);
+		}
+		break;
+
+	case GE_TEXFUNC_ADD:
+	case GE_TEXFUNC_UNKNOWN1:
+	case GE_TEXFUNC_UNKNOWN2:
+	case GE_TEXFUNC_UNKNOWN3:
+		PACKSSDW(primColorReg, R(primColorReg));
+		if (id.useTextureAlpha) {
+			MOVDQA(tempReg, M(constOnes16_));
+			// Add and multiply the alpha (and others, but we'll mask them.)
+			PADDW(tempReg, R(primColorReg));
+			PMULLW(tempReg, R(resultReg));
+
+			// Now that we've extracted alpha, sum and double as needed.
+			PADDW(resultReg, R(primColorReg));
+			if (id.useColorDoubling)
+				PSLLW(resultReg, 1);
+
+			// Divide by 256 to normalize alpha.
+			PSRLW(tempReg, 8);
+			useAlphaFrom(tempReg);
+		} else {
+			PADDW(resultReg, R(primColorReg));
+			if (id.useColorDoubling)
+				PSLLW(resultReg, 1);
+			useAlphaFrom(primColorReg);
+		}
+		break;
+	}
+
+	regCache_.Release(tempReg, RegCache::VEC_TEMP0);
+	regCache_.Unlock(resultReg, RegCache::VEC_RESULT);
+	regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
+	regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
+	return true;
+}
+
 bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
 	GETextureFormat fmt = id.TexFmt();
 	bool success = true;
@@ -1478,7 +1665,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
 	if (constWidth256f_ == nullptr) {
 		// We have to figure out levels and the proper width, ugh.
 		X64Reg shiftReg = regCache_.Find(RegCache::GEN_SHIFTVAL);
-		X64Reg gstateReg = regCache_.Alloc(RegCache::GEN_GSTATE);
+		X64Reg gstateReg = GetGState();
 		X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
 
 		X64Reg levelReg = INVALID_REG;
@@ -1544,7 +1731,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
 		regCache_.Unlock(v1Reg, RegCache::VEC_V1);
 
 		// Now just subtract one.  We use this later for clamp/wrap.
-		MOVDQA(tempVecReg, M(constOnes_));
+		MOVDQA(tempVecReg, M(constOnes32_));
 		PSUBD(width0VecReg, R(tempVecReg));
 		PSUBD(height0VecReg, R(tempVecReg));
 		PSUBD(width1VecReg, R(tempVecReg));