From 74eb450e76027a324f0ef4dbe74b523033660e51 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 28 Dec 2021 17:52:17 -0800 Subject: [PATCH] samplerjit: Move texture function into jit. Could do this also for nearest, might end up with a third set of functions there for a direct sample lookup (for debug funcs.) --- GPU/Software/Rasterizer.cpp | 10 +- GPU/Software/Sampler.cpp | 3 +- GPU/Software/Sampler.h | 6 +- GPU/Software/SamplerX86.cpp | 207 ++++++++++++++++++++++++++++++++++-- 4 files changed, 209 insertions(+), 17 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 70a255ffa8..156dace489 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -268,8 +268,8 @@ Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, V case GE_TEXFUNC_DECAL: { if (rgba) { - int t = (rgba) ? texcolor.a() : 255; - int invt = (rgba) ? 255 - t : 0; + int t = texcolor.a(); + int invt = 255 - t; // Both colors are boosted here, making the alpha have more weight. Vec3 one = Vec3::AssignToAll(1); out_rgb = ((prim_color.rgb() + one) * invt + (texcolor.rgb() + one) * t); @@ -537,11 +537,11 @@ static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(Sampler::Funcs sampler, Vec4 texcolor1 = Vec4(sampler.nearest(u[1], v[1], tptr0[1], bufw0[1], texlevel + 1)); texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (16 - frac_texlevel)) / 16; } - } else { - texcolor0 = Vec4(sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0)); + + return GetTextureFunctionOutput(prim_color, ToVec4IntArg(texcolor0)); } - return GetTextureFunctionOutput(prim_color, ToVec4IntArg(texcolor0)); + return sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0); } template diff --git a/GPU/Software/Sampler.cpp b/GPU/Software/Sampler.cpp index 9c73728b3b..8cea0cc557 100644 --- a/GPU/Software/Sampler.cpp +++ b/GPU/Software/Sampler.cpp @@ -24,6 +24,7 @@ #include "Core/Reporting.h" #include "GPU/Common/TextureDecoder.h" #include "GPU/GPUState.h" +#include "GPU/Software/Rasterizer.h" #include "GPU/Software/RasterizerRegCache.h" #include "GPU/Software/Sampler.h" @@ -578,7 +579,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, const Vec4 c1 = SampleLinearLevel(s, t, x, y, tptr + 1, bufw + 1, texlevel + 1); c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16; } - return ToVec4IntResult(c0); + return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0)); } }; diff --git a/GPU/Software/Sampler.h b/GPU/Software/Sampler.h index 00830b38a5..ad75d79f23 100644 --- a/GPU/Software/Sampler.h +++ b/GPU/Software/Sampler.h @@ -74,6 +74,7 @@ private: LinearFunc CompileLinear(const SamplerID &id); Rasterizer::RegCache::Reg GetZeroVec(); + Rasterizer::RegCache::Reg GetGState(); bool Jit_ReadTextureFormat(const SamplerID &id); bool Jit_GetTexData(const SamplerID &id, int bitsPerTexel); @@ -93,6 +94,8 @@ private: bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel); bool Jit_BlendQuad(const SamplerID &id, bool level1); + bool Jit_ApplyTextureFunc(const SamplerID &id); + #if PPSSPP_ARCH(ARM64) Arm64Gen::ARM64FloatEmitter fp; #elif PPSSPP_ARCH(AMD64) || PPSSPP_ARCH(X86) @@ -106,7 +109,8 @@ private: const u8 *constHeightMinus1i_ = nullptr; const u8 *constUNext_ = nullptr; const u8 *constVNext_ = nullptr; - const u8 *constOnes_ = nullptr; + const u8 *constOnes32_ = nullptr; + const u8 *constOnes16_ = nullptr; const u8 *const10Low_ = nullptr; const u8 *const10All_ = nullptr; diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp index c88cbbf012..cf183fa5a3 100644 --- a/GPU/Software/SamplerX86.cpp +++ b/GPU/Software/SamplerX86.cpp @@ -188,18 +188,20 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { constHeightMinus1i_ = AlignCode16(); Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1); - - constOnes_ = nullptr; } else { constWidth256f_ = nullptr; constHeight256f_ = nullptr; constWidthMinus1i_ = nullptr; constHeightMinus1i_ = nullptr; - - constOnes_ = AlignCode16(); - Write32(1); Write32(1); Write32(1); Write32(1); } + constOnes32_ = AlignCode16(); + Write32(1); Write32(1); Write32(1); Write32(1); + + constOnes16_ = AlignCode16(); + Write16(1); Write16(1); Write16(1); Write16(1); + Write16(1); Write16(1); Write16(1); Write16(1); + constUNext_ = AlignCode16(); Write32(0); Write32(1); Write32(0); Write32(1); @@ -490,6 +492,9 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { SetJumpTarget(skip); } + // Finally, it's time to apply the texture function. + success = success && Jit_ApplyTextureFunc(id); + // Last of all, convert to 32-bit channels. if (cpu_info.bSSE4_1) { PMOVZXWD(XMM0, R(XMM0)); @@ -499,9 +504,6 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { regCache_.Unlock(zeroReg, RegCache::VEC_ZERO); } - // TODO: Actually use these at some point. - regCache_.ForceRelease(RegCache::VEC_ARG_COLOR); - regCache_.ForceRelease(RegCache::VEC_RESULT); if (!success) { @@ -544,6 +546,15 @@ RegCache::Reg SamplerJitCache::GetZeroVec() { return regCache_.Find(RegCache::VEC_ZERO); } +RegCache::Reg SamplerJitCache::GetGState() { + if (!regCache_.Has(RegCache::GEN_GSTATE)) { + X64Reg r = regCache_.Alloc(RegCache::GEN_GSTATE); + MOV(PTRBITS, R(r), ImmPtr(&gstate.nop)); + return r; + } + return regCache_.Find(RegCache::GEN_GSTATE); +} + bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) { // First put the top RRRRRRRR LLLLLLLL into topReg, bottom into bottomReg. // Start with XXXX XXXX RRRR LLLL, and then expand 8 bits to 16 bits. @@ -653,6 +664,182 @@ bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) { return true; } +bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) { + X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT); + X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR); + X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0); + + auto useAlphaFrom = [&](X64Reg alphaColorReg) { + PSRLDQ(alphaColorReg, 6); + PSLLDQ(alphaColorReg, 6); + // Zero out the result alpha and OR them together. + PSLLDQ(resultReg, 10); + PSRLDQ(resultReg, 10); + POR(resultReg, R(alphaColorReg)); + }; + + // Note: color is in DWORDs, but result is in WORDs. + switch (id.TexFunc()) { + case GE_TEXFUNC_MODULATE: + PACKSSDW(primColorReg, R(primColorReg)); + MOVDQA(tempReg, M(constOnes16_)); + PADDW(tempReg, R(primColorReg)); + + // Okay, time to multiply. This produces 16 bits, neatly. + PMULLW(resultReg, R(tempReg)); + if (id.useColorDoubling) + PSRLW(resultReg, 7); + else + PSRLW(resultReg, 8); + + if (!id.useTextureAlpha) { + useAlphaFrom(primColorReg); + } else if (id.useColorDoubling) { + // We still need to finish dividing alpha, it's currently doubled (frmo the 7 above.) + MOVDQA(primColorReg, R(resultReg)); + PSRLW(primColorReg, 1); + useAlphaFrom(primColorReg); + } + break; + + case GE_TEXFUNC_DECAL: + PACKSSDW(primColorReg, R(primColorReg)); + if (id.useTextureAlpha) { + // Get alpha into the tempReg. + PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3)); + PADDW(resultReg, M(constOnes16_)); + PMULLW(resultReg, R(tempReg)); + + X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1); + // Materialize some 255s, and subtract out alpha. + PCMPEQD(invAlphaReg, R(invAlphaReg)); + PSRLW(invAlphaReg, 8); + PSUBW(invAlphaReg, R(tempReg)); + + MOVDQA(tempReg, R(primColorReg)); + PADDW(tempReg, M(constOnes16_)); + PMULLW(tempReg, R(invAlphaReg)); + regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1); + + // Now sum, and divide. + PADDW(resultReg, R(tempReg)); + if (id.useColorDoubling) + PSRLW(resultReg, 7); + else + PSRLW(resultReg, 8); + } + useAlphaFrom(primColorReg); + break; + + case GE_TEXFUNC_BLEND: + { + PACKSSDW(primColorReg, R(primColorReg)); + + // Start out with the prim color side. Materialize a 255 to inverse resultReg and round. + PCMPEQD(tempReg, R(tempReg)); + PSRLW(tempReg, 8); + + // We're going to lose tempReg, so save the 255s. + X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP1); + MOVDQA(roundValueReg, R(tempReg)); + + PSUBW(tempReg, R(resultReg)); + PMULLW(tempReg, R(primColorReg)); + // Okay, now add the rounding value. + PADDW(tempReg, R(roundValueReg)); + regCache_.Release(roundValueReg, RegCache::VEC_TEMP1); + + if (id.useTextureAlpha) { + // Before we modify the texture color, let's calculate alpha. + PADDW(primColorReg, M(constOnes16_)); + PMULLW(primColorReg, R(resultReg)); + // We divide later. + } + + X64Reg gstateReg = GetGState(); + X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1); + if (cpu_info.bSSE4_1) { + PMOVZXBW(texEnvReg, MDisp(gstateReg, offsetof(GPUgstate, texenvcolor))); + } else { + MOVD_xmm(texEnvReg, MDisp(gstateReg, offsetof(GPUgstate, texenvcolor))); + X64Reg zeroReg = GetZeroVec(); + PUNPCKLBW(texEnvReg, R(zeroReg)); + regCache_.Unlock(zeroReg, RegCache::VEC_ZERO); + } + PMULLW(resultReg, R(texEnvReg)); + regCache_.Release(texEnvReg, RegCache::VEC_TEMP1); + regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE); + + // Add in the prim color side and divide. + PADDW(resultReg, R(tempReg)); + if (id.useColorDoubling) + PSRLW(resultReg, 7); + else + PSRLW(resultReg, 8); + + if (id.useTextureAlpha) { + // We put the alpha in here, just need to divide it after that multiply. + PSRLW(primColorReg, 8); + } + useAlphaFrom(primColorReg); + break; + } + + case GE_TEXFUNC_REPLACE: + if (id.useColorDoubling && id.useTextureAlpha) { + // We can abuse primColorReg as a temp. + MOVDQA(primColorReg, R(resultReg)); + // Shift to zero out alpha in resultReg. + PSLLDQ(resultReg, 10); + PSRLDQ(resultReg, 10); + // Now simply add them together, restoring alpha and doubling the colors. + PADDW(resultReg, R(primColorReg)); + } else if (!id.useTextureAlpha) { + if (id.useColorDoubling) { + // Let's just double using shifting. Ignore alpha. + PSLLW(resultReg, 1); + } + // Now we want prim_color in W, so convert, then shift-mask away the color. + PACKSSDW(primColorReg, R(primColorReg)); + useAlphaFrom(primColorReg); + } + break; + + case GE_TEXFUNC_ADD: + case GE_TEXFUNC_UNKNOWN1: + case GE_TEXFUNC_UNKNOWN2: + case GE_TEXFUNC_UNKNOWN3: + PACKSSDW(primColorReg, R(primColorReg)); + if (id.useTextureAlpha) { + MOVDQA(tempReg, M(constOnes16_)); + // Add and multiply the alpha (and others, but we'll mask them.) + PADDW(tempReg, R(primColorReg)); + PMULLW(tempReg, R(resultReg)); + + // Now that we've extracted alpha, sum and double as needed. + PADDW(resultReg, R(primColorReg)); + if (id.useColorDoubling) + PSLLW(resultReg, 1); + + // Divide by 256 to normalize alpha. + PSRLW(tempReg, 8); + useAlphaFrom(tempReg); + } else { + PADDW(resultReg, R(primColorReg)); + if (id.useColorDoubling) + PSLLW(resultReg, 1); + useAlphaFrom(primColorReg); + } + break; + } + + regCache_.Release(tempReg, RegCache::VEC_TEMP0); + regCache_.Unlock(resultReg, RegCache::VEC_RESULT); + regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR); + regCache_.ForceRelease(RegCache::VEC_ARG_COLOR); + return true; +} + bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) { GETextureFormat fmt = id.TexFmt(); bool success = true; @@ -1478,7 +1665,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) { if (constWidth256f_ == nullptr) { // We have to figure out levels and the proper width, ugh. X64Reg shiftReg = regCache_.Find(RegCache::GEN_SHIFTVAL); - X64Reg gstateReg = regCache_.Alloc(RegCache::GEN_GSTATE); + X64Reg gstateReg = GetGState(); X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0); X64Reg levelReg = INVALID_REG; @@ -1544,7 +1731,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) { regCache_.Unlock(v1Reg, RegCache::VEC_V1); // Now just subtract one. We use this later for clamp/wrap. - MOVDQA(tempVecReg, M(constOnes_)); + MOVDQA(tempVecReg, M(constOnes32_)); PSUBD(width0VecReg, R(tempVecReg)); PSUBD(height0VecReg, R(tempVecReg)); PSUBD(width1VecReg, R(tempVecReg));