samplerjit: Move texture function into jit.

Could do this also for nearest, might end up with a third set of functions
there for a direct sample lookup (for debug funcs.)
This commit is contained in:
Unknown W. Brackets 2021-12-28 17:52:17 -08:00
parent 940e6bb1d7
commit 74eb450e76
4 changed files with 209 additions and 17 deletions

View file

@ -268,8 +268,8 @@ Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, V
case GE_TEXFUNC_DECAL:
{
if (rgba) {
int t = (rgba) ? texcolor.a() : 255;
int invt = (rgba) ? 255 - t : 0;
int t = texcolor.a();
int invt = 255 - t;
// Both colors are boosted here, making the alpha have more weight.
Vec3<int> one = Vec3<int>::AssignToAll(1);
out_rgb = ((prim_color.rgb() + one) * invt + (texcolor.rgb() + one) * t);
@ -537,11 +537,11 @@ static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(Sampler::Funcs sampler,
Vec4<int> texcolor1 = Vec4<int>(sampler.nearest(u[1], v[1], tptr0[1], bufw0[1], texlevel + 1));
texcolor0 = (texcolor1 * frac_texlevel + texcolor0 * (16 - frac_texlevel)) / 16;
}
} else {
texcolor0 = Vec4<int>(sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0));
return GetTextureFunctionOutput(prim_color, ToVec4IntArg(texcolor0));
}
return GetTextureFunctionOutput(prim_color, ToVec4IntArg(texcolor0));
return sampler.linear(s, t, x, y, prim_color, tptr0, bufw0, mayHaveMipLevels ? texlevel : 0, mayHaveMipLevels ? frac_texlevel : 0);
}
template <bool mayHaveMipLevels>

View file

@ -24,6 +24,7 @@
#include "Core/Reporting.h"
#include "GPU/Common/TextureDecoder.h"
#include "GPU/GPUState.h"
#include "GPU/Software/Rasterizer.h"
#include "GPU/Software/RasterizerRegCache.h"
#include "GPU/Software/Sampler.h"
@ -578,7 +579,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y,
const Vec4<int> c1 = SampleLinearLevel(s, t, x, y, tptr + 1, bufw + 1, texlevel + 1);
c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) / 16;
}
return ToVec4IntResult(c0);
return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0));
}
};

View file

@ -74,6 +74,7 @@ private:
LinearFunc CompileLinear(const SamplerID &id);
Rasterizer::RegCache::Reg GetZeroVec();
Rasterizer::RegCache::Reg GetGState();
bool Jit_ReadTextureFormat(const SamplerID &id);
bool Jit_GetTexData(const SamplerID &id, int bitsPerTexel);
@ -93,6 +94,8 @@ private:
bool Jit_PrepareDataSwizzledOffsets(const SamplerID &id, Rasterizer::RegCache::Reg uReg, Rasterizer::RegCache::Reg vReg, int bitsPerTexel);
bool Jit_BlendQuad(const SamplerID &id, bool level1);
bool Jit_ApplyTextureFunc(const SamplerID &id);
#if PPSSPP_ARCH(ARM64)
Arm64Gen::ARM64FloatEmitter fp;
#elif PPSSPP_ARCH(AMD64) || PPSSPP_ARCH(X86)
@ -106,7 +109,8 @@ private:
const u8 *constHeightMinus1i_ = nullptr;
const u8 *constUNext_ = nullptr;
const u8 *constVNext_ = nullptr;
const u8 *constOnes_ = nullptr;
const u8 *constOnes32_ = nullptr;
const u8 *constOnes16_ = nullptr;
const u8 *const10Low_ = nullptr;
const u8 *const10All_ = nullptr;

View file

@ -188,18 +188,20 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
constHeightMinus1i_ = AlignCode16();
Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1);
Write32((1 << id.height0Shift) - 1); Write32((1 << id.height0Shift) - 1);
constOnes_ = nullptr;
} else {
constWidth256f_ = nullptr;
constHeight256f_ = nullptr;
constWidthMinus1i_ = nullptr;
constHeightMinus1i_ = nullptr;
constOnes_ = AlignCode16();
Write32(1); Write32(1); Write32(1); Write32(1);
}
constOnes32_ = AlignCode16();
Write32(1); Write32(1); Write32(1); Write32(1);
constOnes16_ = AlignCode16();
Write16(1); Write16(1); Write16(1); Write16(1);
Write16(1); Write16(1); Write16(1); Write16(1);
constUNext_ = AlignCode16();
Write32(0); Write32(1); Write32(0); Write32(1);
@ -490,6 +492,9 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
SetJumpTarget(skip);
}
// Finally, it's time to apply the texture function.
success = success && Jit_ApplyTextureFunc(id);
// Last of all, convert to 32-bit channels.
if (cpu_info.bSSE4_1) {
PMOVZXWD(XMM0, R(XMM0));
@ -499,9 +504,6 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
// TODO: Actually use these at some point.
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_RESULT);
if (!success) {
@ -544,6 +546,15 @@ RegCache::Reg SamplerJitCache::GetZeroVec() {
return regCache_.Find(RegCache::VEC_ZERO);
}
RegCache::Reg SamplerJitCache::GetGState() {
if (!regCache_.Has(RegCache::GEN_GSTATE)) {
X64Reg r = regCache_.Alloc(RegCache::GEN_GSTATE);
MOV(PTRBITS, R(r), ImmPtr(&gstate.nop));
return r;
}
return regCache_.Find(RegCache::GEN_GSTATE);
}
bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
// First put the top RRRRRRRR LLLLLLLL into topReg, bottom into bottomReg.
// Start with XXXX XXXX RRRR LLLL, and then expand 8 bits to 16 bits.
@ -653,6 +664,182 @@ bool SamplerJitCache::Jit_BlendQuad(const SamplerID &id, bool level1) {
return true;
}
bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
X64Reg resultReg = regCache_.Find(RegCache::VEC_RESULT);
X64Reg primColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP0);
auto useAlphaFrom = [&](X64Reg alphaColorReg) {
PSRLDQ(alphaColorReg, 6);
PSLLDQ(alphaColorReg, 6);
// Zero out the result alpha and OR them together.
PSLLDQ(resultReg, 10);
PSRLDQ(resultReg, 10);
POR(resultReg, R(alphaColorReg));
};
// Note: color is in DWORDs, but result is in WORDs.
switch (id.TexFunc()) {
case GE_TEXFUNC_MODULATE:
PACKSSDW(primColorReg, R(primColorReg));
MOVDQA(tempReg, M(constOnes16_));
PADDW(tempReg, R(primColorReg));
// Okay, time to multiply. This produces 16 bits, neatly.
PMULLW(resultReg, R(tempReg));
if (id.useColorDoubling)
PSRLW(resultReg, 7);
else
PSRLW(resultReg, 8);
if (!id.useTextureAlpha) {
useAlphaFrom(primColorReg);
} else if (id.useColorDoubling) {
// We still need to finish dividing alpha, it's currently doubled (frmo the 7 above.)
MOVDQA(primColorReg, R(resultReg));
PSRLW(primColorReg, 1);
useAlphaFrom(primColorReg);
}
break;
case GE_TEXFUNC_DECAL:
PACKSSDW(primColorReg, R(primColorReg));
if (id.useTextureAlpha) {
// Get alpha into the tempReg.
PSHUFLW(tempReg, R(resultReg), _MM_SHUFFLE(3, 3, 3, 3));
PADDW(resultReg, M(constOnes16_));
PMULLW(resultReg, R(tempReg));
X64Reg invAlphaReg = regCache_.Alloc(RegCache::VEC_TEMP1);
// Materialize some 255s, and subtract out alpha.
PCMPEQD(invAlphaReg, R(invAlphaReg));
PSRLW(invAlphaReg, 8);
PSUBW(invAlphaReg, R(tempReg));
MOVDQA(tempReg, R(primColorReg));
PADDW(tempReg, M(constOnes16_));
PMULLW(tempReg, R(invAlphaReg));
regCache_.Release(invAlphaReg, RegCache::VEC_TEMP1);
// Now sum, and divide.
PADDW(resultReg, R(tempReg));
if (id.useColorDoubling)
PSRLW(resultReg, 7);
else
PSRLW(resultReg, 8);
}
useAlphaFrom(primColorReg);
break;
case GE_TEXFUNC_BLEND:
{
PACKSSDW(primColorReg, R(primColorReg));
// Start out with the prim color side. Materialize a 255 to inverse resultReg and round.
PCMPEQD(tempReg, R(tempReg));
PSRLW(tempReg, 8);
// We're going to lose tempReg, so save the 255s.
X64Reg roundValueReg = regCache_.Alloc(RegCache::VEC_TEMP1);
MOVDQA(roundValueReg, R(tempReg));
PSUBW(tempReg, R(resultReg));
PMULLW(tempReg, R(primColorReg));
// Okay, now add the rounding value.
PADDW(tempReg, R(roundValueReg));
regCache_.Release(roundValueReg, RegCache::VEC_TEMP1);
if (id.useTextureAlpha) {
// Before we modify the texture color, let's calculate alpha.
PADDW(primColorReg, M(constOnes16_));
PMULLW(primColorReg, R(resultReg));
// We divide later.
}
X64Reg gstateReg = GetGState();
X64Reg texEnvReg = regCache_.Alloc(RegCache::VEC_TEMP1);
if (cpu_info.bSSE4_1) {
PMOVZXBW(texEnvReg, MDisp(gstateReg, offsetof(GPUgstate, texenvcolor)));
} else {
MOVD_xmm(texEnvReg, MDisp(gstateReg, offsetof(GPUgstate, texenvcolor)));
X64Reg zeroReg = GetZeroVec();
PUNPCKLBW(texEnvReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
PMULLW(resultReg, R(texEnvReg));
regCache_.Release(texEnvReg, RegCache::VEC_TEMP1);
regCache_.Unlock(gstateReg, RegCache::GEN_GSTATE);
// Add in the prim color side and divide.
PADDW(resultReg, R(tempReg));
if (id.useColorDoubling)
PSRLW(resultReg, 7);
else
PSRLW(resultReg, 8);
if (id.useTextureAlpha) {
// We put the alpha in here, just need to divide it after that multiply.
PSRLW(primColorReg, 8);
}
useAlphaFrom(primColorReg);
break;
}
case GE_TEXFUNC_REPLACE:
if (id.useColorDoubling && id.useTextureAlpha) {
// We can abuse primColorReg as a temp.
MOVDQA(primColorReg, R(resultReg));
// Shift to zero out alpha in resultReg.
PSLLDQ(resultReg, 10);
PSRLDQ(resultReg, 10);
// Now simply add them together, restoring alpha and doubling the colors.
PADDW(resultReg, R(primColorReg));
} else if (!id.useTextureAlpha) {
if (id.useColorDoubling) {
// Let's just double using shifting. Ignore alpha.
PSLLW(resultReg, 1);
}
// Now we want prim_color in W, so convert, then shift-mask away the color.
PACKSSDW(primColorReg, R(primColorReg));
useAlphaFrom(primColorReg);
}
break;
case GE_TEXFUNC_ADD:
case GE_TEXFUNC_UNKNOWN1:
case GE_TEXFUNC_UNKNOWN2:
case GE_TEXFUNC_UNKNOWN3:
PACKSSDW(primColorReg, R(primColorReg));
if (id.useTextureAlpha) {
MOVDQA(tempReg, M(constOnes16_));
// Add and multiply the alpha (and others, but we'll mask them.)
PADDW(tempReg, R(primColorReg));
PMULLW(tempReg, R(resultReg));
// Now that we've extracted alpha, sum and double as needed.
PADDW(resultReg, R(primColorReg));
if (id.useColorDoubling)
PSLLW(resultReg, 1);
// Divide by 256 to normalize alpha.
PSRLW(tempReg, 8);
useAlphaFrom(tempReg);
} else {
PADDW(resultReg, R(primColorReg));
if (id.useColorDoubling)
PSLLW(resultReg, 1);
useAlphaFrom(primColorReg);
}
break;
}
regCache_.Release(tempReg, RegCache::VEC_TEMP0);
regCache_.Unlock(resultReg, RegCache::VEC_RESULT);
regCache_.Unlock(primColorReg, RegCache::VEC_ARG_COLOR);
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
return true;
}
bool SamplerJitCache::Jit_ReadTextureFormat(const SamplerID &id) {
GETextureFormat fmt = id.TexFmt();
bool success = true;
@ -1478,7 +1665,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
if (constWidth256f_ == nullptr) {
// We have to figure out levels and the proper width, ugh.
X64Reg shiftReg = regCache_.Find(RegCache::GEN_SHIFTVAL);
X64Reg gstateReg = regCache_.Alloc(RegCache::GEN_GSTATE);
X64Reg gstateReg = GetGState();
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
X64Reg levelReg = INVALID_REG;
@ -1544,7 +1731,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
// Now just subtract one. We use this later for clamp/wrap.
MOVDQA(tempVecReg, M(constOnes_));
MOVDQA(tempVecReg, M(constOnes32_));
PSUBD(width0VecReg, R(tempVecReg));
PSUBD(height0VecReg, R(tempVecReg));
PSUBD(width1VecReg, R(tempVecReg));