softgpu: Cache texture bufws as 16-bit values.

This reduces the size of the rasterizer state a bit.
This commit is contained in:
Unknown W. Brackets 2022-09-12 21:57:00 -07:00
parent b2e6a086dc
commit 167213c746
9 changed files with 34 additions and 33 deletions

View file

@ -242,7 +242,7 @@ bool BinManager::HasTextureWrite(const RasterizerState &state) {
if (!state.enableTextures)
return false;
const int textureBits = textureBitsPerPixel[state.samplerID.texfmt];
const uint8_t textureBits = textureBitsPerPixel[state.samplerID.texfmt];
for (int i = 0; i <= state.maxTexLevel; ++i) {
int byteStride = (state.texbufw[i] * textureBits) / 8;
int byteWidth = (state.samplerID.cached.sizes[i].w * textureBits) / 8;

View file

@ -116,7 +116,7 @@ void ComputeRasterizerState(RasterizerState *state, bool throughMode) {
for (uint8_t i = 0; i <= state->maxTexLevel; i++) {
u32 texaddr = gstate.getTextureAddress(i);
state->texaddr[i] = texaddr;
state->texbufw[i] = GetTextureBufw(i, texaddr, texfmt);
state->texbufw[i] = (uint16_t)GetTextureBufw(i, texaddr, texfmt);
if (Memory::IsValidAddress(texaddr))
state->texptr[i] = Memory::GetPointerUnchecked(texaddr);
else
@ -413,7 +413,7 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
static inline Vec4IntResult SOFTRAST_CALL ApplyTexturing(float s, float t, int x, int y, Vec4IntArg prim_color, int texlevel, int frac_texlevel, bool bilinear, const RasterizerState &state) {
const u8 **tptr0 = const_cast<const u8 **>(&state.texptr[texlevel]);
const int *bufw0 = &state.texbufw[texlevel];
const uint16_t *bufw0 = &state.texbufw[texlevel];
if (!bilinear) {
return state.nearest(s, t, x, y, prim_color, tptr0, bufw0, texlevel, frac_texlevel, state.samplerID);
@ -1468,7 +1468,7 @@ bool GetCurrentTexture(GPUDebugBuffer &buffer, int level)
GETextureFormat texfmt = gstate.getTextureFormat();
u32 texaddr = gstate.getTextureAddress(level);
int texbufw = GetTextureBufw(level, texaddr, texfmt);
u32 texbufw = GetTextureBufw(level, texaddr, texfmt);
int w = gstate.getTextureWidth(level);
int h = gstate.getTextureHeight(level);

View file

@ -39,7 +39,7 @@ struct RasterizerState {
Sampler::LinearFunc linear;
Sampler::NearestFunc nearest;
uint32_t texaddr[8]{};
int texbufw[8]{};
uint16_t texbufw[8]{};
const u8 *texptr[8]{};
float textureLodSlope;
int screenOffsetX;

View file

@ -103,7 +103,7 @@ void DrawSprite(const VertexData &v0, const VertexData &v1, const BinCoords &ran
const u8 *texptr = state.texptr[0];
GETextureFormat texfmt = state.samplerID.TexFmt();
int texbufw = state.texbufw[0];
uint16_t texbufw = state.texbufw[0];
Sampler::FetchFunc fetchFunc = Sampler::GetFetchFunc(state.samplerID);
auto &pixelID = state.pixelID;

View file

@ -38,8 +38,8 @@ using namespace Rasterizer;
namespace Sampler {
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID);
std::mutex jitCacheLock;
@ -281,7 +281,7 @@ struct Nearest4 {
};
template <int N>
inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N], const u8 *srcptr, int texbufw, int level, const SamplerID &samplerID) {
inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N], const u8 *srcptr, uint16_t texbufw, int level, const SamplerID &samplerID) {
Nearest4 res;
if (!srcptr) {
memset(res.v, 0, sizeof(res.v));
@ -535,7 +535,7 @@ Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, V
return ToVec4IntResult(Vec4<int>(out_rgb, out_a));
}
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID) {
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID) {
int u, v;
// Nearest filtering only. Round texcoords.
@ -631,7 +631,7 @@ static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadT(int level, fl
return ApplyTexelClampQuadT(samplerID.clampT, base_v, height);
}
static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, int x, int y, const u8 *const *tptr, const int *bufw, int texlevel, const SamplerID &samplerID) {
static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, int x, int y, const u8 *const *tptr, const uint16_t *bufw, int texlevel, const SamplerID &samplerID) {
int frac_u, frac_v;
const Vec4<int> u = GetTexelCoordinatesQuadS(texlevel, s, frac_u, x, samplerID);
const Vec4<int> v = GetTexelCoordinatesQuadT(texlevel, t, frac_v, y, samplerID);
@ -646,7 +646,7 @@ static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, int x, in
return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) / (16 * 16));
}
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, int x, int y, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {
Vec4<int> c0 = SampleLinearLevel(s, t, x, y, tptr, bufw, texlevel, samplerID);
if (levelFrac) {
const Vec4<int> c1 = SampleLinearLevel(s, t, x, y, tptr + 1, bufw + 1, texlevel + 1, samplerID);

View file

@ -36,10 +36,10 @@ namespace Sampler {
typedef Rasterizer::Vec4IntResult(SOFTRAST_CALL *FetchFunc)(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID);
FetchFunc GetFetchFunc(SamplerID id);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *NearestFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
NearestFunc GetNearestFunc(SamplerID id);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 *const *tptr, const int *bufw, int level, int levelFrac, const SamplerID &samplerID);
typedef Rasterizer::Vec4IntResult (SOFTRAST_CALL *LinearFunc)(float s, float t, int x, int y, Rasterizer::Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
LinearFunc GetLinearFunc(SamplerID id);
void Init();

View file

@ -246,7 +246,7 @@ NearestFunc SamplerJitCache::CompileNearest(const SamplerID &id) {
auto loadPtrs = [&](bool level1) {
X64Reg bufwReg = regCache_.Alloc(RegCache::GEN_ARG_BUFW);
X64Reg bufwPtrReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
MOV(32, R(bufwReg), MDisp(bufwPtrReg, level1 ? 4 : 0));
MOVZX(32, 16, bufwReg, MDisp(bufwPtrReg, level1 ? 2 : 0));
regCache_.Unlock(bufwPtrReg, RegCache::GEN_ARG_BUFW_PTR);
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW);
regCache_.ForceRetain(RegCache::GEN_ARG_BUFW);
@ -713,7 +713,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
X64Reg srcReg = regCache_.Find(RegCache::GEN_ARG_TEXPTR_PTR);
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
ADD(64, R(srcArgReg), MDisp(srcReg, level1 ? 8 : 0));
MOV(32, R(bufwArgReg), MDisp(bufwReg, level1 ? 4 : 0));
MOVZX(32, 16, bufwArgReg, MDisp(bufwReg, level1 ? 2 : 0));
// Leave level/levelFrac, we just always load from RAM on Windows and lock on POSIX.
regCache_.Unlock(srcReg, RegCache::GEN_ARG_TEXPTR_PTR);
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
@ -2995,12 +2995,13 @@ bool SamplerJitCache::Jit_PrepareDataDirectOffsets(const SamplerID &id, RegCache
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bAVX2) {
VPBROADCASTD(128, bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
if (cpu_info.bSSE4_1) {
PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
} else {
MOVD_xmm(bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
PXOR(bufwVecReg, R(bufwVecReg));
PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
}
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
if (bitsPerTexel == 4)
@ -3070,12 +3071,13 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCac
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bAVX2) {
VPBROADCASTD(128, bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
if (cpu_info.bSSE4_1) {
PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
} else {
MOVD_xmm(bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
PXOR(bufwVecReg, R(bufwVecReg));
PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
}
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
}
@ -3162,12 +3164,13 @@ bool SamplerJitCache::Jit_PrepareDataDXTOffsets(const SamplerID &id, Rasterizer:
if (!id.useStandardBufw || id.hasAnyMips) {
// Spread bufw into each lane.
X64Reg bufwReg = regCache_.Find(RegCache::GEN_ARG_BUFW_PTR);
if (cpu_info.bAVX2) {
VPBROADCASTD(128, bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
if (cpu_info.bSSE4_1) {
PMOVZXWD(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0));
} else {
MOVD_xmm(bufwVecReg, MDisp(bufwReg, level1 ? 4 : 0));
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
PXOR(bufwVecReg, R(bufwVecReg));
PINSRW(bufwVecReg, MDisp(bufwReg, level1 ? 2 : 0), 0);
}
PSHUFD(bufwVecReg, R(bufwVecReg), _MM_SHUFFLE(0, 0, 0, 0));
regCache_.Unlock(bufwReg, RegCache::GEN_ARG_BUFW_PTR);
// Divide by 4 before the multiply.

View file

@ -374,8 +374,7 @@ enum GEMatrixType {
GE_MTX_TEXGEN,
};
enum GEComparison
{
enum GEComparison : uint8_t {
GE_COMP_NEVER = 0,
GE_COMP_ALWAYS = 1,
GE_COMP_EQUAL = 2,
@ -578,8 +577,7 @@ enum GEPrimitiveType
GE_PRIM_INVALID = -1,
};
enum GELogicOp
{
enum GELogicOp : uint8_t {
GE_LOGIC_CLEAR = 0,
GE_LOGIC_AND = 1,
GE_LOGIC_AND_REVERSE = 2,

View file

@ -48,7 +48,7 @@ static bool TestSamplerJit() {
bool header = false;
u8 **tptr = new u8 *[8];
int *bufw = new int[8];
uint16_t *bufw = new uint16_t[8];
u8 *clut = new u8[1024];
memset(clut, 0, 1024);