softjit: Precalculate write mask and dither.

This slightly abuses PixelFuncID, but the intent is to provide some
memory that's easily accessible from the jit func while still being
associated with that calculation (i.e. not global).
This commit is contained in:
Unknown W. Brackets 2021-11-26 10:12:54 -08:00
parent 4e6a5ce760
commit 1f9dc3a568
3 changed files with 80 additions and 0 deletions

View file

@ -38,6 +38,9 @@ static const X64Reg argZReg = R8;
static const X64Reg argFogReg = R9;
static const X64Reg argColorReg = XMM4;
// Windows reserves space to save args, 1 xmm + 4 ints before the id.
static const OpArg mArgID = MDisp(RSP, 1 * 16 + 4 * PTRBITS / 8);
// Must save: RBX, RSP, RBP, RDI, RSI, R12-R15, XMM6-15
#else
static const X64Reg argXReg = RDI;
@ -46,6 +49,9 @@ static const X64Reg argZReg = RDX;
static const X64Reg argFogReg = RCX;
static const X64Reg argColorReg = XMM0;
// Here we just have the return and padding to align RBP.
static const OpArg mArgID = MDisp(RSP, 16);
// Must save: RBX, RSP, RBP, R12-R15
#endif
@ -1293,14 +1299,18 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
if (!id.dithering)
return true;
#ifndef SOFTPIXEL_USE_CACHE
X64Reg gstateReg = GetGState();
#endif
X64Reg valueReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_GEN);
// Load the row dither matrix entry (will still need to get the X.)
MOV(32, R(valueReg), R(argYReg));
AND(32, R(valueReg), Imm8(3));
#ifndef SOFTPIXEL_USE_CACHE
MOVZX(32, 16, valueReg, MComplex(gstateReg, valueReg, 4, offsetof(GPUgstate, dithmtx)));
regCache_.Unlock(gstateReg, PixelRegCache::T_GEN);
#endif
// At this point, we're done with depth and y, so let's grab COLOR_OFF and lock it.
// Then we can modify x and throw it away too, which is our actual goal.
@ -1309,6 +1319,8 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
regCache_.Release(argYReg, PixelRegCache::T_GEN);
AND(32, R(argXReg), Imm32(3));
#ifndef SOFTPIXEL_USE_CACHE
SHL(32, R(argXReg), Imm8(2));
// Conveniently, this is ECX on Windows, but otherwise we need to swap it.
@ -1337,6 +1349,16 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
SHL(32, R(valueReg), Imm8(4));
MOVSX(32, 8, valueReg, R(valueReg));
SAR(8, R(valueReg), Imm8(4));
#else
// Sum up (x + y * 4) * 2 + ditherMatrix offset to valueReg.
SHL(32, R(argXReg), Imm8(1));
LEA(32, valueReg, MComplex(argXReg, valueReg, 8, offsetof(PixelFuncID, cached.ditherMatrix)));
// Okay, now abuse argXReg to read the PixelFuncID pointer on the stack.
MOV(PTRBITS, R(argXReg), mArgID);
MOVSX(32, 16, valueReg, MRegSum(argXReg, valueReg));
regCache_.Release(argXReg, PixelRegCache::T_GEN);
#endif
// Copy that value into a vec to add to the color.
X64Reg vecValueReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_VEC);
@ -1461,6 +1483,7 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
// Note that we apply the write mask at the destination bit depth.
X64Reg maskReg = INVALID_REG;
if (id.applyColorWriteMask) {
#ifndef SOFTPIXEL_USE_CACHE
X64Reg gstateReg = GetGState();
maskReg = regCache_.Alloc(PixelRegCache::TEMP3, PixelRegCache::T_GEN);
@ -1496,6 +1519,12 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
OR(32, R(maskReg), Imm32(fixedKeepMask));
break;
}
#else
maskReg = regCache_.Alloc(PixelRegCache::TEMP3, PixelRegCache::T_GEN);
// Load the pre-converted and combined write mask.
MOV(PTRBITS, R(maskReg), mArgID);
MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask)));
#endif
}
// We've run out of regs, let's live without temp2 from here on.

View file

@ -15,12 +15,17 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "Common/Data/Convert/ColorConv.h"
#include "Common/StringUtils.h"
#include "GPU/Software/FuncId.h"
#include "GPU/GPUState.h"
static_assert(sizeof(SamplerID) == sizeof(SamplerID::fullKey), "Bad sampler ID size");
// Guard against unexpected padding or extra members in PixelFuncID.
// With SOFTPIXEL_USE_CACHE defined, the struct holds the cached block in
// addition to the key, so the expected size is the sum of both.
#ifdef SOFTPIXEL_USE_CACHE
static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey) + sizeof(PixelFuncID::cached), "Bad pixel func ID size");
#else
// Without the cache, the struct must be exactly the size of the key.
static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey), "Bad pixel func ID size");
#endif
void ComputePixelFuncID(PixelFuncID *id) {
id->fullKey = 0;
@ -93,6 +98,38 @@ void ComputePixelFuncID(PixelFuncID *id) {
id->applyLogicOp = gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY;
id->applyFog = gstate.isFogEnabled() && !gstate.isModeThrough();
}
// Cache some values for later convenience.
if (id->dithering) {
for (int y = 0; y < 4; ++y) {
for (int x = 0; x < 4; ++x)
id->cached.ditherMatrix[y * 4 + x] = gstate.getDitherValue(x, y);
}
}
if (id->applyColorWriteMask) {
uint32_t mask = gstate.getColorMask();
// This flag means stencil clear or stencil test, basically whether writing to stencil.
if (!id->stencilTest)
mask |= 0xFF000000;
switch (id->fbFormat) {
case GE_FORMAT_565:
id->cached.colorWriteMask = RGBA8888ToRGB565(mask);
break;
case GE_FORMAT_5551:
id->cached.colorWriteMask = RGBA8888ToRGBA5551(mask);
break;
case GE_FORMAT_4444:
id->cached.colorWriteMask = RGBA8888ToRGBA4444(mask);
break;
case GE_FORMAT_8888:
id->cached.colorWriteMask = mask;
break;
}
}
}
std::string DescribePixelFuncID(const PixelFuncID &id) {

View file

@ -23,10 +23,22 @@
#include "GPU/ge_constants.h"
#define SOFTPIXEL_USE_CACHE 1
#pragma pack(push, 1)
struct PixelFuncID {
PixelFuncID() {
}
#ifdef SOFTPIXEL_USE_CACHE
// Values precomputed by ComputePixelFuncID() so that jit code can load them
// through the PixelFuncID pointer rather than deriving them from gstate.
struct {
// Warning: these are not hashed or compared for equality. Just cached values.
// Pre-converted and combined color write mask, in the framebuffer's pixel format.
// Only filled in when applyColorWriteMask is set.
uint32_t colorWriteMask{};
// 4x4 dither matrix stored row by row (index = y * 4 + x).
// Only filled in when dithering is set.
int16_t ditherMatrix[16]{};
} cached;
#endif
union {
uint64_t fullKey{};
struct {
@ -120,6 +132,8 @@ struct PixelFuncID {
}
};
#pragma pack(pop)
struct SamplerID {
SamplerID() : fullKey(0) {
}