diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp index 4aaf012094..ad1ad8b25c 100644 --- a/GPU/Software/DrawPixelX86.cpp +++ b/GPU/Software/DrawPixelX86.cpp @@ -38,6 +38,9 @@ static const X64Reg argZReg = R8; static const X64Reg argFogReg = R9; static const X64Reg argColorReg = XMM4; +// Windows reserves space to save args, 1 xmm + 4 ints before the id. +static const OpArg mArgID = MDisp(RSP, 1 * 16 + 4 * PTRBITS / 8); + // Must save: RBX, RSP, RBP, RDI, RSI, R12-R15, XMM6-15 #else static const X64Reg argXReg = RDI; @@ -46,6 +49,9 @@ static const X64Reg argZReg = RDX; static const X64Reg argFogReg = RCX; static const X64Reg argColorReg = XMM0; +// Here we just have the return and padding to align RPB. +static const OpArg mArgID = MDisp(RSP, 16); + // Must save: RBX, RSP, RBP, R12-R15 #endif @@ -1293,14 +1299,18 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { if (!id.dithering) return true; +#ifndef SOFTPIXEL_USE_CACHE X64Reg gstateReg = GetGState(); +#endif X64Reg valueReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_GEN); // Load the row dither matrix entry (will still need to get the X.) MOV(32, R(valueReg), R(argYReg)); AND(32, R(valueReg), Imm8(3)); +#ifndef SOFTPIXEL_USE_CACHE MOVZX(32, 16, valueReg, MComplex(gstateReg, valueReg, 4, offsetof(GPUgstate, dithmtx))); regCache_.Unlock(gstateReg, PixelRegCache::T_GEN); +#endif // At this point, we're done with depth and y, so let's grab COLOR_OFF and lock it. // Then we can modify x and throw it away too, which is our actual goal. @@ -1309,6 +1319,8 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { regCache_.Release(argYReg, PixelRegCache::T_GEN); AND(32, R(argXReg), Imm32(3)); + +#ifndef SOFTPIXEL_USE_CACHE SHL(32, R(argXReg), Imm8(2)); // Conveniently, this is ECX on Windows, but otherwise we need to swap it. @@ -1337,6 +1349,16 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { SHL(32, R(valueReg), Imm8(4)); MOVSX(32, 8, valueReg, R(valueReg)); SAR(8, R(valueReg), Imm8(4)); +#else + // Sum up (x + y * 4) * 2 + ditherMatrix offset to valueReg. + SHL(32, R(argXReg), Imm8(1)); + LEA(32, valueReg, MComplex(argXReg, valueReg, 8, offsetof(PixelFuncID, cached.ditherMatrix))); + + // Okay, now abuse argXReg to read the PixelFuncID pointer on the stack. + MOV(PTRBITS, R(argXReg), mArgID); + MOVSX(32, 16, valueReg, MRegSum(argXReg, valueReg)); + regCache_.Release(argXReg, PixelRegCache::T_GEN); +#endif // Copy that value into a vec to add to the color. X64Reg vecValueReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_VEC); @@ -1461,6 +1483,7 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) { // Note that we apply the write mask at the destination bit depth. X64Reg maskReg = INVALID_REG; if (id.applyColorWriteMask) { +#ifndef SOFTPIXEL_USE_CACHE X64Reg gstateReg = GetGState(); maskReg = regCache_.Alloc(PixelRegCache::TEMP3, PixelRegCache::T_GEN); @@ -1496,6 +1519,12 @@ bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) { OR(32, R(maskReg), Imm32(fixedKeepMask)); break; } +#else + maskReg = regCache_.Alloc(PixelRegCache::TEMP3, PixelRegCache::T_GEN); + // Load the pre-converted and combined write mask. + MOV(PTRBITS, R(maskReg), mArgID); + MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask))); +#endif } // We've run out of regs, let's live without temp2 from here on. diff --git a/GPU/Software/FuncId.cpp b/GPU/Software/FuncId.cpp index 8c9b4b4e93..08a987b178 100644 --- a/GPU/Software/FuncId.cpp +++ b/GPU/Software/FuncId.cpp @@ -15,12 +15,17 @@ // Official git repository and contact information can be found at // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/. +#include "Common/Data/Convert/ColorConv.h" #include "Common/StringUtils.h" #include "GPU/Software/FuncId.h" #include "GPU/GPUState.h" static_assert(sizeof(SamplerID) == sizeof(SamplerID::fullKey), "Bad sampler ID size"); +#ifdef SOFTPIXEL_USE_CACHE +static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey) + sizeof(PixelFuncID::cached), "Bad pixel func ID size"); +#else static_assert(sizeof(PixelFuncID) == sizeof(PixelFuncID::fullKey), "Bad pixel func ID size"); +#endif void ComputePixelFuncID(PixelFuncID *id) { id->fullKey = 0; @@ -93,6 +98,38 @@ void ComputePixelFuncID(PixelFuncID *id) { id->applyLogicOp = gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY; id->applyFog = gstate.isFogEnabled() && !gstate.isModeThrough(); } + + // Cache some values for later convenience. + if (id->dithering) { + for (int y = 0; y < 4; ++y) { + for (int x = 0; x < 4; ++x) + id->cached.ditherMatrix[y * 4 + x] = gstate.getDitherValue(x, y); + } + } + if (id->applyColorWriteMask) { + uint32_t mask = gstate.getColorMask(); + // This flag means stencil clear or stencil test, basically whether writing to stencil. + if (!id->stencilTest) + mask |= 0xFF000000; + + switch (id->fbFormat) { + case GE_FORMAT_565: + id->cached.colorWriteMask = RGBA8888ToRGB565(mask); + break; + + case GE_FORMAT_5551: + id->cached.colorWriteMask = RGBA8888ToRGBA5551(mask); + break; + + case GE_FORMAT_4444: + id->cached.colorWriteMask = RGBA8888ToRGBA4444(mask); + break; + + case GE_FORMAT_8888: + id->cached.colorWriteMask = mask; + break; + } + } } std::string DescribePixelFuncID(const PixelFuncID &id) { diff --git a/GPU/Software/FuncId.h b/GPU/Software/FuncId.h index f0fb419c58..565860f15d 100644 --- a/GPU/Software/FuncId.h +++ b/GPU/Software/FuncId.h @@ -23,10 +23,22 @@ #include "GPU/ge_constants.h" +#define SOFTPIXEL_USE_CACHE 1 + +#pragma pack(push, 1) + struct PixelFuncID { PixelFuncID() { } +#ifdef SOFTPIXEL_USE_CACHE + struct { + // Warning: these are not hashed or compared for equal. Just cached values. + uint32_t colorWriteMask{}; + int16_t ditherMatrix[16]{}; + } cached; +#endif + union { uint64_t fullKey{}; struct { @@ -120,6 +132,8 @@ struct PixelFuncID { } }; +#pragma pack(pop) + struct SamplerID { SamplerID() : fullKey(0) { }