From 771d459025a8a065689ea4adb011dff279b13870 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 24 Nov 2021 23:01:03 -0800 Subject: [PATCH] softjit: Use SSE4.1 for fog and dither a bit. --- GPU/Software/DrawPixelX86.cpp | 36 ++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp index e46d17806a..52e3b2fa6c 100644 --- a/GPU/Software/DrawPixelX86.cpp +++ b/GPU/Software/DrawPixelX86.cpp @@ -468,13 +468,20 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { } // Load fog and expand to 16 bit. Ignore the high 8 bits, which'll match up with A. - X64Reg zeroReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_VEC); + X64Reg zeroReg = INVALID_REG; X64Reg fogColorReg = regCache_.Alloc(PixelRegCache::TEMP1, PixelRegCache::T_VEC); - PXOR(zeroReg, R(zeroReg)); X64Reg gstateReg = GetGState(); - MOVD_xmm(fogColorReg, MDisp(gstateReg, offsetof(GPUgstate, fogcolor))); + if (cpu_info.bSSE4_1) { + X64Reg gstateReg = GetGState(); + // This actually loads the texlodslope too, but that's okay. + PMOVZXBW(fogColorReg, MDisp(gstateReg, offsetof(GPUgstate, fogcolor))); + } else { + zeroReg = regCache_.Alloc(PixelRegCache::TEMP0, PixelRegCache::T_VEC); + PXOR(zeroReg, R(zeroReg)); + MOVD_xmm(fogColorReg, MDisp(gstateReg, offsetof(GPUgstate, fogcolor))); + PUNPCKLBW(fogColorReg, R(zeroReg)); + } regCache_.Unlock(gstateReg, PixelRegCache::T_GEN); - PUNPCKLBW(fogColorReg, R(zeroReg)); // Load a set of 255s at 16 bit into a reg for later... X64Reg invertReg = regCache_.Alloc(PixelRegCache::TEMP2, PixelRegCache::T_VEC); @@ -483,8 +490,12 @@ bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) { regCache_.Unlock(constReg, PixelRegCache::T_GEN); // Expand (we clamped) color to 16 bit as well, so we can multiply with fog. - PUNPCKLBW(argColorReg, R(zeroReg)); - regCache_.Release(zeroReg, PixelRegCache::T_VEC); + if (cpu_info.bSSE4_1) { + PMOVZXBW(argColorReg, R(argColorReg)); + } else { + PUNPCKLBW(argColorReg, R(zeroReg)); + regCache_.Release(zeroReg, PixelRegCache::T_VEC); + } // Save A so we can put it back, we don't "fog" A. X64Reg alphaReg; @@ -1003,10 +1014,14 @@ bool PixelJitCache::Jit_Dither(const PixelFuncID &id) { // We use 16-bit because we need a signed add, but we also want to saturate. PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(1, 0, 0, 0)); // With that, now let's convert the color to 16 bit... - X64Reg zeroReg = regCache_.Alloc(PixelRegCache::TEMP1, PixelRegCache::T_VEC); - PXOR(zeroReg, R(zeroReg)); - PUNPCKLBW(argColorReg, R(zeroReg)); - regCache_.Release(zeroReg, PixelRegCache::T_VEC); + if (cpu_info.bSSE4_1) { + PMOVZXBW(argColorReg, R(argColorReg)); + } else { + X64Reg zeroReg = regCache_.Alloc(PixelRegCache::TEMP1, PixelRegCache::T_VEC); + PXOR(zeroReg, R(zeroReg)); + PUNPCKLBW(argColorReg, R(zeroReg)); + regCache_.Release(zeroReg, PixelRegCache::T_VEC); + } // And simply add the dither values. PADDSW(argColorReg, R(vecValueReg)); regCache_.Release(vecValueReg, PixelRegCache::T_VEC); @@ -1520,7 +1535,6 @@ bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, PixelRegCache::Reg c finishes.push_back(J(true)); tableValues[GE_LOGIC_SET] = GetCodePointer(); - // TODO: Apply logic op and add stencil meanwhile. if (stencilReg != INVALID_REG && maskReg != INVALID_REG) { OR(32, R(colorReg), R(stencilReg)); OR(bits, R(colorReg), notStencilMask);