diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp index 181cccf3db..7822f31bee 100644 --- a/GPU/Software/SamplerX86.cpp +++ b/GPU/Software/SamplerX86.cpp @@ -138,6 +138,8 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { regCache_.ForceRetain(RegCache::VEC_ARG_U); regCache_.ChangeReg(XMM1, RegCache::VEC_ARG_V); regCache_.ForceRetain(RegCache::VEC_ARG_V); + regCache_.ChangeReg(XMM5, RegCache::VEC_RESULT); + regCache_.ForceRetain(RegCache::VEC_RESULT); // We'll first write the nearest sampler, which we will CALL. // This may differ slightly based on the "linear" flag. @@ -154,6 +156,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { regCache_.ForceRelease(RegCache::VEC_ARG_U); regCache_.ForceRelease(RegCache::VEC_ARG_V); + regCache_.ForceRelease(RegCache::VEC_RESULT); if (regCache_.Has(RegCache::GEN_ARG_LEVEL)) regCache_.ForceRelease(RegCache::GEN_ARG_LEVEL); regCache_.Reset(true); @@ -178,13 +181,10 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { PUSH(arg2Reg); PUSH(arg1Reg); #endif - // Extra space to restore alignment and save resultReg for lerp. - // TODO: Maybe use XMMs instead? - SUB(64, R(RSP), Imm8(24)); #ifdef _WIN32 - // First arg now starts at 24 (extra space) + 32 (pushed stack) + 8 (ret address) + 32 (shadow space) - const int argOffset = 24 + 32 + 8 + 32; + // First arg now starts at 32 (pushed stack) + 8 (ret address) + 32 (shadow space) + const int argOffset = 32 + 8 + 32; MOV(64, R(R14), MDisp(RSP, argOffset)); MOV(32, R(R15), MDisp(RSP, argOffset + 8)); // level is at argOffset + 16. @@ -205,7 +205,10 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { } // At this point: - // XMM0=uvec, XMM1=vvec, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level + // XMM0=uvec, XMM1=vvec, stack+0=frac_u, stack+8=frac_v, R14=src, R15=bufw, stack+X=level + + // We'll accumulate values into XMM5. + PXOR(XMM5, R(XMM5)); // This stores the result on the stack for later processing. auto doNearestCall = [&](int off) { @@ -226,7 +229,14 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { PSRLDQ(XMM1, 4); CALL(nearest); - MOV(32, MDisp(RSP, off), R(resultReg)); + + if (off == 0) { + MOVD_xmm(XMM5, R(resultReg)); + } else { + MOVD_xmm(XMM2, R(resultReg)); + PSLLDQ(XMM2, off); + POR(XMM5, R(XMM2)); + } }; doNearestCall(0); @@ -239,10 +249,10 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { PXOR(XMM0, R(XMM0)); } - MOVD_xmm(fpScratchReg1, MDisp(RSP, 0)); - MOVD_xmm(fpScratchReg2, MDisp(RSP, 4)); - MOVD_xmm(fpScratchReg3, MDisp(RSP, 8)); - MOVD_xmm(fpScratchReg4, MDisp(RSP, 12)); + PSHUFD(fpScratchReg1, R(XMM5), _MM_SHUFFLE(0, 0, 0, 0)); + PSHUFD(fpScratchReg2, R(XMM5), _MM_SHUFFLE(1, 1, 1, 1)); + PSHUFD(fpScratchReg3, R(XMM5), _MM_SHUFFLE(2, 2, 2, 2)); + PSHUFD(fpScratchReg4, R(XMM5), _MM_SHUFFLE(3, 3, 3, 3)); if (cpu_info.bSSE4_1) { PMOVZXBD(fpScratchReg1, R(fpScratchReg1)); @@ -265,7 +275,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { CVTDQ2PS(fpScratchReg4, R(fpScratchReg4)); // Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)... - MOVD_xmm(fpScratchReg5, MDisp(RSP, 24)); + MOVD_xmm(fpScratchReg5, MDisp(RSP, 0)); CVTDQ2PS(fpScratchReg5, R(fpScratchReg5)); SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0)); if (RipAccessible(by256)) { @@ -295,7 +305,7 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { ADDPS(fpScratchReg3, R(fpScratchReg4)); // Next, time for frac_v. - MOVD_xmm(fpScratchReg5, MDisp(RSP, 32)); + MOVD_xmm(fpScratchReg5, MDisp(RSP, 8)); CVTDQ2PS(fpScratchReg5, R(fpScratchReg5)); SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0)); if (RipAccessible(ones)) { @@ -327,7 +337,6 @@ LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) { SetJumpTarget(zeroSrc); } - ADD(64, R(RSP), Imm8(24)); POP(arg3Reg); POP(arg4Reg); POP(R14); @@ -1221,12 +1230,12 @@ bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) { } else { if (id.linear) { #ifdef _WIN32 - const int argOffset = 24 + 32 + 8 + 32; + const int argOffset = 32 + 8 + 32; // Extra 8 to account for CALL. MOV(32, R(temp2Reg), MDisp(RSP, argOffset + 16 + 8)); #else // Extra 8 to account for CALL. - MOV(32, R(temp2Reg), MDisp(RSP, 24 + 48 + 8 + 8)); + MOV(32, R(temp2Reg), MDisp(RSP, 48 + 8 + 8)); #endif } else { #ifdef _WIN32