samplerjit: Simplify AVX shift-copies.

These have been the most common cases, and the non-AVX fallback (MOVDQA plus an in-place shift) is safe. Let's just add
a helper.
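
To illustrate, this is the shape of the change at a typical call site (a sketch only; `dstReg`, `srcReg`, and the shift amount are placeholders, not names from the diff):

    // Before: every call site open-coded the AVX-vs-SSE2 dispatch.
    if (cpu_info.bAVX) {
        VPSLLD(128, dstReg, srcReg, 8);
    } else {
        MOVDQA(dstReg, R(srcReg));
        PSLLD(dstReg, 8);
    }

    // After: the overload performs the same dispatch internally, using
    // AVX's non-destructive form when available and MOVDQA otherwise.
    PSLLD(dstReg, srcReg, 8);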
Author: Unknown W. Brackets
Date: 2022-01-17 15:15:36 -08:00
parent 4ea1c08551
commit 0ba2d05da5
3 changed files with 120 additions and 137 deletions

@@ -1683,25 +1683,50 @@ void XEmitter::PUNPCKHWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x69,
 void XEmitter::PUNPCKHDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6A, dest, arg);}
 void XEmitter::PUNPCKHQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6D, dest, arg);}
-void XEmitter::PSRLW(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
+void XEmitter::PSRLW(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLW(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x71, (X64Reg)2, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRLD(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
+void XEmitter::PSRLD(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLD(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x72, (X64Reg)2, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRLQ(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
+void XEmitter::PSRLQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)2, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRLDQ(X64Reg reg, int shift) {
-	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
+void XEmitter::PSRLDQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRLDQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(dest));
 	Write8(shift);
 }
@@ -1710,25 +1735,50 @@ void XEmitter::PSRLW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD1, reg, arg);
 void XEmitter::PSRLD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD2, reg, arg); }
 void XEmitter::PSRLQ(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xD3, reg, arg); }
-void XEmitter::PSLLW(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
+void XEmitter::PSLLW(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLW(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x71, (X64Reg)6, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSLLD(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
+void XEmitter::PSLLD(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLD(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x72, (X64Reg)6, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSLLQ(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
+void XEmitter::PSLLQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)6, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSLLDQ(X64Reg reg, int shift) {
-	WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
+void XEmitter::PSLLDQ(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSLLDQ(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x73, (X64Reg)7, R(dest));
 	Write8(shift);
 }
@@ -1737,14 +1787,26 @@ void XEmitter::PSLLW(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF1, reg, arg);
 void XEmitter::PSLLD(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF2, reg, arg); }
 void XEmitter::PSLLQ(X64Reg reg, OpArg arg) { WriteSSEOp(0x66, 0xF3, reg, arg); }
-void XEmitter::PSRAW(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
+void XEmitter::PSRAW(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRAW(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x71, (X64Reg)4, R(dest));
 	Write8(shift);
 }
-void XEmitter::PSRAD(X64Reg reg, int shift)
-{
-	WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg));
+void XEmitter::PSRAD(X64Reg dest, X64Reg reg, int shift) {
+	if (dest != reg) {
+		if (cpu_info.bAVX) {
+			VPSRAD(128, dest, reg, shift);
+			return;
+		}
+		MOVDQA(dest, R(reg));
+	}
+	WriteSSEOp(0x66, 0x72, (X64Reg)4, R(dest));
 	Write8(shift);
 }

@@ -844,10 +844,14 @@ public:
 	void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
 	void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle);
-	void PSRLW(X64Reg reg, int shift);
-	void PSRLD(X64Reg reg, int shift);
-	void PSRLQ(X64Reg reg, int shift);
-	void PSRLDQ(X64Reg reg, int shift);
+	void PSRLW(X64Reg dest, X64Reg reg, int shift);
+	void PSRLW(X64Reg reg, int shift) { PSRLW(reg, reg, shift); }
+	void PSRLD(X64Reg dest, X64Reg reg, int shift);
+	void PSRLD(X64Reg reg, int shift) { PSRLD(reg, reg, shift); }
+	void PSRLQ(X64Reg dest, X64Reg reg, int shift);
+	void PSRLQ(X64Reg reg, int shift) { PSRLQ(reg, reg, shift); }
+	void PSRLDQ(X64Reg dest, X64Reg reg, int shift);
+	void PSRLDQ(X64Reg reg, int shift) { PSRLDQ(reg, reg, shift); }
 	// Note: all values shifted by lowest 64-bit in XMM arg.
 	void PSRLW(X64Reg reg, OpArg arg);
 	// Note: all values shifted by lowest 64-bit in XMM arg.
@@ -855,10 +859,14 @@ public:
 	// Note: both values shifted by lowest 64-bit in XMM arg.
 	void PSRLQ(X64Reg reg, OpArg arg);
-	void PSLLW(X64Reg reg, int shift);
-	void PSLLD(X64Reg reg, int shift);
-	void PSLLQ(X64Reg reg, int shift);
-	void PSLLDQ(X64Reg reg, int shift);
+	void PSLLW(X64Reg dest, X64Reg reg, int shift);
+	void PSLLW(X64Reg reg, int shift) { PSLLW(reg, reg, shift); }
+	void PSLLD(X64Reg dest, X64Reg reg, int shift);
+	void PSLLD(X64Reg reg, int shift) { PSLLD(reg, reg, shift); }
+	void PSLLQ(X64Reg dest, X64Reg reg, int shift);
+	void PSLLQ(X64Reg reg, int shift) { PSLLQ(reg, reg, shift); }
+	void PSLLDQ(X64Reg dest, X64Reg reg, int shift);
+	void PSLLDQ(X64Reg reg, int shift) { PSLLDQ(reg, reg, shift); }
 	// Note: all values shifted by lowest 64-bit in XMM arg.
 	void PSLLW(X64Reg reg, OpArg arg);
 	// Note: all values shifted by lowest 64-bit in XMM arg.
@@ -866,8 +874,10 @@ public:
 	// Note: both values shifted by lowest 64-bit in XMM arg.
 	void PSLLQ(X64Reg reg, OpArg arg);
-	void PSRAW(X64Reg reg, int shift);
-	void PSRAD(X64Reg reg, int shift);
+	void PSRAW(X64Reg dest, X64Reg reg, int shift);
+	void PSRAW(X64Reg reg, int shift) { PSRAW(reg, reg, shift); }
+	void PSRAD(X64Reg dest, X64Reg reg, int shift);
+	void PSRAD(X64Reg reg, int shift) { PSRAD(reg, reg, shift); }
 	// Note: all values shifted by lowest 64-bit in XMM arg.
 	void PSRAW(X64Reg reg, OpArg arg);
 	// Note: all values shifted by lowest 64-bit in XMM arg.

@@ -1169,12 +1169,7 @@ bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPe
 	X64Reg shiftReg = regCache_.Alloc(RegCache::VEC_TEMP1);
 	// Shift against walls to get 5 bits after the rightmost 2.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, shiftReg, formatReg, 32 - 7);
-	} else {
-		MOVDQA(shiftReg, R(formatReg));
-		PSLLD(shiftReg, 32 - 7);
-	}
+	PSLLD(shiftReg, formatReg, 32 - 7);
 	PSRLD(shiftReg, 32 - 5);
 	// The other lanes are zero, so we can use PSRLD.
 	PSRLD(indexReg, R(shiftReg));
@@ -1189,12 +1184,7 @@ bool SamplerJitCache::Jit_TransformClutIndexQuad(const SamplerID &id, int bitsPe
 	if (id.hasClutMask) {
 		X64Reg maskReg = regCache_.Alloc(RegCache::VEC_TEMP1);
 		// If it was CLUT4, grab only 4 bits of the mask.
-		if (cpu_info.bAVX) {
-			VPSLLD(128, maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
-		} else {
-			MOVDQA(maskReg, R(formatReg));
-			PSLLD(maskReg, bitsPerIndex == 4 ? 20 : 16);
-		}
+		PSLLD(maskReg, formatReg, bitsPerIndex == 4 ? 20 : 16);
 		PSRLD(maskReg, bitsPerIndex == 4 ? 28 : 24);
 		PAND(indexReg, R(maskReg));
@@ -1569,12 +1559,7 @@ bool SamplerJitCache::Jit_ApplyTextureFunc(const SamplerID &id) {
 			useAlphaFrom(primColorReg);
 		} else if (id.useColorDoubling) {
 			// We still need to finish dividing alpha, it's currently doubled (from the 7 above.)
-			if (cpu_info.bAVX) {
-				VPSRLW(128, primColorReg, resultReg, 1);
-			} else {
-				MOVDQA(primColorReg, R(resultReg));
-				PSRLW(primColorReg, 1);
-			}
+			PSRLW(primColorReg, resultReg, 1);
 			useAlphaFrom(primColorReg);
 		}
 		break;
@@ -2606,12 +2591,7 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
 		PSHUFD(size, R(size), _MM_SHUFFLE(0, 0, 0, 0));
 	}
-	if (cpu_info.bAVX) {
-		VPSLLD(128, tempVecReg, size, 8);
-	} else {
-		MOVDQA(tempVecReg, R(size));
-		PSLLD(tempVecReg, 8);
-	}
+	PSLLD(tempVecReg, size, 8);
 	CVTDQ2PS(tempVecReg, R(tempVecReg));
 	// And then multiply.
 	MULPS(dest, R(tempVecReg));
@@ -2948,12 +2928,7 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCac
 	// Divide vvec by 8 in a temp.
 	X64Reg vMultReg = regCache_.Alloc(RegCache::VEC_TEMP1);
-	if (cpu_info.bAVX) {
-		VPSRLD(128, vMultReg, vReg, 3);
-	} else {
-		MOVDQA(vMultReg, R(vReg));
-		PSRLD(vMultReg, 3);
-	}
+	PSRLD(vMultReg, vReg, 3);
 	// And now multiply by bufw. May be able to use a shift in a common case.
 	int shiftAmount = 32 - clz32_nonzero(bitsPerTexel - 1);
@@ -2995,24 +2970,14 @@ bool SamplerJitCache::Jit_PrepareDataSwizzledOffsets(const SamplerID &id, RegCac
 	// Now get ((uvec / texels_per_tile) / 4) * 32 * 4 aka (uvec / (128 / bitsPerTexel)) << 7.
 	X64Reg uCopyReg = regCache_.Alloc(RegCache::VEC_TEMP0);
-	if (cpu_info.bAVX) {
-		VPSRLD(128, uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
-	} else {
-		MOVDQA(uCopyReg, R(uReg));
-		PSRLD(uCopyReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
-	}
+	PSRLD(uCopyReg, uReg, 7 + clz32_nonzero(bitsPerTexel - 1) - 32);
 	PSLLD(uCopyReg, 7);
 	// Add it in to our running total.
 	PADDD(vReg, R(uCopyReg));
 	if (bitsPerTexel == 4) {
 		// Finally, we want (uvec & 31) / 2. Use a 16-bit wall.
-		if (cpu_info.bAVX) {
-			VPSLLW(128, uCopyReg, uReg, 11);
-		} else {
-			MOVDQA(uCopyReg, R(uReg));
-			PSLLW(uCopyReg, 11);
-		}
+		PSLLW(uCopyReg, uReg, 11);
 		PSRLD(uCopyReg, 12);
 		// With that, this is our byte offset. uvec & 1 has which half.
 		PADDD(vReg, R(uCopyReg));
@@ -3075,34 +3040,19 @@ bool SamplerJitCache::Jit_Decode5650Quad(const SamplerID &id, Rasterizer::RegCac
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
 	// Filter out red only into temp1. We do this by shifting into a wall.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp1Reg, quadReg, 32 - 5);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSLLD(temp1Reg, 32 - 5);
-	}
+	PSLLD(temp1Reg, quadReg, 32 - 5);
 	// Move it right to the top of the 8 bits.
 	PSRLD(temp1Reg, 24);
 	// Now we bring in blue, since it's also 5 like red.
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 11);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		// Luckily, we know the top 16 bits are zero. Shift right into a wall.
-		PSRLD(temp2Reg, 11);
-	}
+	// Luckily, we know the top 16 bits are zero. Shift right into a wall.
+	PSRLD(temp2Reg, quadReg, 11);
 	// Shift blue into place at 19, and merge back to temp1.
 	PSLLD(temp2Reg, 19);
 	POR(temp1Reg, R(temp2Reg));
 	// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp2Reg, temp1Reg, 1);
-	} else {
-		MOVDQA(temp2Reg, R(temp1Reg));
-		PSLLD(temp2Reg, 1);
-	}
+	PSLLD(temp2Reg, temp1Reg, 1);
 	// We go to green last because it's the different one. Shift off red and blue.
 	PSRLD(quadReg, 5);
@@ -3179,22 +3129,12 @@ bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCac
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
 	// Filter out red only into temp1. We do this by shifting into a wall.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp1Reg, quadReg, 32 - 5);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSLLD(temp1Reg, 32 - 5);
-	}
+	PSLLD(temp1Reg, quadReg, 32 - 5);
 	// Move it right to the top of the 8 bits.
 	PSRLD(temp1Reg, 24);
 	// Add in green and shift into place (top 5 bits of byte 2.)
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 5);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		PSRLD(temp2Reg, 5);
-	}
+	PSRLD(temp2Reg, quadReg, 5);
 	PSLLW(temp2Reg, 11);
 	POR(temp1Reg, R(temp2Reg));
@@ -3206,12 +3146,7 @@ bool SamplerJitCache::Jit_Decode5551Quad(const SamplerID &id, Rasterizer::RegCac
 	// Combine both together, we still need to swizzle.
 	POR(quadReg, R(temp1Reg));
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp1Reg, quadReg, 5);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSRLD(temp1Reg, 5);
-	}
+	PSRLD(temp1Reg, quadReg, 5);
 	// Now for swizzle, we'll mask carefully to avoid overflow.
 	PAND(temp1Reg, M(const5551Swizzle_));
@@ -3271,31 +3206,16 @@ bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCac
 	X64Reg temp2Reg = regCache_.Alloc(RegCache::VEC_TEMP2);
 	// Mask and move red into position within temp1.
-	if (cpu_info.bAVX) {
-		VPSLLD(128, temp1Reg, quadReg, 28);
-	} else {
-		MOVDQA(temp1Reg, R(quadReg));
-		PSLLD(temp1Reg, 28);
-	}
+	PSLLD(temp1Reg, quadReg, 28);
 	PSRLD(temp1Reg, 24);
 	// Green is easy too, we use a word shift to get a free wall.
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 4);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		PSRLD(temp2Reg, 4);
-	}
+	PSRLD(temp2Reg, quadReg, 4);
 	PSLLW(temp2Reg, 12);
 	POR(temp1Reg, R(temp2Reg));
 	// Blue isn't last this time, but it's next.
-	if (cpu_info.bAVX) {
-		VPSRLD(128, temp2Reg, quadReg, 8);
-	} else {
-		MOVDQA(temp2Reg, R(quadReg));
-		PSRLD(temp2Reg, 8);
-	}
+	PSRLD(temp2Reg, quadReg, 8);
 	PSLLD(temp2Reg, 28);
 	PSRLD(temp2Reg, 8);
 	POR(temp1Reg, R(temp2Reg));
@@ -3307,20 +3227,11 @@ bool SamplerJitCache::Jit_Decode4444Quad(const SamplerID &id, Rasterizer::RegCac
 		POR(quadReg, R(temp1Reg));
 		// Masking isn't necessary here since everything is 4 wide.
-		if (cpu_info.bAVX) {
-			VPSRLD(128, temp1Reg, quadReg, 4);
-		} else {
-			MOVDQA(temp1Reg, R(quadReg));
-			PSRLD(temp1Reg, 4);
-		}
-		POR(quadReg, R(temp1Reg));
-	} else if (cpu_info.bAVX) {
-		VPSRLD(128, quadReg, temp1Reg, 4);
+		PSRLD(temp1Reg, quadReg, 4);
 		POR(quadReg, R(temp1Reg));
 	} else {
-		// Overwrite colorReg (we need temp1 as a copy anyway.)
-		MOVDQA(quadReg, R(temp1Reg));
-		PSRLD(temp1Reg, 4);
+		// Overwrite quadReg (we need temp1 as a copy anyway.)
+		PSRLD(quadReg, temp1Reg, 4);
 		POR(quadReg, R(temp1Reg));
 	}