mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
x86jit: Micro optimize vi2x a bit with ssse3/sse4.
Both are small wins.
This commit is contained in:
parent
0e646f748a
commit
bc7497857a
3 changed files with 49 additions and 13 deletions
|
@ -1742,6 +1742,15 @@ void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3800, dest
|
|||
// PTEST: logical compare of dest against arg (sets flags only).
// Emitted through WriteSSE41Op using the 66 0F 38 17 encoding.
void XEmitter::PTEST(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x3817, dest, arg);
}
|
||||
// PACKUSDW: pack signed dwords into words with unsigned saturation.
// Emitted through WriteSSE41Op using the 66 0F 38 2B encoding.
void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x382b, dest, arg);
}
|
||||
|
||||
// PMINSB: per-lane minimum of packed signed bytes.
// Emitted through WriteSSE41Op using the 66 0F 38 38 encoding.
void XEmitter::PMINSB(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x3838, dest, arg);
}
|
||||
// PMINSD: per-lane minimum of packed signed dwords.
// Emitted through WriteSSE41Op using the 66 0F 38 39 encoding.
void XEmitter::PMINSD(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x3839, dest, arg);
}
|
||||
// PMINUW: per-lane minimum of packed unsigned words.
// Emitted through WriteSSE41Op using the 66 0F 38 3A encoding.
void XEmitter::PMINUW(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x383a, dest, arg);
}
|
||||
// PMINUD: per-lane minimum of packed unsigned dwords.
// Emitted through WriteSSE41Op using the 66 0F 38 3B encoding.
void XEmitter::PMINUD(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x383b, dest, arg);
}
|
||||
// PMAXSB: per-lane maximum of packed signed bytes.
// Emitted through WriteSSE41Op using the 66 0F 38 3C encoding.
void XEmitter::PMAXSB(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x383c, dest, arg);
}
|
||||
// PMAXSD: per-lane maximum of packed signed dwords.
// Emitted through WriteSSE41Op using the 66 0F 38 3D encoding.
void XEmitter::PMAXSD(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x383d, dest, arg);
}
|
||||
// PMAXUW: per-lane maximum of packed unsigned words.
// Emitted through WriteSSE41Op using the 66 0F 38 3E encoding.
void XEmitter::PMAXUW(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x383e, dest, arg);
}
|
||||
// PMAXUD: per-lane maximum of packed unsigned dwords.
// Emitted through WriteSSE41Op using the 66 0F 38 3F encoding.
void XEmitter::PMAXUD(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x383f, dest, arg);
}
|
||||
|
||||
// PMOVSXBW: sign-extend packed bytes to words.
// Emitted through WriteSSE41Op using the 66 0F 38 20 encoding.
void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x3820, dest, arg);
}
|
||||
// PMOVSXBD: sign-extend packed bytes to dwords.
// Emitted through WriteSSE41Op using the 66 0F 38 21 encoding.
void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x3821, dest, arg);
}
|
||||
// PMOVSXBQ: sign-extend packed bytes to qwords.
// Emitted through WriteSSE41Op using the 66 0F 38 22 encoding.
void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {
	WriteSSE41Op(0x66, 0x3822, dest, arg);
}
|
||||
|
|
|
@ -763,7 +763,15 @@ public:
|
|||
void PMAXUB(X64Reg dest, OpArg arg);
|
||||
void PMINSW(X64Reg dest, OpArg arg);
|
||||
void PMINUB(X64Reg dest, OpArg arg);
|
||||
// SSE4 has PMAXSB and PMINSB and PMAXUW and PMINUW too if we need them.
|
||||
// SSE4.1: additional packed MIN/MAX instructions (signed byte/dword, unsigned word/dword).
|
||||
void PMINSB(X64Reg dest, OpArg arg);
|
||||
void PMINSD(X64Reg dest, OpArg arg);
|
||||
void PMINUW(X64Reg dest, OpArg arg);
|
||||
void PMINUD(X64Reg dest, OpArg arg);
|
||||
void PMAXSB(X64Reg dest, OpArg arg);
|
||||
void PMAXSD(X64Reg dest, OpArg arg);
|
||||
void PMAXUW(X64Reg dest, OpArg arg);
|
||||
void PMAXUD(X64Reg dest, OpArg arg);
|
||||
|
||||
void PMOVMSKB(X64Reg dest, OpArg arg);
|
||||
void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
|
||||
|
|
|
@ -2222,6 +2222,12 @@ void Jit::Comp_VDet(MIPSOpcode op) {
|
|||
DISABLE;
|
||||
}
|
||||
|
||||
// The goal is to map (reversed byte order for clarity):
|
||||
// 000000AA 000000BB 000000CC 000000DD -> AABBCCDD
|
||||
// PSHUFB control mask used by Comp_Vi2x when bits == 8.
// Entries 3, 7, 11, 15 pick the top byte of each 32-bit lane into the low four result bytes.
// Each -1 entry has its high bit set, which makes PSHUFB write zero to that result byte.
static s8 MEMORY_ALIGNED16( vi2xc_shuffle[16] ) = { 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
|
||||
// 0000AAAA 0000BBBB 0000CCCC 0000DDDD -> AAAABBBB CCCCDDDD
|
||||
// PSHUFB control mask used by Comp_Vi2x when bits != 8.
// Entries 2-3, 6-7, 10-11, 14-15 pick the top 16 bits of each 32-bit lane into the low eight result bytes.
// Each -1 entry has its high bit set, which makes PSHUFB write zero to that result byte.
static s8 MEMORY_ALIGNED16( vi2xs_shuffle[16] ) = { 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 };
|
||||
|
||||
void Jit::Comp_Vi2x(MIPSOpcode op) {
|
||||
CONDITIONAL_DISABLE;
|
||||
if (js.HasUnknownPrefix())
|
||||
|
@ -2293,27 +2299,40 @@ void Jit::Comp_Vi2x(MIPSOpcode op) {
|
|||
PUNPCKLQDQ(dst0, R(XMM0));
|
||||
} else {
|
||||
// Otherwise, we need to zero out the top 2.
|
||||
// TODO: Maybe PAND would be better?
|
||||
// We expect XMM1 to be zero below.
|
||||
PXOR(XMM1, R(XMM1));
|
||||
PUNPCKLQDQ(dst0, R(XMM1));
|
||||
}
|
||||
|
||||
// For "u" type ops, we clamp to zero and shift off the sign bit first.
|
||||
if (unsignedOp) {
|
||||
// Get a mask of the sign bit in dst0, then and in the values. This clamps to 0.
|
||||
MOVDQA(XMM1, R(dst0));
|
||||
PSRAD(dst0, 31);
|
||||
PSLLD(XMM1, 1);
|
||||
PANDN(dst0, R(XMM1));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
if (sz == V_Quad) {
|
||||
// Zeroed in the other case above.
|
||||
PXOR(XMM1, R(XMM1));
|
||||
}
|
||||
PMAXSD(dst0, R(XMM1));
|
||||
PSLLD(dst0, 1);
|
||||
} else {
|
||||
// Build a per-lane mask from the sign bit in dst0, then AND-NOT it with the values shifted left by one. Negative lanes clamp to 0.
|
||||
MOVDQA(XMM1, R(dst0));
|
||||
PSRAD(dst0, 31);
|
||||
PSLLD(XMM1, 1);
|
||||
PANDN(dst0, R(XMM1));
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, everything is aligned in the high bits of our lanes.
|
||||
// Let's *arithmetically* shift in the sign so we can use saturating packs.
|
||||
PSRAD(dst0, 32 - bits);
|
||||
// XMM1 used for the high part just so there's no dependency. It contains garbage or 0.
|
||||
PACKSSDW(dst0, R(XMM1));
|
||||
if (bits == 8) {
|
||||
PACKSSWB(dst0, R(XMM1));
|
||||
if (cpu_info.bSSSE3) {
|
||||
PSHUFB(dst0, bits == 8 ? M(vi2xc_shuffle) : M(vi2xs_shuffle));
|
||||
} else {
|
||||
// Let's *arithmetically* shift in the sign so we can use saturating packs.
|
||||
PSRAD(dst0, 32 - bits);
|
||||
// XMM1 used for the high part just so there's no dependency. It contains garbage or 0.
|
||||
PACKSSDW(dst0, R(XMM1));
|
||||
if (bits == 8) {
|
||||
PACKSSWB(dst0, R(XMM1));
|
||||
}
|
||||
}
|
||||
|
||||
if (!fpr.V(dregs[0]).IsSimpleReg(dst0)) {
|
||||
|
|
Loading…
Add table
Reference in a new issue