Arm VFPU: 2-op and 3-op: map registers before work, improving instruction order a little.

This commit is contained in:
Henrik Rydgard 2014-03-12 11:06:26 +01:00
parent 7ae9c26b6a
commit 174b5187e6

View file

@ -699,8 +699,19 @@ namespace MIPSComp
}
}
// Map first, then work. This will allow us to use VLDMIA more often
// (when we add the appropriate map function) and the instruction ordering
// will improve.
// Note that mapping like this (instead of first all sregs, first all tregs etc)
// reduces the amount of continuous registers a lot :(
for (int i = 0; i < n; i++) {
fpr.MapDirtyInInV(tempregs[i], sregs[i], tregs[i]);
fpr.SpillLockV(tempregs[i]);
fpr.SpillLockV(sregs[i]);
fpr.SpillLockV(tregs[i]);
}
for (int i = 0; i < n; i++) {
switch (op >> 26) {
case 24: //VFPU0
switch ((op >> 23)&7) {
@ -798,6 +809,31 @@ namespace MIPSComp
return;
}
// Catch the disabled operations immediately so we don't map registers unnecessarily later.
// Move these down to the big switch below as they are implemented.
switch ((op >> 16) & 0x1f) {
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
DISABLE;
break;
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
DISABLE;
break;
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
DISABLE;
break;
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
DISABLE;
break;
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
DISABLE;
break;
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
DISABLE;
break;
default:
;
}
VectorSize sz = GetVecSize(op);
int n = GetNumVectorElements(sz);
@ -827,21 +863,27 @@ namespace MIPSComp
t4 = fpr.V(t[2]);
}
// Pre map the registers to get better instruction ordering.
// Note that mapping like this (instead of first all sregs, first all tempregs etc)
// reduces the amount of continuous registers a lot :(
for (int i = 0; i < n; i++) {
fpr.MapDirtyInV(tempregs[i], sregs[i]);
fpr.SpillLockV(tempregs[i]);
fpr.SpillLockV(sregs[i]);
}
// Warning: sregs[i] and tempxregs[i] may be the same reg.
// Helps for vmov, hurts for vrcp, etc.
for (int i = 0; i < n; ++i) {
for (int i = 0; i < n; i++) {
switch ((op >> 16) & 0x1f) {
case 0: // d[i] = s[i]; break; //vmov
// Probably for swizzle.
fpr.MapDirtyInV(tempregs[i], sregs[i]);
VMOV(fpr.V(tempregs[i]), fpr.V(sregs[i]));
break;
case 1: // d[i] = fabsf(s[i]); break; //vabs
fpr.MapDirtyInV(tempregs[i], sregs[i]);
VABS(fpr.V(tempregs[i]), fpr.V(sregs[i]));
break;
case 2: // d[i] = -s[i]; break; //vneg
fpr.MapDirtyInV(tempregs[i], sregs[i]);
VNEG(fpr.V(tempregs[i]), fpr.V(sregs[i]));
break;
@ -866,30 +908,15 @@ namespace MIPSComp
*/
case 16: // d[i] = 1.0f / s[i]; break; //vrcp
fpr.MapDirtyInV(tempregs[i], sregs[i]);
MOVI2F(S0, 1.0f, R0);
VDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
break;
case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
fpr.MapDirtyInV(tempregs[i], sregs[i]);
MOVI2F(S0, 1.0f, R0);
VSQRT(S1, fpr.V(sregs[i]));
VDIV(fpr.V(tempregs[i]), S0, S1);
break;
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
DISABLE;
break;
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
DISABLE;
break;
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
DISABLE;
break;
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
DISABLE;
break;
case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
fpr.MapDirtyInV(tempregs[i], sregs[i]);
VSQRT(fpr.V(tempregs[i]), fpr.V(sregs[i]));
VABS(fpr.V(tempregs[i]), fpr.V(tempregs[i]));
break;
@ -897,7 +924,6 @@ namespace MIPSComp
// Seems to work well enough but can disable if it becomes a problem.
// Should be easy enough to translate to NEON. There we can load all the constants
// in one go of course.
fpr.MapDirtyInV(tempregs[i], sregs[i]);
MOVI2F(S0, 0.0f, R0);
VCMP(fpr.V(sregs[i]), S0); // flags = sign(sregs[i])
VMRS_APSR();
@ -924,17 +950,11 @@ namespace MIPSComp
VMUL(fpr.V(tempregs[i]), fpr.V(tempregs[i]), S1);
break;
case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
fpr.MapDirtyInV(tempregs[i], sregs[i]);
MOVI2F(S0, -1.0f, R0);
VDIV(fpr.V(tempregs[i]), S0, fpr.V(sregs[i]));
break;
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
DISABLE;
break;
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
DISABLE;
break;
default:
ERROR_LOG(JIT, "case missing in vfpu vv2op");
DISABLE;
break;
}