Merge pull request #7117 from unknownbrackets/jit-simd

x86jit: Implement vmmov using SIMD
This commit is contained in:
Henrik Rydgård 2014-11-30 09:24:22 +01:00
commit 7deb8055ee
4 changed files with 88 additions and 2 deletions

View file

@ -2389,6 +2389,48 @@ void Jit::Comp_Vmmov(MIPSOpcode op) {
MatrixSize sz = GetMtxSize(op);
int n = GetMatrixSide(sz);
// SIMD path: copy the source matrix (_VS) into the destination matrix (_VD)
// one column at a time, emitting one MOVAPS per column.
if (jo.enableVFPUSIMD) {
VectorSize vsz = GetVectorSize(sz);
// dest[i] holds the register list that column i is written to first: either
// the real VD column, or a set of temps when VS and VD overlap.
u8 dest[4][4];
MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);
u8 vecs[4];
if (overlap == OVERLAP_NONE) {
// No overlap: write straight into the VD columns.
GetMatrixColumns(_VD, sz, vecs);
for (int i = 0; i < n; ++i) {
GetVectorRegs(dest[i], vsz, vecs[i]);
}
} else {
// Overlap: stage every column in temp registers so that no source column
// is overwritten before it has been read.
// NOTE(review): the return value of GetTempVS() is ignored here, so a
// failed temp allocation is not detected at this call site.
for (int i = 0; i < n; ++i) {
fpr.GetTempVS(dest[i], vsz);
}
}
// First pass: VS column i -> dest[i].
GetMatrixColumns(_VS, sz, vecs);
for (int i = 0; i < n; i++) {
u8 vec[4];
GetVectorRegs(vec, vsz, vecs[i]);
fpr.MapRegsVS(vec, vsz, 0);
// Destination is mapped with MAP_NOINIT - presumably because it is fully
// overwritten and its previous contents need not be loaded (confirm against
// the MapRegsVS implementation).
fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
}
if (overlap != OVERLAP_NONE) {
// Okay, move from the temps to VD now.
// Second pass (overlap case only): dest[i] (temps) -> VD column i.
GetMatrixColumns(_VD, sz, vecs);
for (int i = 0; i < n; i++) {
u8 vec[4];
GetVectorRegs(vec, vsz, vecs[i]);
fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
fpr.MapRegsVS(dest[i], vsz, 0);
MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
}
}
fpr.ReleaseSpillLocks();
// The SIMD path handled the whole instruction; skip the code that follows
// this block.
return;
}
u8 sregs[16], dregs[16];
GetMatrixRegs(sregs, sz, _VS);
GetMatrixRegs(dregs, sz, _VD);

View file

@ -716,6 +716,50 @@ int FPURegCache::GetTempR() {
return -1;
}
// Reserves enough temp registers for a vector of size vsz and writes them to v.
//
// v    - output array with room for GetNumVectorElements(vsz) entries; each
//        entry receives the chosen regs[] index minus 32.
// vsz  - vector size that determines how many temps are needed.
//
// Returns 0 on success (all chosen temps are marked tempLocked), or -1 if not
// enough free temps were found (nothing is locked in that case, and the
// contents of v are unspecified).
//
// A run of n consecutive free temps is preferred; if none exists, free temps
// collected individually during the scan are used instead.
//
// NOTE(review): the scan stops at TEMP0 + NUM_TEMPS - n, so the last n - 1
// temps can only be chosen as part of a consecutive run, never by the
// individual fallback. Left as-is to preserve allocation behavior.
int FPURegCache::GetTempVS(u8 *v, VectorSize vsz) {
	pendingFlush = true;
	const int n = GetNumVectorElements(vsz);

	// Let's collect regs as we go, but try for n free in a row.
	int found = 0;
	for (int r = TEMP0; r <= TEMP0 + NUM_TEMPS - n; ++r) {
		if (regs[r].away || regs[r].tempLocked) {
			continue;
		}

		// How many free siblings does this have?
		int seq = 1;
		for (int i = 1; i < n; ++i) {
			if (regs[r + i].away || regs[r + i].tempLocked) {
				break;
			}
			++seq;
		}

		if (seq == n) {
			// Got 'em.  Exactly as many as we need.  This overwrites anything
			// collected individually so far.
			for (int i = 0; i < n; ++i) {
				v[i] = r + i - 32;
			}
			found = n;
			break;
		}

		// No full run starting here; remember this single free temp as a fallback.
		if (found < n) {
			v[found++] = r - 32;
		}
	}

	if (found != n) {
		_assert_msg_(JIT, 0, "Regcache ran out of temp regs, might need to DiscardR() some.");
		return -1;
	}

	for (int i = 0; i < n; ++i) {
		regs[v[i] + 32].tempLocked = true;
	}

	// Previously control fell off the end of this non-void function on success,
	// which is undefined behavior.  Report success explicitly.
	return 0;
}
void FPURegCache::Flush() {
if (!pendingFlush) {
return;

View file

@ -114,7 +114,7 @@ public:
// Same as GetTempR(), but with the result offset by -32.
int GetTempV() {
	const int tempReg = GetTempR();
	return tempReg - 32;
}
// TODO: GetTempVS?
int GetTempVS(u8 *v, VectorSize vsz);
// Stores the given emitter pointer in `emit`.
void SetEmitter(XEmitter *emitter) {
	emit = emitter;
}
// Stores the given JIT options pointer in `jo_`.
void SetOptions(MIPSComp::JitOptions *jo) {
	jo_ = jo;
}

View file

@ -173,7 +173,6 @@ bool TestJit() {
#else
std::vector<std::string> lines = DisassembleX86(block->normalEntry, block->codeSize);
#endif
printf("Jit was %fx faster than interp.\n\n", jit_speed / interp_speed);
// Cut off at 25 due to the repetition above. Might need tweaking for large instructions.
const int cutoff = 25;
for (int i = 0; i < std::min((int)lines.size(), cutoff); i++) {
@ -181,6 +180,7 @@ bool TestJit() {
}
if (lines.size() > cutoff)
printf("...\n");
printf("Jit was %fx faster than interp.\n\n", jit_speed / interp_speed);
}
printf("\n");