mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
VFPU-Int: Add a few fast-paths to ReadMatrix/WriteMatrix. Should gain back part of the speed lost in #12217
(which fixed #5399).
This commit is contained in:
parent
9b30e5393d
commit
8642134419
1 changed files with 83 additions and 19 deletions
|
@ -17,7 +17,8 @@
|
|||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <stdio.h>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
#include "Common/BitScan.h"
|
||||
#include "Common/CommonFuncs.h"
|
||||
|
@ -245,14 +246,44 @@ void ReadMatrix(float *rd, MatrixSize size, int reg) {
|
|||
default: _assert_msg_(JIT, 0, "%s: Bad matrix size", __FUNCTION__);
|
||||
}
|
||||
|
||||
for (int i = 0; i < side; i++) {
|
||||
for (int j = 0; j < side; j++) {
|
||||
int index = mtx * 4;
|
||||
if (transpose)
|
||||
index += ((row+i)&3) + ((col+j)&3)*32;
|
||||
else
|
||||
index += ((col+j)&3) + ((row+i)&3)*32;
|
||||
rd[j*4 + i] = V(index);
|
||||
// The voffset ordering is now integrated in these formulas,
|
||||
// eliminating a table lookup.
|
||||
const float *v = currentMIPS->v + (size_t)mtx * 16;
|
||||
if (transpose) {
|
||||
if (side == 4 && col == 0 && row == 0) {
|
||||
// Simple 4x4 transpose. TODO: Optimize.
|
||||
for (int j = 0; j < 4; j++) {
|
||||
for (int i = 0; i < 4; i++) {
|
||||
rd[j * 4 + i] = v[i * 4 + j];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
int index = ((row + i) & 3) * 4 + ((col + j) & 3);
|
||||
rd[j * 4 + i] = v[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Fast path
|
||||
if (side == 4 && col == 0 && row == 0) {
|
||||
memcpy(rd, v, sizeof(float) * 16);
|
||||
/*
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
int index = mtx * 16 + j * 4 + i;
|
||||
rd[j * 4 + i] = v[index];
|
||||
}
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
int index = ((col + j) & 3) * 4 + ((row + i) & 3);
|
||||
rd[j * 4 + i] = v[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -277,16 +308,49 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
|
|||
ERROR_LOG_REPORT(CPU, "Write mask used with vfpu matrix instruction.");
|
||||
}
|
||||
|
||||
for (int i = 0; i < side; i++) {
|
||||
for (int j = 0; j < side; j++) {
|
||||
// Hm, I wonder if this should affect matrices at all.
|
||||
if (j != side -1 || !currentMIPS->VfpuWriteMask(i)) {
|
||||
int index = mtx * 4;
|
||||
if (transpose)
|
||||
index += ((row+i)&3) + ((col+j)&3)*32;
|
||||
else
|
||||
index += ((col+j)&3) + ((row+i)&3)*32;
|
||||
V(index) = rd[j*4+i];
|
||||
// The voffset ordering is now integrated in these formulas,
|
||||
// eliminating a table lookup.
|
||||
float *v = currentMIPS->v + (size_t)mtx * 16;
|
||||
if (transpose) {
|
||||
// Fast path
|
||||
if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
int index = i * 4 + j;
|
||||
v[index] = rd[j * 4 + i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
// Hm, I wonder if this should affect matrices at all.
|
||||
if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
|
||||
int index = ((row + i) & 3) * 4 + ((col + j) & 3);
|
||||
v[index] = rd[j * 4 + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
|
||||
memcpy(v, rd, sizeof(float) * 16);
|
||||
/*
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
int index = j * 4 + i;
|
||||
v[index] = rd[j * 4 + i];
|
||||
}
|
||||
}
|
||||
*/
|
||||
} else {
|
||||
for (int j = 0; j < side; j++) {
|
||||
for (int i = 0; i < side; i++) {
|
||||
// Hm, I wonder if this should affect matrices at all.
|
||||
if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
|
||||
int index = ((col + j) & 3) * 4 + ((row + i) & 3);
|
||||
v[index] = rd[j * 4 + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue