Optimize software skinning for x86.

Can't seem to get a win on Windows vs hardware skinning though, even though
draw calls drop by two thirds...
Henrik Rydgard 2013-11-10 20:08:46 +01:00 committed by Henrik Rydgård
parent 9333d3ea76
commit 6221dbaf5d
5 changed files with 342 additions and 14 deletions

@@ -1371,6 +1371,12 @@ void XEmitter::PSLLQ(X64Reg reg, int shift) {
Write8(shift);
}
void XEmitter::PSLLDQ(X64Reg reg, int shift) {
WriteSSEOp(64, 0x73, true, (X64Reg)7, R(reg));
Write8(shift);
}
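
For context: PSLLQ shifts each 64-bit lane by a bit count, while the new PSLLDQ shifts the whole XMM register left by a byte count (encoding 66 0F 73 /7, matching the (X64Reg)7 above). The vertex decoder below leans on this to assemble an (x, y, z) vector one component at a time; a sketch of that pattern, with "off" standing in for the real source offset:

	// Build (x, y, z, 0) in XMM3 from three sign-extended bytes. We load z
	// first; each PSLLDQ by 4 bytes pushes the lanes filled so far up one slot.
	XORPS(XMM3, R(XMM3));
	for (int i = 0; i < 3; i++) {
		MOVSX(32, 8, tempReg1, MDisp(srcReg, off + (2 - i)));
		CVTSI2SS(XMM3, R(tempReg1));  // converts into lane 0, upper lanes kept
		if (i != 2)
			PSLLDQ(XMM3, 4);
	}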
// WARNING not REX compatible
void XEmitter::PSRAW(X64Reg reg, int shift) {
if (reg > 7)

@@ -658,6 +658,8 @@ public:
void PSLLD(X64Reg reg, int shift);
void PSLLQ(X64Reg reg, int shift);
void PSLLDQ(X64Reg reg, int shift);
void PSRAW(X64Reg reg, int shift);
void PSRAD(X64Reg reg, int shift);

@@ -1394,7 +1394,6 @@ void GLES_GPU::ExecuteOpInternal(u32 op, u32 diff) {
if (num < 96 && newVal != ((const u32 *)gstate.boneMatrix)[num]) {
// Bone matrices should NOT flush when software skinning is enabled!
// TODO: Also check for morph...
// TODO: Can't turn this optimization on until we decode per drawcall when sw skinning.
if (!g_Config.bSoftwareSkinning) {
Flush();
shaderManager_->DirtyUniform(DIRTY_BONEMATRIX0 << (num / 12));

@@ -15,6 +15,8 @@
// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
#include "base/basictypes.h"
#include "Core/Config.h"
#include "Core/MemMap.h"
#include "GPU/ge_constants.h"
@@ -35,9 +37,13 @@ static const u8 nrmsize[4] = {0,3,6,12}, nrmalign[4] = {0,1,2,4};
static const u8 possize[4] = {0,3,6,12}, posalign[4] = {0,1,2,4};
static const u8 wtsize[4] = {0,1,2,4}, wtalign[4] = {0,1,2,4};
// When software skinning. This array is only used when non-jitted - when jitted, the matrix
// is kept in registers.
static float skinMatrix[12];
// We start out by converting the active matrices to 4x4, which are easier to multiply
// with SSE / NEON, and store them here.
static float bones[16 * 8];
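
A sketch of the non-jitted path the first comment refers to, assuming the C fallback accumulates the weighted bones into skinMatrix once per vertex (the helper name and the memset via <cstring> are illustrative):

	// Illustrative only: blend the 4x3 bone matrices by vertex weight.
	static void AccumulateSkinMatrix(const float *weights, int nweights) {
		memset(skinMatrix, 0, sizeof(skinMatrix));
		for (int j = 0; j < nweights; j++) {
			for (int k = 0; k < 12; k++)
				skinMatrix[k] += weights[j] * gstate.boneMatrix[j * 12 + k];
		}
	}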
inline int align(int n, int align) {
return (n + (align - 1)) & ~(align - 1);
@@ -930,6 +936,7 @@ struct JitLookup {
#ifdef ARM
static const float by128 = 1.0f / 128.0f;
static const float by256 = 1.0f / 256.0f;
static const float by32768 = 1.0f / 32768.0f;
using namespace ArmGen;
@@ -1373,8 +1380,18 @@ void VertexDecoderJitCache::Jit_PosFloat() {
using namespace Gen;
static const float MEMORY_ALIGNED16( by128[4] ) = {
1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
};
static const float MEMORY_ALIGNED16( by256[4] ) = {
1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256
};
static const float MEMORY_ALIGNED16( by32768[4] ) = {
1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
};
static const u32 MEMORY_ALIGNED16( threeMasks[4] ) = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0};
static const u32 MEMORY_ALIGNED16( aOne[4] ) = {0, 0, 0, 0x3F800000};
#ifdef _M_X64
#ifdef _WIN32
@@ -1420,6 +1437,10 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
{&VertexDecoder::Step_WeightsU8Skin, &VertexDecoderJitCache::Jit_WeightsU8Skin},
{&VertexDecoder::Step_WeightsU16Skin, &VertexDecoderJitCache::Jit_WeightsU16Skin},
{&VertexDecoder::Step_WeightsFloatSkin, &VertexDecoderJitCache::Jit_WeightsFloatSkin},
{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
@@ -1437,6 +1458,10 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
{&VertexDecoder::Step_NormalS8Skin, &VertexDecoderJitCache::Jit_NormalS8Skin},
{&VertexDecoder::Step_NormalS16Skin, &VertexDecoderJitCache::Jit_NormalS16Skin},
{&VertexDecoder::Step_NormalFloatSkin, &VertexDecoderJitCache::Jit_NormalFloatSkin},
{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
{&VertexDecoder::Step_Color4444, &VertexDecoderJitCache::Jit_Color4444},
{&VertexDecoder::Step_Color565, &VertexDecoderJitCache::Jit_Color565},
@@ -1449,6 +1474,10 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
{&VertexDecoder::Step_PosS8Skin, &VertexDecoderJitCache::Jit_PosS8Skin},
{&VertexDecoder::Step_PosS16Skin, &VertexDecoderJitCache::Jit_PosS16Skin},
{&VertexDecoder::Step_PosFloatSkin, &VertexDecoderJitCache::Jit_PosFloatSkin},
};
// TODO: This should probably be global...
@@ -1479,9 +1508,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
// Save XMM4-XMM7: the skinning path keeps the blended bone matrix in them
// across the whole decode loop, and XMM6/XMM7 are callee-saved in the Win64 ABI.
SUB(PTRBITS, R(ESP), Imm8(64));
MOVUPS(MDisp(ESP, 0), XMM4);
MOVUPS(MDisp(ESP, 16), XMM5);
MOVUPS(MDisp(ESP, 32), XMM6);
MOVUPS(MDisp(ESP, 48), XMM7);
bool prescaleStep = false;
// Look for prescaled texcoord steps
@@ -1493,6 +1524,28 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
}
}
// Add code to convert matrices to 4x4.
// Later we might want to do this when the matrices are loaded instead.
// This is mostly proof of concept.
int boneCount = 0;
if (dec.weighttype) {
for (int i = 0; i < 8; i++) {
MOVUPS(XMM0, M((void *)(gstate.boneMatrix + 12 * i)));
MOVUPS(XMM1, M((void *)(gstate.boneMatrix + 12 * i + 3)));
MOVUPS(XMM2, M((void *)(gstate.boneMatrix + 12 * i + 3 * 2)));
MOVUPS(XMM3, M((void *)(gstate.boneMatrix + 12 * i + 3 * 3)));
ANDPS(XMM0, M((void *)&threeMasks));
ANDPS(XMM1, M((void *)&threeMasks));
ANDPS(XMM2, M((void *)&threeMasks));
ANDPS(XMM3, M((void *)&threeMasks));
ORPS(XMM3, M((void *)&aOne));
MOVAPS(M((void *)(bones + 16 * i)), XMM0);
MOVAPS(M((void *)(bones + 16 * i + 4)), XMM1);
MOVAPS(M((void *)(bones + 16 * i + 8)), XMM2);
MOVAPS(M((void *)(bones + 16 * i + 12)), XMM3);
}
}
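
In scalar terms, the block above expands each 4x3 bone matrix to 4x4: every row becomes (x, y, z, 0), and the translation row gets w = 1 (aOne is 1.0f in the top lane). Each 4-float MOVUPS intentionally over-reads one float from the next row; the ANDPS with threeMasks discards it. Equivalent plain C++:

	// What the emitted conversion computes:
	for (int i = 0; i < 8; i++) {
		const float *src = gstate.boneMatrix + 12 * i;  // four rows of three floats
		float *dst = bones + 16 * i;                    // four rows of four floats
		for (int row = 0; row < 4; row++) {
			dst[row * 4 + 0] = src[row * 3 + 0];
			dst[row * 4 + 1] = src[row * 3 + 1];
			dst[row * 4 + 2] = src[row * 3 + 2];
			dst[row * 4 + 3] = (row == 3) ? 1.0f : 0.0f;
		}
	}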
// Keep the scale/offset in a few fp registers if we need it.
if (prescaleStep) {
#ifdef _M_X64
@@ -1529,9 +1582,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
SUB(32, R(counterReg), Imm8(1));
J_CC(CC_NZ, loopStart, true);
MOVUPS(XMM4, MDisp(ESP, 0));
MOVUPS(XMM5, MDisp(ESP, 16));
MOVUPS(XMM6, MDisp(ESP, 32));
MOVUPS(XMM7, MDisp(ESP, 48));
ADD(PTRBITS, R(ESP), Imm8(64));
#ifdef _M_IX86
// Restore register values
@@ -1584,6 +1639,118 @@ void VertexDecoderJitCache::Jit_WeightsFloat() {
}
}
void VertexDecoderJitCache::Jit_WeightsU8Skin() {
#ifdef _M_X64
MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones));
#else
MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones));
#endif
for (int j = 0; j < dec_->nweights; j++) {
MOVZX(32, 8, tempReg1, MDisp(srcReg, dec_->weightoff + j));
CVTSI2SS(XMM1, R(tempReg1));
MULSS(XMM1, M((void *)&by128));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
if (j == 0) {
MOVAPS(XMM4, MDisp(tempReg2, 0));
MOVAPS(XMM5, MDisp(tempReg2, 16));
MULPS(XMM4, R(XMM1));
MULPS(XMM5, R(XMM1));
MOVAPS(XMM6, MDisp(tempReg2, 32));
MOVAPS(XMM7, MDisp(tempReg2, 48));
MULPS(XMM6, R(XMM1));
MULPS(XMM7, R(XMM1));
} else {
MOVAPS(XMM2, MDisp(tempReg2, 0));
MOVAPS(XMM3, MDisp(tempReg2, 16));
MULPS(XMM2, R(XMM1));
MULPS(XMM3, R(XMM1));
ADDPS(XMM4, R(XMM2));
ADDPS(XMM5, R(XMM3));
MOVAPS(XMM2, MDisp(tempReg2, 32));
MOVAPS(XMM3, MDisp(tempReg2, 48));
MULPS(XMM2, R(XMM1));
MULPS(XMM3, R(XMM1));
ADDPS(XMM6, R(XMM2));
ADDPS(XMM7, R(XMM3));
}
ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
}
}
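
Net effect: the blended skin matrix ends up in XMM4-XMM7, one 4-float row per register. Jit_WeightsU16Skin and Jit_WeightsFloatSkin below are the same loop, differing only in the load and the scale (1/32768 for U16, none for float). A scalar sketch of the U8 case, using the same names as the jit code:

	// Accumulate weight-scaled 4x4 bone matrices; skin[0..15] mirrors XMM4-XMM7.
	float skin[16] = {};
	for (int j = 0; j < nweights; j++) {
		float w = src[weightoff + j] * (1.0f / 128.0f);
		for (int k = 0; k < 16; k++)
			skin[k] += w * bones[16 * j + k];
	}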
void VertexDecoderJitCache::Jit_WeightsU16Skin() {
#ifdef _M_X64
MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones));
#else
MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones));
#endif
for (int j = 0; j < dec_->nweights; j++) {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->weightoff + j * 2));
CVTSI2SS(XMM1, R(tempReg1));
MULSS(XMM1, M((void *)&by32768));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
if (j == 0) {
MOVAPS(XMM4, MDisp(tempReg2, 0));
MOVAPS(XMM5, MDisp(tempReg2, 16));
MULPS(XMM4, R(XMM1));
MULPS(XMM5, R(XMM1));
MOVAPS(XMM6, MDisp(tempReg2, 32));
MOVAPS(XMM7, MDisp(tempReg2, 48));
MULPS(XMM6, R(XMM1));
MULPS(XMM7, R(XMM1));
} else {
MOVAPS(XMM2, MDisp(tempReg2, 0));
MOVAPS(XMM3, MDisp(tempReg2, 16));
MULPS(XMM2, R(XMM1));
MULPS(XMM3, R(XMM1));
ADDPS(XMM4, R(XMM2));
ADDPS(XMM5, R(XMM3));
MOVAPS(XMM2, MDisp(tempReg2, 32));
MOVAPS(XMM3, MDisp(tempReg2, 48));
MULPS(XMM2, R(XMM1));
MULPS(XMM3, R(XMM1));
ADDPS(XMM6, R(XMM2));
ADDPS(XMM7, R(XMM3));
}
ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
}
}
void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
#ifdef _M_X64
MOV(PTRBITS, R(tempReg2), Imm64((uintptr_t)&bones));
#else
MOV(PTRBITS, R(tempReg2), Imm32((uintptr_t)&bones));
#endif
for (int j = 0; j < dec_->nweights; j++) {
MOVSS(XMM1, MDisp(srcReg, dec_->weightoff + j * 4));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
if (j == 0) {
MOVAPS(XMM4, MDisp(tempReg2, 0));
MOVAPS(XMM5, MDisp(tempReg2, 16));
MULPS(XMM4, R(XMM1));
MULPS(XMM5, R(XMM1));
MOVAPS(XMM6, MDisp(tempReg2, 32));
MOVAPS(XMM7, MDisp(tempReg2, 48));
MULPS(XMM6, R(XMM1));
MULPS(XMM7, R(XMM1));
} else {
MOVAPS(XMM2, MDisp(tempReg2, 0));
MOVAPS(XMM3, MDisp(tempReg2, 16));
MULPS(XMM2, R(XMM1));
MULPS(XMM3, R(XMM1));
ADDPS(XMM4, R(XMM2));
ADDPS(XMM5, R(XMM3));
MOVAPS(XMM2, MDisp(tempReg2, 32));
MOVAPS(XMM3, MDisp(tempReg2, 48));
MULPS(XMM2, R(XMM1));
MULPS(XMM3, R(XMM1));
ADDPS(XMM6, R(XMM2));
ADDPS(XMM7, R(XMM3));
}
ADD(PTRBITS, R(tempReg2), Imm8(4 * 16));
}
}
// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy.
void VertexDecoderJitCache::Jit_TcU8() {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
@@ -1816,6 +1983,77 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
}
void VertexDecoderJitCache::Jit_NormalS8Skin() {
XORPS(XMM3, R(XMM3));
for (int i = 0; i < 3; i++) {
MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->nrmoff + (2 - i)));
CVTSI2SS(XMM3, R(tempReg1));
if (i != 2) {
PSLLDQ(XMM3, 4);
}
}
MULPS(XMM3, M((void *)&by128));
MOVAPS(XMM1, R(XMM3));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, R(XMM4));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
MULPS(XMM2, R(XMM5));
ADDPS(XMM1, R(XMM2));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2));
MULPS(XMM2, R(XMM6));
ADDPS(XMM1, R(XMM2));
MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1);
}
// Load three S16 components, scale by 1/32768, then transform by the skin matrix.
void VertexDecoderJitCache::Jit_NormalS16Skin() {
XORPS(XMM3, R(XMM3));
for (int i = 0; i < 3; i++) {
MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->nrmoff + (2 - i) * 2));
CVTSI2SS(XMM3, R(tempReg1));
if (i != 2) {
PSLLDQ(XMM3, 4);
}
}
MULPS(XMM3, M((void *)&by32768));
MOVAPS(XMM1, R(XMM3));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, R(XMM4));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
MULPS(XMM2, R(XMM5));
ADDPS(XMM1, R(XMM2));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2));
MULPS(XMM2, R(XMM6));
ADDPS(XMM1, R(XMM2));
MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1);
}
void VertexDecoderJitCache::Jit_NormalFloatSkin() {
MOVUPS(XMM3, MDisp(srcReg, dec_->nrmoff));
MOVAPS(XMM1, R(XMM3));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, R(XMM4));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
MULPS(XMM2, R(XMM5));
ADDPS(XMM1, R(XMM2));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2));
MULPS(XMM2, R(XMM6));
ADDPS(XMM1, R(XMM2));
MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM1);
}
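
All three normal variants share the same tail: broadcast each input component with SHUFPS, multiply by the matching matrix row in XMM4-XMM6, and sum. The same computation written with SSE intrinsics (a sketch; m0-m2 stand for the rows kept in XMM4-XMM6):

	#include <xmmintrin.h>

	// Row vector times the upper three rows of the skin matrix; normals
	// ignore the translation row.
	static inline __m128 SkinTransform3(__m128 v, __m128 m0, __m128 m1, __m128 m2) {
		__m128 x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
		__m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
		return _mm_add_ps(_mm_add_ps(_mm_mul_ps(x, m0), _mm_mul_ps(y, m1)),
		                  _mm_mul_ps(z, m2));
	}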
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
// TODO: SIMD
@@ -1861,6 +2099,77 @@ void VertexDecoderJitCache::Jit_PosFloat() {
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
}
void VertexDecoderJitCache::Jit_PosS8Skin() {
XORPS(XMM3, R(XMM3));
for (int i = 0; i < 3; i++) {
MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + (2 - i)));
CVTSI2SS(XMM3, R(tempReg1));
if (i != 2) {
PSLLDQ(XMM3, 4);
}
}
MULPS(XMM3, M((void *)&by128));
MOVAPS(XMM1, R(XMM3));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, R(XMM4));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
MULPS(XMM2, R(XMM5));
ADDPS(XMM1, R(XMM2));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2));
MULPS(XMM2, R(XMM6));
ADDPS(XMM1, R(XMM2));
ADDPS(XMM1, R(XMM7));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1);
}
void VertexDecoderJitCache::Jit_PosS16Skin() {
XORPS(XMM3, R(XMM3));
for (int i = 0; i < 3; i++) {
MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + (2 - i) * 2));
CVTSI2SS(XMM3, R(tempReg1));
if (i != 2) {
PSLLDQ(XMM3, 4);
}
}
MULPS(XMM3, M((void *)&by32768));
MOVAPS(XMM1, R(XMM3));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, R(XMM4));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
MULPS(XMM2, R(XMM5));
ADDPS(XMM1, R(XMM2));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2));
MULPS(XMM2, R(XMM6));
ADDPS(XMM1, R(XMM2));
ADDPS(XMM1, R(XMM7));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1);
}
// Load the float position directly, then transform by the skin matrix.
void VertexDecoderJitCache::Jit_PosFloatSkin() {
MOVUPS(XMM3, MDisp(srcReg, dec_->posoff));
MOVAPS(XMM1, R(XMM3));
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
MULPS(XMM1, R(XMM4));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(1, 1, 1, 1));
MULPS(XMM2, R(XMM5));
ADDPS(XMM1, R(XMM2));
MOVAPS(XMM2, R(XMM3));
SHUFPS(XMM2, R(XMM2), _MM_SHUFFLE(2, 2, 2, 2));
MULPS(XMM2, R(XMM6));
ADDPS(XMM1, R(XMM2));
ADDPS(XMM1, R(XMM7));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM1);
}
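
The position variants add one extra ADDPS of XMM7, the translation row, which amounts to transforming (x, y, z, 1) instead of (x, y, z, 0). Building on the sketch above:

	// Point transform: the implicit w = 1 becomes an unconditional add of row 3.
	static inline __m128 SkinTransformPos(__m128 v, __m128 m0, __m128 m1,
	                                      __m128 m2, __m128 m3) {
		return _mm_add_ps(SkinTransform3(v, m0, m1, m2), m3);
	}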
#elif defined(PPC)
#error This should not be built for PowerPC, at least not yet.

@@ -199,6 +199,10 @@ public:
void Jit_WeightsU16();
void Jit_WeightsFloat();
void Jit_WeightsU8Skin();
void Jit_WeightsU16Skin();
void Jit_WeightsFloatSkin();
void Jit_TcU8();
void Jit_TcU16();
void Jit_TcFloat();
@@ -222,11 +226,19 @@ public:
void Jit_NormalS16();
void Jit_NormalFloat();
void Jit_NormalS8Skin();
void Jit_NormalS16Skin();
void Jit_NormalFloatSkin();
void Jit_PosS8();
void Jit_PosS16();
void Jit_PosFloat();
void Jit_PosS8Through();
void Jit_PosS16Through();
void Jit_PosS8Skin();
void Jit_PosS16Skin();
void Jit_PosFloatSkin();
private:
bool CompileStep(const VertexDecoder &dec, int i);