Merge pull request #7848 from unknownbrackets/vertdec

Improve arm64 vertex decoder
This commit is contained in:
Henrik Rydgård 2015-07-05 18:25:24 +02:00
commit cc59361408
2 changed files with 565 additions and 113 deletions

View file

@ -59,6 +59,7 @@ static const ARM64Reg neonUVScaleReg = D0;
static const ARM64Reg neonUVOffsetReg = D1;
// Scratch FP/SIMD registers for decoded source components, named as 32-bit (S),
// 64-bit (D) and 128-bit (Q) operands. Entry i of each table uses the same
// register number (2, 3, 8), so src[i], srcD[i] and srcQ[i] refer to one register.
static const ARM64Reg src[3] = {S2, S3, S8};
static const ARM64Reg srcD[3] = {D2, D3, D8};
static const ARM64Reg srcQ[3] = {Q2, Q3, Q8};
// Same register number as src[2] / srcD[2] / srcQ[2].
static const ARM64Reg srcNEON = Q8;
@ -438,8 +439,7 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
break;
case 7:
case 8:
fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0);
fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16);
fp.LDP(128, INDEX_SIGNED, neonWeightRegsQ[0], neonWeightRegsQ[1], srcReg, 0);
break;
}
Jit_ApplyWeights();
@ -447,12 +447,16 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() {
// Copies a 32-bit color from the source vertex to the decoded vertex, and clears
// fullAlphaReg when the color's top byte (alpha) is not 0xFF.
void VertexDecoderJitCache::Jit_Color8888() {
	LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff);

	// Set flags to determine if alpha != 0xFF.
	// tempReg2 = ~(color >> 24) with an arithmetic shift: zero only when the top byte is 0xFF.
	ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
	CMP(tempReg2, 0);

	// Keep fullAlphaReg when the inverse was 0, clear it otherwise.
	// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0;
	CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);

	// No unconditional clear of fullAlphaReg here: it would discard the CSEL result.
	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
}
void VertexDecoderJitCache::Jit_Color4444() {
@ -472,11 +476,13 @@ void VertexDecoderJitCache::Jit_Color4444() {
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off);
// TODO: Set flags to determine if alpha != 0xFF.
//MVNS(tempReg2, tempReg, ArithOption(tempReg1, ST_ASR, 24));
//FixupBranch skip = B(CC_EQ);
MOVI2R(fullAlphaReg, 0);
//SetJumpTarget(skip);
// Set flags to determine if alpha != 0xFF.
ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg2, 0);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}
void VertexDecoderJitCache::Jit_Color565() {
@ -526,40 +532,35 @@ void VertexDecoderJitCache::Jit_Color5551() {
ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg);
ORR(tempReg2, tempReg2, tempReg1);
// TODO: Set flags to determine if alpha != 0xFF.
//MVNS(tempReg3, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
// Set flags to determine if alpha != 0xFF.
ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24));
CMP(tempReg3, 0);
STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off);
//FixupBranch skip = B(CC_EQ);
MOVI2R(fullAlphaReg, 0);
//SetJumpTarget(skip);
// Clear fullAlphaReg when the inverse was not 0.
// fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1;
CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ);
}
// Copies the two 8-bit texture coordinates with a single 16-bit load.
void VertexDecoderJitCache::Jit_TcU8() {
	// LDURH takes an unscaled byte offset, so tcoff does not have to be 2-byte aligned.
	// The halfword is zero-extended, so the 32-bit store below writes zeroes above it.
	LDURH(tempReg1, srcReg, dec_->tcoff);
	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
}
// Copies the two 16-bit texture coordinates with a single 32-bit load and store.
void VertexDecoderJitCache::Jit_TcU16() {
	// LDUR takes an unscaled byte offset, so tcoff only needs the alignment of its components.
	LDUR(tempReg1, srcReg, dec_->tcoff);
	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
}
// Through-mode variant: copies the two 16-bit texture coordinates unchanged
// with a single 32-bit load and store.
void VertexDecoderJitCache::Jit_TcU16Through() {
	// LDUR takes an unscaled byte offset, so tcoff only needs the alignment of its components.
	LDUR(tempReg1, srcReg, dec_->tcoff);
	STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff);
}
// Through-mode variant: copies the two 32-bit texture coordinate floats unchanged
// using one load pair and one store pair.
void VertexDecoderJitCache::Jit_TcFloatThrough() {
	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
}
void VertexDecoderJitCache::Jit_TcU16Double() {
@ -579,60 +580,57 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() {
}
// Copies the two 32-bit texture coordinate floats using one load pair and one store pair.
void VertexDecoderJitCache::Jit_TcFloat() {
	LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff);
}
// Loads two unsigned 8-bit texture coordinates, converts them to floats, applies
// neonUVScaleReg and neonUVOffsetReg, and stores the two resulting floats.
void VertexDecoderJitCache::Jit_TcU8Prescale() {
	// Unscaled-offset forms (LDUR/STUR) are used so the offsets need not be multiples of the access size.
	fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
// Loads two unsigned 16-bit texture coordinates, converts them to floats, applies
// neonUVScaleReg and neonUVOffsetReg, and stores the two resulting floats.
void VertexDecoderJitCache::Jit_TcU16Prescale() {
	// Unscaled-offset forms (LDUR/STUR) are used so the offsets need not be multiples of the access size.
	fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff);
	fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit
	fp.UCVTF(32, neonScratchRegD, neonScratchRegD);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
// Loads two texture coordinate floats, applies neonUVScaleReg and neonUVOffsetReg,
// and stores the two resulting floats.
void VertexDecoderJitCache::Jit_TcFloatPrescale() {
	// Unscaled-offset forms (LDUR/STUR) are used so the offsets need not be multiples of 8.
	fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff);
	fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA
	fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg);
	fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff);
}
// Converts a signed 8-bit position to floats and stores it to the decoded vertex.
void VertexDecoderJitCache::Jit_PosS8() {
	// Jit_AnyS8ToFloat leaves all components in srcQ[0]; src[1] and src[2] are not filled.
	Jit_AnyS8ToFloat(dec_->posoff);
	// Stores the whole 128-bit register, i.e. 16 bytes starting at decFmt.posoff.
	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
}
// Converts a signed 16-bit position to floats and stores it to the decoded vertex.
void VertexDecoderJitCache::Jit_PosS16() {
	// Jit_AnyS16ToFloat leaves all components in srcQ[0]; src[1] and src[2] are not filled.
	Jit_AnyS16ToFloat(dec_->posoff);
	// Stores the whole 128-bit register, i.e. 16 bytes starting at decFmt.posoff.
	fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff);
}
// Copies the three position floats from the source vertex to the decoded vertex.
void VertexDecoderJitCache::Jit_PosFloat() {
	// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
	if ((dec_->posoff & 7) == 0 && (dec_->decFmt.posoff & 7) == 0) {
		// The 64-bit register pair form is only used when both offsets are multiples of 8.
		LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->posoff);
		STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.posoff);
	} else {
		// Otherwise copy exactly 12 bytes: a 32-bit pair plus one more 32-bit word.
		LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->posoff);
		STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.posoff);
		LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8);
		STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8);
	}
}
void VertexDecoderJitCache::Jit_PosS8Through() {
@ -648,22 +646,20 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
}
// Through-mode 16-bit position: converts X and Y as signed and Z as unsigned
// integers to floats (no scaling) and stores the three floats.
void VertexDecoderJitCache::Jit_PosS16Through() {
	// Start with X and Y (which is signed.)
	fp.LDUR(32, src[0], srcReg, dec_->posoff);
	fp.SXTL(16, srcD[0], src[0]);
	fp.SCVTF(32, srcD[0], srcD[0]);
	fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff);
	// Now load in Z (which is unsigned.)
	// LDRH zero-extends, so the signed conversion below still yields the unsigned value.
	LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4);
	fp.SCVTF(src[1], tempReg3);
	STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 8);
}
void VertexDecoderJitCache::Jit_NormalS8() {
LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);
LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 1);
LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff);
LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2);
ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8));
ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16));
STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff);
}
@ -671,21 +667,22 @@ void VertexDecoderJitCache::Jit_NormalS8() {
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
	// The raw bytes are copied without any conversion: X and Y come from one 32-bit load,
	// Z from a zero-extending halfword load, which supplies the two trailing zero bytes.
	LDUR(tempReg1, srcReg, dec_->nrmoff);
	LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4);
	STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff);
}
// Copies the three normal floats from the source vertex to the decoded vertex.
void VertexDecoderJitCache::Jit_NormalFloat() {
	// Only need to copy 12 bytes, but copying 16 should be okay (and is faster.)
	// The guard must test the offsets actually used below (nrmoff, not posoff): the
	// 64-bit register pair form is only valid when both are multiples of 8.
	if ((dec_->nrmoff & 7) == 0 && (dec_->decFmt.nrmoff & 7) == 0) {
		LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->nrmoff);
		STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.nrmoff);
	} else {
		// Otherwise copy exactly 12 bytes: a 32-bit pair plus one more 32-bit word.
		LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->nrmoff);
		STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff);
		LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8);
		STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8);
	}
}
void VertexDecoderJitCache::Jit_NormalS8Skin() {
@ -699,10 +696,7 @@ void VertexDecoderJitCache::Jit_NormalS16Skin() {
}
// Loads a float normal into srcQ[0] and writes it multiplied by the skinning matrix
// (without the translation term) to the decoded vertex.
void VertexDecoderJitCache::Jit_NormalFloatSkin() {
	// One 128-bit unscaled-offset load: 16 bytes are read starting at nrmoff.
	fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff);
	Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false);
}
@ -717,45 +711,30 @@ void VertexDecoderJitCache::Jit_PosS16Skin() {
}
// Loads a float position into srcQ[0] and writes it multiplied by the skinning matrix
// (including the translation term) to the decoded vertex.
void VertexDecoderJitCache::Jit_PosFloatSkin() {
	// One 128-bit unscaled-offset load: 16 bytes are read starting at posoff.
	fp.LDUR(128, srcQ[0], srcReg, dec_->posoff);
	Jit_WriteMatrixMul(dec_->decFmt.posoff, true);
}
// Loads signed 8-bit components starting at srcoff and leaves them in srcQ[0] as
// floats. The conversion uses 7 fractional bits, i.e. each value is divided by 128.
// Callers must read the result from srcQ[0]; src[1] and src[2] are not written.
void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {
	// Loads 4 bytes, so a fourth lane is converted along with the three components.
	fp.LDUR(32, src[0], srcReg, srcoff);
	fp.SXTL(8, srcD[0], src[0]);
	fp.SXTL(16, srcQ[0], srcD[0]);
	fp.SCVTF(32, srcQ[0], srcQ[0], 7);
}
// Loads signed 16-bit components starting at srcoff and leaves them in srcQ[0] as
// floats. The conversion uses 15 fractional bits, i.e. each value is divided by 32768.
// Callers must read the result from srcQ[0]; src[1] and src[2] are not written.
void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) {
	// Loads 8 bytes, so a fourth lane is converted along with the three components.
	fp.LDUR(64, src[0], srcReg, srcoff);
	fp.SXTL(16, srcQ[0], srcD[0]);
	fp.SCVTF(32, srcQ[0], srcQ[0], 15);
}
// Multiplies the vector held in lanes 0-2 of srcQ[0] by the matrix in Q4-Q6 and
// stores the result at dstReg + outOff.
//   outOff: byte offset of the output within the decoded vertex.
//   pos:    when true, Q7 is added to the result as well.
void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
	// Multiply with the matrix sitting in Q4-Q7.
	// All three components come from srcQ[0] (lanes 0, 1, 2); srcQ[1] and srcQ[2]
	// are not inputs and must not be accumulated.
	fp.FMUL(32, accNEON, Q4, srcQ[0], 0);
	fp.FMLA(32, accNEON, Q5, srcQ[0], 1);
	fp.FMLA(32, accNEON, Q6, srcQ[0], 2);
	if (pos) {
		fp.FADD(32, accNEON, accNEON, Q7);
	}
	// Stores the whole 128-bit accumulator, i.e. 16 bytes starting at outOff.
	fp.STUR(128, accNEON, dstReg, outOff);
}

View file

@ -20,6 +20,7 @@
#include "Core/Config.h"
#include "GPU/Common/VertexDecoderCommon.h"
#include "GPU/ge_constants.h"
#include "GPU/GPUState.h"
#include "unittest/TestVertexJit.h"
#include "unittest/UnitTest.h"
@ -29,7 +30,7 @@ class VertexDecoderTestHarness {
public:
VertexDecoderTestHarness()
: dec_(nullptr), needsReset_(true), dstPos_(0) {
: dec_(nullptr), needsReset_(true), dstPos_(0), assertFailed_(false) {
src_ = new u8[BUFFER_SIZE];
dst_ = new u8[BUFFER_SIZE];
cache_ = new VertexDecoderJitCache();
@ -108,6 +109,12 @@ public:
Add8(y);
Add8(z);
}
// Adds four bytes, in argument order, through the single-byte Add8 overload.
void Add8(u8 x, u8 y, u8 z, u8 w) {
	const u8 values[4] = {x, y, z, w};
	for (int idx = 0; idx < 4; ++idx) {
		Add8(values[idx]);
	}
}
void Add16(u16_le x) {
if (needsReset_) {
@ -148,19 +155,98 @@ public:
}
// Reads the next 16-bit value from the output buffer at the current read
// position and advances the position past it.
u16 Get16() {
	// Declared once only, as u16_le (the same type Add16 takes).
	u16_le result;
	memcpy(&result, dst_ + dstPos_, sizeof(result));
	dstPos_ += sizeof(result);
	return result;
}
float GetFloat() {
float result;
float_le result;
memcpy(&result, dst_ + dstPos_, sizeof(result));
dstPos_ += sizeof(result);
return result;
}
void Assert8(const char *title, u8 x, u8 y) {
u8 resx = Get8();
u8 resy = Get8();
if (resx != x || resy != y) {
assertFailed_ = true;
printf("%s: Failed %d, %d != expected %d, %d\n", title, resx, resy, x, y);
}
}
void Assert8(const char *title, u8 x, u8 y, u8 z) {
u8 resx = Get8();
u8 resy = Get8();
u8 resz = Get8();
if (resx != x || resy != y || resz != z) {
assertFailed_ = true;
printf("%s: Failed %d, %d, %d != expected %d, %d, %d\n", title, resx, resy, resz, x, y, z);
}
}
void Assert8(const char *title, u8 x, u8 y, u8 z, u8 w) {
u8 resx = Get8();
u8 resy = Get8();
u8 resz = Get8();
u8 resw = Get8();
if (resx != x || resy != y || resz != z || resw != w) {
assertFailed_ = true;
printf("%s: Failed %d, %d, %d, %d != expected %d, %d, %d, %d\n", title, resx, resy, resz, resw, x, y, z, w);
}
}
void Assert16(const char *title, u16 x, u16 y) {
u16 resx = Get16();
u16 resy = Get16();
if (resx != x || resy != y) {
assertFailed_ = true;
printf("%s: Failed %d, %d != expected %d, %d\n", title, resx, resy, x, y);
}
}
void Assert16(const char *title, u16 x, u16 y, u16 z) {
u16 resx = Get16();
u16 resy = Get16();
u16 resz = Get16();
if (resx != x || resy != y || resz != z) {
assertFailed_ = true;
printf("%s: Failed %d, %d, %d != expected %d, %d, %d\n", title, resx, resy, resz, x, y, z);
}
}
// Returns true when a and b are equal within a small tolerance.
// The tolerance is absolute (2e-7) for magnitudes up to 1 and relative above that,
// which is a little under two float ulps just above 1.0. NaN never compares equal.
// A plain distance check is used rather than rounding both values to a 1e-7 grid
// and comparing: that rejects neighbouring floats on either side of a grid line and
// collapses to exact equality once the grid is finer than the float spacing.
bool CompareFloat(float a, float b) {
	if (a == b) {
		// Covers exact matches, including equal infinities.
		return true;
	}
	const float kTolerance = 0.0000002f;
	const float scale = fmaxf(1.0f, fmaxf(fabsf(a), fabsf(b)));
	return fabsf(a - b) <= kTolerance * scale;
}
// Reads the next output float and records a failure (with a message prefixed by
// title) unless CompareFloat accepts it against x.
void AssertFloat(const char *title, float x) {
	const float actualX = GetFloat();
	if (CompareFloat(actualX, x)) {
		return;
	}
	printf("%s: Failed %f != expected %f\n", title, actualX, x);
	assertFailed_ = true;
}
// Reads the next two output floats and records a failure (with a message prefixed
// by title) unless CompareFloat accepts both against x and y.
void AssertFloat(const char *title, float x, float y) {
	const float actualX = GetFloat();
	const float actualY = GetFloat();
	const bool matches = CompareFloat(actualX, x) && CompareFloat(actualY, y);
	if (matches) {
		return;
	}
	printf("%s: Failed %f, %f != expected %f, %f\n", title, actualX, actualY, x, y);
	assertFailed_ = true;
}
// Reads the next three output floats and records a failure (with a message prefixed
// by title) unless CompareFloat accepts all of them against x, y and z.
void AssertFloat(const char *title, float x, float y, float z) {
	const float actualX = GetFloat();
	const float actualY = GetFloat();
	const float actualZ = GetFloat();
	const bool matches = CompareFloat(actualX, x) && CompareFloat(actualY, y) && CompareFloat(actualZ, z);
	if (matches) {
		return;
	}
	printf("%s: Failed %f, %f, %f != expected %f, %f, %f\n", title, actualX, actualY, actualZ, x, y, z);
	assertFailed_ = true;
}
// Advances the output read position by c bytes without checking them.
void Skip(u32 c) {
	dstPos_ = dstPos_ + c;
}
// Returns the start of the output buffer (dst_).
void *GetData() {
	void *const outputStart = dst_;
	return outputStart;
}
@ -172,6 +258,10 @@ public:
return 0;
}
// Returns true once any Assert* call on this harness has recorded a mismatch.
bool HasFailed() {
	const bool anyFailure = assertFailed_;
	return anyFailure;
}
private:
void SetupExecute(int vtype, bool useJit) {
if (dec_ != nullptr) {
@ -179,6 +269,7 @@ private:
}
dec_ = new VertexDecoder();
dec_->SetVertexType(vtype, options_, useJit ? cache_ : nullptr);
dstPos_ = 0;
needsReset_ = true;
}
@ -193,19 +284,401 @@ private:
bool needsReset_;
size_t srcPos_;
size_t dstPos_;
bool assertFailed_;
};
// Decodes one vertex with 8-bit texcoord, normal and position, once per decoder
// path (third Execute argument false, then true), and checks the decoded output.
static bool TestVertex8() {
	const int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_TC_8BIT;

	VertexDecoderTestHarness dec;
	dec.Add8(127, 128);
	dec.Add8(127, 0, 128);
	dec.Add8(127, 0, 128);

	for (int pass = 0; pass < 2; ++pass) {
		const bool useJit = pass == 1;
		dec.Execute(vtype, 0, useJit);
		dec.Assert8("TestVertex8-TC", 127, 128);
		dec.Skip(2);
		dec.Assert8("TestVertex8-Nrm", 127, 0, 128);
		dec.Skip(1);
		// 127 and 128 as signed bytes divided by 128.
		dec.AssertFloat("TestVertex8-Pos", 127.0f / 128.0f, 0.0f, -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Decodes one vertex with 16-bit texcoord, normal and position, once per decoder
// path (third Execute argument false, then true), and checks the decoded output.
static bool TestVertex16() {
	const int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_TC_16BIT;

	VertexDecoderTestHarness dec;
	dec.Add16(32767, 32768);
	dec.Add16(32767, 0, 32768);
	dec.Add16(32767, 0, 32768);

	for (int pass = 0; pass < 2; ++pass) {
		const bool useJit = pass == 1;
		dec.Execute(vtype, 0, useJit);
		dec.Assert16("TestVertex16-TC", 32767, 32768);
		dec.Assert16("TestVertex16-Nrm", 32767, 0, 32768);
		dec.Skip(2);
		// 32767 and 32768 as signed 16-bit values divided by 32768.
		dec.AssertFloat("TestVertex16-Pos", 32767.0f / 32768.0f, 0.0f, -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Decodes one vertex with float texcoord, normal and position, once per decoder
// path (third Execute argument false, then true), and expects the values unchanged.
static bool TestVertexFloat() {
	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_TC_FLOAT;

	VertexDecoderTestHarness dec;
	dec.AddFloat(1.0f, -1.0f);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	dec.AddFloat(1.0f, 0.5f, -1.0f);

	for (int pass = 0; pass < 2; ++pass) {
		const bool useJit = pass == 1;
		dec.Execute(vtype, 0, useJit);
		dec.AssertFloat("TestVertexFloat-TC", 1.0f, -1.0f);
		dec.AssertFloat("TestVertexFloat-Nrm", 1.0f, 0.5f, -1.0f);
		dec.AssertFloat("TestVertexFloat-Pos", 1.0f, 0.5f, -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Through-mode variant of the 8-bit test: checks texcoord and normal bytes for
// both decoder paths. The position is deliberately not checked.
static bool TestVertex8Through() {
	const int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_TC_8BIT | GE_VTYPE_THROUGH;

	VertexDecoderTestHarness dec;
	dec.Add8(127, 128);
	dec.Add8(127, 0, 128);
	dec.Add8(127, 0, 128);

	for (int pass = 0; pass < 2; ++pass) {
		const bool useJit = pass == 1;
		dec.Execute(vtype, 0, useJit);
		dec.Assert8("TestVertex8Through-TC", 127, 128);
		dec.Skip(2);
		dec.Assert8("TestVertex8Through-Nrm", 127, 0, 128);
		// Ignoring Pos since s8 through isn't really an option.
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Through-mode variant of the 16-bit test: texcoord and normal are expected
// unchanged, the position as unscaled floats with Z read as unsigned (32768).
static bool TestVertex16Through() {
	const int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_TC_16BIT | GE_VTYPE_THROUGH;

	VertexDecoderTestHarness dec;
	dec.Add16(32767, 32768);
	dec.Add16(32767, 0, 32768);
	dec.Add16(32767, 0, 32768);

	for (int pass = 0; pass < 2; ++pass) {
		const bool useJit = pass == 1;
		dec.Execute(vtype, 0, useJit);
		dec.Assert16("TestVertex16Through-TC", 32767, 32768);
		dec.Assert16("TestVertex16Through-Nrm", 32767, 0, 32768);
		dec.Skip(2);
		dec.AssertFloat("TestVertex16Through-Pos", 32767.0f, 0.0f, 32768.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Through-mode variant of the float test: all values are expected unchanged
// for both decoder paths.
static bool TestVertexFloatThrough() {
	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_TC_FLOAT | GE_VTYPE_THROUGH;

	VertexDecoderTestHarness dec;
	dec.AddFloat(1.0f, -1.0f);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	dec.AddFloat(1.0f, 0.5f, -1.0f);

	for (int pass = 0; pass < 2; ++pass) {
		const bool useJit = pass == 1;
		dec.Execute(vtype, 0, useJit);
		dec.AssertFloat("TestVertexFloatThrough-TC", 1.0f, -1.0f);
		dec.AssertFloat("TestVertexFloatThrough-Nrm", 1.0f, 0.5f, -1.0f);
		dec.AssertFloat("TestVertexFloatThrough-Pos", 1.0f, 0.5f, -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Checks 8888 color decoding for both decoder paths, including the full-alpha
// tracking: gstate_c.vertexFullAlpha must be cleared by a color whose alpha is
// not 255 and must be left set by an all-255 color.
static bool TestVertexColor8888() {
	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_8888;
	VertexDecoderTestHarness dec;
	bool alphaFlagWrong = false;

	// Alpha byte is 4: the flag has to be cleared.
	dec.Add8(1, 2, 3, 4);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor8888-Col", 1, 2, 3, 4);
		dec.AssertFloat("TestVertexColor8888-Pos", 1.0f, 0.5f, -1.0f);

		if (gstate_c.vertexFullAlpha) {
			printf("TestVertexColor8888: failed to clear vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	// Alpha byte is 255: the flag has to survive.
	dec.Add8(255, 255, 255, 255);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor8888-Col", 255, 255, 255, 255);
		dec.AssertFloat("TestVertexColor8888-Pos", 1.0f, 0.5f, -1.0f);

		if (!gstate_c.vertexFullAlpha) {
			printf("TestVertexColor8888: cleared vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	return !(dec.HasFailed() || alphaFlagWrong);
}
// Checks 4444 color decoding for both decoder paths, including the full-alpha
// tracking: gstate_c.vertexFullAlpha must be cleared by 0x1234 and must be left
// set by 0xFFFF.
static bool TestVertexColor4444() {
	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_4444;
	VertexDecoderTestHarness dec;
	bool alphaFlagWrong = false;

	// Each 4-bit channel is expected replicated into both nibbles of its byte.
	dec.Add16(0x1234, 0);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor4444-Col", 0x44, 0x33, 0x22, 0x11);
		dec.AssertFloat("TestVertexColor4444-Pos", 1.0f, 0.5f, -1.0f);

		if (gstate_c.vertexFullAlpha) {
			printf("TestVertexColor4444: failed to clear vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	dec.Add16(0xFFFF, 0);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor4444-Col", 255, 255, 255, 255);
		dec.AssertFloat("TestVertexColor4444-Pos", 1.0f, 0.5f, -1.0f);

		if (!gstate_c.vertexFullAlpha) {
			printf("TestVertexColor4444: cleared vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	return !(dec.HasFailed() || alphaFlagWrong);
}
// Checks 5551 color decoding for both decoder paths, including the full-alpha
// tracking: gstate_c.vertexFullAlpha must be cleared when the alpha bit is 0 and
// must be left set by 0xFFFF.
static bool TestVertexColor5551() {
	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_5551;
	VertexDecoderTestHarness dec;
	bool alphaFlagWrong = false;

	dec.Add16((0 << 15) | (1 << 10) | (2 << 5) | 3, 0);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor5551-Col", 0x18, 0x10, 0x8, 0x0);
		dec.AssertFloat("TestVertexColor5551-Pos", 1.0f, 0.5f, -1.0f);

		if (gstate_c.vertexFullAlpha) {
			printf("TestVertexColor5551: failed to clear vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	dec.Add16(0xFFFF, 0);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor5551-Col", 255, 255, 255, 255);
		dec.AssertFloat("TestVertexColor5551-Pos", 1.0f, 0.5f, -1.0f);

		if (!gstate_c.vertexFullAlpha) {
			printf("TestVertexColor5551: cleared vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	return !(dec.HasFailed() || alphaFlagWrong);
}
// Checks 565 color decoding for both decoder paths. The decoded alpha is expected
// to be 255, so gstate_c.vertexFullAlpha must stay set.
static bool TestVertexColor565() {
	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_565;
	VertexDecoderTestHarness dec;
	bool alphaFlagWrong = false;

	dec.Add16((1 << 11) | (2 << 5) | 3, 0);
	dec.AddFloat(1.0f, 0.5f, -1.0f);
	for (int pass = 0; pass < 2; ++pass) {
		gstate_c.vertexFullAlpha = true;
		dec.Execute(vtype, 0, pass == 1);
		dec.Assert8("TestVertexColor565-Col", 0x18, 0x8, 0x8, 255);
		dec.AssertFloat("TestVertexColor565-Pos", 1.0f, 0.5f, -1.0f);

		if (!gstate_c.vertexFullAlpha) {
			printf("TestVertexColor565: cleared vertexFullAlpha\n");
			alphaFlagWrong = true;
		}
	}

	return !(dec.HasFailed() || alphaFlagWrong);
}
// Software-skinning test with 8-bit weights, normal and position, run for both
// decoder paths. Enables g_Config.bSoftwareSkinning and overwrites
// gstate.boneMatrix; neither is restored afterwards.
static bool TestVertex8Skin() {
	VertexDecoderTestHarness dec;

	g_Config.bSoftwareSkinning = true;

	// Clear all bone matrices, then set three entries in each of the first two.
	const int boneMatrixFloats = 8 * 12;
	for (int idx = 0; idx < boneMatrixFloats; ++idx) {
		gstate.boneMatrix[idx] = 0.0f;
	}
	gstate.boneMatrix[0] = 2.0f;
	gstate.boneMatrix[4] = 1.0f;
	gstate.boneMatrix[8] = 5.0f;
	gstate.boneMatrix[12] = 1.0f;
	gstate.boneMatrix[16] = 2.0f;
	gstate.boneMatrix[20] = 5.0f;

	const int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_WEIGHT_8BIT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT);

	// Weights of 1.5 and 0.5 (192 / 128 and 64 / 128), then normal and position.
	dec.Add8(128 + 64, 128 - 64);
	dec.Add8(127, 0, 128);
	dec.Add8(127, 0, 128);

	for (int pass = 0; pass < 2; ++pass) {
		dec.Execute(vtype, 0, pass == 1);
		dec.AssertFloat("TestVertex8Skin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 127.0f / 128.0f, 0.0f, 2.0f * 5.0f * -1.0f);
		dec.AssertFloat("TestVertex8Skin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 127.0f / 128.0f, 0.0f, 2.0f * 5.0f * -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Software-skinning test with 16-bit weights, normal and position, run for both
// decoder paths. Enables g_Config.bSoftwareSkinning and overwrites
// gstate.boneMatrix; neither is restored afterwards.
static bool TestVertex16Skin() {
	VertexDecoderTestHarness dec;

	g_Config.bSoftwareSkinning = true;

	// Clear all bone matrices, then set three entries in each of the first two.
	const int boneMatrixFloats = 8 * 12;
	for (int idx = 0; idx < boneMatrixFloats; ++idx) {
		gstate.boneMatrix[idx] = 0.0f;
	}
	gstate.boneMatrix[0] = 2.0f;
	gstate.boneMatrix[4] = 1.0f;
	gstate.boneMatrix[8] = 5.0f;
	gstate.boneMatrix[12] = 1.0f;
	gstate.boneMatrix[16] = 2.0f;
	gstate.boneMatrix[20] = 5.0f;

	const int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_WEIGHT_16BIT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT);

	// Weights of 1.5 and 0.5 (49152 / 32768 and 16384 / 32768), then normal and position.
	dec.Add16(32768 + 16384, 32768 - 16384);
	dec.Add16(32767, 0, 32768);
	dec.Add16(32767, 0, 32768);

	for (int pass = 0; pass < 2; ++pass) {
		dec.Execute(vtype, 0, pass == 1);
		dec.AssertFloat("TestVertex16Skin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 32767.0f / 32768.0f, 0.0f, 2.0f * 5.0f * -1.0f);
		dec.AssertFloat("TestVertex16Skin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 32767.0f / 32768.0f, 0.0f, 2.0f * 5.0f * -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// Software-skinning test with float weights, normal and position, run for both
// decoder paths. Enables g_Config.bSoftwareSkinning and overwrites
// gstate.boneMatrix; neither is restored afterwards.
static bool TestVertexFloatSkin() {
	VertexDecoderTestHarness dec;

	g_Config.bSoftwareSkinning = true;

	// Clear all bone matrices, then set three entries in each of the first two.
	const int boneMatrixFloats = 8 * 12;
	for (int idx = 0; idx < boneMatrixFloats; ++idx) {
		gstate.boneMatrix[idx] = 0.0f;
	}
	gstate.boneMatrix[0] = 2.0f;
	gstate.boneMatrix[4] = 1.0f;
	gstate.boneMatrix[8] = 5.0f;
	gstate.boneMatrix[12] = 1.0f;
	gstate.boneMatrix[16] = 2.0f;
	gstate.boneMatrix[20] = 5.0f;

	const int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_WEIGHT_FLOAT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT);

	// Two weights, then normal and position.
	dec.AddFloat(1.5f, 0.5f);
	dec.AddFloat(1.0f, 0, -1.0f);
	dec.AddFloat(1.0f, 0, -1.0f);

	for (int pass = 0; pass < 2; ++pass) {
		dec.Execute(vtype, 0, pass == 1);
		dec.AssertFloat("TestVertexFloatSkin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 1.0f, 0.0f, 2.0f * 5.0f * -1.0f);
		dec.AssertFloat("TestVertexFloatSkin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 1.0f, 0.0f, 2.0f * 5.0f * -1.0f);
	}

	const bool ok = !dec.HasFailed();
	return ok;
}
// TODO: Morph (col, pos, nrm), weights (no skin), morph + weights?
// Signature shared by the vertex decoder tests: no arguments, returns true on success.
using VertexTestFunc = bool (*)();

// Vertex decoder tests run by TestVertexJit(), in this order.
static VertexTestFunc vertdecTestFuncs[] = {
	TestVertex8,
	TestVertex16,
	TestVertexFloat,
	TestVertex8Through,
	TestVertex16Through,
	TestVertexFloatThrough,
	TestVertexColor8888,
	TestVertexColor4444,
	TestVertexColor5551,
	TestVertexColor565,
	TestVertex8Skin,
	TestVertex16Skin,
	TestVertexFloatSkin,
};
// Entry point for the vertex decoder tests.
// First times the decoder with and without the jit on 8-bit positions and prints
// the figures (informational only), then runs every function in vertdecTestFuncs.
// Returns true only when all of those functions return true.
bool TestVertexJit() {
	VertexDecoderTestHarness dec;

	// Alternative timing inputs, kept for manual experiments.
	/*for (int i = 0; i < 100; ++i) {
		dec.AddFloat(0.5f, 1.0f, -1.0f);
	}
	int vtype = GE_VTYPE_POS_FLOAT;*/
	/*for (int i = 0; i < 100; ++i) {
		dec.Add16(32767, 0, 32768);
	}
	int vtype = GE_VTYPE_POS_16BIT;*/
	for (int i = 0; i < 100; ++i) {
		dec.Add8(127, 0, 128);
	}
	int vtype = GE_VTYPE_POS_8BIT;

	double yesJit = dec.ExecuteTimed(vtype, 100, true);
	double noJit = dec.ExecuteTimed(vtype, 100, false);

	// Read the floats into locals first: each GetFloat() advances the read position,
	// and the evaluation order of function arguments is unspecified.
	float x = dec.GetFloat();
	float y = dec.GetFloat();
	float z = dec.GetFloat();
	printf("Result: %f, %f, %f\n", x, y, z);
	printf("Jit was %fx faster than steps.\n\n", yesJit / noJit);

	// Run every test even after a failure so that all mismatches get printed.
	bool pass = true;
	for (size_t i = 0; i < ARRAY_SIZE(vertdecTestFuncs); ++i) {
		if (!vertdecTestFuncs[i]()) {
			pass = false;
		}
	}
	return pass;
}