diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index a7c447df91..fa2e05677c 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -59,6 +59,7 @@ static const ARM64Reg neonUVScaleReg = D0; static const ARM64Reg neonUVOffsetReg = D1; static const ARM64Reg src[3] = {S2, S3, S8}; +static const ARM64Reg srcD[3] = {D2, D3, D8}; static const ARM64Reg srcQ[3] = {Q2, Q3, Q8}; static const ARM64Reg srcNEON = Q8; @@ -438,8 +439,7 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() { break; case 7: case 8: - fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0); - fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16); + fp.LDP(128, INDEX_SIGNED, neonWeightRegsQ[0], neonWeightRegsQ[1], srcReg, 0); break; } Jit_ApplyWeights(); @@ -447,12 +447,16 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() { void VertexDecoderJitCache::Jit_Color8888() { LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff); - // TODO: Set flags to determine if alpha != 0xFF. - // ANDSI2R(tempReg2, tempReg1, 0xFF000000); + + // Set flags to determine if alpha != 0xFF. + ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + CMP(tempReg2, 0); + + // Clear fullAlphaReg when the inverse was not 0. + // fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1; + CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); + STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); - // FixupBranch skip = B(CC_NZ); - MOVI2R(fullAlphaReg, 0); - // SetJumpTarget(skip); } void VertexDecoderJitCache::Jit_Color4444() { @@ -472,11 +476,13 @@ void VertexDecoderJitCache::Jit_Color4444() { STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); - // TODO: Set flags to determine if alpha != 0xFF. - //MVNS(tempReg2, tempReg, ArithOption(tempReg1, ST_ASR, 24)); - //FixupBranch skip = B(CC_EQ); - MOVI2R(fullAlphaReg, 0); - //SetJumpTarget(skip); + // Set flags to determine if alpha != 0xFF. 
+ ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + CMP(tempReg2, 0); + + // Clear fullAlphaReg when the inverse was not 0. + // fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1; + CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); } void VertexDecoderJitCache::Jit_Color565() { @@ -526,40 +532,35 @@ void VertexDecoderJitCache::Jit_Color5551() { ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg); ORR(tempReg2, tempReg2, tempReg1); - // TODO: Set flags to determine if alpha != 0xFF. - //MVNS(tempReg3, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + // Set flags to determine if alpha != 0xFF. + ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + CMP(tempReg3, 0); + STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off); - //FixupBranch skip = B(CC_EQ); - MOVI2R(fullAlphaReg, 0); - //SetJumpTarget(skip); + + // Clear fullAlphaReg when the inverse was not 0. + // fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1; + CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); } void VertexDecoderJitCache::Jit_TcU8() { - LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 1); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8)); + LDURH(tempReg1, srcReg, dec_->tcoff); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16() { - LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16)); + LDUR(tempReg1, srcReg, dec_->tcoff); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16Through() { - LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16)); + LDUR(tempReg1, srcReg, dec_->tcoff); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); } 
void VertexDecoderJitCache::Jit_TcFloatThrough() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4); + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16Double() { @@ -579,60 +580,57 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { } void VertexDecoderJitCache::Jit_TcFloat() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4); + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU8Prescale() { - fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, dec_->tcoff); + fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff); fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit fp.UCVTF(32, neonScratchRegD, neonScratchRegD); fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); - fp.STR(64, INDEX_UNSIGNED, neonScratchRegD, dstReg, dec_->decFmt.uvoff); + fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16Prescale() { - fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, dec_->tcoff); + fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff); fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit fp.UCVTF(32, neonScratchRegD, neonScratchRegD); fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA fp.FADD(32, neonScratchRegD, neonScratchRegD, 
neonUVOffsetReg); - fp.STR(64, INDEX_UNSIGNED, neonScratchRegD, dstReg, dec_->decFmt.uvoff); + fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcFloatPrescale() { - fp.LDR(64, INDEX_UNSIGNED, neonScratchRegD, srcReg, dec_->tcoff); + fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff); fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); - fp.STR(64, INDEX_UNSIGNED, neonScratchRegD, dstReg, dec_->decFmt.uvoff); + fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_PosS8() { Jit_AnyS8ToFloat(dec_->posoff); - STR(INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8); + fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff); } void VertexDecoderJitCache::Jit_PosS16() { Jit_AnyS16ToFloat(dec_->posoff); - STR(INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8); + fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff); } -// Just copy 12 bytes. void VertexDecoderJitCache::Jit_PosFloat() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 4); - LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8); + // Only need to copy 12 bytes, but copying 16 should be okay (and is faster.) 
+ if ((dec_->posoff & 7) == 0 && (dec_->decFmt.posoff & 7) == 0) { + LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->posoff); + STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.posoff); + } else { + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->posoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.posoff); + LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8); + STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8); + } } void VertexDecoderJitCache::Jit_PosS8Through() { @@ -648,22 +646,20 @@ void VertexDecoderJitCache::Jit_PosS8Through() { } void VertexDecoderJitCache::Jit_PosS16Through() { - LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff); - LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 2); + // Start with X and Y (which is signed.) + fp.LDUR(32, src[0], srcReg, dec_->posoff); + fp.SXTL(16, srcD[0], src[0]); + fp.SCVTF(32, srcD[0], srcD[0]); + fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff); + // Now load in Z (which is unsigned.) 
LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4); - fp.SCVTF(fpScratchReg, tempReg1); - fp.SCVTF(fpScratchReg2, tempReg2); - fp.SCVTF(fpScratchReg3, tempReg3); - STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8); + fp.SCVTF(src[1], tempReg3); + STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 8); } void VertexDecoderJitCache::Jit_NormalS8() { - LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); - LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 1); + LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8)); ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16)); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff); } @@ -671,21 +667,22 @@ void VertexDecoderJitCache::Jit_NormalS8() { // Copy 6 bytes and then 2 zeroes. void VertexDecoderJitCache::Jit_NormalS16() { // NOTE: Not LDRH, we just copy the raw bytes here. 
- LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); - LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 2); - LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 4); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16)); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff); - STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 4); + LDUR(tempReg1, srcReg, dec_->nrmoff); + LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff); } void VertexDecoderJitCache::Jit_NormalFloat() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4); - LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.nrmoff + 4); - STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8); + // Only need to copy 12 bytes, but copying 16 should be okay (and is faster.) 
+ if ((dec_->nrmoff & 7) == 0 && (dec_->decFmt.nrmoff & 7) == 0) { + LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->nrmoff); + STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.nrmoff); + } else { + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->nrmoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff); + LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8); + STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8); + } } void VertexDecoderJitCache::Jit_NormalS8Skin() { @@ -699,10 +696,7 @@ void VertexDecoderJitCache::Jit_NormalS16Skin() { } void VertexDecoderJitCache::Jit_NormalFloatSkin() { - // fp.LDR(128, INDEX_UNSIGNED, srcNEON, srcReg, dec_->nrmoff); - LDR(INDEX_UNSIGNED, src[0], srcReg, dec_->nrmoff); - LDR(INDEX_UNSIGNED, src[1], srcReg, dec_->nrmoff + 4); - LDR(INDEX_UNSIGNED, src[2], srcReg, dec_->nrmoff + 8); + fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff); Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); } @@ -717,45 +711,30 @@ void VertexDecoderJitCache::Jit_PosS16Skin() { } void VertexDecoderJitCache::Jit_PosFloatSkin() { - //fp.LDR(128, INDEX_UNSIGNED, srcNEON, srcReg, dec_->posoff); - LDR(INDEX_UNSIGNED, src[0], srcReg, dec_->posoff); - LDR(INDEX_UNSIGNED, src[1], srcReg, dec_->posoff + 4); - LDR(INDEX_UNSIGNED, src[2], srcReg, dec_->posoff + 8); + fp.LDUR(128, srcQ[0], srcReg, dec_->posoff); Jit_WriteMatrixMul(dec_->decFmt.posoff, true); } void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) { - // TODO: NEONize. In that case we'll leave all three floats in one register instead, so callers must change too. 
- LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, srcoff); - LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 1); - LDRSB(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 2); - fp.SCVTF(src[0], tempReg1, 7); - fp.SCVTF(src[1], tempReg2, 7); - fp.SCVTF(src[2], tempReg3, 7); + fp.LDUR(32, src[0], srcReg, srcoff); + fp.SXTL(8, srcD[0], src[0]); + fp.SXTL(16, srcQ[0], srcD[0]); + fp.SCVTF(32, srcQ[0], srcQ[0], 7); } void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) { - // TODO: NEONize. In that case we'll leave all three floats in one register instead, so callers must change too. - LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, srcoff); - LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 2); - LDRSH(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 4); - fp.SCVTF(src[0], tempReg1, 15); - fp.SCVTF(src[1], tempReg2, 15); - fp.SCVTF(src[2], tempReg3, 15); + fp.LDUR(64, src[0], srcReg, srcoff); + fp.SXTL(16, srcQ[0], srcD[0]); + fp.SCVTF(32, srcQ[0], srcQ[0], 15); } void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { // Multiply with the matrix sitting in Q4-Q7. fp.FMUL(32, accNEON, Q4, srcQ[0], 0); - fp.FMLA(32, accNEON, Q5, srcQ[1], 0); - fp.FMLA(32, accNEON, Q6, srcQ[2], 0); + fp.FMLA(32, accNEON, Q5, srcQ[0], 1); + fp.FMLA(32, accNEON, Q6, srcQ[0], 2); if (pos) { fp.FADD(32, accNEON, accNEON, Q7); } - // Ugly store operation. 
- fp.STR(32, INDEX_UNSIGNED, accNEON, dstReg, outOff); - fp.INS(32, accNEON, 0, accNEON, 1); - fp.STR(32, INDEX_UNSIGNED, accNEON, dstReg, outOff + 4); - fp.INS(32, accNEON, 0, accNEON, 2); - fp.STR(32, INDEX_UNSIGNED, accNEON, dstReg, outOff + 8); + fp.STUR(128, accNEON, dstReg, outOff); } diff --git a/unittest/TestVertexJit.cpp b/unittest/TestVertexJit.cpp index 3b773c94fa..79012974fd 100644 --- a/unittest/TestVertexJit.cpp +++ b/unittest/TestVertexJit.cpp @@ -20,6 +20,7 @@ #include "Core/Config.h" #include "GPU/Common/VertexDecoderCommon.h" #include "GPU/ge_constants.h" +#include "GPU/GPUState.h" #include "unittest/TestVertexJit.h" #include "unittest/UnitTest.h" @@ -29,7 +30,7 @@ class VertexDecoderTestHarness { public: VertexDecoderTestHarness() - : dec_(nullptr), needsReset_(true), dstPos_(0) { + : dec_(nullptr), needsReset_(true), dstPos_(0), assertFailed_(false) { src_ = new u8[BUFFER_SIZE]; dst_ = new u8[BUFFER_SIZE]; cache_ = new VertexDecoderJitCache(); @@ -108,6 +109,12 @@ public: Add8(y); Add8(z); } + void Add8(u8 x, u8 y, u8 z, u8 w) { + Add8(x); + Add8(y); + Add8(z); + Add8(w); + } void Add16(u16_le x) { if (needsReset_) { @@ -148,19 +155,98 @@ public: } u16 Get16() { - u16 result; + u16_le result; memcpy(&result, dst_ + dstPos_, sizeof(result)); dstPos_ += sizeof(result); return result; } float GetFloat() { - float result; + float_le result; memcpy(&result, dst_ + dstPos_, sizeof(result)); dstPos_ += sizeof(result); return result; } + void Assert8(const char *title, u8 x, u8 y) { + u8 resx = Get8(); + u8 resy = Get8(); + if (resx != x || resy != y) { + assertFailed_ = true; + printf("%s: Failed %d, %d != expected %d, %d\n", title, resx, resy, x, y); + } + } + void Assert8(const char *title, u8 x, u8 y, u8 z) { + u8 resx = Get8(); + u8 resy = Get8(); + u8 resz = Get8(); + if (resx != x || resy != y || resz != z) { + assertFailed_ = true; + printf("%s: Failed %d, %d, %d != expected %d, %d, %d\n", title, resx, resy, resz, x, y, z); + } + } + void 
Assert8(const char *title, u8 x, u8 y, u8 z, u8 w) { + u8 resx = Get8(); + u8 resy = Get8(); + u8 resz = Get8(); + u8 resw = Get8(); + if (resx != x || resy != y || resz != z || resw != w) { + assertFailed_ = true; + printf("%s: Failed %d, %d, %d, %d != expected %d, %d, %d, %d\n", title, resx, resy, resz, resw, x, y, z, w); + } + } + + void Assert16(const char *title, u16 x, u16 y) { + u16 resx = Get16(); + u16 resy = Get16(); + if (resx != x || resy != y) { + assertFailed_ = true; + printf("%s: Failed %d, %d != expected %d, %d\n", title, resx, resy, x, y); + } + } + void Assert16(const char *title, u16 x, u16 y, u16 z) { + u16 resx = Get16(); + u16 resy = Get16(); + u16 resz = Get16(); + if (resx != x || resy != y || resz != z) { + assertFailed_ = true; + printf("%s: Failed %d, %d, %d != expected %d, %d, %d\n", title, resx, resy, resz, x, y, z); + } + } + + bool CompareFloat(float a, float b) { + return a - fmodf(a, 0.0000001f) == b - fmodf(b, 0.0000001f); + } + + void AssertFloat(const char *title, float x) { + float resx = GetFloat(); + if (!CompareFloat(resx, x)) { + assertFailed_ = true; + printf("%s: Failed %f != expected %f\n", title, resx, x); + } + } + void AssertFloat(const char *title, float x, float y) { + float resx = GetFloat(); + float resy = GetFloat(); + if (!CompareFloat(resx, x) || !CompareFloat(resy, y)) { + assertFailed_ = true; + printf("%s: Failed %f, %f != expected %f, %f\n", title, resx, resy, x, y); + } + } + void AssertFloat(const char *title, float x, float y, float z) { + float resx = GetFloat(); + float resy = GetFloat(); + float resz = GetFloat(); + if (!CompareFloat(resx, x) || !CompareFloat(resy, y) || !CompareFloat(resz, z)) { + assertFailed_ = true; + printf("%s: Failed %f, %f, %f != expected %f, %f, %f\n", title, resx, resy, resz, x, y, z); + } + } + + void Skip(u32 c) { + dstPos_ += c; + } + void *GetData() { return dst_; } @@ -172,6 +258,10 @@ public: return 0; } + bool HasFailed() { + return assertFailed_; + } + private: void 
SetupExecute(int vtype, bool useJit) { if (dec_ != nullptr) { @@ -179,6 +269,7 @@ private: } dec_ = new VertexDecoder(); dec_->SetVertexType(vtype, options_, useJit ? cache_ : nullptr); + dstPos_ = 0; needsReset_ = true; } @@ -193,19 +284,401 @@ private: bool needsReset_; size_t srcPos_; size_t dstPos_; + bool assertFailed_; +}; + +static bool TestVertex8() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_TC_8BIT; + + dec.Add8(127, 128); + dec.Add8(127, 0, 128); + dec.Add8(127, 0, 128); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertex8-TC", 127, 128); + dec.Skip(2); + dec.Assert8("TestVertex8-Nrm", 127, 0, 128); + dec.Skip(1); + dec.AssertFloat("TestVertex8-Pos", 127.0f / 128.0f, 0.0f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertex16() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_TC_16BIT; + + dec.Add16(32767, 32768); + dec.Add16(32767, 0, 32768); + dec.Add16(32767, 0, 32768); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert16("TestVertex16-TC", 32767, 32768); + dec.Assert16("TestVertex16-Nrm", 32767, 0, 32768); + dec.Skip(2); + dec.AssertFloat("TestVertex16-Pos", 32767.0f / 32768.0f, 0.0f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexFloat() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_TC_FLOAT; + + dec.AddFloat(1.0f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertexFloat-TC", 1.0f, -1.0f); + dec.AssertFloat("TestVertexFloat-Nrm", 1.0f, 0.5f, -1.0f); + dec.AssertFloat("TestVertexFloat-Pos", 1.0f, 0.5f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertex8Through() { + VertexDecoderTestHarness dec; + int vtype = 
GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_TC_8BIT | GE_VTYPE_THROUGH; + + dec.Add8(127, 128); + dec.Add8(127, 0, 128); + dec.Add8(127, 0, 128); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertex8Through-TC", 127, 128); + dec.Skip(2); + dec.Assert8("TestVertex8Through-Nrm", 127, 0, 128); + // Ignoring Pos since s8 through isn't really an option. + } + + return !dec.HasFailed(); +} + +static bool TestVertex16Through() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_TC_16BIT | GE_VTYPE_THROUGH; + + dec.Add16(32767, 32768); + dec.Add16(32767, 0, 32768); + dec.Add16(32767, 0, 32768); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert16("TestVertex16Through-TC", 32767, 32768); + dec.Assert16("TestVertex16Through-Nrm", 32767, 0, 32768); + dec.Skip(2); + dec.AssertFloat("TestVertex16Through-Pos", 32767.0f, 0.0f, 32768.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexFloatThrough() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_TC_FLOAT | GE_VTYPE_THROUGH; + + dec.AddFloat(1.0f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertexFloatThrough-TC", 1.0f, -1.0f); + dec.AssertFloat("TestVertexFloatThrough-Nrm", 1.0f, 0.5f, -1.0f); + dec.AssertFloat("TestVertexFloatThrough-Pos", 1.0f, 0.5f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexColor8888() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_8888; + bool failed = false; + + dec.Add8(1, 2, 3, 4); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor8888-Col", 1, 2, 3, 4); + 
dec.AssertFloat("TestVertexColor8888-Pos", 1.0f, 0.5f, -1.0f); + + if (gstate_c.vertexFullAlpha) { + printf("TestVertexColor8888: failed to clear vertexFullAlpha\n"); + failed = true; + } + } + + dec.Add8(255, 255, 255, 255); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor8888-Col", 255, 255, 255, 255); + dec.AssertFloat("TestVertexColor8888-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor8888: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertexColor4444() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_4444; + bool failed = false; + + dec.Add16(0x1234, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor4444-Col", 0x44, 0x33, 0x22, 0x11); + dec.AssertFloat("TestVertexColor4444-Pos", 1.0f, 0.5f, -1.0f); + + if (gstate_c.vertexFullAlpha) { + printf("TestVertexColor4444: failed to clear vertexFullAlpha\n"); + failed = true; + } + } + + dec.Add16(0xFFFF, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor4444-Col", 255, 255, 255, 255); + dec.AssertFloat("TestVertexColor4444-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor4444: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertexColor5551() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_5551; + bool failed = false; + + dec.Add16((0 << 15) | (1 << 10) | (2 << 5) | 3, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + 
gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor5551-Col", 0x18, 0x10, 0x8, 0x0); + dec.AssertFloat("TestVertexColor5551-Pos", 1.0f, 0.5f, -1.0f); + + if (gstate_c.vertexFullAlpha) { + printf("TestVertexColor5551: failed to clear vertexFullAlpha\n"); + failed = true; + } + } + + dec.Add16(0xFFFF, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor5551-Col", 255, 255, 255, 255); + dec.AssertFloat("TestVertexColor5551-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor5551: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertexColor565() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_565; + bool failed = false; + + dec.Add16((1 << 11) | (2 << 5) | 3, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor565-Col", 0x18, 0x8, 0x8, 255); + dec.AssertFloat("TestVertexColor565-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor565: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertex8Skin() { + VertexDecoderTestHarness dec; + + g_Config.bSoftwareSkinning = true; + for (int i = 0; i < 8 * 12; ++i) { + gstate.boneMatrix[i] = 0.0f; + } + gstate.boneMatrix[0] = 2.0f; + gstate.boneMatrix[4] = 1.0f; + gstate.boneMatrix[8] = 5.0f; + + gstate.boneMatrix[12] = 1.0f; + gstate.boneMatrix[16] = 2.0f; + gstate.boneMatrix[20] = 5.0f; + + int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_WEIGHT_8BIT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT); + + dec.Add8(128 + 64, 128 - 64); + dec.Add8(127, 0, 128); + dec.Add8(127, 0, 128); + + for (int jit 
= 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertex8Skin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 127.0f / 128.0f, 0.0f, 2.0f * 5.0f * -1.0f); + dec.AssertFloat("TestVertex8Skin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 127.0f / 128.0f, 0.0f, 2.0f * 5.0f * -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertex16Skin() { + VertexDecoderTestHarness dec; + + g_Config.bSoftwareSkinning = true; + for (int i = 0; i < 8 * 12; ++i) { + gstate.boneMatrix[i] = 0.0f; + } + gstate.boneMatrix[0] = 2.0f; + gstate.boneMatrix[4] = 1.0f; + gstate.boneMatrix[8] = 5.0f; + + gstate.boneMatrix[12] = 1.0f; + gstate.boneMatrix[16] = 2.0f; + gstate.boneMatrix[20] = 5.0f; + + int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_WEIGHT_16BIT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT); + + dec.Add16(32768 + 16384, 32768 - 16384); + dec.Add16(32767, 0, 32768); + dec.Add16(32767, 0, 32768); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertex16Skin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 32767.0f / 32768.0f, 0.0f, 2.0f * 5.0f * -1.0f); + dec.AssertFloat("TestVertex16Skin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 32767.0f / 32768.0f, 0.0f, 2.0f * 5.0f * -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexFloatSkin() { + VertexDecoderTestHarness dec; + + g_Config.bSoftwareSkinning = true; + for (int i = 0; i < 8 * 12; ++i) { + gstate.boneMatrix[i] = 0.0f; + } + gstate.boneMatrix[0] = 2.0f; + gstate.boneMatrix[4] = 1.0f; + gstate.boneMatrix[8] = 5.0f; + + gstate.boneMatrix[12] = 1.0f; + gstate.boneMatrix[16] = 2.0f; + gstate.boneMatrix[20] = 5.0f; + + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_WEIGHT_FLOAT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT); + + dec.AddFloat(1.5f, 0.5f); + dec.AddFloat(1.0f, 0, -1.0f); + dec.AddFloat(1.0f, 0, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertexFloatSkin-Nrm", (2.0f * 1.5f + 1.0f * 
0.5f) * 1.0f, 0.0f, 2.0f * 5.0f * -1.0f); + dec.AssertFloat("TestVertexFloatSkin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 1.0f, 0.0f, 2.0f * 5.0f * -1.0f); + } + + return !dec.HasFailed(); +} + +// TODO: Morph (col, pos, nrm), weights (no skin), morph + weights? + +typedef bool (*VertexTestFunc)(); + +static VertexTestFunc vertdecTestFuncs[] = { + &TestVertex8, + &TestVertex16, + &TestVertexFloat, + + &TestVertex8Through, + &TestVertex16Through, + &TestVertexFloatThrough, + + &TestVertexColor8888, + &TestVertexColor4444, + &TestVertexColor5551, + &TestVertexColor565, + + &TestVertex8Skin, + &TestVertex16Skin, + &TestVertexFloatSkin, }; bool TestVertexJit() { VertexDecoderTestHarness dec; - for (int i = 0; i < 100; ++i) { + /*for (int i = 0; i < 100; ++i) { dec.AddFloat(0.5f, 1.0f, -1.0f); } - int vtype = GE_VTYPE_POS_FLOAT; + int vtype = GE_VTYPE_POS_FLOAT;*/ + /*for (int i = 0; i < 100; ++i) { + dec.Add16(32767, 0, 32768); + } + int vtype = GE_VTYPE_POS_16BIT;*/ + for (int i = 0; i < 100; ++i) { + dec.Add8(127, 0, 128); + } + int vtype = GE_VTYPE_POS_8BIT; double yesJit = dec.ExecuteTimed(vtype, 100, true); double noJit = dec.ExecuteTimed(vtype, 100, false); - printf("Result: %f, %f, %f\n", dec.GetFloat(), dec.GetFloat(), dec.GetFloat()); + float x = dec.GetFloat(); + float y = dec.GetFloat(); + float z = dec.GetFloat(); + printf("Result: %f, %f, %f\n", x, y, z); printf("Jit was %fx faster than steps.\n\n", yesJit / noJit); - return yesJit > noJit; + bool pass = true; + for (size_t i = 0; i < ARRAY_SIZE(vertdecTestFuncs); ++i) { + if (!vertdecTestFuncs[i]()) { + pass = false; + } + } + + return pass; }