diff --git a/GPU/Common/VertexDecoderArm64.cpp b/GPU/Common/VertexDecoderArm64.cpp index a7c447df91..fa2e05677c 100644 --- a/GPU/Common/VertexDecoderArm64.cpp +++ b/GPU/Common/VertexDecoderArm64.cpp @@ -59,6 +59,7 @@ static const ARM64Reg neonUVScaleReg = D0; static const ARM64Reg neonUVOffsetReg = D1; static const ARM64Reg src[3] = {S2, S3, S8}; +static const ARM64Reg srcD[3] = {D2, D3, D8}; static const ARM64Reg srcQ[3] = {Q2, Q3, Q8}; static const ARM64Reg srcNEON = Q8; @@ -438,8 +439,7 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() { break; case 7: case 8: - fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[0], srcReg, 0); - fp.LDR(128, INDEX_UNSIGNED, neonWeightRegsQ[1], srcReg, 16); + fp.LDP(128, INDEX_SIGNED, neonWeightRegsQ[0], neonWeightRegsQ[1], srcReg, 0); break; } Jit_ApplyWeights(); @@ -447,12 +447,16 @@ void VertexDecoderJitCache::Jit_WeightsFloatSkin() { void VertexDecoderJitCache::Jit_Color8888() { LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->coloff); - // TODO: Set flags to determine if alpha != 0xFF. - // ANDSI2R(tempReg2, tempReg1, 0xFF000000); + + // Set flags to determine if alpha != 0xFF. + ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + CMP(tempReg2, 0); + + // Clear fullAlphaReg when the inverse was not 0. + // fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1; + CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); + STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); - // FixupBranch skip = B(CC_NZ); - MOVI2R(fullAlphaReg, 0); - // SetJumpTarget(skip); } void VertexDecoderJitCache::Jit_Color4444() { @@ -472,11 +476,13 @@ void VertexDecoderJitCache::Jit_Color4444() { STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.c0off); - // TODO: Set flags to determine if alpha != 0xFF. - //MVNS(tempReg2, tempReg, ArithOption(tempReg1, ST_ASR, 24)); - //FixupBranch skip = B(CC_EQ); - MOVI2R(fullAlphaReg, 0); - //SetJumpTarget(skip); + // Set flags to determine if alpha != 0xFF. 
+ ORN(tempReg2, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + CMP(tempReg2, 0); + + // Clear fullAlphaReg when the inverse was not 0. + // fullAlphaReg = tempReg2 == 0 ? fullAlphaReg : 0 + 1; + CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); } void VertexDecoderJitCache::Jit_Color565() { @@ -526,40 +532,35 @@ void VertexDecoderJitCache::Jit_Color5551() { ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg); ORR(tempReg2, tempReg2, tempReg1); - // TODO: Set flags to determine if alpha != 0xFF. - //MVNS(tempReg3, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + // Set flags to determine if alpha != 0xFF. + ORN(tempReg3, WZR, tempReg1, ArithOption(tempReg1, ST_ASR, 24)); + CMP(tempReg3, 0); + STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.c0off); - //FixupBranch skip = B(CC_EQ); - MOVI2R(fullAlphaReg, 0); - //SetJumpTarget(skip); + + // Clear fullAlphaReg when the inverse was not 0. + // fullAlphaReg = tempReg3 == 0 ? fullAlphaReg : 0 + 1; + CSEL(fullAlphaReg, fullAlphaReg, WZR, CC_EQ); } void VertexDecoderJitCache::Jit_TcU8() { - LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 1); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8)); + LDURH(tempReg1, srcReg, dec_->tcoff); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16() { - LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16)); + LDUR(tempReg1, srcReg, dec_->tcoff); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16Through() { - LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 2); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16)); + LDUR(tempReg1, srcReg, dec_->tcoff); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); } 
void VertexDecoderJitCache::Jit_TcFloatThrough() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4); + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16Double() { @@ -579,60 +580,57 @@ void VertexDecoderJitCache::Jit_TcU16ThroughDouble() { } void VertexDecoderJitCache::Jit_TcFloat() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->tcoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->tcoff + 4); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.uvoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.uvoff + 4); + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->tcoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU8Prescale() { - fp.LDR(16, INDEX_UNSIGNED, neonScratchRegD, srcReg, dec_->tcoff); + fp.LDUR(16, neonScratchRegD, srcReg, dec_->tcoff); fp.UXTL(8, neonScratchRegQ, neonScratchRegD); // Widen to 16-bit fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit fp.UCVTF(32, neonScratchRegD, neonScratchRegD); fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); - fp.STR(64, INDEX_UNSIGNED, neonScratchRegD, dstReg, dec_->decFmt.uvoff); + fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcU16Prescale() { - fp.LDR(32, INDEX_UNSIGNED, neonScratchRegD, srcReg, dec_->tcoff); + fp.LDUR(32, neonScratchRegD, srcReg, dec_->tcoff); fp.UXTL(16, neonScratchRegQ, neonScratchRegD); // Widen to 32-bit fp.UCVTF(32, neonScratchRegD, neonScratchRegD); fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA fp.FADD(32, neonScratchRegD, neonScratchRegD, 
neonUVOffsetReg); - fp.STR(64, INDEX_UNSIGNED, neonScratchRegD, dstReg, dec_->decFmt.uvoff); + fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_TcFloatPrescale() { - fp.LDR(64, INDEX_UNSIGNED, neonScratchRegD, srcReg, dec_->tcoff); + fp.LDUR(64, neonScratchRegD, srcReg, dec_->tcoff); fp.FMUL(32, neonScratchRegD, neonScratchRegD, neonUVScaleReg); // TODO: FMLA fp.FADD(32, neonScratchRegD, neonScratchRegD, neonUVOffsetReg); - fp.STR(64, INDEX_UNSIGNED, neonScratchRegD, dstReg, dec_->decFmt.uvoff); + fp.STUR(64, neonScratchRegD, dstReg, dec_->decFmt.uvoff); } void VertexDecoderJitCache::Jit_PosS8() { Jit_AnyS8ToFloat(dec_->posoff); - STR(INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8); + fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff); } void VertexDecoderJitCache::Jit_PosS16() { Jit_AnyS16ToFloat(dec_->posoff); - STR(INDEX_UNSIGNED, src[0], dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, src[2], dstReg, dec_->decFmt.posoff + 8); + fp.STUR(128, srcQ[0], dstReg, dec_->decFmt.posoff); } -// Just copy 12 bytes. void VertexDecoderJitCache::Jit_PosFloat() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 4); - LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8); + // Only need to copy 12 bytes, but copying 16 should be okay (and is faster.) 
+ if ((dec_->posoff & 7) == 0 && (dec_->decFmt.posoff & 7) == 0) { + LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->posoff); + STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.posoff); + } else { + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->posoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.posoff); + LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 8); + STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.posoff + 8); + } } void VertexDecoderJitCache::Jit_PosS8Through() { @@ -648,22 +646,20 @@ void VertexDecoderJitCache::Jit_PosS8Through() { } void VertexDecoderJitCache::Jit_PosS16Through() { - LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->posoff); - LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->posoff + 2); + // Start with X and Y (which is signed.) + fp.LDUR(32, src[0], srcReg, dec_->posoff); + fp.SXTL(16, srcD[0], src[0]); + fp.SCVTF(32, srcD[0], srcD[0]); + fp.STUR(64, src[0], dstReg, dec_->decFmt.posoff); + // Now load in Z (which is unsigned.) 
LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->posoff + 4); - fp.SCVTF(fpScratchReg, tempReg1); - fp.SCVTF(fpScratchReg2, tempReg2); - fp.SCVTF(fpScratchReg3, tempReg3); - STR(INDEX_UNSIGNED, fpScratchReg, dstReg, dec_->decFmt.posoff); - STR(INDEX_UNSIGNED, fpScratchReg2, dstReg, dec_->decFmt.posoff + 4); - STR(INDEX_UNSIGNED, fpScratchReg3, dstReg, dec_->decFmt.posoff + 8); + fp.SCVTF(src[1], tempReg3); + STR(INDEX_UNSIGNED, src[1], dstReg, dec_->decFmt.posoff + 8); } void VertexDecoderJitCache::Jit_NormalS8() { - LDRB(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); - LDRB(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 1); + LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); LDRB(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 2); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 8)); ORR(tempReg1, tempReg1, tempReg3, ArithOption(tempReg3, ST_LSL, 16)); STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff); } @@ -671,21 +667,22 @@ void VertexDecoderJitCache::Jit_NormalS8() { // Copy 6 bytes and then 2 zeroes. void VertexDecoderJitCache::Jit_NormalS16() { // NOTE: Not LDRH, we just copy the raw bytes here. 
- LDRH(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); - LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 2); - LDRH(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 4); - ORR(tempReg1, tempReg1, tempReg2, ArithOption(tempReg2, ST_LSL, 16)); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff); - STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 4); + LDUR(tempReg1, srcReg, dec_->nrmoff); + LDRH(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff); } void VertexDecoderJitCache::Jit_NormalFloat() { - LDR(INDEX_UNSIGNED, tempReg1, srcReg, dec_->nrmoff); - LDR(INDEX_UNSIGNED, tempReg2, srcReg, dec_->nrmoff + 4); - LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8); - STR(INDEX_UNSIGNED, tempReg1, dstReg, dec_->decFmt.nrmoff); - STR(INDEX_UNSIGNED, tempReg2, dstReg, dec_->decFmt.nrmoff + 4); - STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8); + // Only need to copy 12 bytes, but copying 16 should be okay (and is faster.) 
+ if ((dec_->nrmoff & 7) == 0 && (dec_->decFmt.nrmoff & 7) == 0) { + LDP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), srcReg, dec_->nrmoff); + STP(INDEX_SIGNED, EncodeRegTo64(tempReg1), EncodeRegTo64(tempReg2), dstReg, dec_->decFmt.nrmoff); + } else { + LDP(INDEX_SIGNED, tempReg1, tempReg2, srcReg, dec_->nrmoff); + STP(INDEX_SIGNED, tempReg1, tempReg2, dstReg, dec_->decFmt.nrmoff); + LDR(INDEX_UNSIGNED, tempReg3, srcReg, dec_->nrmoff + 8); + STR(INDEX_UNSIGNED, tempReg3, dstReg, dec_->decFmt.nrmoff + 8); + } } void VertexDecoderJitCache::Jit_NormalS8Skin() { @@ -699,10 +696,7 @@ void VertexDecoderJitCache::Jit_NormalS16Skin() { } void VertexDecoderJitCache::Jit_NormalFloatSkin() { - // fp.LDR(128, INDEX_UNSIGNED, srcNEON, srcReg, dec_->nrmoff); - LDR(INDEX_UNSIGNED, src[0], srcReg, dec_->nrmoff); - LDR(INDEX_UNSIGNED, src[1], srcReg, dec_->nrmoff + 4); - LDR(INDEX_UNSIGNED, src[2], srcReg, dec_->nrmoff + 8); + fp.LDUR(128, srcQ[0], srcReg, dec_->nrmoff); Jit_WriteMatrixMul(dec_->decFmt.nrmoff, false); } @@ -717,45 +711,30 @@ void VertexDecoderJitCache::Jit_PosS16Skin() { } void VertexDecoderJitCache::Jit_PosFloatSkin() { - //fp.LDR(128, INDEX_UNSIGNED, srcNEON, srcReg, dec_->posoff); - LDR(INDEX_UNSIGNED, src[0], srcReg, dec_->posoff); - LDR(INDEX_UNSIGNED, src[1], srcReg, dec_->posoff + 4); - LDR(INDEX_UNSIGNED, src[2], srcReg, dec_->posoff + 8); + fp.LDUR(128, srcQ[0], srcReg, dec_->posoff); Jit_WriteMatrixMul(dec_->decFmt.posoff, true); } void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) { - // TODO: NEONize. In that case we'll leave all three floats in one register instead, so callers must change too. 
- LDRSB(INDEX_UNSIGNED, tempReg1, srcReg, srcoff); - LDRSB(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 1); - LDRSB(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 2); - fp.SCVTF(src[0], tempReg1, 7); - fp.SCVTF(src[1], tempReg2, 7); - fp.SCVTF(src[2], tempReg3, 7); + fp.LDUR(32, src[0], srcReg, srcoff); + fp.SXTL(8, srcD[0], src[0]); + fp.SXTL(16, srcQ[0], srcD[0]); + fp.SCVTF(32, srcQ[0], srcQ[0], 7); } void VertexDecoderJitCache::Jit_AnyS16ToFloat(int srcoff) { - // TODO: NEONize. In that case we'll leave all three floats in one register instead, so callers must change too. - LDRSH(INDEX_UNSIGNED, tempReg1, srcReg, srcoff); - LDRSH(INDEX_UNSIGNED, tempReg2, srcReg, srcoff + 2); - LDRSH(INDEX_UNSIGNED, tempReg3, srcReg, srcoff + 4); - fp.SCVTF(src[0], tempReg1, 15); - fp.SCVTF(src[1], tempReg2, 15); - fp.SCVTF(src[2], tempReg3, 15); + fp.LDUR(64, src[0], srcReg, srcoff); + fp.SXTL(16, srcQ[0], srcD[0]); + fp.SCVTF(32, srcQ[0], srcQ[0], 15); } void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) { // Multiply with the matrix sitting in Q4-Q7. fp.FMUL(32, accNEON, Q4, srcQ[0], 0); - fp.FMLA(32, accNEON, Q5, srcQ[1], 0); - fp.FMLA(32, accNEON, Q6, srcQ[2], 0); + fp.FMLA(32, accNEON, Q5, srcQ[0], 1); + fp.FMLA(32, accNEON, Q6, srcQ[0], 2); if (pos) { fp.FADD(32, accNEON, accNEON, Q7); } - // Ugly store operation. 
- fp.STR(32, INDEX_UNSIGNED, accNEON, dstReg, outOff); - fp.INS(32, accNEON, 0, accNEON, 1); - fp.STR(32, INDEX_UNSIGNED, accNEON, dstReg, outOff + 4); - fp.INS(32, accNEON, 0, accNEON, 2); - fp.STR(32, INDEX_UNSIGNED, accNEON, dstReg, outOff + 8); + fp.STUR(128, accNEON, dstReg, outOff); } diff --git a/unittest/TestVertexJit.cpp b/unittest/TestVertexJit.cpp index 3b773c94fa..79012974fd 100644 --- a/unittest/TestVertexJit.cpp +++ b/unittest/TestVertexJit.cpp @@ -20,6 +20,7 @@ #include "Core/Config.h" #include "GPU/Common/VertexDecoderCommon.h" #include "GPU/ge_constants.h" +#include "GPU/GPUState.h" #include "unittest/TestVertexJit.h" #include "unittest/UnitTest.h" @@ -29,7 +30,7 @@ class VertexDecoderTestHarness { public: VertexDecoderTestHarness() - : dec_(nullptr), needsReset_(true), dstPos_(0) { + : dec_(nullptr), needsReset_(true), dstPos_(0), assertFailed_(false) { src_ = new u8[BUFFER_SIZE]; dst_ = new u8[BUFFER_SIZE]; cache_ = new VertexDecoderJitCache(); @@ -108,6 +109,12 @@ public: Add8(y); Add8(z); } + void Add8(u8 x, u8 y, u8 z, u8 w) { + Add8(x); + Add8(y); + Add8(z); + Add8(w); + } void Add16(u16_le x) { if (needsReset_) { @@ -148,19 +155,98 @@ public: } u16 Get16() { - u16 result; + u16_le result; memcpy(&result, dst_ + dstPos_, sizeof(result)); dstPos_ += sizeof(result); return result; } float GetFloat() { - float result; + float_le result; memcpy(&result, dst_ + dstPos_, sizeof(result)); dstPos_ += sizeof(result); return result; } + void Assert8(const char *title, u8 x, u8 y) { + u8 resx = Get8(); + u8 resy = Get8(); + if (resx != x || resy != y) { + assertFailed_ = true; + printf("%s: Failed %d, %d != expected %d, %d\n", title, resx, resy, x, y); + } + } + void Assert8(const char *title, u8 x, u8 y, u8 z) { + u8 resx = Get8(); + u8 resy = Get8(); + u8 resz = Get8(); + if (resx != x || resy != y || resz != z) { + assertFailed_ = true; + printf("%s: Failed %d, %d, %d != expected %d, %d, %d\n", title, resx, resy, resz, x, y, z); + } + } + void 
Assert8(const char *title, u8 x, u8 y, u8 z, u8 w) { + u8 resx = Get8(); + u8 resy = Get8(); + u8 resz = Get8(); + u8 resw = Get8(); + if (resx != x || resy != y || resz != z || resw != w) { + assertFailed_ = true; + printf("%s: Failed %d, %d, %d, %d != expected %d, %d, %d, %d\n", title, resx, resy, resz, resw, x, y, z, w); + } + } + + void Assert16(const char *title, u16 x, u16 y) { + u16 resx = Get16(); + u16 resy = Get16(); + if (resx != x || resy != y) { + assertFailed_ = true; + printf("%s: Failed %d, %d != expected %d, %d\n", title, resx, resy, x, y); + } + } + void Assert16(const char *title, u16 x, u16 y, u16 z) { + u16 resx = Get16(); + u16 resy = Get16(); + u16 resz = Get16(); + if (resx != x || resy != y || resz != z) { + assertFailed_ = true; + printf("%s: Failed %d, %d, %d != expected %d, %d, %d\n", title, resx, resy, resz, x, y, z); + } + } + + bool CompareFloat(float a, float b) { + return a - fmodf(a, 0.0000001f) == b - fmodf(b, 0.0000001f); + } + + void AssertFloat(const char *title, float x) { + float resx = GetFloat(); + if (!CompareFloat(resx, x)) { + assertFailed_ = true; + printf("%s: Failed %f != expected %f\n", title, resx, x); + } + } + void AssertFloat(const char *title, float x, float y) { + float resx = GetFloat(); + float resy = GetFloat(); + if (!CompareFloat(resx, x) || !CompareFloat(resy, y)) { + assertFailed_ = true; + printf("%s: Failed %f, %f != expected %f, %f\n", title, resx, resy, x, y); + } + } + void AssertFloat(const char *title, float x, float y, float z) { + float resx = GetFloat(); + float resy = GetFloat(); + float resz = GetFloat(); + if (!CompareFloat(resx, x) || !CompareFloat(resy, y) || !CompareFloat(resz, z)) { + assertFailed_ = true; + printf("%s: Failed %f, %f, %f != expected %f, %f, %f\n", title, resx, resy, resz, x, y, z); + } + } + + void Skip(u32 c) { + dstPos_ += c; + } + void *GetData() { return dst_; } @@ -172,6 +258,10 @@ public: return 0; } + bool HasFailed() { + return assertFailed_; + } + private: void 
SetupExecute(int vtype, bool useJit) { if (dec_ != nullptr) { @@ -179,6 +269,7 @@ private: } dec_ = new VertexDecoder(); dec_->SetVertexType(vtype, options_, useJit ? cache_ : nullptr); + dstPos_ = 0; needsReset_ = true; } @@ -193,19 +284,401 @@ private: bool needsReset_; size_t srcPos_; size_t dstPos_; + bool assertFailed_; +}; + +static bool TestVertex8() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_TC_8BIT; + + dec.Add8(127, 128); + dec.Add8(127, 0, 128); + dec.Add8(127, 0, 128); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertex8-TC", 127, 128); + dec.Skip(2); + dec.Assert8("TestVertex8-Nrm", 127, 0, 128); + dec.Skip(1); + dec.AssertFloat("TestVertex8-Pos", 127.0f / 128.0f, 0.0f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertex16() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_TC_16BIT; + + dec.Add16(32767, 32768); + dec.Add16(32767, 0, 32768); + dec.Add16(32767, 0, 32768); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert16("TestVertex16-TC", 32767, 32768); + dec.Assert16("TestVertex16-Nrm", 32767, 0, 32768); + dec.Skip(2); + dec.AssertFloat("TestVertex16-Pos", 32767.0f / 32768.0f, 0.0f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexFloat() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_TC_FLOAT; + + dec.AddFloat(1.0f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertexFloat-TC", 1.0f, -1.0f); + dec.AssertFloat("TestVertexFloat-Nrm", 1.0f, 0.5f, -1.0f); + dec.AssertFloat("TestVertexFloat-Pos", 1.0f, 0.5f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertex8Through() { + VertexDecoderTestHarness dec; + int vtype = 
GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_TC_8BIT | GE_VTYPE_THROUGH; + + dec.Add8(127, 128); + dec.Add8(127, 0, 128); + dec.Add8(127, 0, 128); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertex8Through-TC", 127, 128); + dec.Skip(2); + dec.Assert8("TestVertex8Through-Nrm", 127, 0, 128); + // Ignoring Pos since s8 through isn't really an option. + } + + return !dec.HasFailed(); +} + +static bool TestVertex16Through() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_TC_16BIT | GE_VTYPE_THROUGH; + + dec.Add16(32767, 32768); + dec.Add16(32767, 0, 32768); + dec.Add16(32767, 0, 32768); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.Assert16("TestVertex16Through-TC", 32767, 32768); + dec.Assert16("TestVertex16Through-Nrm", 32767, 0, 32768); + dec.Skip(2); + dec.AssertFloat("TestVertex16Through-Pos", 32767.0f, 0.0f, 32768.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexFloatThrough() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_TC_FLOAT | GE_VTYPE_THROUGH; + + dec.AddFloat(1.0f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertexFloatThrough-TC", 1.0f, -1.0f); + dec.AssertFloat("TestVertexFloatThrough-Nrm", 1.0f, 0.5f, -1.0f); + dec.AssertFloat("TestVertexFloatThrough-Pos", 1.0f, 0.5f, -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexColor8888() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_8888; + bool failed = false; + + dec.Add8(1, 2, 3, 4); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor8888-Col", 1, 2, 3, 4); + 
dec.AssertFloat("TestVertexColor8888-Pos", 1.0f, 0.5f, -1.0f); + + if (gstate_c.vertexFullAlpha) { + printf("TestVertexColor8888: failed to clear vertexFullAlpha\n"); + failed = true; + } + } + + dec.Add8(255, 255, 255, 255); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor8888-Col", 255, 255, 255, 255); + dec.AssertFloat("TestVertexColor8888-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor8888: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertexColor4444() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_4444; + bool failed = false; + + dec.Add16(0x1234, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor4444-Col", 0x44, 0x33, 0x22, 0x11); + dec.AssertFloat("TestVertexColor4444-Pos", 1.0f, 0.5f, -1.0f); + + if (gstate_c.vertexFullAlpha) { + printf("TestVertexColor4444: failed to clear vertexFullAlpha\n"); + failed = true; + } + } + + dec.Add16(0xFFFF, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor4444-Col", 255, 255, 255, 255); + dec.AssertFloat("TestVertexColor4444-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor4444: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertexColor5551() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_5551; + bool failed = false; + + dec.Add16((0 << 15) | (1 << 10) | (2 << 5) | 3, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + 
gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor5551-Col", 0x18, 0x10, 0x8, 0x0); + dec.AssertFloat("TestVertexColor5551-Pos", 1.0f, 0.5f, -1.0f); + + if (gstate_c.vertexFullAlpha) { + printf("TestVertexColor5551: failed to clear vertexFullAlpha\n"); + failed = true; + } + } + + dec.Add16(0xFFFF, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor5551-Col", 255, 255, 255, 255); + dec.AssertFloat("TestVertexColor5551-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor5551: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertexColor565() { + VertexDecoderTestHarness dec; + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_COL_565; + bool failed = false; + + dec.Add16((1 << 11) | (2 << 5) | 3, 0); + dec.AddFloat(1.0f, 0.5f, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + gstate_c.vertexFullAlpha = true; + dec.Execute(vtype, 0, jit == 1); + dec.Assert8("TestVertexColor565-Col", 0x18, 0x8, 0x8, 255); + dec.AssertFloat("TestVertexColor565-Pos", 1.0f, 0.5f, -1.0f); + + if (!gstate_c.vertexFullAlpha) { + printf("TestVertexColor565: cleared vertexFullAlpha\n"); + failed = true; + } + } + + return !dec.HasFailed() && !failed; +} + +static bool TestVertex8Skin() { + VertexDecoderTestHarness dec; + + g_Config.bSoftwareSkinning = true; + for (int i = 0; i < 8 * 12; ++i) { + gstate.boneMatrix[i] = 0.0f; + } + gstate.boneMatrix[0] = 2.0f; + gstate.boneMatrix[4] = 1.0f; + gstate.boneMatrix[8] = 5.0f; + + gstate.boneMatrix[12] = 1.0f; + gstate.boneMatrix[16] = 2.0f; + gstate.boneMatrix[20] = 5.0f; + + int vtype = GE_VTYPE_POS_8BIT | GE_VTYPE_NRM_8BIT | GE_VTYPE_WEIGHT_8BIT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT); + + dec.Add8(128 + 64, 128 - 64); + dec.Add8(127, 0, 128); + dec.Add8(127, 0, 128); + + for (int jit 
= 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertex8Skin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 127.0f / 128.0f, 0.0f, 2.0f * 5.0f * -1.0f); + dec.AssertFloat("TestVertex8Skin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 127.0f / 128.0f, 0.0f, 2.0f * 5.0f * -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertex16Skin() { + VertexDecoderTestHarness dec; + + g_Config.bSoftwareSkinning = true; + for (int i = 0; i < 8 * 12; ++i) { + gstate.boneMatrix[i] = 0.0f; + } + gstate.boneMatrix[0] = 2.0f; + gstate.boneMatrix[4] = 1.0f; + gstate.boneMatrix[8] = 5.0f; + + gstate.boneMatrix[12] = 1.0f; + gstate.boneMatrix[16] = 2.0f; + gstate.boneMatrix[20] = 5.0f; + + int vtype = GE_VTYPE_POS_16BIT | GE_VTYPE_NRM_16BIT | GE_VTYPE_WEIGHT_16BIT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT); + + dec.Add16(32768 + 16384, 32768 - 16384); + dec.Add16(32767, 0, 32768); + dec.Add16(32767, 0, 32768); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertex16Skin-Nrm", (2.0f * 1.5f + 1.0f * 0.5f) * 32767.0f / 32768.0f, 0.0f, 2.0f * 5.0f * -1.0f); + dec.AssertFloat("TestVertex16Skin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 32767.0f / 32768.0f, 0.0f, 2.0f * 5.0f * -1.0f); + } + + return !dec.HasFailed(); +} + +static bool TestVertexFloatSkin() { + VertexDecoderTestHarness dec; + + g_Config.bSoftwareSkinning = true; + for (int i = 0; i < 8 * 12; ++i) { + gstate.boneMatrix[i] = 0.0f; + } + gstate.boneMatrix[0] = 2.0f; + gstate.boneMatrix[4] = 1.0f; + gstate.boneMatrix[8] = 5.0f; + + gstate.boneMatrix[12] = 1.0f; + gstate.boneMatrix[16] = 2.0f; + gstate.boneMatrix[20] = 5.0f; + + int vtype = GE_VTYPE_POS_FLOAT | GE_VTYPE_NRM_FLOAT | GE_VTYPE_WEIGHT_FLOAT | (1 << GE_VTYPE_WEIGHTCOUNT_SHIFT); + + dec.AddFloat(1.5f, 0.5f); + dec.AddFloat(1.0f, 0, -1.0f); + dec.AddFloat(1.0f, 0, -1.0f); + + for (int jit = 0; jit <= 1; ++jit) { + dec.Execute(vtype, 0, jit == 1); + dec.AssertFloat("TestVertexFloatSkin-Nrm", (2.0f * 1.5f + 1.0f * 
0.5f) * 1.0f, 0.0f, 2.0f * 5.0f * -1.0f); + dec.AssertFloat("TestVertexFloatSkin-Pos", (2.0f * 1.5f + 1.0f * 0.5f) * 1.0f, 0.0f, 2.0f * 5.0f * -1.0f); + } + + return !dec.HasFailed(); +} + +// TODO: Morph (col, pos, nrm), weights (no skin), morph + weights? + +typedef bool (*VertexTestFunc)(); + +static VertexTestFunc vertdecTestFuncs[] = { + &TestVertex8, + &TestVertex16, + &TestVertexFloat, + + &TestVertex8Through, + &TestVertex16Through, + &TestVertexFloatThrough, + + &TestVertexColor8888, + &TestVertexColor4444, + &TestVertexColor5551, + &TestVertexColor565, + + &TestVertex8Skin, + &TestVertex16Skin, + &TestVertexFloatSkin, }; bool TestVertexJit() { VertexDecoderTestHarness dec; - for (int i = 0; i < 100; ++i) { + /*for (int i = 0; i < 100; ++i) { dec.AddFloat(0.5f, 1.0f, -1.0f); } - int vtype = GE_VTYPE_POS_FLOAT; + int vtype = GE_VTYPE_POS_FLOAT;*/ + /*for (int i = 0; i < 100; ++i) { + dec.Add16(32767, 0, 32768); + } + int vtype = GE_VTYPE_POS_16BIT;*/ + for (int i = 0; i < 100; ++i) { + dec.Add8(127, 0, 128); + } + int vtype = GE_VTYPE_POS_8BIT; double yesJit = dec.ExecuteTimed(vtype, 100, true); double noJit = dec.ExecuteTimed(vtype, 100, false); - printf("Result: %f, %f, %f\n", dec.GetFloat(), dec.GetFloat(), dec.GetFloat()); + float x = dec.GetFloat(); + float y = dec.GetFloat(); + float z = dec.GetFloat(); + printf("Result: %f, %f, %f\n", x, y, z); printf("Jit was %fx faster than steps.\n\n", yesJit / noJit); - return yesJit > noJit; + bool pass = true; + for (size_t i = 0; i < ARRAY_SIZE(vertdecTestFuncs); ++i) { + if (!vertdecTestFuncs[i]()) { + pass = false; + } + } + + return pass; }