Vertex decoder JIT for x86 and x64. Handles the most common vertex formats.

This commit is contained in:
Henrik Rydgard 2013-11-03 15:27:12 +01:00
parent c4e02ab41d
commit 810b1a061f
4 changed files with 430 additions and 41 deletions

View file

@ -151,6 +151,8 @@ TransformDrawEngine::TransformDrawEngine()
memset(vbo_, 0, sizeof(vbo_));
memset(ebo_, 0, sizeof(ebo_));
indexGen.Setup(decIndex);
decJitCache_ = new VertexDecoderJitCache();
InitDeviceObjects();
register_gl_resource_holder(this);
}
@ -164,6 +166,7 @@ TransformDrawEngine::~TransformDrawEngine() {
delete [] quadIndices_;
unregister_gl_resource_holder(this);
delete decJitCache_;
for (auto iter = decoderMap_.begin(); iter != decoderMap_.end(); iter++) {
delete iter->second;
}
@ -881,7 +884,7 @@ VertexDecoder *TransformDrawEngine::GetVertexDecoder(u32 vtype) {
if (iter != decoderMap_.end())
return iter->second;
VertexDecoder *dec = new VertexDecoder();
dec->SetVertexType(vtype);
dec->SetVertexType(vtype, decJitCache_);
decoderMap_[vtype] = dec;
return dec;
}

View file

@ -162,6 +162,7 @@ private:
// Cached vertex decoders
std::map<u32, VertexDecoder *> decoderMap_;
VertexDecoder *dec_;
VertexDecoderJitCache *decJitCache_;
u32 lastVType_;
// Vertex collector buffers

View file

@ -65,6 +65,8 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
}
#endif
VertexDecoder::VertexDecoder() : coloff(0), nrmoff(0), posoff(0), jitted_(0) {}
void VertexDecoder::Step_WeightsU8() const
{
u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
@ -378,7 +380,6 @@ void VertexDecoder::Step_PosS8Through() const
v[0] = sv[0];
v[1] = sv[1];
v[2] = sv[2];
v[3] = 0;
}
void VertexDecoder::Step_PosS16Through() const
@ -388,7 +389,6 @@ void VertexDecoder::Step_PosS16Through() const
v[0] = sv[0];
v[1] = sv[1];
v[2] = sv[2];
v[3] = 0;
}
void VertexDecoder::Step_PosFloatThrough() const
@ -529,7 +529,7 @@ static const StepFunction posstep_through[4] = {
&VertexDecoder::Step_PosFloatThrough,
};
void VertexDecoder::SetVertexType(u32 fmt) {
void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
fmt_ = fmt;
throughmode = (fmt & GE_VTYPE_THROUGH) != 0;
numSteps_ = 0;
@ -556,6 +556,7 @@ void VertexDecoder::SetVertexType(u32 fmt) {
}
if (weighttype) { // && nweights?
weightoff = size;
//size = align(size, wtalign[weighttype]); unnecessary
size += wtsize[weighttype] * nweights;
if (wtalign[weighttype] > biggest)
@ -701,6 +702,11 @@ void VertexDecoder::SetVertexType(u32 fmt) {
onesize_ = size;
size *= morphcount;
DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest);
// Attempt to JIT as well
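// Compile() returns 0 when it hits a step it can't handle; DecodeVerts()
// then falls back to interpreting the step functions.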
if (jitCache) {
jitted_ = jitCache->Compile(*this);
}
}
void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
@ -708,35 +714,26 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
decoded_ = decodedptr;
ptr_ = (const u8*)verts + indexLowerBound * size;
int stride = decFmt.stride;
for (int index = indexLowerBound; index <= indexUpperBound; index++) {
for (int i = 0; i < numSteps_; i++) {
((*this).*steps_[i])();
}
ptr_ += size;
decoded_ += stride;
}
}
// TODO: Does not support morphs, skinning etc.
u32 VertexDecoder::InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const {
u32 customVertType = (gstate.vertType & ~GE_VTYPE_TC_MASK) | GE_VTYPE_TC_FLOAT;
VertexDecoder decOut;
decOut.SetVertexType(customVertType);
const u8 *inp = (const u8 *)verts;
u8 *out = decoded;
for (int i = 0; i < count; i++) {
if (pos) memcpy(out + decOut.posoff, inp + posoff, possize[pos]);
if (nrm) memcpy(out + decOut.nrmoff, inp + nrmoff, nrmsize[nrm]);
if (col) memcpy(out + decOut.coloff, inp + coloff, colsize[col]);
// Ignore others for now, this is all we need for puzbob.
// Inject!
memcpy(out + decOut.tcoff, &customuv[i * 2], tcsize[decOut.tc]);
inp += this->onesize_;
out += decOut.onesize_;
}
return customVertType;
}
int count = indexUpperBound - indexLowerBound + 1;
int stride = decFmt.stride;
if (jitted_) {
// We've compiled the steps into optimized machine code, so just jump!
jitted_(ptr_, decoded_, count);
// Do we need to update the pointers?
ptr_ += size * count;
decoded_ += stride * count;
} else {
// Interpret the decode steps
for (; count; count--) {
for (int i = 0; i < numSteps_; i++) {
((*this).*steps_[i])();
}
ptr_ += size;
decoded_ += stride;
}
}
}
int VertexDecoder::ToString(char *output) const {
@ -761,3 +758,327 @@ int VertexDecoder::ToString(char *output) const {
output += sprintf(output, " (size: %i)", VertexSize());
return output - start;
}
VertexDecoderJitCache::VertexDecoderJitCache() {
using namespace Gen;
// 32k should be enough.
AllocCodeSpace(1024 * 32);
// Add some random code to "help" MSVC's buggy disassembler :(
#if defined(_WIN32)
for (int i = 0; i < 100; i++) {
MOV(32, R(EAX), R(EBX));
RET();
}
#endif
}
#ifdef ARM
// TODO
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
return 0;
}
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
return false;
}
#elif defined(_M_X64) || defined(_M_IX86)
using namespace Gen;
#ifdef _M_X64
#ifdef _WIN32
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RCX;
static const X64Reg dstReg = RDX;
static const X64Reg counterReg = R8;
#else
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RDI;
static const X64Reg dstReg = RSI;
static const X64Reg counterReg = RDX;
#endif
#else
static const X64Reg tempReg1 = EAX;
static const X64Reg tempReg2 = EBX;
static const X64Reg tempReg3 = EDX;
static const X64Reg srcReg = ESI;
static const X64Reg dstReg = EDI;
static const X64Reg counterReg = ECX;
#endif
typedef void (VertexDecoderJitCache::*JitStepFunction)();
struct JitLookup {
StepFunction func;
JitStepFunction jitFunc;
};
// To debug, just comment them out one at a time until it works. We fall back
// on the interpreter if the compiler fails.
static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
// Todo: The compressed color formats
{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
};
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
dec_ = &dec;
const u8 *start = this->GetCodePtr();
#ifdef _M_IX86
// Store register values
PUSH(ESI);
PUSH(EDI);
PUSH(EBX);
PUSH(EBP);
// Read parameters
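// Four registers (16 bytes) were pushed above; offset 4 skips the return address.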
int offset = 4;
MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
#endif
// Let's not bother with a proper stack frame. We just grab the arguments and go.
const u8 *loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
// Reset the code ptr and return zero to indicate that we failed.
SetCodePtr(const_cast<u8 *>(start));
return 0;
}
}
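// Advance the pointers by one source / destination vertex, then loop until the counter hits zero.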
#ifdef _M_X64
ADD(64, R(srcReg), Imm32(dec.VertexSize()));
ADD(64, R(dstReg), Imm32(dec.decFmt.stride));
#else
ADD(32, R(srcReg), Imm32(dec.VertexSize()));
ADD(32, R(dstReg), Imm32(dec.decFmt.stride));
#endif
SUB(32, R(counterReg), Imm8(1));
J_CC(CC_NZ, loopStart);
#ifdef _M_IX86
// Restore register values
POP(EBP);
POP(EBX);
POP(EDI);
POP(ESI);
#endif
RET();
return (JittedVertexDecoder)start;
}
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
// See if we find a matching JIT function
for (int i = 0; i < ARRAY_SIZE(jitLookup); i++) {
if (dec.steps_[step] == jitLookup[i].func) {
((*this).*jitLookup[i].jitFunc)();
return true;
}
}
return false;
}
void VertexDecoderJitCache::Jit_WeightsU8() {
// Basic implementation - a byte at a time. TODO: Optimize
int j;
for (j = 0; j < dec_->nweights; j++) {
MOV(8, R(tempReg1), MDisp(srcReg, dec_->weightoff + j));
MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), R(tempReg1));
}
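// Zero additional weight bytes, rounding the count up to a multiple of 4.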
while (j & 3) {
MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), Imm8(0));
j++;
}
}
void VertexDecoderJitCache::Jit_WeightsU16() {
// Basic implementation - a short at a time. TODO: Optimize
int j;
for (j = 0; j < dec_->nweights; j++) {
MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
}
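// Zero additional weight shorts, rounding the count up to a multiple of 4.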
while (j & 3) {
MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
j++;
}
}
void VertexDecoderJitCache::Jit_WeightsFloat() {
int j;
for (j = 0; j < dec_->nweights; j++) {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
}
while (j & 3) { // Zero additional weights rounding up to 4.
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
j++;
}
}
// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy.
void VertexDecoderJitCache::Jit_TcU8() {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
void VertexDecoderJitCache::Jit_TcU16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
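// Two floats (8 bytes): one 64-bit copy on x64, two 32-bit copies on x86.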
void VertexDecoderJitCache::Jit_TcFloat() {
#ifdef _M_X64
MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
#else
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
#endif
}
void VertexDecoderJitCache::Jit_TcU16Through() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
void VertexDecoderJitCache::Jit_TcFloatThrough() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
}
void VertexDecoderJitCache::Jit_Color8888() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
}
void VertexDecoderJitCache::Jit_Color4444() {
/*
MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
MOV(32, R(tempReg2), R(tempReg1));
MOV(32, R(tempReg3), R(tempReg2));
AND(32, R(tempReg3), Imm8(0xF)); // t3 =
*/
// TODO
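// One possible scalar route (sketch only, untested): spread each nibble of
// the 16-bit color into its own byte, then replicate the nibble within the byte:
//   u32 x = (c & 0x000F) | ((c & 0x00F0) << 4) | ((c & 0x0F00) << 8) | ((c & 0xF000) << 12);
//   x |= x << 4;  // each nibble n becomes (n << 4) | n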
}
void VertexDecoderJitCache::Jit_Color565() {
// TODO
}
void VertexDecoderJitCache::Jit_Color5551() {
// TODO
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_NormalS8() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
}
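// Just copy 12 bytes; float normals pass through unchanged.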
void VertexDecoderJitCache::Jit_NormalFloat() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4));
MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
}
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
// TODO: SIMD
for (int i = 0; i < 3; i++) {
MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
CVTSI2SS(XMM0, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
}
}
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
// TODO: SIMD
for (int i = 0; i < 3; i++) {
MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + i * 2));
CVTSI2SS(XMM0, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
}
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
}
// Just copy 12 bytes.
void VertexDecoderJitCache::Jit_PosFloat() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4));
MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
}
#endif

View file

@ -17,16 +17,27 @@
#pragma once
#include "GPU/GPUState.h"
#include "Globals.h"
#include "base/basictypes.h"
#ifdef ARM
#include "Common/ArmEmitter.h"
#else
#include "Common/x64Emitter.h"
#endif
#include "Globals.h"
#include "Core/Reporting.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"
class VertexDecoder;
class VertexDecoderJitCache;
typedef void (VertexDecoder::*StepFunction)() const;
typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
// Right now
// - compiles into list of called functions
// Future TODO
@ -34,20 +45,17 @@ typedef void (VertexDecoder::*StepFunction)() const;
class VertexDecoder
{
public:
VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {}
~VertexDecoder() {}
VertexDecoder();
// A jit cache is not mandatory, we don't use it in the sw renderer
void SetVertexType(u32 vtype, VertexDecoderJitCache *jitCache = 0);
// prim is needed knowledge for a performance hack (PrescaleUV)
void SetVertexType(u32 vtype);
u32 VertexType() const { return fmt_; }
const DecVtxFormat &GetDecVtxFmt() { return decFmt; }
void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
// This could be easily generalized to inject any one component. Don't know another use for it though.
u32 InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const;
bool hasColor() const { return col != 0; }
int VertexSize() const { return size; } // PSP format size
@ -68,8 +76,6 @@ public:
void Step_TcU16ThroughDouble() const;
void Step_TcFloatThrough() const;
// TODO: tcmorph
void Step_Color4444() const;
void Step_Color565() const;
void Step_Color5551() const;
@ -147,4 +153,62 @@ public:
int nweights;
int stats_[NUM_VERTEX_DECODER_STATS];
JittedVertexDecoder jitted_;
friend class VertexDecoderJitCache;
};
// A compiled vertex decoder takes the following arguments (C calling convention):
// u8 *src, u8 *dst, int count
//
// x86:
// src is placed in esi and dst in edi
// for every vertex, we step esi and edi forwards by the two vertex sizes
// all movs are done relative to esi and edi
//
// that's it!
#ifdef ARM
class VertexDecoderJitCache : public ARMXCodeBlock {
#else
class VertexDecoderJitCache : public Gen::XCodeBlock {
#endif
public:
VertexDecoderJitCache();
// Returns a pointer to the code to run.
JittedVertexDecoder Compile(const VertexDecoder &dec);
void Jit_WeightsU8();
void Jit_WeightsU16();
void Jit_WeightsFloat();
void Jit_TcU8();
void Jit_TcU16();
void Jit_TcFloat();
void Jit_TcU16Through();
void Jit_TcFloatThrough();
void Jit_Color8888();
void Jit_Color4444();
void Jit_Color565();
void Jit_Color5551();
void Jit_NormalS8();
void Jit_NormalS16();
void Jit_NormalFloat();
void Jit_PosS8();
void Jit_PosS8Through();
void Jit_PosS16();
void Jit_PosS16Through();
void Jit_PosFloat();
private:
bool CompileStep(const VertexDecoder &dec, int i);
const VertexDecoder *dec_;
};