Vertex decoder JIT for x86 and x64. Handles the most common vertex formats.

This commit is contained in:
Henrik Rydgard 2013-11-03 15:27:12 +01:00
parent c4e02ab41d
commit 810b1a061f
4 changed files with 430 additions and 41 deletions

View file

@ -151,6 +151,8 @@ TransformDrawEngine::TransformDrawEngine()
memset(vbo_, 0, sizeof(vbo_));
memset(ebo_, 0, sizeof(ebo_));
indexGen.Setup(decIndex);
decJitCache_ = new VertexDecoderJitCache();
InitDeviceObjects();
register_gl_resource_holder(this);
}
@ -164,6 +166,7 @@ TransformDrawEngine::~TransformDrawEngine() {
delete [] quadIndices_;
unregister_gl_resource_holder(this);
delete decJitCache_;
for (auto iter = decoderMap_.begin(); iter != decoderMap_.end(); iter++) {
delete iter->second;
}
@ -881,7 +884,7 @@ VertexDecoder *TransformDrawEngine::GetVertexDecoder(u32 vtype) {
if (iter != decoderMap_.end())
return iter->second;
VertexDecoder *dec = new VertexDecoder();
dec->SetVertexType(vtype);
dec->SetVertexType(vtype, decJitCache_);
decoderMap_[vtype] = dec;
return dec;
}

View file

@ -162,6 +162,7 @@ private:
// Cached vertex decoders
std::map<u32, VertexDecoder *> decoderMap_;
VertexDecoder *dec_;
VertexDecoderJitCache *decJitCache_;
u32 lastVType_;
// Vertex collector buffers

View file

@ -65,6 +65,8 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
}
#endif
VertexDecoder::VertexDecoder() : coloff(0), nrmoff(0), posoff(0), jitted_(0) {}
void VertexDecoder::Step_WeightsU8() const
{
u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
@ -378,7 +380,6 @@ void VertexDecoder::Step_PosS8Through() const
v[0] = sv[0];
v[1] = sv[1];
v[2] = sv[2];
v[3] = 0;
}
void VertexDecoder::Step_PosS16Through() const
@ -388,7 +389,6 @@ void VertexDecoder::Step_PosS16Through() const
v[0] = sv[0];
v[1] = sv[1];
v[2] = sv[2];
v[3] = 0;
}
void VertexDecoder::Step_PosFloatThrough() const
@ -529,7 +529,7 @@ static const StepFunction posstep_through[4] = {
&VertexDecoder::Step_PosFloatThrough,
};
void VertexDecoder::SetVertexType(u32 fmt) {
void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
fmt_ = fmt;
throughmode = (fmt & GE_VTYPE_THROUGH) != 0;
numSteps_ = 0;
@ -556,6 +556,7 @@ void VertexDecoder::SetVertexType(u32 fmt) {
}
if (weighttype) { // && nweights?
weightoff = size;
//size = align(size, wtalign[weighttype]); unnecessary
size += wtsize[weighttype] * nweights;
if (wtalign[weighttype] > biggest)
@ -701,6 +702,11 @@ void VertexDecoder::SetVertexType(u32 fmt) {
onesize_ = size;
size *= morphcount;
DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest);
// Attempt to JIT as well
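// Compile() returns 0 when it hits a step it can't handle; DecodeVerts()
// then falls back to interpreting the step functions.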
if (jitCache) {
jitted_ = jitCache->Compile(*this);
}
}
void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
@ -708,35 +714,26 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
decoded_ = decodedptr;
ptr_ = (const u8*)verts + indexLowerBound * size;
int stride = decFmt.stride;
for (int index = indexLowerBound; index <= indexUpperBound; index++) {
for (int i = 0; i < numSteps_; i++) {
((*this).*steps_[i])();
}
ptr_ += size;
decoded_ += stride;
}
}
// TODO: Does not support morphs, skinning etc.
u32 VertexDecoder::InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const {
u32 customVertType = (gstate.vertType & ~GE_VTYPE_TC_MASK) | GE_VTYPE_TC_FLOAT;
VertexDecoder decOut;
decOut.SetVertexType(customVertType);
const u8 *inp = (const u8 *)verts;
u8 *out = decoded;
for (int i = 0; i < count; i++) {
if (pos) memcpy(out + decOut.posoff, inp + posoff, possize[pos]);
if (nrm) memcpy(out + decOut.nrmoff, inp + nrmoff, nrmsize[nrm]);
if (col) memcpy(out + decOut.coloff, inp + coloff, colsize[col]);
// Ignore others for now, this is all we need for puzbob.
// Inject!
memcpy(out + decOut.tcoff, &customuv[i * 2], tcsize[decOut.tc]);
inp += this->onesize_;
out += decOut.onesize_;
}
return customVertType;
}
int count = indexUpperBound - indexLowerBound + 1;
int stride = decFmt.stride;
if (jitted_) {
// We've compiled the steps into optimized machine code, so just jump!
jitted_(ptr_, decoded_, count);
// Do we need to update the pointers?
ptr_ += size * count;
decoded_ += stride * count;
} else {
// Interpret the decode steps
for (; count; count--) {
for (int i = 0; i < numSteps_; i++) {
((*this).*steps_[i])();
}
ptr_ += size;
decoded_ += stride;
}
}
}
int VertexDecoder::ToString(char *output) const {
@ -761,3 +758,327 @@ int VertexDecoder::ToString(char *output) const {
output += sprintf(output, " (size: %i)", VertexSize());
return output - start;
}
VertexDecoderJitCache::VertexDecoderJitCache() {
using namespace Gen;
// 32k should be enough.
AllocCodeSpace(1024 * 32);
// Add some random code to "help" MSVC's buggy disassembler :(
#if defined(_WIN32)
for (int i = 0; i < 100; i++) {
MOV(32, R(EAX), R(EBX));
RET();
}
#endif
}
#ifdef ARM
// TODO
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
return 0;
}
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
return false;
}
#elif defined(_M_X64) || defined(_M_IX86)
using namespace Gen;
#ifdef _M_X64
#ifdef _WIN32
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RCX;
static const X64Reg dstReg = RDX;
static const X64Reg counterReg = R8;
#else
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RDI;
static const X64Reg dstReg = RSI;
static const X64Reg counterReg = RDX;
#endif
#else
static const X64Reg tempReg1 = EAX;
static const X64Reg tempReg2 = EBX;
static const X64Reg tempReg3 = EDX;
static const X64Reg srcReg = ESI;
static const X64Reg dstReg = EDI;
static const X64Reg counterReg = ECX;
#endif
typedef void (VertexDecoderJitCache::*JitStepFunction)();
struct JitLookup {
StepFunction func;
JitStepFunction jitFunc;
};
// To debug, just comment them out one at a time until it works. We fall back
// on the interpreter if the compiler fails.
static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},
{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},
{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},
{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},
{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
// Todo: The compressed color formats
{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},
{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
};
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
dec_ = &dec;
const u8 *start = this->GetCodePtr();
#ifdef _M_IX86
// Store register values
PUSH(ESI);
PUSH(EDI);
PUSH(EBX);
PUSH(EBP);
// Read parameters
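// Four registers (16 bytes) were pushed above; offset 4 skips the return address.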
int offset = 4;
MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));
#endif
// Let's not bother with a proper stack frame. We just grab the arguments and go.
const u8 *loopStart = GetCodePtr();
for (int i = 0; i < dec.numSteps_; i++) {
if (!CompileStep(dec, i)) {
// Reset the code ptr and return zero to indicate that we failed.
SetCodePtr(const_cast<u8 *>(start));
return 0;
}
}
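// Advance the pointers by one source / destination vertex, then loop until the counter hits zero.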
#ifdef _M_X64
ADD(64, R(srcReg), Imm32(dec.VertexSize()));
ADD(64, R(dstReg), Imm32(dec.decFmt.stride));
#else
ADD(32, R(srcReg), Imm32(dec.VertexSize()));
ADD(32, R(dstReg), Imm32(dec.decFmt.stride));
#endif
SUB(32, R(counterReg), Imm8(1));
J_CC(CC_NZ, loopStart);
#ifdef _M_IX86
// Restore register values
POP(EBP);
POP(EBX);
POP(EDI);
POP(ESI);
#endif
RET();
return (JittedVertexDecoder)start;
}
bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
// See if we find a matching JIT function
for (int i = 0; i < ARRAY_SIZE(jitLookup); i++) {
if (dec.steps_[step] == jitLookup[i].func) {
((*this).*jitLookup[i].jitFunc)();
return true;
}
}
return false;
}
void VertexDecoderJitCache::Jit_WeightsU8() {
// Basic implementation - a byte at a time. TODO: Optimize
int j;
for (j = 0; j < dec_->nweights; j++) {
MOV(8, R(tempReg1), MDisp(srcReg, dec_->weightoff + j));
MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), R(tempReg1));
}
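// Zero additional weight bytes, rounding the count up to a multiple of 4.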
while (j & 3) {
MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), Imm8(0));
j++;
}
}
void VertexDecoderJitCache::Jit_WeightsU16() {
// Basic implementation - a short at a time. TODO: Optimize
int j;
for (j = 0; j < dec_->nweights; j++) {
MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
}
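// Zero additional weight shorts, rounding the count up to a multiple of 4.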
while (j & 3) {
MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm16(0));
j++;
}
}
void VertexDecoderJitCache::Jit_WeightsFloat() {
int j;
for (j = 0; j < dec_->nweights; j++) {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
}
while (j & 3) { // Zero additional weights rounding up to 4.
MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
j++;
}
}
// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy.
void VertexDecoderJitCache::Jit_TcU8() {
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
void VertexDecoderJitCache::Jit_TcU16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
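// Two floats (8 bytes): one 64-bit copy on x64, two 32-bit copies on x86.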
void VertexDecoderJitCache::Jit_TcFloat() {
#ifdef _M_X64
MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
#else
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
#endif
}
void VertexDecoderJitCache::Jit_TcU16Through() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}
void VertexDecoderJitCache::Jit_TcFloatThrough() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
}
void VertexDecoderJitCache::Jit_Color8888() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
}
void VertexDecoderJitCache::Jit_Color4444() {
/*
MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
MOV(32, R(tempReg2), R(tempReg1));
MOV(32, R(tempReg3), R(tempReg2));
AND(32, R(tempReg3), Imm8(0xF)); // t3 =
*/
// TODO
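// One possible scalar route (sketch only, untested): spread each nibble of
// the 16-bit color into its own byte, then replicate the nibble within the byte:
//   u32 x = (c & 0x000F) | ((c & 0x00F0) << 4) | ((c & 0x0F00) << 8) | ((c & 0xF000) << 12);
//   x |= x << 4;  // each nibble n becomes (n << 4) | n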
}
void VertexDecoderJitCache::Jit_Color565() {
// TODO
}
void VertexDecoderJitCache::Jit_Color5551() {
// TODO
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_NormalS8() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
}
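// Just copy 12 bytes; float normals pass through unchanged.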
void VertexDecoderJitCache::Jit_NormalFloat() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4));
MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
}
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
// TODO: SIMD
for (int i = 0; i < 3; i++) {
MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
CVTSI2SS(XMM0, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
}
}
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
// TODO: SIMD
for (int i = 0; i < 3; i++) {
MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + i * 2));
CVTSI2SS(XMM0, R(tempReg1));
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
}
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
}
// Just copy 12 bytes.
void VertexDecoderJitCache::Jit_PosFloat() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4));
MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
}
#endif

View file

@ -17,16 +17,27 @@
#pragma once
#include "GPU/GPUState.h"
#include "Globals.h"
#include "base/basictypes.h"
#ifdef ARM
#include "Common/ArmEmitter.h"
#else
#include "Common/x64Emitter.h"
#endif
#include "Globals.h"
#include "Core/Reporting.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"
class VertexDecoder;
class VertexDecoderJitCache;
typedef void (VertexDecoder::*StepFunction)() const;
typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);
// Right now
// - compiles into list of called functions
// Future TODO
@ -34,20 +45,17 @@ typedef void (VertexDecoder::*StepFunction)() const;
class VertexDecoder
{
public:
VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {}
~VertexDecoder() {}
VertexDecoder();
// A jit cache is not mandatory, we don't use it in the sw renderer
void SetVertexType(u32 vtype, VertexDecoderJitCache *jitCache = 0);
// prim is needed knowledge for a performance hack (PrescaleUV)
void SetVertexType(u32 vtype);
u32 VertexType() const { return fmt_; }
const DecVtxFormat &GetDecVtxFmt() { return decFmt; }
void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;
// This could be easily generalized to inject any one component. Don't know another use for it though.
u32 InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const;
bool hasColor() const { return col != 0; }
int VertexSize() const { return size; } // PSP format size
@ -68,8 +76,6 @@ public:
void Step_TcU16ThroughDouble() const;
void Step_TcFloatThrough() const;
// TODO: tcmorph
void Step_Color4444() const;
void Step_Color565() const;
void Step_Color5551() const;
@ -147,4 +153,62 @@ public:
int nweights;
int stats_[NUM_VERTEX_DECODER_STATS];
JittedVertexDecoder jitted_;
friend class VertexDecoderJitCache;
};
// A compiled vertex decoder takes the following arguments (C calling convention):
// u8 *src, u8 *dst, int count
//
// x86:
// src is placed in esi and dst in edi
// for every vertex, we step esi and edi forwards by the two vertex sizes
// all movs are done relative to esi and edi
//
// that's it!
#ifdef ARM
class VertexDecoderJitCache : public ARMXCodeBlock {
#else
class VertexDecoderJitCache : public Gen::XCodeBlock {
#endif
public:
VertexDecoderJitCache();
// Returns a pointer to the code to run.
JittedVertexDecoder Compile(const VertexDecoder &dec);
void Jit_WeightsU8();
void Jit_WeightsU16();
void Jit_WeightsFloat();
void Jit_TcU8();
void Jit_TcU16();
void Jit_TcFloat();
void Jit_TcU16Through();
void Jit_TcFloatThrough();
void Jit_Color8888();
void Jit_Color4444();
void Jit_Color565();
void Jit_Color5551();
void Jit_NormalS8();
void Jit_NormalS16();
void Jit_NormalFloat();
void Jit_PosS8();
void Jit_PosS8Through();
void Jit_PosS16();
void Jit_PosS16Through();
void Jit_PosFloat();
private:
bool CompileStep(const VertexDecoder &dec, int i);
const VertexDecoder *dec_;
};