Vertex decoder JIT for x86 and x64. Handles the most common vertex formats.
parent c4e02ab41d
commit 810b1a061f
4 changed files with 430 additions and 41 deletions
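For orientation before the diffs: TransformDrawEngine now owns a VertexDecoderJitCache and hands it to every VertexDecoder it creates, SetVertexType() asks the cache to compile the decoder's step list, and DecodeVerts() runs the compiled routine when compilation succeeded, falling back to interpreting the steps otherwise. The standalone sketch below shows only that dispatch shape; it is not PPSSPP code, and all names, strides and sizes in it are made up for the example.

#include <cstdint>
#include <cstring>
#include <cstdio>

// Stand-in for a compiled routine; same contract as JittedVertexDecoder.
typedef void (*FusedDecodeFn)(const uint8_t *src, uint8_t *dst, int count);

struct ToyDecoder {
	int srcSize;            // size of one source vertex
	int dstStride;          // size of one decoded vertex
	FusedDecodeFn jitted;   // null means "no compiled routine, interpret"

	void DecodeVerts(const uint8_t *src, uint8_t *dst, int count) const {
		if (jitted) {
			jitted(src, dst, count);       // one call decodes the whole batch
		} else {
			for (; count; count--) {       // interpreted fallback, one step here
				memcpy(dst, src, 12);      // pretend the only step copies a float3
				src += srcSize;
				dst += dstStride;
			}
		}
	}
};

// What a "compiled" routine would be equivalent to for this toy format.
static void FusedPosOnly(const uint8_t *src, uint8_t *dst, int count) {
	for (; count; count--) {
		memcpy(dst, src, 12);
		src += 16;
		dst += 12;
	}
}

int main() {
	uint8_t src[16 * 4] = {};
	uint8_t dst[12 * 4] = {};
	ToyDecoder dec = { 16, 12, nullptr };
	dec.DecodeVerts(src, dst, 4);   // interpreter path
	dec.jitted = FusedPosOnly;
	dec.DecodeVerts(src, dst, 4);   // compiled path
	printf("decoded 4 vertices both ways\n");
	return 0;
}
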
@@ -151,6 +151,8 @@ TransformDrawEngine::TransformDrawEngine()
	memset(vbo_, 0, sizeof(vbo_));
	memset(ebo_, 0, sizeof(ebo_));
	indexGen.Setup(decIndex);
	decJitCache_ = new VertexDecoderJitCache();

	InitDeviceObjects();
	register_gl_resource_holder(this);
}
@@ -164,6 +166,7 @@ TransformDrawEngine::~TransformDrawEngine() {
	delete [] quadIndices_;

	unregister_gl_resource_holder(this);
	delete decJitCache_;
	for (auto iter = decoderMap_.begin(); iter != decoderMap_.end(); iter++) {
		delete iter->second;
	}
@@ -881,7 +884,7 @@ VertexDecoder *TransformDrawEngine::GetVertexDecoder(u32 vtype) {
	if (iter != decoderMap_.end())
		return iter->second;
	VertexDecoder *dec = new VertexDecoder();
	dec->SetVertexType(vtype);
	dec->SetVertexType(vtype, decJitCache_);
	decoderMap_[vtype] = dec;
	return dec;
}

@@ -162,6 +162,7 @@ private:
	// Cached vertex decoders
	std::map<u32, VertexDecoder *> decoderMap_;
	VertexDecoder *dec_;
	VertexDecoderJitCache *decJitCache_;
	u32 lastVType_;

	// Vertex collector buffers

@@ -65,6 +65,8 @@ DecVtxFormat GetTransformedVtxFormat(const DecVtxFormat &fmt) {
}
#endif

VertexDecoder::VertexDecoder() : coloff(0), nrmoff(0), posoff(0), jitted_(0) {}

void VertexDecoder::Step_WeightsU8() const
{
	u8 *wt = (u8 *)(decoded_ + decFmt.w0off);
@@ -378,7 +380,6 @@ void VertexDecoder::Step_PosS8Through() const
	v[0] = sv[0];
	v[1] = sv[1];
	v[2] = sv[2];
	v[3] = 0;
}

void VertexDecoder::Step_PosS16Through() const
@@ -388,7 +389,6 @@ void VertexDecoder::Step_PosS16Through() const
	v[0] = sv[0];
	v[1] = sv[1];
	v[2] = sv[2];
	v[3] = 0;
}

void VertexDecoder::Step_PosFloatThrough() const
@@ -529,7 +529,7 @@ static const StepFunction posstep_through[4] = {
	&VertexDecoder::Step_PosFloatThrough,
};

void VertexDecoder::SetVertexType(u32 fmt) {
void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
	fmt_ = fmt;
	throughmode = (fmt & GE_VTYPE_THROUGH) != 0;
	numSteps_ = 0;
@@ -556,6 +556,7 @@ void VertexDecoder::SetVertexType(u32 fmt) {
	}

	if (weighttype) { // && nweights?
		weightoff = size;
		//size = align(size, wtalign[weighttype]); unnecessary
		size += wtsize[weighttype] * nweights;
		if (wtalign[weighttype] > biggest)
@@ -701,6 +702,11 @@ void VertexDecoder::SetVertexType(u32 fmt) {
	onesize_ = size;
	size *= morphcount;
	DEBUG_LOG(G3D,"SVT : size = %i, aligned to biggest %i", size, biggest);

	// Attempt to JIT as well
	if (jitCache) {
		jitted_ = jitCache->Compile(*this);
	}
}

void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowerBound, int indexUpperBound) const {
@@ -708,35 +714,26 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe
	// decoded_ and ptr_ are used in the steps, so can't be turned into locals for speed.
	decoded_ = decodedptr;
	ptr_ = (const u8*)verts + indexLowerBound * size;
	int stride = decFmt.stride;
	for (int index = indexLowerBound; index <= indexUpperBound; index++) {
		for (int i = 0; i < numSteps_; i++) {
			((*this).*steps_[i])();
		}
		ptr_ += size;
		decoded_ += stride;
	}
}

// TODO: Does not support morphs, skinning etc.
u32 VertexDecoder::InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const {
	u32 customVertType = (gstate.vertType & ~GE_VTYPE_TC_MASK) | GE_VTYPE_TC_FLOAT;
	VertexDecoder decOut;
	decOut.SetVertexType(customVertType);
	int count = indexUpperBound - indexLowerBound + 1;
	int stride = decFmt.stride;
	if (jitted_) {
		// We've compiled the steps into optimized machine code, so just jump!
		jitted_(ptr_, decoded_, count);

	const u8 *inp = (const u8 *)verts;
	u8 *out = decoded;
	for (int i = 0; i < count; i++) {
		if (pos) memcpy(out + decOut.posoff, inp + posoff, possize[pos]);
		if (nrm) memcpy(out + decOut.nrmoff, inp + nrmoff, nrmsize[nrm]);
		if (col) memcpy(out + decOut.coloff, inp + coloff, colsize[col]);
		// Ignore others for now, this is all we need for puzbob.
		// Inject!
		memcpy(out + decOut.tcoff, &customuv[i * 2], tcsize[decOut.tc]);
		inp += this->onesize_;
		out += decOut.onesize_;
		// Do we need to update the pointers?
		ptr_ += size * count;
		decoded_ += stride * count;
	} else {
		// Interpret the decode steps
		for (; count; count--) {
			for (int i = 0; i < numSteps_; i++) {
				((*this).*steps_[i])();
			}
			ptr_ += size;
			decoded_ += stride;
		}
	}
	return customVertType;
}

int VertexDecoder::ToString(char *output) const {
@@ -761,3 +758,327 @@ int VertexDecoder::ToString(char *output) const {
	output += sprintf(output, " (size: %i)", VertexSize());
	return output - start;
}

VertexDecoderJitCache::VertexDecoderJitCache() {
	using namespace Gen;
	// 32k should be enough.
	AllocCodeSpace(1024 * 32);

	// Add some random code to "help" MSVC's buggy disassembler :(
#if defined(_WIN32)
	for (int i = 0; i < 100; i++) {
		MOV(32, R(EAX), R(EBX));
		RET();
	}
#endif
}

#ifdef ARM

// TODO

JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
	return 0;
}

bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
	return false;
}

#elif defined(_M_X64) || defined(_M_IX86)

using namespace Gen;

#ifdef _M_X64
#ifdef _WIN32
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RCX;
static const X64Reg dstReg = RDX;
static const X64Reg counterReg = R8;
#else
static const X64Reg tempReg1 = RAX;
static const X64Reg tempReg2 = R9;
static const X64Reg tempReg3 = R10;
static const X64Reg srcReg = RDI;
static const X64Reg dstReg = RSI;
static const X64Reg counterReg = RDX;
#endif
#else
static const X64Reg tempReg1 = EAX;
static const X64Reg tempReg2 = EBX;
static const X64Reg tempReg3 = EDX;
static const X64Reg srcReg = ESI;
static const X64Reg dstReg = EDI;
static const X64Reg counterReg = ECX;
#endif

typedef void (VertexDecoderJitCache::*JitStepFunction)();

struct JitLookup {
	StepFunction func;
	JitStepFunction jitFunc;
};

// To debug, just comment them out one at a time until it works. We fall back
// on the interpreter if the compiler fails.

static const JitLookup jitLookup[] = {
	{&VertexDecoder::Step_WeightsU8, &VertexDecoderJitCache::Jit_WeightsU8},
	{&VertexDecoder::Step_WeightsU16, &VertexDecoderJitCache::Jit_WeightsU16},
	{&VertexDecoder::Step_WeightsFloat, &VertexDecoderJitCache::Jit_WeightsFloat},

	{&VertexDecoder::Step_TcU8, &VertexDecoderJitCache::Jit_TcU8},
	{&VertexDecoder::Step_TcU16, &VertexDecoderJitCache::Jit_TcU16},
	{&VertexDecoder::Step_TcFloat, &VertexDecoderJitCache::Jit_TcFloat},

	{&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through},
	{&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough},

	{&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8},
	{&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16},
	{&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat},

	{&VertexDecoder::Step_Color8888, &VertexDecoderJitCache::Jit_Color8888},
	// Todo: The compressed color formats

	{&VertexDecoder::Step_PosS8Through, &VertexDecoderJitCache::Jit_PosS8Through},
	{&VertexDecoder::Step_PosS16Through, &VertexDecoderJitCache::Jit_PosS16Through},
	{&VertexDecoder::Step_PosFloatThrough, &VertexDecoderJitCache::Jit_PosFloat},

	{&VertexDecoder::Step_PosS8, &VertexDecoderJitCache::Jit_PosS8},
	{&VertexDecoder::Step_PosS16, &VertexDecoderJitCache::Jit_PosS16},
	{&VertexDecoder::Step_PosFloat, &VertexDecoderJitCache::Jit_PosFloat},
};

JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
	dec_ = &dec;
	const u8 *start = this->GetCodePtr();

#ifdef _M_IX86
	// Store register values
	PUSH(ESI);
	PUSH(EDI);
	PUSH(EBX);
	PUSH(EBP);

	// Read parameters
	int offset = 4;
	MOV(32, R(srcReg), MDisp(ESP, 16 + offset + 0));
	MOV(32, R(dstReg), MDisp(ESP, 16 + offset + 4));
	MOV(32, R(counterReg), MDisp(ESP, 16 + offset + 8));

#endif

	// Let's not bother with a proper stack frame. We just grab the arguments and go.
	JumpTarget loopStart = GetCodePtr();
	for (int i = 0; i < dec.numSteps_; i++) {
		if (!CompileStep(dec, i)) {
			// Reset the code ptr and return zero to indicate that we failed.
			SetCodePtr(const_cast<u8 *>(start));
			return 0;
		}
	}

#ifdef _M_X64
	ADD(64, R(srcReg), Imm32(dec.VertexSize()));
	ADD(64, R(dstReg), Imm32(dec.decFmt.stride));
#else
	ADD(32, R(srcReg), Imm32(dec.VertexSize()));
	ADD(32, R(dstReg), Imm32(dec.decFmt.stride));
#endif
	SUB(32, R(counterReg), Imm8(1));
	J_CC(CC_NZ, loopStart);

#ifdef _M_IX86
	// Restore register values
	POP(EBP);
	POP(EBX);
	POP(EDI);
	POP(ESI);
#endif

	RET();

	return (JittedVertexDecoder)start;
}

bool VertexDecoderJitCache::CompileStep(const VertexDecoder &dec, int step) {
	// See if we find a matching JIT function
	for (int i = 0; i < ARRAY_SIZE(jitLookup); i++) {
		if (dec.steps_[step] == jitLookup[i].func) {
			((*this).*jitLookup[i].jitFunc)();
			return true;
		}
	}
	return false;
}

void VertexDecoderJitCache::Jit_WeightsU8() {
	// Basic implementation - a byte at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		MOV(8, R(tempReg1), MDisp(srcReg, dec_->weightoff + j));
		MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), R(tempReg1));
	}
	while (j & 3) {
		MOV(8, MDisp(dstReg, dec_->decFmt.w0off + j), Imm8(0));
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsU16() {
	// Basic implementation - a short at a time. TODO: Optimize
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		MOV(16, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 2));
		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), R(tempReg1));
	}
	while (j & 3) {
		MOV(16, MDisp(dstReg, dec_->decFmt.w0off + j * 2), Imm8(0));
		j++;
	}
}

void VertexDecoderJitCache::Jit_WeightsFloat() {
	int j;
	for (j = 0; j < dec_->nweights; j++) {
		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
	}
	while (j & 3) { // Zero additional weights rounding up to 4.
		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
		j++;
	}
}

// Fill last two bytes with zeroes to align to 4 bytes. MOVZX does it for us, handy.
void VertexDecoderJitCache::Jit_TcU8() {
	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->tcoff));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}

void VertexDecoderJitCache::Jit_TcU16() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}

void VertexDecoderJitCache::Jit_TcFloat() {
#ifdef _M_X64
	MOV(64, R(tempReg1), MDisp(srcReg, dec_->tcoff));
	MOV(64, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
#else
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
	MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
#endif
}

void VertexDecoderJitCache::Jit_TcU16Through() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
}

void VertexDecoderJitCache::Jit_TcFloatThrough() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff));
	MOV(32, R(tempReg2), MDisp(srcReg, dec_->tcoff + 4));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1));
	MOV(32, MDisp(dstReg, dec_->decFmt.uvoff + 4), R(tempReg2));
}

void VertexDecoderJitCache::Jit_Color8888() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
}

void VertexDecoderJitCache::Jit_Color4444() {
	/*
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff));
	MOV(32, R(tempReg2), R(tempReg1));
	MOV(32, R(tempReg3), R(tempReg2));
	AND(32, R(tempReg3), Imm8(0xF)); // t3 =
	*/

	// TODO
}

void VertexDecoderJitCache::Jit_Color565() {
	// TODO
}

void VertexDecoderJitCache::Jit_Color5551() {
	// TODO
}

// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_NormalS8() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
	AND(32, R(tempReg1), Imm32(0x00FFFFFF));
	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
}

// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_NormalS16() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->nrmoff + 4));
	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
}

void VertexDecoderJitCache::Jit_NormalFloat() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff));
	MOV(32, R(tempReg2), MDisp(srcReg, dec_->nrmoff + 4));
	MOV(32, R(tempReg3), MDisp(srcReg, dec_->nrmoff + 8));
	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1));
	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 4), R(tempReg2));
	MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff + 8), R(tempReg3));
}

// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
	// TODO: SIMD
	for (int i = 0; i < 3; i++) {
		MOVSX(32, 8, tempReg1, MDisp(srcReg, dec_->posoff + i));
		CVTSI2SS(XMM0, R(tempReg1));
		MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
	}
}

// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
	// TODO: SIMD
	for (int i = 0; i < 3; i++) {
		MOVSX(32, 16, tempReg1, MDisp(srcReg, dec_->posoff + i * 2));
		CVTSI2SS(XMM0, R(tempReg1));
		MOVSS(MDisp(dstReg, dec_->decFmt.posoff + i * 4), XMM0);
	}
}

// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
	AND(32, R(tempReg1), Imm32(0x00FFFFFF));
	MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
}

// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
	MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
	MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
}

// Just copy 12 bytes.
void VertexDecoderJitCache::Jit_PosFloat() {
	MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
	MOV(32, R(tempReg2), MDisp(srcReg, dec_->posoff + 4));
	MOV(32, R(tempReg3), MDisp(srcReg, dec_->posoff + 8));
	MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
	MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
	MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 8), R(tempReg3));
}

#endif

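Compile() above is table-driven: CompileStep() looks each interpreter step function up in jitLookup and invokes the matching Jit_ emitter, and if any step of the current format has no entry, compilation is abandoned (the code pointer is reset and 0 is returned) so the decoder keeps interpreting. Below is a standalone sketch of just that fallback logic; the names and the string stand-ins are illustrative, and no real code generation happens here.

#include <cstdio>
#include <vector>

enum Step { StepPosFloat, StepColor8888, StepColor5551 };

struct JitLookupEntry {
	Step step;
	const char *emit;   // stands in for a pointer-to-member emitter
};

static const JitLookupEntry jitLookup[] = {
	{ StepPosFloat,  "Jit_PosFloat" },
	{ StepColor8888, "Jit_Color8888" },
	// StepColor5551 has no emitter yet, so formats using it stay interpreted.
};

static bool Compile(const std::vector<Step> &steps) {
	for (Step s : steps) {
		bool matched = false;
		for (const JitLookupEntry &e : jitLookup) {
			if (e.step == s) {
				printf("emitting %s\n", e.emit);
				matched = true;
				break;
			}
		}
		if (!matched)
			return false;   // like Compile() resetting the code ptr and returning 0
	}
	return true;
}

int main() {
	if (!Compile({ StepPosFloat, StepColor5551 }))
		printf("unsupported step, falling back to the interpreter\n");
	return 0;
}
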
@@ -17,16 +17,27 @@

#pragma once

#include "GPU/GPUState.h"
#include "Globals.h"
#include "base/basictypes.h"

#ifdef ARM
#include "Common/ArmEmitter.h"
#else
#include "Common/x64Emitter.h"
#endif

#include "Globals.h"
#include "Core/Reporting.h"
#include "GPU/GPUState.h"
#include "GPU/Common/VertexDecoderCommon.h"

class VertexDecoder;
class VertexDecoderJitCache;

typedef void (VertexDecoder::*StepFunction)() const;


typedef void (*JittedVertexDecoder)(const u8 *src, u8 *dst, int count);

// Right now
// - compiles into list of called functions
// Future TODO
@@ -34,20 +45,17 @@ typedef void (VertexDecoder::*StepFunction)() const;
class VertexDecoder
{
public:
	VertexDecoder() : coloff(0), nrmoff(0), posoff(0) {}
	~VertexDecoder() {}
	VertexDecoder();

	// A jit cache is not mandatory, we don't use it in the sw renderer
	void SetVertexType(u32 vtype, VertexDecoderJitCache *jitCache = 0);

	// prim is needed knowledge for a performance hack (PrescaleUV)
	void SetVertexType(u32 vtype);
	u32 VertexType() const { return fmt_; }

	const DecVtxFormat &GetDecVtxFmt() { return decFmt; }

	void DecodeVerts(u8 *decoded, const void *verts, int indexLowerBound, int indexUpperBound) const;

	// This could be easily generalized to inject any one component. Don't know another use for it though.
	u32 InjectUVs(u8 *decoded, const void *verts, float *customuv, int count) const;

	bool hasColor() const { return col != 0; }
	int VertexSize() const { return size; } // PSP format size
@@ -68,8 +76,6 @@ public:
	void Step_TcU16ThroughDouble() const;
	void Step_TcFloatThrough() const;

	// TODO: tcmorph

	void Step_Color4444() const;
	void Step_Color565() const;
	void Step_Color5551() const;
@@ -147,4 +153,62 @@ public:
	int nweights;

	int stats_[NUM_VERTEX_DECODER_STATS];

	JittedVertexDecoder jitted_;

	friend class VertexDecoderJitCache;
};


// A compiled vertex decoder takes the following arguments (C calling convention):
// u8 *src, u8 *dst, int count
//
// x86:
// src is placed in esi and dst in edi
// for every vertex, we step esi and edi forwards by the two vertex sizes
// all movs are done relative to esi and edi
//
// that's it!


#ifdef ARM
class VertexDecoderJitCache : public ARMXCodeBlock {
#else
class VertexDecoderJitCache : public Gen::XCodeBlock {
#endif
public:
	VertexDecoderJitCache();

	// Returns a pointer to the code to run.
	JittedVertexDecoder Compile(const VertexDecoder &dec);

	void Jit_WeightsU8();
	void Jit_WeightsU16();
	void Jit_WeightsFloat();

	void Jit_TcU8();
	void Jit_TcU16();
	void Jit_TcFloat();

	void Jit_TcU16Through();
	void Jit_TcFloatThrough();

	void Jit_Color8888();
	void Jit_Color4444();
	void Jit_Color565();
	void Jit_Color5551();

	void Jit_NormalS8();
	void Jit_NormalS16();
	void Jit_NormalFloat();

	void Jit_PosS8();
	void Jit_PosS8Through();
	void Jit_PosS16();
	void Jit_PosS16Through();
	void Jit_PosFloat();

private:
	bool CompileStep(const VertexDecoder &dec, int i);
	const VertexDecoder *dec_;
};
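To connect the emitter calls with what actually executes: for a vertex format whose only step is Step_PosFloat, the loop that Compile() emits behaves like the hand-written routine below. The offsets and strides (12 bytes each) are example values rather than anything taken from a real format, and like the emitted SUB/JNZ loop it assumes count is at least 1. On x86-32 the register assignment above maps srcReg to ESI, dstReg to EDI and counterReg to ECX.

#include <cstdint>
#include <cstring>

// Hand-written C++ equivalent of the code Compile() would emit when the
// only step is Step_PosFloat; this is an illustration, not generated output.
static void GeneratedPosFloatDecoder(const uint8_t *src, uint8_t *dst, int count) {
	do {
		// Jit_PosFloat: three 32-bit MOV loads into tempReg1..3,
		// then three 32-bit MOV stores relative to dstReg.
		memcpy(dst, src, 12);
		src += 12;             // ADD srcReg, dec.VertexSize()
		dst += 12;             // ADD dstReg, dec.decFmt.stride
	} while (--count);         // SUB counterReg, 1; J_CC(CC_NZ, loopStart); then RET()
}

int main() {
	uint8_t src[12 * 3] = {};
	uint8_t dst[12 * 3] = {};
	GeneratedPosFloatDecoder(src, dst, 3);
	return 0;
}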