vertexjit: support 8888 morph on ARM.

This commit is contained in:
Unknown W. Brackets 2014-03-20 22:46:33 -07:00
parent 34747b7aa4
commit 29f9ea5df6
3 changed files with 92 additions and 24 deletions

View file

@ -266,5 +266,6 @@ private:
bool CompileStep(const VertexDecoder &dec, int i);
void Jit_ApplyWeights();
void Jit_WriteMatrixMul(int outOff, bool pos);
void Jit_WriteMorphColor(int outOff);
const VertexDecoder *dec_;
};

View file

@ -139,6 +139,8 @@ static const JitLookup jitLookup[] = {
{&VertexDecoder::Step_PosS8Morph, &VertexDecoderJitCache::Jit_PosS8Morph},
{&VertexDecoder::Step_PosS16Morph, &VertexDecoderJitCache::Jit_PosS16Morph},
{&VertexDecoder::Step_PosFloatMorph, &VertexDecoderJitCache::Jit_PosFloatMorph},
{&VertexDecoder::Step_Color8888Morph, &VertexDecoderJitCache::Jit_Color8888Morph},
};
JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) {
@ -680,6 +682,87 @@ void VertexDecoderJitCache::Jit_Color5551() {
STR(tempReg2, dstReg, dec_->decFmt.c0off);
}
void VertexDecoderJitCache::Jit_Color8888Morph() {
ADDI2R(tempReg1, srcReg, dec_->coloff, scratchReg);
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
bool first = true;
for (int n = 0; n < dec_->morphcount; ++n) {
if (cpu_info.bNEON) {
VLD1_lane(I_32, neonScratchReg, tempReg1, 0, false);
ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
VMOVL(I_8 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
VMOVL(I_16 | I_UNSIGNED, neonScratchRegQ, neonScratchReg);
VCVT(F_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VLDR(S12, tempReg2, sizeof(float) * n);
if (first) {
first = false;
VMUL_scalar(F_32, Q2, neonScratchRegQ, QScalar(Q3, 0));
} else {
VMLA_scalar(F_32, Q2, neonScratchRegQ, QScalar(Q3, 0));
}
} else {
LDRB(scratchReg, tempReg1, 0);
LDRB(scratchReg2, tempReg1, 1);
LDRB(scratchReg3, tempReg1, 2);
LDRB(tempReg3, tempReg1, 3);
VMOV(fpScratchReg, scratchReg);
VMOV(fpScratchReg2, scratchReg2);
VMOV(fpScratchReg3, scratchReg3);
VMOV(fpScratchReg4, tempReg3);
ADDI2R(tempReg1, tempReg1, dec_->onesize_, scratchReg);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT);
VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT);
VCVT(fpScratchReg4, fpScratchReg4, TO_FLOAT);
VLDR(S12, tempReg2, sizeof(float) * n);
VMUL(S12, S12, S13);
if (first) {
first = false;
VMUL(S8, fpScratchReg, S12);
VMUL(S9, fpScratchReg2, S12);
VMUL(S10, fpScratchReg3, S12);
VMUL(S11, fpScratchReg4, S12);
} else {
VMLA(S8, fpScratchReg, S12);
VMLA(S9, fpScratchReg2, S12);
VMLA(S10, fpScratchReg3, S12);
VMLA(S11, fpScratchReg4, S12);
}
}
}
Jit_WriteMorphColor(dec_->decFmt.c0off);
}
// Expects RGBA color in S8 - S11, which is Q2.
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
if (cpu_info.bNEON) {
ADDI2R(tempReg1, dstReg, outOff, scratchReg);
VCVT(I_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ);
VQMOVN(I_32 | I_UNSIGNED, neonScratchReg, neonScratchRegQ);
VQMOVN(I_16 | I_UNSIGNED, neonScratchReg, neonScratchRegQ);
VST1_lane(I_32, neonScratchReg, tempReg1, 0, false);
} else {
VCVT(S8, S8, TO_INT);
VCVT(S9, S9, TO_INT);
VCVT(S10, S10, TO_INT);
VCVT(S11, S11, TO_INT);
VMOV(scratchReg, fpScratchReg);
VMOV(scratchReg2, fpScratchReg2);
VMOV(scratchReg3, fpScratchReg3);
VMOV(tempReg3, fpScratchReg4);
ORR(scratchReg, scratchReg, Operand2(scratchReg2, ST_LSL, 8));
ORR(scratchReg, scratchReg, Operand2(scratchReg3, ST_LSL, 16));
ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24));
STR(scratchReg, dstReg, outOff);
}
}
void VertexDecoderJitCache::Jit_NormalS8() {
LDRB(tempReg1, srcReg, dec_->nrmoff);
LDRB(tempReg2, srcReg, dec_->nrmoff + 1);

View file

@ -739,13 +739,7 @@ void VertexDecoderJitCache::Jit_Color8888Morph() {
}
}
// Pack back into a u32.
CVTPS2DQ(fpScratchReg, R(fpScratchReg));
PACKSSDW(fpScratchReg, R(fpScratchReg));
PACKUSWB(fpScratchReg, R(fpScratchReg));
MOVD_xmm(R(tempReg1), fpScratchReg);
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
Jit_WriteMorphColor(dec_->decFmt.c0off);
}
static const float MEMORY_ALIGNED16(byColor4444[4]) = { 255.0f / 15.0f, 255.0f / 15.0f, 255.0f / 15.0f, 255.0f / 15.0f, };
@ -789,13 +783,7 @@ void VertexDecoderJitCache::Jit_Color4444Morph() {
}
}
// Pack back into a u32.
CVTPS2DQ(fpScratchReg, R(fpScratchReg));
PACKSSDW(fpScratchReg, R(fpScratchReg));
PACKUSWB(fpScratchReg, R(fpScratchReg));
MOVD_xmm(R(tempReg1), fpScratchReg);
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
Jit_WriteMorphColor(dec_->decFmt.c0off);
}
// Intentionally in reverse order.
@ -849,13 +837,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() {
}
}
// Pack back into a u32.
CVTPS2DQ(fpScratchReg, R(fpScratchReg));
PACKSSDW(fpScratchReg, R(fpScratchReg));
PACKUSWB(fpScratchReg, R(fpScratchReg));
MOVD_xmm(R(tempReg1), fpScratchReg);
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
Jit_WriteMorphColor(dec_->decFmt.c0off);
}
// Intentionally in reverse order.
@ -911,13 +893,15 @@ void VertexDecoderJitCache::Jit_Color5551Morph() {
}
}
Jit_WriteMorphColor(dec_->decFmt.c0off);
}
void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) {
// Pack back into a u32.
CVTPS2DQ(fpScratchReg, R(fpScratchReg));
PACKSSDW(fpScratchReg, R(fpScratchReg));
PACKUSWB(fpScratchReg, R(fpScratchReg));
MOVD_xmm(R(tempReg1), fpScratchReg);
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1));
MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg);
}
// Copy 3 bytes and then a zero. Might as well copy four.