Force positions to scale by 1/128 and 1/32768, as the PSP does.

This makes everything use floats for positions.  On some hardware/drivers this
may be faster; on others it may be slower.  We'll need testing to see the
performance impact.

Fixes Final Fantasy 4's position misalignments, and probably others (Tekken 5,
I suspect).
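
For context, the change amounts to dividing signed positions by 128 and 32768 (powers of two) instead of 127 and 32767, which matches the PSP and makes inputs like 64 or 16384 decode to exactly 0.5. Below is a minimal standalone C++ sketch of that scaling, not part of this commit; the helper names are hypothetical.

#include <cstdint>
#include <cstdio>

// Hypothetical helpers (names are ours, not PPSSPP's) showing the new scale factors.
static float DecodePosS8(int8_t v)   { return v * (1.0f / 128.0f); }    // previously 1/127
static float DecodePosS16(int16_t v) { return v * (1.0f / 32768.0f); }  // previously 1/32767

int main() {
    // With 1/128 and 1/32768, power-of-two inputs decode exactly:
    // 64/128 = 0.5 and 16384/32768 = 0.5, where 64/127 was ~0.50394.
    std::printf("%f %f\n", DecodePosS8(64), DecodePosS16(16384));
    return 0;
}

The same pair of constants (by128, by32768) is what the ARM and x86 jit paths below now load in place of by127 and by32767.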
Unknown W. Brackets 2014-08-17 12:38:21 -07:00
parent 78ddffee2c
commit 385df1c54e
4 changed files with 119 additions and 83 deletions


@ -118,7 +118,7 @@ public:
pos[2] = u[2] * (1.0f / 65535.0f);
} else {
for (int i = 0; i < 3; i++)
pos[i] = s[i] * (1.f / 32767.f);
pos[i] = s[i] * (1.0f / 32768.0f);
}
}
break;
@ -130,10 +130,10 @@ public:
if (isThrough()) {
for (int i = 0; i < 2; i++)
pos[i] = b[i];
pos[2] = u[2] / 255.0f;
pos[2] = u[2] * (1.0f / 255.0f);
} else {
for (int i = 0; i < 3; i++)
pos[i] = b[i] * (1.f / 127.f);
pos[i] = b[i] * (1.0f / 128.0f);
}
}
break;
@ -168,7 +168,7 @@ public:
pos[2] = u[2];
} else {
for (int i = 0; i < 3; i++)
pos[i] = s[i] * (1.f / 32767.f);
pos[i] = s[i] * (1.0f / 32768.0f);
// TODO: Does depth need conversion?
}
}
@ -184,7 +184,7 @@ public:
pos[2] = u[2];
} else {
for (int i = 0; i < 3; i++)
pos[i] = b[i] * (1.f / 127.f);
pos[i] = b[i] * (1.0f / 128.0f);
// TODO: Does depth need conversion?
}
}
@ -203,7 +203,7 @@ public:
{
const float *f = (const float *)(data_ + decFmt_.nrmoff);
for (int i = 0; i < 3; i++)
nrm[i] = f[i] ;
nrm[i] = f[i];
}
break;
case DEC_S16_3:


@ -93,7 +93,7 @@ void VertexDecoder::Step_WeightsU8Skin() const
for (int j = 0; j < nweights; j++) {
const float *bone = &gstate.boneMatrix[j * 12];
if (wdata[j] != 0) {
float weight = wdata[j] / 128.0f;
float weight = wdata[j] * (1.0f / 128.0f);
for (int i = 0; i < 12; i++) {
skinMatrix[i] += weight * bone[i];
}
@ -109,7 +109,7 @@ void VertexDecoder::Step_WeightsU16Skin() const
for (int j = 0; j < nweights; j++) {
const float *bone = &gstate.boneMatrix[j * 12];
if (wdata[j] != 0) {
float weight = wdata[j] / 32768.0f;
float weight = wdata[j] * (1.0f / 32768.0f);
for (int i = 0; i < 12; i++) {
skinMatrix[i] += weight * bone[i];
}
@ -354,7 +354,7 @@ void VertexDecoder::Step_NormalS8Skin() const
{
float *normal = (float *)(decoded_ + decFmt.nrmoff);
const s8 *sv = (const s8*)(ptr_ + nrmoff);
const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
Norm3ByMatrix43(normal, fn, skinMatrix);
}
@ -362,7 +362,7 @@ void VertexDecoder::Step_NormalS16Skin() const
{
float *normal = (float *)(decoded_ + decFmt.nrmoff);
const s16 *sv = (const s16*)(ptr_ + nrmoff);
const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
Norm3ByMatrix43(normal, fn, skinMatrix);
}
@ -380,7 +380,7 @@ void VertexDecoder::Step_NormalS8Morph() const
for (int n = 0; n < morphcount; n++)
{
const s8 *bv = (const s8*)(ptr_ + onesize_*n + nrmoff);
float multiplier = gstate_c.morphWeights[n] * (1.0f/127.0f);
const float multiplier = gstate_c.morphWeights[n] * (1.0f / 128.0f);
for (int j = 0; j < 3; j++)
normal[j] += bv[j] * multiplier;
}
@ -392,8 +392,8 @@ void VertexDecoder::Step_NormalS16Morph() const
memset(normal, 0, sizeof(float)*3);
for (int n = 0; n < morphcount; n++)
{
float multiplier = gstate_c.morphWeights[n] * (1.0f/32767.0f);
const s16 *sv = (const s16 *)(ptr_ + onesize_*n + nrmoff);
const float multiplier = gstate_c.morphWeights[n] * (1.0f / 32768.0f);
for (int j = 0; j < 3; j++)
normal[j] += sv[j] * multiplier;
}
@ -414,20 +414,18 @@ void VertexDecoder::Step_NormalFloatMorph() const
void VertexDecoder::Step_PosS8() const
{
s8 *v = (s8 *)(decoded_ + decFmt.posoff);
float *pos = (float *)(decoded_ + decFmt.posoff);
const s8 *sv = (const s8*)(ptr_ + posoff);
for (int j = 0; j < 3; j++)
v[j] = sv[j];
v[3] = 0;
pos[j] = sv[j] * (1.0f / 128.0f);
}
void VertexDecoder::Step_PosS16() const
{
s16 *v = (s16 *)(decoded_ + decFmt.posoff);
float *pos = (float *)(decoded_ + decFmt.posoff);
const s16 *sv = (const s16*)(ptr_ + posoff);
for (int j = 0; j < 3; j++)
v[j] = sv[j];
v[3] = 0;
pos[j] = sv[j] * (1.0f / 32768.0f);
}
void VertexDecoder::Step_PosFloat() const
@ -441,7 +439,7 @@ void VertexDecoder::Step_PosS8Skin() const
{
float *pos = (float *)(decoded_ + decFmt.posoff);
const s8 *sv = (const s8*)(ptr_ + posoff);
const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
Vec3ByMatrix43(pos, fn, skinMatrix);
}
@ -449,7 +447,7 @@ void VertexDecoder::Step_PosS16Skin() const
{
float *pos = (float *)(decoded_ + decFmt.posoff);
const s16 *sv = (const s16*)(ptr_ + posoff);
const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
Vec3ByMatrix43(pos, fn, skinMatrix);
}
@ -491,7 +489,7 @@ void VertexDecoder::Step_PosS8Morph() const
float *v = (float *)(decoded_ + decFmt.posoff);
memset(v, 0, sizeof(float) * 3);
for (int n = 0; n < morphcount; n++) {
float multiplier = 1.0f / 127.0f;
const float multiplier = 1.0f / 128.0f;
const s8 *sv = (const s8*)(ptr_ + onesize_*n + posoff);
for (int j = 0; j < 3; j++)
v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
@ -503,7 +501,7 @@ void VertexDecoder::Step_PosS16Morph() const
float *v = (float *)(decoded_ + decFmt.posoff);
memset(v, 0, sizeof(float) * 3);
for (int n = 0; n < morphcount; n++) {
float multiplier = 1.0f / 32767.0f;
const float multiplier = 1.0f / 32768.0f;
const s16 *sv = (const s16*)(ptr_ + onesize_*n + posoff);
for (int j = 0; j < 3; j++)
v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
@ -806,18 +804,7 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
decFmt.posfmt = DEC_FLOAT_3;
} else {
steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos];
if (morphcount == 1) {
// The non-through-mode position formats match the gl formats perfectly, let's use 'em.
switch (pos) {
case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break;
case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break;
case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break;
}
} else {
// Actually, temporarily let's not.
decFmt.posfmt = DEC_FLOAT_3;
}
decFmt.posfmt = DEC_FLOAT_3;
}
}
decFmt.posoff = decOff;


@ -50,10 +50,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};
// TODO: Maybe load all morph weights to Q6+ to avoid memory access?
static const float by127 = 1.0f / 127.0f;
static const float by128 = 1.0f / 128.0f;
static const float by256 = 1.0f / 256.0f;
static const float by32767 = 1.0f / 32767.0f;
static const float by32768 = 1.0f / 32768.0f;
using namespace ArmGen;
@ -1147,8 +1144,8 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS8Through() {
DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
// TODO: SIMD
LDRSB(tempReg1, srcReg, dec_->posoff);
@ -1173,8 +1170,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
// Through expands into floats, always. Might want to look at changing this.
void VertexDecoderJitCache::Jit_PosS16Through() {
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
// TODO: SIMD
LDRSH(tempReg1, srcReg, dec_->posoff);
@ -1197,24 +1194,69 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
}
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
LDRB(tempReg1, srcReg, dec_->posoff);
LDRB(tempReg2, srcReg, dec_->posoff + 1);
LDRB(tempReg3, srcReg, dec_->posoff + 2);
ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
STR(tempReg1, dstReg, dec_->decFmt.posoff);
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->posoff);
VMOV_neon(F_32, Q3, by128);
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, srcNEON, neonScratchReg, Q3);
} else {
LDRSB(tempReg1, srcReg, dec_->posoff);
LDRSB(tempReg2, srcReg, dec_->posoff + 1);
LDRSB(tempReg3, srcReg, dec_->posoff + 2);
VMOV(src[0], tempReg1);
VMOV(src[1], tempReg2);
VMOV(src[2], tempReg3);
MOVI2F(S15, by128, scratchReg);
VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
VMUL(src[0], src[0], S15);
VMUL(src[1], src[1], S15);
VMUL(src[2], src[2], S15);
}
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
if (NEONSkinning) {
VST1(F_32, srcNEON, scratchReg, 2);
} else {
VSTMIA(scratchReg, false, src[0], 3);
}
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
LDRH(tempReg1, srcReg, dec_->posoff);
LDRH(tempReg2, srcReg, dec_->posoff + 2);
LDRH(tempReg3, srcReg, dec_->posoff + 4);
ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
STR(tempReg1, dstReg, dec_->decFmt.posoff);
STR(tempReg3, dstReg, dec_->decFmt.posoff + 4);
if (NEONSkinning) {
ADD(scratchReg, srcReg, dec_->posoff);
VMOV_neon(F_32, Q3, by32768);
VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
VMUL(F_32, srcNEON, neonScratchReg, Q3);
} else {
LDRSH(tempReg1, srcReg, dec_->posoff);
LDRSH(tempReg2, srcReg, dec_->posoff + 2);
LDRSH(tempReg3, srcReg, dec_->posoff + 4);
VMOV(fpScratchReg, tempReg1);
VMOV(fpScratchReg2, tempReg2);
VMOV(fpScratchReg3, tempReg3);
MOVI2F(S15, by32768, scratchReg);
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
VMUL(src[0], fpScratchReg, S15);
VMUL(src[1], fpScratchReg2, S15);
VMUL(src[2], fpScratchReg3, S15);
}
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
if (NEONSkinning) {
VST1(F_32, srcNEON, scratchReg, 2);
} else {
VSTMIA(scratchReg, false, src[0], 3);
}
}
// Just copy 12 bytes.
@ -1304,8 +1346,8 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
}
VST1(F_32, accNEON, scratchReg, 2);
} else {
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
MOVP2R(tempReg1, skinMatrix);
VLDMIA(tempReg1, true, fpScratchReg, 3);
@ -1404,10 +1446,10 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
if (useNEON) {
MOVI2FR(scratchReg2, by127);
MOVI2FR(scratchReg2, by128);
VDUP(I_32, Q5, scratchReg2);
} else {
MOVI2F(S13, by127, scratchReg);
MOVI2F(S13, by128, scratchReg);
}
bool first = true;
@ -1474,10 +1516,10 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
if (useNEON) {
MOVI2FR(scratchReg, by32767);
MOVI2FR(scratchReg, by32768);
VDUP(I_32, Q5, scratchReg);
} else {
MOVI2F(S13, by32767, scratchReg);
MOVI2F(S13, by32768, scratchReg);
}
bool first = true;


@ -29,18 +29,9 @@ static float MEMORY_ALIGNED16(bones[16 * 8]);
using namespace Gen;
static const float MEMORY_ALIGNED16( by127[4] ) = {
1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f
};
static const float MEMORY_ALIGNED16( by128[4] ) = {
1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
};
static const float MEMORY_ALIGNED16( by256[4] ) = {
1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256
};
static const float MEMORY_ALIGNED16( by32767[4] ) = {
1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f,
};
static const float MEMORY_ALIGNED16( by32768[4] ) = {
1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
};
@ -1025,19 +1016,35 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 8), fpScratchReg);
}
// Copy 3 bytes and then a zero. Might as well copy four.
void VertexDecoderJitCache::Jit_PosS8() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
XORPS(XMM3, R(XMM3));
MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
if (cpu_info.bSSE4_1) {
PMOVSXBD(XMM1, R(XMM1));
} else {
PUNPCKLBW(XMM1, R(XMM3));
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 24);
PSRAD(XMM1, 24);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by128));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
}
// Copy 6 bytes and then 2 zeroes.
void VertexDecoderJitCache::Jit_PosS16() {
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
XORPS(XMM3, R(XMM3));
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
if (cpu_info.bSSE4_1) {
PMOVSXWD(XMM1, R(XMM1));
} else {
PUNPCKLWD(XMM1, R(XMM3));
PSLLD(XMM1, 16);
PSRAD(XMM1, 16);
}
CVTDQ2PS(XMM3, R(XMM1));
MULPS(XMM3, M(&by32768));
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
}
// Just copy 12 bytes.
@ -1090,7 +1097,7 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
PXOR(fpScratchReg4, R(fpScratchReg4));
MOVAPS(XMM5, M(by127));
MOVAPS(XMM5, M(by128));
// Sum into fpScratchReg.
bool first = true;
@ -1108,7 +1115,7 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
}
CVTDQ2PS(reg, R(reg));
// Now, It's time to multiply by the weight and 1.0f/127.0f.
// Now, It's time to multiply by the weight and 1.0f/128.0f.
MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
MULSS(fpScratchReg3, R(XMM5));
SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
@ -1128,7 +1135,7 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
PXOR(fpScratchReg4, R(fpScratchReg4));
MOVAPS(XMM5, M(by32767));
MOVAPS(XMM5, M(by32768));
// Sum into fpScratchReg.
bool first = true;
@ -1145,7 +1152,7 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
}
CVTDQ2PS(reg, R(reg));
// Now, It's time to multiply by the weight and 1.0f/32767.0f.
// Now, It's time to multiply by the weight and 1.0f/32768.0f.
MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
MULSS(fpScratchReg3, R(XMM5));
SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));