mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Force positions to scale by 1/128 (8-bit) and 1/32768 (16-bit), as the PSP does.
This makes everything use floats for positions. On some hardware/drivers this may be faster; on others it may be slower. We'll need testing to see the performance impact. Fixes Final Fantasy 4's position misalignments, and probably others (Tekken 5, I suspect).
This commit is contained in:
parent
78ddffee2c
commit
385df1c54e
4 changed files with 119 additions and 83 deletions
|
@ -118,7 +118,7 @@ public:
|
|||
pos[2] = u[2] * (1.0f / 65535.0f);
|
||||
} else {
|
||||
for (int i = 0; i < 3; i++)
|
||||
pos[i] = s[i] * (1.f / 32767.f);
|
||||
pos[i] = s[i] * (1.0f / 32768.0f);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -130,10 +130,10 @@ public:
|
|||
if (isThrough()) {
|
||||
for (int i = 0; i < 2; i++)
|
||||
pos[i] = b[i];
|
||||
pos[2] = u[2] / 255.0f;
|
||||
pos[2] = u[2] * (1.0f / 255.0f);
|
||||
} else {
|
||||
for (int i = 0; i < 3; i++)
|
||||
pos[i] = b[i] * (1.f / 127.f);
|
||||
pos[i] = b[i] * (1.0f / 128.0f);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -168,7 +168,7 @@ public:
|
|||
pos[2] = u[2];
|
||||
} else {
|
||||
for (int i = 0; i < 3; i++)
|
||||
pos[i] = s[i] * (1.f / 32767.f);
|
||||
pos[i] = s[i] * (1.0f / 32768.0f);
|
||||
// TODO: Does depth need conversion?
|
||||
}
|
||||
}
|
||||
|
@ -184,7 +184,7 @@ public:
|
|||
pos[2] = u[2];
|
||||
} else {
|
||||
for (int i = 0; i < 3; i++)
|
||||
pos[i] = b[i] * (1.f / 127.f);
|
||||
pos[i] = b[i] * (1.0f / 128.0f);
|
||||
// TODO: Does depth need conversion?
|
||||
}
|
||||
}
|
||||
|
@ -203,7 +203,7 @@ public:
|
|||
{
|
||||
const float *f = (const float *)(data_ + decFmt_.nrmoff);
|
||||
for (int i = 0; i < 3; i++)
|
||||
nrm[i] = f[i] ;
|
||||
nrm[i] = f[i];
|
||||
}
|
||||
break;
|
||||
case DEC_S16_3:
|
||||
|
|
|
@ -93,7 +93,7 @@ void VertexDecoder::Step_WeightsU8Skin() const
|
|||
for (int j = 0; j < nweights; j++) {
|
||||
const float *bone = &gstate.boneMatrix[j * 12];
|
||||
if (wdata[j] != 0) {
|
||||
float weight = wdata[j] / 128.0f;
|
||||
float weight = wdata[j] * (1.0f / 128.0f);
|
||||
for (int i = 0; i < 12; i++) {
|
||||
skinMatrix[i] += weight * bone[i];
|
||||
}
|
||||
|
@ -109,7 +109,7 @@ void VertexDecoder::Step_WeightsU16Skin() const
|
|||
for (int j = 0; j < nweights; j++) {
|
||||
const float *bone = &gstate.boneMatrix[j * 12];
|
||||
if (wdata[j] != 0) {
|
||||
float weight = wdata[j] / 32768.0f;
|
||||
float weight = wdata[j] * (1.0f / 32768.0f);
|
||||
for (int i = 0; i < 12; i++) {
|
||||
skinMatrix[i] += weight * bone[i];
|
||||
}
|
||||
|
@ -354,7 +354,7 @@ void VertexDecoder::Step_NormalS8Skin() const
|
|||
{
|
||||
float *normal = (float *)(decoded_ + decFmt.nrmoff);
|
||||
const s8 *sv = (const s8*)(ptr_ + nrmoff);
|
||||
const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
|
||||
const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
|
||||
Norm3ByMatrix43(normal, fn, skinMatrix);
|
||||
}
|
||||
|
||||
|
@ -362,7 +362,7 @@ void VertexDecoder::Step_NormalS16Skin() const
|
|||
{
|
||||
float *normal = (float *)(decoded_ + decFmt.nrmoff);
|
||||
const s16 *sv = (const s16*)(ptr_ + nrmoff);
|
||||
const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
|
||||
const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
|
||||
Norm3ByMatrix43(normal, fn, skinMatrix);
|
||||
}
|
||||
|
||||
|
@ -380,7 +380,7 @@ void VertexDecoder::Step_NormalS8Morph() const
|
|||
for (int n = 0; n < morphcount; n++)
|
||||
{
|
||||
const s8 *bv = (const s8*)(ptr_ + onesize_*n + nrmoff);
|
||||
float multiplier = gstate_c.morphWeights[n] * (1.0f/127.0f);
|
||||
const float multiplier = gstate_c.morphWeights[n] * (1.0f / 128.0f);
|
||||
for (int j = 0; j < 3; j++)
|
||||
normal[j] += bv[j] * multiplier;
|
||||
}
|
||||
|
@ -392,8 +392,8 @@ void VertexDecoder::Step_NormalS16Morph() const
|
|||
memset(normal, 0, sizeof(float)*3);
|
||||
for (int n = 0; n < morphcount; n++)
|
||||
{
|
||||
float multiplier = gstate_c.morphWeights[n] * (1.0f/32767.0f);
|
||||
const s16 *sv = (const s16 *)(ptr_ + onesize_*n + nrmoff);
|
||||
const float multiplier = gstate_c.morphWeights[n] * (1.0f / 32768.0f);
|
||||
for (int j = 0; j < 3; j++)
|
||||
normal[j] += sv[j] * multiplier;
|
||||
}
|
||||
|
@ -414,20 +414,18 @@ void VertexDecoder::Step_NormalFloatMorph() const
|
|||
|
||||
void VertexDecoder::Step_PosS8() const
|
||||
{
|
||||
s8 *v = (s8 *)(decoded_ + decFmt.posoff);
|
||||
float *pos = (float *)(decoded_ + decFmt.posoff);
|
||||
const s8 *sv = (const s8*)(ptr_ + posoff);
|
||||
for (int j = 0; j < 3; j++)
|
||||
v[j] = sv[j];
|
||||
v[3] = 0;
|
||||
pos[j] = sv[j] * (1.0f / 128.0f);
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_PosS16() const
|
||||
{
|
||||
s16 *v = (s16 *)(decoded_ + decFmt.posoff);
|
||||
float *pos = (float *)(decoded_ + decFmt.posoff);
|
||||
const s16 *sv = (const s16*)(ptr_ + posoff);
|
||||
for (int j = 0; j < 3; j++)
|
||||
v[j] = sv[j];
|
||||
v[3] = 0;
|
||||
pos[j] = sv[j] * (1.0f / 32768.0f);
|
||||
}
|
||||
|
||||
void VertexDecoder::Step_PosFloat() const
|
||||
|
@ -441,7 +439,7 @@ void VertexDecoder::Step_PosS8Skin() const
|
|||
{
|
||||
float *pos = (float *)(decoded_ + decFmt.posoff);
|
||||
const s8 *sv = (const s8*)(ptr_ + posoff);
|
||||
const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
|
||||
const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
|
||||
Vec3ByMatrix43(pos, fn, skinMatrix);
|
||||
}
|
||||
|
||||
|
@ -449,7 +447,7 @@ void VertexDecoder::Step_PosS16Skin() const
|
|||
{
|
||||
float *pos = (float *)(decoded_ + decFmt.posoff);
|
||||
const s16 *sv = (const s16*)(ptr_ + posoff);
|
||||
const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
|
||||
const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
|
||||
Vec3ByMatrix43(pos, fn, skinMatrix);
|
||||
}
|
||||
|
||||
|
@ -491,7 +489,7 @@ void VertexDecoder::Step_PosS8Morph() const
|
|||
float *v = (float *)(decoded_ + decFmt.posoff);
|
||||
memset(v, 0, sizeof(float) * 3);
|
||||
for (int n = 0; n < morphcount; n++) {
|
||||
float multiplier = 1.0f / 127.0f;
|
||||
const float multiplier = 1.0f / 128.0f;
|
||||
const s8 *sv = (const s8*)(ptr_ + onesize_*n + posoff);
|
||||
for (int j = 0; j < 3; j++)
|
||||
v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
|
||||
|
@ -503,7 +501,7 @@ void VertexDecoder::Step_PosS16Morph() const
|
|||
float *v = (float *)(decoded_ + decFmt.posoff);
|
||||
memset(v, 0, sizeof(float) * 3);
|
||||
for (int n = 0; n < morphcount; n++) {
|
||||
float multiplier = 1.0f / 32767.0f;
|
||||
const float multiplier = 1.0f / 32768.0f;
|
||||
const s16 *sv = (const s16*)(ptr_ + onesize_*n + posoff);
|
||||
for (int j = 0; j < 3; j++)
|
||||
v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
|
||||
|
@ -806,18 +804,7 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
|
|||
decFmt.posfmt = DEC_FLOAT_3;
|
||||
} else {
|
||||
steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos];
|
||||
|
||||
if (morphcount == 1) {
|
||||
// The non-through-mode position formats match the gl formats perfectly, let's use 'em.
|
||||
switch (pos) {
|
||||
case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break;
|
||||
case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break;
|
||||
case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break;
|
||||
}
|
||||
} else {
|
||||
// Actually, temporarily let's not.
|
||||
decFmt.posfmt = DEC_FLOAT_3;
|
||||
}
|
||||
decFmt.posfmt = DEC_FLOAT_3;
|
||||
}
|
||||
}
|
||||
decFmt.posoff = decOff;
|
||||
|
|
|
@ -50,10 +50,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};
|
|||
// TODO: Maybe load all morph weights to Q6+ to avoid memory access?
|
||||
|
||||
|
||||
static const float by127 = 1.0f / 127.0f;
|
||||
static const float by128 = 1.0f / 128.0f;
|
||||
static const float by256 = 1.0f / 256.0f;
|
||||
static const float by32767 = 1.0f / 32767.0f;
|
||||
static const float by32768 = 1.0f / 32768.0f;
|
||||
|
||||
using namespace ArmGen;
|
||||
|
@ -1147,8 +1144,8 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
|
|||
// Through expands into floats, always. Might want to look at changing this.
|
||||
void VertexDecoderJitCache::Jit_PosS8Through() {
|
||||
DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
|
||||
|
||||
// TODO: SIMD
|
||||
LDRSB(tempReg1, srcReg, dec_->posoff);
|
||||
|
@ -1173,8 +1170,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
|
|||
|
||||
// Through expands into floats, always. Might want to look at changing this.
|
||||
void VertexDecoderJitCache::Jit_PosS16Through() {
|
||||
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
|
||||
|
||||
// TODO: SIMD
|
||||
LDRSH(tempReg1, srcReg, dec_->posoff);
|
||||
|
@ -1197,24 +1194,69 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
|
|||
}
|
||||
}
|
||||
|
||||
// Copy 3 bytes and then a zero. Might as well copy four.
|
||||
void VertexDecoderJitCache::Jit_PosS8() {
|
||||
LDRB(tempReg1, srcReg, dec_->posoff);
|
||||
LDRB(tempReg2, srcReg, dec_->posoff + 1);
|
||||
LDRB(tempReg3, srcReg, dec_->posoff + 2);
|
||||
ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
|
||||
ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
|
||||
STR(tempReg1, dstReg, dec_->decFmt.posoff);
|
||||
if (NEONSkinning) {
|
||||
ADD(scratchReg, srcReg, dec_->posoff);
|
||||
VMOV_neon(F_32, Q3, by128);
|
||||
VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
|
||||
VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 16-bit
|
||||
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
|
||||
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
|
||||
VMUL(F_32, srcNEON, neonScratchReg, Q3);
|
||||
} else {
|
||||
LDRSB(tempReg1, srcReg, dec_->posoff);
|
||||
LDRSB(tempReg2, srcReg, dec_->posoff + 1);
|
||||
LDRSB(tempReg3, srcReg, dec_->posoff + 2);
|
||||
VMOV(src[0], tempReg1);
|
||||
VMOV(src[1], tempReg2);
|
||||
VMOV(src[2], tempReg3);
|
||||
MOVI2F(S15, by128, scratchReg);
|
||||
VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
|
||||
VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
|
||||
VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
|
||||
VMUL(src[0], src[0], S15);
|
||||
VMUL(src[1], src[1], S15);
|
||||
VMUL(src[2], src[2], S15);
|
||||
}
|
||||
|
||||
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
|
||||
if (NEONSkinning) {
|
||||
VST1(F_32, srcNEON, scratchReg, 2);
|
||||
} else {
|
||||
VSTMIA(scratchReg, false, src[0], 3);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy 6 bytes and then 2 zeroes.
|
||||
void VertexDecoderJitCache::Jit_PosS16() {
|
||||
LDRH(tempReg1, srcReg, dec_->posoff);
|
||||
LDRH(tempReg2, srcReg, dec_->posoff + 2);
|
||||
LDRH(tempReg3, srcReg, dec_->posoff + 4);
|
||||
ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
|
||||
STR(tempReg1, dstReg, dec_->decFmt.posoff);
|
||||
STR(tempReg3, dstReg, dec_->decFmt.posoff + 4);
|
||||
if (NEONSkinning) {
|
||||
ADD(scratchReg, srcReg, dec_->posoff);
|
||||
VMOV_neon(F_32, Q3, by32768);
|
||||
VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
|
||||
VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg); // Widen to 32-bit
|
||||
VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
|
||||
VMUL(F_32, srcNEON, neonScratchReg, Q3);
|
||||
} else {
|
||||
LDRSH(tempReg1, srcReg, dec_->posoff);
|
||||
LDRSH(tempReg2, srcReg, dec_->posoff + 2);
|
||||
LDRSH(tempReg3, srcReg, dec_->posoff + 4);
|
||||
VMOV(fpScratchReg, tempReg1);
|
||||
VMOV(fpScratchReg2, tempReg2);
|
||||
VMOV(fpScratchReg3, tempReg3);
|
||||
MOVI2F(S15, by32768, scratchReg);
|
||||
VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
|
||||
VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
|
||||
VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
|
||||
VMUL(src[0], fpScratchReg, S15);
|
||||
VMUL(src[1], fpScratchReg2, S15);
|
||||
VMUL(src[2], fpScratchReg3, S15);
|
||||
}
|
||||
|
||||
ADD(scratchReg, dstReg, dec_->decFmt.posoff);
|
||||
if (NEONSkinning) {
|
||||
VST1(F_32, srcNEON, scratchReg, 2);
|
||||
} else {
|
||||
VSTMIA(scratchReg, false, src[0], 3);
|
||||
}
|
||||
}
|
||||
|
||||
// Just copy 12 bytes.
|
||||
|
@ -1304,8 +1346,8 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
|
|||
}
|
||||
VST1(F_32, accNEON, scratchReg, 2);
|
||||
} else {
|
||||
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
|
||||
_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
|
||||
|
||||
MOVP2R(tempReg1, skinMatrix);
|
||||
VLDMIA(tempReg1, true, fpScratchReg, 3);
|
||||
|
@ -1404,10 +1446,10 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
|
|||
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
|
||||
|
||||
if (useNEON) {
|
||||
MOVI2FR(scratchReg2, by127);
|
||||
MOVI2FR(scratchReg2, by128);
|
||||
VDUP(I_32, Q5, scratchReg2);
|
||||
} else {
|
||||
MOVI2F(S13, by127, scratchReg);
|
||||
MOVI2F(S13, by128, scratchReg);
|
||||
}
|
||||
|
||||
bool first = true;
|
||||
|
@ -1474,10 +1516,10 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
|
|||
MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
|
||||
|
||||
if (useNEON) {
|
||||
MOVI2FR(scratchReg, by32767);
|
||||
MOVI2FR(scratchReg, by32768);
|
||||
VDUP(I_32, Q5, scratchReg);
|
||||
} else {
|
||||
MOVI2F(S13, by32767, scratchReg);
|
||||
MOVI2F(S13, by32768, scratchReg);
|
||||
}
|
||||
|
||||
bool first = true;
|
||||
|
|
|
@ -29,18 +29,9 @@ static float MEMORY_ALIGNED16(bones[16 * 8]);
|
|||
|
||||
using namespace Gen;
|
||||
|
||||
static const float MEMORY_ALIGNED16( by127[4] ) = {
|
||||
1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f
|
||||
};
|
||||
static const float MEMORY_ALIGNED16( by128[4] ) = {
|
||||
1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
|
||||
};
|
||||
static const float MEMORY_ALIGNED16( by256[4] ) = {
|
||||
1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256
|
||||
};
|
||||
static const float MEMORY_ALIGNED16( by32767[4] ) = {
|
||||
1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f,
|
||||
};
|
||||
static const float MEMORY_ALIGNED16( by32768[4] ) = {
|
||||
1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
|
||||
};
|
||||
|
@ -1025,19 +1016,35 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
|
|||
MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 8), fpScratchReg);
|
||||
}
|
||||
|
||||
// Copy 3 bytes and then a zero. Might as well copy four.
|
||||
void VertexDecoderJitCache::Jit_PosS8() {
|
||||
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
|
||||
AND(32, R(tempReg1), Imm32(0x00FFFFFF));
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXBD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLBW(XMM1, R(XMM3));
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 24);
|
||||
PSRAD(XMM1, 24);
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by128));
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
|
||||
}
|
||||
|
||||
// Copy 6 bytes and then 2 zeroes.
|
||||
void VertexDecoderJitCache::Jit_PosS16() {
|
||||
MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
|
||||
MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
|
||||
XORPS(XMM3, R(XMM3));
|
||||
MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
|
||||
if (cpu_info.bSSE4_1) {
|
||||
PMOVSXWD(XMM1, R(XMM1));
|
||||
} else {
|
||||
PUNPCKLWD(XMM1, R(XMM3));
|
||||
PSLLD(XMM1, 16);
|
||||
PSRAD(XMM1, 16);
|
||||
}
|
||||
CVTDQ2PS(XMM3, R(XMM1));
|
||||
MULPS(XMM3, M(&by32768));
|
||||
MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
|
||||
}
|
||||
|
||||
// Just copy 12 bytes.
|
||||
|
@ -1090,7 +1097,7 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
|
|||
void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
|
||||
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
|
||||
PXOR(fpScratchReg4, R(fpScratchReg4));
|
||||
MOVAPS(XMM5, M(by127));
|
||||
MOVAPS(XMM5, M(by128));
|
||||
|
||||
// Sum into fpScratchReg.
|
||||
bool first = true;
|
||||
|
@ -1108,7 +1115,7 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
|
|||
}
|
||||
CVTDQ2PS(reg, R(reg));
|
||||
|
||||
// Now, It's time to multiply by the weight and 1.0f/127.0f.
|
||||
// Now, It's time to multiply by the weight and 1.0f/128.0f.
|
||||
MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
|
||||
MULSS(fpScratchReg3, R(XMM5));
|
||||
SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
@ -1128,7 +1135,7 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
|
|||
void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
|
||||
MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
|
||||
PXOR(fpScratchReg4, R(fpScratchReg4));
|
||||
MOVAPS(XMM5, M(by32767));
|
||||
MOVAPS(XMM5, M(by32768));
|
||||
|
||||
// Sum into fpScratchReg.
|
||||
bool first = true;
|
||||
|
@ -1145,7 +1152,7 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
|
|||
}
|
||||
CVTDQ2PS(reg, R(reg));
|
||||
|
||||
// Now, It's time to multiply by the weight and 1.0f/32767.0f.
|
||||
// Now, It's time to multiply by the weight and 1.0f/32768.0f.
|
||||
MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
|
||||
MULSS(fpScratchReg3, R(XMM5));
|
||||
SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
|
||||
|
|
Loading…
Add table
Reference in a new issue