diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 07430429e5..b9211b5e42 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -118,7 +118,7 @@ public:
 			pos[2] = u[2] * (1.0f / 65535.0f);
 		} else {
 			for (int i = 0; i < 3; i++)
-				pos[i] = s[i] * (1.f / 32767.f);
+				pos[i] = s[i] * (1.0f / 32768.0f);
 		}
 	}
 	break;
@@ -130,10 +130,10 @@ public:
 		if (isThrough()) {
 			for (int i = 0; i < 2; i++)
 				pos[i] = b[i];
-			pos[2] = u[2] / 255.0f;
+			pos[2] = u[2] * (1.0f / 255.0f);
 		} else {
 			for (int i = 0; i < 3; i++)
-				pos[i] = b[i] * (1.f / 127.f);
+				pos[i] = b[i] * (1.0f / 128.0f);
 		}
 	}
 	break;
@@ -168,7 +168,7 @@ public:
 			pos[2] = u[2];
 		} else {
 			for (int i = 0; i < 3; i++)
-				pos[i] = s[i] * (1.f / 32767.f);
+				pos[i] = s[i] * (1.0f / 32768.0f);
 			// TODO: Does depth need conversion?
 		}
 	}
@@ -184,7 +184,7 @@ public:
 			pos[2] = u[2];
 		} else {
 			for (int i = 0; i < 3; i++)
-				pos[i] = b[i] * (1.f / 127.f);
+				pos[i] = b[i] * (1.0f / 128.0f);
 			// TODO: Does depth need conversion?
 		}
 	}
@@ -203,7 +203,7 @@ public:
 	{
 		const float *f = (const float *)(data_ + decFmt_.nrmoff);
 		for (int i = 0; i < 3; i++)
-			nrm[i] = f[i] ;
+			nrm[i] = f[i];
 	}
 	break;
 	case DEC_S16_3:
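Note (not part of the patch): a standalone C++ sketch of the convention the divisor changes above adopt. Signed n-bit values are scaled by 1/2^(n-1) rather than 1/(2^(n-1)-1), so -128 and -32768 map exactly to -1.0f, the largest positive value falls just short of 1.0f, and the multiplier is an exact power of two, so the multiply is precise and replaces a division.

#include <cstdint>
#include <cstdio>

static inline float DecodeS8(int8_t v) { return v * (1.0f / 128.0f); }
static inline float DecodeS16(int16_t v) { return v * (1.0f / 32768.0f); }

int main() {
	// -128 -> -1.000000 exactly; 127 -> 0.992188 (127/128).
	printf("%f %f\n", DecodeS8(-128), DecodeS8(127));
	// -32768 -> -1.000000 exactly; 32767 -> 0.999969 (32767/32768).
	printf("%f %f\n", DecodeS16(-32768), DecodeS16(32767));
	// The old 1/32767 scale made +32767 exact but pushed -32768 past -1.0f.
	printf("%f\n", -32768 * (1.0f / 32767.0f));
	return 0;
}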
diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp
index e5e67f07a4..0b65354629 100644
--- a/GPU/GLES/VertexDecoder.cpp
+++ b/GPU/GLES/VertexDecoder.cpp
@@ -93,7 +93,7 @@ void VertexDecoder::Step_WeightsU8Skin() const
 	for (int j = 0; j < nweights; j++) {
 		const float *bone = &gstate.boneMatrix[j * 12];
 		if (wdata[j] != 0) {
-			float weight = wdata[j] / 128.0f;
+			float weight = wdata[j] * (1.0f / 128.0f);
 			for (int i = 0; i < 12; i++) {
 				skinMatrix[i] += weight * bone[i];
 			}
@@ -109,7 +109,7 @@ void VertexDecoder::Step_WeightsU16Skin() const
 	for (int j = 0; j < nweights; j++) {
 		const float *bone = &gstate.boneMatrix[j * 12];
 		if (wdata[j] != 0) {
-			float weight = wdata[j] / 32768.0f;
+			float weight = wdata[j] * (1.0f / 32768.0f);
 			for (int i = 0; i < 12; i++) {
 				skinMatrix[i] += weight * bone[i];
 			}
@@ -354,7 +354,7 @@ void VertexDecoder::Step_NormalS8Skin() const
 {
 	float *normal = (float *)(decoded_ + decFmt.nrmoff);
 	const s8 *sv = (const s8*)(ptr_ + nrmoff);
-	const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
+	const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
 	Norm3ByMatrix43(normal, fn, skinMatrix);
 }
@@ -362,7 +362,7 @@ void VertexDecoder::Step_NormalS16Skin() const
 {
 	float *normal = (float *)(decoded_ + decFmt.nrmoff);
 	const s16 *sv = (const s16*)(ptr_ + nrmoff);
-	const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
+	const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
 	Norm3ByMatrix43(normal, fn, skinMatrix);
 }
@@ -380,7 +380,7 @@ void VertexDecoder::Step_NormalS8Morph() const
 	for (int n = 0; n < morphcount; n++) {
 		const s8 *bv = (const s8*)(ptr_ + onesize_*n + nrmoff);
-		float multiplier = gstate_c.morphWeights[n] * (1.0f/127.0f);
+		const float multiplier = gstate_c.morphWeights[n] * (1.0f / 128.0f);
 		for (int j = 0; j < 3; j++)
 			normal[j] += bv[j] * multiplier;
 	}
@@ -392,8 +392,8 @@ void VertexDecoder::Step_NormalS16Morph() const
 	memset(normal, 0, sizeof(float)*3);
 	for (int n = 0; n < morphcount; n++) {
-		float multiplier = gstate_c.morphWeights[n] * (1.0f/32767.0f);
 		const s16 *sv = (const s16 *)(ptr_ + onesize_*n + nrmoff);
+		const float multiplier = gstate_c.morphWeights[n] * (1.0f / 32768.0f);
 		for (int j = 0; j < 3; j++)
 			normal[j] += sv[j] * multiplier;
 	}
@@ -414,20 +414,18 @@ void VertexDecoder::Step_NormalFloatMorph() const
 
 void VertexDecoder::Step_PosS8() const
 {
-	s8 *v = (s8 *)(decoded_ + decFmt.posoff);
+	float *pos = (float *)(decoded_ + decFmt.posoff);
 	const s8 *sv = (const s8*)(ptr_ + posoff);
 	for (int j = 0; j < 3; j++)
-		v[j] = sv[j];
-	v[3] = 0;
+		pos[j] = sv[j] * (1.0f / 128.0f);
 }
 
 void VertexDecoder::Step_PosS16() const
 {
-	s16 *v = (s16 *)(decoded_ + decFmt.posoff);
+	float *pos = (float *)(decoded_ + decFmt.posoff);
 	const s16 *sv = (const s16*)(ptr_ + posoff);
 	for (int j = 0; j < 3; j++)
-		v[j] = sv[j];
-	v[3] = 0;
+		pos[j] = sv[j] * (1.0f / 32768.0f);
 }
 
 void VertexDecoder::Step_PosFloat() const
@@ -441,7 +439,7 @@ void VertexDecoder::Step_PosS8Skin() const
 {
 	float *pos = (float *)(decoded_ + decFmt.posoff);
 	const s8 *sv = (const s8*)(ptr_ + posoff);
-	const float fn[3] = { sv[0] / 128.0f, sv[1] / 128.0f, sv[2] / 128.0f };
+	const float fn[3] = { sv[0] * (1.0f / 128.0f), sv[1] * (1.0f / 128.0f), sv[2] * (1.0f / 128.0f) };
 	Vec3ByMatrix43(pos, fn, skinMatrix);
 }
@@ -449,7 +447,7 @@ void VertexDecoder::Step_PosS16Skin() const
 {
 	float *pos = (float *)(decoded_ + decFmt.posoff);
 	const s16 *sv = (const s16*)(ptr_ + posoff);
-	const float fn[3] = { sv[0] / 32768.0f, sv[1] / 32768.0f, sv[2] / 32768.0f };
+	const float fn[3] = { sv[0] * (1.0f / 32768.0f), sv[1] * (1.0f / 32768.0f), sv[2] * (1.0f / 32768.0f) };
 	Vec3ByMatrix43(pos, fn, skinMatrix);
 }
@@ -491,7 +489,7 @@ void VertexDecoder::Step_PosS8Morph() const
 	float *v = (float *)(decoded_ + decFmt.posoff);
 	memset(v, 0, sizeof(float) * 3);
 	for (int n = 0; n < morphcount; n++) {
-		float multiplier = 1.0f / 127.0f;
+		const float multiplier = 1.0f / 128.0f;
 		const s8 *sv = (const s8*)(ptr_ + onesize_*n + posoff);
 		for (int j = 0; j < 3; j++)
 			v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
@@ -503,7 +501,7 @@ void VertexDecoder::Step_PosS16Morph() const
 	float *v = (float *)(decoded_ + decFmt.posoff);
 	memset(v, 0, sizeof(float) * 3);
 	for (int n = 0; n < morphcount; n++) {
-		float multiplier = 1.0f / 32767.0f;
+		const float multiplier = 1.0f / 32768.0f;
 		const s16 *sv = (const s16*)(ptr_ + onesize_*n + posoff);
 		for (int j = 0; j < 3; j++)
 			v[j] += (float)sv[j] * (multiplier * gstate_c.morphWeights[n]);
@@ -806,18 +804,7 @@ void VertexDecoder::SetVertexType(u32 fmt, VertexDecoderJitCache *jitCache) {
 			decFmt.posfmt = DEC_FLOAT_3;
 		} else {
 			steps_[numSteps_++] = morphcount == 1 ? posstep[pos] : posstep_morph[pos];
-
-			if (morphcount == 1) {
-				// The non-through-mode position formats match the gl formats perfectly, let's use 'em.
-				switch (pos) {
-				case GE_VTYPE_POS_8BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S8_3; break;
-				case GE_VTYPE_POS_16BIT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_S16_3; break;
-				case GE_VTYPE_POS_FLOAT >> GE_VTYPE_POS_SHIFT: decFmt.posfmt = DEC_FLOAT_3; break;
-				}
-			} else {
-				// Actually, temporarily let's not.
-				decFmt.posfmt = DEC_FLOAT_3;
-			}
+			decFmt.posfmt = DEC_FLOAT_3;
 		}
 	}
 	decFmt.posoff = decOff;
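Note (not part of the patch): the scalar morph steps above all share one shape; a minimal standalone sketch (function name and signature hypothetical), with the format scale folded into each morph weight so the inner loop is one multiply-add per component:

#include <cstdint>
#include <cstring>

// Hypothetical stand-in for Step_PosS8Morph: accumulate morphcount targets,
// stored onesize bytes apart, scaling s8 data by 1/128 via the weight.
void MorphPosS8(float out[3], const int8_t *ptr, int onesize,
                const float *morphWeights, int morphcount) {
	memset(out, 0, sizeof(float) * 3);
	for (int n = 0; n < morphcount; n++) {
		const int8_t *sv = ptr + onesize * n;
		const float multiplier = morphWeights[n] * (1.0f / 128.0f);
		for (int j = 0; j < 3; j++)
			out[j] += sv[j] * multiplier;
	}
}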
diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp
index b68325f7a0..3b95652c13 100644
--- a/GPU/GLES/VertexDecoderArm.cpp
+++ b/GPU/GLES/VertexDecoderArm.cpp
@@ -50,10 +50,7 @@ static float MEMORY_ALIGNED16(boneMask[4]) = {1.0f, 1.0f, 1.0f, 0.0f};
 
 // TODO: Maybe load all morph weights to Q6+ to avoid memory access?
-static const float by127 = 1.0f / 127.0f;
 static const float by128 = 1.0f / 128.0f;
-static const float by256 = 1.0f / 256.0f;
-static const float by32767 = 1.0f / 32767.0f;
 static const float by32768 = 1.0f / 32768.0f;
 
 using namespace ArmGen;
@@ -1147,8 +1144,8 @@ void VertexDecoderJitCache::Jit_NormalFloat() {
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS8Through() {
 	DEBUG_LOG_REPORT_ONCE(vertexS8Through, G3D, "Using S8 positions in throughmode");
-	_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
-	_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
+	_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
+	_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
 
 	// TODO: SIMD
 	LDRSB(tempReg1, srcReg, dec_->posoff);
@@ -1173,8 +1170,8 @@ void VertexDecoderJitCache::Jit_PosS8Through() {
 
 // Through expands into floats, always. Might want to look at changing this.
 void VertexDecoderJitCache::Jit_PosS16Through() {
-	_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
-	_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
+	_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
+	_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
 
 	// TODO: SIMD
 	LDRSH(tempReg1, srcReg, dec_->posoff);
@@ -1197,24 +1194,69 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
 	}
 }
 
-// Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_PosS8() {
-	LDRB(tempReg1, srcReg, dec_->posoff);
-	LDRB(tempReg2, srcReg, dec_->posoff + 1);
-	LDRB(tempReg3, srcReg, dec_->posoff + 2);
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 8));
-	ORR(tempReg1, tempReg1, Operand2(tempReg3, ST_LSL, 16));
-	STR(tempReg1, dstReg, dec_->decFmt.posoff);
+	if (NEONSkinning) {
+		ADD(scratchReg, srcReg, dec_->posoff);
+		VMOV_neon(F_32, Q3, by128);
+		VLD1_lane(I_32, neonScratchReg, scratchReg, 0, false);
+		VMOVL(I_8 | I_SIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 16-bit
+		VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
+		VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
+		VMUL(F_32, srcNEON, neonScratchReg, Q3);
+	} else {
+		LDRSB(tempReg1, srcReg, dec_->posoff);
+		LDRSB(tempReg2, srcReg, dec_->posoff + 1);
+		LDRSB(tempReg3, srcReg, dec_->posoff + 2);
+		VMOV(src[0], tempReg1);
+		VMOV(src[1], tempReg2);
+		VMOV(src[2], tempReg3);
+		MOVI2F(S15, by128, scratchReg);
+		VCVT(src[0], src[0], TO_FLOAT | IS_SIGNED);
+		VCVT(src[1], src[1], TO_FLOAT | IS_SIGNED);
+		VCVT(src[2], src[2], TO_FLOAT | IS_SIGNED);
+		VMUL(src[0], src[0], S15);
+		VMUL(src[1], src[1], S15);
+		VMUL(src[2], src[2], S15);
+	}
+
+	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
+	if (NEONSkinning) {
+		VST1(F_32, srcNEON, scratchReg, 2);
+	} else {
+		VSTMIA(scratchReg, false, src[0], 3);
+	}
 }
 
-// Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_PosS16() {
-	LDRH(tempReg1, srcReg, dec_->posoff);
-	LDRH(tempReg2, srcReg, dec_->posoff + 2);
-	LDRH(tempReg3, srcReg, dec_->posoff + 4);
-	ORR(tempReg1, tempReg1, Operand2(tempReg2, ST_LSL, 16));
-	STR(tempReg1, dstReg, dec_->decFmt.posoff);
-	STR(tempReg3, dstReg, dec_->decFmt.posoff + 4);
+	if (NEONSkinning) {
+		ADD(scratchReg, srcReg, dec_->posoff);
+		VMOV_neon(F_32, Q3, by32768);
+		VLD1(I_32, neonScratchReg, scratchReg, 1, ALIGN_NONE);
+		VMOVL(I_16 | I_SIGNED, neonScratchRegQ, neonScratchReg);  // Widen to 32-bit
+		VCVT(F_32 | I_SIGNED, neonScratchRegQ, neonScratchRegQ);
+		VMUL(F_32, srcNEON, neonScratchReg, Q3);
+	} else {
+		LDRSH(tempReg1, srcReg, dec_->posoff);
+		LDRSH(tempReg2, srcReg, dec_->posoff + 2);
+		LDRSH(tempReg3, srcReg, dec_->posoff + 4);
+		VMOV(fpScratchReg, tempReg1);
+		VMOV(fpScratchReg2, tempReg2);
+		VMOV(fpScratchReg3, tempReg3);
+		MOVI2F(S15, by32768, scratchReg);
+		VCVT(fpScratchReg, fpScratchReg, TO_FLOAT | IS_SIGNED);
+		VCVT(fpScratchReg2, fpScratchReg2, TO_FLOAT | IS_SIGNED);
+		VCVT(fpScratchReg3, fpScratchReg3, TO_FLOAT | IS_SIGNED);
+		VMUL(src[0], fpScratchReg, S15);
+		VMUL(src[1], fpScratchReg2, S15);
+		VMUL(src[2], fpScratchReg3, S15);
+	}
+
+	ADD(scratchReg, dstReg, dec_->decFmt.posoff);
+	if (NEONSkinning) {
+		VST1(F_32, srcNEON, scratchReg, 2);
+	} else {
+		VSTMIA(scratchReg, false, src[0], 3);
+	}
 }
 
 // Just copy 12 bytes.
@@ -1304,8 +1346,8 @@ void VertexDecoderJitCache::Jit_WriteMatrixMul(int outOff, bool pos) {
 		}
 		VST1(F_32, accNEON, scratchReg, 2);
 	} else {
-		_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScrathRegs must be in order.");
-		_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScrathRegs must be in order.");
+		_dbg_assert_msg_(JIT, fpScratchReg + 1 == fpScratchReg2, "VertexDecoder fpScratchRegs must be in order.");
+		_dbg_assert_msg_(JIT, fpScratchReg2 + 1 == fpScratchReg3, "VertexDecoder fpScratchRegs must be in order.");
 
 		MOVP2R(tempReg1, skinMatrix);
 		VLDMIA(tempReg1, true, fpScratchReg, 3);
@@ -1404,10 +1446,10 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
 
 	if (useNEON) {
-		MOVI2FR(scratchReg2, by127);
+		MOVI2FR(scratchReg2, by128);
 		VDUP(I_32, Q5, scratchReg2);
 	} else {
-		MOVI2F(S13, by127, scratchReg);
+		MOVI2F(S13, by128, scratchReg);
 	}
 
 	bool first = true;
@@ -1474,10 +1516,10 @@ void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
 	MOVP2R(tempReg2, &gstate_c.morphWeights[0]);
 
 	if (useNEON) {
-		MOVI2FR(scratchReg, by32767);
+		MOVI2FR(scratchReg, by32768);
 		VDUP(I_32, Q5, scratchReg);
 	} else {
-		MOVI2F(S13, by32767, scratchReg);
+		MOVI2F(S13, by32768, scratchReg);
 	}
 
 	bool first = true;
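Note (not part of the patch): the NEON sequence Jit_PosS8 emits above, rendered as compiler intrinsics for readability — a sketch, not the emitted code. Like the JIT's VLD1_lane, it loads four bytes even though only three are meaningful; the fourth lane is a don't-care.

#include <arm_neon.h>
#include <cstdint>
#include <cstring>

float32x4_t DecodePosS8(const int8_t *src) {
	int32_t raw;
	memcpy(&raw, src, sizeof(raw));              // 4-byte load, like VLD1_lane(I_32, ...)
	int8x8_t b = vreinterpret_s8_s32(vdup_n_s32(raw));
	int16x8_t w = vmovl_s8(b);                   // VMOVL I_8: widen to 16-bit
	int32x4_t d = vmovl_s16(vget_low_s16(w));    // VMOVL I_16: widen to 32-bit
	float32x4_t f = vcvtq_f32_s32(d);            // VCVT: signed int -> float
	return vmulq_n_f32(f, 1.0f / 128.0f);        // VMUL by the by128 constant
}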
diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp
index cc1e319cad..e2d171fbab 100644
--- a/GPU/GLES/VertexDecoderX86.cpp
+++ b/GPU/GLES/VertexDecoderX86.cpp
@@ -29,18 +29,9 @@ static float MEMORY_ALIGNED16(bones[16 * 8]);
 
 using namespace Gen;
 
-static const float MEMORY_ALIGNED16( by127[4] ) = {
-	1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f
-};
 static const float MEMORY_ALIGNED16( by128[4] ) = {
 	1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f, 1.0f / 128.0f
 };
-static const float MEMORY_ALIGNED16( by256[4] ) = {
-	1.0f / 256, 1.0f / 256, 1.0f / 256, 1.0f / 256
-};
-static const float MEMORY_ALIGNED16( by32767[4] ) = {
-	1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f,
-};
 static const float MEMORY_ALIGNED16( by32768[4] ) = {
 	1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f, 1.0f / 32768.0f,
 };
@@ -1025,19 +1016,35 @@ void VertexDecoderJitCache::Jit_PosS16Through() {
 	MOVSS(MDisp(dstReg, dec_->decFmt.posoff + 8), fpScratchReg);
 }
 
-// Copy 3 bytes and then a zero. Might as well copy four.
 void VertexDecoderJitCache::Jit_PosS8() {
-	MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
-	AND(32, R(tempReg1), Imm32(0x00FFFFFF));
-	MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
+	XORPS(XMM3, R(XMM3));
+	MOVD_xmm(XMM1, MDisp(srcReg, dec_->posoff));
+	if (cpu_info.bSSE4_1) {
+		PMOVSXBD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLBW(XMM1, R(XMM3));
+		PUNPCKLWD(XMM1, R(XMM3));
+		PSLLD(XMM1, 24);
+		PSRAD(XMM1, 24);
+	}
+	CVTDQ2PS(XMM3, R(XMM1));
+	MULPS(XMM3, M(&by128));
+	MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
 }
 
-// Copy 6 bytes and then 2 zeroes.
 void VertexDecoderJitCache::Jit_PosS16() {
-	MOV(32, R(tempReg1), MDisp(srcReg, dec_->posoff));
-	MOVZX(32, 16, tempReg2, MDisp(srcReg, dec_->posoff + 4));
-	MOV(32, MDisp(dstReg, dec_->decFmt.posoff), R(tempReg1));
-	MOV(32, MDisp(dstReg, dec_->decFmt.posoff + 4), R(tempReg2));
+	XORPS(XMM3, R(XMM3));
+	MOVQ_xmm(XMM1, MDisp(srcReg, dec_->posoff));
+	if (cpu_info.bSSE4_1) {
+		PMOVSXWD(XMM1, R(XMM1));
+	} else {
+		PUNPCKLWD(XMM1, R(XMM3));
+		PSLLD(XMM1, 16);
+		PSRAD(XMM1, 16);
+	}
+	CVTDQ2PS(XMM3, R(XMM1));
+	MULPS(XMM3, M(&by32768));
+	MOVUPS(MDisp(dstReg, dec_->decFmt.posoff), XMM3);
 }
 
 // Just copy 12 bytes.
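Note (not part of the patch): an intrinsics sketch of the two sign-extension strategies Jit_PosS8 selects between above — PMOVSXBD where SSE4.1 is available, otherwise a zero-unpack followed by an arithmetic shift pair to restore the sign bits.

#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  // SSE4.1: _mm_cvtepi8_epi32
#include <cstdint>
#include <cstring>

__m128 DecodePosS8_SSE41(const int8_t *src) {
	int32_t raw;
	memcpy(&raw, src, sizeof(raw));                // 4-byte load; top byte unused
	__m128i v = _mm_cvtepi8_epi32(_mm_cvtsi32_si128(raw));  // PMOVSXBD
	return _mm_mul_ps(_mm_cvtepi32_ps(v), _mm_set1_ps(1.0f / 128.0f));
}

__m128 DecodePosS8_SSE2(const int8_t *src) {
	int32_t raw;
	memcpy(&raw, src, sizeof(raw));
	__m128i v = _mm_cvtsi32_si128(raw);
	const __m128i zero = _mm_setzero_si128();
	v = _mm_unpacklo_epi8(v, zero);                // PUNPCKLBW: bytes -> words
	v = _mm_unpacklo_epi16(v, zero);               // PUNPCKLWD: words -> dwords
	v = _mm_srai_epi32(_mm_slli_epi32(v, 24), 24); // PSLLD/PSRAD: redo sign extension
	return _mm_mul_ps(_mm_cvtepi32_ps(v), _mm_set1_ps(1.0f / 128.0f));
}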
@@ -1090,7 +1097,7 @@ void VertexDecoderJitCache::Jit_PosFloatSkin() {
 void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
 	PXOR(fpScratchReg4, R(fpScratchReg4));
-	MOVAPS(XMM5, M(by127));
+	MOVAPS(XMM5, M(by128));
 
 	// Sum into fpScratchReg.
 	bool first = true;
@@ -1108,7 +1115,7 @@
 		}
 		CVTDQ2PS(reg, R(reg));
 
-		// Now, It's time to multiply by the weight and 1.0f/127.0f.
+		// Now it's time to multiply by the weight and 1.0f/128.0f.
 		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
 		MULSS(fpScratchReg3, R(XMM5));
 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
@@ -1128,7 +1135,7 @@
 void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) {
 	MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0]));
 	PXOR(fpScratchReg4, R(fpScratchReg4));
-	MOVAPS(XMM5, M(by32767));
+	MOVAPS(XMM5, M(by32768));
 
 	// Sum into fpScratchReg.
 	bool first = true;
@@ -1145,7 +1152,7 @@
 		}
 		CVTDQ2PS(reg, R(reg));
 
-		// Now, It's time to multiply by the weight and 1.0f/32767.0f.
+		// Now it's time to multiply by the weight and 1.0f/32768.0f.
 		MOVSS(fpScratchReg3, MDisp(tempReg1, sizeof(float) * n));
 		MULSS(fpScratchReg3, R(XMM5));
 		SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0));
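Note (not part of the patch): the morph loops above keep the reciprocal in XMM5 and fold it into each scalar weight before broadcasting; a sketch of one accumulation step (names hypothetical):

#include <emmintrin.h>

// One Jit_AnyS8Morph/Jit_AnyS16Morph step: MULSS the weight by the constant,
// SHUFPS-broadcast it, then multiply-accumulate the converted integer data.
__m128 MorphStep(__m128 sum, __m128i decoded, float weight, float scale) {
	__m128 w = _mm_set1_ps(weight * scale);  // scale = 1.0f/128.0f or 1.0f/32768.0f
	return _mm_add_ps(sum, _mm_mul_ps(_mm_cvtepi32_ps(decoded), w));
}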