From a0397bce4ca8919db0da0ad42e726845dd127d57 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 13 Apr 2016 23:15:41 -0700 Subject: [PATCH 1/4] Hopefully fix prescale in remasters. Don't actually have a remaster to test, though. --- GPU/Common/VertexDecoderCommon.cpp | 33 ++++++++++++++++++++++-------- GPU/Common/VertexDecoderCommon.h | 1 + 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index f8ab3a63d7..9900d38a4d 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -390,6 +390,13 @@ void VertexDecoder::Step_TcU16Prescale() const { uv[1] = (float)uvdata[1] * (1.f / 32768.f) * gstate_c.uv.vScale + gstate_c.uv.vOff; } +void VertexDecoder::Step_TcU16DoublePrescale() const { + float *uv = (float *)(decoded_ + decFmt.uvoff); + const u16 *uvdata = (const u16_le *)(ptr_ + tcoff); + uv[0] = (float)uvdata[0] * (1.f / 16384.f) * gstate_c.uv.uScale + gstate_c.uv.uOff; + uv[1] = (float)uvdata[1] * (1.f / 16384.f) * gstate_c.uv.vScale + gstate_c.uv.vOff; +} + void VertexDecoder::Step_TcFloatPrescale() const { float *uv = (float *)(decoded_ + decFmt.uvoff); const float *uvdata = (const float*)(ptr_ + tcoff); @@ -752,6 +759,13 @@ static const StepFunction tcstep_prescale[4] = { &VertexDecoder::Step_TcFloatPrescale, }; +static const StepFunction tcstep_prescale_remaster[4] = { + 0, + &VertexDecoder::Step_TcU8Prescale, + &VertexDecoder::Step_TcU16DoublePrescale, + &VertexDecoder::Step_TcFloatPrescale, +}; + static const StepFunction tcstep_through[4] = { 0, &VertexDecoder::Step_TcU8, @@ -767,28 +781,28 @@ static const StepFunction tcstep_throughToFloat[4] = { }; // Some HD Remaster games double the u16 texture coordinates. -static const StepFunction tcstep_Remaster[4] = { +static const StepFunction tcstep_remaster[4] = { 0, &VertexDecoder::Step_TcU8, &VertexDecoder::Step_TcU16Double, &VertexDecoder::Step_TcFloat, }; -static const StepFunction tcstep_RemasterToFloat[4] = { +static const StepFunction tcstep_remasterToFloat[4] = { 0, &VertexDecoder::Step_TcU8ToFloat, &VertexDecoder::Step_TcU16DoubleToFloat, &VertexDecoder::Step_TcFloat, }; -static const StepFunction tcstep_through_Remaster[4] = { +static const StepFunction tcstep_through_remaster[4] = { 0, &VertexDecoder::Step_TcU8, &VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoder::Step_TcFloatThrough, }; -static const StepFunction tcstep_through_RemasterToFloat[4] = { +static const StepFunction tcstep_through_remasterToFloat[4] = { 0, &VertexDecoder::Step_TcU8ToFloat, &VertexDecoder::Step_TcU16ThroughDoubleToFloat, @@ -955,19 +969,22 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, biggest = tcalign[tc]; // NOTE: That we check getUVGenMode here means that we must include it in the decoder ID! - if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == 0 || gstate.getUVGenMode() == 3)) { - steps_[numSteps_++] = tcstep_prescale[tc]; + if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_COORDS || gstate.getUVGenMode() == GE_TEXMAP_UNKNOWN)) { + if (g_DoubleTextureCoordinates) + steps_[numSteps_++] = tcstep_prescale_remaster[tc]; + else + steps_[numSteps_++] = tcstep_prescale[tc]; decFmt.uvfmt = DEC_FLOAT_2; } else { if (options.expandAllUVtoFloat) { if (g_DoubleTextureCoordinates) - steps_[numSteps_++] = throughmode ? tcstep_through_RemasterToFloat[tc] : tcstep_RemasterToFloat[tc]; + steps_[numSteps_++] = throughmode ? tcstep_through_remasterToFloat[tc] : tcstep_remasterToFloat[tc]; else steps_[numSteps_++] = throughmode ? tcstep_throughToFloat[tc] : tcstepToFloat[tc]; decFmt.uvfmt = DEC_FLOAT_2; } else { if (g_DoubleTextureCoordinates) - steps_[numSteps_++] = throughmode ? tcstep_through_Remaster[tc] : tcstep_Remaster[tc]; + steps_[numSteps_++] = throughmode ? tcstep_through_remaster[tc] : tcstep_remaster[tc]; else steps_[numSteps_++] = throughmode ? tcstep_through[tc] : tcstep[tc]; diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 14ed122bae..dff07bedc2 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -484,6 +484,7 @@ public: void Step_TcU8Prescale() const; void Step_TcU16Prescale() const; + void Step_TcU16DoublePrescale() const; void Step_TcFloatPrescale() const; void Step_TcU16Double() const; From 614665068a7691ba917d46a5311e1a6c28b861e0 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Wed, 13 Apr 2016 23:34:45 -0700 Subject: [PATCH 2/4] Implement morphing for texcoords. Tests show that this can be used. --- GPU/Common/VertexDecoderCommon.cpp | 162 +++++++++++++++++++++++++++-- GPU/Common/VertexDecoderCommon.h | 9 ++ 2 files changed, 164 insertions(+), 7 deletions(-) diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index 9900d38a4d..6f6a0dc9d2 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -385,14 +385,14 @@ void VertexDecoder::Step_TcU8Prescale() const { void VertexDecoder::Step_TcU16Prescale() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const u16 *uvdata = (const u16_le *)(ptr_ + tcoff); + const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff); uv[0] = (float)uvdata[0] * (1.f / 32768.f) * gstate_c.uv.uScale + gstate_c.uv.uOff; uv[1] = (float)uvdata[1] * (1.f / 32768.f) * gstate_c.uv.vScale + gstate_c.uv.vOff; } void VertexDecoder::Step_TcU16DoublePrescale() const { float *uv = (float *)(decoded_ + decFmt.uvoff); - const u16 *uvdata = (const u16_le *)(ptr_ + tcoff); + const u16_le *uvdata = (const u16_le *)(ptr_ + tcoff); uv[0] = (float)uvdata[0] * (1.f / 16384.f) * gstate_c.uv.uScale + gstate_c.uv.uOff; uv[1] = (float)uvdata[1] * (1.f / 16384.f) * gstate_c.uv.vScale + gstate_c.uv.vOff; } @@ -404,6 +404,126 @@ void VertexDecoder::Step_TcFloatPrescale() const { uv[1] = uvdata[1] * gstate_c.uv.vScale + gstate_c.uv.vOff; } +void VertexDecoder::Step_TcU8Morph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u8 *uvdata = (const u8 *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * (1.f / 128.f) * w; + uv[1] += (float)uvdata[1] * (1.f / 128.f) * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0]; + out[1] = uv[1]; +} + +void VertexDecoder::Step_TcU16Morph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * (1.f / 32768.f) * w; + uv[1] += (float)uvdata[1] * (1.f / 32768.f) * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0]; + out[1] = uv[1]; +} + +void VertexDecoder::Step_TcU16DoubleMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * (1.f / 16384.f) * w; + uv[1] += (float)uvdata[1] * (1.f / 16384.f) * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0]; + out[1] = uv[1]; +} + +void VertexDecoder::Step_TcFloatMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const float_le *uvdata = (const float_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * w; + uv[1] += (float)uvdata[1] * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0]; + out[1] = uv[1]; +} + +void VertexDecoder::Step_TcU8PrescaleMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u8 *uvdata = (const u8 *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * (1.f / 128.f) * w; + uv[1] += (float)uvdata[1] * (1.f / 128.f) * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0] * gstate_c.uv.uScale + gstate_c.uv.uOff; + out[1] = uv[1] * gstate_c.uv.vScale + gstate_c.uv.vOff; +} + +void VertexDecoder::Step_TcU16PrescaleMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * (1.f / 32768.f) * w; + uv[1] += (float)uvdata[1] * (1.f / 32768.f) * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0] * gstate_c.uv.uScale + gstate_c.uv.uOff; + out[1] = uv[1] * gstate_c.uv.vScale + gstate_c.uv.vOff; +} + +void VertexDecoder::Step_TcU16DoublePrescaleMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * (1.f / 16384.f) * w; + uv[1] += (float)uvdata[1] * (1.f / 16384.f) * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0] * gstate_c.uv.uScale + gstate_c.uv.uOff; + out[1] = uv[1] * gstate_c.uv.vScale + gstate_c.uv.vOff; +} + +void VertexDecoder::Step_TcFloatPrescaleMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const float_le *uvdata = (const float_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * w; + uv[1] += (float)uvdata[1] * w; + } + + float *out = (float *)(decoded_ + decFmt.uvoff); + out[0] = uv[0] * gstate_c.uv.uScale + gstate_c.uv.uOff; + out[1] = uv[1] * gstate_c.uv.vScale + gstate_c.uv.vOff; +} + void VertexDecoder::Step_ColorInvalid() const { // Do nothing. This is only here to prevent crashes. @@ -766,6 +886,34 @@ static const StepFunction tcstep_prescale_remaster[4] = { &VertexDecoder::Step_TcFloatPrescale, }; +static const StepFunction tcstep_prescale_morph[4] = { + 0, + &VertexDecoder::Step_TcU8PrescaleMorph, + &VertexDecoder::Step_TcU16PrescaleMorph, + &VertexDecoder::Step_TcFloatPrescaleMorph, +}; + +static const StepFunction tcstep_prescale_morph_remaster[4] = { + 0, + &VertexDecoder::Step_TcU8PrescaleMorph, + &VertexDecoder::Step_TcU16DoublePrescaleMorph, + &VertexDecoder::Step_TcFloatPrescaleMorph, +}; + +static const StepFunction tcstep_morph[4] = { + 0, + &VertexDecoder::Step_TcU8Morph, + &VertexDecoder::Step_TcU16Morph, + &VertexDecoder::Step_TcFloatMorph, +}; + +static const StepFunction tcstep_morph_remaster[4] = { + 0, + &VertexDecoder::Step_TcU8Morph, + &VertexDecoder::Step_TcU16DoubleMorph, + &VertexDecoder::Step_TcFloatMorph, +}; + static const StepFunction tcstep_through[4] = { 0, &VertexDecoder::Step_TcU8, @@ -809,9 +957,6 @@ static const StepFunction tcstep_through_remasterToFloat[4] = { &VertexDecoder::Step_TcFloatThrough, }; - -// TODO: Tc Morph - static const StepFunction colstep[8] = { 0, &VertexDecoder::Step_ColorInvalid, @@ -971,9 +1116,12 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, // NOTE: That we check getUVGenMode here means that we must include it in the decoder ID! if (g_Config.bPrescaleUV && !throughmode && (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_COORDS || gstate.getUVGenMode() == GE_TEXMAP_UNKNOWN)) { if (g_DoubleTextureCoordinates) - steps_[numSteps_++] = tcstep_prescale_remaster[tc]; + steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale_remaster[tc] : tcstep_prescale_morph_remaster[tc]; else - steps_[numSteps_++] = tcstep_prescale[tc]; + steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale[tc] : tcstep_prescale_morph[tc]; + decFmt.uvfmt = DEC_FLOAT_2; + } else if (morphcount != 1 && !throughmode) { + steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc]; decFmt.uvfmt = DEC_FLOAT_2; } else { if (options.expandAllUVtoFloat) { diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index dff07bedc2..0e30d5caff 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -495,6 +495,15 @@ public: void Step_TcU16ThroughDoubleToFloat() const; void Step_TcFloatThrough() const; + void Step_TcU8Morph() const; + void Step_TcU16Morph() const; + void Step_TcU16DoubleMorph() const; + void Step_TcFloatMorph() const; + void Step_TcU8PrescaleMorph() const; + void Step_TcU16PrescaleMorph() const; + void Step_TcU16DoublePrescaleMorph() const; + void Step_TcFloatPrescaleMorph() const; + void Step_ColorInvalid() const; void Step_Color4444() const; void Step_Color565() const; From ff802a983a1f8e6483455f889c41be74c9610c67 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 16 Apr 2016 17:45:05 -0700 Subject: [PATCH 3/4] vertexjit: Implement x86 tc morph decoding. --- GPU/Common/VertexDecoderCommon.h | 8 ++ GPU/Common/VertexDecoderX86.cpp | 127 ++++++++++++++++++++++++++++++- 2 files changed, 131 insertions(+), 4 deletions(-) diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 0e30d5caff..9f9f141d5c 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -637,6 +637,14 @@ public: void Jit_TcU16Prescale(); void Jit_TcFloatPrescale(); + void Jit_TcAnyMorph(int bits); + void Jit_TcU8Morph(); + void Jit_TcU16Morph(); + void Jit_TcFloatMorph(); + void Jit_TcU8PrescaleMorph(); + void Jit_TcU16PrescaleMorph(); + void Jit_TcFloatPrescaleMorph(); + void Jit_TcU16Double(); void Jit_TcU16ThroughDouble(); diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 93db752b7c..ab61a9871e 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -105,6 +105,13 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, + {&VertexDecoder::Step_TcU8Morph, &VertexDecoderJitCache::Jit_TcU8Morph}, + {&VertexDecoder::Step_TcU16Morph, &VertexDecoderJitCache::Jit_TcU16Morph}, + {&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph}, + {&VertexDecoder::Step_TcU8PrescaleMorph, &VertexDecoderJitCache::Jit_TcU8PrescaleMorph}, + {&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph}, + {&VertexDecoder::Step_TcFloatPrescaleMorph, &VertexDecoderJitCache::Jit_TcFloatPrescaleMorph}, + {&VertexDecoder::Step_TcU16Through, &VertexDecoderJitCache::Jit_TcU16Through}, {&VertexDecoder::Step_TcU16ThroughToFloat, &VertexDecoderJitCache::Jit_TcU16ThroughToFloat}, {&VertexDecoder::Step_TcFloatThrough, &VertexDecoderJitCache::Jit_TcFloatThrough}, @@ -185,6 +192,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec, int dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescale) { prescaleStep = true; } + if (dec.steps_[i] == &VertexDecoder::Step_TcU8PrescaleMorph || + dec.steps_[i] == &VertexDecoder::Step_TcU16PrescaleMorph || + dec.steps_[i] == &VertexDecoder::Step_TcFloatPrescaleMorph) { + prescaleStep = true; + } } // Add code to convert matrices to 4x4. @@ -747,6 +759,105 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() { MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } +void VertexDecoderJitCache::Jit_TcAnyMorph(int bits) { + MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0])); + if (!cpu_info.bSSE4_1) { + PXOR(fpScratchReg4, R(fpScratchReg4)); + } + + bool first = true; + for (int n = 0; n < dec_->morphcount; ++n) { + const X64Reg reg = first ? fpScratchReg : fpScratchReg2; + const OpArg src = MDisp(srcReg, dec_->onesize_ * n + dec_->tcoff); + + // Load the actual values and convert to float. + if (bits == 32) { + // Two floats: just load as a MOVQ. + MOVQ_xmm(reg, src); + } else { + if (bits == 8) { + MOVZX(32, 16, tempReg2, src); + MOVD_xmm(reg, R(tempReg2)); + } else { + MOVD_xmm(reg, src); + } + if (cpu_info.bSSE4_1) { + if (bits == 8) { + PMOVZXBD(reg, R(reg)); + } else { + PMOVZXWD(reg, R(reg)); + } + } else { + if (bits == 8) { + PUNPCKLBW(reg, R(fpScratchReg4)); + } + PUNPCKLWD(reg, R(fpScratchReg4)); + } + + CVTDQ2PS(reg, R(reg)); + } + + // And now scale by the weight. + MOVSS(fpScratchReg3, MDisp(tempReg1, n * sizeof(float))); + SHUFPS(fpScratchReg3, R(fpScratchReg3), _MM_SHUFFLE(0, 0, 0, 0)); + MULPS(reg, R(fpScratchReg3)); + + if (!first) { + ADDPS(fpScratchReg, R(fpScratchReg2)); + } else { + first = false; + } + } +} + +void VertexDecoderJitCache::Jit_TcU8Morph() { + Jit_TcAnyMorph(8); + // They were all added (weighted) pre-normalize, we normalize once here. + MULPS(fpScratchReg, M(&by128)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcU16Morph() { + Jit_TcAnyMorph(16); + // They were all added (weighted) pre-normalize, we normalize once here. + MULPS(fpScratchReg, M(&by32768)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcFloatMorph() { + Jit_TcAnyMorph(32); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcU8PrescaleMorph() { + Jit_TcAnyMorph(8); + // The scale takes into account the u8 normalization. + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcU16PrescaleMorph() { + Jit_TcAnyMorph(16); + // The scale takes into account the u16 normalization. + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + +void VertexDecoderJitCache::Jit_TcFloatPrescaleMorph() { + Jit_TcAnyMorph(32); + MULPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + ADDPS(fpScratchReg, R(fpScaleOffsetReg)); + SHUFPS(fpScaleOffsetReg, R(fpScaleOffsetReg), _MM_SHUFFLE(1, 0, 3, 2)); + MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); +} + void VertexDecoderJitCache::Jit_TcU16Through() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->tcoff)); MOV(32, MDisp(dstReg, dec_->decFmt.uvoff), R(tempReg1)); @@ -960,7 +1071,9 @@ void VertexDecoderJitCache::Jit_Color5551() { void VertexDecoderJitCache::Jit_Color8888Morph() { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0])); - PXOR(fpScratchReg4, R(fpScratchReg4)); + if (!cpu_info.bSSE4_1) { + PXOR(fpScratchReg4, R(fpScratchReg4)); + } bool first = true; for (int n = 0; n < dec_->morphcount; ++n) { @@ -994,7 +1107,9 @@ static const float MEMORY_ALIGNED16(byColor4444[4]) = { 255.0f / 15.0f, 255.0f / void VertexDecoderJitCache::Jit_Color4444Morph() { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0])); - PXOR(fpScratchReg4, R(fpScratchReg4)); + if (!cpu_info.bSSE4_1) { + PXOR(fpScratchReg4, R(fpScratchReg4)); + } MOVDQA(XMM5, M(color4444mask)); MOVAPS(XMM6, M(byColor4444)); @@ -1376,7 +1491,9 @@ void VertexDecoderJitCache::Jit_AnyU16ToFloat(int srcoff, u32 bits) { void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0])); - PXOR(fpScratchReg4, R(fpScratchReg4)); + if (!cpu_info.bSSE4_1) { + PXOR(fpScratchReg4, R(fpScratchReg4)); + } MOVAPS(XMM5, M(by128)); // Sum into fpScratchReg. @@ -1414,7 +1531,9 @@ void VertexDecoderJitCache::Jit_AnyS8Morph(int srcoff, int dstoff) { void VertexDecoderJitCache::Jit_AnyS16Morph(int srcoff, int dstoff) { MOV(PTRBITS, R(tempReg1), ImmPtr(&gstate_c.morphWeights[0])); - PXOR(fpScratchReg4, R(fpScratchReg4)); + if (!cpu_info.bSSE4_1) { + PXOR(fpScratchReg4, R(fpScratchReg4)); + } MOVAPS(XMM5, M(by32768)); // Sum into fpScratchReg. From ebce8d275378c7f75f2f53d3ac7c20a56f47b075 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sat, 16 Apr 2016 19:00:34 -0700 Subject: [PATCH 4/4] Don't convert to float with prescale off. Since we assume we need to normalize, it seems. --- GPU/Common/VertexDecoderCommon.cpp | 74 +++++++++++++++++++++++++++--- GPU/Common/VertexDecoderCommon.h | 7 ++- GPU/Common/VertexDecoderX86.cpp | 8 ++-- 3 files changed, 76 insertions(+), 13 deletions(-) diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index 6f6a0dc9d2..0d58cde4ad 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -405,6 +405,51 @@ void VertexDecoder::Step_TcFloatPrescale() const { } void VertexDecoder::Step_TcU8Morph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u8 *uvdata = (const u8 *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * w; + uv[1] += (float)uvdata[1] * w; + } + + u8 *out = decoded_ + decFmt.uvoff; + out[0] = (int)uv[0]; + out[1] = (int)uv[1]; +} + +void VertexDecoder::Step_TcU16Morph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * w; + uv[1] += (float)uvdata[1] * w; + } + + u16_le *out = (u16_le *)(decoded_ + decFmt.uvoff); + out[0] = (int)uv[0]; + out[1] = (int)uv[1]; +} + +void VertexDecoder::Step_TcU16DoubleMorph() const { + float uv[2] = { 0, 0 }; + for (int n = 0; n < morphcount; n++) { + float w = gstate_c.morphWeights[n]; + const u16_le *uvdata = (const u16_le *)(ptr_ + onesize_*n + tcoff); + + uv[0] += (float)uvdata[0] * w; + uv[1] += (float)uvdata[1] * w; + } + + u16_le *out = (u16_le *)(decoded_ + decFmt.uvoff); + out[0] = (int)(uv[0] * 2.0f); + out[1] = (int)(uv[1] * 2.0f); +} + +void VertexDecoder::Step_TcU8MorphToFloat() const { float uv[2] = { 0, 0 }; for (int n = 0; n < morphcount; n++) { float w = gstate_c.morphWeights[n]; @@ -419,7 +464,7 @@ void VertexDecoder::Step_TcU8Morph() const { out[1] = uv[1]; } -void VertexDecoder::Step_TcU16Morph() const { +void VertexDecoder::Step_TcU16MorphToFloat() const { float uv[2] = { 0, 0 }; for (int n = 0; n < morphcount; n++) { float w = gstate_c.morphWeights[n]; @@ -434,7 +479,7 @@ void VertexDecoder::Step_TcU16Morph() const { out[1] = uv[1]; } -void VertexDecoder::Step_TcU16DoubleMorph() const { +void VertexDecoder::Step_TcU16DoubleMorphToFloat() const { float uv[2] = { 0, 0 }; for (int n = 0; n < morphcount; n++) { float w = gstate_c.morphWeights[n]; @@ -914,6 +959,20 @@ static const StepFunction tcstep_morph_remaster[4] = { &VertexDecoder::Step_TcFloatMorph, }; +static const StepFunction tcstep_morphToFloat[4] = { + 0, + &VertexDecoder::Step_TcU8MorphToFloat, + &VertexDecoder::Step_TcU16MorphToFloat, + &VertexDecoder::Step_TcFloatMorph, +}; + +static const StepFunction tcstep_morph_remasterToFloat[4] = { + 0, + &VertexDecoder::Step_TcU8MorphToFloat, + &VertexDecoder::Step_TcU16DoubleMorphToFloat, + &VertexDecoder::Step_TcFloatMorph, +}; + static const StepFunction tcstep_through[4] = { 0, &VertexDecoder::Step_TcU8, @@ -1120,18 +1179,19 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, else steps_[numSteps_++] = morphcount == 1 ? tcstep_prescale[tc] : tcstep_prescale_morph[tc]; decFmt.uvfmt = DEC_FLOAT_2; - } else if (morphcount != 1 && !throughmode) { - steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc]; - decFmt.uvfmt = DEC_FLOAT_2; } else { if (options.expandAllUVtoFloat) { - if (g_DoubleTextureCoordinates) + if (morphcount != 1 && !throughmode) + steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remasterToFloat[tc] : tcstep_morphToFloat[tc]; + else if (g_DoubleTextureCoordinates) steps_[numSteps_++] = throughmode ? tcstep_through_remasterToFloat[tc] : tcstep_remasterToFloat[tc]; else steps_[numSteps_++] = throughmode ? tcstep_throughToFloat[tc] : tcstepToFloat[tc]; decFmt.uvfmt = DEC_FLOAT_2; } else { - if (g_DoubleTextureCoordinates) + if (morphcount != 1 && !throughmode) + steps_[numSteps_++] = g_DoubleTextureCoordinates ? tcstep_morph_remaster[tc] : tcstep_morph[tc]; + else if (g_DoubleTextureCoordinates) steps_[numSteps_++] = throughmode ? tcstep_through_remaster[tc] : tcstep_remaster[tc]; else steps_[numSteps_++] = throughmode ? tcstep_through[tc] : tcstep[tc]; diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 9f9f141d5c..6163951ecb 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -498,6 +498,9 @@ public: void Step_TcU8Morph() const; void Step_TcU16Morph() const; void Step_TcU16DoubleMorph() const; + void Step_TcU8MorphToFloat() const; + void Step_TcU16MorphToFloat() const; + void Step_TcU16DoubleMorphToFloat() const; void Step_TcFloatMorph() const; void Step_TcU8PrescaleMorph() const; void Step_TcU16PrescaleMorph() const; @@ -638,8 +641,8 @@ public: void Jit_TcFloatPrescale(); void Jit_TcAnyMorph(int bits); - void Jit_TcU8Morph(); - void Jit_TcU16Morph(); + void Jit_TcU8MorphToFloat(); + void Jit_TcU16MorphToFloat(); void Jit_TcFloatMorph(); void Jit_TcU8PrescaleMorph(); void Jit_TcU16PrescaleMorph(); diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index ab61a9871e..1f67637ef7 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -105,8 +105,8 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcU16Prescale, &VertexDecoderJitCache::Jit_TcU16Prescale}, {&VertexDecoder::Step_TcFloatPrescale, &VertexDecoderJitCache::Jit_TcFloatPrescale}, - {&VertexDecoder::Step_TcU8Morph, &VertexDecoderJitCache::Jit_TcU8Morph}, - {&VertexDecoder::Step_TcU16Morph, &VertexDecoderJitCache::Jit_TcU16Morph}, + {&VertexDecoder::Step_TcU8MorphToFloat, &VertexDecoderJitCache::Jit_TcU8MorphToFloat}, + {&VertexDecoder::Step_TcU16MorphToFloat, &VertexDecoderJitCache::Jit_TcU16MorphToFloat}, {&VertexDecoder::Step_TcFloatMorph, &VertexDecoderJitCache::Jit_TcFloatMorph}, {&VertexDecoder::Step_TcU8PrescaleMorph, &VertexDecoderJitCache::Jit_TcU8PrescaleMorph}, {&VertexDecoder::Step_TcU16PrescaleMorph, &VertexDecoderJitCache::Jit_TcU16PrescaleMorph}, @@ -810,14 +810,14 @@ void VertexDecoderJitCache::Jit_TcAnyMorph(int bits) { } } -void VertexDecoderJitCache::Jit_TcU8Morph() { +void VertexDecoderJitCache::Jit_TcU8MorphToFloat() { Jit_TcAnyMorph(8); // They were all added (weighted) pre-normalize, we normalize once here. MULPS(fpScratchReg, M(&by128)); MOVQ_xmm(MDisp(dstReg, dec_->decFmt.uvoff), fpScratchReg); } -void VertexDecoderJitCache::Jit_TcU16Morph() { +void VertexDecoderJitCache::Jit_TcU16MorphToFloat() { Jit_TcAnyMorph(16); // They were all added (weighted) pre-normalize, we normalize once here. MULPS(fpScratchReg, M(&by32768));