diff --git a/GPU/Common/VertexDecoderCommon.cpp b/GPU/Common/VertexDecoderCommon.cpp index fa6b417bb3..4e8d76afab 100644 --- a/GPU/Common/VertexDecoderCommon.cpp +++ b/GPU/Common/VertexDecoderCommon.cpp @@ -160,6 +160,30 @@ void VertexDecoder::Step_WeightsU16() const wt[j++] = 0; } +void VertexDecoder::Step_WeightsU8ToFloat() const +{ + float *wt = (float *)(decoded_ + decFmt.w0off); + const u8 *wdata = (const u8*)(ptr_); + int j; + for (j = 0; j < nweights; j++) { + wt[j] = (float)wdata[j] * (1.0f / 128.0f); + } + while (j & 3) // Zero additional weights rounding up to 4. + wt[j++] = 0; +} + +void VertexDecoder::Step_WeightsU16ToFloat() const +{ + float *wt = (float *)(decoded_ + decFmt.w0off); + const u16 *wdata = (const u16*)(ptr_); + int j; + for (j = 0; j < nweights; j++) { + wt[j] = (float)wdata[j] * (1.0f / 32768.0f); + } + while (j & 3) // Zero additional weights rounding up to 4. + wt[j++] = 0; +} + // Float weights should be uncommon, we can live with having to multiply these by 2.0 // to avoid special checks in the vertex shader generator. 
// (PSP uses 0.0-2.0 fixed point numbers for weights) @@ -459,6 +483,15 @@ void VertexDecoder::Step_NormalS8() const normal[3] = 0; } +void VertexDecoder::Step_NormalS8ToFloat() const +{ + float *normal = (float *)(decoded_ + decFmt.nrmoff); + const s8 *sv = (const s8*)(ptr_ + nrmoff); + normal[0] = sv[0] * (1.0f / 128.0f); + normal[1] = sv[1] * (1.0f / 128.0f); + normal[2] = sv[2] * (1.0f / 128.0f); +} + void VertexDecoder::Step_NormalS16() const { s16 *normal = (s16 *)(decoded_ + decFmt.nrmoff); @@ -649,6 +682,13 @@ static const StepFunction wtstep[4] = { &VertexDecoder::Step_WeightsFloat, }; +static const StepFunction wtstepToFloat[4] = { + 0, + &VertexDecoder::Step_WeightsU8ToFloat, + &VertexDecoder::Step_WeightsU16ToFloat, + &VertexDecoder::Step_WeightsFloat, +}; + static const StepFunction wtstep_skin[4] = { 0, &VertexDecoder::Step_WeightsU8Skin, @@ -746,6 +786,13 @@ static const StepFunction nrmstep[4] = { &VertexDecoder::Step_NormalFloat, }; +static const StepFunction nrmstep8BitToFloat[4] = { + 0, + &VertexDecoder::Step_NormalS8ToFloat, + &VertexDecoder::Step_NormalS16, + &VertexDecoder::Step_NormalFloat, +}; + static const StepFunction nrmstep_skin[4] = { 0, &VertexDecoder::Step_NormalS8Skin, @@ -825,17 +872,21 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, if (skinInDecode) { steps_[numSteps_++] = wtstep_skin[weighttype]; - // No visible output + // No visible output, passed in register/external memory to the "pos" step. 
} else { - steps_[numSteps_++] = wtstep[weighttype]; - int fmtBase = DEC_FLOAT_1; - if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_U8_1; - } else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) { - fmtBase = DEC_U16_1; - } else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) { + if (options.expandAllWeightsToFloat) { + steps_[numSteps_++] = wtstepToFloat[weighttype]; fmtBase = DEC_FLOAT_1; + } else { + steps_[numSteps_++] = wtstep[weighttype]; + if (weighttype == GE_VTYPE_WEIGHT_8BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U8_1; + } else if (weighttype == GE_VTYPE_WEIGHT_16BIT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_U16_1; + } else if (weighttype == GE_VTYPE_WEIGHT_FLOAT >> GE_VTYPE_WEIGHT_SHIFT) { + fmtBase = DEC_FLOAT_1; + } } int numWeights = TranslateNumBones(nweights); @@ -927,14 +978,28 @@ void VertexDecoder::SetVertexType(u32 fmt, const VertexDecoderOptions &options, // After skinning, we always have three floats. decFmt.nrmfmt = DEC_FLOAT_3; } else { - steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm]; - if (morphcount == 1) { - // The normal formats match the gl formats perfectly, let's use 'em. + // The 8-bit and 16-bit normal formats match GL formats nicely, and the 16-bit normal format matches a D3D format so let's use them where possible. switch (nrm) { - case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S8_3; break; - case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_S16_3; break; - case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: decFmt.nrmfmt = DEC_FLOAT_3; break; + case GE_VTYPE_NRM_8BIT >> GE_VTYPE_NRM_SHIFT: + if (options.expand8BitNormalsToFloat) { + decFmt.nrmfmt = DEC_FLOAT_3; + steps_[numSteps_++] = morphcount == 1 ? nrmstep8BitToFloat[nrm] : nrmstep_morph[nrm]; + } else { + decFmt.nrmfmt = DEC_S8_3; + steps_[numSteps_++] = morphcount == 1 ?
nrmstep[nrm] : nrmstep_morph[nrm]; + } + break; + case GE_VTYPE_NRM_16BIT >> GE_VTYPE_NRM_SHIFT: + decFmt.nrmfmt = DEC_S16_3; + steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm]; + break; + case GE_VTYPE_NRM_FLOAT >> GE_VTYPE_NRM_SHIFT: + decFmt.nrmfmt = DEC_FLOAT_3; + steps_[numSteps_++] = morphcount == 1 ? nrmstep[nrm] : nrmstep_morph[nrm]; + break; } } else { + // The switch above only runs for morphcount == 1, so the morph normal step must be registered here. + steps_[numSteps_++] = nrmstep_morph[nrm]; decFmt.nrmfmt = DEC_FLOAT_3; diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index d07ba5e602..f06d40681f 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -435,6 +435,8 @@ typedef void(*JittedVertexDecoder)(const u8 *src, u8 *dst, int count); struct VertexDecoderOptions { bool expandAllUVtoFloat; + bool expandAllWeightsToFloat; + bool expand8BitNormalsToFloat; }; class VertexDecoder @@ -457,6 +459,8 @@ public: void Step_WeightsU8() const; void Step_WeightsU16() const; + void Step_WeightsU8ToFloat() const; + void Step_WeightsU16ToFloat() const; void Step_WeightsFloat() const; void Step_WeightsU8Skin() const; @@ -492,6 +496,7 @@ public: void Step_Color8888Morph() const; void Step_NormalS8() const; + void Step_NormalS8ToFloat() const; void Step_NormalS16() const; void Step_NormalFloat() const; @@ -627,6 +632,7 @@ public: void Jit_Color5551(); void Jit_NormalS8(); + void Jit_NormalS8ToFloat(); void Jit_NormalS16(); void Jit_NormalFloat(); @@ -635,6 +641,7 @@ public: void Jit_NormalFloatSkin(); void Jit_PosS8(); + void Jit_PosS8ToFloat(); void Jit_PosS16(); void Jit_PosFloat(); void Jit_PosS8Through(); diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp index 28454c061e..8dbeed6e21 100644 --- a/GPU/Common/VertexDecoderX86.cpp +++ b/GPU/Common/VertexDecoderX86.cpp @@ -109,6 +109,7 @@ static const JitLookup jitLookup[] = { {&VertexDecoder::Step_TcU16ThroughDouble, &VertexDecoderJitCache::Jit_TcU16ThroughDouble}, {&VertexDecoder::Step_NormalS8, &VertexDecoderJitCache::Jit_NormalS8}, +
{&VertexDecoder::Step_NormalS8ToFloat, &VertexDecoderJitCache::Jit_NormalS8ToFloat}, {&VertexDecoder::Step_NormalS16, &VertexDecoderJitCache::Jit_NormalS16}, {&VertexDecoder::Step_NormalFloat, &VertexDecoderJitCache::Jit_NormalFloat}, @@ -956,6 +957,11 @@ void VertexDecoderJitCache::Jit_NormalS8() { MOV(32, MDisp(dstReg, dec_->decFmt.nrmoff), R(tempReg1)); } +void VertexDecoderJitCache::Jit_NormalS8ToFloat() { + Jit_AnyS8ToFloat(dec_->nrmoff); + MOVUPS(MDisp(dstReg, dec_->decFmt.nrmoff), XMM3); +} + // Copy 6 bytes and then 2 zeroes. void VertexDecoderJitCache::Jit_NormalS16() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->nrmoff)); diff --git a/GPU/Directx9/TransformPipelineDX9.cpp b/GPU/Directx9/TransformPipelineDX9.cpp index dac3894695..877c9667ee 100644 --- a/GPU/Directx9/TransformPipelineDX9.cpp +++ b/GPU/Directx9/TransformPipelineDX9.cpp @@ -147,6 +147,8 @@ TransformDrawEngineDX9::TransformDrawEngineDX9() memset(&decOptions_, 0, sizeof(decOptions_)); decOptions_.expandAllUVtoFloat = true; + decOptions_.expandAllWeightsToFloat = true; + decOptions_.expand8BitNormalsToFloat = true; decimationCounter_ = VERTEXCACHE_DECIMATION_INTERVAL; // Allocate nicely aligned memory. 
Maybe graphics drivers will diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp index 288d7a6c7f..278474cd3c 100644 --- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp +++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp @@ -312,9 +312,6 @@ void GenerateVertexShaderDX9(int prim, char *buffer, bool useHWTransform) { } else { int numWeights = TranslateNumBonesDX9(vertTypeGetNumBoneWeights(vertType)); - static const char *rescale[4] = {"", " * 1.9921875", " * 1.999969482421875", ""}; // 2*127.5f/128.f, 2*32767.5f/32768.f, 1.0f}; - const char *factor = rescale[vertTypeGetWeightMask(vertType) >> GE_VTYPE_WEIGHT_SHIFT]; - static const char * const boneWeightAttr[8] = { "a_w1.x", "a_w1.y", "a_w1.z", "a_w1.w", "a_w2.x", "a_w2.y", "a_w2.z", "a_w2.w", @@ -377,11 +374,11 @@ void GenerateVertexShaderDX9(int prim, char *buffer, bool useHWTransform) { WRITE(p, ";\n"); // Trying to simplify this results in bugs in LBP... - WRITE(p, " float3 skinnedpos = mul(float4(In.position.xyz, 1.0), skinMatrix).xyz %s;\n", factor); + WRITE(p, " float3 skinnedpos = mul(float4(In.position.xyz, 1.0), skinMatrix).xyz;\n"); WRITE(p, " float3 worldpos = mul(float4(skinnedpos, 1.0), u_world).xyz;\n"); if (hasNormal) { - WRITE(p, " float3 skinnednormal = mul(float4(%sIn.normal, 0.0), skinMatrix).xyz %s;\n", flipNormal ? "-" : "", factor); + WRITE(p, " float3 skinnednormal = mul(float4(%sIn.normal, 0.0), skinMatrix).xyz;\n", flipNormal ? "-" : ""); WRITE(p, " float3 worldnormal = normalize(mul(float4(skinnednormal, 0.0), u_world).xyz);\n"); } else { WRITE(p, " float3 worldnormal = mul( mul( float4(0.0, 0.0, 1.0, 0.0), skinMatrix), u_world).xyz;\n");