From 61f5d3d360ef5d60ca94e685ec6f3e7971f744dd Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 23 Mar 2014 20:37:51 -0700 Subject: [PATCH] Initial stab at tracking vertex alpha. Not sure what efficient method to use on x86... --- GPU/GLES/VertexDecoder.cpp | 9 +++++++ GPU/GLES/VertexDecoder.h | 2 +- GPU/GLES/VertexDecoderArm.cpp | 49 ++++++++++++++++++++++++++++++----- GPU/GLES/VertexDecoderX86.cpp | 24 +++++++++++++++-- 4 files changed, 75 insertions(+), 9 deletions(-) diff --git a/GPU/GLES/VertexDecoder.cpp b/GPU/GLES/VertexDecoder.cpp index 7bcab4bc35..f8cae9348b 100644 --- a/GPU/GLES/VertexDecoder.cpp +++ b/GPU/GLES/VertexDecoder.cpp @@ -219,6 +219,7 @@ void VertexDecoder::Step_Color565() const c[1] = Convert6To8((cdata>>5) & 0x3f); c[2] = Convert5To8((cdata>>11) & 0x1f); c[3] = 255; + // Always full alpha. } void VertexDecoder::Step_Color5551() const @@ -229,6 +230,7 @@ void VertexDecoder::Step_Color5551() const c[1] = Convert5To8((cdata>>5) & 0x1f); c[2] = Convert5To8((cdata>>10) & 0x1f); c[3] = (cdata >> 15) ? 255 : 0; + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] != 0; } void VertexDecoder::Step_Color4444() const @@ -237,6 +239,7 @@ void VertexDecoder::Step_Color4444() const u16 cdata = *(u16*)(ptr_ + coloff); for (int j = 0; j < 4; j++) c[j] = Convert4To8((cdata >> (j * 4)) & 0xF); + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color8888() const @@ -244,6 +247,7 @@ void VertexDecoder::Step_Color8888() const u8 *c = decoded_ + decFmt.c0off; const u8 *cdata = (const u8*)(ptr_ + coloff); memcpy(c, cdata, sizeof(u8) * 4); + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color565Morph() const @@ -262,6 +266,7 @@ void VertexDecoder::Step_Color565Morph() const c[i] = (u8)col[i]; } c[3] = 255; + // Always full alpha. } void VertexDecoder::Step_Color5551Morph() const @@ -280,6 +285,7 @@ void VertexDecoder::Step_Color5551Morph() const for (int i = 0; i < 4; i++) { c[i] = (u8)col[i]; } + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color4444Morph() const @@ -296,6 +302,7 @@ void VertexDecoder::Step_Color4444Morph() const for (int i = 0; i < 4; i++) { c[i] = (u8)col[i]; } + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_Color8888Morph() const @@ -312,6 +319,7 @@ void VertexDecoder::Step_Color8888Morph() const for (int i = 0; i < 4; i++) { c[i] = (u8)(col[i]); } + gstate_c.vertexFullAlpha = gstate_c.vertexFullAlpha && c[3] == 255; } void VertexDecoder::Step_NormalS8() const @@ -841,6 +849,7 @@ void VertexDecoder::DecodeVerts(u8 *decodedptr, const void *verts, int indexLowe jitted_(ptr_, decoded_, count); } else { // Interpret the decode steps + // TODO: Init gstate_c.vertexFullAlpha here? Or in Setup? When is it reset? for (; count; count--) { for (int i = 0; i < numSteps_; i++) { ((*this).*steps_[i])(); diff --git a/GPU/GLES/VertexDecoder.h b/GPU/GLES/VertexDecoder.h index 92643bb5ae..dad0e142c3 100644 --- a/GPU/GLES/VertexDecoder.h +++ b/GPU/GLES/VertexDecoder.h @@ -266,6 +266,6 @@ private: bool CompileStep(const VertexDecoder &dec, int i); void Jit_ApplyWeights(); void Jit_WriteMatrixMul(int outOff, bool pos); - void Jit_WriteMorphColor(int outOff); + void Jit_WriteMorphColor(int outOff, bool checkAlpha = true); const VertexDecoder *dec_; }; diff --git a/GPU/GLES/VertexDecoderArm.cpp b/GPU/GLES/VertexDecoderArm.cpp index 0efba2780a..f702246bf1 100644 --- a/GPU/GLES/VertexDecoderArm.cpp +++ b/GPU/GLES/VertexDecoderArm.cpp @@ -61,7 +61,8 @@ static const ARMReg tempReg2 = R4; static const ARMReg tempReg3 = R5; static const ARMReg scratchReg = R6; static const ARMReg scratchReg2 = R7; -static const ARMReg scratchReg3 = R12; +static const ARMReg scratchReg3 = R8; +static const ARMReg hasAlphaReg = R12; static const ARMReg srcReg = R0; static const ARMReg dstReg = R1; static const ARMReg counterReg = R2; @@ -262,6 +263,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { // TODO: Preload scale factors } + if (dec.col) { + MOV(hasAlphaReg, 0); + } + JumpTarget loopStart = GetCodePtr(); // Preload data cache ahead of reading. This offset seems pretty good. PLD(srcReg, 64); @@ -281,6 +286,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUBS(counterReg, counterReg, 1); B_CC(CC_NEQ, loopStart); + // TODO: Do something with hasAlphaReg. + if (dec.col) { + + } + if (NEONSkinning || NEONMorphing) { VPOP(D8, 8); } @@ -664,7 +674,12 @@ void VertexDecoderJitCache::Jit_TcFloatPrescale() { void VertexDecoderJitCache::Jit_Color8888() { LDR(tempReg1, srcReg, dec_->coloff); + // Set flags to determine if alpha != 0xFF. + MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24)); STR(tempReg1, dstReg, dec_->decFmt.c0off); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); } void VertexDecoderJitCache::Jit_Color4444() { @@ -679,10 +694,16 @@ void VertexDecoderJitCache::Jit_Color4444() { ANDI2R(tempReg3, tempReg1, 0xF000, scratchReg); ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 12)); - // And saturate. + // And expand to 8 bits. ORR(tempReg1, tempReg2, Operand2(tempReg2, ST_LSL, 4)); STR(tempReg1, dstReg, dec_->decFmt.c0off); + + // Set flags to determine if alpha != 0xFF. + MVNS(tempReg2, Operand2(tempReg1, ST_ASR, 24)); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); } void VertexDecoderJitCache::Jit_Color565() { @@ -706,7 +727,7 @@ void VertexDecoderJitCache::Jit_Color565() { ORR(tempReg3, tempReg3, Operand2(tempReg1, ST_LSR, 4)); ORR(tempReg2, tempReg2, Operand2(tempReg3, ST_LSL, 8)); - // Add in full alpha. + // Add in full alpha. No need to update hasAlphaReg. ORI2R(tempReg1, tempReg2, 0xFF000000, scratchReg); STR(tempReg1, dstReg, dec_->decFmt.c0off); @@ -731,8 +752,13 @@ void VertexDecoderJitCache::Jit_Color5551() { // Now we just need alpha. Since we loaded as signed, it'll be extended. ANDI2R(tempReg1, tempReg1, 0xFF000000, scratchReg); ORR(tempReg2, tempReg2, tempReg1); - + + // Set flags to determine if alpha != 0xFF. + MVNS(tempReg3, Operand2(tempReg1, ST_ASR, 24)); STR(tempReg2, dstReg, dec_->decFmt.c0off); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -957,7 +983,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() { } else { VMOV(S11, tempReg3); } - Jit_WriteMorphColor(dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, false); } // First is the left shift, second is the right shift (against walls, to get the RGBA values.) @@ -1045,13 +1071,16 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { } // Expects RGBA color in S8 - S11, which is Q2. -void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { +void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { if (NEONMorphing) { ADDI2R(tempReg1, dstReg, outOff, scratchReg); VCVT(I_32 | I_UNSIGNED, neonScratchRegQ, neonScratchRegQ); VQMOVN(I_32 | I_UNSIGNED, neonScratchReg, neonScratchRegQ); VQMOVN(I_16 | I_UNSIGNED, neonScratchReg, neonScratchRegQ); VST1_lane(I_32, neonScratchReg, tempReg1, 0, true); + if (checkAlpha) { + VMOV_neon(I_32, scratchReg, neonScratchReg, 0); + } } else { VCVT(S8, S8, TO_INT); VCVT(S9, S9, TO_INT); @@ -1066,6 +1095,14 @@ void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { ORR(scratchReg, scratchReg, Operand2(tempReg3, ST_LSL, 24)); STR(scratchReg, dstReg, outOff); } + + // Set flags to determine if alpha != 0xFF. + if (checkAlpha) { + MVNS(tempReg2, Operand2(scratchReg, ST_ASR, 24)); + SetCC(CC_NEQ); + ORR(hasAlphaReg, hasAlphaReg, IMM(1)); + SetCC(CC_AL); + } } void VertexDecoderJitCache::Jit_NormalS8() { diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp index 32223cf42e..d1128144f7 100644 --- a/GPU/GLES/VertexDecoderX86.cpp +++ b/GPU/GLES/VertexDecoderX86.cpp @@ -54,6 +54,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RCX; static const X64Reg dstReg = RDX; static const X64Reg counterReg = R8; +static const OpArg hasAlphaArg = R(R14); #else static const X64Reg tempReg1 = RAX; static const X64Reg tempReg2 = R9; @@ -61,6 +62,7 @@ static const X64Reg tempReg3 = R10; static const X64Reg srcReg = RDI; static const X64Reg dstReg = RSI; static const X64Reg counterReg = RDX; +static const OpArg hasAlphaArg = R(R14); #endif #else static const X64Reg tempReg1 = EAX; @@ -69,6 +71,8 @@ static const X64Reg tempReg3 = EDX; static const X64Reg srcReg = ESI; static const X64Reg dstReg = EDI; static const X64Reg counterReg = ECX; +static u32 hasAlphaValue; +static const OpArg hasAlphaArg = M(&hasAlphaValue); #endif // XMM0-XMM5 are volatile on Windows X64 @@ -234,6 +238,10 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { UNPCKLPD(fpScaleOffsetReg, R(fpScratchReg)); } + if (dec.col) { + MOV(32, hasAlphaArg, Imm32(0)); + } + // Let's not bother with a proper stack frame. We just grab the arguments and go. JumpTarget loopStart = GetCodePtr(); for (int i = 0; i < dec.numSteps_; i++) { @@ -249,6 +257,11 @@ JittedVertexDecoder VertexDecoderJitCache::Compile(const VertexDecoder &dec) { SUB(32, R(counterReg), Imm8(1)); J_CC(CC_NZ, loopStart, true); + // TODO: Do something with hasAlphaArg from EAX. + if (dec.col) { + //MOV(32, R(EAX), hasAlphaArg); + } + MOVUPS(XMM4, MDisp(ESP, 0)); MOVUPS(XMM5, MDisp(ESP, 16)); MOVUPS(XMM6, MDisp(ESP, 32)); @@ -556,6 +569,7 @@ void VertexDecoderJitCache::Jit_TcFloatThrough() { void VertexDecoderJitCache::Jit_Color8888() { MOV(32, R(tempReg1), MDisp(srcReg, dec_->coloff)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg1)); + // TODO: hasAlphaArg. } static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; @@ -625,6 +639,7 @@ void VertexDecoderJitCache::Jit_Color4444() { OR(32, R(tempReg2), R(tempReg3)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + // TODO: hasAlphaArg. } void VertexDecoderJitCache::Jit_Color565() { @@ -661,6 +676,7 @@ void VertexDecoderJitCache::Jit_Color565() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + // Never has alpha, no need to update hasAlphaArg. } void VertexDecoderJitCache::Jit_Color5551() { @@ -696,6 +712,7 @@ void VertexDecoderJitCache::Jit_Color5551() { OR(32, R(tempReg2), R(tempReg1)); MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); + // TODO: hasAlphaArg. } void VertexDecoderJitCache::Jit_Color8888Morph() { @@ -825,7 +842,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() { } } - Jit_WriteMorphColor(dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, false); } // Intentionally in reverse order. @@ -884,12 +901,15 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { Jit_WriteMorphColor(dec_->decFmt.c0off); } -void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff) { +void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { // Pack back into a u32. CVTPS2DQ(fpScratchReg, R(fpScratchReg)); PACKSSDW(fpScratchReg, R(fpScratchReg)); PACKUSWB(fpScratchReg, R(fpScratchReg)); MOVD_xmm(MDisp(dstReg, outOff), fpScratchReg); + if (checkAlpha) { + // TODO: hasAlphaArg. + } } // Copy 3 bytes and then a zero. Might as well copy four.