From 9f4301000436790345ee8e15f5366f1a492afa38 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Tue, 14 Feb 2023 20:38:46 -0800 Subject: [PATCH] riscv: Reuse some color morphing code. Small optimization loss in 5551, but fairly minimal and I think it's worth it. --- GPU/Common/VertexDecoderRiscV.cpp | 161 ++++++------------------------ 1 file changed, 31 insertions(+), 130 deletions(-) diff --git a/GPU/Common/VertexDecoderRiscV.cpp b/GPU/Common/VertexDecoderRiscV.cpp index 2263437e2a..9a052242b1 100644 --- a/GPU/Common/VertexDecoderRiscV.cpp +++ b/GPU/Common/VertexDecoderRiscV.cpp @@ -883,48 +883,7 @@ void VertexDecoderJitCache::Jit_Color8888Morph() { FMADD(32, fpSrc[3], fpScratchReg1, fpScratchReg4, fpSrc[3]); } - if (cpu_info.RiscV_B) { - LI(scratchReg, 0xFF); - FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - MAX(tempReg1, tempReg1, R_ZERO); - MIN(tempReg1, tempReg1, scratchReg); - for (int i = 1; i < 4; ++i) { - FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); - MAX(tempReg2, tempReg2, R_ZERO); - MIN(tempReg2, tempReg2, scratchReg); - // If it's alpha, set tempReg3 as a flag. - if (i == 3) - SLTIU(tempReg3, tempReg2, 0xFF); - SLLI(tempReg2, tempReg2, i * 8); - OR(tempReg1, tempReg1, tempReg2); - } - } else { - // Clamp to [0, 255] as floats, since we have FMIN/FMAX. Better than branching, probably... - LI(scratchReg, 255.0f); - FMV(FMv::W, FMv::X, fpScratchReg1, R_ZERO); - FMV(FMv::W, FMv::X, fpScratchReg2, scratchReg); - for (int i = 0; i < 4; ++i) { - FMAX(32, fpSrc[i], fpSrc[i], fpScratchReg1); - FMIN(32, fpSrc[i], fpSrc[i], fpScratchReg2); - } - - FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - for (int i = 1; i < 4; ++i) { - FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); - // If it's alpha, set tempReg3 as a flag. - if (i == 3) - SLTIU(tempReg3, tempReg2, 0xFF); - SLLI(tempReg2, tempReg2, i * 8); - OR(tempReg1, tempReg1, tempReg2); - } - } - - // Now use the flag we set earlier to update fullAlphaReg. - // We translate it to a mask, tempReg3=-1 if full alpha, 0 otherwise. - ADDI(tempReg3, tempReg3, -1); - AND(fullAlphaReg, fullAlphaReg, tempReg3); - - SW(tempReg1, dstReg, dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, true); } void VertexDecoderJitCache::Jit_Color4444Morph() { @@ -952,48 +911,7 @@ void VertexDecoderJitCache::Jit_Color4444Morph() { FMADD(32, fpSrc[3], fpScratchReg1, fpScratchReg4, fpSrc[3]); } - if (cpu_info.RiscV_B) { - LI(scratchReg, 0xFF); - FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - MAX(tempReg1, tempReg1, R_ZERO); - MIN(tempReg1, tempReg1, scratchReg); - for (int i = 1; i < 4; ++i) { - FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); - MAX(tempReg2, tempReg2, R_ZERO); - MIN(tempReg2, tempReg2, scratchReg); - // If it's alpha, set tempReg3 as a flag. - if (i == 3) - SLTIU(tempReg3, tempReg2, 0xFF); - SLLI(tempReg2, tempReg2, i * 8); - OR(tempReg1, tempReg1, tempReg2); - } - } else { - // Clamp to [0, 255] as floats, since we have FMIN/FMAX. Better than branching, probably... - LI(scratchReg, 255.0f); - FMV(FMv::W, FMv::X, fpScratchReg1, R_ZERO); - FMV(FMv::W, FMv::X, fpScratchReg2, scratchReg); - for (int i = 0; i < 4; ++i) { - FMAX(32, fpSrc[i], fpSrc[i], fpScratchReg1); - FMIN(32, fpSrc[i], fpSrc[i], fpScratchReg2); - } - - FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - for (int i = 1; i < 4; ++i) { - FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); - // If it's alpha, set tempReg3 as a flag. - if (i == 3) - SLTIU(tempReg3, tempReg2, 0xFF); - SLLI(tempReg2, tempReg2, i * 8); - OR(tempReg1, tempReg1, tempReg2); - } - } - - // Now use the flag we set earlier to update fullAlphaReg. - // We translate it to a mask, tempReg3=-1 if full alpha, 0 otherwise. - ADDI(tempReg3, tempReg3, -1); - AND(fullAlphaReg, fullAlphaReg, tempReg3); - - SW(tempReg1, dstReg, dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, true); } void VertexDecoderJitCache::Jit_Color565Morph() { @@ -1033,43 +951,7 @@ void VertexDecoderJitCache::Jit_Color565Morph() { FMADD(32, fpSrc[2], fpScratchReg1, fpScratchReg3, fpSrc[2]); } - if (cpu_info.RiscV_B) { - LI(scratchReg, 0xFF); - FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - MAX(tempReg1, tempReg1, R_ZERO); - MIN(tempReg1, tempReg1, scratchReg); - for (int i = 1; i < 3; ++i) { - FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); - MAX(tempReg2, tempReg2, R_ZERO); - MIN(tempReg2, tempReg2, scratchReg); - SLLI(tempReg2, tempReg2, i * 8); - OR(tempReg1, tempReg1, tempReg2); - } - - SLLI(scratchReg, scratchReg, 24); - OR(tempReg1, tempReg1, scratchReg); - } else { - // Clamp to [0, 255] as floats, since we have FMIN/FMAX. Better than branching, probably... - LI(scratchReg, 255.0f); - FMV(FMv::W, FMv::X, fpScratchReg1, R_ZERO); - FMV(FMv::W, FMv::X, fpScratchReg2, scratchReg); - for (int i = 0; i < 3; ++i) { - FMAX(32, fpSrc[i], fpSrc[i], fpScratchReg1); - FMIN(32, fpSrc[i], fpSrc[i], fpScratchReg2); - } - - FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - for (int i = 1; i < 3; ++i) { - FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); - SLLI(tempReg2, tempReg2, i * 8); - OR(tempReg1, tempReg1, tempReg2); - } - - LI(scratchReg, (s32)0xFF000000); - OR(tempReg1, tempReg1, scratchReg); - } - - SW(tempReg1, dstReg, dec_->decFmt.c0off); + Jit_WriteMorphColor(dec_->decFmt.c0off, false); } void VertexDecoderJitCache::Jit_Color5551Morph() { @@ -1105,12 +987,16 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { FMV(FMv::W, FMv::X, fpScratchReg2, scratchReg); FMUL(32, fpSrc[3], fpSrc[3], fpScratchReg2, Round::TOZERO); + Jit_WriteMorphColor(dec_->decFmt.c0off, true); +} + +void VertexDecoderJitCache::Jit_WriteMorphColor(int outOff, bool checkAlpha) { if (cpu_info.RiscV_B) { LI(scratchReg, 0xFF); FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); MAX(tempReg1, tempReg1, R_ZERO); MIN(tempReg1, tempReg1, scratchReg); - for (int i = 1; i < 4; ++i) { + for (int i = 1; i < (checkAlpha ? 4 : 3); ++i) { FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); MAX(tempReg2, tempReg2, R_ZERO); MIN(tempReg2, tempReg2, scratchReg); @@ -1120,17 +1006,24 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { SLLI(tempReg2, tempReg2, i * 8); OR(tempReg1, tempReg1, tempReg2); } + + if (!checkAlpha) { + // For 565 only, take our 0xFF constant above and slot it into alpha. + SLLI(scratchReg, scratchReg, 24); + OR(tempReg1, tempReg1, scratchReg); + } } else { // Clamp to [0, 255] as floats, since we have FMIN/FMAX. Better than branching, probably... - // We still have 255.0f in fpScratchReg2. + LI(scratchReg, 255.0f); FMV(FMv::W, FMv::X, fpScratchReg1, R_ZERO); - for (int i = 0; i < 4; ++i) { + FMV(FMv::W, FMv::X, fpScratchReg2, scratchReg); + for (int i = 0; i < (checkAlpha ? 4 : 3); ++i) { FMAX(32, fpSrc[i], fpSrc[i], fpScratchReg1); FMIN(32, fpSrc[i], fpSrc[i], fpScratchReg2); } FCVT(FConv::WU, FConv::S, tempReg1, fpSrc[0], Round::TOZERO); - for (int i = 1; i < 4; ++i) { + for (int i = 1; i < (checkAlpha ? 4 : 3); ++i) { FCVT(FConv::WU, FConv::S, tempReg2, fpSrc[i], Round::TOZERO); // If it's alpha, set tempReg3 as a flag. if (i == 3) @@ -1138,14 +1031,22 @@ void VertexDecoderJitCache::Jit_Color5551Morph() { SLLI(tempReg2, tempReg2, i * 8); OR(tempReg1, tempReg1, tempReg2); } + + if (!checkAlpha) { + // For 565 only, we need to force alpha to 0xFF. + LI(scratchReg, (s32)0xFF000000); + OR(tempReg1, tempReg1, scratchReg); + } } - // Now use the flag we set earlier to update fullAlphaReg. - // We translate it to a mask, tempReg3=-1 if full alpha, 0 otherwise. - ADDI(tempReg3, tempReg3, -1); - AND(fullAlphaReg, fullAlphaReg, tempReg3); + if (checkAlpha) { + // Now use the flag we set earlier to update fullAlphaReg. + // We translate it to a mask, tempReg3=-1 if full alpha, 0 otherwise. + ADDI(tempReg3, tempReg3, -1); + AND(fullAlphaReg, fullAlphaReg, tempReg3); + } - SW(tempReg1, dstReg, dec_->decFmt.c0off); + SW(tempReg1, dstReg, outOff); } void VertexDecoderJitCache::Jit_AnyS8ToFloat(int srcoff) {