diff --git a/Common/CPUDetect.cpp b/Common/CPUDetect.cpp index 01a4d06cf4..262bbf4def 100644 --- a/Common/CPUDetect.cpp +++ b/Common/CPUDetect.cpp @@ -255,6 +255,8 @@ void CPUInfo::Detect() { if ((cpu_id[1] >> 11) & 1) bRTM = true; } + + bBMI2_fast = bBMI2 && (vendor != VENDOR_AMD || family >= 0x19); } if (max_ex_fn >= 0x80000004) { // Extract brand string diff --git a/Common/CPUDetect.h b/Common/CPUDetect.h index c2985fc3c5..cdf32bf1b8 100644 --- a/Common/CPUDetect.h +++ b/Common/CPUDetect.h @@ -54,6 +54,7 @@ struct CPUInfo { bool bLZCNT; bool bBMI1; bool bBMI2; + bool bBMI2_fast; bool bXOP; bool bRTM; diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp index f4c17011a1..912915aed8 100644 --- a/GPU/Software/DrawPixelX86.cpp +++ b/GPU/Software/DrawPixelX86.cpp @@ -2143,6 +2143,24 @@ bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg color bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) { Describe("ConvertFrom565"); + + if (cpu_info.bBMI2_fast) { + // Start off with the high bits. + MOV(32, R(temp1Reg), Imm32(0x00F8FCF8)); + PDEP(32, temp1Reg, colorReg, R(temp1Reg)); + + // Now grab the low bits (they end up packed.) + MOV(32, R(temp2Reg), Imm32(0x0000E61C)); + PEXT(32, colorReg, colorReg, R(temp2Reg)); + // And spread them back out. + MOV(32, R(temp2Reg), Imm32(0x00070307)); + PDEP(32, colorReg, colorReg, R(temp2Reg)); + + // Finally put the high bits in, we're done. + OR(32, R(colorReg), R(temp1Reg)); + return true; + } + // Filter out red only into temp1. MOV(32, R(temp1Reg), R(colorReg)); AND(16, R(temp1Reg), Imm16(0x1F << 0)); @@ -2178,6 +2196,27 @@ bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colo bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) { Describe("ConvertFrom5551"); + + if (cpu_info.bBMI2_fast) { + // First, grab the top bits. + MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8)); + PDEP(32, colorReg, colorReg, R(temp1Reg)); + + // Now make the swizzle bits. + MOV(32, R(temp2Reg), R(colorReg)); + SHR(32, R(temp2Reg), Imm8(5)); + AND(32, R(temp2Reg), Imm32(0x00070707)); + + if (keepAlpha) { + // Sign extend the alpha bit to 8 bits. + SHL(32, R(colorReg), Imm8(7)); + SAR(32, R(colorReg), Imm8(7)); + } + + OR(32, R(colorReg), R(temp2Reg)); + return true; + } + // Filter out red only into temp1. MOV(32, R(temp1Reg), R(colorReg)); AND(16, R(temp1Reg), Imm16(0x1F << 0)); @@ -2215,6 +2254,19 @@ bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg col bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) { Describe("ConvertFrom4444"); + + if (cpu_info.bBMI2_fast) { + // First, spread the bits out with spaces. + MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0)); + PDEP(32, colorReg, colorReg, R(temp1Reg)); + + // Now swizzle the low bits in. + MOV(32, R(temp1Reg), R(colorReg)); + SHR(32, R(temp1Reg), Imm8(4)); + OR(32, R(colorReg), R(temp1Reg)); + return true; + } + // Move red into position within temp1. MOV(32, R(temp1Reg), R(colorReg)); AND(16, R(temp1Reg), Imm16(0xF << 0));