From 38d0bac1df5e68c48e0074140e26d5386db1a09e Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 17 Mar 2014 01:21:52 -0700 Subject: [PATCH] Optimize some 4444/8888 color conversions. Small performance boost in softgpu. --- GPU/GLES/VertexDecoderX86.cpp | 62 ++++++++++++++++++++--------------- GPU/Software/Colors.h | 29 ++++++++-------- 2 files changed, 49 insertions(+), 42 deletions(-) diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp index 11ce4065a7..808b4a30b3 100644 --- a/GPU/GLES/VertexDecoderX86.cpp +++ b/GPU/GLES/VertexDecoderX86.cpp @@ -565,11 +565,23 @@ void VertexDecoderJitCache::Jit_Color8888() { } static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, }; - +static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, }; void VertexDecoderJitCache::Jit_Color4444() { // Needs benchmarking. A bit wasteful by only using 1 SSE lane. #if 0 + MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff)); + PUNPCKLBW(fpScratchReg, R(fpScratchReg)); + PAND(fpScratchReg, M(color4444mask)); + MOVSS(fpScratchReg2, R(fpScratchReg)); + MOVSS(fpScratchReg3, R(fpScratchReg)); + PSRLW(fpScratchReg2, 4); + PSLLW(fpScratchReg3, 4); + POR(fpScratchReg, R(fpScratchReg2)); + POR(fpScratchReg, R(fpScratchReg3)); + MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), fpScratchReg); + return; +#elif 0 // Alternate approach MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff)); MOVAPS(XMM2, R(XMM3)); @@ -592,36 +604,32 @@ void VertexDecoderJitCache::Jit_Color4444() { MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff)); - // 0000ABGR, copy R and double forwards. + // Pick out A and B, and space them out by a nibble. + MOV(32, R(tempReg2), R(tempReg1)); MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x0000000F)); - MOV(32, R(tempReg2), R(tempReg3)); + AND(32, R(tempReg2), Imm32(0x0000F000)); + AND(32, R(tempReg3), Imm32(0x00000F00)); + SHL(32, R(tempReg2), Imm8(4)); + OR(32, R(tempReg2), R(tempReg3)); + + // Now grab R and G. + MOV(32, R(tempReg3), R(tempReg1)); + AND(32, R(tempReg1), Imm32(0x0000000F)); + AND(32, R(tempReg3), Imm32(0x000000F0)); + + // Currently: 000A0B00, so let's shift once so G is spaced out. + SHL(32, R(tempReg2), Imm8(4)); + OR(32, R(tempReg2), R(tempReg3)); + + // Now: 00A0B0G0, so shift it once more to add R at the bottom. + SHL(32, R(tempReg2), Imm8(4)); + OR(32, R(tempReg2), R(tempReg1)); + + // Now we just need to duplicate the nibbles. + MOV(32, R(tempReg3), R(tempReg2)); SHL(32, R(tempReg3), Imm8(4)); OR(32, R(tempReg2), R(tempReg3)); - // tempReg1 -> 00ABGR00, then double G backwards. - SHL(32, R(tempReg1), Imm8(8)); - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x0000F000)); - OR(32, R(tempReg2), R(tempReg3)); - SHR(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - - // Now do B forwards again (still 00ABGR00.) - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0x000F0000)); - OR(32, R(tempReg2), R(tempReg3)); - SHL(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - - // tempReg1 -> ABGR0000, then double A backwards. - SHL(32, R(tempReg1), Imm8(8)); - MOV(32, R(tempReg3), R(tempReg1)); - AND(32, R(tempReg3), Imm32(0xF0000000)); - OR(32, R(tempReg2), R(tempReg3)); - SHR(32, R(tempReg3), Imm8(4)); - OR(32, R(tempReg2), R(tempReg3)); - MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2)); } diff --git a/GPU/Software/Colors.h b/GPU/Software/Colors.h index db32848d69..6bd9674b86 100644 --- a/GPU/Software/Colors.h +++ b/GPU/Software/Colors.h @@ -17,15 +17,17 @@ #pragma once -#include "CommonTypes.h" +#include "Common/CommonTypes.h" static inline u32 DecodeRGBA4444(u16 src) { - u8 r = Convert4To8((src >> 0) & 0x0f); - u8 g = Convert4To8((src >> 4) & 0x0f); - u8 b = Convert4To8((src >> 8) & 0x0f); - u8 a = Convert4To8((src >> 12) & 0x0f); - return (a << 24) | (b << 16) | (g << 8) | r; + const u32 r = (src & 0x000F) << 0; + const u32 g = (src & 0x00F0) << 4; + const u32 b = (src & 0x0F00) << 8; + const u32 a = (src & 0xF000) << 12; + + const u32 c = r | g | b | a; + return c | (c << 4); } static inline u32 DecodeRGBA5551(u16 src) @@ -87,13 +89,10 @@ static inline u16 RGBA8888To5551(u32 value) static inline u16 RGBA8888To4444(u32 value) { - u8 r = value & 0xFF; - u8 g = (value >> 8) & 0xFF; - u8 b = (value >> 16) & 0xFF; - u8 a = (value >> 24) & 0xFF; - r >>= 4; - g >>= 4; - b >>= 4; - a >>= 4; - return (u16)r | ((u16)g << 4) | ((u16)b << 8) | ((u16)a << 12); + const u32 c = value >> 4; + const u16 r = (c >> 0) & 0x000F; + const u16 g = (c >> 4) & 0x00F0; + const u16 b = (c >> 8) & 0x0F00; + const u16 a = (c >> 12) & 0xF000; + return r | g | b | a; }