From 38d0bac1df5e68c48e0074140e26d5386db1a09e Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Mon, 17 Mar 2014 01:21:52 -0700
Subject: [PATCH] Optimize some 4444/8888 color conversions.

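Small performance boost in softgpu.  Also shortens the scalar 4444 color
path in the x86 vertex decoder jit and adds a disabled (#if 0) SSE variant.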
---
 GPU/GLES/VertexDecoderX86.cpp | 62 ++++++++++++++++++++---------------
 GPU/Software/Colors.h         | 29 ++++++++--------
 2 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/GPU/GLES/VertexDecoderX86.cpp b/GPU/GLES/VertexDecoderX86.cpp
index 11ce4065a7..808b4a30b3 100644
--- a/GPU/GLES/VertexDecoderX86.cpp
+++ b/GPU/GLES/VertexDecoderX86.cpp
@@ -565,11 +565,23 @@ void VertexDecoderJitCache::Jit_Color8888() {
 }
 
 static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
-
+static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };  // Keeps the top and bottom nibble of every 16-bit lane; used by the disabled SSE path in Jit_Color4444.
 
 void VertexDecoderJitCache::Jit_Color4444() {
 	// Needs benchmarking. A bit wasteful by only using 1 SSE lane.
 #if 0
+	MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));  // Load 32 bits at srcReg + coloff; the low 16 bits are the ABGR 4444 color.
+	PUNPCKLBW(fpScratchReg, R(fpScratchReg));  // Interleave with itself so each byte is doubled: 16-bit lanes GRGR, ABAB.
+	PAND(fpScratchReg, M(color4444mask));  // Keep only the outer nibbles of each lane: G00R, A00B.
+	MOVSS(fpScratchReg2, R(fpScratchReg));  // Copy of the low 32 bits, to be shifted right.
+	MOVSS(fpScratchReg3, R(fpScratchReg));  // Copy of the low 32 bits, to be shifted left.
+	PSRLW(fpScratchReg2, 4);  // Lanes become 0G00, 0A00 (the low nibble falls off).
+	PSLLW(fpScratchReg3, 4);  // Lanes become 00R0, 00B0 (the high nibble falls off).
+	POR(fpScratchReg, R(fpScratchReg2));  // Lanes: GG0R, AA0B.
+	POR(fpScratchReg, R(fpScratchReg3));  // Lanes: GGRR, AABB, i.e. the dword AABBGGRR.
+	MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), fpScratchReg);  // Store the 32-bit result at dstReg + c0off.
+	return;
+#elif 0
 	// Alternate approach
 	MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff));
 	MOVAPS(XMM2, R(XMM3));
@@ -592,36 +604,32 @@ void VertexDecoderJitCache::Jit_Color4444() {
 
 	MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
 
-	// 0000ABGR, copy R and double forwards.
+	// tempReg1 is 0000ABGR here.  Pick out A and B, and space them out by a nibble.
+	MOV(32, R(tempReg2), R(tempReg1));
 	MOV(32, R(tempReg3), R(tempReg1));
-	AND(32, R(tempReg3), Imm32(0x0000000F));
-	MOV(32, R(tempReg2), R(tempReg3));
+	AND(32, R(tempReg2), Imm32(0x0000F000));  // tempReg2 = 0000A000
+	AND(32, R(tempReg3), Imm32(0x00000F00));  // tempReg3 = 00000B00
+	SHL(32, R(tempReg2), Imm8(4));  // tempReg2 = 000A0000
+	OR(32, R(tempReg2), R(tempReg3));  // tempReg2 = 000A0B00
+
+	// Now grab R and G (tempReg1 is reused in place to hold R alone.)
+	MOV(32, R(tempReg3), R(tempReg1));
+	AND(32, R(tempReg1), Imm32(0x0000000F));  // tempReg1 = 0000000R
+	AND(32, R(tempReg3), Imm32(0x000000F0));  // tempReg3 = 000000G0
+
+	// Currently: 000A0B00, so let's shift once so G is spaced out.
+	SHL(32, R(tempReg2), Imm8(4));  // tempReg2 = 00A0B000
+	OR(32, R(tempReg2), R(tempReg3));  // tempReg2 = 00A0B0G0
+
+	// Now: 00A0B0G0, so shift it once more to add R at the bottom.
+	SHL(32, R(tempReg2), Imm8(4));  // tempReg2 = 0A0B0G00
+	OR(32, R(tempReg2), R(tempReg1));  // tempReg2 = 0A0B0G0R
+
+	// Now we just need to duplicate the nibbles: x | (x << 4) turns 0A0B0G0R into AABBGGRR.
+	MOV(32, R(tempReg3), R(tempReg2));
 	SHL(32, R(tempReg3), Imm8(4));
 	OR(32, R(tempReg2), R(tempReg3));
 
-	// tempReg1 -> 00ABGR00, then double G backwards.
-	SHL(32, R(tempReg1), Imm8(8));
-	MOV(32, R(tempReg3), R(tempReg1));
-	AND(32, R(tempReg3), Imm32(0x0000F000));
-	OR(32, R(tempReg2), R(tempReg3));
-	SHR(32, R(tempReg3), Imm8(4));
-	OR(32, R(tempReg2), R(tempReg3));
-
-	// Now do B forwards again (still 00ABGR00.)
-	MOV(32, R(tempReg3), R(tempReg1));
-	AND(32, R(tempReg3), Imm32(0x000F0000));
-	OR(32, R(tempReg2), R(tempReg3));
-	SHL(32, R(tempReg3), Imm8(4));
-	OR(32, R(tempReg2), R(tempReg3));
-
-	// tempReg1 -> ABGR0000, then double A backwards.
-	SHL(32, R(tempReg1), Imm8(8));
-	MOV(32, R(tempReg3), R(tempReg1));
-	AND(32, R(tempReg3), Imm32(0xF0000000));
-	OR(32, R(tempReg2), R(tempReg3));
-	SHR(32, R(tempReg3), Imm8(4));
-	OR(32, R(tempReg2), R(tempReg3));
-
 	MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
 }
 
diff --git a/GPU/Software/Colors.h b/GPU/Software/Colors.h
index db32848d69..6bd9674b86 100644
--- a/GPU/Software/Colors.h
+++ b/GPU/Software/Colors.h
@@ -17,15 +17,17 @@
 
 #pragma once
 
-#include "CommonTypes.h"
+#include "Common/CommonTypes.h"
 
 static inline u32 DecodeRGBA4444(u16 src)
 {
-	u8 r = Convert4To8((src >> 0) & 0x0f);
-	u8 g = Convert4To8((src >> 4) & 0x0f);
-	u8 b = Convert4To8((src >> 8) & 0x0f);
-	u8 a = Convert4To8((src >> 12) & 0x0f);
-	return (a << 24) | (b << 16) | (g << 8) | r;
+	const u32 r = (src & 0x000F) << 0;  // 0x0000000R
+	const u32 g = (src & 0x00F0) << 4;  // 0x00000G00
+	const u32 b = (src & 0x0F00) << 8;  // 0x000B0000
+	const u32 a = (src & 0xF000) << 12;  // 0x0A000000
+
+	const u32 c = r | g | b | a;  // 0x0A0B0G0R: each nibble sits in the low half of its own byte.
+	return c | (c << 4);  // Copy every nibble into the empty nibble above it: 0xAABBGGRR.
 }
 
 static inline u32 DecodeRGBA5551(u16 src)
@@ -87,13 +89,10 @@ static inline u16 RGBA8888To5551(u32 value)
 
 static inline u16 RGBA8888To4444(u32 value)
 {
-	u8 r = value & 0xFF;
-	u8 g = (value >> 8) & 0xFF;
-	u8 b = (value >> 16) & 0xFF;
-	u8 a = (value >> 24) & 0xFF;
-	r >>= 4;
-	g >>= 4;
-	b >>= 4;
-	a >>= 4;
-	return (u16)r | ((u16)g << 4) | ((u16)b << 8) | ((u16)a << 12);
+	const u32 c = value >> 4;  // Move each channel's top nibble into the low nibble of its byte.
+	const u16 r = (c >>  0) & 0x000F;  // value bits 4-7   -> result bits 0-3
+	const u16 g = (c >>  4) & 0x00F0;  // value bits 12-15 -> result bits 4-7
+	const u16 b = (c >>  8) & 0x0F00;  // value bits 20-23 -> result bits 8-11
+	const u16 a = (c >> 12) & 0xF000;  // value bits 28-31 -> result bits 12-15
+	return r | g | b | a;  // Each channel keeps only its top 4 bits (truncated, not rounded.)
 }