Optimize some 4444/8888 color conversions.

Small performance boost in softgpu.
This commit is contained in:
Unknown W. Brackets 2014-03-17 01:21:52 -07:00
parent 6de2129f98
commit 38d0bac1df
2 changed files with 49 additions and 42 deletions

View file

@ -565,11 +565,23 @@ void VertexDecoderJitCache::Jit_Color8888() {
}
// Four identical 32-bit lanes of 0x0f0f0f0f: a mask that keeps the low nibble
// of every byte.
// NOTE(review): MEMORY_ALIGNED16 is defined elsewhere; presumably it requests
// 16-byte alignment so the table can be used as an SSE memory operand — confirm.
static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
// Four identical 32-bit lanes of 0xf00ff00f: within each 16-bit word this keeps
// the top nibble of the high byte and the bottom nibble of the low byte.
static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
// Emits code that reads the 16-bit color at srcReg + dec_->coloff and stores a
// 32-bit value at dstReg + dec_->decFmt.c0off. Going by the name and the inline
// comments, each 4-bit channel is widened to 8 bits by duplicating its nibble.
// NOTE(review): this listing comes from a rendered diff. The hunk header partway
// through hides some lines of the function, and replaced and replacement
// statements appear together without +/- markers, so the statement order shown
// here is not a buildable instruction sequence — confirm against the real file.
void VertexDecoderJitCache::Jit_Color4444() {
// Needs benchmarking. A bit wasteful by only using 1 SSE lane.
#if 0
// Disabled SSE variant.
// Load 32 bits starting at the color offset.
MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
// Interleave the register with itself so every source byte appears twice.
PUNPCKLBW(fpScratchReg, R(fpScratchReg));
// Keep only the outer nibbles of each 16-bit word.
PAND(fpScratchReg, M(color4444mask));
MOVSS(fpScratchReg2, R(fpScratchReg));
MOVSS(fpScratchReg3, R(fpScratchReg));
// Shifted copies (right by 4 and left by 4 per word) fill in the inner nibbles.
PSRLW(fpScratchReg2, 4);
PSLLW(fpScratchReg3, 4);
POR(fpScratchReg, R(fpScratchReg2));
POR(fpScratchReg, R(fpScratchReg3));
// Store the low 32 bits as the decoded color.
MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), fpScratchReg);
return;
#elif 0
// Alternate approach
MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff));
MOVAPS(XMM2, R(XMM3));
// NOTE(review): the diff hunk header below skips lines of this function that
// are not visible in this listing.
@ -592,36 +604,32 @@ void VertexDecoderJitCache::Jit_Color4444() {
// Load the 16-bit color, zero-extended to 32 bits.
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
// 0000ABGR, copy R and double forwards.
// Pick out A and B, and space them out by a nibble.
MOV(32, R(tempReg2), R(tempReg1));
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg3), Imm32(0x0000000F));
MOV(32, R(tempReg2), R(tempReg3));
AND(32, R(tempReg2), Imm32(0x0000F000));
AND(32, R(tempReg3), Imm32(0x00000F00));
SHL(32, R(tempReg2), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));
// Now grab R and G.
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg1), Imm32(0x0000000F));
AND(32, R(tempReg3), Imm32(0x000000F0));
// Currently: 000A0B00, so let's shift once so G is spaced out.
SHL(32, R(tempReg2), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));
// Now: 00A0B0G0, so shift it once more to add R at the bottom.
SHL(32, R(tempReg2), Imm8(4));
OR(32, R(tempReg2), R(tempReg1));
// Now we just need to duplicate the nibbles.
MOV(32, R(tempReg3), R(tempReg2));
SHL(32, R(tempReg3), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));
// tempReg1 -> 00ABGR00, then double G backwards.
SHL(32, R(tempReg1), Imm8(8));
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg3), Imm32(0x0000F000));
OR(32, R(tempReg2), R(tempReg3));
SHR(32, R(tempReg3), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));
// Now do B forwards again (still 00ABGR00.)
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg3), Imm32(0x000F0000));
OR(32, R(tempReg2), R(tempReg3));
SHL(32, R(tempReg3), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));
// tempReg1 -> ABGR0000, then double A backwards.
SHL(32, R(tempReg1), Imm8(8));
MOV(32, R(tempReg3), R(tempReg1));
AND(32, R(tempReg3), Imm32(0xF0000000));
OR(32, R(tempReg2), R(tempReg3));
SHR(32, R(tempReg3), Imm8(4));
OR(32, R(tempReg2), R(tempReg3));
// Store the assembled 32-bit color from tempReg2.
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
}

View file

@ -17,15 +17,17 @@
#pragma once
#include "CommonTypes.h"
#include "Common/CommonTypes.h"
// Expands a 16-bit RGBA4444 color into a 32-bit RGBA8888 color.
//
// src holds one nibble per channel, from the lowest nibble up: R, G, B, A.
// The result holds one byte per channel in the same order (R in bits 0-7,
// A in bits 24-31). Each nibble is copied into both halves of its byte, so
// 0x0 becomes 0x00 and 0xF becomes 0xFF.
static inline u32 DecodeRGBA4444(u16 src)
{
    // Move each nibble into the low half of its destination byte.
    const u32 r = (src & 0x000F) << 0;
    const u32 g = (src & 0x00F0) << 4;
    const u32 b = (src & 0x0F00) << 8;
    const u32 a = (src & 0xF000) << 12;
    // One nibble per byte at this point: 0x0A0B0G0R.
    const u32 c = r | g | b | a;
    // Replicate every nibble into the upper half of its byte in one step.
    return c | (c << 4);
}
static inline u32 DecodeRGBA5551(u16 src)
@ -87,13 +89,10 @@ static inline u16 RGBA8888To5551(u32 value)
// Packs a 32-bit RGBA8888 color into a 16-bit RGBA4444 color.
//
// Only the upper nibble of each 8-bit channel is kept (truncation, no
// rounding). The result holds R in bits 0-3, G in bits 4-7, B in bits 8-11
// and A in bits 12-15.
static inline u16 RGBA8888To4444(u32 value)
{
    // A single shift puts every channel's upper nibble in the low half of its byte.
    const u32 c = value >> 4;
    // Slide each nibble down to its packed position and mask off the rest.
    const u16 r = (c >> 0) & 0x000F;
    const u16 g = (c >> 4) & 0x00F0;
    const u16 b = (c >> 8) & 0x0F00;
    const u16 a = (c >> 12) & 0xF000;
    return r | g | b | a;
}