mirror of
https://github.com/hrydgard/ppsspp.git
synced 2025-04-02 11:01:50 -04:00
Optimize some 4444/8888 color conversions.
Small performance boost in softgpu.
This commit is contained in:
parent
6de2129f98
commit
38d0bac1df
2 changed files with 49 additions and 42 deletions
|
@ -565,11 +565,23 @@ void VertexDecoderJitCache::Jit_Color8888() {
|
|||
}
|
||||
|
||||
// Four copies of 0x0f0f0f0f: a mask that keeps the low nibble of every byte.
// NOTE(review): MEMORY_ALIGNED16 presumably forces 16-byte alignment so the table can be an SSE memory operand - confirm.
// NOTE(review): no use of this table is visible in this excerpt - confirm it is still referenced.
static const u32 MEMORY_ALIGNED16(nibbles[4]) = { 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, };
|
||||
|
||||
// Four copies of 0xf00ff00f. The disabled (#if 0) SSE path of Jit_Color4444 applies this with PAND
// right after PUNPCKLBW has doubled every source byte. Within each 16-bit lane the mask keeps the
// low nibble of the low byte and the high nibble of the high byte.
// NOTE(review): MEMORY_ALIGNED16 presumably forces 16-byte alignment for use as an SSE memory operand - confirm.
static const u32 MEMORY_ALIGNED16(color4444mask[4]) = { 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, 0xf00ff00f, };
|
||||
|
||||
void VertexDecoderJitCache::Jit_Color4444() {
|
||||
// Needs benchmarking. A bit wasteful by only using 1 SSE lane.
|
||||
#if 0
|
||||
MOVD_xmm(fpScratchReg, MDisp(srcReg, dec_->coloff));
|
||||
PUNPCKLBW(fpScratchReg, R(fpScratchReg));
|
||||
PAND(fpScratchReg, M(color4444mask));
|
||||
MOVSS(fpScratchReg2, R(fpScratchReg));
|
||||
MOVSS(fpScratchReg3, R(fpScratchReg));
|
||||
PSRLW(fpScratchReg2, 4);
|
||||
PSLLW(fpScratchReg3, 4);
|
||||
POR(fpScratchReg, R(fpScratchReg2));
|
||||
POR(fpScratchReg, R(fpScratchReg3));
|
||||
MOVD_xmm(MDisp(dstReg, dec_->decFmt.c0off), fpScratchReg);
|
||||
return;
|
||||
#elif 0
|
||||
// Alternate approach
|
||||
MOVD_xmm(XMM3, MDisp(srcReg, dec_->coloff));
|
||||
MOVAPS(XMM2, R(XMM3));
|
||||
|
@ -592,36 +604,32 @@ void VertexDecoderJitCache::Jit_Color4444() {
|
|||
|
||||
MOVZX(32, 16, tempReg1, MDisp(srcReg, dec_->coloff));
|
||||
|
||||
// 0000ABGR, copy R and double forwards.
|
||||
// Pick out A and B, and space them out by a nibble.
|
||||
MOV(32, R(tempReg2), R(tempReg1));
|
||||
MOV(32, R(tempReg3), R(tempReg1));
|
||||
AND(32, R(tempReg3), Imm32(0x0000000F));
|
||||
MOV(32, R(tempReg2), R(tempReg3));
|
||||
AND(32, R(tempReg2), Imm32(0x0000F000));
|
||||
AND(32, R(tempReg3), Imm32(0x00000F00));
|
||||
SHL(32, R(tempReg2), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
// Now grab R and G.
|
||||
MOV(32, R(tempReg3), R(tempReg1));
|
||||
AND(32, R(tempReg1), Imm32(0x0000000F));
|
||||
AND(32, R(tempReg3), Imm32(0x000000F0));
|
||||
|
||||
// Currently: 000A0B00, so let's shift once so G is spaced out.
|
||||
SHL(32, R(tempReg2), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
// Now: 00A0B0G0, so shift it once more to add R at the bottom.
|
||||
SHL(32, R(tempReg2), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg1));
|
||||
|
||||
// Now we just need to duplicate the nibbles.
|
||||
MOV(32, R(tempReg3), R(tempReg2));
|
||||
SHL(32, R(tempReg3), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
// tempReg1 -> 00ABGR00, then double G backwards.
|
||||
SHL(32, R(tempReg1), Imm8(8));
|
||||
MOV(32, R(tempReg3), R(tempReg1));
|
||||
AND(32, R(tempReg3), Imm32(0x0000F000));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
SHR(32, R(tempReg3), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
// Now do B forwards again (still 00ABGR00.)
|
||||
MOV(32, R(tempReg3), R(tempReg1));
|
||||
AND(32, R(tempReg3), Imm32(0x000F0000));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
SHL(32, R(tempReg3), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
// tempReg1 -> ABGR0000, then double A backwards.
|
||||
SHL(32, R(tempReg1), Imm8(8));
|
||||
MOV(32, R(tempReg3), R(tempReg1));
|
||||
AND(32, R(tempReg3), Imm32(0xF0000000));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
SHR(32, R(tempReg3), Imm8(4));
|
||||
OR(32, R(tempReg2), R(tempReg3));
|
||||
|
||||
MOV(32, MDisp(dstReg, dec_->decFmt.c0off), R(tempReg2));
|
||||
}
|
||||
|
||||
|
|
|
@ -17,15 +17,17 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "CommonTypes.h"
|
||||
#include "Common/CommonTypes.h"
|
||||
|
||||
// Expands a 16-bit RGBA4444 color into a 32-bit RGBA8888 color.
//
// Input layout:  bits 0-3 = R, 4-7 = G, 8-11 = B, 12-15 = A.
// Output layout: bits 0-7 = R, 8-15 = G, 16-23 = B, 24-31 = A.
//
// Each 4-bit channel n becomes the 8-bit value (n << 4) | n, so 0x0 maps to 0x00
// and 0xF maps to 0xFF. All four channels are widened together instead of one at a time.
static inline u32 DecodeRGBA4444(u16 src)
{
	// Move every nibble into the low half of its own byte: 0x0A0B0G0R.
	const u32 r = (src & 0x000F) << 0;
	const u32 g = (src & 0x00F0) << 4;
	const u32 b = (src & 0x0F00) << 8;
	const u32 a = (src & 0xF000) << 12;

	const u32 c = r | g | b | a;

	// Duplicate each nibble into the upper half of its byte: 0xAABBGGRR.
	return c | (c << 4);
}
|
||||
|
||||
static inline u32 DecodeRGBA5551(u16 src)
|
||||
|
@ -87,13 +89,10 @@ static inline u16 RGBA8888To5551(u32 value)
|
|||
|
||||
// Packs a 32-bit RGBA8888 color into a 16-bit RGBA4444 color.
//
// Input layout:  bits 0-7 = R, 8-15 = G, 16-23 = B, 24-31 = A.
// Output layout: bits 0-3 = R, 4-7 = G, 8-11 = B, 12-15 = A.
//
// Only the upper nibble of each 8-bit channel is kept; the lower nibble is dropped
// without rounding.
static inline u16 RGBA8888To4444(u32 value)
{
	// One shift drops the low nibble of every channel at once; the upper nibbles
	// now sit at bits 0-3, 8-11, 16-19 and 24-27.
	const u32 c = value >> 4;

	// Slide each surviving nibble down into its packed position and mask it out.
	const u16 r = (c >> 0) & 0x000F;
	const u16 g = (c >> 4) & 0x00F0;
	const u16 b = (c >> 8) & 0x0F00;
	const u16 a = (c >> 12) & 0xF000;

	return r | g | b | a;
}
|
||||
|
|
Loading…
Add table
Reference in a new issue