/*  Pcsx2 - Pc Ps2 Emulator
 *  Copyright (C) 2002-2007  Pcsx2 Team
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
.intel_syntax

.extern _vifRegs
.extern _vifMaskRegs
.extern _vifRow

#ifdef __x86_64__
#define VIF_ESP %rsp
#define VIF_SRC %rsi
#define VIF_INC %rcx
#define VIF_DST %rdi
#define VIF_SIZE %edx
#define VIF_TMPADDR %rax
#define VIF_SAVEEBX %r8
#define VIF_SAVEEBXd %r8d
#else
#define VIF_ESP %esp
#define VIF_SRC %esi
#define VIF_INC %ecx
#define VIF_DST %edi
#define VIF_SIZE %edx
#define VIF_TMPADDR %eax
#define VIF_SAVEEBX %ebx
#define VIF_SAVEEBXd %ebx
#endif

#define XMM_R0 %xmm0
#define XMM_R1 %xmm1
#define XMM_R2 %xmm2
#define XMM_WRITEMASK %xmm3
#define XMM_ROWMASK %xmm4
#define XMM_ROWCOLMASK %xmm5
#define XMM_ROW %xmm6
#define XMM_COL %xmm7

#define XMM_R3 XMM_COL
// writing masks
#define UNPACK_Write0_Regular(r0, CL, DEST_OFFSET, MOVDQA) \
    MOVDQA qword ptr [VIF_DST+DEST_OFFSET], r0;

#define UNPACK_Write1_Regular(r0, CL, DEST_OFFSET, MOVDQA) \
    MOVDQA qword ptr [VIF_DST], r0; \
    add VIF_DST, VIF_INC; \

#define UNPACK_Write0_Mask UNPACK_Write0_Regular
#define UNPACK_Write1_Mask UNPACK_Write1_Regular

// masked write (dest needs to be in edi)
#define UNPACK_Write0_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \
    movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 48]; \
    pand r0, XMM_WRITEMASK; \
    pandn XMM_WRITEMASK, qword ptr [VIF_DST]; \
    por r0, XMM_WRITEMASK; \
    MOVDQA qword ptr [VIF_DST], r0; \
    add VIF_DST, 16; \

// masked write (dest needs to be in edi)
#define UNPACK_Write1_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \
    movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(0) + 48]; \
    pand r0, XMM_WRITEMASK; \
    pandn XMM_WRITEMASK, qword ptr [VIF_DST]; \
    por r0, XMM_WRITEMASK; \
    MOVDQA qword ptr [VIF_DST], r0; \
    add VIF_DST, VIF_INC; \

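// The WriteMask store above is a branchless select: bits set in the write
// mask come from the unpacked data, cleared bits keep the old destination
// contents. A minimal C sketch of the same idea (one 32-bit lane; the SSE
// code does four lanes at once):
//
//     typedef unsigned int u32;
//
//     // dst = (src & mask) | (dst & ~mask), i.e. pand/pandn/por
//     static u32 masked_store(u32 dst, u32 src, u32 mask)
//     {
//         return (src & mask) | (dst & ~mask);
//     }
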
#define UNPACK_Mask_SSE_0(r0) \
    pand r0, XMM_WRITEMASK; \
    por r0, XMM_ROWCOLMASK; \

// once a qword is uncompressed, applies masks and saves
// note: modifies XMM_WRITEMASK
// dest = row + write (only when mask=0), otherwise write
#define UNPACK_Mask_SSE_1(r0) \
    pand r0, XMM_WRITEMASK; \
    por r0, XMM_ROWCOLMASK; \
    pand XMM_WRITEMASK, XMM_ROW; \
    paddd r0, XMM_WRITEMASK; \

// dest = row + write (only when mask=0), otherwise write
// row = row + write (only when mask=0), otherwise row
#define UNPACK_Mask_SSE_2(r0) \
    pand r0, XMM_WRITEMASK; \
    pand XMM_WRITEMASK, XMM_ROW; \
    paddd XMM_ROW, r0; \
    por r0, XMM_ROWCOLMASK; \
    paddd r0, XMM_WRITEMASK; \

#define UNPACK_WriteMask_SSE_0 UNPACK_Mask_SSE_0
#define UNPACK_WriteMask_SSE_1 UNPACK_Mask_SSE_1
#define UNPACK_WriteMask_SSE_2 UNPACK_Mask_SSE_2

#define UNPACK_Regular_SSE_0(r0)

#define UNPACK_Regular_SSE_1(r0) \
    paddd r0, XMM_ROW; \

#define UNPACK_Regular_SSE_2(r0) \
    paddd r0, XMM_ROW; \
    movdqa XMM_ROW, r0; \

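// Per-lane model of the three unpack modes implemented above -- a C sketch,
// not the emulator's actual code. `data` is the unpacked value, `wmask` the
// write mask, `rowcol` the pre-merged row/col mask bits, `row` the VIF row
// accumulator:
//
//     typedef unsigned int u32;
//
//     static u32 apply_mode(u32 *row, u32 data, u32 wmask, u32 rowcol, int mode)
//     {
//         u32 d = data & wmask;              // pand r0, XMM_WRITEMASK
//         u32 v = d | rowcol;                // por  r0, XMM_ROWCOLMASK
//         if (mode >= 1) v += *row & wmask;  // paddd with the masked row
//         if (mode == 2) *row += d;          // mode 2 also accumulates the row
//         return v;
//     }
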
// setting up masks
#define UNPACK_Setup_Mask_SSE(CL) \
    mov VIF_TMPADDR, _vifMaskRegs; \
    movdqa XMM_ROWMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 16]; \
    movdqa XMM_ROWCOLMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 32]; \
    movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(CL)]; \
    pand XMM_ROWMASK, XMM_ROW; \
    pand XMM_ROWCOLMASK, XMM_COL; \
    por XMM_ROWCOLMASK, XMM_ROWMASK; \

#define UNPACK_Start_Setup_Mask_SSE_0(CL) UNPACK_Setup_Mask_SSE(CL)

#define UNPACK_Start_Setup_Mask_SSE_1(CL) \
    mov VIF_TMPADDR, _vifMaskRegs; \
    movdqa XMM_ROWMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 16]; \
    movdqa XMM_ROWCOLMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 32]; \
    pand XMM_ROWMASK, XMM_ROW; \
    pand XMM_ROWCOLMASK, XMM_COL; \
    por XMM_ROWCOLMASK, XMM_ROWMASK; \

#define UNPACK_Start_Setup_Mask_SSE_2(CL)

#define UNPACK_Setup_Mask_SSE_0_1(CL)
#define UNPACK_Setup_Mask_SSE_1_1(CL) \
    mov VIF_TMPADDR, _vifMaskRegs; \
    movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(0)]; \

// ignore CL, since vif.cycle.wl == 1
#define UNPACK_Setup_Mask_SSE_2_1(CL) \
    mov VIF_TMPADDR, _vifMaskRegs; \
    movdqa XMM_ROWMASK, qword ptr [VIF_TMPADDR + 64*(0) + 16]; \
    movdqa XMM_ROWCOLMASK, qword ptr [VIF_TMPADDR + 64*(0) + 32]; \
    movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(0)]; \
    pand XMM_ROWMASK, XMM_ROW; \
    pand XMM_ROWCOLMASK, XMM_COL; \
    por XMM_ROWCOLMASK, XMM_ROWMASK; \

#define UNPACK_Setup_Mask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_Mask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_Mask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL)

// write mask always destroys XMM_WRITEMASK, so 0_0 = 1_0
#define UNPACK_Setup_WriteMask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_WriteMask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_WriteMask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_WriteMask_SSE_0_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL)
#define UNPACK_Setup_WriteMask_SSE_1_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL)
#define UNPACK_Setup_WriteMask_SSE_2_1(CL) UNPACK_Setup_Mask_SSE_2_1(CL)

#define UNPACK_Start_Setup_WriteMask_SSE_0(CL) UNPACK_Start_Setup_Mask_SSE_1(CL)
#define UNPACK_Start_Setup_WriteMask_SSE_1(CL) UNPACK_Start_Setup_Mask_SSE_1(CL)
#define UNPACK_Start_Setup_WriteMask_SSE_2(CL) UNPACK_Start_Setup_Mask_SSE_2(CL)

#define UNPACK_Start_Setup_Regular_SSE_0(CL)
#define UNPACK_Start_Setup_Regular_SSE_1(CL)
#define UNPACK_Start_Setup_Regular_SSE_2(CL)
#define UNPACK_Setup_Regular_SSE_0_0(CL)
#define UNPACK_Setup_Regular_SSE_1_0(CL)
#define UNPACK_Setup_Regular_SSE_2_0(CL)
#define UNPACK_Setup_Regular_SSE_0_1(CL)
#define UNPACK_Setup_Regular_SSE_1_1(CL)
#define UNPACK_Setup_Regular_SSE_2_1(CL)

#define UNPACK_INC_DST_0_Regular(qw) add VIF_DST, (16*qw)
#define UNPACK_INC_DST_1_Regular(qw)
#define UNPACK_INC_DST_0_Mask(qw) add VIF_DST, (16*qw)
#define UNPACK_INC_DST_1_Mask(qw)
#define UNPACK_INC_DST_0_WriteMask(qw)
#define UNPACK_INC_DST_1_WriteMask(qw)

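// Layout assumed by the setup macros above: _vifMaskRegs is a table with one
// 64-byte entry per cycle slot CL, holding four 16-byte vectors at the
// offsets hard-coded in the movdqa loads. A C view of that layout (the field
// names are illustrative only):
//
//     typedef unsigned int u32;
//
//     struct VifMaskEntry {
//         u32 writemask[4];   // +0:  bits taken from the unpacked data
//         u32 rowmask[4];     // +16: bits taken from the row register
//         u32 colmask[4];     // +32: bits taken from the col register
//         u32 storemask[4];   // +48: select mask for the WriteMask store path
//     };
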
// unpacks for 1,2,3,4 elements (V3 uses this directly)
#define UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType) \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+0); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R0); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R0, CL, 0, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+1); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R1); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R1, CL+1, 16, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+2); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R2); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R2, CL+2, 32, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+3); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R3); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R3, CL+3, 48, movdqa); \
    \
    UNPACK_INC_DST_##TOTALCL##_##MaskType(4)

// V3 uses this directly
#define UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType) \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R0); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R0, CL, 0, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+1); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R1); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R1, CL+1, 16, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+2); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R2); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R2, CL+2, 32, movdqa); \
    \
    UNPACK_INC_DST_##TOTALCL##_##MaskType(3); \

#define UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType) \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R0); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R0, CL, 0, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+1); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R1); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R1, CL+1, 16, movdqa); \
    \
    UNPACK_INC_DST_##TOTALCL##_##MaskType(2); \

#define UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType) \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R0); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R0, CL, 0, movdqa); \
    \
    UNPACK_INC_DST_##TOTALCL##_##MaskType(1); \

// S-32
// only when cl==1
#define UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
    MOVDQA XMM_R3, qword ptr [VIF_SRC]; \
    \
    pshufd XMM_R0, XMM_R3, 0; \
    pshufd XMM_R1, XMM_R3, 0x55; \
    pshufd XMM_R2, XMM_R3, 0xaa; \
    pshufd XMM_R3, XMM_R3, 0xff; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_S_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_S_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu)

#define UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
    MOVDQA XMM_R2, qword ptr [VIF_SRC]; \
    \
    pshufd XMM_R0, XMM_R2, 0; \
    pshufd XMM_R1, XMM_R2, 0x55; \
    pshufd XMM_R2, XMM_R2, 0xaa; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_S_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_S_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu)

#define UNPACK_S_32SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R1, qword ptr [VIF_SRC]; \
    \
    pshufd XMM_R0, XMM_R1, 0; \
    pshufd XMM_R1, XMM_R1, 0x55; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_S_32SSE_2A UNPACK_S_32SSE_2

#define UNPACK_S_32SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    pshufd XMM_R0, XMM_R0, 0; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_S_32SSE_1A UNPACK_S_32SSE_1

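// The S formats read one scalar per output qword and broadcast it to all
// four components with pshufd. Same thing in C (a sketch of one output
// qword):
//
//     typedef unsigned int u32;
//
//     // one 32-bit scalar becomes a full x,y,z,w vector
//     static void unpack_s32(u32 out[4], u32 s)
//     {
//         out[0] = out[1] = out[2] = out[3] = s;   // pshufd reg, reg, 0
//     }
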
// S-16
#define UNPACK_S_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R3, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R3, XMM_R3; \
    UNPACK_RIGHTSHIFT XMM_R3, 16; \
    \
    pshufd XMM_R0, XMM_R3, 0; \
    pshufd XMM_R1, XMM_R3, 0x55; \
    pshufd XMM_R2, XMM_R3, 0xaa; \
    pshufd XMM_R3, XMM_R3, 0xff; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_S_16SSE_4A UNPACK_S_16SSE_4

#define UNPACK_S_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R2, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R2, XMM_R2; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    pshufd XMM_R0, XMM_R2, 0; \
    pshufd XMM_R1, XMM_R2, 0x55; \
    pshufd XMM_R2, XMM_R2, 0xaa; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    add VIF_SRC, 6; \

#define UNPACK_S_16SSE_3A UNPACK_S_16SSE_3

#define UNPACK_S_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R1, dword ptr [VIF_SRC]; \
    punpcklwd XMM_R1, XMM_R1; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    \
    pshufd XMM_R0, XMM_R1, 0; \
    pshufd XMM_R1, XMM_R1, 0x55; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_S_16SSE_2A UNPACK_S_16SSE_2

#define UNPACK_S_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    pshufd XMM_R0, XMM_R0, 0; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 2; \

#define UNPACK_S_16SSE_1A UNPACK_S_16SSE_1

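// The 16-bit formats all use the same widening idiom: punpcklwd duplicates
// each input word into both halves of a 32-bit lane, so the word ends up in
// the lane's top 16 bits, and UNPACK_RIGHTSHIFT (psrld or psrad, chosen by
// the instantiations at the bottom of this file) shifts it back down,
// giving a zero- or sign-extension for free. One lane in C:
//
//     typedef unsigned int u32;
//     typedef unsigned short u16;
//
//     static u32 widen_u16(u16 w) { u32 d = ((u32)w << 16) | w; return d >> 16; }            // psrld
//     static u32 widen_s16(u16 w) { u32 d = ((u32)w << 16) | w; return (u32)((int)d >> 16); } // psrad
//
// The 8-bit formats below do the same with punpcklbw + punpcklwd and a
// shift by 24.
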
// S-8
#define UNPACK_S_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R3, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R3, XMM_R3; \
    punpcklwd XMM_R3, XMM_R3; \
    UNPACK_RIGHTSHIFT XMM_R3, 24; \
    \
    pshufd XMM_R0, XMM_R3, 0; \
    pshufd XMM_R1, XMM_R3, 0x55; \
    pshufd XMM_R2, XMM_R3, 0xaa; \
    pshufd XMM_R3, XMM_R3, 0xff; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_S_8SSE_4A UNPACK_S_8SSE_4

#define UNPACK_S_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R2, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R2, XMM_R2; \
    punpcklwd XMM_R2, XMM_R2; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    pshufd XMM_R0, XMM_R2, 0; \
    pshufd XMM_R1, XMM_R2, 0x55; \
    pshufd XMM_R2, XMM_R2, 0xaa; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 3; \

#define UNPACK_S_8SSE_3A UNPACK_S_8SSE_3

#define UNPACK_S_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R1, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R1, XMM_R1; \
    punpcklwd XMM_R1, XMM_R1; \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    \
    pshufd XMM_R0, XMM_R1, 0; \
    pshufd XMM_R1, XMM_R1, 0x55; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 2; \

#define UNPACK_S_8SSE_2A UNPACK_S_8SSE_2

#define UNPACK_S_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    pshufd XMM_R0, XMM_R0, 0; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    inc VIF_SRC; \

#define UNPACK_S_8SSE_1A UNPACK_S_8SSE_1

// V2-32
#define UNPACK_V2_32SSE_4A(CL, TOTALCL, MaskType, ModeType) \
    MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
    MOVDQA XMM_R2, qword ptr [VIF_SRC+16]; \
    \
    pshufd XMM_R1, XMM_R0, 0xee; \
    pshufd XMM_R3, XMM_R2, 0xee; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 32; \

#define UNPACK_V2_32SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+8]; \
    movq XMM_R2, qword ptr [VIF_SRC+16]; \
    movq XMM_R3, qword ptr [VIF_SRC+24]; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 32; \

#define UNPACK_V2_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \
    MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R2, qword ptr [VIF_SRC+16]; \
    pshufd XMM_R1, XMM_R0, 0xee; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 24; \

#define UNPACK_V2_32SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+8]; \
    movq XMM_R2, qword ptr [VIF_SRC+16]; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 24; \

#define UNPACK_V2_32SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+8]; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V2_32SSE_2A UNPACK_V2_32SSE_2

#define UNPACK_V2_32SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V2_32SSE_1A UNPACK_V2_32SSE_1

// V2-16
// due to Lemmings, we have to copy the lower qword to the upper qword of every reg
#define UNPACK_V2_16SSE_4A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    punpckhwd XMM_R2, qword ptr [VIF_SRC]; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    punpckhqdq XMM_R3, XMM_R2; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpcklqdq XMM_R2, XMM_R2; \
    punpckhqdq XMM_R1, XMM_R1; \
    punpckhqdq XMM_R3, XMM_R3; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    add VIF_SRC, 16; \

#define UNPACK_V2_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpckhwd XMM_R2, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    punpckhqdq XMM_R3, XMM_R2; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpcklqdq XMM_R2, XMM_R2; \
    punpckhqdq XMM_R1, XMM_R1; \
    punpckhqdq XMM_R3, XMM_R3; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V2_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    punpckhwd XMM_R2, qword ptr [VIF_SRC]; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpcklqdq XMM_R2, XMM_R2; \
    punpckhqdq XMM_R1, XMM_R1; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V2_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpckhwd XMM_R2, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpcklqdq XMM_R2, XMM_R2; \
    punpckhqdq XMM_R1, XMM_R1; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V2_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpckhqdq XMM_R1, XMM_R1; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V2_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpckhqdq XMM_R1, XMM_R1; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V2_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, dword ptr [VIF_SRC]; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    punpcklqdq XMM_R0, XMM_R0; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_V2_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    punpcklqdq XMM_R0, XMM_R0; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

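// Why the lower qword gets copied into the upper qword: a V2 unpack only
// produces x and y, but duplicating the lower qword gives z and w defined
// values (z=x, w=y) instead of leftover garbage, which the games named in
// the comments above and below apparently rely on. Sketch of one output
// qword in C:
//
//     typedef unsigned int u32;
//
//     static void unpack_v2(u32 out[4], u32 x, u32 y)
//     {
//         out[0] = x; out[1] = y;
//         out[2] = x; out[3] = y;   // punpcklqdq reg, reg
//     }
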
// V2-8
// And 1 Streetball also needs the lower qword copied to the upper qword of every reg
#define UNPACK_V2_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    punpckhwd XMM_R2, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    punpckhqdq XMM_R3, XMM_R2; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpcklqdq XMM_R2, XMM_R2; \
    punpckhqdq XMM_R1, XMM_R1; \
    punpckhqdq XMM_R3, XMM_R3; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V2_8SSE_4A UNPACK_V2_8SSE_4

#define UNPACK_V2_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    punpckhwd XMM_R2, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpcklqdq XMM_R2, XMM_R2; \
    punpckhqdq XMM_R1, XMM_R1; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 6; \

#define UNPACK_V2_8SSE_3A UNPACK_V2_8SSE_3

#define UNPACK_V2_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    \
    punpckhqdq XMM_R1, XMM_R0; \
    \
    punpcklqdq XMM_R0, XMM_R0; \
    punpckhqdq XMM_R1, XMM_R1; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_V2_8SSE_2A UNPACK_V2_8SSE_2

#define UNPACK_V2_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    punpcklqdq XMM_R0, XMM_R0; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 2; \

#define UNPACK_V2_8SSE_1A UNPACK_V2_8SSE_1

// V3-32
// Midnight Club 2 crashes if we load a whole qword at +36, since that read can run past the end of the input buffer
#define UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
    MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R1, qword ptr [VIF_SRC+12]; \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+0); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R0); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R0, CL, 0, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+1); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R1); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R1, CL+1, 16, movdqa); \
    \
    MOVDQA XMM_R3, qword ptr [VIF_SRC+32]; \
    movdqu XMM_R2, qword ptr [VIF_SRC+24]; \
    psrldq XMM_R3, 4; \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+2); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R2); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R2, CL+2, 32, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+3); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R3); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R3, CL+3, 48, movdqa); \
    \
    UNPACK_INC_DST_##TOTALCL##_##MaskType(4); \
    \
    add VIF_SRC, 48; \

#define UNPACK_V3_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu)

#define UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
    MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R1, qword ptr [VIF_SRC+12]; \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R0); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R0, CL, 0, movdqa); \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+1); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R1); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R1, CL+1, 16, movdqa); \
    \
    movdqu XMM_R2, qword ptr [VIF_SRC+24]; \
    \
    UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL(CL+2); \
    UNPACK_##MaskType##_SSE_##ModeType(XMM_R2); \
    UNPACK_Write##TOTALCL##_##MaskType(XMM_R2, CL+2, 32, movdqa); \
    \
    UNPACK_INC_DST_##TOTALCL##_##MaskType(3); \
    \
    add VIF_SRC, 36; \

#define UNPACK_V3_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu)

#define UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
    MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R1, qword ptr [VIF_SRC+12]; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 24; \

#define UNPACK_V3_32SSE_2A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_2(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqu)

#define UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
    MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V3_32SSE_1A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_1(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqu)

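// The trick for the 4th vector above: the 4th V3 element occupies bytes
// 36..47 of the packet, but a 16-byte load at +36 could touch bytes past the
// 48-byte block (the Midnight Club 2 crash noted above). Loading aligned at
// +32 and shifting the vector right by 4 bytes reads only bytes 32..47.
// Equivalent C, with the packet viewed as 12 dwords:
//
//     typedef unsigned int u32;
//
//     static void load_last_v3(u32 out[4], const u32 src[12])
//     {
//         out[0] = src[9];   // x of the 4th element
//         out[1] = src[10];  // y
//         out[2] = src[11];  // z
//         out[3] = 0;        // psrldq shifts in zeros
//     }
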
// V3-16
#define UNPACK_V3_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+6]; \
    \
    punpcklwd XMM_R0, XMM_R0; \
    movq XMM_R2, qword ptr [VIF_SRC+12]; \
    punpcklwd XMM_R1, XMM_R1; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    movq XMM_R3, qword ptr [VIF_SRC+18]; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    punpcklwd XMM_R2, XMM_R2; \
    punpcklwd XMM_R3, XMM_R3; \
    \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    UNPACK_RIGHTSHIFT XMM_R3, 16; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 24; \

#define UNPACK_V3_16SSE_4A UNPACK_V3_16SSE_4

#define UNPACK_V3_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+6]; \
    \
    punpcklwd XMM_R0, XMM_R0; \
    movq XMM_R2, qword ptr [VIF_SRC+12]; \
    punpcklwd XMM_R1, XMM_R1; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 18; \

#define UNPACK_V3_16SSE_3A UNPACK_V3_16SSE_3

#define UNPACK_V3_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+6]; \
    \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R1, XMM_R1; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V3_16SSE_2A UNPACK_V3_16SSE_2

#define UNPACK_V3_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 6; \

#define UNPACK_V3_16SSE_1A UNPACK_V3_16SSE_1

// V3-8
#define UNPACK_V3_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R1, qword ptr [VIF_SRC]; \
    movq XMM_R3, qword ptr [VIF_SRC+6]; \
    \
    punpcklbw XMM_R1, XMM_R1; \
    punpcklbw XMM_R3, XMM_R3; \
    punpcklwd XMM_R0, XMM_R1; \
    psrldq XMM_R1, 6; \
    punpcklwd XMM_R2, XMM_R3; \
    psrldq XMM_R3, 6; \
    punpcklwd XMM_R1, XMM_R1; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    punpcklwd XMM_R3, XMM_R3; \
    \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R3, 24; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V3_8SSE_4A UNPACK_V3_8SSE_4

#define UNPACK_V3_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, word ptr [VIF_SRC]; \
    movd XMM_R1, dword ptr [VIF_SRC+3]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    movd XMM_R2, dword ptr [VIF_SRC+6]; \
    punpcklbw XMM_R1, XMM_R1; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklbw XMM_R2, XMM_R2; \
    \
    punpcklwd XMM_R1, XMM_R1; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 9; \

#define UNPACK_V3_8SSE_3A UNPACK_V3_8SSE_3

#define UNPACK_V3_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    movd XMM_R1, dword ptr [VIF_SRC+3]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklbw XMM_R1, XMM_R1; \
    \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R1, XMM_R1; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 6; \

#define UNPACK_V3_8SSE_2A UNPACK_V3_8SSE_2

#define UNPACK_V3_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 3; \

#define UNPACK_V3_8SSE_1A UNPACK_V3_8SSE_1

// V4-32
#define UNPACK_V4_32SSE_4A(CL, TOTALCL, MaskType, ModeType) \
    movdqa XMM_R0, qword ptr [VIF_SRC]; \
    movdqa XMM_R1, qword ptr [VIF_SRC+16]; \
    movdqa XMM_R2, qword ptr [VIF_SRC+32]; \
    movdqa XMM_R3, qword ptr [VIF_SRC+48]; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 64; \

#define UNPACK_V4_32SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R1, qword ptr [VIF_SRC+16]; \
    movdqu XMM_R2, qword ptr [VIF_SRC+32]; \
    movdqu XMM_R3, qword ptr [VIF_SRC+48]; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 64; \

#define UNPACK_V4_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \
    movdqa XMM_R0, qword ptr [VIF_SRC]; \
    movdqa XMM_R1, qword ptr [VIF_SRC+16]; \
    movdqa XMM_R2, qword ptr [VIF_SRC+32]; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 48; \

#define UNPACK_V4_32SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R1, qword ptr [VIF_SRC+16]; \
    movdqu XMM_R2, qword ptr [VIF_SRC+32]; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 48; \

#define UNPACK_V4_32SSE_2A(CL, TOTALCL, MaskType, ModeType) \
    movdqa XMM_R0, qword ptr [VIF_SRC]; \
    movdqa XMM_R1, qword ptr [VIF_SRC+16]; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 32; \

#define UNPACK_V4_32SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R1, qword ptr [VIF_SRC+16]; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 32; \

#define UNPACK_V4_32SSE_1A(CL, TOTALCL, MaskType, ModeType) \
    movdqa XMM_R0, qword ptr [VIF_SRC]; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V4_32SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

// V4-16
#define UNPACK_V4_16SSE_4A(CL, TOTALCL, MaskType, ModeType) \
    \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    punpckhwd XMM_R1, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R2, qword ptr [VIF_SRC+16]; \
    punpckhwd XMM_R3, qword ptr [VIF_SRC+16]; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    UNPACK_RIGHTSHIFT XMM_R3, 16; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 32; \

#define UNPACK_V4_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    movdqu XMM_R2, qword ptr [VIF_SRC+16]; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpckhwd XMM_R3, XMM_R2; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    UNPACK_RIGHTSHIFT XMM_R3, 16; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 32; \

#define UNPACK_V4_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    punpckhwd XMM_R1, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R2, qword ptr [VIF_SRC+16]; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 24; \

#define UNPACK_V4_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R2, qword ptr [VIF_SRC+16]; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    UNPACK_RIGHTSHIFT XMM_R2, 16; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 24; \

#define UNPACK_V4_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    punpckhwd XMM_R1, qword ptr [VIF_SRC]; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V4_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movq XMM_R1, qword ptr [VIF_SRC+8]; \
    \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R1, XMM_R1; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    UNPACK_RIGHTSHIFT XMM_R1, 16; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V4_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \
    punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V4_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 16; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

// V4-8
#define UNPACK_V4_8SSE_4A(CL, TOTALCL, MaskType, ModeType) \
    punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
    punpckhbw XMM_R2, qword ptr [VIF_SRC]; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpckhwd XMM_R3, XMM_R2; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R3, 24; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V4_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
    movdqu XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpckhbw XMM_R2, XMM_R0; \
    punpcklbw XMM_R0, XMM_R0; \
    \
    punpckhwd XMM_R3, XMM_R2; \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R3, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 16; \

#define UNPACK_V4_8SSE_3A(CL, TOTALCL, MaskType, ModeType) \
    punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
    punpckhbw XMM_R2, qword ptr [VIF_SRC]; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V4_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    movd XMM_R2, dword ptr [VIF_SRC+8]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklbw XMM_R2, XMM_R2; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    UNPACK_RIGHTSHIFT XMM_R2, 24; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 12; \

#define UNPACK_V4_8SSE_2A(CL, TOTALCL, MaskType, ModeType) \
    punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V4_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
    movq XMM_R0, qword ptr [VIF_SRC]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    UNPACK_RIGHTSHIFT XMM_R1, 24; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V4_8SSE_1A(CL, TOTALCL, MaskType, ModeType) \
    punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_V4_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
    movd XMM_R0, dword ptr [VIF_SRC]; \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    UNPACK_RIGHTSHIFT XMM_R0, 24; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

// V4-5
.extern s_TempDecompress

#define DECOMPRESS_RGBA(OFFSET) \
    mov %bl, %al; \
    shl %bl, 3; \
    mov byte ptr [s_TempDecompress+OFFSET], %bl; \
    \
    mov %bx, %ax; \
    shr %bx, 2; \
    and %bx, 0xf8; \
    mov byte ptr [s_TempDecompress+OFFSET+1], %bl; \
    \
    mov %bx, %ax; \
    shr %bx, 7; \
    and %bx, 0xf8; \
    mov byte ptr [s_TempDecompress+OFFSET+2], %bl; \
    mov %bx, %ax; \
    shr %bx, 8; \
    and %bx, 0x80; \
    mov byte ptr [s_TempDecompress+OFFSET+3], %bl; \

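// DECOMPRESS_RGBA expands one 16-bit RGB5A1 texel into four bytes, keeping
// each 5-bit channel in the top bits of its byte (value*8) and the 1-bit
// alpha in bit 7. Equivalent C:
//
//     typedef unsigned short u16;
//     typedef unsigned char u8;
//
//     static void decompress_rgba(u8 out[4], u16 c)
//     {
//         out[0] = (u8)(c << 3);           // R: bits 0-4
//         out[1] = (u8)((c >> 2) & 0xf8);  // G: bits 5-9
//         out[2] = (u8)((c >> 7) & 0xf8);  // B: bits 10-14
//         out[3] = (u8)((c >> 8) & 0x80);  // A: bit 15
//     }
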
#define UNPACK_V4_5SSE_4(CL, TOTALCL, MaskType, ModeType) \
    mov %eax, dword ptr [VIF_SRC]; \
    DECOMPRESS_RGBA(0); \
    \
    shr %eax, 16; \
    DECOMPRESS_RGBA(4); \
    \
    mov %eax, dword ptr [VIF_SRC+4]; \
    DECOMPRESS_RGBA(8); \
    \
    shr %eax, 16; \
    DECOMPRESS_RGBA(12); \
    \
    movdqa XMM_R0, qword ptr [s_TempDecompress]; \
    \
    punpckhbw XMM_R2, XMM_R0; \
    punpcklbw XMM_R0, XMM_R0; \
    \
    punpckhwd XMM_R3, XMM_R2; \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    psrld XMM_R0, 24; \
    psrld XMM_R1, 24; \
    psrld XMM_R2, 24; \
    psrld XMM_R3, 24; \
    \
    UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 8; \

#define UNPACK_V4_5SSE_4A UNPACK_V4_5SSE_4

#define UNPACK_V4_5SSE_3(CL, TOTALCL, MaskType, ModeType) \
    mov %eax, dword ptr [VIF_SRC]; \
    DECOMPRESS_RGBA(0); \
    \
    shr %eax, 16; \
    DECOMPRESS_RGBA(4); \
    \
    mov %eax, dword ptr [VIF_SRC+4]; /* the third texel lives at bytes 4-5 */ \
    DECOMPRESS_RGBA(8); \
    \
    movdqa XMM_R0, qword ptr [s_TempDecompress]; \
    \
    punpckhbw XMM_R2, XMM_R0; \
    punpcklbw XMM_R0, XMM_R0; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    punpcklwd XMM_R2, XMM_R2; \
    \
    psrld XMM_R0, 24; \
    psrld XMM_R1, 24; \
    psrld XMM_R2, 24; \
    \
    UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 6; \

#define UNPACK_V4_5SSE_3A UNPACK_V4_5SSE_3

#define UNPACK_V4_5SSE_2(CL, TOTALCL, MaskType, ModeType) \
    mov %eax, dword ptr [VIF_SRC]; \
    DECOMPRESS_RGBA(0); \
    \
    shr %eax, 16; \
    DECOMPRESS_RGBA(4); \
    \
    movq XMM_R0, qword ptr [s_TempDecompress]; \
    \
    punpcklbw XMM_R0, XMM_R0; \
    \
    punpckhwd XMM_R1, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    psrld XMM_R0, 24; \
    psrld XMM_R1, 24; \
    \
    UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 4; \

#define UNPACK_V4_5SSE_2A UNPACK_V4_5SSE_2

#define UNPACK_V4_5SSE_1(CL, TOTALCL, MaskType, ModeType) \
    mov %ax, word ptr [VIF_SRC]; \
    DECOMPRESS_RGBA(0); \
    \
    movd XMM_R0, dword ptr [s_TempDecompress]; \
    punpcklbw XMM_R0, XMM_R0; \
    punpcklwd XMM_R0, XMM_R0; \
    \
    psrld XMM_R0, 24; \
    \
    UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
    \
    add VIF_SRC, 2; \

#define UNPACK_V4_5SSE_1A UNPACK_V4_5SSE_1

#pragma warning(disable:4731)

#define SAVE_ROW_REG_BASE \
    mov VIF_TMPADDR, _vifRow; \
    movdqa qword ptr [VIF_TMPADDR], XMM_ROW; \
    mov VIF_TMPADDR, _vifRegs; \
    movss dword ptr [VIF_TMPADDR+0x100], XMM_ROW; \
    psrldq XMM_ROW, 4; \
    movss dword ptr [VIF_TMPADDR+0x110], XMM_ROW; \
    psrldq XMM_ROW, 4; \
    movss dword ptr [VIF_TMPADDR+0x120], XMM_ROW; \
    psrldq XMM_ROW, 4; \
    movss dword ptr [VIF_TMPADDR+0x130], XMM_ROW; \

#define SAVE_NO_REG

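// SAVE_ROW_REG_BASE writes the row accumulator back out twice: once as a
// whole vector to _vifRow, and once dword-by-dword into _vifRegs at the
// offsets hard-coded above (+0x100/+0x110/+0x120/+0x130). A C sketch of the
// same store, treating _vifRegs as a byte-addressed block:
//
//     typedef unsigned int u32;
//     typedef unsigned char u8;
//
//     static void save_row(u32 *vifRow, u8 *vifRegs, const u32 row[4])
//     {
//         for (int i = 0; i < 4; ++i) {
//             vifRow[i] = row[i];
//             *(u32 *)(vifRegs + 0x100 + 0x10 * i) = row[i];
//         }
//     }
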
#ifdef __x86_64__
#define INIT_ARGS()

#define POP_REGS()

#define INC_STACK(reg) add %rsp, 8;

#else

// 32 bit versions have the args on the stack
#define INIT_ARGS() \
    push %edi; \
    push %esi; \
    push %ebx; \
    mov VIF_DST, dword ptr [%esp+4+12]; \
    mov VIF_SRC, dword ptr [%esp+8+12]; \
    mov VIF_SIZE, dword ptr [%esp+12+12]; \

#define POP_REGS() \
    pop %ebx; \
    pop %esi; \
    pop %edi; \

#define INC_STACK(reg) add %esp, 4;

#endif

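// On 32-bit x86 (cdecl), INIT_ARGS pushes edi/esi/ebx and then pulls the
// three arguments from the stack; the +12 in each offset skips over the
// three saved registers. A C view of the frame as INIT_ARGS sees it:
//
//     typedef unsigned int u32;
//
//     struct Frame {
//         u32 saved[3];  // pushed ebx, esi, edi (ebx at the lowest address)
//         u32 ret;       // return address
//         u32 *dest;     // [esp+4+12]
//         u32 *data;     // [esp+8+12]
//         int dmasize;   // [esp+12+12]
//     };
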
// qsize - number of input (compressed) bytes that decompress into one 16-byte qword
// int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType(u32* dest, u32* data, int dmasize)

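// The routine generated below is, at the C level, roughly the following
// sketch (illustrative only; unpack_one_qword is a hypothetical stand-in for
// the UNPACK_##name##SSE_n macros). CL and WL are the vif cycle registers
// read from _vifRegs at +0x40 and +0x41: unpack WL qwords, skip ahead so CL
// qwords of destination are consumed per cycle, and repeat while at least
// qsize input bytes remain.
//
//     typedef unsigned int u32;
//
//     static int skipping_write(u32 *dest, const unsigned char *data,
//                               int dmasize, int cl, int wl, int qsize)
//     {
//         while (dmasize >= qsize) {
//             for (int i = 0; i < wl && dmasize >= qsize; ++i) {
//                 unpack_one_qword(dest + 4 * i, data);  // hypothetical helper
//                 data += qsize;
//                 dmasize -= qsize;
//             }
//             dest += 4 * cl;  // CL qwords per cycle: WL written, the rest skipped
//         }
//         return dmasize;      // leftover bytes, matching the asm's return value
//     }
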
#define defUNPACK_SkippingWrite(name, MaskType, ModeType, qsize, sign, SAVE_ROW_REG) \
.globl UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType; \
UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType: \
    INIT_ARGS(); \
    mov VIF_TMPADDR, _vifRegs; \
    movzx VIF_INC, byte ptr [VIF_TMPADDR + 0x40]; \
    movzx VIF_SAVEEBX, byte ptr [VIF_TMPADDR + 0x41]; \
    sub VIF_INC, VIF_SAVEEBX; \
    shl VIF_INC, 4; \
    \
    cmp VIF_SAVEEBXd, 1; \
    je name##_##sign##_##MaskType##_##ModeType##_WL1; \
    cmp VIF_SAVEEBXd, 2; \
    je name##_##sign##_##MaskType##_##ModeType##_WL2; \
    cmp VIF_SAVEEBXd, 3; \
    je name##_##sign##_##MaskType##_##ModeType##_WL3; \
    jmp name##_##sign##_##MaskType##_##ModeType##_WL4; \
    \
name##_##sign##_##MaskType##_##ModeType##_WL1: \
    UNPACK_Start_Setup_##MaskType##_SSE_##ModeType(0); \
    \
    cmp VIF_SIZE, qsize; \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
    \
    add VIF_INC, 16; \
    \
    /* first align VIF_SRC to 16 bytes */ \
name##_##sign##_##MaskType##_##ModeType##_C1_Align16: \
    \
    test VIF_SRC, 15; \
    jz name##_##sign##_##MaskType##_##ModeType##_C1_UnpackAligned; \
    \
    UNPACK_##name##SSE_1(0, 1, MaskType, ModeType); \
    \
    cmp VIF_SIZE, (2*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec; \
    sub VIF_SIZE, qsize; \
    jmp name##_##sign##_##MaskType##_##ModeType##_C1_Align16; \
    \
name##_##sign##_##MaskType##_##ModeType##_C1_UnpackAligned: \
    \
    cmp VIF_SIZE, (2*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1; \
    cmp VIF_SIZE, (3*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2; \
    cmp VIF_SIZE, (4*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack3; \
    prefetchnta [VIF_SRC + 64]; \
    \
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack4: \
    UNPACK_##name##SSE_4A(0, 1, MaskType, ModeType); \
    \
    cmp VIF_SIZE, (8*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_DoneUnpack4; \
    sub VIF_SIZE, (4*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C1_Unpack4; \
    \
name##_##sign##_##MaskType##_##ModeType##_C1_DoneUnpack4: \
    \
    sub VIF_SIZE, (4*qsize); \
    cmp VIF_SIZE, qsize; \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
    cmp VIF_SIZE, (2*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1; \
    cmp VIF_SIZE, (3*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2; \
    /* fall through */ \
    \
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack3: \
    UNPACK_##name##SSE_3A(0, 1, MaskType, ModeType); \
    \
    sub VIF_SIZE, (3*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
    \
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2: \
    UNPACK_##name##SSE_2A(0, 1, MaskType, ModeType); \
    \
    sub VIF_SIZE, (2*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
    \
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1: \
    UNPACK_##name##SSE_1A(0, 1, MaskType, ModeType); \
name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec: \
    sub VIF_SIZE, qsize; \
name##_##sign##_##MaskType##_##ModeType##_C1_Done3: \
    SAVE_ROW_REG; \
    mov %eax, VIF_SIZE; \
    POP_REGS(); \
    ret; \
    \
name##_##sign##_##MaskType##_##ModeType##_WL2: \
    cmp VIF_SIZE, (2*qsize); \
    \
    jl name##_##sign##_##MaskType##_##ModeType##_C2_Done3; \
name##_##sign##_##MaskType##_##ModeType##_C2_Unpack: \
    UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
    \
    add VIF_DST, VIF_INC; /* take into account wl */ \
    cmp VIF_SIZE, (4*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C2_Done2; \
    sub VIF_SIZE, (2*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C2_Unpack; /* unpack next */ \
    \
name##_##sign##_##MaskType##_##ModeType##_C2_Done2: \
    sub VIF_SIZE, (2*qsize); \
name##_##sign##_##MaskType##_##ModeType##_C2_Done3: \
    cmp VIF_SIZE, qsize; \
    /* execute left over qw */ \
    jl name##_##sign##_##MaskType##_##ModeType##_C2_Done4; \
    UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
    \
    sub VIF_SIZE, qsize; \
name##_##sign##_##MaskType##_##ModeType##_C2_Done4: \
    \
    SAVE_ROW_REG; \
    mov %eax, VIF_SIZE; \
    POP_REGS(); \
    ret; \
    \
name##_##sign##_##MaskType##_##ModeType##_WL3: \
    cmp VIF_SIZE, (3*qsize); \
    \
    jl name##_##sign##_##MaskType##_##ModeType##_C3_Done5; \
name##_##sign##_##MaskType##_##ModeType##_C3_Unpack: \
    UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \
    \
    add VIF_DST, VIF_INC; /* take into account wl */ \
    cmp VIF_SIZE, (6*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C3_Done2; \
    sub VIF_SIZE, (3*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C3_Unpack; /* unpack next */ \
name##_##sign##_##MaskType##_##ModeType##_C3_Done2: \
    sub VIF_SIZE, (3*qsize); \
name##_##sign##_##MaskType##_##ModeType##_C3_Done5: \
    cmp VIF_SIZE, qsize; \
    jl name##_##sign##_##MaskType##_##ModeType##_C3_Done4; \
    \
    /* execute left over qw */ \
    cmp VIF_SIZE, (2*qsize); \
    jl name##_##sign##_##MaskType##_##ModeType##_C3_Done3; \
    \
    /* process 2 qws */ \
    UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
    \
    sub VIF_SIZE, (2*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C3_Done4; \
name##_##sign##_##MaskType##_##ModeType##_C3_Done3: \
    /* process 1 qw */ \
    sub VIF_SIZE, qsize; \
    UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
name##_##sign##_##MaskType##_##ModeType##_C3_Done4: \
    SAVE_ROW_REG; \
    mov %eax, VIF_SIZE; \
    POP_REGS(); \
    ret; \
    \
name##_##sign##_##MaskType##_##ModeType##_WL4: /* >= 4 */ \
    sub VIF_SAVEEBX, 3; \
    push VIF_INC; \
    cmp VIF_SIZE, qsize; \
    jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
    \
name##_##sign##_##MaskType##_##ModeType##_C4_Unpack: \
    cmp VIF_SIZE, (3*qsize); \
    jge name##_##sign##_##MaskType##_##ModeType##_C4_Unpack3; \
    cmp VIF_SIZE, (2*qsize); \
    jge name##_##sign##_##MaskType##_##ModeType##_C4_Unpack2; \
    \
    UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
    \
    /* not enough data left */ \
    sub VIF_SIZE, qsize; \
    jmp name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
name##_##sign##_##MaskType##_##ModeType##_C4_Unpack2: \
    UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
    \
    /* not enough data left */ \
    sub VIF_SIZE, (2*qsize); \
    jmp name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
name##_##sign##_##MaskType##_##ModeType##_C4_Unpack3: \
    UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \
    \
    sub VIF_SIZE, (3*qsize); \
    /* more data left, process 1 qw at a time */ \
    mov VIF_INC, VIF_SAVEEBX; \
    \
name##_##sign##_##MaskType##_##ModeType##_C4_UnpackX: \
    /* check if any data left */ \
    cmp VIF_SIZE, qsize; \
    jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
    \
    UNPACK_##name##SSE_1(3, 0, MaskType, ModeType); \
    \
    sub VIF_SIZE, qsize; \
    cmp VIF_INC, 1; \
    je name##_##sign##_##MaskType##_##ModeType##_C4_DoneLoop; \
    sub VIF_INC, 1; \
    jmp name##_##sign##_##MaskType##_##ModeType##_C4_UnpackX; \
name##_##sign##_##MaskType##_##ModeType##_C4_DoneLoop: \
    add VIF_DST, [VIF_ESP]; /* take into account wl */ \
    cmp VIF_SIZE, qsize; \
    jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
    jmp name##_##sign##_##MaskType##_##ModeType##_C4_Unpack; /* unpack next */ \
name##_##sign##_##MaskType##_##ModeType##_C4_Done: \
    \
    SAVE_ROW_REG; \
    INC_STACK(); \
    mov %eax, VIF_SIZE; \
    POP_REGS(); \
    ret; \

#define UNPACK_RIGHTSHIFT psrld
#define defUNPACK_SkippingWrite2(name, qsize) \
    defUNPACK_SkippingWrite(name, Regular, 0, qsize, u, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Regular, 1, qsize, u, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Regular, 2, qsize, u, SAVE_ROW_REG_BASE) \
    defUNPACK_SkippingWrite(name, Mask, 0, qsize, u, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Mask, 1, qsize, u, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Mask, 2, qsize, u, SAVE_ROW_REG_BASE) \
    defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, u, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, u, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, u, SAVE_ROW_REG_BASE) \

defUNPACK_SkippingWrite2(S_32, 4)
defUNPACK_SkippingWrite2(S_16, 2)
defUNPACK_SkippingWrite2(S_8, 1)
defUNPACK_SkippingWrite2(V2_32, 8)
defUNPACK_SkippingWrite2(V2_16, 4)
defUNPACK_SkippingWrite2(V2_8, 2)
defUNPACK_SkippingWrite2(V3_32, 12)
defUNPACK_SkippingWrite2(V3_16, 6)
defUNPACK_SkippingWrite2(V3_8, 3)
defUNPACK_SkippingWrite2(V4_32, 16)
defUNPACK_SkippingWrite2(V4_16, 8)
defUNPACK_SkippingWrite2(V4_8, 4)
defUNPACK_SkippingWrite2(V4_5, 2)

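// Two instantiation passes: the block above generates the unsigned ("u")
// unpackers with UNPACK_RIGHTSHIFT = psrld (logical shift, zero-extend); the
// block below regenerates the formats that have signed variants with psrad
// (arithmetic shift, sign-extend). The whole difference, per 32-bit lane,
// in C:
//
//     typedef unsigned int u32;
//
//     static u32 extend_u(u32 lane) { return lane >> 16; }              // psrld
//     static u32 extend_s(u32 lane) { return (u32)((int)lane >> 16); }  // psrad
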
#undef UNPACK_RIGHTSHIFT
#undef defUNPACK_SkippingWrite2

#define UNPACK_RIGHTSHIFT psrad
#define defUNPACK_SkippingWrite2(name, qsize) \
    defUNPACK_SkippingWrite(name, Mask, 0, qsize, s, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Regular, 0, qsize, s, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Regular, 1, qsize, s, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Regular, 2, qsize, s, SAVE_ROW_REG_BASE) \
    defUNPACK_SkippingWrite(name, Mask, 1, qsize, s, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, Mask, 2, qsize, s, SAVE_ROW_REG_BASE) \
    defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, s, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, s, SAVE_NO_REG) \
    defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, s, SAVE_ROW_REG_BASE) \

defUNPACK_SkippingWrite2(S_16, 2)
defUNPACK_SkippingWrite2(S_8, 1)
defUNPACK_SkippingWrite2(V2_16, 4)
defUNPACK_SkippingWrite2(V2_8, 2)
defUNPACK_SkippingWrite2(V3_16, 6)
defUNPACK_SkippingWrite2(V3_8, 3)
defUNPACK_SkippingWrite2(V4_16, 8)
defUNPACK_SkippingWrite2(V4_8, 4)

#undef UNPACK_RIGHTSHIFT
#undef defUNPACK_SkippingWrite2