// pcsx2/x86/aVif.S
/* Pcsx2 - Pc Ps2 Emulator
Copyright (C) 2002-2007 Pcsx2 Team
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
.intel_syntax
.extern _vifRegs
.extern _vifMaskRegs
.extern _vifRow
#ifdef __x86_64__
#define VIF_ESP %rsp
#define VIF_SRC %rsi
#define VIF_INC %rcx
#define VIF_DST %rdi
#define VIF_SIZE %edx
#define VIF_TMPADDR %rax
#define VIF_SAVEEBX %r8
#define VIF_SAVEEBXd %r8d
#else
#define VIF_ESP %esp
#define VIF_SRC %esi
#define VIF_INC %ecx
#define VIF_DST %edi
#define VIF_SIZE %edx
#define VIF_TMPADDR %eax
#define VIF_SAVEEBX %ebx
#define VIF_SAVEEBXd %ebx
#endif
#define XMM_R0 %xmm0
#define XMM_R1 %xmm1
#define XMM_R2 %xmm2
#define XMM_WRITEMASK %xmm3
#define XMM_ROWMASK %xmm4
#define XMM_ROWCOLMASK %xmm5
#define XMM_ROW %xmm6
#define XMM_COL %xmm7
#define XMM_R3 XMM_COL
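// Register roles used throughout this file: VIF_SRC walks the packed source
// data, VIF_DST walks the destination qwords, VIF_SIZE tracks the remaining
// byte count, VIF_INC holds the skipping-write stride, and XMM_R0-XMM_R3
// carry up to four unpacked qwords per pass. Note XMM_R3 aliases XMM_COL,
// so a full 4-qword pass clobbers the column register.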
// writing masks
#define UNPACK_Write0_Regular(r0, CL, DEST_OFFSET, MOVDQA) \
MOVDQA qword ptr [VIF_DST+DEST_OFFSET], r0;
#define UNPACK_Write1_Regular(r0, CL, DEST_OFFSET, MOVDQA) \
MOVDQA qword ptr [VIF_DST], r0; \
add VIF_DST, VIF_INC;
#define UNPACK_Write0_Mask UNPACK_Write0_Regular
#define UNPACK_Write1_Mask UNPACK_Write1_Regular
// masked write (destination pointer must be in VIF_DST, i.e. edi/rdi)
#define UNPACK_Write0_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \
movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 48]; \
pand r0, XMM_WRITEMASK; \
pandn XMM_WRITEMASK, qword ptr [VIF_DST]; \
por r0, XMM_WRITEMASK; \
MOVDQA qword ptr [VIF_DST], r0; \
add VIF_DST, 16;
// masked write (destination pointer must be in VIF_DST, i.e. edi/rdi)
#define UNPACK_Write1_WriteMask(r0, CL, DEST_OFFSET, MOVDQA) \
movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(0) + 48]; \
pand r0, XMM_WRITEMASK; \
pandn XMM_WRITEMASK, qword ptr [VIF_DST]; \
por r0, XMM_WRITEMASK; \
MOVDQA qword ptr [VIF_DST], r0; \
add VIF_DST, VIF_INC;
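// The pand/pandn/por triple above is the standard SSE2 bitwise blend; in C
// terms (a sketch, with `mask` being the 16 bytes at _vifMaskRegs + 64*CL + 48):
//   dest = (src & mask) | (dest & ~mask);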
#define UNPACK_Mask_SSE_0(r0) \
pand r0, XMM_WRITEMASK; \
por r0, XMM_ROWCOLMASK;
// once a qword is uncompressed, these apply the masks and save the result
// note: modifies XMM_WRITEMASK
// dest = row + write (only when mask=0), otherwise write
#define UNPACK_Mask_SSE_1(r0) \
pand r0, XMM_WRITEMASK; \
por r0, XMM_ROWCOLMASK; \
pand XMM_WRITEMASK, XMM_ROW; \
paddd r0, XMM_WRITEMASK;
// dest = row + write (only when mask=0), otherwise write
// row = row + write (only when mask = 0), otherwise row
#define UNPACK_Mask_SSE_2(r0) \
pand r0, XMM_WRITEMASK; \
pand XMM_WRITEMASK, XMM_ROW; \
paddd XMM_ROW, r0; \
por r0, XMM_ROWCOLMASK; \
paddd r0, XMM_WRITEMASK;
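// Rough C-level semantics of the three modes above (a sketch; `write` is the
// unpacked input, `mask` and `rowcol` come from the per-CL tables prepared by
// UNPACK_Setup_Mask_SSE below):
//   mode 0: dest = (write & mask) | rowcol;
//   mode 1: dest = ((write & mask) | rowcol) + (row & mask);
//   mode 2: as mode 1 (row sampled before the update), plus row += write & mask;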
#define UNPACK_WriteMask_SSE_0 UNPACK_Mask_SSE_0
#define UNPACK_WriteMask_SSE_1 UNPACK_Mask_SSE_1
#define UNPACK_WriteMask_SSE_2 UNPACK_Mask_SSE_2
#define UNPACK_Regular_SSE_0(r0)
#define UNPACK_Regular_SSE_1(r0) \
paddd r0, XMM_ROW;
#define UNPACK_Regular_SSE_2(r0) \
paddd r0, XMM_ROW; \
movdqa XMM_ROW, r0;
// setting up masks
#define UNPACK_Setup_Mask_SSE(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
movdqa XMM_ROWMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 16]; \
movdqa XMM_ROWCOLMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 32]; \
movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(CL)]; \
pand XMM_ROWMASK, XMM_ROW; \
pand XMM_ROWCOLMASK, XMM_COL; \
por XMM_ROWCOLMASK, XMM_ROWMASK;
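// Assumed layout of each 64-byte _vifMaskRegs entry, inferred from the
// offsets used here: +0 = write mask, +16 = row mask, +32 = col mask,
// +48 = write-protect mask used by the UNPACK_Write*_WriteMask variants.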
#define UNPACK_Start_Setup_Mask_SSE_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Start_Setup_Mask_SSE_1(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
movdqa XMM_ROWMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 16]; \
movdqa XMM_ROWCOLMASK, qword ptr [VIF_TMPADDR + 64*(CL) + 32]; \
pand XMM_ROWMASK, XMM_ROW; \
pand XMM_ROWCOLMASK, XMM_COL; \
por XMM_ROWCOLMASK, XMM_ROWMASK;
#define UNPACK_Start_Setup_Mask_SSE_2(CL)
#define UNPACK_Setup_Mask_SSE_0_1(CL)
#define UNPACK_Setup_Mask_SSE_1_1(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(0)];
// ignore CL, since vif.cycle.wl == 1
#define UNPACK_Setup_Mask_SSE_2_1(CL) \
mov VIF_TMPADDR, _vifMaskRegs; \
movdqa XMM_ROWMASK, qword ptr [VIF_TMPADDR + 64*(0) + 16]; \
movdqa XMM_ROWCOLMASK, qword ptr [VIF_TMPADDR + 64*(0) + 32]; \
movdqa XMM_WRITEMASK, qword ptr [VIF_TMPADDR + 64*(0)]; \
pand XMM_ROWMASK, XMM_ROW; \
pand XMM_ROWCOLMASK, XMM_COL; \
por XMM_ROWCOLMASK, XMM_ROWMASK;
#define UNPACK_Setup_Mask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_Mask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_Mask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL)
// write mask always destroys XMM_WRITEMASK, so 0_0 = 1_0
#define UNPACK_Setup_WriteMask_SSE_0_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_WriteMask_SSE_1_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_WriteMask_SSE_2_0(CL) UNPACK_Setup_Mask_SSE(CL)
#define UNPACK_Setup_WriteMask_SSE_0_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL)
#define UNPACK_Setup_WriteMask_SSE_1_1(CL) UNPACK_Setup_Mask_SSE_1_1(CL)
#define UNPACK_Setup_WriteMask_SSE_2_1(CL) UNPACK_Setup_Mask_SSE_2_1(CL)
#define UNPACK_Start_Setup_WriteMask_SSE_0(CL) UNPACK_Start_Setup_Mask_SSE_1(CL)
#define UNPACK_Start_Setup_WriteMask_SSE_1(CL) UNPACK_Start_Setup_Mask_SSE_1(CL)
#define UNPACK_Start_Setup_WriteMask_SSE_2(CL) UNPACK_Start_Setup_Mask_SSE_2(CL)
#define UNPACK_Start_Setup_Regular_SSE_0(CL)
#define UNPACK_Start_Setup_Regular_SSE_1(CL)
#define UNPACK_Start_Setup_Regular_SSE_2(CL)
#define UNPACK_Setup_Regular_SSE_0_0(CL)
#define UNPACK_Setup_Regular_SSE_1_0(CL)
#define UNPACK_Setup_Regular_SSE_2_0(CL)
#define UNPACK_Setup_Regular_SSE_0_1(CL)
#define UNPACK_Setup_Regular_SSE_1_1(CL)
#define UNPACK_Setup_Regular_SSE_2_1(CL)
#define UNPACK_INC_DST_0_Regular(qw) add VIF_DST, (16*qw)
#define UNPACK_INC_DST_1_Regular(qw)
#define UNPACK_INC_DST_0_Mask(qw) add VIF_DST, (16*qw)
#define UNPACK_INC_DST_1_Mask(qw)
#define UNPACK_INC_DST_0_WriteMask(qw)
#define UNPACK_INC_DST_1_WriteMask(qw)
// unpack helpers that apply masks and write 4/3/2/1 qwords at a time (V3 uses these directly)
#define UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType) \
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+0); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+3); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R3); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R3, CL+3, 48, movdqa); \
\
UNPACK_INC_DST_##TOTALCL##_##MaskType##(4)
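// Token pasting selects the specialization at expansion time; for example
// (a sketch), UNPACK4_SSE(0, 0, Mask, 1) turns its first step into:
//   UNPACK_Setup_Mask_SSE_1_0(0); UNPACK_Mask_SSE_1(XMM_R0);
//   UNPACK_Write0_Mask(XMM_R0, 0, 0, movdqa);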
// V3 uses this directly
#define UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType) \
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \
\
UNPACK_INC_DST_##TOTALCL##_##MaskType##(3);
#define UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType) \
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \
\
UNPACK_INC_DST_##TOTALCL##_##MaskType##(2);
#define UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType) \
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \
\
UNPACK_INC_DST_##TOTALCL##_##MaskType##(1);
// S-32
// only when cl==1
#define UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
MOVDQA XMM_R3, qword ptr [VIF_SRC]; \
\
pshufd XMM_R0, XMM_R3, 0; \
pshufd XMM_R1, XMM_R3, 0x55; \
pshufd XMM_R2, XMM_R3, 0xaa; \
pshufd XMM_R3, XMM_R3, 0xff; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_S_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_S_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu)
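// The pshufd immediates broadcast one source dword to all four lanes:
// 0x00, 0x55, 0xaa and 0xff replicate dword 0, 1, 2 and 3 respectively,
// splatting each scalar across x/y/z/w of its output qword.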
#define UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
MOVDQA XMM_R2, qword ptr [VIF_SRC]; \
\
pshufd XMM_R0, XMM_R2, 0; \
pshufd XMM_R1, XMM_R2, 0x55; \
pshufd XMM_R2, XMM_R2, 0xaa; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_S_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_S_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_S_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu)
#define UNPACK_S_32SSE_2(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R1, qword ptr [VIF_SRC]; \
\
pshufd XMM_R0, XMM_R1, 0; \
pshufd XMM_R1, XMM_R1, 0x55; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_S_32SSE_2A UNPACK_S_32SSE_2
#define UNPACK_S_32SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
pshufd XMM_R0, XMM_R0, 0; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_S_32SSE_1A UNPACK_S_32SSE_1
// S-16
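// 16->32 widening idiom used below: punpcklwd r,r duplicates each 16-bit
// value into both halves of a 32-bit lane, and UNPACK_RIGHTSHIFT (defined
// near the bottom as psrld for unsigned, psrad for signed unpacks) shifts
// by 16, leaving zero- or sign-extended values. The 8-bit variants add a
// punpcklbw first and shift by 24 instead.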
#define UNPACK_S_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R3, qword ptr [VIF_SRC]; \
punpcklwd XMM_R3, XMM_R3; \
UNPACK_RIGHTSHIFT XMM_R3, 16; \
\
pshufd XMM_R0, XMM_R3, 0; \
pshufd XMM_R1, XMM_R3, 0x55; \
pshufd XMM_R2, XMM_R3, 0xaa; \
pshufd XMM_R3, XMM_R3, 0xff; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_S_16SSE_4A UNPACK_S_16SSE_4
#define UNPACK_S_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R2, qword ptr [VIF_SRC]; \
punpcklwd XMM_R2, XMM_R2; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
pshufd XMM_R0, XMM_R2, 0; \
pshufd XMM_R1, XMM_R2, 0x55; \
pshufd XMM_R2, XMM_R2, 0xaa; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
add VIF_SRC, 6;
#define UNPACK_S_16SSE_3A UNPACK_S_16SSE_3
#define UNPACK_S_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R1, dword ptr [VIF_SRC]; \
punpcklwd XMM_R1, XMM_R1; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
\
pshufd XMM_R0, XMM_R1, 0; \
pshufd XMM_R1, XMM_R1, 0x55; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_S_16SSE_2A UNPACK_S_16SSE_2
#define UNPACK_S_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
pshufd XMM_R0, XMM_R0, 0; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 2;
#define UNPACK_S_16SSE_1A UNPACK_S_16SSE_1
// S-8
#define UNPACK_S_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R3, dword ptr [VIF_SRC]; \
punpcklbw XMM_R3, XMM_R3; \
punpcklwd XMM_R3, XMM_R3; \
UNPACK_RIGHTSHIFT XMM_R3, 24; \
\
pshufd XMM_R0, XMM_R3, 0; \
pshufd XMM_R1, XMM_R3, 0x55; \
pshufd XMM_R2, XMM_R3, 0xaa; \
pshufd XMM_R3, XMM_R3, 0xff; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_S_8SSE_4A UNPACK_S_8SSE_4
#define UNPACK_S_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R2, dword ptr [VIF_SRC]; \
punpcklbw XMM_R2, XMM_R2; \
punpcklwd XMM_R2, XMM_R2; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
pshufd XMM_R0, XMM_R2, 0; \
pshufd XMM_R1, XMM_R2, 0x55; \
pshufd XMM_R2, XMM_R2, 0xaa; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 3;
#define UNPACK_S_8SSE_3A UNPACK_S_8SSE_3
#define UNPACK_S_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R1, dword ptr [VIF_SRC]; \
punpcklbw XMM_R1, XMM_R1; \
punpcklwd XMM_R1, XMM_R1; \
UNPACK_RIGHTSHIFT XMM_R1, 24; \
\
pshufd XMM_R0, XMM_R1, 0; \
pshufd XMM_R1, XMM_R1, 0x55; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 2;
#define UNPACK_S_8SSE_2A UNPACK_S_8SSE_2
#define UNPACK_S_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklbw XMM_R0, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
pshufd XMM_R0, XMM_R0, 0; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
inc VIF_SRC;
#define UNPACK_S_8SSE_1A UNPACK_S_8SSE_1
// V2-32
#define UNPACK_V2_32SSE_4A(CL, TOTALCL, MaskType, ModeType) \
MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
MOVDQA XMM_R2, qword ptr [VIF_SRC+16]; \
\
pshufd XMM_R1, XMM_R0, 0xee; \
pshufd XMM_R3, XMM_R2, 0xee; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 32;
#define UNPACK_V2_32SSE_4(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+8]; \
movq XMM_R2, qword ptr [VIF_SRC+16]; \
movq XMM_R3, qword ptr [VIF_SRC+24]; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 32;
#define UNPACK_V2_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \
MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R2, qword ptr [VIF_SRC+16]; \
pshufd XMM_R1, XMM_R0, 0xee; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 24;
#define UNPACK_V2_32SSE_3(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+8]; \
movq XMM_R2, qword ptr [VIF_SRC+16]; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 24;
#define UNPACK_V2_32SSE_2(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+8]; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V2_32SSE_2A UNPACK_V2_32SSE_2
#define UNPACK_V2_32SSE_1(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V2_32SSE_1A UNPACK_V2_32SSE_1
// V2-16
// due to Lemmings, the lower qword has to be copied into the upper qword of every reg
#define UNPACK_V2_16SSE_4A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
punpckhwd XMM_R2, qword ptr [VIF_SRC]; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
punpckhqdq XMM_R1, XMM_R0; \
punpckhqdq XMM_R3, XMM_R2; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpcklqdq XMM_R2, XMM_R2; \
punpckhqdq XMM_R1, XMM_R1; \
punpckhqdq XMM_R3, XMM_R3; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
add VIF_SRC, 16;
#define UNPACK_V2_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
\
punpckhwd XMM_R2, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
punpckhqdq XMM_R1, XMM_R0; \
punpckhqdq XMM_R3, XMM_R2; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpcklqdq XMM_R2, XMM_R2; \
punpckhqdq XMM_R1, XMM_R1; \
punpckhqdq XMM_R3, XMM_R3; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V2_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
punpckhwd XMM_R2, qword ptr [VIF_SRC]; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
punpckhqdq XMM_R1, XMM_R0; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpcklqdq XMM_R2, XMM_R2; \
punpckhqdq XMM_R1, XMM_R1; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V2_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
\
punpckhwd XMM_R2, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
punpckhqdq XMM_R1, XMM_R0; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpcklqdq XMM_R2, XMM_R2; \
punpckhqdq XMM_R1, XMM_R1; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V2_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
\
punpckhqdq XMM_R1, XMM_R0; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpckhqdq XMM_R1, XMM_R1; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V2_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
\
punpckhqdq XMM_R1, XMM_R0; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpckhqdq XMM_R1, XMM_R1; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V2_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, dword ptr [VIF_SRC]; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
punpcklqdq XMM_R0, XMM_R0; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_V2_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
punpcklqdq XMM_R0, XMM_R0; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
// V2-8
// AND 1 Streetball needs the lower qword copied into the upper qword of every reg
#define UNPACK_V2_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
\
punpcklbw XMM_R0, XMM_R0; \
punpckhwd XMM_R2, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
punpckhqdq XMM_R1, XMM_R0; \
punpckhqdq XMM_R3, XMM_R2; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpcklqdq XMM_R2, XMM_R2; \
punpckhqdq XMM_R1, XMM_R1; \
punpckhqdq XMM_R3, XMM_R3; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V2_8SSE_4A UNPACK_V2_8SSE_4
#define UNPACK_V2_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
\
punpcklbw XMM_R0, XMM_R0; \
punpckhwd XMM_R2, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
punpckhqdq XMM_R1, XMM_R0; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpcklqdq XMM_R2, XMM_R2; \
punpckhqdq XMM_R1, XMM_R1; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 6;
#define UNPACK_V2_8SSE_3A UNPACK_V2_8SSE_3
#define UNPACK_V2_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklbw XMM_R0, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
\
punpckhqdq XMM_R1, XMM_R0; \
\
punpcklqdq XMM_R0, XMM_R0; \
punpckhqdq XMM_R1, XMM_R1; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_V2_8SSE_2A UNPACK_V2_8SSE_2
#define UNPACK_V2_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklbw XMM_R0, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
punpcklqdq XMM_R0, XMM_R0; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 2;
#define UNPACK_V2_8SSE_1A UNPACK_V2_8SSE_1
// V3-32
// Midnight Club 2 crashes if the last vector is read as a full qword at +36 (out of
// bounds), so it is read at +32 instead and shifted right by 4 bytes
#define UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R1, qword ptr [VIF_SRC+12]; \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+0); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \
\
MOVDQA XMM_R3, qword ptr [VIF_SRC+32]; \
movdqu XMM_R2, qword ptr [VIF_SRC+24]; \
psrldq XMM_R3, 4; \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+3); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R3); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R3, CL+3, 48, movdqa); \
\
UNPACK_INC_DST_##TOTALCL##_##MaskType##(4); \
\
add VIF_SRC, 48;
#define UNPACK_V3_32SSE_4A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_4(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_4x(CL, TOTALCL, MaskType, ModeType, movdqu)
#define UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R1, qword ptr [VIF_SRC+12]; \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R0); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R0, CL, 0, movdqa); \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+1); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R1); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R1, CL+1, 16, movdqa); \
\
movdqu XMM_R2, qword ptr [VIF_SRC+24]; \
\
UNPACK_Setup_##MaskType##_SSE_##ModeType##_##TOTALCL##(CL+2); \
UNPACK_##MaskType##_SSE_##ModeType##(XMM_R2); \
UNPACK_Write##TOTALCL##_##MaskType##(XMM_R2, CL+2, 32, movdqa); \
\
UNPACK_INC_DST_##TOTALCL##_##MaskType##(3); \
\
add VIF_SRC, 36;
#define UNPACK_V3_32SSE_3A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_3(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_3x(CL, TOTALCL, MaskType, ModeType, movdqu)
#define UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R1, qword ptr [VIF_SRC+12]; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 24;
#define UNPACK_V3_32SSE_2A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_2(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_2x(CL, TOTALCL, MaskType, ModeType, movdqu)
#define UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, MOVDQA) \
MOVDQA XMM_R0, qword ptr [VIF_SRC]; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V3_32SSE_1A(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqa)
#define UNPACK_V3_32SSE_1(CL, TOTALCL, MaskType, ModeType) UNPACK_V3_32SSE_1x(CL, TOTALCL, MaskType, ModeType, movdqu)
// V3-16
#define UNPACK_V3_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+6]; \
\
punpcklwd XMM_R0, XMM_R0; \
movq XMM_R2, qword ptr [VIF_SRC+12]; \
punpcklwd XMM_R1, XMM_R1; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
movq XMM_R3, qword ptr [VIF_SRC+18]; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
punpcklwd XMM_R2, XMM_R2; \
punpcklwd XMM_R3, XMM_R3; \
\
UNPACK_RIGHTSHIFT XMM_R2, 16; \
UNPACK_RIGHTSHIFT XMM_R3, 16; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 24;
#define UNPACK_V3_16SSE_4A UNPACK_V3_16SSE_4
#define UNPACK_V3_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+6]; \
\
punpcklwd XMM_R0, XMM_R0; \
movq XMM_R2, qword ptr [VIF_SRC+12]; \
punpcklwd XMM_R1, XMM_R1; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
punpcklwd XMM_R2, XMM_R2; \
\
UNPACK_RIGHTSHIFT XMM_R1, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 18;
#define UNPACK_V3_16SSE_3A UNPACK_V3_16SSE_3
#define UNPACK_V3_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+6]; \
\
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R1, XMM_R1; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V3_16SSE_2A UNPACK_V3_16SSE_2
#define UNPACK_V3_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 6;
#define UNPACK_V3_16SSE_1A UNPACK_V3_16SSE_1
// V3-8
#define UNPACK_V3_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R1, qword ptr [VIF_SRC]; \
movq XMM_R3, qword ptr [VIF_SRC+6]; \
\
punpcklbw XMM_R1, XMM_R1; \
punpcklbw XMM_R3, XMM_R3; \
punpcklwd XMM_R0, XMM_R1; \
psrldq XMM_R1, 6; \
punpcklwd XMM_R2, XMM_R3; \
psrldq XMM_R3, 6; \
punpcklwd XMM_R1, XMM_R1; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
punpcklwd XMM_R3, XMM_R3; \
\
UNPACK_RIGHTSHIFT XMM_R2, 24; \
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R3, 24; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V3_8SSE_4A UNPACK_V3_8SSE_4
#define UNPACK_V3_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
movd XMM_R1, dword ptr [VIF_SRC+3]; \
\
punpcklbw XMM_R0, XMM_R0; \
movd XMM_R2, dword ptr [VIF_SRC+6]; \
punpcklbw XMM_R1, XMM_R1; \
punpcklwd XMM_R0, XMM_R0; \
punpcklbw XMM_R2, XMM_R2; \
\
punpcklwd XMM_R1, XMM_R1; \
punpcklwd XMM_R2, XMM_R2; \
\
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 9;
#define UNPACK_V3_8SSE_3A UNPACK_V3_8SSE_3
#define UNPACK_V3_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
movd XMM_R1, dword ptr [VIF_SRC+3]; \
\
punpcklbw XMM_R0, XMM_R0; \
punpcklbw XMM_R1, XMM_R1; \
\
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R1, XMM_R1; \
\
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R1, 24; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 6;
#define UNPACK_V3_8SSE_2A UNPACK_V3_8SSE_2
#define UNPACK_V3_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklbw XMM_R0, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 3;
#define UNPACK_V3_8SSE_1A UNPACK_V3_8SSE_1
// V4-32
#define UNPACK_V4_32SSE_4A(CL, TOTALCL, MaskType, ModeType) \
movdqa XMM_R0, qword ptr [VIF_SRC]; \
movdqa XMM_R1, qword ptr [VIF_SRC+16]; \
movdqa XMM_R2, qword ptr [VIF_SRC+32]; \
movdqa XMM_R3, qword ptr [VIF_SRC+48]; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 64;
#define UNPACK_V4_32SSE_4(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R1, qword ptr [VIF_SRC+16]; \
movdqu XMM_R2, qword ptr [VIF_SRC+32]; \
movdqu XMM_R3, qword ptr [VIF_SRC+48]; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 64;
#define UNPACK_V4_32SSE_3A(CL, TOTALCL, MaskType, ModeType) \
movdqa XMM_R0, qword ptr [VIF_SRC]; \
movdqa XMM_R1, qword ptr [VIF_SRC+16]; \
movdqa XMM_R2, qword ptr [VIF_SRC+32]; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 48;
#define UNPACK_V4_32SSE_3(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R1, qword ptr [VIF_SRC+16]; \
movdqu XMM_R2, qword ptr [VIF_SRC+32]; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 48;
#define UNPACK_V4_32SSE_2A(CL, TOTALCL, MaskType, ModeType) \
movdqa XMM_R0, qword ptr [VIF_SRC]; \
movdqa XMM_R1, qword ptr [VIF_SRC+16]; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 32;
#define UNPACK_V4_32SSE_2(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R1, qword ptr [VIF_SRC+16]; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 32;
#define UNPACK_V4_32SSE_1A(CL, TOTALCL, MaskType, ModeType) \
movdqa XMM_R0, qword ptr [VIF_SRC]; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V4_32SSE_1(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
// V4-16
#define UNPACK_V4_16SSE_4A(CL, TOTALCL, MaskType, ModeType) \
\
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
punpckhwd XMM_R1, qword ptr [VIF_SRC]; \
punpcklwd XMM_R2, qword ptr [VIF_SRC+16]; \
punpckhwd XMM_R3, qword ptr [VIF_SRC+16]; \
\
UNPACK_RIGHTSHIFT XMM_R1, 16; \
UNPACK_RIGHTSHIFT XMM_R3, 16; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 32;
#define UNPACK_V4_16SSE_4(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
movdqu XMM_R2, qword ptr [VIF_SRC+16]; \
\
punpckhwd XMM_R1, XMM_R0; \
punpckhwd XMM_R3, XMM_R2; \
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
\
UNPACK_RIGHTSHIFT XMM_R1, 16; \
UNPACK_RIGHTSHIFT XMM_R3, 16; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 32;
#define UNPACK_V4_16SSE_3A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
punpckhwd XMM_R1, qword ptr [VIF_SRC]; \
punpcklwd XMM_R2, qword ptr [VIF_SRC+16]; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 24;
#define UNPACK_V4_16SSE_3(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R2, qword ptr [VIF_SRC+16]; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
UNPACK_RIGHTSHIFT XMM_R2, 16; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 24;
#define UNPACK_V4_16SSE_2A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
punpckhwd XMM_R1, qword ptr [VIF_SRC]; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V4_16SSE_2(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movq XMM_R1, qword ptr [VIF_SRC+8]; \
\
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R1, XMM_R1; \
\
UNPACK_RIGHTSHIFT XMM_R0, 16; \
UNPACK_RIGHTSHIFT XMM_R1, 16; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V4_16SSE_1A(CL, TOTALCL, MaskType, ModeType) \
punpcklwd XMM_R0, qword ptr [VIF_SRC]; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V4_16SSE_1(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 16; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
// V4-8
#define UNPACK_V4_8SSE_4A(CL, TOTALCL, MaskType, ModeType) \
punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
punpckhbw XMM_R2, qword ptr [VIF_SRC]; \
\
punpckhwd XMM_R1, XMM_R0; \
punpckhwd XMM_R3, XMM_R2; \
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
\
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R3, 24; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V4_8SSE_4(CL, TOTALCL, MaskType, ModeType) \
movdqu XMM_R0, qword ptr [VIF_SRC]; \
\
punpckhbw XMM_R2, XMM_R0; \
punpcklbw XMM_R0, XMM_R0; \
\
punpckhwd XMM_R3, XMM_R2; \
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R3, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R1, 24; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 16;
#define UNPACK_V4_8SSE_3A(CL, TOTALCL, MaskType, ModeType) \
punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
punpckhbw XMM_R2, qword ptr [VIF_SRC]; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
\
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V4_8SSE_3(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
movd XMM_R2, dword ptr [VIF_SRC+8]; \
\
punpcklbw XMM_R0, XMM_R0; \
punpcklbw XMM_R2, XMM_R2; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
UNPACK_RIGHTSHIFT XMM_R2, 24; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 12;
#define UNPACK_V4_8SSE_2A(CL, TOTALCL, MaskType, ModeType) \
punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V4_8SSE_2(CL, TOTALCL, MaskType, ModeType) \
movq XMM_R0, qword ptr [VIF_SRC]; \
\
punpcklbw XMM_R0, XMM_R0; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
UNPACK_RIGHTSHIFT XMM_R1, 24; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V4_8SSE_1A(CL, TOTALCL, MaskType, ModeType) \
punpcklbw XMM_R0, qword ptr [VIF_SRC]; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_V4_8SSE_1(CL, TOTALCL, MaskType, ModeType) \
movd XMM_R0, dword ptr [VIF_SRC]; \
punpcklbw XMM_R0, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
UNPACK_RIGHTSHIFT XMM_R0, 24; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
// V4-5
.extern s_TempDecompress
#define DECOMPRESS_RGBA(OFFSET) \
mov %bl, %al; \
shl %bl, 3; \
mov byte ptr [s_TempDecompress+OFFSET], %bl; \
\
mov %bx, %ax; \
shr %bx, 2; \
and %bx, 0xf8; \
mov byte ptr [s_TempDecompress+OFFSET+1], %bl; \
\
mov %bx, %ax; \
shr %bx, 7; \
and %bx, 0xf8; \
mov byte ptr [s_TempDecompress+OFFSET+2], %bl; \
mov %bx, %ax; \
shr %bx, 8; \
and %bx, 0x80; \
mov byte ptr [s_TempDecompress+OFFSET+3], %bl;
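// DECOMPRESS_RGBA expands one RGBA5551 value (in %ax) into four bytes of
// s_TempDecompress. Equivalent C sketch (reference only, v = 16-bit input):
//   r = (v << 3) & 0xf8;   // bits 0-4
//   g = (v >> 2) & 0xf8;   // bits 5-9
//   b = (v >> 7) & 0xf8;   // bits 10-14
//   a = (v >> 8) & 0x80;   // bit 15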
#define UNPACK_V4_5SSE_4(CL, TOTALCL, MaskType, ModeType) \
mov %eax, dword ptr [VIF_SRC]; \
DECOMPRESS_RGBA(0); \
\
shr %eax, 16; \
DECOMPRESS_RGBA(4); \
\
mov %eax, dword ptr [VIF_SRC+4]; \
DECOMPRESS_RGBA(8); \
\
shr %eax, 16; \
DECOMPRESS_RGBA(12); \
\
movdqa XMM_R0, qword ptr [s_TempDecompress]; \
\
punpckhbw XMM_R2, XMM_R0; \
punpcklbw XMM_R0, XMM_R0; \
\
punpckhwd XMM_R3, XMM_R2; \
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
\
psrld XMM_R0, 24; \
psrld XMM_R1, 24; \
psrld XMM_R2, 24; \
psrld XMM_R3, 24; \
\
UNPACK4_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 8;
#define UNPACK_V4_5SSE_4A UNPACK_V4_5SSE_4
#define UNPACK_V4_5SSE_3(CL, TOTALCL, MaskType, ModeType) \
mov %eax, dword ptr [VIF_SRC]; \
DECOMPRESS_RGBA(0); \
\
shr %eax, 16; \
DECOMPRESS_RGBA(4); \
\
mov %ax, word ptr [VIF_SRC+4]; \
DECOMPRESS_RGBA(8); \
\
movdqa XMM_R0, qword ptr [s_TempDecompress]; \
\
punpckhbw XMM_R2, XMM_R0; \
punpcklbw XMM_R0, XMM_R0; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
punpcklwd XMM_R2, XMM_R2; \
\
psrld XMM_R0, 24; \
psrld XMM_R1, 24; \
psrld XMM_R2, 24; \
\
UNPACK3_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 6;
#define UNPACK_V4_5SSE_3A UNPACK_V4_5SSE_3
#define UNPACK_V4_5SSE_2(CL, TOTALCL, MaskType, ModeType) \
mov %eax, dword ptr [VIF_SRC]; \
DECOMPRESS_RGBA(0); \
\
shr %eax, 16; \
DECOMPRESS_RGBA(4); \
\
movq XMM_R0, qword ptr [s_TempDecompress]; \
\
punpcklbw XMM_R0, XMM_R0; \
\
punpckhwd XMM_R1, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
psrld XMM_R0, 24; \
psrld XMM_R1, 24; \
\
UNPACK2_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 4;
#define UNPACK_V4_5SSE_2A UNPACK_V4_5SSE_2
#define UNPACK_V4_5SSE_1(CL, TOTALCL, MaskType, ModeType) \
mov %ax, word ptr [VIF_SRC]; \
DECOMPRESS_RGBA(0); \
\
movd XMM_R0, dword ptr [s_TempDecompress]; \
punpcklbw XMM_R0, XMM_R0; \
punpcklwd XMM_R0, XMM_R0; \
\
psrld XMM_R0, 24; \
\
UNPACK1_SSE(CL, TOTALCL, MaskType, ModeType); \
\
add VIF_SRC, 2;
#define UNPACK_V4_5SSE_1A UNPACK_V4_5SSE_1
#pragma warning(disable:4731)
#define SAVE_ROW_REG_BASE \
mov VIF_TMPADDR, _vifRow; \
movdqa qword ptr [VIF_TMPADDR], XMM_ROW; \
mov VIF_TMPADDR, _vifRegs; \
movss dword ptr [VIF_TMPADDR+0x100], XMM_ROW; \
psrldq XMM_ROW, 4; \
movss dword ptr [VIF_TMPADDR+0x110], XMM_ROW; \
psrldq XMM_ROW, 4; \
movss dword ptr [VIF_TMPADDR+0x120], XMM_ROW; \
psrldq XMM_ROW, 4; \
movss dword ptr [VIF_TMPADDR+0x130], XMM_ROW;
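// Stores the row register back to both _vifRow and _vifRegs; the
// 0x100/0x110/0x120/0x130 offsets assume the r0-r3 fields of the
// VIFregisters struct live there, one every 0x10 bytes.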
#define SAVE_NO_REG
#ifdef __x86_64__
#define INIT_ARGS()
#define POP_REGS()
#define INC_STACK(reg) add %rsp, 8;
#else
// 32-bit builds take their arguments on the stack
#define INIT_ARGS() \
push %edi; \
push %esi; \
push %ebx; \
mov VIF_DST, dword ptr [%esp+4+12]; \
mov VIF_SRC, dword ptr [%esp+8+12]; \
mov VIF_SIZE, dword ptr [%esp+12+12];
#define POP_REGS() \
pop %ebx; \
pop %esi; \
pop %edi;
#define INC_STACK(reg) add %esp, 4;
#endif
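// Calling-convention note: on x86-64 INIT_ARGS() is empty because the SysV
// argument registers already match the defines above (dest in rdi/VIF_DST,
// data in rsi/VIF_SRC, dmasize in edx/VIF_SIZE); the 32-bit path loads the
// same three stack arguments after saving edi/esi/ebx.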
// qsize: number of packed source bytes consumed per decompressed qword
// int UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType(u32* dest, u32* data, int dmasize)
#define defUNPACK_SkippingWrite(name, MaskType, ModeType, qsize, sign, SAVE_ROW_REG) \
.globl UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType; \
UNPACK_SkippingWrite_##name##_##sign##_##MaskType##_##ModeType: \
INIT_ARGS(); \
mov VIF_TMPADDR, _vifRegs; \
movzx VIF_INC, byte ptr [VIF_TMPADDR + 0x40]; \
movzx VIF_SAVEEBX, byte ptr [VIF_TMPADDR + 0x41]; \
sub VIF_INC, VIF_SAVEEBX; \
shl VIF_INC, 4; \
\
cmp VIF_SAVEEBXd, 1; \
je name##_##sign##_##MaskType##_##ModeType##_WL1; \
cmp VIF_SAVEEBXd, 2; \
je name##_##sign##_##MaskType##_##ModeType##_WL2; \
cmp VIF_SAVEEBXd, 3; \
je name##_##sign##_##MaskType##_##ModeType##_WL3; \
jmp name##_##sign##_##MaskType##_##ModeType##_WL4; \
\
name##_##sign##_##MaskType##_##ModeType##_WL1: \
UNPACK_Start_Setup_##MaskType##_SSE_##ModeType##(0); \
\
cmp VIF_SIZE, qsize; \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
\
add VIF_INC, 16; \
\
/* first align VIF_SRC to 16 bytes */ \
name##_##sign##_##MaskType##_##ModeType##_C1_Align16: \
\
test VIF_SRC, 15; \
jz name##_##sign##_##MaskType##_##ModeType##_C1_UnpackAligned; \
\
UNPACK_##name##SSE_1(0, 1, MaskType, ModeType); \
\
cmp VIF_SIZE, (2*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec; \
sub VIF_SIZE, qsize; \
jmp name##_##sign##_##MaskType##_##ModeType##_C1_Align16; \
\
name##_##sign##_##MaskType##_##ModeType##_C1_UnpackAligned: \
\
cmp VIF_SIZE, (2*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1; \
cmp VIF_SIZE, (3*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2; \
cmp VIF_SIZE, (4*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack3; \
prefetchnta [VIF_SRC + 64]; \
\
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack4: \
UNPACK_##name##SSE_4A(0, 1, MaskType, ModeType); \
\
cmp VIF_SIZE, (8*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_DoneUnpack4; \
sub VIF_SIZE, (4*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C1_Unpack4; \
\
name##_##sign##_##MaskType##_##ModeType##_C1_DoneUnpack4: \
\
sub VIF_SIZE, (4*qsize); \
cmp VIF_SIZE, qsize; \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
cmp VIF_SIZE, (2*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1; \
cmp VIF_SIZE, (3*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2; \
/* fall through */ \
\
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack3: \
UNPACK_##name##SSE_3A(0, 1, MaskType, ModeType); \
\
sub VIF_SIZE, (3*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
\
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack2: \
UNPACK_##name##SSE_2A(0, 1, MaskType, ModeType); \
\
sub VIF_SIZE, (2*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C1_Done3; \
\
name##_##sign##_##MaskType##_##ModeType##_C1_Unpack1: \
UNPACK_##name##SSE_1A(0, 1, MaskType, ModeType); \
name##_##sign##_##MaskType##_##ModeType##_C1_DoneWithDec: \
sub VIF_SIZE, qsize; \
name##_##sign##_##MaskType##_##ModeType##_C1_Done3: \
SAVE_ROW_REG; \
mov %eax, VIF_SIZE; \
POP_REGS(); \
ret; \
\
name##_##sign##_##MaskType##_##ModeType##_WL2: \
cmp VIF_SIZE, (2*qsize); \
\
jl name##_##sign##_##MaskType##_##ModeType##_C2_Done3; \
name##_##sign##_##MaskType##_##ModeType##_C2_Unpack: \
UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
\
add VIF_DST, VIF_INC; /* take into account wl */ \
cmp VIF_SIZE, (4*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C2_Done2; \
sub VIF_SIZE, (2*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C2_Unpack; /* unpack next */ \
\
name##_##sign##_##MaskType##_##ModeType##_C2_Done2: \
sub VIF_SIZE, (2*qsize); \
name##_##sign##_##MaskType##_##ModeType##_C2_Done3: \
cmp VIF_SIZE, qsize; \
/* execute left over qw */ \
jl name##_##sign##_##MaskType##_##ModeType##_C2_Done4; \
UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
\
sub VIF_SIZE, qsize; \
name##_##sign##_##MaskType##_##ModeType##_C2_Done4: \
\
SAVE_ROW_REG; \
mov %eax, VIF_SIZE; \
POP_REGS(); \
ret; \
\
name##_##sign##_##MaskType##_##ModeType##_WL3: \
cmp VIF_SIZE, (3*qsize); \
\
jl name##_##sign##_##MaskType##_##ModeType##_C3_Done5; \
name##_##sign##_##MaskType##_##ModeType##_C3_Unpack: \
UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \
\
add VIF_DST, VIF_INC; /* take into account wl */ \
cmp VIF_SIZE, (6*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C3_Done2; \
sub VIF_SIZE, (3*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C3_Unpack; /* unpack next */ \
name##_##sign##_##MaskType##_##ModeType##_C3_Done2: \
sub VIF_SIZE, (3*qsize); \
name##_##sign##_##MaskType##_##ModeType##_C3_Done5: \
cmp VIF_SIZE, qsize; \
jl name##_##sign##_##MaskType##_##ModeType##_C3_Done4; \
\
/* execute left over qw */ \
cmp VIF_SIZE, (2*qsize); \
jl name##_##sign##_##MaskType##_##ModeType##_C3_Done3; \
\
/* process 2 qws */ \
UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
\
sub VIF_SIZE, (2*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C3_Done4; \
name##_##sign##_##MaskType##_##ModeType##_C3_Done3: \
/* process 1 qw */ \
sub VIF_SIZE, qsize; \
UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
name##_##sign##_##MaskType##_##ModeType##_C3_Done4: \
SAVE_ROW_REG; \
mov %eax, VIF_SIZE; \
POP_REGS(); \
ret; \
\
name##_##sign##_##MaskType##_##ModeType##_WL4: /* >= 4 */ \
sub VIF_SAVEEBX, 3; \
push VIF_INC; \
cmp VIF_SIZE, qsize; \
jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
\
name##_##sign##_##MaskType##_##ModeType##_C4_Unpack: \
cmp VIF_SIZE, (3*qsize); \
jge name##_##sign##_##MaskType##_##ModeType##_C4_Unpack3; \
cmp VIF_SIZE, (2*qsize); \
jge name##_##sign##_##MaskType##_##ModeType##_C4_Unpack2; \
\
UNPACK_##name##SSE_1(0, 0, MaskType, ModeType); \
\
/* not enough data left */ \
sub VIF_SIZE, qsize; \
jmp name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
name##_##sign##_##MaskType##_##ModeType##_C4_Unpack2: \
UNPACK_##name##SSE_2(0, 0, MaskType, ModeType); \
\
/* not enough data left */ \
sub VIF_SIZE, (2*qsize); \
jmp name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
name##_##sign##_##MaskType##_##ModeType##_C4_Unpack3: \
UNPACK_##name##SSE_3(0, 0, MaskType, ModeType); \
\
sub VIF_SIZE, (3*qsize); \
/* more data left, process 1qw at a time */ \
mov VIF_INC, VIF_SAVEEBX; \
\
name##_##sign##_##MaskType##_##ModeType##_C4_UnpackX: \
/* check if any data left */ \
cmp VIF_SIZE, qsize; \
jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
\
UNPACK_##name##SSE_1(3, 0, MaskType, ModeType); \
\
sub VIF_SIZE, qsize; \
cmp VIF_INC, 1; \
je name##_##sign##_##MaskType##_##ModeType##_C4_DoneLoop; \
sub VIF_INC, 1; \
jmp name##_##sign##_##MaskType##_##ModeType##_C4_UnpackX; \
name##_##sign##_##MaskType##_##ModeType##_C4_DoneLoop: \
add VIF_DST, [VIF_ESP]; /* take into account wl */ \
cmp VIF_SIZE, qsize; \
jl name##_##sign##_##MaskType##_##ModeType##_C4_Done; \
jmp name##_##sign##_##MaskType##_##ModeType##_C4_Unpack; /* unpack next */ \
name##_##sign##_##MaskType##_##ModeType##_C4_Done: \
\
SAVE_ROW_REG; \
INC_STACK(); \
mov %eax, VIF_SIZE; \
POP_REGS(); \
ret;
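// Skipping-write layout implemented above, as a sketch: with cl/wl read
// from the assumed vifRegs offsets +0x40/+0x41, each group writes wl
// qwords and then skips ahead so that groups start cl qwords apart:
//   while (data left) { write wl qwords; dest += (cl - wl) * 16; }
// VIF_INC is preloaded with (cl - wl) * 16 for exactly this step.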
#define UNPACK_RIGHTSHIFT psrld
#define defUNPACK_SkippingWrite2(name, qsize) \
defUNPACK_SkippingWrite(name, Regular, 0, qsize, u, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Regular, 1, qsize, u, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Regular, 2, qsize, u, SAVE_ROW_REG_BASE) \
defUNPACK_SkippingWrite(name, Mask, 0, qsize, u, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Mask, 1, qsize, u, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Mask, 2, qsize, u, SAVE_ROW_REG_BASE) \
defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, u, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, u, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, u, SAVE_ROW_REG_BASE)
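// Each invocation below expands to nine entry points (3 mask types x 3
// modes); for example, per the pasting scheme above, S_32 yields
// UNPACK_SkippingWrite_S_32_u_Regular_0 through UNPACK_SkippingWrite_S_32_u_WriteMask_2.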
defUNPACK_SkippingWrite2(S_32, 4)
defUNPACK_SkippingWrite2(S_16, 2)
defUNPACK_SkippingWrite2(S_8, 1)
defUNPACK_SkippingWrite2(V2_32, 8)
defUNPACK_SkippingWrite2(V2_16, 4)
defUNPACK_SkippingWrite2(V2_8, 2)
defUNPACK_SkippingWrite2(V3_32, 12)
defUNPACK_SkippingWrite2(V3_16, 6)
defUNPACK_SkippingWrite2(V3_8, 3)
defUNPACK_SkippingWrite2(V4_32, 16)
defUNPACK_SkippingWrite2(V4_16, 8)
defUNPACK_SkippingWrite2(V4_8, 4)
defUNPACK_SkippingWrite2(V4_5, 2)
#undef UNPACK_RIGHTSHIFT
#undef defUNPACK_SkippingWrite2
#define UNPACK_RIGHTSHIFT psrad
#define defUNPACK_SkippingWrite2(name, qsize) \
defUNPACK_SkippingWrite(name, Mask, 0, qsize, s, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Regular, 0, qsize, s, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Regular, 1, qsize, s, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Regular, 2, qsize, s, SAVE_ROW_REG_BASE) \
defUNPACK_SkippingWrite(name, Mask, 1, qsize, s, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, Mask, 2, qsize, s, SAVE_ROW_REG_BASE) \
defUNPACK_SkippingWrite(name, WriteMask, 0, qsize, s, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, WriteMask, 1, qsize, s, SAVE_NO_REG) \
defUNPACK_SkippingWrite(name, WriteMask, 2, qsize, s, SAVE_ROW_REG_BASE)
defUNPACK_SkippingWrite2(S_16, 2)
defUNPACK_SkippingWrite2(S_8, 1)
defUNPACK_SkippingWrite2(V2_16, 4)
defUNPACK_SkippingWrite2(V2_8, 2)
defUNPACK_SkippingWrite2(V3_16, 6)
defUNPACK_SkippingWrite2(V3_8, 3)
defUNPACK_SkippingWrite2(V4_16, 8)
defUNPACK_SkippingWrite2(V4_8, 4)
#undef UNPACK_RIGHTSHIFT
#undef defUNPACK_SkippingWrite2