/*  Pcsx2 - Pc Ps2 Emulator
 *  Copyright (C) 2002-2005  Pcsx2 Team
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

// Fast assembly routines for x86-64
// zerofrog(@gmail.com)
.intel_syntax

.extern g_EEFreezeRegs
.extern FreezeMMXRegs_

// mmx memcmp implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
// u8 memcmp_mmx(const void* src1, const void* src2, int cmpsize)

#ifdef __x86_64__
#define MEMCMP_SRC1 %rdi
#define MEMCMP_SRC2 %rsi
#define MEMCMP_SIZE %edx
#else
#define MEMCMP_SRC1 %edx
#define MEMCMP_SRC2 %esi
#define MEMCMP_SIZE %ecx
#endif

.globl memcmp_mmx
memcmp_mmx:
#ifndef __x86_64__
        // make sure mmx regs are stored
        // FreezeMMXRegs(1);
        cmp dword ptr [g_EEFreezeRegs], 0
        je memcmp_mmx_begin
        push 1
        call FreezeMMXRegs_
        add %esp, 4

memcmp_mmx_begin:
        push %esi
        mov MEMCMP_SRC1, dword ptr [%esp+8]
        mov MEMCMP_SRC2, dword ptr [%esp+12]
        mov MEMCMP_SIZE, dword ptr [%esp+16]
#endif

        cmp MEMCMP_SIZE, 32
        jl memcmp_Done4

        // custom test first 8 to make sure things are ok
        movq %mm0, [MEMCMP_SRC2]
        movq %mm1, [MEMCMP_SRC2+8]
        pcmpeqd %mm0, [MEMCMP_SRC1]
        pcmpeqd %mm1, [MEMCMP_SRC1+8]
        pand %mm0, %mm1
        movq %mm2, [MEMCMP_SRC2+16]
        pmovmskb %eax, %mm0
        movq %mm3, [MEMCMP_SRC2+24]

        // check if eq
        cmp %eax, 0xff
        je memcmp_NextComp
        mov %eax, 1
        jmp memcmp_End

memcmp_NextComp:
        pcmpeqd %mm2, [MEMCMP_SRC1+16]
        pcmpeqd %mm3, [MEMCMP_SRC1+24]
        pand %mm2, %mm3
        pmovmskb %eax, %mm2

        sub MEMCMP_SIZE, 32
        add MEMCMP_SRC2, 32
        add MEMCMP_SRC1, 32

        // check if eq
        cmp %eax, 0xff
        je memcmp_ContinueTest
        mov %eax, 1
        jmp memcmp_End

        // note: the next two instructions are unreachable; both paths above
        // jump away (memcmp_ContinueTest performs this size check instead)
        cmp MEMCMP_SIZE, 64
        jl memcmp_Done8

memcmp_Cmp8:
        // main loop: compare 64 bytes per iteration
        movq %mm0, [MEMCMP_SRC2]
        movq %mm1, [MEMCMP_SRC2+8]
        movq %mm2, [MEMCMP_SRC2+16]
        movq %mm3, [MEMCMP_SRC2+24]
        movq %mm4, [MEMCMP_SRC2+32]
        movq %mm5, [MEMCMP_SRC2+40]
        movq %mm6, [MEMCMP_SRC2+48]
        movq %mm7, [MEMCMP_SRC2+56]
        pcmpeqd %mm0, [MEMCMP_SRC1]
        pcmpeqd %mm1, [MEMCMP_SRC1+8]
        pcmpeqd %mm2, [MEMCMP_SRC1+16]
        pcmpeqd %mm3, [MEMCMP_SRC1+24]
        pand %mm0, %mm1
        pcmpeqd %mm4, [MEMCMP_SRC1+32]
        pand %mm0, %mm2
        pcmpeqd %mm5, [MEMCMP_SRC1+40]
        pand %mm0, %mm3
        pcmpeqd %mm6, [MEMCMP_SRC1+48]
        pand %mm0, %mm4
        pcmpeqd %mm7, [MEMCMP_SRC1+56]
        pand %mm0, %mm5
        pand %mm0, %mm6
        pand %mm0, %mm7
        pmovmskb %eax, %mm0

        // check if eq
        cmp %eax, 0xff
        je memcmp_Continue
        mov %eax, 1
        jmp memcmp_End

memcmp_Continue:
        sub MEMCMP_SIZE, 64
        add MEMCMP_SRC2, 64
        add MEMCMP_SRC1, 64
memcmp_ContinueTest:
        cmp MEMCMP_SIZE, 64
        jge memcmp_Cmp8

memcmp_Done8:
        // handle a remaining 32-byte block, if any
        test MEMCMP_SIZE, 0x20
        jz memcmp_Done4
        movq %mm0, [MEMCMP_SRC2]
        movq %mm1, [MEMCMP_SRC2+8]
        movq %mm2, [MEMCMP_SRC2+16]
        movq %mm3, [MEMCMP_SRC2+24]
        pcmpeqd %mm0, [MEMCMP_SRC1]
        pcmpeqd %mm1, [MEMCMP_SRC1+8]
        pcmpeqd %mm2, [MEMCMP_SRC1+16]
        pcmpeqd %mm3, [MEMCMP_SRC1+24]
        pand %mm0, %mm1
        pand %mm0, %mm2
        pand %mm0, %mm3
        pmovmskb %eax, %mm0
        sub MEMCMP_SIZE, 32
        add MEMCMP_SRC2, 32
        add MEMCMP_SRC1, 32

        // check if eq
        cmp %eax, 0xff
        je memcmp_Done4
        mov %eax, 1
        jmp memcmp_End

memcmp_Done4:
        cmp MEMCMP_SIZE, 24
        jne memcmp_Done2
        movq %mm0, [MEMCMP_SRC2]
        movq %mm1, [MEMCMP_SRC2+8]
        movq %mm2, [MEMCMP_SRC2+16]
        pcmpeqd %mm0, [MEMCMP_SRC1]
        pcmpeqd %mm1, [MEMCMP_SRC1+8]
        pcmpeqd %mm2, [MEMCMP_SRC1+16]
        pand %mm0, %mm1
        pand %mm0, %mm2
        pmovmskb %eax, %mm0

        // check if eq
        cmp %eax, 0xff
        je memcmp_Done
        mov %eax, 1
        jmp memcmp_End

memcmp_Done2:
        cmp MEMCMP_SIZE, 16
        jne memcmp_Done1
        movq %mm0, [MEMCMP_SRC2]
        movq %mm1, [MEMCMP_SRC2+8]
        pcmpeqd %mm0, [MEMCMP_SRC1]
        pcmpeqd %mm1, [MEMCMP_SRC1+8]
        pand %mm0, %mm1
        pmovmskb %eax, %mm0

        // check if eq
        cmp %eax, 0xff
        je memcmp_Done
        mov %eax, 1
        jmp memcmp_End

memcmp_Done1:
        cmp MEMCMP_SIZE, 8
        jne memcmp_Done

        // compare the last 8 bytes as two 32-bit words through %eax, so the
        // loads stay 32-bit on x86-64 as well (loading through MEMCMP_SRC2
        // would read a full qword there and overrun the buffer by 4 bytes)
        mov %eax, [MEMCMP_SRC2]
        cmp %eax, [MEMCMP_SRC1]
        jne memcmp_NotEqual
        mov %eax, [MEMCMP_SRC2+4]
        cmp %eax, [MEMCMP_SRC1+4]
        je memcmp_Done

memcmp_NotEqual:
        mov %eax, 1
        jmp memcmp_End

memcmp_Done:
        xor %eax, %eax

memcmp_End:
        emms
#ifndef __x86_64__
        pop %esi
#endif
        ret
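
// For reference, the behavior of memcmp_mmx expressed as plain C. This is an
// illustrative sketch only (not part of the build; the name memcmp_mmx_c is
// hypothetical), assuming cmpsize is a multiple of 8 as documented above:
//
//   typedef unsigned char u8;
//   typedef unsigned long long u64;
//
//   u8 memcmp_mmx_c(const void* src1, const void* src2, int cmpsize)
//   {
//       const u64* a = (const u64*)src1;
//       const u64* b = (const u64*)src2;
//       // walk both buffers one qword at a time, like the loops above
//       for (; cmpsize > 0; cmpsize -= 8)
//           if (*a++ != *b++)
//               return 1;  // nonzero: buffers differ
//       return 0;          // zero: buffers are equal
//   }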

// memxor_mmx
// xor-folds the source buffer into a single qword: writes the xor of all
// aligned 8-byte blocks of the source to the 8 bytes at the destination.
// size is assumed to be a nonzero multiple of 8.

#ifdef __x86_64__
#define MEMXOR_SRC1 %rdi
#define MEMXOR_SRC2 %rsi
#define MEMXOR_SIZE %edx
#else
#define MEMXOR_SRC1 %edx
#define MEMXOR_SRC2 %esi
#define MEMXOR_SIZE %ecx
#endif

.globl memxor_mmx
memxor_mmx:
#ifndef __x86_64__
        // make sure mmx regs are stored
        // FreezeMMXRegs(1);
        cmp dword ptr [g_EEFreezeRegs], 0
        je memxor_mmx_begin
        push 1
        call FreezeMMXRegs_
        add %esp, 4

memxor_mmx_begin:
        push %esi
        mov MEMXOR_SRC1, dword ptr [%esp+8]
        mov MEMXOR_SRC2, dword ptr [%esp+12]
        mov MEMXOR_SIZE, dword ptr [%esp+16]
#endif

        cmp MEMXOR_SIZE, 64
        jl memxor_Setup4

        // seed eight accumulators with the first 64 bytes
        movq %mm0, [MEMXOR_SRC2]
        movq %mm1, [MEMXOR_SRC2+8]
        movq %mm2, [MEMXOR_SRC2+16]
        movq %mm3, [MEMXOR_SRC2+24]
        movq %mm4, [MEMXOR_SRC2+32]
        movq %mm5, [MEMXOR_SRC2+40]
        movq %mm6, [MEMXOR_SRC2+48]
        movq %mm7, [MEMXOR_SRC2+56]
        sub MEMXOR_SIZE, 64
        add MEMXOR_SRC2, 64
        cmp MEMXOR_SIZE, 64
        jl memxor_End8

memxor_Cmp8:
        // main loop: fold 64 bytes per iteration into the accumulators
        pxor %mm0, [MEMXOR_SRC2]
        pxor %mm1, [MEMXOR_SRC2+8]
        pxor %mm2, [MEMXOR_SRC2+16]
        pxor %mm3, [MEMXOR_SRC2+24]
        pxor %mm4, [MEMXOR_SRC2+32]
        pxor %mm5, [MEMXOR_SRC2+40]
        pxor %mm6, [MEMXOR_SRC2+48]
        pxor %mm7, [MEMXOR_SRC2+56]
        sub MEMXOR_SIZE, 64
        add MEMXOR_SRC2, 64
        cmp MEMXOR_SIZE, 64
        jge memxor_Cmp8

memxor_End8:
        // fold eight accumulators down to four
        pxor %mm0, %mm4
        pxor %mm1, %mm5
        pxor %mm2, %mm6
        pxor %mm3, %mm7

        cmp MEMXOR_SIZE, 32
        jl memxor_End4
        pxor %mm0, [MEMXOR_SRC2]
        pxor %mm1, [MEMXOR_SRC2+8]
        pxor %mm2, [MEMXOR_SRC2+16]
        pxor %mm3, [MEMXOR_SRC2+24]
        sub MEMXOR_SIZE, 32
        add MEMXOR_SRC2, 32
        jmp memxor_End4

memxor_Setup4:
        cmp MEMXOR_SIZE, 32
        jl memxor_Setup2

        movq %mm0, [MEMXOR_SRC2]
        movq %mm1, [MEMXOR_SRC2+8]
        movq %mm2, [MEMXOR_SRC2+16]
        movq %mm3, [MEMXOR_SRC2+24]
        sub MEMXOR_SIZE, 32
        add MEMXOR_SRC2, 32

memxor_End4:
        // fold four accumulators down to two
        pxor %mm0, %mm2
        pxor %mm1, %mm3

        cmp MEMXOR_SIZE, 16
        jl memxor_End2
        pxor %mm0, [MEMXOR_SRC2]
        pxor %mm1, [MEMXOR_SRC2+8]
        sub MEMXOR_SIZE, 16
        add MEMXOR_SRC2, 16
        jmp memxor_End2

memxor_Setup2:
        cmp MEMXOR_SIZE, 16
        jl memxor_Setup1

        movq %mm0, [MEMXOR_SRC2]
        movq %mm1, [MEMXOR_SRC2+8]
        sub MEMXOR_SIZE, 16
        add MEMXOR_SRC2, 16

memxor_End2:
        // fold two accumulators down to one
        pxor %mm0, %mm1

        cmp MEMXOR_SIZE, 8
        jl memxor_End1
        pxor %mm0, [MEMXOR_SRC2]

memxor_End1:
        movq [MEMXOR_SRC1], %mm0
        jmp memxor_End

memxor_Setup1:
        // fewer than 16 bytes: the single qword is the whole result
        movq %mm0, [MEMXOR_SRC2]
        movq [MEMXOR_SRC1], %mm0

memxor_End:
        emms
#ifndef __x86_64__
        pop %esi
#endif
        ret
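
// For reference, the behavior of memxor_mmx expressed as plain C. This is an
// illustrative sketch only (not part of the build; the name memxor_mmx_c is
// hypothetical), assuming size is a nonzero multiple of 8:
//
//   typedef unsigned long long u64;
//
//   void memxor_mmx_c(void* dst, const void* src, int size)
//   {
//       const u64* p = (const u64*)src;
//       u64 acc = *p++;   // seed with the first qword (memxor_Setup1 case)
//       for (size -= 8; size > 0; size -= 8)
//           acc ^= *p++;  // fold each remaining qword into the accumulator
//       *(u64*)dst = acc; // store the 64-bit xor of the whole buffer
//   }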