/* Copyright (C) 2012 Corn aligned case & byte copy (except ASM) by Copyright (C) 2009 Raphael E-mail: raphael@fx-world.org homepage: http://wordpress.fx-world.org */ #include "Base/Types.h" #include "Base/Types.h" #include "System/Endian.h" #include "Utility/FastMemcpy.h" #include "System/Timing.h" #include #include //***************************************************************************** //Copy native N64 memory with CPU only //Corn //Little Endian //***************************************************************************** void memcpy_byteswap( void* dst, const void* src, size_t size ) { u8* src8 = (u8*)src; u8* dst8 = (u8*)dst; // < 4 isn't worth trying any optimisations... if(size>=4) { // Align dst on 4 bytes or just resume if already done while (((((uintptr_t)dst8) & 0x3)!=0) ) { *(u8*)((uintptr_t)dst8++ ^ U8_TWIDDLE) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE); size--; } // We are dst aligned now but need at least 4 bytes to copy if(size>=4) { u32 src_alignment = (uintptr_t)src8&0x3; if (src_alignment == 0) // We are now both dst and src aligned and >= 4 bytes to copy { #if defined(DAEDALUS_POSIX) || defined(DAEDALUS_W32) u32 size_aligned = (size & ~0x3); // memcpy is almost 50% faster for windows and linux memcpy(dst8, src8, size_aligned); src8 += size_aligned; dst8 += size_aligned; #else //This is faster than the PSP's GCC memcpy //TODO: Profile for other plaforms to see if memcpy is faster u32* src32 = (u32*)src8; u32* dst32 = (u32*)dst8; u32 size32 = size >> 2; while (size32 & 0x3) { *dst32++ = *src32++; size32--; } u32 size128 = size32 >> 2; while (size128--) { dst32[0] = src32[0]; dst32[1] = src32[1]; dst32[2] = src32[2]; dst32[3] = src32[3]; src32 += 4; dst32 += 4; } src8 = (u8*)src32; dst8 = (u8*)dst32; #endif } else // We are now dst aligned and src unligned and >= 4 bytes to copy { u32* src32 = (u32*)((uintptr_t)src8 & ~0x3); u32* dst32 = (u32*)dst8; u32 srcTmp = *src32++; u32 dstTmp = 0; u32 size32 = size >> 2; switch( src_alignment ) { case 1: while(size32--) { dstTmp = srcTmp << 8; srcTmp = *src32++; *dst32++ = dstTmp | (srcTmp >> 24); } break; case 2: while(size32--) { dstTmp = srcTmp << 16; srcTmp = *src32++; *dst32++ = dstTmp | (srcTmp >> 16); } break; case 3: while(size32--) { dstTmp = srcTmp << 24; srcTmp = *src32++; *dst32++ = dstTmp | (srcTmp >> 8); } break; } src8 = (u8*)src32 - (4-src_alignment); dst8 = (u8*)dst32; } } } // Copy any remaining byte by byte... size &= 0x03; while(size--) { *(u8*)((uintptr_t)dst8++ ^ U8_TWIDDLE) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE); } } #ifdef PROFILE_MEMCPY void byteswap_copy( void* dst, const void* src, size_t size ) { u8* src8 = (u8*)src; u8* dst8 = (u8*)dst; while(size--) { *(u8*)((uintptr_t)dst8++ ^ U8_TWIDDLE) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE); } } static inline u64 GetCurrent() { u64 tick; NTiming::GetPreciseTime(&tick); return tick; } #define MEMCPY_TEST(d, s, n) { \ u32 _fast_memcpy_swizzle = 0; \ { \ u64 time; \ NTiming::GetPreciseTime(&time); \ for (u32 j=0; j<10000; ++j) \ memcpy_byteswap(d, s, n); \ _fast_memcpy_swizzle = (u32)(GetCurrent()-time); \ } \ u32 _copy_byteswap = 0; \ { \ u64 time = GetCurrent(); \ for (u32 j=0; j<10000; ++j) \ byteswap_copy(d, s, n); \ _copy_byteswap = (u32)(GetCurrent()-time); \ } \ printf("%ld bytes | BYTESWAP COPY %d | MEMCPY SWIZZLE %d\n", n, _copy_byteswap, _fast_memcpy_swizzle); \ } void memcpy_test( void * dst, const void * src, size_t size ) { MEMCPY_TEST(dst, src, size); } #endif // PROFILE_MEMCPY