// daedalus/Source/SysPSP/Utility/FastMemcpyPSP.cpp
/*
Copyright (C) 2009 Raphael
E-mail: raphael@fx-world.org
homepage: http://wordpress.fx-world.org
*/
#include "BuildOptions.h"
#include "Base/Types.h"
#include "Utility/FastMemcpy.h"
#include "Base/Types.h"
#include <string.h>
// Avoid using VFPU in our audio plugin, otherwise the ME will choke
//
//*****************************************************************************
//Uses VFPU if possible else normal memcpy() (original code by Alex? Modified by Corn)
//*****************************************************************************
//
void memcpy_vfpu( void* dst, const void* src, size_t size )
{
//Less than 16 bytes, or src and dst do not share the same 32-bit alignment offset
//(the copy loops below need src to become 4-byte aligned along with dst) -> fall back to memcpy()
if( ((u32)src&0x3) != ((u32)dst&0x3) || (size<16) )
{
memcpy( dst, src, size );
return;
}
u8* src8 = (u8*)src;
u8* dst8 = (u8*)dst;
// Align dst to 4 bytes or just resume if already done
while( ((u32)dst8&0x3)!=0 )
{
*dst8++ = *src8++;
size--;
}
u32 *dst32=(u32*)dst8;
u32 *src32=(u32*)src8;
// Align dst to 16 bytes or just resume if already done
while( ((u32)dst32&0xF)!=0 )
{
*dst32++ = *src32++;
size -= 4;
}
dst8=(u8*)dst32;
src8=(u8*)src32;
if( ((u32)src8&0xF)==0 ) //Both src and dst are 16byte aligned
{
while (size>63)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"lv.q c000, 0(%1)\n"
"lv.q c010, 16(%1)\n"
"lv.q c020, 32(%1)\n"
"lv.q c030, 48(%1)\n"
"sv.q c000, 0(%0)\n"
"sv.q c010, 16(%0)\n"
"sv.q c020, 32(%0)\n"
"sv.q c030, 48(%0)\n"
"addiu %2, %2, -64\n" //size -= 64;
"addiu %1, %1, 64\n" //dst8 += 64;
"addiu %0, %0, 64\n" //src8 += 64;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
while (size>15)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"lv.q c000, 0(%1)\n"
"sv.q c000, 0(%0)\n"
"addiu %2, %2, -16\n" //size -= 16;
"addiu %1, %1, 16\n" //dst8 += 16;
"addiu %0, %0, 16\n" //src8 += 16;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
}
else //src is only 4-byte aligned here (dst is 16-byte aligned) -> use unaligned VFPU loads (ulv.q)
{
while (size>63)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"ulv.q c000, 0(%1)\n"
"ulv.q c010, 16(%1)\n"
"ulv.q c020, 32(%1)\n"
"ulv.q c030, 48(%1)\n"
"sv.q c000, 0(%0)\n"
"sv.q c010, 16(%0)\n"
"sv.q c020, 32(%0)\n"
"sv.q c030, 48(%0)\n"
"addiu %2, %2, -64\n" //size -= 64;
"addiu %1, %1, 64\n" //dst8 += 64;
"addiu %0, %0, 64\n" //src8 += 64;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
while (size>15)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"ulv.q c000, 0(%1)\n"
"sv.q c000, 0(%0)\n"
"addiu %2, %2, -16\n" //size -= 16;
"addiu %1, %1, 16\n" //dst8 += 16;
"addiu %0, %0, 16\n" //src8 += 16;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
}
// Most copies are completed with the VFPU, so fast out
if (size == 0)
return;
dst32=(u32*)dst8;
src32=(u32*)src8;
//Copy remaining 32-bit words...
while( size>3 )
{
*dst32++ = *src32++;
size -= 4;
}
dst8=(u8*)dst32;
src8=(u8*)src32;
//Copy remaining bytes if any...
while( size>0 )
{
*dst8++ = *src8++;
size--;
}
}
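//*****************************************************************************
// Illustrative sketch only (not part of the original source): a hypothetical
// self-test that compares memcpy_vfpu() against the libc memcpy() across a few
// source offsets and lengths, so both the aligned (lv.q) and unaligned (ulv.q)
// paths above get exercised. It assumes a PSP build, since the inline VFPU
// assembly does not compile for other targets, and is fenced with #if 0 so it
// has no effect on the normal build.
//*****************************************************************************
#if 0
#include <stdio.h>
static void memcpy_vfpu_selftest()	// hypothetical helper, not in the original source
{
	static u8 src[256] __attribute__((aligned(16)));
	static u8 ref[256] __attribute__((aligned(16)));
	static u8 out[256] __attribute__((aligned(16)));

	for( u32 i = 0; i < sizeof(src); i++ ) src[i] = (u8)i;

	// Vary the source offset (0..3) and the length to hit the VFPU loops as
	// well as the scalar head/tail copies and the memcpy() fallback.
	for( u32 offset = 0; offset < 4; offset++ )
	{
		for( u32 len = 1; len + offset <= sizeof(src); len += 13 )
		{
			memset( ref, 0, sizeof(ref) );
			memset( out, 0, sizeof(out) );
			memcpy( ref, src + offset, len );
			memcpy_vfpu( out, src + offset, len );
			if( memcmp( ref, out, sizeof(ref) ) != 0 )
				printf( "memcpy_vfpu mismatch: offset %d len %d\n", (int)offset, (int)len );
		}
	}
}
#endif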
//*****************************************************************************
//Swizzled (byteswapped) memcpy, uses VFPU if possible else scalar copy loops //Corn
//*****************************************************************************
void memcpy_vfpu_byteswap( void* dst, const void* src, size_t size )
{
u8* src8 = (u8*)src;
u8* dst8 = (u8*)dst;
u32 *dst32;
u32 *src32;
// < 4 isn't worth trying any optimisations...
if( size>=4 )
{
// Align dst on 4 bytes or just resume if already done
while( ((u32)dst8&0x3)!=0 )
{
*(u8*)((u32)dst8++ ^ 3) = *(u8*)((u32)src8++ ^ 3);
size--;
}
// if < 4 it's not possible to optimize...
if( size>=4 )
{
// src has to be at least 32bit aligned to try any optimisations...
if( (((u32)src8&0x3) == 0) )
{
// Need at least 16 bytes at this point, otherwise the VFPU optimisations aren't worth trying...
if (size>15)
{
src32 = (u32*)src8;
dst32 = (u32*)dst8;
// Align dst to 16 bytes to gain from vfpu aligned stores or just resume if already done
while( ((u32)dst32&0xF)!=0 )
{
*dst32++ = *src32++;
size -= 4;
}
src8 = (u8*)src32;
dst8 = (u8*)dst32;
if( ((u32)src8&0xF)==0 ) //Both src and dst are 16byte aligned
{
while (size>63)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"lv.q c000, 0(%1)\n"
"lv.q c010, 16(%1)\n"
"lv.q c020, 32(%1)\n"
"lv.q c030, 48(%1)\n"
"sv.q c000, 0(%0)\n"
"sv.q c010, 16(%0)\n"
"sv.q c020, 32(%0)\n"
"sv.q c030, 48(%0)\n"
"addiu %2, %2, -64\n" //size -= 64;
"addiu %1, %1, 64\n" //dst8 += 64;
"addiu %0, %0, 64\n" //src8 += 64;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
while (size>15)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"lv.q c000, 0(%1)\n"
"sv.q c000, 0(%0)\n"
"addiu %2, %2, -16\n" //size -= 16;
"addiu %1, %1, 16\n" //dst8 += 16;
"addiu %0, %0, 16\n" //src8 += 16;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
}
else //src is only 4-byte aligned here (dst is 16-byte aligned) -> use unaligned VFPU loads (ulv.q)
{
while (size>63)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"ulv.q c000, 0(%1)\n"
"ulv.q c010, 16(%1)\n"
"ulv.q c020, 32(%1)\n"
"ulv.q c030, 48(%1)\n"
"sv.q c000, 0(%0)\n"
"sv.q c010, 16(%0)\n"
"sv.q c020, 32(%0)\n"
"sv.q c030, 48(%0)\n"
"addiu %2, %2, -64\n" //size -= 64;
"addiu %1, %1, 64\n" //dst8 += 64;
"addiu %0, %0, 64\n" //src8 += 64;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
while (size>15)
{
asm(".set push\n" // save assembler option
".set noreorder\n" // suppress reordering
"ulv.q c000, 0(%1)\n"
"sv.q c000, 0(%0)\n"
"addiu %2, %2, -16\n" //size -= 16;
"addiu %1, %1, 16\n" //dst8 += 16;
"addiu %0, %0, 16\n" //src8 += 16;
".set pop\n" // restore assembler option
:"+r"(dst8),"+r"(src8),"+r"(size)
:
:"memory"
);
}
}
// Most copies are completed with the VFPU, so fast out
if (size == 0)
return;
}
// Copy the remaining 32-bit words...
src32 = (u32*)src8;
dst32 = (u32*)dst8;
while(size>3)
{
*dst32++ = *src32++;
size -= 4;
}
src8 = (u8*)src32;
dst8 = (u8*)dst32;
}
else
{
//We now have dst 4-byte aligned, src unaligned, and >= 4 bytes to copy
dst32 = (u32*)dst8;
src32 = (u32*)((u32)src8 & ~0x3);
u32 srcTmp {*src32++};
u32 dstTmp {0};
u32 size32 {size >> 2}; // Size in 32-bit words (avoids a sltiu in the loop)
size &= 0x3; // Update remaining bytes if any..
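// What follows is the classic unaligned-source word copy: keep the last aligned
// word loaded in srcTmp, shift it into place, load the next aligned word and OR
// the two parts together. The source is therefore only ever read with aligned
// 32-bit loads, which matters because unaligned word loads fault on the PSP's
// Allegrex CPU.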
switch( (u32)src8&0x3 )
{
/*case 0: //src is aligned too
src32 = (u32*)src8;
while(size32--)
{
*dst32++ = *src32++;
}
break;*/
case 1:
{
while(size32--)
{
dstTmp = srcTmp << 8;
srcTmp = *src32++;
dstTmp |= srcTmp >> 24;
*dst32++ = dstTmp;
}
src8 = (u8*)src32 - 3;
}
break;
case 2:
{
while(size32--)
{
dstTmp = srcTmp << 16;
srcTmp = *src32++;
dstTmp |= srcTmp >> 16;
*dst32++ = dstTmp;
}
src8 = (u8*)src32 - 2;
}
break;
case 3:
{
while(size32--)
{
dstTmp = srcTmp << 24;
srcTmp = *src32++;
dstTmp |= srcTmp >> 8;
*dst32++ = dstTmp;
}
src8 = (u8*)src32 - 1;
}
break;
}
dst8 = (u8*)dst32;
}
}
}
// Copy the remaining bytes if any...
while (size--)
{
*(u8*)((u32)dst8++ ^ 3) = *(u8*)((u32)src8++ ^ 3);
}
}
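//*****************************************************************************
// Illustrative sketch only (not part of the original source): a hypothetical
// self-test for memcpy_vfpu_byteswap(), checked against a plain byte loop that
// uses the same ^3 addressing, which is the effective definition of the
// swizzled copy above. Same assumptions as the sketch after memcpy_vfpu():
// PSP build only, fenced with #if 0 so it does not affect the normal build.
//*****************************************************************************
#if 0
#include <stdio.h>
static void memcpy_vfpu_byteswap_selftest()	// hypothetical helper, not in the original source
{
	static u8 src[256] __attribute__((aligned(16)));
	static u8 ref[256] __attribute__((aligned(16)));
	static u8 out[256] __attribute__((aligned(16)));

	for( u32 i = 0; i < sizeof(src); i++ ) src[i] = (u8)(i * 7);

	for( u32 offset = 0; offset < 4; offset++ )
	{
		for( u32 len = 1; len + offset <= sizeof(src) - 4; len += 13 )
		{
			memset( ref, 0, sizeof(ref) );
			memset( out, 0, sizeof(out) );
			// Reference: byte-by-byte copy with the byteswapped (^3) addressing
			for( u32 i = 0; i < len; i++ )
				*(u8*)(((u32)ref + i) ^ 3) = *(u8*)(((u32)(src + offset) + i) ^ 3);
			memcpy_vfpu_byteswap( out, src + offset, len );
			if( memcmp( ref, out, sizeof(ref) ) != 0 )
				printf( "memcpy_vfpu_byteswap mismatch: offset %d len %d\n", (int)offset, (int)len );
		}
	}
}
#endif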