Bone matrix loading speedup. Bit of a hack but seems reliable enough.

This commit is contained in:
Henrik Rydgard 2013-11-14 14:02:31 +01:00
parent 003cb41db5
commit 8c562a615f
3 changed files with 49 additions and 1 deletions

View file

@ -725,9 +725,22 @@ void GPUCommon::ExecuteOp(u32 op, u32 diff) {
case GE_CMD_CALL:
{
easy_guard guard(listLock);
// Saint Seiya needs correct support for relative calls.
u32 retval = currentList->pc + 4;
u32 target = gstate_c.getRelativeAddress(data);
// Bone matrix optimization - many games will CALL a bone matrix (!).
if ((Memory::ReadUnchecked_U32(target) >> 24) == GE_CMD_BONEMATRIXDATA) {
// Check for the end
if ((Memory::ReadUnchecked_U32(target + 11 * 4) >> 24) == GE_CMD_BONEMATRIXDATA &&
(Memory::ReadUnchecked_U32(target + 12 * 4) >> 24) == GE_CMD_RET) {
// Yep, pretty sure this is a bone matrix call.
gstate.FastLoadBoneMatrix(target);
break;
}
}
if (currentList->stackptr == ARRAY_SIZE(currentList->stack)) {
ERROR_LOG_REPORT(G3D, "CALL: Stack full!");
} else if (!Memory::IsValidAddress(target)) {

View file

@ -30,6 +30,10 @@
#include "Core/CoreParameter.h"
#include "Core/Config.h"
#include "Core/System.h"
#include "Core/MemMap.h"
#ifdef _M_SSE
#include <emmintrin.h>
#endif
// Global GE state instance.
// This must be aligned so that the matrices within are aligned:
// GPUgstate::FastLoadBoneMatrix uses aligned 16-byte SSE stores into boneMatrix
// when the destination index is a multiple of four.
GPUgstate MEMORY_ALIGNED16(gstate);
@ -175,6 +179,34 @@ void GPUgstate::Save(u32_le *ptr) {
memcpy(matrices, tgenMatrix, sizeof(tgenMatrix)); matrices += sizeof(tgenMatrix);
}
// Loads twelve consecutive bone matrix data words in one go.
//
// addr: emulated address of the first of twelve 32-bit command words. It is
// dereferenced without validation, so the caller is responsible for passing an
// address at which twelve words are readable.
//
// Each source word is shifted left by 8 (discarding its top byte, which holds
// the command id) and stored into boneMatrix starting at the index held in the
// low 7 bits of boneMatrixNumber. Afterwards boneMatrixNumber is advanced by
// twelve, wrapping at 7 bits, and rewritten with the GE_CMD_BONEMATRIXNUMBER
// command byte on top.
//
// The 7-bit index can address more slots than boneMatrix holds. Words that
// would land past the end of the array are dropped instead of being written
// out of bounds; the index still advances by twelve in that case.
void GPUgstate::FastLoadBoneMatrix(u32 addr) {
	const u32 *src = (const u32 *)Memory::GetPointerUnchecked(addr);
	const u32 num = boneMatrixNumber & 0x7F;
	// NOTE(review): relies on boneMatrix being a fixed-size array member so that
	// sizeof yields its element count - confirm against the struct declaration.
	const u32 capacity = (u32)(sizeof(boneMatrix) / sizeof(boneMatrix[0]));
	u32 *words = (u32 *)boneMatrix;

	if (num + 12 <= capacity) {
		// Common case: the whole matrix fits, so use the bulk path.
		u32 *dst = words + num;
#ifdef _M_SSE
		const __m128i row1 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
		const __m128i row2 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 4)), 8);
		const __m128i row3 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 8)), 8);
		if ((num & 0x3) == 0) {
			// Index is a multiple of four words (16 bytes), so aligned stores are
			// used; this depends on boneMatrix itself being 16-byte aligned.
			_mm_store_si128((__m128i *)dst, row1);
			_mm_store_si128((__m128i *)(dst + 4), row2);
			_mm_store_si128((__m128i *)(dst + 8), row3);
		} else {
			_mm_storeu_si128((__m128i *)dst, row1);
			_mm_storeu_si128((__m128i *)(dst + 4), row2);
			_mm_storeu_si128((__m128i *)(dst + 8), row3);
		}
#else
		for (int i = 0; i < 12; i++) {
			dst[i] = src[i] << 8;
		}
#endif
	} else {
		// The destination straddles or lies beyond the end of boneMatrix.
		// Store only the words that still fit so neighbouring state is not corrupted.
		for (u32 i = 0; i < 12; i++) {
			const u32 index = num + i;
			if (index < capacity) {
				words[index] = src[i] << 8;
			}
		}
	}

	// Update this instance rather than the global, matching the read above.
	boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | ((num + 12) & 0x7F);
}
void GPUgstate::Restore(u32_le *ptr) {
// Not sure what the first 10 values are, exactly, but these seem right.
gstate_c.vertexAddr = ptr[5];

View file

@ -395,7 +395,10 @@ struct GPUgstate
// Transfer height: the 10-bit field in bits 10..19 of transfersize, plus one.
int getTransferHeight() const {
	const int heightMinusOne = (transfersize >> 10) & 0x3FF;
	return heightMinusOne + 1;
}
// Returns 4 when bit 0 of transferstart is set, otherwise 2
// (presumably bytes per pixel - confirm against the transfer code).
int getTransferBpp() const {
	if (transferstart & 1) {
		return 4;
	}
	return 2;
}
// Real data in the context ends here
void FastLoadBoneMatrix(u32 addr);
// Real data in the context ends here
void Save(u32_le *ptr);
void Restore(u32_le *ptr);