daedalus/Source/HLEGraphics/RDPStateManager.cpp
2019-03-23 20:24:40 -04:00

710 lines
18 KiB
C++

/*
Copyright (C) 2006,2007 StrmnNrmn
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "stdafx.h"
#include "RDPStateManager.h"
#include "DLDebug.h"
#include "Core/Memory.h"
#include "Core/ROM.h"
#include "Debug/DBGConsole.h"
#include "HLEGraphics/uCodes/UcodeDefs.h"
#include "Math/MathUtil.h"
#include "OSHLE/ultra_gbi.h"
#include "Utility/Alignment.h"
#include "Utility/Endian.h"
#include "Utility/FastMemcpy.h"
#include "Utility/Macros.h"
extern SImageDescriptor g_TI; //Texture data from Timg ucode
CRDPStateManager gRDPStateManager;
static const char * const kTLUTTypeName[] = {"None", "?", "RGBA16", "IA16"};
RDP_OtherMode gRDPOtherMode;
#define MAX_TMEM_ADDRESS 4096
//Granularity down to 24bytes is good enough; we also only need to address the upper half of TMEM for palettes //Corn
u32* gTlutLoadAddresses[ MAX_TMEM_ADDRESS >> 6 ];
#ifdef DAEDALUS_ACCURATE_TMEM
ALIGNED_GLOBAL(u8, gTMEM[ MAX_TMEM_ADDRESS ], 16); // 4Kb
#endif
#ifdef DAEDALUS_ACCURATE_TMEM
// CopyLineQwords** passes in an offset in 32bit words, (i.e. does mem_address/4)
// So it's aligned by definition, we have asserts in place just in case!
// Due to how TMEM is organized, the last 3 bits in the address are always "0"
// It should be safe to assume copies will always be in qwords
#define FAST_TMEM_COPY
static inline void CopyLineQwords(void * dst, const void * src, u32 qwords)
{
#ifdef FAST_TMEM_COPY
u32* src32 = (u32*)src;
u32* dst32 = (u32*)dst;
DAEDALUS_ASSERT( ((uintptr_t)src32&0x3)==0, "src is not aligned!");
while(qwords--)
{
dst32[0] = BSWAP32(src32[0]);
dst32[1] = BSWAP32(src32[1]);
dst32 += 2;
src32 += 2;
}
#else
u8* src8 = (u8*)src;
u8* dst8 = (u8*)dst;
u32 bytes = qwords * 8;
while(bytes--)
{
*dst8++ = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE);
}
#endif
}
static void CopyLineQwordsSwap(void * dst, const void * src, u32 qwords)
{
#ifdef FAST_TMEM_COPY
u32* src32 {(u32*)src};
u32* dst32 {(u32*)dst};
DAEDALUS_ASSERT( ((uintptr_t)src32&0x3 )==0, "src is not aligned!");
while(qwords--)
{
dst32[1] = BSWAP32(src32[0]);
dst32[0] = BSWAP32(src32[1]);
dst32 += 2;
src32 += 2;
}
#else
u8* src8 {(u8*)src};
u8* dst8 {(u8*)dst};
u32 bytes {qwords * 8};
while(bytes--)
{
*(u8*)((uintptr_t)dst8++ ^ 0x4) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE);
}
#endif
}
static void CopyLineQwordsSwap32(void * dst, const void * src, u32 qwords)
{
#ifdef FAST_TMEM_COPY
u32* src32 {(u32*)src};
u32* dst32 {(u32*)dst};
DAEDALUS_ASSERT( ((uintptr_t)src32&0x3 )==0, "src is not aligned!");
u32 size128 {qwords >>1};
while(size128--)
{
dst32[2] = BSWAP32(src32[0]);
dst32[3] = BSWAP32(src32[1]);
dst32[0] = BSWAP32(src32[2]);
dst32[1] = BSWAP32(src32[3]);
dst32 += 4;
src32 += 4;
}
// Copy any remaining quadword
qwords&=0x1;
while(qwords--)
{
*(u32*)((uintptr_t)dst32++ ^ 0x8) = BSWAP32(src32[0]);
*(u32*)((uintptr_t)dst32++ ^ 0x8) = BSWAP32(src32[1]);
src32+=2;
}
#else
u8* src8 {(u8*)src};
u8* dst8 {(u8*)dst};
u32 bytes {qwords * 8};
while(bytes--)
{
*(u8*)((uintptr_t)dst8++ ^ 0x8) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE);
}
#endif
}
// Copies `bytes` bytes from `src` to `dst` without any word swapping.
//
// dst   - destination buffer; written as 32-bit words on the fast path.
// src   - source buffer; may be unaligned (handled below on the fast path).
// bytes - number of bytes to copy; the fast path asserts it is a multiple of 4
//         and copies bytes/4 words.
//
// Fast path: every 32-bit word is passed through BSWAP32. When `src` is not
// 4-byte aligned, each output word is assembled from two neighbouring aligned
// source words before the swap.
// Slow path: bytes are fetched one at a time from the source address XOR U8_TWIDDLE.
static inline void CopyLine(void * dst, const void * src, u32 bytes)
{
#ifdef FAST_TMEM_COPY
    u32* src32 {(u32*)src};
    u32* dst32 {(u32*)dst};

    DAEDALUS_ASSERT((bytes&0x3)==0, "CopyLine: Remaning bytes! (%d)",bytes);

    u32 size32 {bytes >> 2};

    // The explicit cast matters: brace-initialising a u32 straight from a
    // uintptr_t expression is a narrowing conversion where pointers are
    // wider than 32 bits.
    const u32 src_alignment {(u32)((uintptr_t)src32 & 0x3)};

    if (src_alignment == 0)
    {
        while (size32--)
        {
            *dst32++ = BSWAP32(src32[0]);
            src32++;
        }
    }
    else
    {
        // Zelda and DK64 have unaligned copies, so let's optimize 'em.
        // Round the source down to the containing aligned word and stitch
        // each output word together from two consecutive aligned reads.
        src32 = (u32*)((uintptr_t)src & ~0x3);
        u32 src_tmp {*src32++};

        // src_alignment is 1..3 here, so both shifts are 8, 16 or 24 bits.
        const u32 lshift {src_alignment << 3};
        const u32 rshift {(4 - src_alignment) << 3};

        while (size32--)
        {
            u32 dst_tmp {src_tmp << lshift};
            src_tmp = *src32++;
            dst_tmp |= src_tmp >> rshift;
            *dst32++ = BSWAP32(dst_tmp);
        }
    }
#else
    u8* src8 {(u8*)src};
    u8* dst8 {(u8*)dst};
    while (bytes--)
    {
        *dst8++ = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE);
    }
#endif
}
// Copies `words` 16-bit values from `src16` to `dst16`, placing each value
// four u16 slots (8 bytes) after the previous one in the destination.
// Fast path: source values are consumed in pairs, with the two members of a
// pair read in exchanged order; an odd trailing value falls through to the
// generic loop. The generic loop reads each value from the source address
// XOR U16_TWIDDLE.
static inline void CopyLine16(u16 * dst16, const u16 * src16, u32 words)
{
#ifdef FAST_TMEM_COPY
    DAEDALUS_ASSERT( ((uintptr_t)src16&0x1 )==0, "src is not aligned!");

    for (u32 pairs = words >> 1; pairs != 0; --pairs)
    {
        dst16[0] = src16[1];
        dst16[4] = src16[0];
        src16 += 2;
        dst16 += 8;
    }

    // Only a possible odd value is left for the loop below.
    words &= 0x1;
#endif

    for (; words != 0; --words)
    {
        *dst16 = *(u16*)((uintptr_t)src16++ ^ U16_TWIDDLE);
        dst16 += 4;
    }
}
static inline void CopyLineSwap(void * dst, const void * src, u32 bytes)
{
u32* src32 {(u32*)src};
u32* dst32 {(u32*)dst};
#ifdef FAST_TMEM_COPY
DAEDALUS_ASSERT((bytes&0x7)==0, "CopyLineSwap: Remaning bytes! (%d)",bytes);
if( ((uintptr_t)src32&0x3 )==0)
{
u32 size64 {bytes >> 3};
while (size64--)
{
dst32[0] = BSWAP32(src32[1]);
dst32[1] = BSWAP32(src32[0]);
dst32 += 2;
src32 += 2;
}
}
else
#endif
{
// Optimize me: Bomberman, Zelda, and Quest 64 have unaligned copies here
//DBGConsole_Msg(0, "[WWarning CopyLineSwap: Performing slow copy]" );
u8* src8 {(u8*)src32};
u8* dst8 {(u8*)dst32};
while(bytes--)
{
// Alternate 32 bit words are swapped
*(u8*)((uintptr_t)dst8++ ^ 0x4) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE);
}
}
}
// Copies `bytes` bytes from `src` to `dst`, exchanging the two 64-bit halves
// of every 128-bit group.
//
// dst   - destination buffer.
// src   - source buffer; the fast path is taken only when it is 4-byte aligned.
// bytes - number of bytes to copy; the fast path asserts it is a multiple of 8.
//
// Fast path: whole 16-byte groups are copied with their halves exchanged and
// each 32-bit word passed through BSWAP32. If a single quadword (8 bytes)
// remains, it is written at the destination address XOR 0x8.
// Slow path: byte-wise copy with the destination address XOR 0x8 and the
// source address XOR U8_TWIDDLE.
static inline void CopyLineSwap32(void * dst, const void * src, u32 bytes)
{
    u32* src32 {(u32*)(src)};
    u32* dst32 {(u32*)(dst)};

#ifdef FAST_TMEM_COPY
    DAEDALUS_ASSERT((bytes&0x7)==0, "CopyLineSwap32: Remaning bytes! (%d)",bytes);

    if ( ((uintptr_t)src32&0x3 )==0)
    {
        u32 size128 {bytes >> 4};
        while (size128--)
        {
            dst32[0] = BSWAP32(src32[2]);
            dst32[1] = BSWAP32(src32[3]);
            dst32[2] = BSWAP32(src32[0]);
            dst32[3] = BSWAP32(src32[1]);
            dst32 += 4;
            src32 += 4;
        }

        // Copy the remaining quadword, if there is one. After the 16-byte
        // groups at most 8 whole bytes are left, i.e. exactly one quadword.
        // The remainder is a byte count and must not be used directly as the
        // trip count of a loop that moves 8 bytes per pass: doing so copied
        // eight quadwords and ran past the end of both buffers.
        if (bytes & 0x8)
        {
            *(u32*)((uintptr_t)(dst32 + 0) ^ 0x8) = BSWAP32(src32[0]);
            *(u32*)((uintptr_t)(dst32 + 1) ^ 0x8) = BSWAP32(src32[1]);
        }
    }
    else
#endif
    {
        // Have yet to see game with unaligned copies here
        u8* src8 {(u8*)src32};
        u8* dst8 {(u8*)dst32};
        while (bytes--)
        {
            // Alternate 64 bit words are swapped
            *(u8*)((uintptr_t)dst8++ ^ 0x8) = *(u8*)((uintptr_t)src8++ ^ U8_TWIDDLE);
        }
    }
}
#endif
// Constructs the state manager with EmulateMirror set to true, then calls
// ClearAllEntries() and InvalidateAllTileTextureInfo().
// NOTE(review): unlike Reset(), this does not zero mTiles, mTileSizes or
// mTileTextureInfo — presumably it relies on static zero-initialisation of the
// global gRDPStateManager instance; confirm if other instances are ever created.
CRDPStateManager::CRDPStateManager()
: EmulateMirror(true)
{
ClearAllEntries();
InvalidateAllTileTextureInfo();
}
// Destructor: the body is empty, nothing is released explicitly here.
CRDPStateManager::~CRDPStateManager()
{
}
// Returns the manager to its initial state: runs the same two calls as the
// constructor (ClearAllEntries, InvalidateAllTileTextureInfo) and then
// zero-fills the tile descriptors, tile sizes and cached texture infos.
void CRDPStateManager::Reset()
{
    ClearAllEntries();
    InvalidateAllTileTextureInfo();

    // Byte-wise zero of the three per-tile arrays.
    memset(mTiles,           0, sizeof mTiles);
    memset(mTileSizes,       0, sizeof mTileSizes);
    memset(mTileTextureInfo, 0, sizeof mTileTextureInfo);
}
void CRDPStateManager::SetTile( const RDP_Tile & tile )
{
u32 idx {tile.tile_idx};
if( mTiles[ idx ] != tile )
{
mTiles[ idx ] = tile;
mTileTextureInfoValid[ idx ] = false;
}
}
void CRDPStateManager::SetTileSize( const RDP_TileSize & tile_size )
{
u32 idx {tile_size.tile_idx};
if( mTileSizes[ idx ] != tile_size )
{
mTileSizes[ idx ] = tile_size;
mTileTextureInfoValid[ idx ] = false;
}
}
// Handles a LoadBlock command.
//
// Always: invalidates every cached tile texture info, clears the load-info
// entries from the tile's TMEM slot onwards, and records the RAM address of
// the loaded data for that slot with Pitch = ~0 (the marker that
// GetUpdatedTextureDescriptor treats as "block load, derive pitch from the tile").
//
// With DAEDALUS_ACCURATE_TMEM: additionally copies the data from RDRAM into
// gTMEM, word-swapping alternate lines when DXT is non-zero.
//
// load - the decoded command; this function reads sl/tl (upper-left), sh
//        (last texel index), th (DXT) and tile (destination tile index).
void CRDPStateManager::LoadBlock(const SetLoadTile & load)
{
u32 uls {load.sl}; //Left
u32 ult {load.tl}; //Top
u32 dxt {load.th}; // 1.11 fixed point
u32 tile_idx {load.tile};
u32 address {g_TI.GetAddress( uls, ult )};
//u32 ByteSize = (load.sh + 1) << (g_TI.Size == G_IM_SIZ_32b);
// The load is recorded as "swapped" exactly when DXT is zero.
bool swapped = (dxt) ? false : true;
#ifdef DAEDALUS_DEBUG_DISPLAYLIST
DL_PF(" Tile[%d] (%d,%d - %d) DXT[0x%04x] = [%d]Bytes/line => [%d]Pixels/line Address[0x%08x]",
tile_idx,
uls, ult,
load.sh,
dxt,
(g_TI.Width << g_TI.Size >> 1),
bytes2pixels( (g_TI.Width << g_TI.Size >> 1), g_TI.Size ),
address);
#endif
InvalidateAllTileTextureInfo(); // Can potentially invalidate all texture infos
const RDP_Tile & rdp_tile {mTiles[tile_idx]};
// Load-info table is indexed by the tile's TMEM address divided by 16.
u32 tmem_lookup {(u32)(rdp_tile.tmem >> 4)};
//Invalidate load info after current TMEM address to the end of TMEM (fixes Fzero and SSV) //Corn
ClearEntries( tmem_lookup );
SetValidEntry( tmem_lookup );
TimgLoadDetails & info = mTmemLoadInfo[ tmem_lookup ];
info.Address = address;
// ~0 marks this entry as a block load; the real pitch is derived later from the tile's line value.
info.Pitch = ~0;
info.Swapped = swapped;
#ifdef DAEDALUS_ACCURATE_TMEM
u32 lrs {load.sh};
// (lrs + 1) texels scaled by g_TI.Size and halved gives the byte count.
u32 bytes {((lrs+1) << g_TI.Size) >> 1};
DAEDALUS_DL_ASSERT( bytes <= 4096, "Suspiciously large loadblock: %d bytes", bytes );
DAEDALUS_DL_ASSERT( bytes, "LoadBLock: No bytes??" );
// Round up to whole 8-byte quadwords; the copy helpers work in quadwords.
u32 qwords {(bytes+7) / 8};
// rdp_tile.tmem counts 8-byte units, so shift by 3 for a byte offset into gTMEM.
// NOTE(review): tmem_lookup above carries an explicit (u32) cast but this initialiser does not — confirm it is not a narrowing brace-init.
u32 tmem_offset {(rdp_tile.tmem << 3)};
u32 ram_offset {address};
// Reject loads that would run past the end of RDRAM or TMEM. The load info recorded above is kept either way.
if (( (address + bytes) > MAX_RAM_ADDRESS) || (tmem_offset + bytes) > MAX_TMEM_ADDRESS )
{
DBGConsole_Msg(0, "[WWarning LoadBlock address is invalid]" );
return;
}
u32* dst {(u32*)(gTMEM + tmem_offset)};
u32* src {(u32*)(g_pu8RamBase + ram_offset)};
if (dxt == 0)
{
// No line structure: copy everything straight through.
CopyLineQwords(dst, src, qwords);
}
else
{
// Pick the swap flavour used for odd lines: 64-bit halves for 32bpp, 32-bit halves otherwise.
void (*CopyLineQwordsMode)(void*, const void*, u32);
if(g_TI.Size == G_IM_SIZ_32b)
CopyLineQwordsMode = CopyLineQwordsSwap32;
else
CopyLineQwordsMode = CopyLineQwordsSwap;
// Integer ceil(2048 / dxt): number of quadwords that make up one line.
u32 qwords_per_line {(2048 + dxt-1) / dxt};
DAEDALUS_ASSERT(qwords_per_line == (u32)ceilf(2048.f / (float)dxt), "Broken DXT calc");
u32 odd_row {};
for (u32 i {}; i < qwords; /* updated in loop */)
{
// The final line may be shorter than a full line.
u32 qwords_to_copy {Min(qwords-i, qwords_per_line)};
if (odd_row)
{
CopyLineQwordsMode(dst, src, qwords_to_copy);
}
else
{
CopyLineQwords(dst, src, qwords_to_copy);
}
i += qwords_to_copy;
// dst and src are u32 pointers, so advance by twice the quadword count.
qwords_to_copy *= 2; // 2 32bit words per qword
dst += qwords_to_copy;
src += qwords_to_copy;
odd_row ^= 0x1; // Odd lines are word swapped
}
}
//InvalidateTileHashes();
#endif // DAEDALUS_ACCURATE_TMEM
}
// Handles a LoadTile command.
//
// Always: invalidates every cached tile texture info and records, for the
// tile's TMEM slot, the RAM address and pitch of the loaded image with
// Swapped = false.
//
// With DAEDALUS_ACCURATE_TMEM: additionally copies the tile from RDRAM into
// gTMEM one row at a time, using the word-swapping copy on odd rows.
//
// load - the decoded command; sl/tl are the upper-left and sh/th the
//        lower-right coordinates, each divided by 4 before use, and tile
//        selects the destination tile descriptor.
void CRDPStateManager::LoadTile(const SetLoadTile & load)
{
    u32 uls {load.sl}; //Left
    u32 ult {load.tl}; //Top
    u32 tile_idx {load.tile};
    u32 address {g_TI.GetAddress( uls / 4, ult / 4 )};
    u32 pitch {g_TI.GetPitch()};

#ifdef DAEDALUS_DEBUG_DISPLAYLIST
    DL_PF(" Tile[%d] (%d,%d)->(%d,%d) [%d x %d] Address[0x%08x]",
        tile_idx,
        load.sl / 4, load.tl / 4, load.sh / 4 + 1, load.th / 4 + 1,
        (load.sh - load.sl) / 4 + 1, (load.th - load.tl) / 4 + 1,
        address);
#endif

    InvalidateAllTileTextureInfo(); // Can potentially invalidate all texture infos

    const RDP_Tile & rdp_tile = mTiles[tile_idx];

#ifdef DAEDALUS_DEBUG_DISPLAYLIST
    DAEDALUS_DL_ASSERT( (rdp_tile.size > 0) || (uls & 4) == 0, "Expecting an even Left for 4bpp formats (left is %f)", uls / 4.f );
#endif

    // Load-info table is indexed by the tile's TMEM address divided by 16.
    u32 tmem_lookup {(u32)(rdp_tile.tmem >> 4)};
    SetValidEntry( tmem_lookup );

    TimgLoadDetails & info = mTmemLoadInfo[ tmem_lookup ];
    info.Address = address;
    info.Pitch = pitch;
    info.Swapped = false;

#ifdef DAEDALUS_ACCURATE_TMEM
    u32 lrs {load.sh};
    u32 lrt {load.th};
    u32 h {((lrt-ult)>>2) + 1};
    u32 w {((lrs-uls)>>2) + 1};
    u32 bytes {((h * w) << g_TI.Size) >> 1};

#ifdef DAEDALUS_DEBUG_DISPLAYLIST
    DAEDALUS_USE(bytes);
    DAEDALUS_DL_ASSERT( bytes <= MAX_TMEM_ADDRESS,
        "Suspiciously large texture load: %d bytes (%dx%d, %dbpp)",
        bytes, w, h, (1<<(g_TI.Size+2)) );
#endif

    // rdp_tile.tmem and rdp_tile.line count 8-byte units; shift by 3 for bytes.
    // The explicit casts mirror tmem_lookup above so the brace-initialisers
    // cannot be rejected as narrowing, however the fields promote.
    u32 tmem_offset {(u32)(rdp_tile.tmem << 3)};
    u32 ram_offset {address};
    u32 bytes_per_tmem_line {(u32)(rdp_tile.line << 3)};

    // Odd rows use a swapping copy: 64-bit halves for 32bpp (whose lines are
    // also twice as wide), 32-bit halves otherwise.
    void (*CopyLineMode)(void*, const void*, u32);
    if (g_TI.Size == G_IM_SIZ_32b)
    {
        bytes_per_tmem_line *= 2;
        CopyLineMode = CopyLineSwap32;
    }
    else
    {
        CopyLineMode = CopyLineSwap;
    }

    u32 bytes_to_copy {(bytes_per_tmem_line * h)};

    // Reject loads that would run past the end of RDRAM or TMEM. The load
    // info recorded above is kept either way.
    // NOTE(review): the source pointer advances by `pitch` per row, not by
    // bytes_per_tmem_line, so this RAM check may under-estimate the furthest
    // byte read when pitch is larger — confirm whether that matters.
    if ((address + bytes_to_copy) > MAX_RAM_ADDRESS || (tmem_offset + bytes_to_copy) > MAX_TMEM_ADDRESS)
    {
        DBGConsole_Msg(0, "[WWarning LoadTile address is invalid]" );
        return;
    }

    u8* dst {gTMEM + tmem_offset};
    u8* src {g_pu8RamBase + ram_offset};

    for (u32 y {}; y < h; ++y)
    {
        if (y&1)
        {
            CopyLineMode(dst, src, bytes_per_tmem_line);
        }
        else
        {
            CopyLine(dst, src, bytes_per_tmem_line);
        }

        // There might be uninitialised padding bytes here, but we don't care.
        dst += bytes_per_tmem_line;
        src += pitch;
    }
    //InvalidateTileHashes();
#endif // DAEDALUS_ACCURATE_TMEM
}
void CRDPStateManager::LoadTlut(const SetLoadTile & load)
{
// Tlut fmt is sometimes wrong (in 007) and is set after tlut load, but before tile load
// Format is always 16bpp - RGBA16 or IA16:
//DAEDALUS_DL_ASSERT(g_TI.Size == G_IM_SIZ_16b, "Crazy tlut load - not 16bpp");
u32 uls {load.sl}; //Left
u32 ult {load.tl}; //Top
u32 lrs {load.sh}; //Right
u32 lrt {load.th}; //Bottom
u32 tile_idx {load.tile};
u32 ram_offset {g_TI.GetAddress16bpp(uls >> 2, ult >> 2)};
u8* address {g_pu8RamBase + ram_offset};
const RDP_Tile & rdp_tile {mTiles[tile_idx]};
u32 count {((lrs - uls)>>2) + 1};
#ifdef DAEDALUS_DEBUG_DISPLAYLIST
DAEDALUS_USE(count);
DAEDALUS_USE(lrt);
#endif
//Store address of PAL (assuming PAL is only stored in upper half of TMEM) //Corn
gTlutLoadAddresses[ (rdp_tile.tmem>>2) & 0x3F ] = (u32*)address;
#ifdef DAEDALUS_DEBUG_DISPLAYLIST
DL_PF(" TLut Addr[0x%08x] TMEM[0x%03x] Tile[%d] Count[%d] Format[%s] (%d,%d)->(%d,%d)",
address, rdp_tile.tmem, tile_idx, count, kTLUTTypeName[gRDPOtherMode.text_tlut], uls >> 2, ult >> 2, lrs >> 2, lrt >> 2);
#endif
#ifdef DAEDALUS_ACCURATE_TMEM
DAEDALUS_DL_ASSERT( (rdp_tile.tmem + count) <= (MAX_TMEM_ADDRESS/8), "LoadTlut address is invalid" );
u16* dst {(u16*)(((u64*)gTMEM) + rdp_tile.tmem)};
u16* src {(u16*)(address)};
CopyLine16(dst, src, count);
#endif
}
// Limit the tile's width/height to the number of bits specified by mask_s/t.
// See the detailed notes in BaseRenderer::UpdateTileSnapshots for issues relating to this.
//
// tile_dimension - width or height taken from the tile size.
// mask           - number of mask bits; 0 means no mask is applied.
// clamp          - whether clamping is enabled on this axis.
//
// Returns tile_dimension when mask is 0; otherwise (1 << mask), further
// limited to tile_dimension when clamp is set.
static inline u16 GetTextureDimension( u16 tile_dimension, u8 mask, bool clamp )
{
    if (mask == 0)
    {
        return tile_dimension;
    }

    const u16 mask_dimension = (u16)(1 << mask);

    if (!clamp)
    {
        return mask_dimension;
    }

    // With clamp enabled, the maximum addressable texel is the smaller of
    // the mask dimension and the tile dimension.
    return Min< u16 >( mask_dimension, tile_dimension );
}
// Returns the TextureInfo for tile `idx`, rebuilding it first if it has been
// marked invalid.
//
// The rebuild combines the tile descriptor (mTiles), the tile size
// (mTileSizes), the recorded load details for the tile's TMEM slot
// (mTmemLoadInfo) and the current TLUT mode (gRDPOtherMode.text_tlut), then
// marks the cached entry valid.
//
// idx - tile index; asserted to be within mTileTextureInfoValid.
// Returns a reference to the cached entry in mTileTextureInfo.
const TextureInfo & CRDPStateManager::GetUpdatedTextureDescriptor( u32 idx )
{
DAEDALUS_ASSERT( idx < ARRAYSIZE( mTileTextureInfoValid ), "Invalid index %d", idx );
if( !mTileTextureInfoValid[ idx ] )
{
TextureInfo & ti {mTileTextureInfo[ idx ]};
const RDP_Tile & rdp_tile {mTiles[ idx ]};
const RDP_TileSize & rdp_tilesize {mTileSizes[ idx ]};
// Load-info table is indexed by the tile's TMEM address divided by 16.
const u32 tmem_lookup {(u32)(rdp_tile.tmem >> 4)};
const TimgLoadDetails & info {mTmemLoadInfo[ tmem_lookup ]};
u32 address {info.Address};
u32 pitch {info.Pitch};
bool swapped {info.Swapped};
//Check if tmem_lookup has a valid entry, if not we assume load was done on TMEM[0] and we add the offset //Corn
//Games that uses this is Fzero/Space station Silicon Valley/Animal crossing.
if( EntryIsValid( tmem_lookup ) == 0 )
{
const TimgLoadDetails & info_base {mTmemLoadInfo[ 0 ]};
//Calculate offset in bytes and add to base address
address = info_base.Address + (rdp_tile.tmem << 3);
pitch = info_base.Pitch;
swapped = info_base.Swapped;
}
// If it was a Block Load - the pitch is determined by the tile size.
// Else if it was a Tile Load - the pitch is set when the tile is loaded.
// (LoadBlock stores ~0 in Pitch as the block-load marker.)
if ( pitch == u32(~0) )
{
if( rdp_tile.size == G_IM_SIZ_32b ) pitch = rdp_tile.line << 4;
else pitch = rdp_tile.line << 3;
}
// Limit the tile's width/height to the number of bits specified by mask_s/t.
// See the detailed notes in BaseRenderer::UpdateTileSnapshots for issues relating to this.
//
u16 tile_width {GetTextureDimension( rdp_tilesize.GetWidth(), rdp_tile.mask_s, rdp_tile.clamp_s )};
u16 tile_height {GetTextureDimension( rdp_tilesize.GetHeight(), rdp_tile.mask_t, rdp_tile.clamp_t )};
#ifdef DAEDALUS_ACCURATE_TMEM
// With accurate TMEM the palette always uses the fixed TLUT_BASE address.
ti.SetTlutAddress( TLUT_BASE );
#else
//
//If indexed TMEM PAL address is NULL then assume that the base address is stored in
//TMEM address 0x100 (gTlutLoadAddresses[ 0 ]) and calculate offset from there with TLutIndex(palette index)
//This trick saves us from the need to copy the real palette to TMEM and we just pass the pointer //Corn
//
u32 tlut {TLUT_BASE};
// Only 4-bit textures select a palette through rdp_tile.palette.
if(rdp_tile.size == G_IM_SIZ_4b)
{
// Extra shift applied to the palette index when the ROM's TLUT_HACK setting is non-zero.
u32 tlut_idx0 {(u32)(g_ROM.TLUT_HACK << 1)};
// NOTE(review): gTlutLoadAddresses holds u32* values; casting one to u32 truncates it on hosts with pointers wider than 32 bits — confirm target.
u32 tlut_idx1 {(u32)gTlutLoadAddresses[ rdp_tile.palette << tlut_idx0 ]};
//If pointer == NULL(=invalid entry) add offset to base address (TMEM[0] + offset)
if(tlut_idx1 == 0)
{
tlut += (rdp_tile.palette << (5 + tlut_idx0) );
}
else
{
tlut = tlut_idx1;
}
}
ti.SetTlutAddress( tlut );
#endif
#ifdef DAEDALUS_ACCURATE_TMEM
ti.SetLine( rdp_tile.line );
// NB: ACCURATE_TMEM doesn't care about pitch - it's already been loaded into tmem.
// We only care about line.
#endif
// Copy the remaining tile properties into the cached descriptor.
ti.SetTmemAddress( rdp_tile.tmem );
ti.SetLoadAddress( address );
ti.SetPalette( rdp_tile.palette );
ti.SetFormat( rdp_tile.format );
ti.SetSize( rdp_tile.size );
ti.SetWidth( tile_width );
ti.SetHeight( tile_height );
ti.SetPitch( pitch );
ti.SetTLutFormat( (ETLutFmt)gRDPOtherMode.text_tlut );
ti.SetSwapped( swapped );
// Mirroring is only reported when the manager-wide EmulateMirror flag is set.
ti.SetEmulateMirrorS( EmulateMirror && rdp_tile.mirror_s );
ti.SetEmulateMirrorT( EmulateMirror && rdp_tile.mirror_t );
// Hack - Extreme-G specifies RGBA/8 textures, but they're really CI8
if( ti.GetFormat() == G_IM_FMT_RGBA && ti.GetSize() <= G_IM_SIZ_8b ) ti.SetFormat( G_IM_FMT_CI );
// Force RGBA
// A colour-indexed texture with no TLUT format set is treated as RGBA16.
if( ti.GetFormat() == G_IM_FMT_CI && ti.GetTLutFormat() == kTT_NONE ) ti.SetTLutFormat( kTT_RGBA16 );
mTileTextureInfoValid[ idx ] = true;
}
return mTileTextureInfo[ idx ];
}