Improve PI DMA implementation.

This is now basically perfect compared to real hardware. Verified
using the extensive test suite here: https://github.com/rasky/n64_pi_dma_test

The only missing part is timing and making the transfer happen in
background, at least block by block.
This commit is contained in:
Giovanni Bajo 2021-06-20 13:36:56 +02:00 committed by Simon Eriksson
parent a56fa4ba41
commit 8367698e20

View file

@ -34,14 +34,10 @@ void pi_cycle_(struct pi_controller *pi) {
// DMA engine is finishing up with one entry.
if (pi->bytes_to_copy > 0) {
uint32_t bytes = pi->bytes_to_copy;
// XXX: Defer actual movement of bytes until... now.
// This is a giant hack; bytes should be DMA'd slowly.
pi->is_dma_read ? pi_dma_read(pi) : pi_dma_write(pi);
pi->regs[PI_DRAM_ADDR_REG] = (pi->regs[PI_DRAM_ADDR_REG] + bytes + 7) & ~7;
pi->regs[PI_CART_ADDR_REG] = (pi->regs[PI_CART_ADDR_REG] + bytes + 1) & ~1;
pi->regs[PI_STATUS_REG] &= ~PI_STATUS_DMA_BUSY;
pi->regs[PI_STATUS_REG] |= PI_STATUS_INTERRUPT;
@ -79,17 +75,29 @@ static int pi_dma_read(struct pi_controller *pi) {
else if ((source & 0x05000000) == 0x05000000)
dd_dma_read(pi->bus->dd, source, dest, length);
// FIXME: verify these
pi->regs[PI_RD_LEN_REG] = 0x7F;
pi->regs[PI_DRAM_ADDR_REG] = (pi->regs[PI_DRAM_ADDR_REG] + length + 7) & ~7;
pi->regs[PI_CART_ADDR_REG] = (pi->regs[PI_CART_ADDR_REG] + length + 1) & ~1;
return 0;
}
// Fetches `length` bytes of cartridge ROM, starting at byte offset `source`,
// into the buffer `dest` (which must have room for `length` bytes).
//
// Any part of the requested range that lies beyond the end of the ROM image
// (pi->rom_size) is filled with 0xFF instead of being read from pi->rom.
// This includes the case where `source` itself is at or past the end of the
// ROM, in which case the whole buffer is filled with 0xFF. A non-positive
// `length` is a no-op.
static void pi_rom_fetch(struct pi_controller *pi, uint32_t source, int32_t length, uint8_t *dest) {
  if (length <= 0)
    return;

  size_t total = (size_t)length;
  size_t rom_size = pi->rom_size;

  // Number of leading bytes that are actually backed by ROM data. Computed
  // with unsigned arithmetic and an explicit `source < rom_size` check so
  // that a source address past the end of ROM can never yield a negative
  // (or wrapped-around) copy length.
  size_t from_rom = 0;
  if (source < rom_size) {
    from_rom = rom_size - source;
    if (from_rom > total)
      from_rom = total;
  }

  if (from_rom > 0)
    memcpy(dest, pi->rom + source, from_rom);

  // FIXME: verify this on real hardware
  if (from_rom < total)
    memset(dest + from_rom, 0xFF, total - from_rom);
}
// Copies data from the PI into RDRAM.
static int pi_dma_write(struct pi_controller *pi) {
uint32_t dest = pi->regs[PI_DRAM_ADDR_REG] & 0x7FFFFE;
uint32_t source = pi->regs[PI_CART_ADDR_REG] & 0xFFFFFFE;
uint32_t length = (pi->regs[PI_WR_LEN_REG] & 0xFFFFFF) + 1;
if (dest & 0x7)
length -= dest & 0x7;
int32_t length = (pi->regs[PI_WR_LEN_REG] & 0xFFFFFF) + 1;
if (pi->bus->dd->ipl_rom && (source & 0x06000000) == 0x06000000) {
source &= 0x003FFFFF;
@ -128,23 +136,70 @@ static int pi_dma_write(struct pi_controller *pi) {
}
else if (pi->rom) {
if (source + length > pi->rom_size) {
unsigned i;
// PI_WR_LEN_REG has a weird behavior when read back. It almost always
// reads as 0x7F, with the only exception of very short transfers (<= 8
// bytes) where the actual value is affected by the DRAM alignment. This
// is just for full accuracy, nobody is probably relying on this value.
pi->regs[PI_WR_LEN_REG] = 0x7F;
if (length <= 8)
pi->regs[PI_WR_LEN_REG] -= pi->regs[PI_DRAM_ADDR_REG] & 7;
// TODO: Check for correctness against hardware.
// Is this the right address to use for open bus?
for (i = (pi->regs[PI_CART_ADDR_REG] + pi->rom_size + 3) & ~0x3;
i < pi->regs[PI_CART_ADDR_REG] + length; i += 4) {
uint32_t word = (i >> 16) | (i & 0xFFFF0000);
memcpy(pi->bus->ri->ram + dest, &word, sizeof(word));
// PI DMA has an internal cache of 128 bytes ("a block"). Data is fetched
// from ROM and then copied to RDRAM. The first block is handled "specially":
// if the RDRAM address is not a multiple of 8, the block is shorter so
// that the RDRAM address becomes a multiple of 8 afterwards, and a faster
// code-path is triggered (possibly, 64-bit transfers to RDRAM).
// This is visible because this feature is actually broken: there are two
// bugs lingering, so that in the end Nintendo documented that only
// 8-bytes aligned transfers were possible.
uint8_t mem[128];
bool first_block = true;
while (length > 0) {
uint32_t dest = pi->regs[PI_DRAM_ADDR_REG] & 0x7FFFFE;
int32_t misalign = dest & 0x7;
int32_t cur_len = length;
int32_t block_len = 128 - misalign;
if (cur_len > block_len)
cur_len = block_len;
// Decrease length (for next block). After first block, odd sizes
// are round up.
length -= cur_len;
if (length & 1) length += 1;
// Fetch block from ROM. ROM is always fetched as 16-bit words,
// so round up the actual transfer.
uint32_t source = pi->regs[PI_CART_ADDR_REG] & 0xFFFFFFE;
int32_t rom_fetch_len = (cur_len + 1) & ~1;
pi_rom_fetch(pi, source, rom_fetch_len, mem);
pi->regs[PI_CART_ADDR_REG] += rom_fetch_len;
// Writeback to RDRAM. Here come the lions.
if (first_block) {
// HARDWARE BUG #1: in the first block, there's an off-by-one, so the
// length is actually rounded up to even size just for the last byte.
// Notice that ROM transfers are rounded up anyway, so this additional
// byte was already fetched from ROM.
if (cur_len == block_len-1)
cur_len++;
// HARDWARE BUG #2: the length of data written back is decreased by the
// RDRAM misalignment. This is wrong because cur_len was already
// clamped to the block length, so this actually ends up leaving a
// hole in RDRAM of non-transferred data at the end of the first block.
cur_len -= misalign;
if (cur_len < 0)
cur_len = 0;
}
length = pi->rom_size - source;
}
memcpy(pi->bus->ri->ram+dest, mem, cur_len);
pi->regs[PI_DRAM_ADDR_REG] += cur_len;
pi->regs[PI_DRAM_ADDR_REG] = (pi->regs[PI_DRAM_ADDR_REG] + 7) & ~7;
// TODO: Very hacky.
if (source < pi->rom_size)
memcpy(pi->bus->ri->ram + dest, pi->rom + source, length);
first_block = false;
}
}
return 0;
@ -162,6 +217,9 @@ int pi_init(struct pi_controller *pi, struct bus_controller *bus,
pi->flashram.data = flashram->ptr;
pi->is_viewer = is_viewer;
pi->regs[PI_RD_LEN_REG] = 0x7F;
pi->regs[PI_WR_LEN_REG] = 0x7F;
pi->bytes_to_copy = 0;
return 0;
}
@ -194,15 +252,6 @@ int read_pi_regs(void *opaque, uint32_t address, uint32_t *word) {
*word = pi->regs[reg];
if (reg == PI_WR_LEN_REG || reg == PI_RD_LEN_REG)
*word = 0x7F;
else if (reg == PI_CART_ADDR_REG)
*word &= 0xFFFFFFFE;
else if (reg == PI_DRAM_ADDR_REG)
*word &= 0xFFFFFE;
debug_mmio_read(pi, pi_register_mnemonics[reg], *word);
return 0;
}
@ -244,8 +293,13 @@ int write_pi_regs(void *opaque, uint32_t address, uint32_t word, uint32_t dqm) {
pi->regs[reg] &= ~dqm;
pi->regs[reg] |= word;
if (reg == PI_CART_ADDR_REG)
if (reg == PI_DRAM_ADDR_REG) {
pi->regs[reg] &= 0x00FFFFFE;
} else if (reg == PI_CART_ADDR_REG) {
pi->regs[reg] &= 0xFFFFFFFE;
dd_pi_write(pi->bus->dd, word);
}
else if (reg == PI_WR_LEN_REG) {
if (pi->regs[PI_DRAM_ADDR_REG] == 0xFFFFFFFF) {