mirror of
https://github.com/kmc-jp/n64-emu.git
synced 2025-04-02 10:21:43 -04:00
577 lines
No EOL
19 KiB
Text
577 lines
No EOL
19 KiB
Text
#version 450
|
|
/* Copyright (c) 2020 Themaister
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining
|
|
* a copy of this software and associated documentation files (the
|
|
* "Software"), to deal in the Software without restriction, including
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
* the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
#include "debug.h"
|
|
#include "small_types.h"
|
|
// Workgroup width along X is taken from specialization constant 0 rather than being hard-coded here.
layout(local_size_x_id = 0) in;
|
|
|
|
// Read-only byte-granular view of VRAM at set 0, binding 0.
// The 16-bit and 32-bit views in this file are declared on the same set/binding.
// Readers in this file index it with (byte_address ^ 3).
layout(std430, set = 0, binding = 0) readonly buffer VRAM8Buffer
{
    mem_u8 data[];
} vram8;
|
|
|
|
// Read-only 16-bit view of VRAM at set 0, binding 0 (same binding as the 8-bit and 32-bit views).
// Readers in this file index it with ((byte_address >> 1) ^ 1).
layout(std430, set = 0, binding = 0) readonly buffer VRAM16Buffer
{
    mem_u16 data[];
} vram16;
|
|
|
|
// Read-only 32-bit view of VRAM at set 0, binding 0 (same binding as the 8-bit and 16-bit views).
// Used for 4-byte aligned fetches, indexed with (byte_address >> 2).
layout(std430, set = 0, binding = 0) readonly buffer VRAM32Buffer
{
    uint data[];
} vram32;
|
|
|
|
// Current TMEM contents as 2048 entries of mem_u16.
// main() reads one entry per invocation up front and writes it back only if an upload touched it.
layout(std430, set = 0, binding = 1) buffer TMEM16Buffer
{
    mem_u16 data[2048];
} tmem16;
|
|
|
|
// One full snapshot of TMEM (same 2048-entry shape as TMEM16Buffer).
struct TileInstance
{
    mem_u16 data[2048];
};
|
|
|
|
// Write-only array of TMEM snapshots.
// main() stores the incoming TMEM state in instances[0] and the state after upload i in instances[i + 1].
layout(std430, set = 0, binding = 2) writeonly buffer TMEMInstances
{
    TileInstance instances[];
} tile_instances;
|
|
|
|
// Push constants: how many entries of upload_info[] main() replays for this dispatch.
layout(std430, push_constant) uniform Registers
{
    int num_uploads;
} registers;
|
|
|
|
// Texture format codes as stored in UploadInfo.tmem_fmt.
// Only TEXTURE_FMT_YUV is consulted in this file; it routes the upload through the 32bpp path.
const int TEXTURE_FMT_RGBA = 0;
const int TEXTURE_FMT_YUV  = 1;
const int TEXTURE_FMT_CI   = 2;
const int TEXTURE_FMT_IA   = 3;
const int TEXTURE_FMT_I    = 4;
|
|
|
|
// Upload kinds as stored in UploadInfo.mode.
// TLUT uploads go through update_tmem_lut(); BLOCK uploads take the DxT-driven path in update_tmem_16/32().
const int UPLOAD_MODE_TILE  = 0;
const int UPLOAD_MODE_TLUT  = 1;
const int UPLOAD_MODE_BLOCK = 2;
|
|
|
|
// Description of a single TMEM upload. Sixteen 4-byte members, consumed from the UploadInfos uniform block.
struct UploadInfo
{
    // Extent of the upload: X positions >= width are never written, Y is clamped to height - 1.
    int width;
    int height;
    // Multipliers handed to compute_upload_t() to bound the candidate T range in block mode
    // when the TMEM stride is non-zero.
    float min_t_mod;
    float max_t_mod;

    // Base byte address of the source image in VRAM.
    int vram_addr;
    // Source line pitch in texels; scaled to bytes with << (vram_size - 1).
    int vram_width;
    // Source texel size code. The shaders treat 1 as 8bpp, 2 as 16bpp and 3 as 32bpp.
    int vram_size;
    // Upper bound used by the mirrored-write and TLUT paths when deciding which source texel is visible.
    int vram_effective_width;

    // Destination byte offset in TMEM (masked and halved to a 16-bit index by the update functions).
    int tmem_offset;
    // Destination stride between lines; 0 means every line lands on the same TMEM location.
    int tmem_stride_words;
    // Tile texel size code; 3 selects the 32bpp update path.
    int tmem_size;
    // Tile format, one of the TEXTURE_FMT_* codes.
    int tmem_fmt;

    // One of the UPLOAD_MODE_* codes.
    int mode;
    // Multiplier used instead of dividing by tmem_stride_words (presumably its reciprocal - confirm on the host side).
    float inv_tmem_stride_words;
    // Block-mode T increment: (word_index * dxt) >> 16 yields the line counter T.
    int dxt;
    // Not read by this shader.
    int padding;
};
|
|
|
|
// Queue of uploads to replay, in order. main() consumes the first registers.num_uploads entries.
layout(std140, set = 1, binding = 0) uniform UploadInfos
{
    UploadInfo upload_info[256];
};
|
|
|
|
// Per-invocation scratch state shared between main() and the update_tmem_*() helpers.
// Set to true as soon as any upload has produced a value for this invocation's TMEM slot.
bool tmem_dirty;
// Latest value of this invocation's TMEM slot (held in a uint, narrowed with mem_u16() on store).
uint current_tmem_value;
|
|
|
|
// Converts a linear offset into a line index by multiplying with a precomputed factor
// instead of performing an integer divide.
//   offset     - linear offset to convert.
//   inv_stride - multiplier applied to the biased offset (callers pass reciprocal-style factors).
// Returns the truncated product. The 0.5 bias keeps the result exact for the inputs this shader uses.
int compute_upload_t(int offset, float inv_stride)
{
    float biased_offset = float(offset) + 0.5;
    return int(biased_offset * inv_stride);
}
|
|
|
|
// In 32bpp upload mode we read 64 bits and split the result over the lower and upper TMEM.
//
// Scatter-as-gather for one 16-bit TMEM slot: works out which texel of the upload described by `info`
// (if any) is the last one to land in this slot, fetches it from VRAM and stages it in
// current_tmem_value / tmem_dirty. Returns early, leaving both globals untouched, when the upload
// never writes this slot.
//
//   info         - upload descriptor.
//   tmem16_index - 16-bit slot index within one TMEM half (main() passes it masked with 0x3ff).
//   upper_tmem   - true when the slot lives in the upper half. For non-YUV data the upper half keeps
//                  the low 16 bits of the fetched 32-bit word and the lower half the high 16 bits.
//   yuv          - true for YUV tiles: the lower half receives interleaved U/V, the upper half receives Y.
void update_tmem_32(UploadInfo info, int tmem16_index, bool upper_tmem, bool yuv)
{
    int tmem16_offset = (info.tmem_offset & 0x7ff) >> 1;
    int tmem16_stride = info.tmem_stride_words;

    int pixel_offset = (tmem16_index - tmem16_offset) & 0x3ff;
    int upload_x, upload_y;
    int upload_x_xor = 0;

    if (info.mode == UPLOAD_MODE_BLOCK)
    {
        int word_offset = pixel_offset >> 1;

        if (info.tmem_stride_words == 0)
        {
            // Trivial case, we can just compute T factor directly and set upload_x_xor.
            // Other than that, it works like a simple 1D upload.

            // However, if DxT is weird, we might end up in a situation where this word is written multiple times,
            // or zero times.

            int iteration_candidate_first = word_offset & ~1;
            int iteration_candidate_second = iteration_candidate_first + 1;
            int first_t = (iteration_candidate_first * info.dxt) >> 16;
            int second_t = (iteration_candidate_second * info.dxt) >> 16;
            if (first_t != second_t)
            {
                int iteration_candidate_first_write_index = iteration_candidate_first ^ (first_t & 1);
                int iteration_candidate_second_write_index = iteration_candidate_second ^ (second_t & 1);
                // The second iteration is checked first, so it wins if both target this word.
                if (iteration_candidate_second_write_index == word_offset)
                    upload_x_xor = (second_t & 1) << 1;
                else if (iteration_candidate_first_write_index == word_offset)
                    upload_x_xor = (first_t & 1) << 1;
                else
                    return;
            }
            else
                upload_x_xor ^= (first_t & 1) << 1;
        }
        else
        {
            // Non-zero stride in block mode: search for the source word whose T-dependent
            // destination is this word.
            int min_t = compute_upload_t(word_offset & ~1, info.min_t_mod);
            int max_t = compute_upload_t(word_offset | 1, info.max_t_mod);

            // If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
            // Y - t_max * stride <= X <= Y - t_min * stride.
            int max_word_candidate = (word_offset | 1) - tmem16_stride * min_t;
            int min_word_candidate = (word_offset & ~1) - tmem16_stride * max_t;

            // If we have constraints for X, we constrain T further.
            min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
            max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);

            bool found_candidate = false;
            // Walk T from high to low and stop at the first hit.
            for (int t = max_t; t >= min_t; t--)
            {
                // Check to see if t is a solution to the equation.
                // Potentially two targets could write here.
                int candidate_solution_first = (word_offset & ~1) - tmem16_stride * t;
                int candidate_solution_second = (word_offset | 1) - tmem16_stride * t;

                int candidate_t_first = (candidate_solution_first * info.dxt) >> 16;
                int candidate_t_second = (candidate_solution_second * info.dxt) >> 16;

                if (((candidate_solution_second + candidate_t_second * tmem16_stride) ^ (candidate_t_second & 1)) == word_offset)
                {
                    found_candidate = true;
                    pixel_offset = (candidate_solution_second << 1) + (pixel_offset & 1);
                    break;
                }
                else if (((candidate_solution_first + candidate_t_first * tmem16_stride) ^ (candidate_t_first & 1)) == word_offset)
                {
                    found_candidate = true;
                    pixel_offset = (candidate_solution_first << 1) + (pixel_offset & 1);
                    break;
                }
            }

            // We strided over this 64bpp word.
            if (!found_candidate)
                return;
        }

        upload_x = pixel_offset;
        upload_y = 0;
    }
    else if (tmem16_stride == 0)
    {
        // For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
        // is what happened in Y == height - 1.
        upload_x = pixel_offset;
        upload_y = info.height - 1;
    }
    else
    {
        upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
        upload_x = pixel_offset - upload_y * tmem16_stride;

        // If stride is smaller than width, we'll need to unroll the last line.
        if (upload_y >= info.height)
        {
            upload_x += tmem16_stride * (upload_y - info.height + 1);
            upload_y = info.height - 1;
        }
    }

    int last_line_upload_x = upload_x ^ ((upload_y & 1) << 1);
    if (last_line_upload_x >= info.width && upload_y > 0)
    {
        // If the last line won't trigger a write, the previous line probably did.
        upload_y--;
        upload_x += tmem16_stride;
    }

    // Start from 0 so that a vram_size outside 1..3 (with yuv false) cannot feed an undefined
    // value into the address computation below.
    // NOTE(review): vram_size == 0 would also make the (vram_size - 1) shift negative; presumably the
    // host never submits such uploads - confirm.
    int iteration_offset = 0;

    upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;

    if (info.vram_size == 3 || yuv)
    {
        iteration_offset = 4 * (upload_x & ~1);
    }
    else if (info.vram_size == 2)
    {
        // In 16bpp VRAM mode, we are supposed to step 4 pixels at a time (8 bytes), which will form 2 complete pixels.
        // However, in 32bpp tile mode we're not shifting the X value appropriately.
        // So, we're writing texels [0, 1, ..., 4, 5, ...], etc.
        if ((upload_x & 2) != 0)
        {
            // We're not writing in this line, but the previous line might have!
            // Interleaving patterns will form ...
            if (upload_y > 0)
            {
                upload_y--;
                upload_x += tmem16_stride;
                upload_x ^= 2;
            }
            else
            {
                // These 2 words will never be written to.
                return;
            }
        }
        iteration_offset = 2 * (upload_x & ~1);
    }
    else if (info.vram_size == 1)
    {
        // 4 potential mirrors.
        for (int i = 0; i < 4 && upload_y > 0 && (upload_x & 6) != 0; i++)
        {
            upload_y--;
            upload_x += tmem16_stride;
            upload_x ^= 2;
        }

        if ((upload_x & 6) != 0)
        {
            // These 6 words will never be written to.
            return;
        }

        iteration_offset = upload_x & ~1;
    }

    if (upload_x >= info.width)
        return;

    int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));

    // The loading pipeline reads 64 bits per iteration.
    int rdram_addr = line_rdram_addr + iteration_offset + 4 * (upload_x & 1);

    uint word;
    if ((rdram_addr & 3) == 0)
    {
        // Aligned: a single 32-bit fetch.
        word = uint(vram32.data[rdram_addr >> 2]);
    }
    else
    {
        // Unaligned: assemble the word from four byte fetches, most significant byte first.
        word = (uint(vram8.data[rdram_addr ^ 3]) << 24) |
               (uint(vram8.data[(rdram_addr + 1) ^ 3]) << 16) |
               (uint(vram8.data[(rdram_addr + 2) ^ 3]) << 8) |
               uint(vram8.data[(rdram_addr + 3) ^ 3]);
    }

    if (yuv)
    {
        // Lower TMEM receives interleaved UV samples, while upper receives Y.
        if (upper_tmem)
        {
            uint y0 = (word >> 16u) & 0xffu;
            uint y1 = (word >> 0u) & 0xffu;
            word = (y0 << 8u) | y1;
        }
        else
        {
            uint u = (word >> 24u) & 0xffu;
            uint v = (word >> 8u) & 0xffu;
            word = (u << 8u) | v;
        }
    }
    else
    {
        // Lower half keeps bits 31..16, upper half keeps bits 15..0.
        word >>= 16u - 16u * uint(upper_tmem);
        word &= 0xffffu;
    }
    current_tmem_value = word;
    tmem_dirty = true;
}
|
|
|
|
// Upload path for tiles that are neither 32bpp nor YUV (main() routes those to update_tmem_32()).
//
// Scatter-as-gather for one 16-bit TMEM slot: works out which texel of the upload described by `info`
// (if any) is the last one to land in this slot, fetches it from VRAM and stages it in
// current_tmem_value / tmem_dirty. Returns early, leaving both globals untouched, when the upload
// never writes this slot.
//
//   info         - upload descriptor.
//   tmem16_index - 16-bit slot index over the whole of TMEM (0..0x7ff after masking).
void update_tmem_16(UploadInfo info, int tmem16_index)
{
    int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
    int tmem16_stride = info.tmem_stride_words;

    int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
    int upload_x, upload_y;
    int upload_x_xor = 0;

    if (info.mode == UPLOAD_MODE_BLOCK)
    {
        int word_offset = pixel_offset >> 2;

        if (info.tmem_stride_words == 0)
        {
            // Trivial case, we can just compute T factor directly and set upload_x_xor.
            // Other than that, it works like a simple 1D upload.
            upload_x_xor = (((word_offset * info.dxt) >> 16) & 1) << 1;
        }
        else
        {
            // Non-zero stride in block mode: search for the source word whose T-dependent
            // destination is this word.
            int min_t = compute_upload_t(word_offset, info.min_t_mod);
            int max_t = compute_upload_t(word_offset, info.max_t_mod);

            // If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
            // Y - t_max * stride <= X <= Y - t_min * stride.
            int max_word_candidate = word_offset - tmem16_stride * min_t;
            int min_word_candidate = word_offset - tmem16_stride * max_t;

            // If we have constraints for X, we constrain T further.
            min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
            max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);

            bool found_candidate = false;
            // No early exit here: when several T values match, the lowest one visited last wins.
            for (int t = max_t; t >= min_t; t--)
            {
                // Check to see if t is a solution to the equation.
                int candidate_solution = word_offset - tmem16_stride * t;
                int computed_t = (candidate_solution * info.dxt) >> 16;
                if (candidate_solution + computed_t * tmem16_stride == word_offset)
                {
                    found_candidate = true;
                    upload_x_xor = (computed_t & 1) << 1;
                    pixel_offset = (candidate_solution << 2) + (pixel_offset & 3);
                }
            }

            // We strided over this 64bpp word.
            if (!found_candidate)
                return;
        }

        upload_x = pixel_offset;
        upload_y = 0;
    }
    else if (tmem16_stride == 0)
    {
        // For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
        // is what happened in Y == height - 1.
        upload_x = pixel_offset;
        upload_y = info.height - 1;
    }
    else
    {
        upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
        upload_x = pixel_offset - upload_y * tmem16_stride;

        // If stride is smaller than width, we'll need to unroll the last line.
        if (upload_y >= info.height)
        {
            upload_x += tmem16_stride * (upload_y - info.height + 1);
            upload_y = info.height - 1;
        }
    }

    // Edge cases which arise when tile pixel size does not match texture image size.
    // Should not happen in normal applications.
    // This is basically doing scatter-as-gather, so we need to figure out
    // if there is no write to our texel after all (striding), or if there are multiple writes
    // to our texel, in which case we need to figure out the last writer.
    // The rules below were derived empirically through testing; change them with great care.
    //
    // Start from 0 so that size combinations handled by neither mismatch case below cannot feed
    // an undefined value into the address computation.
    // NOTE(review): the intended result for those combinations is not modelled here - confirm.
    int iteration_offset = 0;
    if (info.tmem_size != info.vram_size)
    {
        if (info.vram_size - info.tmem_size == 1)
        {
            // If TMEM is N bpp but VRAM is 2N bpp, we will get mirrored writes here.
            // Select which half of the 2N bpp load we observe in TMEM.
            iteration_offset = (upload_x & ~3) * 4;
            if ((upload_x & ~3) + 2 < (info.vram_effective_width >> (3 - info.vram_size)))
                iteration_offset += 8;
        }
        else if (info.tmem_size == 2 && info.vram_size == 1)
        {
            // In 8bpp VRAM mode, we are supposed to step 8 pixels at a time (8 bytes), which will form 4 complete pixels.
            // However, in 16bpp tile mode we're not shifting the X value appropriately.
            // So, we're writing texels [0, 1, 2, 3, ..., 8, 9, 10, 11], etc.
            if ((upload_x & 4) != 0)
            {
                // We're not writing in this line, but the previous line might have!
                // Interleaving patterns will form ...
                if ((tmem16_stride & 4) != 0 && upload_y > 0)
                {
                    upload_y--;
                    upload_x += tmem16_stride;
                }
                else
                {
                    // These 4 words will never be written to.
                    return;
                }
            }
            iteration_offset = upload_x & ~3;
        }
    }
    else
    {
        // Normal case TMEM size aligns with VRAM size.
        iteration_offset = (upload_x & ~3) * 2;
    }

    if (upload_x >= info.width)
        return;

    int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
    // Applied after the width test above, unlike in update_tmem_32().
    upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;

    // The loading pipeline reads 64 bits per iteration.
    int rdram_addr = line_rdram_addr + iteration_offset + 2 * (upload_x & 3);

    uint word;
    if ((rdram_addr & 1) == 0)
        // Aligned: a single 16-bit fetch.
        word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
    else
        // Unaligned: assemble the value from two byte fetches, high byte first.
        word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);

    current_tmem_value = word;
    tmem_dirty = true;
}
|
|
|
|
// TLUT upload path.
//
// For one 16-bit TMEM slot, decides which source texel of the upload described by `info` (if any)
// is visible in that slot, fetches 16 bits from VRAM and stages them in
// current_tmem_value / tmem_dirty. Returns early, leaving both globals untouched, when the slot is
// not written. Which slots are written depends on the difference between the VRAM and tile size codes.
//
//   info         - upload descriptor.
//   tmem16_index - 16-bit slot index over the whole of TMEM (0..0x7ff after masking).
void update_tmem_lut(UploadInfo info, int tmem16_index)
{
    int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
    int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;

    // How many size steps the VRAM texel format is above (positive) or below (negative) the tile format.
    int size_delta = info.vram_size - info.tmem_size;
    int source_texel;

    if (size_delta == 2)
    {
        // Every slot is written; each group of four slots shares one source position.
        source_texel = (pixel_offset >> 2) << (info.vram_size - 2);
    }
    else if (size_delta == 1)
    {
        // Only the first four slots of each group of eight are written.
        if ((pixel_offset & 4) != 0)
            return;

        int shift = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
        source_texel = (pixel_offset & ~7) >> shift;
    }
    else if (size_delta == 0)
    {
        // Only the first four slots of each group of sixteen are written.
        if ((pixel_offset & 0xc) != 0)
            return;

        int shift = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
        source_texel = (pixel_offset & ~3) >> shift;
    }
    else if (size_delta == -1)
    {
        // Only the first four slots of each group of thirty-two are written.
        if ((pixel_offset & 0x1c) != 0)
            return;

        source_texel = (pixel_offset >> info.tmem_size) & ~7;
    }
    else
    {
        // 4bpp tile, 32bpp VRAM. Mirrored writes.
        // Each group of four slots maps to a span of four texels; the later half of the span is
        // the one observed whenever it still lies inside the effective width.
        source_texel = (pixel_offset >> 2) * 4;
        if (source_texel + 2 < info.vram_effective_width)
            source_texel += 2;
    }

    // Common to every case: positions at or past the effective width are never loaded.
    if (source_texel >= info.vram_effective_width)
        return;

    int rdram_addr = info.vram_addr + (source_texel << (info.vram_size - 1));

    // Odd behavior when we have unaligned TLUT uploads.
    rdram_addr += 2 * (rdram_addr & 1) * (pixel_offset & 3);

    uint word;
    if ((rdram_addr & 1) != 0)
    {
        // Unaligned: assemble the value from two byte fetches, high byte first.
        word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
    }
    else
    {
        // Aligned: a single 16-bit fetch.
        word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
    }

    current_tmem_value = word;
    tmem_dirty = true;
}
|
|
|
|
// Entry point. Each invocation owns the 16-bit TMEM slot at index gl_GlobalInvocationID.x.
// It replays every queued upload against that slot, records the slot's value before the first
// upload and after each upload into tile_instances, and finally writes the slot back to TMEM
// if any upload touched it.
void main()
{
    uint slot = gl_GlobalInvocationID.x;

    tmem_dirty = false;
    current_tmem_value = uint(tmem16.data[slot]);

    // The update helpers work on the slot index with bit 0 flipped
    // (presumably the 16-bit halves of each 32-bit word are stored swapped - confirm).
    int tmem16_index = int(slot) ^ 1;
    bool upper_tmem = tmem16_index >= 0x400;

    // Snapshot 0 is TMEM as it was before any upload of this dispatch.
    tile_instances.instances[0].data[slot] = mem_u16(current_tmem_value);

    int upload_count = registers.num_uploads;
    for (int i = 0; i < upload_count; i++)
    {
        UploadInfo info = upload_info[i];
        bool yuv = info.tmem_fmt == TEXTURE_FMT_YUV;

        if (info.mode == UPLOAD_MODE_TLUT)
        {
            update_tmem_lut(info, tmem16_index);
        }
        else if (info.tmem_size == 3 || yuv)
        {
            // 32bpp and YUV tiles split their data across both TMEM halves.
            update_tmem_32(info, tmem16_index & 0x3ff, upper_tmem, yuv);
        }
        else
        {
            update_tmem_16(info, tmem16_index);
        }

        // Snapshot i + 1 is TMEM after upload i has been applied.
        tile_instances.instances[i + 1].data[slot] = mem_u16(current_tmem_value);
    }

    if (tmem_dirty)
        tmem16.data[slot] = mem_u16(current_tmem_value);
}