// n64-emu/third_party/parallel-rdp-standalone/parallel-rdp/shaders/tmem_update.comp
// 2023-08-06 14:03:29 +09:00
//
// 577 lines
// No EOL
// 19 KiB
// Text

#version 450
/* Copyright (c) 2020 Themaister
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "debug.h"
#include "small_types.h"
// Workgroup size along X is taken from specialization constant 0.
layout(local_size_x_id = 0) in;
// The three blocks below all use set 0, binding 0, so they are three differently typed,
// read-only views of the same buffer: indexed per byte, per 16-bit element and per 32-bit word.
// mem_u8 / mem_u16 are not declared in this file; presumably they come from small_types.h.
layout(set = 0, binding = 0, std430) readonly buffer VRAM8Buffer
{
mem_u8 data[];
} vram8;
layout(set = 0, binding = 0, std430) readonly buffer VRAM16Buffer
{
mem_u16 data[];
} vram16;
layout(set = 0, binding = 0, std430) readonly buffer VRAM32Buffer
{
uint data[];
} vram32;
// Current TMEM contents as 2048 elements of mem_u16. main() reads one element per invocation
// and writes it back at the end only if an upload changed it.
layout(set = 0, binding = 1, std430) buffer TMEM16Buffer
{
mem_u16 data[2048];
} tmem16;
// One full snapshot of TMEM, same element count as TMEM16Buffer.
struct TileInstance
{
mem_u16 data[2048];
};
// Write-only array of TMEM snapshots. main() stores the incoming TMEM state in instances[0]
// and the state after upload i in instances[i + 1].
layout(set = 0, binding = 2, std430) writeonly buffer TMEMInstances
{
TileInstance instances[];
} tile_instances;
// Push constants for this dispatch.
layout(push_constant, std430) uniform Registers
{
// Number of entries of upload_info[] that main() replays, in order.
int num_uploads;
} registers;
// Texture format codes compared against UploadInfo::tmem_fmt.
// Only TEXTURE_FMT_YUV is referenced in this shader (it selects the YUV split in update_tmem_32).
const int TEXTURE_FMT_RGBA = 0;
const int TEXTURE_FMT_YUV = 1;
const int TEXTURE_FMT_CI = 2;
const int TEXTURE_FMT_IA = 3;
const int TEXTURE_FMT_I = 4;
// Upload mode codes compared against UploadInfo::mode.
// TLUT uploads are routed to update_tmem_lut(); BLOCK enables the DxT handling in the
// update_tmem_16/32 functions; any other value takes the regular per-line tile path.
const int UPLOAD_MODE_TILE = 0;
const int UPLOAD_MODE_TLUT = 1;
const int UPLOAD_MODE_BLOCK = 2;
// Describes one upload from VRAM into TMEM. The struct holds 16 four-byte scalars.
// Field notes below describe how this shader uses each value.
struct UploadInfo
{
// width: upload_x values at or beyond this are treated as not written by the upload.
// height: number of lines; height - 1 is used as the last line.
int width, height;
// Scale factors passed to compute_upload_t() to bound the T range in block mode with a non-zero stride.
float min_t_mod, max_t_mod;
// Base byte address of the source data in VRAM.
int vram_addr;
// Source line pitch; multiplied by the line number and shifted by (vram_size - 1) to get a byte offset.
int vram_width;
// Source pixel size code. Going by the comments in the update functions: 1 = 8bpp, 2 = 16bpp, 3 = 32bpp.
int vram_size;
// Upper bound tested against the source pixel index in TLUT uploads and in the mirrored-write case.
int vram_effective_width;
// Destination byte offset in TMEM; masked and halved to obtain a 16-bit element index.
int tmem_offset;
// Destination stride. Applied to the 16-bit element offset in tile mode and to word_offset in block mode.
int tmem_stride_words;
// Tile pixel size code; 3 routes the upload to update_tmem_32().
int tmem_size;
// Texture format code (TEXTURE_FMT_*); only the YUV case is tested here.
int tmem_fmt;
// One of the UPLOAD_MODE_* values.
int mode;
// Multiplier used by compute_upload_t() in place of dividing by tmem_stride_words.
// Presumably 1.0 / tmem_stride_words, filled in by the host — confirm.
float inv_tmem_stride_words;
// Block-mode T increment; T for an iteration is computed as (iteration * dxt) >> 16.
int dxt;
// Not read by this shader.
int padding;
};
// Up to 256 upload descriptors; main() consumes the first registers.num_uploads of them.
layout(set = 1, binding = 0, std140) uniform UploadInfos
{
UploadInfo upload_info[256];
};
// Per-invocation state shared between main() and the update_tmem_* functions.
// Set to true by an update function once it has produced a new value for this invocation's TMEM element.
bool tmem_dirty;
// Latest value of this invocation's TMEM element; only the low 16 bits are stored back.
uint current_tmem_value;
// Returns the line (T) index for `offset` by multiplying with the caller-supplied
// reciprocal `inv_stride` and truncating to int.
int compute_upload_t(int offset, float inv_stride)
{
    // Half-unit bias before the multiply. The original author notes this stays exact for
    // all relevant inputs while being much faster than an integer divide.
    float biased_offset = float(offset) + 0.5;
    return int(biased_offset * inv_stride);
}
// In 32bpp upload mode we read 64 bits and split the result over the lower and upper TMEM.
//
// Works out, for one TMEM element, whether the upload described by `info` writes to it and,
// if so, which VRAM data ends up there (scatter expressed as gather).
//   info         - upload descriptor.
//   tmem16_index - 16-bit element index; main() passes it already masked to 0x3ff (one TMEM half).
//   upper_tmem   - true when the invocation's element lies at index 0x400 or above.
//   yuv          - true when info.tmem_fmt is TEXTURE_FMT_YUV.
// On a hit, stores the 16-bit result in current_tmem_value and sets tmem_dirty.
// Every early return leaves both globals untouched, meaning "this upload does not write here".
void update_tmem_32(UploadInfo info, int tmem16_index, bool upper_tmem, bool yuv)
{
// Destination start as a 16-bit element index, wrapped to one TMEM half.
int tmem16_offset = (info.tmem_offset & 0x7ff) >> 1;
int tmem16_stride = info.tmem_stride_words;
// Distance from the upload start to this element, wrapping within the 1024-element half.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x3ff;
int upload_x, upload_y;
int upload_x_xor = 0;
if (info.mode == UPLOAD_MODE_BLOCK)
{
int word_offset = pixel_offset >> 1;
if (info.tmem_stride_words == 0)
{
// Trivial case, we can just compute T factor directly and set upload_x_xor.
// Other than that, it works like a simple 1D upload.
// However, if DxT is weird, we might end up in a situation where this word is written multiple times,
// or zero times.
// The even/odd pair of iterations that could target this word.
int iteration_candidate_first = word_offset & ~1;
int iteration_candidate_second = iteration_candidate_first + 1;
int first_t = (iteration_candidate_first * info.dxt) >> 16;
int second_t = (iteration_candidate_second * info.dxt) >> 16;
if (first_t != second_t)
{
// T changes inside the pair, so each iteration's write index is flipped by its own T parity.
int iteration_candidate_first_write_index = iteration_candidate_first ^ (first_t & 1);
int iteration_candidate_second_write_index = iteration_candidate_second ^ (second_t & 1);
// The second (later) iteration is tested first.
if (iteration_candidate_second_write_index == word_offset)
upload_x_xor = (second_t & 1) << 1;
else if (iteration_candidate_first_write_index == word_offset)
upload_x_xor = (first_t & 1) << 1;
else
return;
}
else
upload_x_xor ^= (first_t & 1) << 1;
}
else
{
// Welp ... This is pure insanity, but if we want to be completely correct ...
int min_t = compute_upload_t(word_offset & ~1, info.min_t_mod);
int max_t = compute_upload_t(word_offset | 1, info.max_t_mod);
// If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
// Y - t_max * stride <= X <= Y - t_min * stride.
int max_word_candidate = (word_offset | 1) - tmem16_stride * min_t;
int min_word_candidate = (word_offset & ~1) - tmem16_stride * max_t;
// If we have constraints for X, we constrain T further.
min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);
bool found_candidate = false;
// Search from the highest T downwards and stop at the first match.
for (int t = max_t; t >= min_t; t--)
{
// Check to see if t is a solution to the equation.
// Potentially two targets could write here.
int candidate_solution_first = (word_offset & ~1) - tmem16_stride * t;
int candidate_solution_second = (word_offset | 1) - tmem16_stride * t;
int candidate_t_first = (candidate_solution_first * info.dxt) >> 16;
int candidate_t_second = (candidate_solution_second * info.dxt) >> 16;
if (((candidate_solution_second + candidate_t_second * tmem16_stride) ^ (candidate_t_second & 1)) == word_offset)
{
found_candidate = true;
// Rebase pixel_offset onto the source iteration, keeping the 16-bit sub-index.
pixel_offset = (candidate_solution_second << 1) + (pixel_offset & 1);
break;
}
else if (((candidate_solution_first + candidate_t_first * tmem16_stride) ^ (candidate_t_first & 1)) == word_offset)
{
found_candidate = true;
pixel_offset = (candidate_solution_first << 1) + (pixel_offset & 1);
break;
}
}
// We strided over this 64bpp word.
if (!found_candidate)
return;
}
// Block uploads are treated as a single line.
upload_x = pixel_offset;
upload_y = 0;
}
else if (tmem16_stride == 0)
{
// For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
// is what happened in Y == height - 1.
upload_x = pixel_offset;
upload_y = info.height - 1;
}
else
{
// Split the linear offset into (line, position in line) using the reciprocal stride.
upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
upload_x = pixel_offset - upload_y * tmem16_stride;
// If stride is smaller than width, we'll need to unroll the last line.
if (upload_y >= info.height)
{
upload_x += tmem16_stride * (upload_y - info.height + 1);
upload_y = info.height - 1;
}
}
// X as it would be after the odd-line swap, used only to test against the width.
int last_line_upload_x = upload_x ^ ((upload_y & 1) << 1);
if (last_line_upload_x >= info.width && upload_y > 0)
{
// If the last line won't trigger a write, the previous line probably did.
upload_y--;
upload_x += tmem16_stride;
}
// Byte offset within the source line; assigned in the vram_size branches below.
// NOTE(review): if vram_size is 0 and yuv is false, none of the branches runs, so this stays
// unassigned and the (vram_size - 1) shift further down is negative. Presumably the host
// never submits such an upload — confirm.
int iteration_offset;
// Apply the odd-line swap together with the block-mode swap computed above.
upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;
if (info.vram_size == 3 || yuv)
{
iteration_offset = 4 * (upload_x & ~1);
}
else if (info.vram_size == 2)
{
// In 16bpp VRAM mode, we are supposed to step 4 pixels at a time (8 bytes), which will form 2 complete pixels.
// However, in 32bpp tile mode we're not shifting the X value appropriately.
// So, we're writing texels [0, 1, ..., 4, 5, ...], etc.
if ((upload_x & 2) != 0)
{
// We're not writing in this line, but the previous line might have!
// Interleaving patterns will form ...
if (upload_y > 0)
{
upload_y--;
upload_x += tmem16_stride;
upload_x ^= 2;
}
else
{
// These 2 words will never be written to.
return;
}
}
iteration_offset = 2 * (upload_x & ~1);
}
else if (info.vram_size == 1)
{
// 4 potential mirrors.
// Step back up to four lines while this X position is not one that gets written.
for (int i = 0; i < 4 && upload_y > 0 && (upload_x & 6) != 0; i++)
{
upload_y--;
upload_x += tmem16_stride;
upload_x ^= 2;
}
if ((upload_x & 6) != 0)
{
// These 6 words will never be written to.
return;
}
iteration_offset = upload_x & ~1;
}
// Past the end of the line: nothing was written here.
if (upload_x >= info.width)
return;
// Byte address of the start of the source line.
int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
// The loading pipeline reads 64 bits per iteration.
// (upload_x & 1) picks the first or second 32-bit half of those 64 bits.
int rdram_addr = line_rdram_addr + iteration_offset + 4 * (upload_x & 1);
uint word;
if ((rdram_addr & 3) == 0)
{
// 4-byte aligned: a single 32-bit load is enough.
word = uint(vram32.data[rdram_addr >> 2]);
}
else
{
// Unaligned: assemble the word from four bytes, most significant first.
// Each byte address is XORed with 3, which reverses the byte order within a 32-bit word.
word = (uint(vram8.data[rdram_addr ^ 3]) << 24) |
(uint(vram8.data[(rdram_addr + 1) ^ 3]) << 16) |
(uint(vram8.data[(rdram_addr + 2) ^ 3]) << 8) |
uint(vram8.data[(rdram_addr + 3) ^ 3]);
}
if (yuv)
{
// Lower TMEM receives interleaved UV samples, while upper receives Y.
if (upper_tmem)
{
uint y0 = (word >> 16u) & 0xffu;
uint y1 = (word >> 0u) & 0xffu;
word = (y0 << 8u) | y1;
}
else
{
uint u = (word >> 24u) & 0xffu;
uint v = (word >> 8u) & 0xffu;
word = (u << 8u) | v;
}
}
else
{
// Lower TMEM keeps the high 16 bits (shift by 16), upper TMEM the low 16 bits (shift by 0).
word >>= 16u - 16u * uint(upper_tmem);
word &= 0xffffu;
}
current_tmem_value = word;
tmem_dirty = true;
}
// Works out, for one 16-bit TMEM element, whether the non-32bpp, non-YUV upload described by
// `info` writes to it and, if so, which 16 bits of VRAM end up there (scatter expressed as gather).
//   info         - upload descriptor.
//   tmem16_index - 16-bit element index of this invocation across the whole 2048-element TMEM.
// On a hit, stores the value in current_tmem_value and sets tmem_dirty.
// Every early return leaves both globals untouched, meaning "this upload does not write here".
void update_tmem_16(UploadInfo info, int tmem16_index)
{
// Destination start as a 16-bit element index, wrapped to the full TMEM.
int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
int tmem16_stride = info.tmem_stride_words;
// Distance from the upload start to this element, wrapping within 2048 elements.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
int upload_x, upload_y;
int upload_x_xor = 0;
if (info.mode == UPLOAD_MODE_BLOCK)
{
// Four 16-bit elements per 64-bit word.
int word_offset = pixel_offset >> 2;
if (info.tmem_stride_words == 0)
{
// Trivial case, we can just compute T factor directly and set upload_x_xor.
// Other than that, it works like a simple 1D upload.
upload_x_xor = (((word_offset * info.dxt) >> 16) & 1) << 1;
}
else
{
// Welp ... This is pure insanity, but if we want to be completely correct ...
int min_t = compute_upload_t(word_offset, info.min_t_mod);
int max_t = compute_upload_t(word_offset, info.max_t_mod);
// If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
// Y - t_max * stride <= X <= Y - t_min * stride.
int max_word_candidate = word_offset - tmem16_stride * min_t;
int min_word_candidate = word_offset - tmem16_stride * max_t;
// If we have constraints for X, we constrain T further.
min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);
bool found_candidate = false;
// There is no break in this loop, so when several T values match,
// the last one visited (the smallest t) is the one that sticks.
for (int t = max_t; t >= min_t; t--)
{
// Check to see if t is a solution to the equation.
int candidate_solution = word_offset - tmem16_stride * t;
int computed_t = (candidate_solution * info.dxt) >> 16;
if (candidate_solution + computed_t * tmem16_stride == word_offset)
{
found_candidate = true;
upload_x_xor = (computed_t & 1) << 1;
// Rebase onto the source iteration; the low two bits (position inside the word) are preserved.
pixel_offset = (candidate_solution << 2) + (pixel_offset & 3);
}
}
// We strided over this 64bpp word.
if (!found_candidate)
return;
}
// Block uploads are treated as a single line.
upload_x = pixel_offset;
upload_y = 0;
}
else if (tmem16_stride == 0)
{
// For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
// is what happened in Y == height - 1.
upload_x = pixel_offset;
upload_y = info.height - 1;
}
else
{
// Split the linear offset into (line, position in line) using the reciprocal stride.
upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
upload_x = pixel_offset - upload_y * tmem16_stride;
// If stride is smaller than width, we'll need to unroll the last line.
if (upload_y >= info.height)
{
upload_x += tmem16_stride * (upload_y - info.height + 1);
upload_y = info.height - 1;
}
}
// This is pure bullshit magic which arises as an edge case when
// tile pixel size does not match texture image size.
// Should not happen in normal applications.
// This is basically doing scatter-as-gather, so we need to figure out
// if there is no write to our texel after all (striding), or if there are multiple writes
// to our texel, in which case we need to figure out the last writer.
// This code is black magic, and it's made with blood, sweat and tears from testing with lots of trial and error.
// Byte offset within the source line of the 64-bit group being read.
// NOTE(review): when tmem_size != vram_size and neither inner branch below matches,
// this stays unassigned but is still used to form rdram_addr. Presumably those size
// combinations never reach this function — confirm.
int iteration_offset;
if (info.tmem_size != info.vram_size)
{
if (info.vram_size - info.tmem_size == 1)
{
// If TMEM is N bpp but VRAM is 2N bpp, we will get mirrored writes here.
// Select which half of the 2N bpp load we observe in TMEM.
iteration_offset = (upload_x & ~3) * 4;
if ((upload_x & ~3) + 2 < (info.vram_effective_width >> (3 - info.vram_size)))
iteration_offset += 8;
}
else if (info.tmem_size == 2 && info.vram_size == 1)
{
// In 8bpp VRAM mode, we are supposed to step 8 pixels at a time (8 bytes), which will form 4 complete pixels.
// However, in 16bpp tile mode we're not shifting the X value appropriately.
// So, we're writing texels [0, 1, 2, 3, ..., 8, 9, 10, 11], etc.
if ((upload_x & 4) != 0)
{
// We're not writing in this line, but the previous line might have!
// Interleaving patterns will form ...
if ((tmem16_stride & 4) != 0 && upload_y > 0)
{
upload_y--;
upload_x += tmem16_stride;
}
else
{
// These 4 words will never be written to.
return;
}
}
iteration_offset = upload_x & ~3;
}
}
else
{
// Normal case TMEM size aligns with VRAM size.
iteration_offset = (upload_x & ~3) * 2;
}
// Past the end of the line: nothing was written here.
if (upload_x >= info.width)
return;
// Byte address of the start of the source line.
int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
// Apply the odd-line swap together with the block-mode swap computed above.
upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;
// The loading pipeline reads 64 bits per iteration.
// (upload_x & 3) picks one of the four 16-bit lanes inside those 64 bits.
int rdram_addr = line_rdram_addr + iteration_offset + 2 * (upload_x & 3);
uint word;
if ((rdram_addr & 1) == 0)
// 2-byte aligned: one 16-bit load; XOR 1 swaps the two halves of each 32-bit word.
word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
else
// Odd address: build the value from two bytes, high byte first, using XOR-3 byte addressing.
word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
current_tmem_value = word;
tmem_dirty = true;
}
// TLUT upload: works out whether the upload described by `info` writes this 16-bit TMEM
// element and, if so, which 16-bit VRAM value ends up there.
//   info         - upload descriptor with mode == UPLOAD_MODE_TLUT (that is how main() routes it).
//   tmem16_index - 16-bit element index of this invocation across the whole 2048-element TMEM.
// The branch taken depends on vram_size - tmem_size. Each branch either returns (no write here)
// or sets pixel_offset_splat, the source pixel index that is then turned into a byte address.
// On a hit, stores the value in current_tmem_value and sets tmem_dirty.
void update_tmem_lut(UploadInfo info, int tmem16_index)
{
// Destination start as a 16-bit element index, wrapped to the full TMEM.
int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
// Distance from the upload start to this element, wrapping within 2048 elements.
int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
int pixel_offset_splat;
if (info.vram_size - info.tmem_size == 2)
{
// Every group of four consecutive elements maps to the same source position.
pixel_offset_splat = pixel_offset >> 2;
pixel_offset_splat <<= info.vram_size - 2;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else if (info.vram_size - info.tmem_size == 1)
{
// Only the first four elements of every eight are written.
if ((pixel_offset & 4) == 0)
{
int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
pixel_offset_splat = (pixel_offset & ~7) >> shamt;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else if (info.vram_size == info.tmem_size)
{
// Only the first four elements of every sixteen are written.
if ((pixel_offset & 0xc) == 0)
{
int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
pixel_offset_splat = (pixel_offset & ~3) >> shamt;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else if (info.vram_size - info.tmem_size == -1)
{
// Only the first four elements of every thirty-two are written.
if ((pixel_offset & 0x1c) == 0)
{
int shamt = info.tmem_size;
pixel_offset_splat = (pixel_offset >> shamt) & ~7;
if (pixel_offset_splat >= info.vram_effective_width)
return;
}
else
{
return;
}
}
else
{
// 4bpp tile, 32bpp VRAM. Mirrored writes.
// NOTE(review): this branch also catches every remaining size difference
// (for example -2 or -3), not only the case named above — confirm that is intended.
int span_iteration = pixel_offset >> 2;
span_iteration = span_iteration * 2;
int span_pixel = span_iteration * 2;
// Prefer the pixel two positions later when it is still inside the effective width.
if (span_pixel + 2 < info.vram_effective_width)
span_pixel += 2;
if (span_pixel >= info.vram_effective_width)
return;
pixel_offset_splat = span_pixel;
}
// Source byte address: pixel index shifted by (vram_size - 1), added to the base address.
int rdram_addr = info.vram_addr + (pixel_offset_splat << (info.vram_size - 1));
// Odd behavior when we have unaligned TLUT uploads.
// For an odd address, each of the four elements in a group reads 2 bytes further along;
// for an even address this adds nothing.
rdram_addr += 2 * (rdram_addr & 1) * (pixel_offset & 3);
uint word;
if ((rdram_addr & 1) == 0)
// 2-byte aligned: one 16-bit load; XOR 1 swaps the two halves of each 32-bit word.
word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
else
// Odd address: build the value from two bytes, high byte first, using XOR-3 byte addressing.
word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);
current_tmem_value = word;
tmem_dirty = true;
}
// Entry point. Each invocation owns one 16-bit TMEM element and replays every queued
// upload against it in order, recording the element's value after each step.
void main()
{
    uint slot = gl_GlobalInvocationID.x;

    // The update functions work on the element index with its lowest bit flipped.
    int tmem16_index = int(slot) ^ 1;
    bool upper_tmem = tmem16_index >= 0x400;

    // Start from the current TMEM contents; instance 0 is the state before any upload.
    tmem_dirty = false;
    current_tmem_value = uint(tmem16.data[slot]);
    tile_instances.instances[0].data[slot] = mem_u16(current_tmem_value);

    int upload_count = registers.num_uploads;
    for (int upload = 0; upload < upload_count; upload++)
    {
        UploadInfo info = upload_info[upload];
        bool yuv = info.tmem_fmt == TEXTURE_FMT_YUV;

        if (info.mode == UPLOAD_MODE_TLUT)
            update_tmem_lut(info, tmem16_index);
        else if (info.tmem_size == 3 || yuv)
            // 32bpp and YUV uploads split their data across both TMEM halves,
            // so the index is reduced to a position within one half.
            update_tmem_32(info, tmem16_index & 0x3ff, upper_tmem, yuv);
        else
            update_tmem_16(info, tmem16_index);

        // Snapshot the element as it stands after this upload.
        tile_instances.instances[upload + 1].data[slot] = mem_u16(current_tmem_value);
    }

    // Write back to the live TMEM only if some upload actually touched this element.
    if (tmem_dirty)
        tmem16.data[slot] = mem_u16(current_tmem_value);
}