daedalus/Source/Core/JpegTask.cpp
2022-03-22 18:06:17 +11:00

620 lines
18 KiB
C++

/**
* Mupen64 hle rsp - jpeg.c
* Copyright (C) 2012 Bobby Smiles *
* Copyright (C) 2009 Richard Goedeken *
* Copyright (C) 2002 Hacktarux
*
* Mupen64 homepage: http://mupen64.emulation64.com
* email address: hacktarux@yahoo.fr
*
* If you want to contribute to the project please contact
* me first (maybe someone is already making what you are
* planning to do).
*
*
* This program is free software; you can redistribute it and/
* or modify it under the terms of the GNU General Public Li-
* cence as published by the Free Software Foundation; either
* version 2 of the Licence, or any later version.
*
* This program is distributed in the hope that it will be use-
* ful, but WITHOUT ANY WARRANTY; without even the implied war-
* ranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public
* Licence along with this program; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
**/
#include "BuildOptions.h"
#include "Base/Types.h"
#include <stdlib.h>
#include <string.h>
#include "Core/Memory.h"
#include "Core/RDRam.h"
#include "Debug/DBGConsole.h"
#include "Ultra/ultra_sptask.h"
/* Number of s16 coefficients in one sub-block (an 8x8 grid). */
#define SUBBLOCK_SIZE 64
/*
 * Callback that converts one 16-pixel tile line and stores it at `address`.
 * `y` points at 8 luma samples (the next 8 are read SUBBLOCK_SIZE entries
 * further on); `u` points at 8 U samples, with the matching V samples
 * SUBBLOCK_SIZE entries further on.
 */
using tile_line_emitter_t = void (*)(const s16 *y, const s16 *u, u32 address);
/* pixel conversion & formatting */
static u32 GetUYVY(s16 y1, s16 y2, s16 u, s16 v);
static u16 GetRGBA(s16 y, s16 u, s16 v);
/* tile line emitters */
static void EmitYUVTileLine(const s16 *y, const s16 *u, u32 address);
//static void EmitYUVTileLine_SwapY1Y2(const s16 *y, const s16 *u, u32 address);
static void EmitRGBATileLine(const s16 *y, const s16 *u, u32 address);
/* macroblock operations */
static void DecodeMacroblockOB(s16 *macroblock, s32 *y_dc, s32 *u_dc, s32 *v_dc, const s16 *qtable);
static void DecodeMacroblockPS(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE]);
static void DecodeMacroblockPS0(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE]);
static void EmitTilesMode0(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address);
static void EmitTilesMode2(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address);
/* sub-block operations (each works on SUBBLOCK_SIZE coefficients) */
static void TransposeSubBlock(s16 *dst, const s16 *src);
static void ZigZagSubBlock(s16 *dst, const s16 *src);
static void ReorderSubBlock(s16 *dst, const s16 *src, const u32 *table);
static void MultSubBlocks(s16 *dst, const s16 *src1, const s16 *src2, u32 shift);
static void ScaleSubBlock(s16 *dst, const s16 *src, s16 scale);
static void RShiftSubBlock(s16 *dst, const s16 *src, u32 shift);
static void InverseDCT1D(const float * const x, float *dst, u32 stride);
static void InverseDCTSubBlock(s16 *dst, const s16 *src);
static void RescaleYSubBlock(s16 *dst, const s16 *src);
static void RescaleUVSubBlock(s16 *dst, const s16 *src);
/*
 * Transposed dequantization table.
 * Used only by jpeg_decode_OB, which derives its working table from this one
 * (multiplying by qscale when it is positive, right-shifting by -qscale when
 * it is negative).
 * NOTE(review): the values look like the transpose of the example luminance
 * quantization table from ITU-T T.81 Annex K -- confirm against the spec.
 */
const s16 DEFAULT_QTABLE[SUBBLOCK_SIZE] =
{
16, 12, 14, 14, 18, 24, 49, 72,
11, 12, 13, 17, 22, 35, 64, 92,
10, 14, 16, 22, 37, 55, 78, 95,
16, 19, 24, 29, 56, 64, 87, 98,
24, 26, 40, 51, 68, 81, 103, 112,
40, 58, 57, 87, 109, 104, 121, 100,
51, 60, 69, 80, 103, 113, 120, 103,
61, 55, 56, 62, 77, 92, 101, 99
};
/*
 * Zig-zag indices.
 * Consumed by ReorderSubBlock as dst[i] = src[ZIGZAG_TABLE[i]]: entry i is
 * the position, in the zig-zag ordered input, of the coefficient that belongs
 * at row-major position i of the 8x8 sub-block.
 */
const u32 ZIGZAG_TABLE[SUBBLOCK_SIZE] =
{
0, 1, 5, 6, 14, 15, 27, 28,
2, 4, 7, 13, 16, 26, 29, 42,
3, 8, 12, 17, 25, 30, 41, 43,
9, 11, 18, 24, 31, 40, 44, 53,
10, 19, 23, 32, 39, 45, 52, 54,
20, 22, 33, 38, 46, 51, 55, 60,
21, 34, 37, 47, 50, 56, 59, 61,
35, 36, 48, 49, 57, 58, 62, 63
};
/*
 * Transposition indices.
 * Consumed by ReorderSubBlock as dst[i] = src[TRANSPOSE_TABLE[i]]: the entry
 * for (row r, column c) is c*8 + r, so rows and columns of the 8x8 sub-block
 * are swapped.
 */
const u32 TRANSPOSE_TABLE[SUBBLOCK_SIZE] =
{
0, 8, 16, 24, 32, 40, 48, 56,
1, 9, 17, 25, 33, 41, 49, 57,
2, 10, 18, 26, 34, 42, 50, 58,
3, 11, 19, 27, 35, 43, 51, 59,
4, 12, 20, 28, 36, 44, 52, 60,
5, 13, 21, 29, 37, 45, 53, 61,
6, 14, 22, 30, 38, 46, 54, 62,
7, 15, 23, 31, 39, 47, 55, 63
};
/***************************************************************************
 * JPEG decoding ucode found in the Japan-exclusive version of Pokemon Stadium.
 *
 * task->t.data_ptr is the RDRAM address of six 32-bit parameters:
 *   +0   address of the macroblocks (each one is decoded in place)
 *   +4   number of macroblocks
 *   +8   mode: 0 -> 4 sub-blocks per macroblock, 2 -> 6 sub-blocks
 *   +12  address of the Y quantization table
 *   +16  address of the U quantization table
 *   +20  address of the V quantization table
 *
 * Every macroblock is read, decoded with DecodeMacroblockPS0 and written
 * back to the address it was read from as UYVY tile lines.
 **************************************************************************/
void jpeg_decode_PS0(OSTask *task)
{
    s16 qtables[3][SUBBLOCK_SIZE];

    /* Flag bit 0 is treated as a yield request, which is unsupported here. */
    if (task->t.flags & 0x1)
    {
        DBGConsole_Msg(0, "jpeg_decode_PS0: task yielding not implemented");
        return;
    }

    u32 address = rdram_read_u32((u32)task->t.data_ptr);
    const u32 macroblock_count = rdram_read_u32((u32)task->t.data_ptr + 4);
    const u32 mode = rdram_read_u32((u32)task->t.data_ptr + 8);
    const u32 qtableY_ptr = rdram_read_u32((u32)task->t.data_ptr + 12);
    const u32 qtableU_ptr = rdram_read_u32((u32)task->t.data_ptr + 16);
    const u32 qtableV_ptr = rdram_read_u32((u32)task->t.data_ptr + 20);

    /* Rejecting other modes also keeps subblock_count within the 6 sub-blocks
     * the local macroblock buffer can hold. */
    if (mode != 0 && mode != 2)
    {
        /* mode is a u32, so print it with an unsigned conversion. */
        DBGConsole_Msg(0, "jpeg_decode_PS0: invalid mode %u", mode);
        return;
    }

    rdram_read_many_u16((u16*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);

    void (*EmitTilesMode)(const tile_line_emitter_t, const s16 *, u32);
    if (mode == 0) EmitTilesMode = EmitTilesMode0;
    else EmitTilesMode = EmitTilesMode2;

    const u32 subblock_count = mode + 4;
    const u32 macroblock_size = subblock_count * SUBBLOCK_SIZE;

    /* macroblock contains at most 6 subblocks */
    s16 macroblock[6 * SUBBLOCK_SIZE];

    for (u32 mb = 0; mb < macroblock_count; ++mb)
    {
        rdram_read_many_u16((u16*)macroblock, address, macroblock_size);
        DecodeMacroblockPS0(macroblock, subblock_count, (const s16 (*)[SUBBLOCK_SIZE])qtables);
        EmitTilesMode(EmitYUVTileLine, macroblock, address);

        /* macroblock_size counts 16-bit entries; the address advances in bytes. */
        address += 2 * macroblock_size;
    }
}
/***************************************************************************
 * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
 * Pokemon Stadium 2.
 *
 * task->t.data_ptr is the RDRAM address of six 32-bit parameters:
 *   +0   address of the macroblocks (each one is decoded in place)
 *   +4   number of macroblocks
 *   +8   mode: 0 -> 4 sub-blocks per macroblock, 2 -> 6 sub-blocks
 *   +12  address of the Y quantization table
 *   +16  address of the U quantization table
 *   +20  address of the V quantization table
 *
 * Every macroblock is read, decoded with DecodeMacroblockPS and written
 * back to the address it was read from as 16-bit RGBA tile lines.
 **************************************************************************/
void jpeg_decode_PS(OSTask *task)
{
    s16 qtables[3][SUBBLOCK_SIZE];

    /* Flag bit 0 is treated as a yield request, which is unsupported here. */
    if (task->t.flags & 0x1)
    {
        DBGConsole_Msg(0, "jpeg_decode_PS: task yielding not implemented");
        return;
    }

    u32 address = rdram_read_u32((u32)task->t.data_ptr);
    const u32 macroblock_count = rdram_read_u32((u32)task->t.data_ptr + 4);
    const u32 mode = rdram_read_u32((u32)task->t.data_ptr + 8);
    const u32 qtableY_ptr = rdram_read_u32((u32)task->t.data_ptr + 12);
    const u32 qtableU_ptr = rdram_read_u32((u32)task->t.data_ptr + 16);
    const u32 qtableV_ptr = rdram_read_u32((u32)task->t.data_ptr + 20);

    /* Rejecting other modes also keeps subblock_count within the 6 sub-blocks
     * the local macroblock buffer can hold. */
    if (mode != 0 && mode != 2)
    {
        /* mode is a u32, so print it with an unsigned conversion. */
        DBGConsole_Msg(0, "jpeg_decode_PS: invalid mode %u", mode);
        return;
    }

    rdram_read_many_u16((u16*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);

    void (*EmitTilesMode)(const tile_line_emitter_t, const s16 *, u32);
    if (mode == 0) EmitTilesMode = EmitTilesMode0;
    else EmitTilesMode = EmitTilesMode2;

    const u32 subblock_count = mode + 4;
    const u32 macroblock_size = subblock_count * SUBBLOCK_SIZE;

    /* macroblock contains at most 6 subblocks */
    s16 macroblock[6 * SUBBLOCK_SIZE];

    for (u32 mb = 0; mb < macroblock_count; ++mb)
    {
        rdram_read_many_u16((u16*)macroblock, address, macroblock_size);
        DecodeMacroblockPS(macroblock, subblock_count, (const s16 (*)[SUBBLOCK_SIZE])qtables);
        EmitTilesMode(EmitRGBATileLine, macroblock, address);

        /* macroblock_size counts 16-bit entries; the address advances in bytes. */
        address += 2 * macroblock_size;
    }
}
/***************************************************************************
 * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
 *
 * The task fields are used directly rather than pointing at a parameter
 * block: data_ptr is the address of the macroblocks, data_size the number of
 * macroblocks and yield_data_size the quantization scale (qscale).
 *
 *   qscale > 0  : DEFAULT_QTABLE multiplied by qscale
 *   qscale < 0  : DEFAULT_QTABLE shifted right by -qscale
 *   qscale == 0 : no dequantization table is applied
 *
 * Macroblocks always hold 6 sub-blocks and are decoded in place to UYVY.
 * The DC accumulators persist across the macroblocks of one task.
 **************************************************************************/
void jpeg_decode_OB(OSTask *task)
{
    s16 qtable[SUBBLOCK_SIZE];
    const s16 *active_qtable = nullptr;

    s32 y_dc = 0;
    s32 u_dc = 0;
    s32 v_dc = 0;

    u32 address = (u32)task->t.data_ptr;
    const u32 macroblock_count = task->t.data_size;
    const int qscale = task->t.yield_data_size;

    /* Build the working table only when a scale was requested. */
    if (qscale > 0)
    {
        ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
        active_qtable = qtable;
    }
    else if (qscale < 0)
    {
        RShiftSubBlock(qtable, DEFAULT_QTABLE, -qscale);
        active_qtable = qtable;
    }

    for (u32 remaining = macroblock_count; remaining != 0; --remaining)
    {
        s16 macroblock[6 * SUBBLOCK_SIZE];

        rdram_read_many_u16((u16*)macroblock, address, 6 * SUBBLOCK_SIZE);
        DecodeMacroblockOB(macroblock, &y_dc, &u_dc, &v_dc, active_qtable);
        EmitTilesMode2(EmitYUVTileLine, macroblock, address);

        /* 6 sub-blocks of 16-bit entries, expressed in bytes. */
        address += (2 * 6 * SUBBLOCK_SIZE);
    }
}
/*
 * Packs two luma samples and one chroma pair into a 32-bit word laid out,
 * from most to least significant byte, as U, Y1, V, Y2. Every component is
 * passed through clamp_u8 first.
 */
static u32 GetUYVY(s16 y1, s16 y2, s16 u, s16 v)
{
    u32 packed = (u32)clamp_u8(u);
    packed = (packed << 8) | (u32)clamp_u8(y1);
    packed = (packed << 8) | (u32)clamp_u8(v);
    packed = (packed << 8) | (u32)clamp_u8(y2);
    return packed;
}
/*
 * Converts one Y/U/V sample triple to a packed 16-bit RGBA pixel.
 * The luma sample is biased by 2048 before the colour matrix is applied, and
 * the lowest bit of the result is always set.
 * NOTE(review): the shifts (<<4, >>1, >>6) presumably line up 5-bit
 * components returned by clamp_RGBA_component into an RGBA 5551 pixel --
 * confirm against that helper.
 */
static u16 GetRGBA(s16 y, s16 u, s16 v)
{
    const float luma = (float)y + 2048.0f;
    const float chroma_u = (float)u;
    const float chroma_v = (float)v;

    /* The unsuffixed coefficients keep this arithmetic in double precision. */
    const s16 red_raw = (s16)(luma + 1.4025*chroma_v);
    const s16 green_raw = (s16)(luma - 0.3443*chroma_u - 0.7144*chroma_v);
    const s16 blue_raw = (s16)(luma + 1.7729*chroma_u );

    const u16 red = clamp_RGBA_component(red_raw);
    const u16 green = clamp_RGBA_component(green_raw);
    const u16 blue = clamp_RGBA_component(blue_raw);

    return (red << 4) | (green >> 1) | (blue >> 6) | 1;
}
static void EmitYUVTileLine(const s16 *y, const s16 *u, u32 address)
{
u32 uyvy[8];
const s16 *const v = u + SUBBLOCK_SIZE;
const s16 *const y2 = y + SUBBLOCK_SIZE;
uyvy[0] = GetUYVY(y[0], y[1], u[0], v[0]);
uyvy[1] = GetUYVY(y[2], y[3], u[1], v[1]);
uyvy[2] = GetUYVY(y[4], y[5], u[2], v[2]);
uyvy[3] = GetUYVY(y[6], y[7], u[3], v[3]);
uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);
rdram_write_many_u32(uyvy, address, 8);
}
/*
static void EmitYUVTileLine_SwapY1Y2(const s16 *y, const s16 *u, u32 address)
{
u32 uyvy[8];
const s16 * const v = u + SUBBLOCK_SIZE;
const s16 * const y2 = y + SUBBLOCK_SIZE;
uyvy[0] = GetUYVY(y[1], y[0], u[0], v[0]);
uyvy[1] = GetUYVY(y[3], y[2], u[1], v[1]);
uyvy[2] = GetUYVY(y[5], y[4], u[2], v[2]);
uyvy[3] = GetUYVY(y[7], y[6], u[3], v[3]);
uyvy[4] = GetUYVY(y2[1], y2[0], u[4], v[4]);
uyvy[5] = GetUYVY(y2[3], y2[2], u[5], v[5]);
uyvy[6] = GetUYVY(y2[5], y2[4], u[6], v[6]);
uyvy[7] = GetUYVY(y2[7], y2[6], u[7], v[7]);
rdram_write_many_u32(uyvy, address, 8);
}
*/
static void EmitRGBATileLine(const s16 *y, const s16 *u, u32 address)
{
u16 rgba[16];
const s16 * const v = u + SUBBLOCK_SIZE;
const s16 * const y2 = y + SUBBLOCK_SIZE;
rgba[0] = GetRGBA(y[0], u[0], v[0]);
rgba[1] = GetRGBA(y[1], u[0], v[0]);
rgba[2] = GetRGBA(y[2], u[1], v[1]);
rgba[3] = GetRGBA(y[3], u[1], v[1]);
rgba[4] = GetRGBA(y[4], u[2], v[2]);
rgba[5] = GetRGBA(y[5], u[2], v[2]);
rgba[6] = GetRGBA(y[6], u[3], v[3]);
rgba[7] = GetRGBA(y[7], u[3], v[3]);
rgba[8] = GetRGBA(y2[0], u[4], v[4]);
rgba[9] = GetRGBA(y2[1], u[4], v[4]);
rgba[10] = GetRGBA(y2[2], u[5], v[5]);
rgba[11] = GetRGBA(y2[3], u[5], v[5]);
rgba[12] = GetRGBA(y2[4], u[6], v[6]);
rgba[13] = GetRGBA(y2[5], u[6], v[6]);
rgba[14] = GetRGBA(y2[6], u[7], v[7]);
rgba[15] = GetRGBA(y2[7], u[7], v[7]);
rdram_write_many_u16(rgba, address, 16);
}
static void EmitTilesMode0(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address)
{
u32 y_offset = 0;
u32 u_offset = 2 * SUBBLOCK_SIZE;
for (u32 i = 0; i < 8; ++i)
{
emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
y_offset += 8;
u_offset += 8;
address += 32;
}
}
static void EmitTilesMode2(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address)
{
u32 y_offset = 0;
u32 u_offset = 4 * SUBBLOCK_SIZE;
for (u32 i = 0; i < 8; ++i)
{
emit_line(&macroblock[y_offset], &macroblock[u_offset], address);
emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);
y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
u_offset += 8;
address += 64;
}
}
/*
 * Decodes the 6 sub-blocks of one macroblock in place.
 *
 * The first entry of each sub-block is a DC delta: it is added to the running
 * accumulator of its plane (sub-blocks 0-3 -> *y_dc, 4 -> *u_dc, 5 -> *v_dc)
 * and replaced by the low 16 bits of the accumulated value. The sub-block is
 * then reordered out of zig-zag order, multiplied by `qtable` when one is
 * given (nullptr skips that step), transposed and inverse transformed.
 */
static void DecodeMacroblockOB(s16 *macroblock, s32 *y_dc, s32 *u_dc, s32 *v_dc, const s16 *qtable)
{
    s16 *block = macroblock;

    for (int index = 0; index < 6; ++index, block += SUBBLOCK_SIZE)
    {
        s16 scratch[SUBBLOCK_SIZE];

        /* pick the DC accumulator of the plane this sub-block belongs to */
        s32 *accumulator;
        if (index < 4)
        {
            accumulator = y_dc;
        }
        else if (index == 4)
        {
            accumulator = u_dc;
        }
        else
        {
            accumulator = v_dc;
        }

        *accumulator += (s32)block[0];
        block[0] = *accumulator & 0xffff;

        ZigZagSubBlock(scratch, block);
        if (qtable != nullptr)
        {
            MultSubBlocks(scratch, scratch, qtable, 0);
        }
        TransposeSubBlock(block, scratch);
        InverseDCTSubBlock(block, block);
    }
}
/*
 * Decodes `subblock_count` sub-blocks of one macroblock in place.
 *
 * The last two sub-blocks are the chroma ones: qtables[0] is applied to every
 * sub-block before them, qtables[1] to the second-to-last and qtables[2] to
 * the last. Each sub-block is multiplied by its table (the clamped product is
 * shifted left by 4), reordered out of zig-zag order and inverse transformed.
 */
static void DecodeMacroblockPS(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE])
{
    u32 table_index = 0;
    s16 *block = macroblock;

    for (u32 remaining = subblock_count; remaining > 0; --remaining)
    {
        s16 reordered[SUBBLOCK_SIZE];

        /* entering the chroma sub-blocks: move on to the next table */
        if (remaining <= 2)
        {
            ++table_index;
        }

        MultSubBlocks(block, block, qtables[table_index], 4);
        ZigZagSubBlock(reordered, block);
        InverseDCTSubBlock(block, reordered);

        block += SUBBLOCK_SIZE;
    }
}
/*
 * Decodes `subblock_count` sub-blocks of one macroblock in place.
 *
 * Same pipeline as DecodeMacroblockPS (dequantize with a left shift of 4,
 * reorder out of zig-zag order, inverse transform), followed by a rescale of
 * the result: RescaleUVSubBlock for the last two (chroma) sub-blocks and
 * RescaleYSubBlock for the others. qtables[0] serves the luma sub-blocks,
 * qtables[1] and qtables[2] the two chroma ones.
 */
static void DecodeMacroblockPS0(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE])
{
    u32 table_index = 0;
    s16 *block = macroblock;

    for (u32 remaining = subblock_count; remaining > 0; --remaining)
    {
        s16 reordered[SUBBLOCK_SIZE];
        const bool is_chroma = (remaining <= 2);

        if (is_chroma)
        {
            ++table_index;
        }

        MultSubBlocks(block, block, qtables[table_index], 4);
        ZigZagSubBlock(reordered, block);
        InverseDCTSubBlock(block, reordered);

        if (is_chroma)
        {
            RescaleUVSubBlock(block, block);
        }
        else
        {
            RescaleYSubBlock(block, block);
        }

        block += SUBBLOCK_SIZE;
    }
}
/* Writes the transpose of the 8x8 sub-block `src` to `dst` (must not overlap). */
static void TransposeSubBlock(s16 *dst, const s16 *src)
{
    ReorderSubBlock(dst, src, TRANSPOSE_TABLE);
}

/* Gathers `src` into `dst` through ZIGZAG_TABLE (must not overlap). */
static void ZigZagSubBlock(s16 *dst, const s16 *src)
{
    ReorderSubBlock(dst, src, ZIGZAG_TABLE);
}

/*
 * Gathers one sub-block through an index table: dst[i] = src[table[i]] for
 * each of the SUBBLOCK_SIZE entries.
 * `dst` and `src` must not overlap, because entries are written while later
 * source entries are still to be read. This precondition is not checked.
 */
static void ReorderSubBlock(s16 *dst, const s16 *src, const u32 *table)
{
    const u32 *const table_end = table + SUBBLOCK_SIZE;

    for (const u32 *index = table; index != table_end; ++index)
    {
        *dst++ = src[*index];
    }
}
/*
 * Element-wise product of two sub-blocks: each product is clamped with
 * clamp_s16, shifted left by `shift` and stored in `dst`.
 * Callers pass dst == src1 to work in place; that is fine because every
 * entry is read before the entry at the same index is written.
 * NOTE(review): the left shift is applied after clamping, so a clamped value
 * can still leave the s16 range (and may be negative) -- confirm this matches
 * the ucode being emulated.
 */
static void MultSubBlocks(s16 *dst, const s16 *src1, const s16 *src2, u32 shift)
{
    u32 idx = 0;
    while (idx < SUBBLOCK_SIZE)
    {
        const s32 product = src1[idx] * src2[idx];
        dst[idx] = clamp_s16(product) << shift;
        ++idx;
    }
}
/*
 * Multiplies every entry of `src` by `scale` and stores the result, clamped
 * with clamp_s16, in `dst`.
 */
static void ScaleSubBlock(s16 *dst, const s16 *src, s16 scale)
{
    u32 idx = 0;
    while (idx < SUBBLOCK_SIZE)
    {
        const s32 scaled = src[idx] * scale;
        dst[idx] = clamp_s16(scaled);
        ++idx;
    }
}
/*
 * Shifts every entry of `src` right by `shift` bits and stores it in `dst`.
 *
 * `shift` comes from the task's qscale in jpeg_decode_OB and is not range
 * checked there. Shifting the promoted operand by its width or more is
 * undefined behaviour, so the count is capped at 15: that already moves every
 * value bit of a 16-bit entry out, so larger counts give the same result
 * (0 for non-negative entries, -1 for negative ones with an arithmetic
 * shift).
 */
static void RShiftSubBlock(s16 *dst, const s16 *src, u32 shift)
{
    const u32 max_shift = 15;
    const u32 effective_shift = (shift < max_shift) ? shift : max_shift;

    for (u32 i = 0; i < SUBBLOCK_SIZE; ++i)
    {
        dst[i] = src[i] >> effective_shift;
    }
}
/***************************************************************************
* Fast 2D IDCT using separable formulation and normalization
* Computations use single precision floats
* Implementation based on Wikipedia :
* http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
*
* The constants below are combinations of Ck = sqrt(2) * cos(k * pi / 16),
* as spelled out in the comment next to each of them. They are local to
* InverseDCT1D and are #undef'd right after it.
**************************************************************************/
/* Normalized such as C4 = 1 */
#define C3 1.175875602f
#define C6 0.541196100f
#define K1 0.765366865f // C2-C6
#define K2 -1.847759065f // -C2-C6
#define K3 -0.390180644f // C5-C3
#define K4 -1.961570561f // -C5-C3
#define K5 1.501321110f // C1+C3-C5-C7
#define K6 2.053119869f // C1+C3-C5+C7
#define K7 3.072711027f // C1+C3+C5-C7
#define K8 0.298631336f // -C1+C3+C5-C7
#define K9 -0.899976223f // C7-C3
#define K10 -2.562915448f // -C1-C3
/*
 * 8-point 1D inverse DCT.
 * x      : the 8 input coefficients.
 * dst    : receives the 8 outputs; output k is stored at dst[k * stride].
 * stride : distance, in floats, between consecutive outputs. A stride of 8
 *          writes a column of a row-major 8x8 block, which is how
 *          InverseDCTSubBlock transposes while transforming.
 * The result is not normalized; InverseDCTSubBlock divides by 8 after the
 * second pass.
 */
static void InverseDCT1D(const float * const x, float *dst, u32 stride)
{
float e[4];
float f[4];
float x26, x1357, x15, x37, x17, x35;
/* partial sums shared by several terms of the odd part */
x15 = K3 * (x[1] + x[5]);
x37 = K4 * (x[3] + x[7]);
x17 = K9 * (x[1] + x[7]);
x35 = K10 * (x[3] + x[5]);
x1357 = C3 * (x[1] + x[3] + x[5] + x[7]);
x26 = C6 * (x[2] + x[6]);
/* f: contribution of the even-indexed inputs (0, 2, 4, 6) */
f[0] = x[0] + x[4];
f[1] = x[0] - x[4];
f[2] = x26 + K1*x[2];
f[3] = x26 + K2*x[6];
/* e: contribution of the odd-indexed inputs (1, 3, 5, 7) */
e[0] = x1357 + x15 + K5*x[1] + x17;
e[1] = x1357 + x37 + K7*x[3] + x35;
e[2] = x1357 + x15 + K6*x[5] + x35;
e[3] = x1357 + x37 + K8*x[7] + x17;
/* outputs k and 7-k use the same even term, plus or minus the same odd term */
*dst = f[0] + f[2] + e[0]; dst += stride;
*dst = f[1] + f[3] + e[1]; dst += stride;
*dst = f[1] - f[3] + e[2]; dst += stride;
*dst = f[0] - f[2] + e[3]; dst += stride;
*dst = f[0] - f[2] - e[3]; dst += stride;
*dst = f[1] - f[3] - e[2]; dst += stride;
*dst = f[1] + f[3] - e[1]; dst += stride;
*dst = f[0] + f[2] - e[0]; dst += stride;
}
#undef C3
#undef C6
#undef K1
#undef K2
#undef K3
#undef K4
#undef K5
#undef K6
#undef K7
#undef K8
#undef K9
#undef K10
/*
 * 2D inverse DCT of one 8x8 sub-block, done as two passes of InverseDCT1D.
 *
 * The first pass transforms each row of `src` and stores the result as a
 * column of a float work buffer, so the second pass can again read
 * contiguous rows. All of `src` is consumed before anything is written to
 * `dst`, which is why callers may pass dst == src.
 */
static void InverseDCTSubBlock(s16 *dst, const s16 *src)
{
    float line[8];
    float work[SUBBLOCK_SIZE];

    /* pass 1: rows of src -> columns of work */
    for (u32 row = 0; row < 8; ++row)
    {
        const s16 *const in = src + row * 8;
        for (u32 k = 0; k < 8; ++k)
        {
            line[k] = (float)in[k];
        }
        InverseDCT1D(line, work + row, 8);
    }

    /* pass 2: rows of work -> columns of dst */
    for (u32 col = 0; col < 8; ++col)
    {
        InverseDCT1D(work + col * 8, line, 1);

        /* C4 = 1 normalization implies a division by 8 */
        s16 *out = dst + col;
        for (u32 k = 0; k < 8; ++k)
        {
            *out = (s16)line[k] >> 3;
            out += 8;
        }
    }
}
/*
 * Rescales every luma entry of a sub-block: the value is clamped with
 * clamp_s12, biased by 0x800, multiplied by 0xdb0 / 65536 and offset by
 * 0x10. Works in place when dst == src.
 */
static void RescaleYSubBlock(s16 *dst, const s16 *src)
{
    for (u32 idx = 0; idx < SUBBLOCK_SIZE; ++idx)
    {
        const u32 biased = (u32)(clamp_s12(src[idx]) + 0x800);
        const u32 scaled = (biased * 0xdb0) >> 16;
        dst[idx] = scaled + 0x10;
    }
}
/*
 * Rescales every chroma entry of a sub-block: the value is clamped with
 * clamp_s12, multiplied by 0xe00 / 65536 (signed arithmetic) and offset by
 * 0x80. Works in place when dst == src.
 */
static void RescaleUVSubBlock(s16 *dst, const s16 *src)
{
    for (u32 idx = 0; idx < SUBBLOCK_SIZE; ++idx)
    {
        const int clamped = (int)clamp_s12(src[idx]);
        const int scaled = (clamped * 0xe00) >> 16;
        dst[idx] = scaled + 0x80;
    }
}
}