/**
* Mupen64 hle rsp - jpeg.c
* Copyright (C) 2012 Bobby Smiles                                       *
* Copyright (C) 2009 Richard Goedeken                                   *
* Copyright (C) 2002 Hacktarux
*
* Mupen64 homepage: http://mupen64.emulation64.com
* email address: hacktarux@yahoo.fr
*
* If you want to contribute to the project please contact
* me first (maybe someone is already making what you are
* planning to do).
*
*
* This program is free software; you can redistribute it and/
* or modify it under the terms of the GNU General Public Li-
* cence as published by the Free Software Foundation; either
* version 2 of the Licence, or any later version.
*
* This program is distributed in the hope that it will be use-
* ful, but WITHOUT ANY WARRANTY; without even the implied war-
* ranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public Licence for more details.
*
* You should have received a copy of the GNU General Public
* Licence along with this program; if not, write to the Free
* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
**/

#include "BuildOptions.h"
#include "Base/Types.h"

#include <stdlib.h>
#include <string.h>

#include "Core/Memory.h"
#include "Core/RDRam.h"
#include "Debug/DBGConsole.h"
#include "Ultra/ultra_sptask.h"

/* Number of 16-bit coefficients in one subblock (an 8x8 block). */
#define SUBBLOCK_SIZE 64
/*
 * Callback emitting one line of pixels: `y` and `u` point into a decoded
 * macroblock and `address` is the RDRAM location the line is written to.
 */
using tile_line_emitter_t = void (*)(const s16 *y, const s16 *u, u32 address);

/* pixel conversion & formatting */
static u32 GetUYVY(s16 y1, s16 y2, s16 u, s16 v);
static u16 GetRGBA(s16 y, s16 u, s16 v);

/* tile line emitters */
static void EmitYUVTileLine(const s16 *y, const s16 *u, u32 address);
//static void EmitYUVTileLine_SwapY1Y2(const s16 *y, const s16 *u, u32 address);
static void EmitRGBATileLine(const s16 *y, const s16 *u, u32 address);

/* macroblocks operations */
static void DecodeMacroblockOB(s16 *macroblock, s32 *y_dc, s32 *u_dc, s32 *v_dc, const s16 *qtable);
static void DecodeMacroblockPS(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE]);
static void DecodeMacroblockPS0(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE]);
static void EmitTilesMode0(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address);
static void EmitTilesMode2(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address);

/* subblocks operations (each works on SUBBLOCK_SIZE coefficients) */
static void TransposeSubBlock(s16 *dst, const s16 *src);
static void ZigZagSubBlock(s16 *dst, const s16 *src);
static void ReorderSubBlock(s16 *dst, const s16 *src, const u32 *table);
static void MultSubBlocks(s16 *dst, const s16 *src1, const s16 *src2, u32 shift);
static void ScaleSubBlock(s16 *dst, const s16 *src, s16 scale);
static void RShiftSubBlock(s16 *dst, const s16 *src, u32 shift);
static void InverseDCT1D(const float * const x, float *dst, u32 stride);
static void InverseDCTSubBlock(s16 *dst, const s16 *src);
static void RescaleYSubBlock(s16 *dst, const s16 *src);
static void RescaleUVSubBlock(s16 *dst, const s16 *src);

/*
 * Transposed dequantization table.
 * jpeg_decode_OB builds its working table from these values: every entry is
 * multiplied by qscale (ScaleSubBlock) when qscale > 0, or shifted right by
 * -qscale (RShiftSubBlock) when qscale < 0. The table is not used at all
 * when qscale == 0.
 */
const s16 DEFAULT_QTABLE[SUBBLOCK_SIZE] =
{
    16, 12, 14, 14,  18,  24,  49,  72,
    11, 12, 13, 17,  22,  35,  64,  92,
    10, 14, 16, 22,  37,  55,  78,  95,
    16, 19, 24, 29,  56,  64,  87,  98,
    24, 26, 40, 51,  68,  81, 103, 112,
    40, 58, 57, 87, 109, 104, 121, 100,
    51, 60, 69, 80, 103, 113, 120, 103,
    61, 55, 56, 62,  77,  92, 101,  99
};

/*
 * Zig-zag indices.
 * Consumed by ZigZagSubBlock through ReorderSubBlock, i.e. the reordered
 * block is built as dst[i] = src[ZIGZAG_TABLE[i]]: entry i is the position
 * in the incoming coefficient stream of the coefficient stored at index i.
 */
const u32 ZIGZAG_TABLE[SUBBLOCK_SIZE] =
{
     0,  1,  5,  6, 14, 15, 27, 28,
     2,  4,  7, 13, 16, 26, 29, 42,
     3,  8, 12, 17, 25, 30, 41, 43,
     9, 11, 18, 24, 31, 40, 44, 53,
    10, 19, 23, 32, 39, 45, 52, 54,
    20, 22, 33, 38, 46, 51, 55, 60,
    21, 34, 37, 47, 50, 56, 59, 61,
    35, 36, 48, 49, 57, 58, 62, 63
};

/*
 * Transposition indices.
 * Entry i equals (i % 8) * 8 + (i / 8), so feeding this table to
 * ReorderSubBlock (dst[i] = src[table[i]]) swaps rows and columns of an
 * 8x8 subblock. Used by TransposeSubBlock.
 */
const u32 TRANSPOSE_TABLE[SUBBLOCK_SIZE] =
{
    0,  8, 16, 24, 32, 40, 48, 56,
    1,  9, 17, 25, 33, 41, 49, 57,
    2, 10, 18, 26, 34, 42, 50, 58,
    3, 11, 19, 27, 35, 43, 51, 59,
    4, 12, 20, 28, 36, 44, 52, 60,
    5, 13, 21, 29, 37, 45, 53, 61,
    6, 14, 22, 30, 38, 46, 54, 62,
    7, 15, 23, 31, 39, 47, 55, 63
};

/***************************************************************************
 * JPEG decoding ucode found in Japanese exclusive version of Pokemon Stadium.
 *
 * The task's data_ptr designates a parameter block in RDRAM:
 *   +0   address of the macroblocks
 *   +4   number of macroblocks
 *   +8   mode: 0 (4 subblocks per macroblock) or 2 (6 subblocks)
 *   +12  address of the Y quantization table
 *   +16  address of the U quantization table
 *   +20  address of the V quantization table
 *
 * Each macroblock is read, decoded with DecodeMacroblockPS0 and emitted as
 * UYVY lines starting at the address it was read from.
 * Nothing is decoded when the task has its yield flag (bit 0) set or when
 * the mode is neither 0 nor 2; a message is logged instead.
 **************************************************************************/
void jpeg_decode_PS0(OSTask *task)
{
    if (task->t.flags & 0x1)
    {
        DBGConsole_Msg(0, "jpeg_decode_PS0: task yielding not implemented");
        return;
    }

    const u32 params = (u32)task->t.data_ptr;

    u32       address          = rdram_read_u32(params);
    const u32 macroblock_count = rdram_read_u32(params + 4);
    const u32 mode             = rdram_read_u32(params + 8);
    const u32 qtableY_ptr      = rdram_read_u32(params + 12);
    const u32 qtableU_ptr      = rdram_read_u32(params + 16);
    const u32 qtableV_ptr      = rdram_read_u32(params + 20);

    if (mode != 0 && mode != 2)
    {
        /* mode is unsigned and guest controlled: print it as unsigned */
        DBGConsole_Msg(0, "jpeg_decode_PS0: invalid mode %u", (unsigned int)mode);
        return;
    }

    s16 qtables[3][SUBBLOCK_SIZE];
    rdram_read_many_u16((u16*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);

    /* the tile layout depends on the mode, pick the emitter once */
    void (*EmitTilesMode)(const tile_line_emitter_t, const s16 *, u32) =
        (mode == 0) ? EmitTilesMode0 : EmitTilesMode2;

    const u32 subblock_count  = mode + 4;
    const u32 macroblock_size = subblock_count * SUBBLOCK_SIZE;

    /* macroblock contains at most 6 subblocks */
    s16 macroblock[6 * SUBBLOCK_SIZE];

    for (u32 mb = 0; mb < macroblock_count; ++mb)
    {
        rdram_read_many_u16((u16*)macroblock, address, macroblock_size);
        DecodeMacroblockPS0(macroblock, subblock_count, (const s16 (*)[SUBBLOCK_SIZE])qtables);
        EmitTilesMode(EmitYUVTileLine, macroblock, address);

        /* coefficients are 16-bit wide: advance by the macroblock size in bytes */
        address += 2 * macroblock_size;
    }
}


/***************************************************************************
 * JPEG decoding ucode found in Ocarina of Time, Pokemon Stadium 1 and
 * Pokemon Stadium 2.
 *
 * The task's data_ptr designates a parameter block in RDRAM:
 *   +0   address of the macroblocks
 *   +4   number of macroblocks
 *   +8   mode: 0 (4 subblocks per macroblock) or 2 (6 subblocks)
 *   +12  address of the Y quantization table
 *   +16  address of the U quantization table
 *   +20  address of the V quantization table
 *
 * Each macroblock is read, decoded with DecodeMacroblockPS and emitted as
 * 16-bit RGBA lines starting at the address it was read from.
 * Nothing is decoded when the task has its yield flag (bit 0) set or when
 * the mode is neither 0 nor 2; a message is logged instead.
 **************************************************************************/
void jpeg_decode_PS(OSTask *task)
{
    if (task->t.flags & 0x1)
    {
        DBGConsole_Msg(0, "jpeg_decode_PS: task yielding not implemented");
        return;
    }

    const u32 params = (u32)task->t.data_ptr;

    u32       address          = rdram_read_u32(params);
    const u32 macroblock_count = rdram_read_u32(params + 4);
    const u32 mode             = rdram_read_u32(params + 8);
    const u32 qtableY_ptr      = rdram_read_u32(params + 12);
    const u32 qtableU_ptr      = rdram_read_u32(params + 16);
    const u32 qtableV_ptr      = rdram_read_u32(params + 20);

    if (mode != 0 && mode != 2)
    {
        /* mode is unsigned and guest controlled: print it as unsigned */
        DBGConsole_Msg(0, "jpeg_decode_PS: invalid mode %u", (unsigned int)mode);
        return;
    }

    s16 qtables[3][SUBBLOCK_SIZE];
    rdram_read_many_u16((u16*)qtables[0], qtableY_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[1], qtableU_ptr, SUBBLOCK_SIZE);
    rdram_read_many_u16((u16*)qtables[2], qtableV_ptr, SUBBLOCK_SIZE);

    /* the tile layout depends on the mode, pick the emitter once */
    void (*EmitTilesMode)(const tile_line_emitter_t, const s16 *, u32) =
        (mode == 0) ? EmitTilesMode0 : EmitTilesMode2;

    const u32 subblock_count  = mode + 4;
    const u32 macroblock_size = subblock_count * SUBBLOCK_SIZE;

    /* macroblock contains at most 6 subblocks */
    s16 macroblock[6 * SUBBLOCK_SIZE];

    for (u32 mb = 0; mb < macroblock_count; ++mb)
    {
        rdram_read_many_u16((u16*)macroblock, address, macroblock_size);
        DecodeMacroblockPS(macroblock, subblock_count, (const s16 (*)[SUBBLOCK_SIZE])qtables);
        EmitTilesMode(EmitRGBATileLine, macroblock, address);

        /* coefficients are 16-bit wide: advance by the macroblock size in bytes */
        address += 2 * macroblock_size;
    }
}

/***************************************************************************
 * JPEG decoding ucode found in Ogre Battle and Bottom of the 9th.
 *
 * Task fields used:
 *   data_ptr         address of the macroblocks (6 subblocks each)
 *   data_size        number of macroblocks
 *   yield_data_size  qscale, interpreted as a signed value:
 *                      > 0  DEFAULT_QTABLE is multiplied by qscale
 *                      < 0  DEFAULT_QTABLE is shifted right by -qscale
 *                      = 0  no dequantization table is applied
 *
 * The DC predictors are shared by all macroblocks of the task. Each
 * macroblock is emitted as UYVY lines starting at the address it was read
 * from.
 **************************************************************************/
void jpeg_decode_OB(OSTask *task)
{
    u32       address          = (u32)task->t.data_ptr;
    const u32 macroblock_count = task->t.data_size;
    const int qscale           = task->t.yield_data_size;

    /* only filled in (and only handed to the decoder) when qscale != 0 */
    s16 qtable[SUBBLOCK_SIZE];

    if (qscale > 0)
    {
        ScaleSubBlock(qtable, DEFAULT_QTABLE, qscale);
    }
    else if (qscale < 0)
    {
        /* negate in unsigned arithmetic: -qscale overflows for the most
         * negative int, and qscale comes straight from the guest */
        const u32 shift = 0u - (u32)qscale;
        RShiftSubBlock(qtable, DEFAULT_QTABLE, shift);
    }

    const s16 *const active_qtable = (qscale != 0) ? qtable : nullptr;

    /* running DC values, accumulated across every macroblock */
    s32 y_dc = 0;
    s32 u_dc = 0;
    s32 v_dc = 0;

    for (u32 mb = 0; mb < macroblock_count; ++mb)
    {
        s16 macroblock[6 * SUBBLOCK_SIZE];

        rdram_read_many_u16((u16*)macroblock, address, 6 * SUBBLOCK_SIZE);
        DecodeMacroblockOB(macroblock, &y_dc, &u_dc, &v_dc, active_qtable);
        EmitTilesMode2(EmitYUVTileLine, macroblock, address);

        /* 6 subblocks of 16-bit coefficients */
        address += (2 * 6 * SUBBLOCK_SIZE);
    }
}

/*
 * Pack two luma samples and one chroma pair into a 32-bit word.
 * Every input goes through clamp_u8(); from the most significant byte to
 * the least significant one the word holds U, Y1, V, Y2.
 */
static u32 GetUYVY(s16 y1, s16 y2, s16 u, s16 v)
{
    const u32 u_bits  = (u32)clamp_u8(u)  << 24;
    const u32 y1_bits = (u32)clamp_u8(y1) << 16;
    const u32 v_bits  = (u32)clamp_u8(v)  << 8;
    const u32 y2_bits = (u32)clamp_u8(y2);

    return u_bits | y1_bits | v_bits | y2_bits;
}

/*
 * Convert one Y/U/V sample triple to a 16-bit pixel.
 * The luma is biased by 2048 before the colour conversion; each component
 * is truncated to s16 and passed through clamp_RGBA_component(), then red,
 * green and blue are merged with bit 0 forced to 1.
 * NOTE(review): the field layout looks like 5/5/5/1 RGBA, which relies on
 * the range returned by clamp_RGBA_component (not visible here) - confirm.
 */
static u16 GetRGBA(s16 y, s16 u, s16 v)
{
    const float luma = (float)y + 2048.0f;
    const float cb   = (float)u;
    const float cr   = (float)v;

    /* double precision constants: the sums are evaluated as double */
    const s16 red_raw   = (s16)(luma               + 1.4025*cr);
    const s16 green_raw = (s16)(luma - 0.3443*cb - 0.7144*cr);
    const s16 blue_raw  = (s16)(luma + 1.7729*cb             );

    const u16 r = clamp_RGBA_component(red_raw);
    const u16 g = clamp_RGBA_component(green_raw);
    const u16 b = clamp_RGBA_component(blue_raw);

    const int red_field   = r << 4;
    const int green_field = g >> 1;
    const int blue_field  = b >> 6;
    const int low_bit     = 1;

    return red_field | green_field | blue_field | low_bit;
}

static void EmitYUVTileLine(const s16 *y, const s16 *u, u32 address)
{
    u32 uyvy[8];

    const s16 *const v  = u + SUBBLOCK_SIZE;
    const s16 *const y2 = y + SUBBLOCK_SIZE;

    uyvy[0] = GetUYVY(y[0],  y[1],  u[0], v[0]);
    uyvy[1] = GetUYVY(y[2],  y[3],  u[1], v[1]);
    uyvy[2] = GetUYVY(y[4],  y[5],  u[2], v[2]);
    uyvy[3] = GetUYVY(y[6],  y[7],  u[3], v[3]);
    uyvy[4] = GetUYVY(y2[0], y2[1], u[4], v[4]);
    uyvy[5] = GetUYVY(y2[2], y2[3], u[5], v[5]);
    uyvy[6] = GetUYVY(y2[4], y2[5], u[6], v[6]);
    uyvy[7] = GetUYVY(y2[6], y2[7], u[7], v[7]);

    rdram_write_many_u32(uyvy, address, 8);
}
/*
static void EmitYUVTileLine_SwapY1Y2(const s16 *y, const s16 *u, u32 address)
{
    u32 uyvy[8];

    const s16 * const v  = u + SUBBLOCK_SIZE;
    const s16 * const y2 = y + SUBBLOCK_SIZE;

    uyvy[0] = GetUYVY(y[1],  y[0],  u[0], v[0]);
    uyvy[1] = GetUYVY(y[3],  y[2],  u[1], v[1]);
    uyvy[2] = GetUYVY(y[5],  y[4],  u[2], v[2]);
    uyvy[3] = GetUYVY(y[7],  y[6],  u[3], v[3]);
    uyvy[4] = GetUYVY(y2[1], y2[0], u[4], v[4]);
    uyvy[5] = GetUYVY(y2[3], y2[2], u[5], v[5]);
    uyvy[6] = GetUYVY(y2[5], y2[4], u[6], v[6]);
    uyvy[7] = GetUYVY(y2[7], y2[6], u[7], v[7]);

    rdram_write_many_u32(uyvy, address, 8);
}
*/
static void EmitRGBATileLine(const s16 *y, const s16 *u, u32 address)
{
    u16 rgba[16];

    const s16 * const v  = u + SUBBLOCK_SIZE;
    const s16 * const y2 = y + SUBBLOCK_SIZE;

    rgba[0]  = GetRGBA(y[0],  u[0], v[0]);
    rgba[1]  = GetRGBA(y[1],  u[0], v[0]);
    rgba[2]  = GetRGBA(y[2],  u[1], v[1]);
    rgba[3]  = GetRGBA(y[3],  u[1], v[1]);
    rgba[4]  = GetRGBA(y[4],  u[2], v[2]);
    rgba[5]  = GetRGBA(y[5],  u[2], v[2]);
    rgba[6]  = GetRGBA(y[6],  u[3], v[3]);
    rgba[7]  = GetRGBA(y[7],  u[3], v[3]);
    rgba[8]  = GetRGBA(y2[0], u[4], v[4]);
    rgba[9]  = GetRGBA(y2[1], u[4], v[4]);
    rgba[10] = GetRGBA(y2[2], u[5], v[5]);
    rgba[11] = GetRGBA(y2[3], u[5], v[5]);
    rgba[12] = GetRGBA(y2[4], u[6], v[6]);
    rgba[13] = GetRGBA(y2[5], u[6], v[6]);
    rgba[14] = GetRGBA(y2[6], u[7], v[7]);
    rgba[15] = GetRGBA(y2[7], u[7], v[7]);

    rdram_write_many_u16(rgba, address, 16);
}

static void EmitTilesMode0(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address)
{
    u32 y_offset = 0;
    u32 u_offset = 2 * SUBBLOCK_SIZE;

    for (u32 i = 0; i < 8; ++i)
    {
        emit_line(&macroblock[y_offset], &macroblock[u_offset], address);

        y_offset += 8;
        u_offset += 8;
        address += 32;
    }
}

static void EmitTilesMode2(const tile_line_emitter_t emit_line, const s16 *macroblock, u32 address)
{
    u32 y_offset = 0;
    u32 u_offset = 4 * SUBBLOCK_SIZE;

    for (u32 i = 0; i < 8; ++i)
    {
        emit_line(&macroblock[y_offset],     &macroblock[u_offset], address);
        emit_line(&macroblock[y_offset + 8], &macroblock[u_offset], address + 32);

        y_offset += (i == 3) ? SUBBLOCK_SIZE + 16 : 16;
        u_offset += 8;
        address += 64;
    }
}

/*
 * Decode in place the 6 subblocks of a macroblock.
 *   macroblock        6 * SUBBLOCK_SIZE coefficients, replaced by samples
 *   y_dc, u_dc, v_dc  running DC values, updated by this call: subblocks
 *                     0..3 accumulate into *y_dc, subblock 4 into *u_dc and
 *                     subblock 5 into *v_dc
 *   qtable            dequantization table, or nullptr to skip that step
 * The first coefficient of every subblock is a delta added to the matching
 * running DC value; the sum (low 16 bits) replaces it before the subblock
 * is reordered, optionally dequantized, transposed and inverse transformed.
 */
static void DecodeMacroblockOB(s16 *macroblock, s32 *y_dc, s32 *u_dc, s32 *v_dc, const s16 *qtable)
{
    s16 *block = macroblock;

    for (int index = 0; index < 6; ++index, block += SUBBLOCK_SIZE)
    {
        /* update DC */
        s32 *predictor;
        if (index < 4)
            predictor = y_dc;
        else if (index == 4)
            predictor = u_dc;
        else
            predictor = v_dc;

        const s32 delta = (s32)block[0];
        *predictor += delta;
        block[0] = *predictor & 0xffff;

        s16 reordered[SUBBLOCK_SIZE];

        ZigZagSubBlock(reordered, block);
        if (qtable != nullptr)
        {
            MultSubBlocks(reordered, reordered, qtable, 0);
        }
        TransposeSubBlock(block, reordered);
        InverseDCTSubBlock(block, block);
    }
}

/*
 * Decode in place the subblocks of a macroblock.
 *   macroblock      subblock_count * SUBBLOCK_SIZE coefficients
 *   subblock_count  number of subblocks; the last two are the chroma ones
 *   qtables         quantization tables: [0] is used for the luma
 *                   subblocks, then [1] and [2] for the two chroma ones
 * Every subblock is multiplied by its table (result shifted left by 4),
 * zig-zag reordered and inverse transformed.
 */
static void DecodeMacroblockPS(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE])
{
    u32  table = 0;
    s16 *block = macroblock;

    /* count down so that the two trailing (chroma) subblocks are easy to spot */
    for (u32 remaining = subblock_count; remaining != 0; --remaining)
    {
        if (remaining <= 2)
        {
            ++table;
        }

        s16 reordered[SUBBLOCK_SIZE];

        MultSubBlocks(block, block, qtables[table], 4);
        ZigZagSubBlock(reordered, block);
        InverseDCTSubBlock(block, reordered);

        block += SUBBLOCK_SIZE;
    }
}

/*
 * Decode in place the subblocks of a macroblock, then rescale the samples.
 *   macroblock      subblock_count * SUBBLOCK_SIZE coefficients
 *   subblock_count  number of subblocks; the last two are the chroma ones
 *   qtables         quantization tables: [0] is used for the luma
 *                   subblocks, then [1] and [2] for the two chroma ones
 * Every subblock is multiplied by its table (result shifted left by 4),
 * zig-zag reordered and inverse transformed. Chroma subblocks then go
 * through RescaleUVSubBlock and luma ones through RescaleYSubBlock.
 */
static void DecodeMacroblockPS0(s16 *macroblock, u32 subblock_count, const s16 qtables[3][SUBBLOCK_SIZE])
{
    u32  table = 0;
    s16 *block = macroblock;

    /* count down so that the two trailing (chroma) subblocks are easy to spot */
    for (u32 remaining = subblock_count; remaining != 0; --remaining)
    {
        const bool is_chroma = (remaining <= 2);

        if (is_chroma)
        {
            ++table;
        }

        s16 reordered[SUBBLOCK_SIZE];

        MultSubBlocks(block, block, qtables[table], 4);
        ZigZagSubBlock(reordered, block);
        InverseDCTSubBlock(block, reordered);

        if (is_chroma)
            RescaleUVSubBlock(block, block);
        else
            RescaleYSubBlock(block, block);

        block += SUBBLOCK_SIZE;
    }
}

/*
 * Write to dst the subblock src reordered through TRANSPOSE_TABLE.
 * dst and src must be distinct subblocks.
 */
static void TransposeSubBlock(s16 *dst, const s16 *src)
{
    const u32 *const order = TRANSPOSE_TABLE;

    ReorderSubBlock(dst, src, order);
}

/*
 * Write to dst the subblock src reordered through ZIGZAG_TABLE.
 * dst and src must be distinct subblocks.
 */
static void ZigZagSubBlock(s16 *dst, const s16 *src)
{
    const u32 *const order = ZIGZAG_TABLE;

    ReorderSubBlock(dst, src, order);
}

/*
 * Permute a subblock: dst[i] = src[table[i]] for each of the SUBBLOCK_SIZE
 * entries.
 * Source and destination subblocks must not overlap, otherwise entries
 * already written would be read back as input. Nothing checks this here.
 */
static void ReorderSubBlock(s16 *dst, const s16 *src, const u32 *table)
{
    const u32 *const table_end = table + SUBBLOCK_SIZE;

    for (const u32 *index = table; index != table_end; ++index)
    {
        *dst++ = src[*index];
    }
}

/*
 * Element-wise product of two subblocks.
 *   dst    receives SUBBLOCK_SIZE results; may be the same buffer as src1
 *          or src2 since entries are processed one by one
 *   src1   first operand
 *   src2   second operand
 *   shift  left shift applied to every clamped product (callers in this
 *          file pass 0 or 4); must be smaller than 32
 * Each product is passed through clamp_s16(), shifted left and stored
 * truncated to 16 bits.
 */
static void MultSubBlocks(s16 *dst, const s16 *src1, const s16 *src2, u32 shift)
{
    for (u32 i = 0; i < SUBBLOCK_SIZE; ++i)
    {
        const s32 product = src1[i] * src2[i];
        const s32 clamped = clamp_s16(product);

        /* Shift the bit pattern as an unsigned value: left-shifting a
         * negative signed value is undefined in C and before C++20. The low
         * 16 bits kept here are the ones a two's complement shift gives. */
        dst[i] = (s16)((u32)clamped << shift);
    }
}

/*
 * Multiply every entry of a subblock by `scale`.
 * Products are computed on 32 bits and passed through clamp_s16() before
 * being stored in dst.
 */
static void ScaleSubBlock(s16 *dst, const s16 *src, s16 scale)
{
    const s16 *const src_end = src + SUBBLOCK_SIZE;

    while (src != src_end)
    {
        const s32 scaled = *src * scale;

        *dst = clamp_s16(scaled);

        ++src;
        ++dst;
    }
}

/*
 * Shift every entry of a subblock right by `shift` bits.
 *   dst    receives SUBBLOCK_SIZE results
 *   src    SUBBLOCK_SIZE input values
 *   shift  number of bits; any value is accepted
 * The shift count can come from guest data (see jpeg_decode_OB), and a
 * count of 32 or more would be undefined behaviour. It is therefore capped
 * at 15: a 16-bit value is already reduced to 0 (or -1 when negative) by a
 * 15-bit arithmetic shift, so counts from 15 to 31 give the same results
 * as before.
 */
static void RShiftSubBlock(s16 *dst, const s16 *src, u32 shift)
{
    const u32 amount = (shift < 15) ? shift : 15;

    for (u32 i = 0; i < SUBBLOCK_SIZE; ++i)
    {
        dst[i] = src[i] >> amount;
    }
}

/***************************************************************************
 * Fast 2D IDCT using separable formulation and normalization
 * Computations use single precision floats
 * Implementation based on Wikipedia :
 * http://fr.wikipedia.org/wiki/Transform%C3%A9e_en_cosinus_discr%C3%A8te
 **************************************************************************/

/* Normalized such as C4 = 1 */
/* Ck stands for cos(k*pi/16) scaled by sqrt(2), which makes C4 equal to 1.
 * Only C3 and C6 are used directly; K1..K10 are the combinations of Ck
 * listed next to each value. All of them are undefined again right after
 * InverseDCT1D. */
#define C3   1.175875602f
#define C6   0.541196100f
#define K1   0.765366865f   //  C2-C6
#define K2  -1.847759065f   // -C2-C6
#define K3  -0.390180644f   //  C5-C3
#define K4  -1.961570561f   // -C5-C3
#define K5   1.501321110f   //  C1+C3-C5-C7
#define K6   2.053119869f   //  C1+C3-C5+C7
#define K7   3.072711027f   //  C1+C3+C5-C7
#define K8   0.298631336f   // -C1+C3+C5-C7
#define K9  -0.899976223f   //  C7-C3
#define K10 -2.562915448f   // -C1-C3
/*
 * One-dimensional 8-point inverse DCT.
 *   x       8 input coefficients, x[0]..x[7]
 *   dst     receives the 8 outputs at dst[0], dst[stride], ...,
 *           dst[7 * stride]
 *   stride  distance, in floats, between two consecutive outputs
 * All inputs are read before the first output is written.
 */
static void InverseDCT1D(const float * const x, float *dst, u32 stride)
{
    float e[4];
    float f[4];
    float x26, x1357, x15, x37, x17, x35;

    /* sums shared by several terms below */
    x15   =  K3 * (x[1] + x[5]);
    x37   =  K4 * (x[3] + x[7]);
    x17   =  K9 * (x[1] + x[7]);
    x35   = K10 * (x[3] + x[5]);
    x1357 =  C3 * (x[1] + x[3] + x[5] + x[7]);
    x26   =  C6 * (x[2] + x[6]);

    /* f: terms built from the even coefficients x[0], x[2], x[4], x[6] */
    f[0] = x[0] + x[4];
    f[1] = x[0] - x[4];
    f[2] = x26 + K1*x[2];
    f[3] = x26 + K2*x[6];

    /* e: terms built from the odd coefficients x[1], x[3], x[5], x[7] */
    e[0] = x1357 + x15 + K5*x[1] + x17;
    e[1] = x1357 + x37 + K7*x[3] + x35;
    e[2] = x1357 + x15 + K6*x[5] + x35;
    e[3] = x1357 + x37 + K8*x[7] + x17;

    /* outputs k and 7-k share the same even part; the odd part is added
     * for the first four outputs and subtracted for the last four */
    *dst = f[0] + f[2] + e[0]; dst += stride;
    *dst = f[1] + f[3] + e[1]; dst += stride;
    *dst = f[1] - f[3] + e[2]; dst += stride;
    *dst = f[0] - f[2] + e[3]; dst += stride;
    *dst = f[0] - f[2] - e[3]; dst += stride;
    *dst = f[1] - f[3] - e[2]; dst += stride;
    *dst = f[1] + f[3] - e[1]; dst += stride;
    *dst = f[0] + f[2] - e[0]; dst += stride;
}
#undef C3
#undef C6
#undef K1
#undef K2
#undef K3
#undef K4
#undef K5
#undef K6
#undef K7
#undef K8
#undef K9
#undef K10

/*
 * Two-dimensional inverse DCT of an 8x8 subblock, done as two passes of
 * InverseDCT1D through a float scratch block.
 *   dst  receives the 64 samples
 *   src  64 input coefficients
 * dst may be the same buffer as src: every input is consumed during the
 * first pass, before anything is written to dst.
 */
static void InverseDCTSubBlock(s16 *dst, const s16 *src)
{
    float line[8];
    float scratch[SUBBLOCK_SIZE];

    /* first pass: transform each input row and store the result as a
     * column of the scratch block (stride 8), which transposes it */
    for (u32 row = 0; row < 8; ++row)
    {
        const s16 *const coeffs = &src[8 * row];

        for (u32 k = 0; k < 8; ++k)
        {
            line[k] = (float)coeffs[k];
        }

        InverseDCT1D(line, &scratch[row], 8);
    }

    /* second pass: transform each scratch row and store it, transposed
     * back, as a column of dst */
    for (u32 col = 0; col < 8; ++col)
    {
        InverseDCT1D(&scratch[8 * col], line, 1);

        /* C4 = 1 normalization implies a division by 8 */
        for (u32 k = 0; k < 8; ++k)
        {
            dst[col + 8 * k] = (s16)line[k] >> 3;
        }
    }
}

/*
 * Rescale the samples of a luma subblock (dst may be src).
 * Each sample goes through clamp_s12(), is offset by 0x800, multiplied by
 * 0xdb0 / 0x10000 and finally offset by 0x10.
 */
static void RescaleYSubBlock(s16 *dst, const s16 *src)
{
    for (u32 i = 0; i < SUBBLOCK_SIZE; ++i)
    {
        const u32 biased = (u32)(clamp_s12(src[i]) + 0x800);
        const u32 scaled = (biased * 0xdb0) >> 16;

        dst[i] = scaled + 0x10;
    }
}

/*
 * Rescale the samples of a chroma subblock (dst may be src).
 * Each sample goes through clamp_s12(), is multiplied by 0xe00 / 0x10000
 * using signed arithmetic and finally offset by 0x80.
 */
static void RescaleUVSubBlock(s16 *dst, const s16 *src)
{
    for (u32 i = 0; i < SUBBLOCK_SIZE; ++i)
    {
        const int clamped = (int)clamp_s12(src[i]);
        const int scaled  = (clamped * 0xe00) >> 16;

        dst[i] = scaled + 0x80;
    }
}