mirror of
https://github.com/PCSX2/pcsx2.git
synced 2025-04-02 10:52:54 -04:00
This pull request carries out the pending reorganization of the GSdx folder structure, making it better organized and easier to work with. It also removes the unused GSTextureFX.cpp file.
2082 lines
37 KiB
C++
2082 lines
37 KiB
C++
/*
|
|
* Copyright (C) 2007-2009 Gabest
|
|
* http://www.gabest.org
|
|
*
|
|
* This Program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2, or (at your option)
|
|
* any later version.
|
|
*
|
|
* This Program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with GNU Make; see the file COPYING. If not, write to
|
|
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
*
|
|
*/
|
|
|
|
#include "stdafx.h"
|
|
#include "GSDrawScanlineCodeGenerator.h"
|
|
#include "GSVertexSW.h"
|
|
|
|
#if _M_SSE < 0x501 && (defined(_M_AMD64) || defined(_WIN64))
|
|
|
|
// Ease the reading of the code
|
|
#define _m_local r12
|
|
#define _m_local__gd r13
|
|
#define _m_local__gd__vm a1
|
|
#define _m_local__gd__clut r11
|
|
#define _m_local__gd__tex a3
|
|
// More pretty name
|
|
#define _z xmm8
|
|
#define _f xmm9
|
|
#define _s xmm10
|
|
#define _t xmm11
|
|
#define _q xmm12
|
|
#define _f_rb xmm13
|
|
#define _f_ga xmm14
|
|
#define _test xmm15
|
|
// Extra bonus
|
|
#define _rb xmm2
|
|
#define _ga xmm3
|
|
#define _fm xmm4
|
|
#define _zm xmm5
|
|
#define _fd xmm6
|
|
|
|
#define _rip_local(field) (m_rip ? ptr[rip + &m_local.field] : ptr[_m_local + offsetof(GSScanlineLocalData, field)])
|
|
#define _rip_global(field) (m_rip ? ptr[rip + &m_local.gd->field] : ptr[_m_local__gd + offsetof(GSScanlineGlobalData, field)])
|
|
|
|
#ifdef _WIN64
|
|
#else
|
|
static const int _rz_rbx = -8 * 1;
|
|
static const int _rz_r12 = -8 * 2;
|
|
static const int _rz_r13 = -8 * 3;
|
|
static const int _rz_r14 = -8 * 4;
|
|
static const int _rz_r15 = -8 * 5;
|
|
static const int _rz_top = -8 * 6;
|
|
static const int _rz_zs = -8 * 8;
|
|
static const int _rz_zd = -8 * 10;
|
|
static const int _rz_cov = -8 * 12;
|
|
#endif
|
|
|
|
void GSDrawScanlineCodeGenerator::Generate_AVX()
|
|
{
|
|
bool need_tex = m_sel.fb && m_sel.tfx != TFX_NONE;
|
|
bool need_clut = need_tex && m_sel.tlu;
|
|
m_rip = (size_t)getCurr() < 0x80000000;
|
|
m_rip &= (size_t)&m_local < 0x80000000;
|
|
m_rip &= (size_t)&m_local.gd < 0x80000000;
|
|
|
|
#ifdef _WIN64
|
|
push(rbx);
|
|
push(rsi);
|
|
push(rdi);
|
|
push(rbp);
|
|
push(r12);
|
|
push(r13);
|
|
|
|
sub(rsp, 8 + 10 * 16);
|
|
|
|
for(int i = 6; i < 16; i++)
|
|
{
|
|
vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i));
|
|
}
|
|
#else
|
|
// No reservation on the stack as a red zone is available
|
|
push(rbp);
|
|
mov(ptr[rsp + _rz_rbx], rbx);
|
|
if (!m_rip)
|
|
{
|
|
mov(ptr[rsp + _rz_r12], r12);
|
|
mov(ptr[rsp + _rz_r13], r13);
|
|
}
|
|
#endif
|
|
|
|
mov(r10, (size_t)g_const->m_test_128b[0]);
|
|
if (!m_rip)
|
|
{
|
|
mov(_m_local, (size_t)&m_local);
|
|
mov(_m_local__gd, _rip_local(gd));
|
|
}
|
|
|
|
if(need_clut)
|
|
mov(_m_local__gd__clut, _rip_global(clut));
|
|
|
|
Init_AVX();
|
|
|
|
// a0 = steps
|
|
// t1 = fza_base
|
|
// t0 = fza_offset
|
|
// r10 = &m_test[0]
|
|
// _m_local = &m_local
|
|
// _m_local__gd = m_local->gd
|
|
// _m_local__gd__vm = m_local->gd.vm
|
|
// xmm7 = vf (sprite && ltf)
|
|
// xmm8 = z
|
|
// xmm9 = f
|
|
// xmm10 = s
|
|
// xmm11 = t
|
|
// xmm12 = q
|
|
// xmm13 = rb
|
|
// xmm14 = ga
|
|
// xmm15 = test
|
|
|
|
if(!m_sel.edge)
|
|
{
|
|
align(16);
|
|
}
|
|
|
|
L("loop");
|
|
|
|
TestZ_AVX(xmm5, xmm6);
|
|
|
|
// ebp = za
|
|
|
|
// FIXME not yet done
|
|
if(m_sel.mmin && 0)
|
|
{
|
|
SampleTextureLOD_AVX();
|
|
}
|
|
else
|
|
{
|
|
SampleTexture_AVX();
|
|
}
|
|
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
|
|
AlphaTFX_AVX();
|
|
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
|
|
ReadMask_AVX();
|
|
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
|
|
TestAlpha_AVX();
|
|
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
|
|
ColorTFX_AVX();
|
|
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
|
|
Fog_AVX();
|
|
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
|
|
ReadFrame_AVX();
|
|
|
|
// ebx = fa
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
// xmm6 = fd
|
|
|
|
TestDestAlpha_AVX();
|
|
|
|
// ebx = fa
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
// xmm6 = fd
|
|
|
|
WriteMask_AVX();
|
|
|
|
// ebx = fa
|
|
// edx = fzm
|
|
// ebp = za
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm5 = zm
|
|
// xmm6 = fd
|
|
|
|
WriteZBuf_AVX();
|
|
|
|
// ebx = fa
|
|
// edx = fzm
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm6 = fd
|
|
|
|
AlphaBlend_AVX();
|
|
|
|
// ebx = fa
|
|
// edx = fzm
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm4 = fm
|
|
// xmm6 = fd
|
|
|
|
WriteFrame_AVX();
|
|
|
|
L("step");
|
|
|
|
// if(steps <= 0) break;
|
|
|
|
if(!m_sel.edge)
|
|
{
|
|
test(a0, a0);
|
|
|
|
jle("exit", T_NEAR);
|
|
|
|
Step_AVX();
|
|
|
|
jmp("loop", T_NEAR);
|
|
}
|
|
|
|
L("exit");
|
|
|
|
#ifdef _WIN64
|
|
for(int i = 6; i < 16; i++)
|
|
{
|
|
vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]);
|
|
}
|
|
|
|
add(rsp, 8 + 10 * 16);
|
|
|
|
pop(r13);
|
|
pop(r12);
|
|
pop(rbp);
|
|
pop(rdi);
|
|
pop(rsi);
|
|
pop(rbx);
|
|
#else
|
|
mov(rbx, ptr[rsp + _rz_rbx]);
|
|
if (!m_rip)
|
|
{
|
|
mov(r12, ptr[rsp + _rz_r12]);
|
|
mov(r13, ptr[rsp + _rz_r13]);
|
|
}
|
|
pop(rbp);
|
|
#endif
|
|
|
|
ret();
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::Init_AVX()
|
|
{
|
|
if(!m_sel.notest)
|
|
{
|
|
// int skip = left & 3;
|
|
|
|
mov(ebx, a1.cvt32());
|
|
and(a1.cvt32(), 3);
|
|
|
|
// left -= skip;
|
|
|
|
sub(ebx, a1.cvt32());
|
|
|
|
// int steps = pixels + skip - 4;
|
|
|
|
lea(a0, ptr[a0 + a1 - 4]);
|
|
|
|
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
|
|
|
|
shl(a1.cvt32(), 4); // * sizeof(m_test[0])
|
|
|
|
vmovdqa(_test, ptr[a1 + r10]);
|
|
|
|
mov(rax, a0);
|
|
sar(rax, 63); // GH: 63 to extract the sign of the register
|
|
and(rax, a0);
|
|
shl(rax, 4); // * sizeof(m_test[0])
|
|
|
|
vpor(_test, ptr[rax + r10 + 7 * 16]);
|
|
}
|
|
else
|
|
{
|
|
mov(ebx, a1.cvt32()); // left
|
|
xor(a1.cvt32(), a1.cvt32()); // skip
|
|
lea(a0, ptr[a0 - 4]); // steps
|
|
}
|
|
|
|
// a0 = steps
|
|
// a1 = skip
|
|
// rbx = left
|
|
|
|
|
|
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
|
|
|
|
mov(rax, _rip_global(fzbr));
|
|
lea(t1, ptr[rax + a2 * 8]);
|
|
|
|
// GSVector2i* fza_offset = &m_local.gd->fzbc[left >> 2];
|
|
|
|
mov(rax, _rip_global(fzbc));
|
|
lea(t0, ptr[rax + rbx * 2]);
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS && (m_sel.fwrite && m_sel.fge || m_sel.zb) || m_sel.fb && (m_sel.edge || m_sel.tfx != TFX_NONE || m_sel.iip))
|
|
{
|
|
// a1 = &m_local.d[skip] // note a1 was (skip << 4)
|
|
|
|
// FIXME
|
|
//lea(a1, ptr[a1 * 8 + _m_local + offsetof(GSScanlineLocalData, d)]);
|
|
lea(rax, _rip_local(d));
|
|
lea(a1, ptr[rax + a1 * 8]);
|
|
}
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS)
|
|
{
|
|
if(m_sel.fwrite && m_sel.fge || m_sel.zb)
|
|
{
|
|
vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, p)]); // v.p
|
|
|
|
if(m_sel.fwrite && m_sel.fge)
|
|
{
|
|
// f = GSVector4i(vp).zzzzh().zzzz().add16(m_local.d[skip].f);
|
|
|
|
vcvttps2dq(_f, xmm0);
|
|
vpshufhw(_f, _f, _MM_SHUFFLE(2, 2, 2, 2));
|
|
vpshufd(_f, _f, _MM_SHUFFLE(2, 2, 2, 2));
|
|
vpaddw(_f, ptr[a1 + 16 * 6]);
|
|
}
|
|
|
|
if(m_sel.zb)
|
|
{
|
|
// z = vp.zzzz() + m_local.d[skip].z;
|
|
|
|
vshufps(_z, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
|
vaddps(_z, ptr[a1]);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.ztest)
|
|
{
|
|
vmovdqa(_z, _rip_local(p.z));
|
|
}
|
|
|
|
if(m_sel.fwrite && m_sel.fge)
|
|
vmovdqa(_f, _rip_local(p.f));
|
|
}
|
|
|
|
if(m_sel.fb)
|
|
{
|
|
if(m_sel.edge || m_sel.tfx != TFX_NONE)
|
|
{
|
|
vmovaps(xmm0, ptr[a3 + offsetof(GSVertexSW, t)]); // v.t
|
|
}
|
|
|
|
if(m_sel.edge)
|
|
{
|
|
// m_local.temp.cov = GSVector4i::cast(v.t).zzzzh().wwww().srl16(9);
|
|
|
|
vpshufhw(xmm1, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
|
vpshufd(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
|
|
vpsrlw(xmm1, 9);
|
|
|
|
#ifdef _WIN64
|
|
vmovdqa(_rip_local(temp.cov), xmm1);
|
|
#else
|
|
vmovdqa(ptr[rsp + _rz_cov], xmm1);
|
|
#endif
|
|
}
|
|
|
|
if(m_sel.tfx != TFX_NONE)
|
|
{
|
|
// a1 = &m_local.d[skip]
|
|
|
|
if(m_sel.fst)
|
|
{
|
|
// GSVector4i vti(vt);
|
|
|
|
vcvttps2dq(xmm0, xmm0);
|
|
|
|
// s = vti.xxxx() + m_local.d[skip].s;
|
|
// t = vti.yyyy(); if(!sprite) t += m_local.d[skip].t;
|
|
|
|
vpshufd(_s, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
vpshufd(_t, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
|
|
|
vpaddd(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]);
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
|
|
{
|
|
vpaddd(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]);
|
|
}
|
|
else if(m_sel.ltf)
|
|
{
|
|
vpshuflw(xmm7, _t, _MM_SHUFFLE(2, 2, 0, 0));
|
|
vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0));
|
|
vpsrlw(xmm7, 12);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// s = vt.xxxx() + m_local.d[skip].s;
|
|
// t = vt.yyyy() + m_local.d[skip].t;
|
|
// q = vt.zzzz() + m_local.d[skip].q;
|
|
|
|
vshufps(_s, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
vshufps(_t, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
|
vshufps(_q, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
|
|
|
vaddps(_s, ptr[a1 + offsetof(GSScanlineLocalData::skip, s)]);
|
|
vaddps(_t, ptr[a1 + offsetof(GSScanlineLocalData::skip, t)]);
|
|
vaddps(_q, ptr[a1 + offsetof(GSScanlineLocalData::skip, q)]);
|
|
}
|
|
}
|
|
|
|
if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
|
|
{
|
|
if(m_sel.iip)
|
|
{
|
|
// GSVector4i vc = GSVector4i(v.c);
|
|
|
|
vcvttps2dq(xmm0, ptr[a3 + offsetof(GSVertexSW, c)]); // v.c
|
|
|
|
// vc = vc.upl16(vc.zwxy());
|
|
|
|
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 0, 3, 2));
|
|
vpunpcklwd(xmm0, xmm1);
|
|
|
|
// rb = vc.xxxx().add16(m_local.d[skip].rb);
|
|
// ga = vc.zzzz().add16(m_local.d[skip].ga);
|
|
|
|
vpshufd(_f_rb, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
vpshufd(_f_ga, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
|
|
|
vpaddw(_f_rb, ptr[a1 + offsetof(GSScanlineLocalData::skip, rb)]);
|
|
vpaddw(_f_ga, ptr[a1 + offsetof(GSScanlineLocalData::skip, ga)]);
|
|
}
|
|
else
|
|
{
|
|
vmovdqa(_f_rb, _rip_local(c.rb));
|
|
vmovdqa(_f_ga, _rip_local(c.ga));
|
|
}
|
|
|
|
vmovdqa(_rb, _f_rb);
|
|
vmovdqa(_ga, _f_ga);
|
|
}
|
|
}
|
|
|
|
if(m_sel.fwrite && m_sel.fpsm == 2 && m_sel.dthe)
|
|
{
|
|
// On linux, a2 is edx which will be used for fzm
|
|
// In all case, it will require a mov in dthe code, so let's keep the value on the stack
|
|
#ifdef _WIN64
|
|
ASSERT(0);
|
|
#else
|
|
mov(ptr[rsp + _rz_top], a2);
|
|
#endif
|
|
}
|
|
|
|
mov(_m_local__gd__vm, _rip_global(vm));
|
|
if(m_sel.fb && m_sel.tfx != TFX_NONE)
|
|
mov(_m_local__gd__tex, _rip_global(tex));
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::Step_AVX()
|
|
{
|
|
// steps -= 4;
|
|
|
|
sub(a0, 4);
|
|
|
|
// fza_offset++;
|
|
|
|
add(t0, 8);
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS)
|
|
{
|
|
// z += m_local.d4.z;
|
|
|
|
if(m_sel.zb)
|
|
{
|
|
vaddps(_z, _rip_local(d4.z));
|
|
}
|
|
|
|
// f = f.add16(m_local.d4.f);
|
|
|
|
if(m_sel.fwrite && m_sel.fge)
|
|
{
|
|
vpaddw(_f, _rip_local(d4.f));
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.ztest)
|
|
{
|
|
}
|
|
}
|
|
|
|
if(m_sel.fb)
|
|
{
|
|
if(m_sel.tfx != TFX_NONE)
|
|
{
|
|
if(m_sel.fst)
|
|
{
|
|
// GSVector4i st = m_local.d4.st;
|
|
|
|
// si += st.xxxx();
|
|
// if(!sprite) ti += st.yyyy();
|
|
|
|
vmovdqa(xmm0, _rip_local(d4.stq));
|
|
|
|
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
vpaddd(_s, xmm1);
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS || m_sel.mmin)
|
|
{
|
|
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
|
vpaddd(_t, xmm1);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// GSVector4 stq = m_local.d4.stq;
|
|
|
|
// s += stq.xxxx();
|
|
// t += stq.yyyy();
|
|
// q += stq.zzzz();
|
|
|
|
vmovaps(xmm0, _rip_local(d4.stq));
|
|
|
|
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
|
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
|
|
|
|
vaddps(_s, xmm1);
|
|
vaddps(_t, xmm2);
|
|
vaddps(_q, xmm3);
|
|
}
|
|
}
|
|
|
|
if(!(m_sel.tfx == TFX_DECAL && m_sel.tcc))
|
|
{
|
|
if(m_sel.iip)
|
|
{
|
|
// GSVector4i c = m_local.d4.c;
|
|
|
|
// rb = rb.add16(c.xxxx());
|
|
// ga = ga.add16(c.yyyy());
|
|
|
|
vmovdqa(xmm0, _rip_local(d4.c));
|
|
|
|
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
vpshufd(xmm2, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
|
|
|
|
vpaddw(_f_rb, xmm1);
|
|
vpaddw(_f_ga, xmm2);
|
|
|
|
// FIXME: color may underflow and roll over at the end of the line, if decreasing
|
|
|
|
vpxor(xmm0, xmm0);
|
|
vpmaxsw(_f_rb, xmm0);
|
|
vpmaxsw(_f_ga, xmm0);
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.tfx == TFX_NONE)
|
|
{
|
|
}
|
|
}
|
|
|
|
vmovdqa(_rb, _f_rb);
|
|
vmovdqa(_ga, _f_ga);
|
|
}
|
|
}
|
|
|
|
if(!m_sel.notest)
|
|
{
|
|
// test = m_test[7 + (steps & (steps >> 31))];
|
|
|
|
mov(rax, a0);
|
|
sar(rax, 63); // GH: 63 to extract the sign of the register
|
|
and(rax, a0);
|
|
shl(rax, 4);
|
|
|
|
vmovdqa(_test, ptr[rax + r10 + 7 * 16]);
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::TestZ_AVX(const Xmm& temp1, const Xmm& temp2)
|
|
{
|
|
if(!m_sel.zb)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// int za = fza_base.y + fza_offset->y;
|
|
|
|
mov(ebp, dword[t1 + 4]);
|
|
add(ebp, dword[t0 + 4]);
|
|
and(ebp, HALF_VM_SIZE - 1);
|
|
|
|
// GSVector4i zs = zi;
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS)
|
|
{
|
|
if(m_sel.zoverflow)
|
|
{
|
|
// zs = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
|
|
|
|
mov(rax, (size_t)&GSVector4::m_half);
|
|
|
|
vbroadcastss(xmm0, ptr[rax]);
|
|
vmulps(xmm0, _z);
|
|
vcvttps2dq(xmm0, xmm0);
|
|
vpslld(xmm0, 1);
|
|
|
|
vcvttps2dq(xmm1, _z);
|
|
vpcmpeqd(xmm2, xmm2);
|
|
vpsrld(xmm2, 31);
|
|
vpand(xmm1, xmm2);
|
|
|
|
vpor(xmm0, xmm1);
|
|
}
|
|
else
|
|
{
|
|
// zs = GSVector4i(z);
|
|
|
|
vcvttps2dq(xmm0, _z);
|
|
}
|
|
|
|
if(m_sel.zwrite)
|
|
{
|
|
#ifdef _WIN64
|
|
vmovdqa(_rip_local(temp.zs), xmm0);
|
|
#else
|
|
vmovdqa(ptr[rsp + _rz_zs], xmm0);
|
|
#endif
|
|
}
|
|
}
|
|
else
|
|
{
|
|
movdqa(xmm0, _z);
|
|
}
|
|
|
|
if(m_sel.ztest)
|
|
{
|
|
ReadPixel_AVX(xmm1, rbp);
|
|
|
|
if(m_sel.zwrite && m_sel.zpsm < 2)
|
|
{
|
|
#ifdef _WIN64
|
|
vmovdqa(_rip_local(temp.zd), xmm1);
|
|
#else
|
|
vmovdqa(ptr[rsp + _rz_zd], xmm1);
|
|
#endif
|
|
}
|
|
|
|
// zd &= 0xffffffff >> m_sel.zpsm * 8;
|
|
|
|
if(m_sel.zpsm)
|
|
{
|
|
vpslld(xmm1, static_cast<uint8>(m_sel.zpsm * 8));
|
|
vpsrld(xmm1, static_cast<uint8>(m_sel.zpsm * 8));
|
|
}
|
|
|
|
if(m_sel.zoverflow || m_sel.zpsm == 0)
|
|
{
|
|
// GSVector4i o = GSVector4i::x80000000();
|
|
|
|
vpcmpeqd(xmm2, xmm2);
|
|
vpslld(xmm2, 31);
|
|
|
|
// GSVector4i zso = zs - o;
|
|
// GSVector4i zdo = zd - o;
|
|
|
|
vpsubd(xmm0, xmm2);
|
|
vpsubd(xmm1, xmm2);
|
|
}
|
|
|
|
switch(m_sel.ztst)
|
|
{
|
|
case ZTST_GEQUAL:
|
|
// test |= zso < zdo; // ~(zso >= zdo)
|
|
vpcmpgtd(xmm1, xmm0);
|
|
vpor(_test, xmm1);
|
|
break;
|
|
|
|
case ZTST_GREATER: // TODO: tidus hair and chocobo wings only appear fully when this is tested as ZTST_GEQUAL
|
|
// test |= zso <= zdo; // ~(zso > zdo)
|
|
vpcmpgtd(xmm0, xmm1);
|
|
vpcmpeqd(xmm2, xmm2);
|
|
vpxor(xmm0, xmm2);
|
|
vpor(_test, xmm0);
|
|
break;
|
|
}
|
|
|
|
alltrue(_test);
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::SampleTexture_AVX()
|
|
{
|
|
if(!m_sel.fb || m_sel.tfx == TFX_NONE)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(!m_sel.fst)
|
|
{
|
|
vrcpps(xmm0, _q);
|
|
|
|
vmulps(xmm4, _s, xmm0);
|
|
vmulps(xmm5, _t, xmm0);
|
|
|
|
vcvttps2dq(xmm4, xmm4);
|
|
vcvttps2dq(xmm5, xmm5);
|
|
|
|
if(m_sel.ltf)
|
|
{
|
|
// u -= 0x8000;
|
|
// v -= 0x8000;
|
|
|
|
mov(eax, 0x8000);
|
|
vmovd(xmm0, eax);
|
|
vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vpsubd(xmm4, xmm0);
|
|
vpsubd(xmm5, xmm0);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
vmovdqa(xmm4, _s);
|
|
vmovdqa(xmm5, _t);
|
|
}
|
|
|
|
if(m_sel.ltf)
|
|
{
|
|
// GSVector4i uf = u.xxzzlh().srl16(12);
|
|
|
|
vpshuflw(xmm6, xmm4, _MM_SHUFFLE(2, 2, 0, 0));
|
|
vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
|
|
vpsrlw(xmm6, 12);
|
|
|
|
if(m_sel.prim != GS_SPRITE_CLASS)
|
|
{
|
|
// GSVector4i vf = v.xxzzlh().srl16(12);
|
|
|
|
vpshuflw(xmm7, xmm5, _MM_SHUFFLE(2, 2, 0, 0));
|
|
vpshufhw(xmm7, xmm7, _MM_SHUFFLE(2, 2, 0, 0));
|
|
vpsrlw(xmm7, 12);
|
|
}
|
|
}
|
|
|
|
// GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
|
|
|
|
vpsrad(xmm4, 16);
|
|
vpsrad(xmm5, 16);
|
|
vpackssdw(xmm4, xmm5);
|
|
|
|
if(m_sel.ltf)
|
|
{
|
|
// GSVector4i uv1 = uv0.add16(GSVector4i::x0001());
|
|
|
|
vpcmpeqd(xmm0, xmm0);
|
|
vpsrlw(xmm0, 15);
|
|
vpaddw(xmm5, xmm4, xmm0);
|
|
|
|
// uv0 = Wrap(uv0);
|
|
// uv1 = Wrap(uv1);
|
|
|
|
Wrap_AVX(xmm4, xmm5);
|
|
}
|
|
else
|
|
{
|
|
// uv0 = Wrap(uv0);
|
|
|
|
Wrap_AVX(xmm4);
|
|
}
|
|
|
|
// xmm4 = uv0
|
|
// xmm5 = uv1 (ltf)
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// GSVector4i x0 = uv0.upl16();
|
|
// GSVector4i y0 = uv0.uph16() << tw;
|
|
|
|
vpxor(xmm0, xmm0);
|
|
|
|
vpunpcklwd(xmm2, xmm4, xmm0);
|
|
vpunpckhwd(xmm3, xmm4, xmm0);
|
|
vpslld(xmm3, static_cast<uint8>(m_sel.tw + 3));
|
|
|
|
// xmm0 = 0
|
|
// xmm2 = x0
|
|
// xmm3 = y0
|
|
// xmm5 = uv1 (ltf)
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
if(m_sel.ltf)
|
|
{
|
|
// GSVector4i x1 = uv1.upl16();
|
|
// GSVector4i y1 = uv1.uph16() << tw;
|
|
|
|
vpunpcklwd(xmm4, xmm5, xmm0);
|
|
vpunpckhwd(xmm5, xmm5, xmm0);
|
|
vpslld(xmm5, static_cast<uint8>(m_sel.tw + 3));
|
|
|
|
// xmm2 = x0
|
|
// xmm3 = y0
|
|
// xmm4 = x1
|
|
// xmm5 = y1
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// GSVector4i addr00 = y0 + x0;
|
|
// GSVector4i addr01 = y0 + x1;
|
|
// GSVector4i addr10 = y1 + x0;
|
|
// GSVector4i addr11 = y1 + x1;
|
|
|
|
vpaddd(xmm0, xmm3, xmm2);
|
|
vpaddd(xmm1, xmm3, xmm4);
|
|
vpaddd(xmm2, xmm5, xmm2);
|
|
vpaddd(xmm3, xmm5, xmm4);
|
|
|
|
// xmm0 = addr00
|
|
// xmm1 = addr01
|
|
// xmm2 = addr10
|
|
// xmm3 = addr11
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
|
// c01 = addr01.gather32_32((const uint32/uint8*)tex[, clut]);
|
|
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
|
|
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
|
|
|
|
ReadTexel_AVX(4, 0);
|
|
|
|
// xmm0 = c10
|
|
// xmm1 = c11
|
|
// xmm4 = c00
|
|
// xmm5 = c01
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// GSVector4i rb00 = c00 & mask;
|
|
// GSVector4i ga00 = (c00 >> 8) & mask;
|
|
|
|
split16_2x8(xmm2, xmm3, xmm4);
|
|
|
|
// GSVector4i rb01 = c01 & mask;
|
|
// GSVector4i ga01 = (c01 >> 8) & mask;
|
|
|
|
split16_2x8(xmm4, xmm5, xmm5);
|
|
|
|
// xmm0 = c10
|
|
// xmm1 = c11
|
|
// xmm2 = rb00
|
|
// xmm3 = ga00
|
|
// xmm4 = rb01
|
|
// xmm5 = ga01
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// rb00 = rb00.lerp16_4(rb01, uf);
|
|
// ga00 = ga00.lerp16_4(ga01, uf);
|
|
|
|
lerp16_4(xmm4, xmm2, xmm6);
|
|
lerp16_4(xmm5, xmm3, xmm6);
|
|
|
|
// xmm0 = c10
|
|
// xmm1 = c11
|
|
// xmm4 = rb00
|
|
// xmm5 = ga00
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// GSVector4i rb10 = c10 & mask;
|
|
// GSVector4i ga10 = (c10 >> 8) & mask;
|
|
|
|
split16_2x8(xmm2, xmm3, xmm0);
|
|
|
|
// GSVector4i rb11 = c11 & mask;
|
|
// GSVector4i ga11 = (c11 >> 8) & mask;
|
|
|
|
split16_2x8(xmm0, xmm1, xmm1);
|
|
|
|
// xmm0 = rb11
|
|
// xmm1 = ga11
|
|
// xmm2 = rb10
|
|
// xmm3 = ga10
|
|
// xmm4 = rb00
|
|
// xmm5 = ga00
|
|
// xmm6 = uf
|
|
// xmm7 = vf
|
|
|
|
// rb10 = rb10.lerp16_4(rb11, uf);
|
|
// ga10 = ga10.lerp16_4(ga11, uf);
|
|
|
|
lerp16_4(xmm0, xmm2, xmm6);
|
|
lerp16_4(xmm1, xmm3, xmm6);
|
|
|
|
// xmm0 = rb10
|
|
// xmm1 = ga10
|
|
// xmm4 = rb00
|
|
// xmm5 = ga00
|
|
// xmm7 = vf
|
|
|
|
// rb00 = rb00.lerp16_4(rb10, vf);
|
|
// ga00 = ga00.lerp16_4(ga10, vf);
|
|
|
|
lerp16_4(xmm0, xmm4, xmm7);
|
|
lerp16_4(xmm1, xmm5, xmm7);
|
|
|
|
// FIXME not ideal (but allow different source in ReadTexel and less register dependency)
|
|
vmovdqa(xmm2, xmm0);
|
|
vmovdqa(xmm3, xmm1);
|
|
}
|
|
else
|
|
{
|
|
// GSVector4i addr00 = y0 + x0;
|
|
|
|
vpaddd(xmm0, xmm3, xmm2);
|
|
|
|
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
|
|
|
|
ReadTexel_AVX(1, 0);
|
|
|
|
// GSVector4i mask = GSVector4i::x00ff();
|
|
|
|
// c[0] = c00 & mask;
|
|
// c[1] = (c00 >> 8) & mask;
|
|
|
|
split16_2x8(_rb, _ga, xmm4);
|
|
}
|
|
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv)
|
|
{
|
|
// xmm0, xmm1, xmm2, xmm3 = free
|
|
|
|
int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
|
|
int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
|
|
|
|
int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
|
|
|
|
if(wms_clamp == wmt_clamp)
|
|
{
|
|
if(wms_clamp)
|
|
{
|
|
if(region)
|
|
{
|
|
vpmaxsw(uv, _rip_global(t.min));
|
|
}
|
|
else
|
|
{
|
|
vpxor(xmm0, xmm0);
|
|
vpmaxsw(uv, xmm0);
|
|
}
|
|
|
|
vpminsw(uv, _rip_global(t.max));
|
|
}
|
|
else
|
|
{
|
|
vpand(uv, _rip_global(t.min));
|
|
|
|
if(region)
|
|
{
|
|
vpor(uv, _rip_global(t.max));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
vmovdqa(xmm2, _rip_global(t.min));
|
|
vmovdqa(xmm3, _rip_global(t.max));
|
|
vmovdqa(xmm0, _rip_global(t.mask));
|
|
|
|
// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
|
|
|
|
vpand(xmm1, uv, xmm2);
|
|
|
|
if(region)
|
|
{
|
|
vpor(xmm1, xmm3);
|
|
}
|
|
|
|
// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
|
|
|
|
vpmaxsw(uv, xmm2);
|
|
vpminsw(uv, xmm3);
|
|
|
|
// clamp.blend8(repeat, m_local.gd->t.mask);
|
|
|
|
vpblendvb(uv, xmm1, xmm0);
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::Wrap_AVX(const Xmm& uv0, const Xmm& uv1)
|
|
{
|
|
// xmm0, xmm1, xmm2, xmm3 = free
|
|
|
|
int wms_clamp = ((m_sel.wms + 1) >> 1) & 1;
|
|
int wmt_clamp = ((m_sel.wmt + 1) >> 1) & 1;
|
|
|
|
int region = ((m_sel.wms | m_sel.wmt) >> 1) & 1;
|
|
|
|
if(wms_clamp == wmt_clamp)
|
|
{
|
|
if(wms_clamp)
|
|
{
|
|
if(region)
|
|
{
|
|
vmovdqa(xmm0, _rip_global(t.min));
|
|
vpmaxsw(uv0, xmm0);
|
|
vpmaxsw(uv1, xmm0);
|
|
}
|
|
else
|
|
{
|
|
vpxor(xmm0, xmm0);
|
|
vpmaxsw(uv0, xmm0);
|
|
vpmaxsw(uv1, xmm0);
|
|
}
|
|
|
|
vmovdqa(xmm0, _rip_global(t.max));
|
|
vpminsw(uv0, xmm0);
|
|
vpminsw(uv1, xmm0);
|
|
}
|
|
else
|
|
{
|
|
vmovdqa(xmm0, _rip_global(t.min));
|
|
vpand(uv0, xmm0);
|
|
vpand(uv1, xmm0);
|
|
|
|
if(region)
|
|
{
|
|
vmovdqa(xmm0, _rip_global(t.max));
|
|
vpor(uv0, xmm0);
|
|
vpor(uv1, xmm0);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
vmovdqa(xmm2, _rip_global(t.min));
|
|
vmovdqa(xmm3, _rip_global(t.max));
|
|
vmovdqa(xmm0, _rip_global(t.mask));
|
|
|
|
// uv0
|
|
|
|
// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
|
|
|
|
vpand(xmm1, uv0, xmm2);
|
|
|
|
if(region)
|
|
{
|
|
vpor(xmm1, xmm3);
|
|
}
|
|
|
|
// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
|
|
|
|
vpmaxsw(uv0, xmm2);
|
|
vpminsw(uv0, xmm3);
|
|
|
|
// clamp.blend8(repeat, m_local.gd->t.mask);
|
|
|
|
vpblendvb(uv0, xmm1, xmm0);
|
|
|
|
// uv1
|
|
|
|
// GSVector4i repeat = (t & m_local.gd->t.min) | m_local.gd->t.max;
|
|
|
|
vpand(xmm1, uv1, xmm2);
|
|
|
|
if(region)
|
|
{
|
|
vpor(xmm1, xmm3);
|
|
}
|
|
|
|
// GSVector4i clamp = t.sat_i16(m_local.gd->t.min, m_local.gd->t.max);
|
|
|
|
vpmaxsw(uv1, xmm2);
|
|
vpminsw(uv1, xmm3);
|
|
|
|
// clamp.blend8(repeat, m_local.gd->t.mask);
|
|
|
|
vpblendvb(uv1, xmm1, xmm0);
|
|
}
|
|
}
|
|
|
|
// Placeholder for the mipmapped (LOD) texture sampler: not implemented, emits no code.
void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
{
}
|
|
|
|
// Placeholder for the single-register LOD wrap: not implemented, emits no code.
void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv)
{
}
|
|
|
|
// Placeholder for the two-register LOD wrap: not implemented, emits no code.
void GSDrawScanlineCodeGenerator::WrapLOD_AVX(const Xmm& uv0, const Xmm& uv1)
{
}
|
|
|
|
void GSDrawScanlineCodeGenerator::AlphaTFX_AVX()
|
|
{
|
|
if(!m_sel.fb)
|
|
{
|
|
return;
|
|
}
|
|
|
|
switch(m_sel.tfx)
|
|
{
|
|
case TFX_MODULATE:
|
|
|
|
// gat = gat.modulate16<1>(ga).clamp8();
|
|
|
|
modulate16(_ga, _f_ga, 1);
|
|
|
|
clamp16(_ga, xmm0);
|
|
|
|
// if(!tcc) gat = gat.mix16(ga.srl16(7));
|
|
|
|
if(!m_sel.tcc)
|
|
{
|
|
vpsrlw(xmm1, _f_ga, 7);
|
|
|
|
mix16(_ga, xmm1, xmm0);
|
|
}
|
|
|
|
break;
|
|
|
|
case TFX_DECAL:
|
|
|
|
// if(!tcc) gat = gat.mix16(ga.srl16(7));
|
|
|
|
if(!m_sel.tcc)
|
|
{
|
|
vpsrlw(xmm1, _f_ga, 7);
|
|
|
|
mix16(_ga, xmm1, xmm0);
|
|
}
|
|
|
|
break;
|
|
|
|
case TFX_HIGHLIGHT:
|
|
|
|
// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
|
|
|
|
vpsrlw(xmm1, _f_ga, 7);
|
|
|
|
if(m_sel.tcc)
|
|
{
|
|
vpaddusb(xmm1, _ga);
|
|
}
|
|
|
|
mix16(_ga, xmm1, xmm0);
|
|
|
|
break;
|
|
|
|
case TFX_HIGHLIGHT2:
|
|
|
|
// if(!tcc) gat = gat.mix16(ga.srl16(7));
|
|
|
|
if(!m_sel.tcc)
|
|
{
|
|
vpsrlw(xmm1, _f_ga, 7);
|
|
|
|
mix16(_ga, xmm1, xmm0);
|
|
}
|
|
|
|
break;
|
|
|
|
case TFX_NONE:
|
|
|
|
// gat = iip ? ga.srl16(7) : ga;
|
|
|
|
if(m_sel.iip)
|
|
{
|
|
vpsrlw(_ga, _f_ga, 7);
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
if(m_sel.aa1)
|
|
{
|
|
// gs_user figure 3-2: anti-aliasing after tfx, before tests, modifies alpha
|
|
|
|
// FIXME: bios config screen cubes
|
|
|
|
if(!m_sel.abe)
|
|
{
|
|
// a = cov
|
|
|
|
if(m_sel.edge)
|
|
{
|
|
#ifdef _WIN64
|
|
vmovdqa(xmm0, _rip_local(temp.cov));
|
|
#else
|
|
vmovdqa(xmm0, ptr[rsp + _rz_cov]);
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
vpcmpeqd(xmm0, xmm0);
|
|
vpsllw(xmm0, 15);
|
|
vpsrlw(xmm0, 8);
|
|
}
|
|
|
|
mix16(_ga, xmm0, xmm1);
|
|
}
|
|
else
|
|
{
|
|
// a = a == 0x80 ? cov : a
|
|
|
|
vpcmpeqd(xmm0, xmm0);
|
|
vpsllw(xmm0, 15);
|
|
vpsrlw(xmm0, 8);
|
|
|
|
if(m_sel.edge)
|
|
{
|
|
#ifdef _WIN64
|
|
vmovdqa(xmm1, _rip_local(temp.cov));
|
|
#else
|
|
vmovdqa(xmm1, ptr[rsp + _rz_cov]);
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
vmovdqa(xmm1, xmm0);
|
|
}
|
|
|
|
vpcmpeqw(xmm0, _ga);
|
|
vpsrld(xmm0, 16);
|
|
vpslld(xmm0, 16);
|
|
|
|
vpblendvb(_ga, xmm1, xmm0);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::ReadMask_AVX()
|
|
{
|
|
if(m_sel.fwrite)
|
|
{
|
|
vmovdqa(_fm, _rip_global(fm));
|
|
}
|
|
|
|
if(m_sel.zwrite)
|
|
{
|
|
vmovdqa(_zm, _rip_global(zm));
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::TestAlpha_AVX()
|
|
{
|
|
switch(m_sel.atst)
|
|
{
|
|
case ATST_NEVER:
|
|
// t = GSVector4i::xffffffff();
|
|
vpcmpeqd(xmm1, xmm1);
|
|
break;
|
|
|
|
case ATST_ALWAYS:
|
|
return;
|
|
|
|
case ATST_LESS:
|
|
case ATST_LEQUAL:
|
|
// t = (ga >> 16) > m_local.gd->aref;
|
|
vpsrld(xmm1, _ga, 16);
|
|
vpcmpgtd(xmm1, _rip_global(aref));
|
|
break;
|
|
|
|
case ATST_EQUAL:
|
|
// t = (ga >> 16) != m_local.gd->aref;
|
|
vpsrld(xmm1, _ga, 16);
|
|
vpcmpeqd(xmm1, _rip_global(aref));
|
|
vpcmpeqd(xmm0, xmm0);
|
|
vpxor(xmm1, xmm0);
|
|
break;
|
|
|
|
case ATST_GEQUAL:
|
|
case ATST_GREATER:
|
|
// t = (ga >> 16) < m_local.gd->aref;
|
|
vpsrld(xmm0, _ga, 16);
|
|
vmovdqa(xmm1, _rip_global(aref));
|
|
vpcmpgtd(xmm1, xmm0);
|
|
break;
|
|
|
|
case ATST_NOTEQUAL:
|
|
// t = (ga >> 16) == m_local.gd->aref;
|
|
vpsrld(xmm1, _ga, 16);
|
|
vpcmpeqd(xmm1, _rip_global(aref));
|
|
break;
|
|
}
|
|
|
|
switch(m_sel.afail)
|
|
{
|
|
case AFAIL_KEEP:
|
|
// test |= t;
|
|
vpor(_test, xmm1);
|
|
alltrue(_test);
|
|
break;
|
|
|
|
case AFAIL_FB_ONLY:
|
|
// zm |= t;
|
|
vpor(_zm, xmm1);
|
|
break;
|
|
|
|
case AFAIL_ZB_ONLY:
|
|
// fm |= t;
|
|
vpor(_fm, xmm1);
|
|
break;
|
|
|
|
case AFAIL_RGB_ONLY:
|
|
// zm |= t;
|
|
vpor(_zm, xmm1);
|
|
// fm |= t & GSVector4i::xff000000();
|
|
vpsrld(xmm1, 24);
|
|
vpslld(xmm1, 24);
|
|
vpor(_fm, xmm1);
|
|
break;
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::ColorTFX_AVX()
|
|
{
|
|
if(!m_sel.fwrite)
|
|
{
|
|
return;
|
|
}
|
|
|
|
switch(m_sel.tfx)
|
|
{
|
|
case TFX_MODULATE:
|
|
|
|
// rbt = rbt.modulate16<1>(rb).clamp8();
|
|
|
|
modulate16(_rb, _f_rb, 1);
|
|
|
|
clamp16(_rb, xmm0);
|
|
|
|
break;
|
|
|
|
case TFX_DECAL:
|
|
|
|
break;
|
|
|
|
case TFX_HIGHLIGHT:
|
|
case TFX_HIGHLIGHT2:
|
|
|
|
// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
|
|
|
|
vmovdqa(xmm1, _ga);
|
|
|
|
modulate16(_ga, _f_ga, 1);
|
|
|
|
vpshuflw(xmm6, _f_ga, _MM_SHUFFLE(3, 3, 1, 1));
|
|
vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1));
|
|
vpsrlw(xmm6, 7);
|
|
|
|
vpaddw(_ga, xmm6);
|
|
|
|
clamp16(_ga, xmm0);
|
|
|
|
mix16(_ga, xmm1, xmm0);
|
|
|
|
// rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
|
|
|
|
modulate16(_rb, _f_rb, 1);
|
|
|
|
vpaddw(_rb, xmm6);
|
|
|
|
clamp16(_rb, xmm0);
|
|
|
|
break;
|
|
|
|
case TFX_NONE:
|
|
|
|
// rbt = iip ? rb.srl16(7) : rb;
|
|
|
|
if(m_sel.iip)
|
|
{
|
|
vpsrlw(_rb, _f_rb, 7);
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::Fog_AVX()
|
|
{
|
|
if(!m_sel.fwrite || !m_sel.fge)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// rb = m_local.gd->frb.lerp16<0>(rb, f);
|
|
// ga = m_local.gd->fga.lerp16<0>(ga, f).mix16(ga);
|
|
|
|
vmovdqa(xmm6, _ga);
|
|
|
|
vmovdqa(xmm0, _rip_global(frb));
|
|
vmovdqa(xmm1, _rip_global(fga));
|
|
|
|
lerp16(_rb, xmm0, _f, 0);
|
|
lerp16(_ga, xmm1, _f, 0);
|
|
|
|
mix16(_ga, xmm6, _f);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::ReadFrame_AVX()
|
|
{
|
|
if(!m_sel.fb)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// int fa = fza_base.x + fza_offset->x;
|
|
|
|
mov(ebx, dword[t1]);
|
|
add(ebx, dword[t0]);
|
|
and(ebx, HALF_VM_SIZE - 1);
|
|
|
|
if(!m_sel.rfb)
|
|
{
|
|
return;
|
|
}
|
|
|
|
ReadPixel_AVX(_fd, rbx);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::TestDestAlpha_AVX()
|
|
{
|
|
if(!m_sel.date || m_sel.fpsm != 0 && m_sel.fpsm != 2)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// test |= ((fd [<< 16]) ^ m_local.gd->datm).sra32(31);
|
|
|
|
if(m_sel.datm)
|
|
{
|
|
if(m_sel.fpsm == 2)
|
|
{
|
|
vpxor(xmm0, xmm0);
|
|
//vpsrld(xmm1, _fd, 15);
|
|
vpslld(xmm1, _fd, 16);
|
|
vpsrad(xmm1, 31);
|
|
vpcmpeqd(xmm1, xmm0);
|
|
}
|
|
else
|
|
{
|
|
vpcmpeqd(xmm0, xmm0);
|
|
vpxor(xmm1, _fd, xmm0);
|
|
vpsrad(xmm1, 31);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.fpsm == 2)
|
|
{
|
|
vpslld(xmm1, _fd, 16);
|
|
vpsrad(xmm1, 31);
|
|
}
|
|
else
|
|
{
|
|
vpsrad(xmm1, _fd, 31);
|
|
}
|
|
}
|
|
|
|
vpor(_test, xmm1);
|
|
|
|
alltrue(_test);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::WriteMask_AVX()
|
|
{
|
|
if(m_sel.notest)
|
|
{
|
|
return;
|
|
}
|
|
|
|
// fm |= test;
|
|
// zm |= test;
|
|
|
|
if(m_sel.fwrite)
|
|
{
|
|
vpor(_fm, _test);
|
|
}
|
|
|
|
if(m_sel.zwrite)
|
|
{
|
|
vpor(_zm, _test);
|
|
}
|
|
|
|
// int fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
|
|
|
|
vpcmpeqd(xmm1, xmm1);
|
|
|
|
if(m_sel.fwrite && m_sel.zwrite)
|
|
{
|
|
vpcmpeqd(xmm0, xmm1, _zm);
|
|
vpcmpeqd(xmm1, _fm);
|
|
vpackssdw(xmm1, xmm0);
|
|
}
|
|
else if(m_sel.fwrite)
|
|
{
|
|
vpcmpeqd(xmm1, _fm);
|
|
vpackssdw(xmm1, xmm1);
|
|
}
|
|
else if(m_sel.zwrite)
|
|
{
|
|
vpcmpeqd(xmm1, _zm);
|
|
vpackssdw(xmm1, xmm1);
|
|
}
|
|
|
|
vpmovmskb(edx, xmm1);
|
|
|
|
not(edx);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::WriteZBuf_AVX()
|
|
{
|
|
if(!m_sel.zwrite)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if (m_sel.prim != GS_SPRITE_CLASS)
|
|
#ifdef _WIN64
|
|
vmovdqa(xmm1, _rip_local(temp.zs));
|
|
#else
|
|
vmovdqa(xmm1, ptr[rsp + _rz_zs]);
|
|
#endif
|
|
else
|
|
vmovdqa(xmm1, _rip_local(p.z));
|
|
|
|
if(m_sel.ztest && m_sel.zpsm < 2)
|
|
{
|
|
// zs = zs.blend8(zd, zm);
|
|
|
|
#ifdef _WIN64
|
|
vpblendvb(xmm1, _rip_local(temp.zd), _zm);
|
|
#else
|
|
vpblendvb(xmm1, ptr[rsp + _rz_zd], _zm);
|
|
#endif
|
|
}
|
|
|
|
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
|
|
|
|
WritePixel_AVX(xmm1, rbp, dh, fast, m_sel.zpsm, 1);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::AlphaBlend_AVX()
|
|
{
|
|
if(!m_sel.fwrite)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(m_sel.abe == 0 && m_sel.aa1 == 0)
|
|
{
|
|
return;
|
|
}
|
|
|
|
const Xmm& _dst_rb = xmm0;
|
|
const Xmm& _dst_ga = xmm1;
|
|
|
|
if((m_sel.aba != m_sel.abb) && (m_sel.aba == 1 || m_sel.abb == 1 || m_sel.abc == 1) || m_sel.abd == 1)
|
|
{
|
|
switch(m_sel.fpsm)
|
|
{
|
|
case 0:
|
|
case 1:
|
|
|
|
// c[2] = fd & mask;
|
|
// c[3] = (fd >> 8) & mask;
|
|
|
|
split16_2x8(_dst_rb, _dst_ga, _fd);
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
// c[2] = ((fd & 0x7c00) << 9) | ((fd & 0x001f) << 3);
|
|
// c[3] = ((fd & 0x8000) << 8) | ((fd & 0x03e0) >> 2);
|
|
|
|
vpcmpeqd(xmm15, xmm15);
|
|
|
|
vpsrld(xmm15, 27); // 0x0000001f
|
|
vpand(_dst_rb, _fd, xmm15);
|
|
vpslld(_dst_rb, 3);
|
|
|
|
vpslld(xmm15, 10); // 0x00007c00
|
|
vpand(xmm5, _fd, xmm15);
|
|
vpslld(xmm5, 9);
|
|
|
|
vpor(_dst_rb, xmm5);
|
|
|
|
vpsrld(xmm15, 5); // 0x000003e0
|
|
vpand(_dst_ga, _fd, xmm15);
|
|
vpsrld(_dst_ga, 2);
|
|
|
|
vpsllw(xmm15, 10); // 0x00008000
|
|
vpand(xmm5, _fd, xmm15);
|
|
vpslld(xmm5, 8);
|
|
|
|
vpor(_dst_ga, xmm5);
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
// xmm2, xmm3 = src rb, ga
|
|
// xmm0, xmm1 = dst rb, ga
|
|
// xmm5, xmm15 = free
|
|
|
|
if(m_sel.pabe || (m_sel.aba != m_sel.abb) && (m_sel.abb == 0 || m_sel.abd == 0))
|
|
{
|
|
vmovdqa(xmm5, _rb);
|
|
}
|
|
|
|
if(m_sel.aba != m_sel.abb)
|
|
{
|
|
// rb = c[aba * 2 + 0];
|
|
|
|
switch(m_sel.aba)
|
|
{
|
|
case 0: break;
|
|
case 1: vmovdqa(_rb, _dst_rb); break;
|
|
case 2: vpxor(_rb, _rb); break;
|
|
}
|
|
|
|
// rb = rb.sub16(c[abb * 2 + 0]);
|
|
|
|
switch(m_sel.abb)
|
|
{
|
|
case 0: vpsubw(_rb, xmm5); break;
|
|
case 1: vpsubw(_rb, _dst_rb); break;
|
|
case 2: break;
|
|
}
|
|
|
|
if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
|
|
{
|
|
// GSVector4i a = abc < 2 ? c[abc * 2 + 1].yywwlh().sll16(7) : m_local.gd->afix;
|
|
|
|
switch(m_sel.abc)
|
|
{
|
|
case 0:
|
|
case 1:
|
|
vpshuflw(xmm15, m_sel.abc ? _dst_ga : _ga, _MM_SHUFFLE(3, 3, 1, 1));
|
|
vpshufhw(xmm15, xmm15, _MM_SHUFFLE(3, 3, 1, 1));
|
|
vpsllw(xmm15, 7);
|
|
break;
|
|
case 2:
|
|
vmovdqa(xmm15, _rip_global(afix));
|
|
break;
|
|
}
|
|
|
|
// rb = rb.modulate16<1>(a);
|
|
|
|
modulate16(_rb, xmm15, 1);
|
|
}
|
|
|
|
// rb = rb.add16(c[abd * 2 + 0]);
|
|
|
|
switch(m_sel.abd)
|
|
{
|
|
case 0: vpaddw(_rb, xmm5); break;
|
|
case 1: vpaddw(_rb, _dst_rb); break;
|
|
case 2: break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// rb = c[abd * 2 + 0];
|
|
|
|
switch(m_sel.abd)
|
|
{
|
|
case 0: break;
|
|
case 1: vmovdqa(_rb, _dst_rb); break;
|
|
case 2: vpxor(_rb, _rb); break;
|
|
}
|
|
}
|
|
|
|
if(m_sel.pabe)
|
|
{
|
|
// mask = (c[1] << 8).sra32(31);
|
|
|
|
vpslld(xmm0, _ga, 8);
|
|
vpsrad(xmm0, 31);
|
|
|
|
// rb = c[0].blend8(rb, mask);
|
|
|
|
vpblendvb(_rb, xmm5, _rb, xmm0);
|
|
}
|
|
|
|
// xmm0 = pabe mask
|
|
// xmm3 = src ga
|
|
// xmm1 = dst ga
|
|
// xmm2 = rb
|
|
// xmm15 = a
|
|
// xmm5 = free
|
|
|
|
vmovdqa(xmm5, _ga);
|
|
|
|
if(m_sel.aba != m_sel.abb)
|
|
{
|
|
// ga = c[aba * 2 + 1];
|
|
|
|
switch(m_sel.aba)
|
|
{
|
|
case 0: break;
|
|
case 1: vmovdqa(_ga, _dst_ga); break;
|
|
case 2: vpxor(_ga, _ga); break;
|
|
}
|
|
|
|
// ga = ga.sub16(c[abeb * 2 + 1]);
|
|
|
|
switch(m_sel.abb)
|
|
{
|
|
case 0: vpsubw(_ga, xmm5); break;
|
|
case 1: vpsubw(_ga, _dst_ga); break;
|
|
case 2: break;
|
|
}
|
|
|
|
if(!(m_sel.fpsm == 1 && m_sel.abc == 1))
|
|
{
|
|
// ga = ga.modulate16<1>(a);
|
|
|
|
modulate16(_ga, xmm15, 1);
|
|
}
|
|
|
|
// ga = ga.add16(c[abd * 2 + 1]);
|
|
|
|
switch(m_sel.abd)
|
|
{
|
|
case 0: vpaddw(_ga, xmm5); break;
|
|
case 1: vpaddw(_ga, _dst_ga); break;
|
|
case 2: break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// ga = c[abd * 2 + 1];
|
|
|
|
switch(m_sel.abd)
|
|
{
|
|
case 0: break;
|
|
case 1: vmovdqa(_ga, _dst_ga); break;
|
|
case 2: vpxor(_ga, _ga); break;
|
|
}
|
|
}
|
|
|
|
// xmm0 = pabe mask
|
|
// xmm5 = src ga
|
|
// xmm2 = rb
|
|
// xmm3 = ga
|
|
// xmm1, xmm15 = free
|
|
|
|
if(m_sel.pabe)
|
|
{
|
|
vpsrld(xmm0, 16); // zero out high words to select the source alpha in blend (so it also does mix16)
|
|
|
|
// ga = c[1].blend8(ga, mask).mix16(c[1]);
|
|
|
|
vpblendvb(_ga, xmm5, _ga, xmm0);
|
|
}
|
|
else
|
|
{
|
|
if(m_sel.fpsm != 1) // TODO: fm == 0xffxxxxxx
|
|
{
|
|
mix16(_ga, xmm5, xmm15);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::WriteFrame_AVX()
|
|
{
|
|
if(!m_sel.fwrite)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(m_sel.fpsm == 2 && m_sel.dthe)
|
|
{
|
|
// y = (top & 3) << 5
|
|
|
|
#ifdef _WIN64
|
|
ASSERT(0);
|
|
#else
|
|
mov(eax, ptr[rsp + _rz_top]);
|
|
#endif
|
|
and(eax, 3);
|
|
shl(eax, 5);
|
|
|
|
// rb = rb.add16(m_global.dimx[0 + y]);
|
|
// ga = ga.add16(m_global.dimx[1 + y]);
|
|
|
|
add(rax, _rip_global(dimx));
|
|
|
|
vpaddw(xmm2, ptr[rax + sizeof(GSVector4i) * 0]);
|
|
vpaddw(xmm3, ptr[rax + sizeof(GSVector4i) * 1]);
|
|
|
|
}
|
|
|
|
if(m_sel.colclamp == 0)
|
|
{
|
|
// c[0] &= 0x00ff00ff;
|
|
// c[1] &= 0x00ff00ff;
|
|
|
|
vpcmpeqd(xmm15, xmm15);
|
|
vpsrlw(xmm15, 8);
|
|
vpand(xmm2, xmm15);
|
|
vpand(xmm3, xmm15);
|
|
}
|
|
|
|
// GSVector4i fs = c[0].upl16(c[1]).pu16(c[0].uph16(c[1]));
|
|
|
|
vpunpckhwd(xmm15, xmm2, xmm3);
|
|
vpunpcklwd(xmm2, xmm3);
|
|
vpackuswb(xmm2, xmm15);
|
|
|
|
if(m_sel.fba && m_sel.fpsm != 1)
|
|
{
|
|
// fs |= 0x80000000;
|
|
|
|
vpcmpeqd(xmm15, xmm15);
|
|
vpslld(xmm15, 31);
|
|
vpor(xmm2, xmm15);
|
|
}
|
|
|
|
// xmm2 = fs
|
|
// xmm4 = fm
|
|
// xmm6 = fd
|
|
|
|
if(m_sel.fpsm == 2)
|
|
{
|
|
// GSVector4i rb = fs & 0x00f800f8;
|
|
// GSVector4i ga = fs & 0x8000f800;
|
|
|
|
mov(eax, 0x00f800f8);
|
|
vmovd(xmm0, eax);
|
|
vpshufd(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
mov(eax, 0x8000f800);
|
|
vmovd(xmm1, eax);
|
|
vpshufd(xmm1, xmm1, _MM_SHUFFLE(0, 0, 0, 0));
|
|
|
|
vpand(xmm0, xmm2);
|
|
vpand(xmm1, xmm2);
|
|
|
|
// fs = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3);
|
|
|
|
vpsrld(xmm2, xmm0, 9);
|
|
vpsrld(xmm0, 3);
|
|
vpsrld(xmm3, xmm1, 16);
|
|
vpsrld(xmm1, 6);
|
|
|
|
vpor(xmm0, xmm1);
|
|
vpor(xmm2, xmm3);
|
|
vpor(xmm2, xmm0);
|
|
}
|
|
|
|
if(m_sel.rfb)
|
|
{
|
|
// fs = fs.blend(fd, fm);
|
|
|
|
blend(xmm2, _fd, _fm); // TODO: could be skipped in certain cases, depending on fpsm and fm
|
|
}
|
|
|
|
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
|
|
|
|
WritePixel_AVX(xmm2, rbx, dl, fast, m_sel.fpsm, 0);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::ReadPixel_AVX(const Xmm& dst, const Reg64& addr)
|
|
{
|
|
vmovq(dst, qword[_m_local__gd__vm + addr * 2]);
|
|
vmovhps(dst, qword[_m_local__gd__vm + addr * 2 + 8 * 2]);
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, const Reg8& mask, bool fast, int psm, int fz)
|
|
{
|
|
if(m_sel.notest)
|
|
{
|
|
if(fast)
|
|
{
|
|
vmovq(qword[_m_local__gd__vm + addr * 2], src);
|
|
vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src);
|
|
}
|
|
else
|
|
{
|
|
WritePixel_AVX(src, addr, 0, psm);
|
|
WritePixel_AVX(src, addr, 1, psm);
|
|
WritePixel_AVX(src, addr, 2, psm);
|
|
WritePixel_AVX(src, addr, 3, psm);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(fast)
|
|
{
|
|
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
|
|
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
|
|
|
|
test(mask, 0x0f);
|
|
je("@f");
|
|
vmovq(qword[_m_local__gd__vm + addr * 2], src);
|
|
L("@@");
|
|
|
|
test(mask, 0xf0);
|
|
je("@f");
|
|
vmovhps(qword[_m_local__gd__vm + addr * 2 + 8 * 2], src);
|
|
L("@@");
|
|
|
|
// vmaskmovps?
|
|
}
|
|
else
|
|
{
|
|
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
|
|
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
|
|
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
|
|
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
|
|
|
|
test(mask, 0x03);
|
|
je("@f");
|
|
WritePixel_AVX(src, addr, 0, psm);
|
|
L("@@");
|
|
|
|
test(mask, 0x0c);
|
|
je("@f");
|
|
WritePixel_AVX(src, addr, 1, psm);
|
|
L("@@");
|
|
|
|
test(mask, 0x30);
|
|
je("@f");
|
|
WritePixel_AVX(src, addr, 2, psm);
|
|
L("@@");
|
|
|
|
test(mask, 0xc0);
|
|
je("@f");
|
|
WritePixel_AVX(src, addr, 3, psm);
|
|
L("@@");
|
|
}
|
|
}
|
|
}
|
|
|
|
// Per-lane pixel offsets relative to addr, in 16-bit units (WritePixel_AVX scales
// them by 2 to get bytes). They match the vm16[addr + 0 / 2 / 8 / 10] slots named
// in the masked-store comments of the vector WritePixel_AVX overload.
static const int s_offsets[4] = {0, 2, 8, 10};
|
|
|
|
void GSDrawScanlineCodeGenerator::WritePixel_AVX(const Xmm& src, const Reg64& addr, uint8 i, int psm)
|
|
{
|
|
Address dst = ptr[_m_local__gd__vm + addr * 2 + s_offsets[i] * 2];
|
|
|
|
switch(psm)
|
|
{
|
|
case 0:
|
|
if(i == 0) vmovd(dst, src);
|
|
else vpextrd(dst, src, i);
|
|
break;
|
|
case 1:
|
|
if(i == 0) vmovd(eax, src);
|
|
else vpextrd(eax, src, i);
|
|
xor(eax, dst);
|
|
and(eax, 0xffffff);
|
|
xor(dst, eax);
|
|
break;
|
|
case 2:
|
|
vpextrw(eax, src, i * 2);
|
|
mov(dst, ax);
|
|
break;
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
|
|
{
|
|
const int in[] = {0, 1, 2, 3};
|
|
const int out[] = {4, 5, 0, 1};
|
|
|
|
for(int i = 0; i < pixels; i++)
|
|
{
|
|
for(uint8 j = 0; j < 4; j++)
|
|
{
|
|
ReadTexel_AVX(Xmm(out[i]), Xmm(in[i]), j);
|
|
}
|
|
}
|
|
}
|
|
|
|
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(const Xmm& dst, const Xmm& addr, uint8 i)
|
|
{
|
|
const Address& src = m_sel.tlu ? ptr[_m_local__gd__clut + rax * 4] : ptr[_m_local__gd__tex + rax * 4];
|
|
|
|
// Extract address offset
|
|
if(i == 0) vmovd(eax, addr);
|
|
else vpextrd(eax, addr, i);
|
|
|
|
// If clut, load the value as a byte index
|
|
if(m_sel.tlu) movzx(eax, byte[_m_local__gd__tex + rax]);
|
|
|
|
if(i == 0) vmovd(dst, src);
|
|
else vpinsrd(dst, src, i);
|
|
}
|
|
|
|
// Gather example (AVX2). Not faster on Haswell, but potentially better on more recent CPUs.
// In the worst case it still reduces instruction-cache usage.
//
// Current limitations: one extra free register is required for the mask,
// and the palette lookup needs zero masking.
// The same register cannot be used as both source and destination, so linear interpolation must be updated.
|
|
#if 0
// Disabled gather-based variant of ReadTexel_AVX: one vpgatherdd per address
// vector instead of four scalar loads. With tlu, the texture is gathered first,
// the result is reduced to its low byte and then used to gather from the clut.
void GSDrawScanlineCodeGenerator::ReadTexel_AVX(int pixels, int mip_offset)
{
	const int addr_regs[] = {0, 1, 2, 3};
	const int texel_regs[] = {4, 5, 0, 1};
	const int mask_regs[] = {5, 0, 1, 2};

	for(int i = 0; i < pixels; i++)
	{
		const Xmm index(addr_regs[i]);
		const Xmm texel(texel_regs[i]);
		const Xmm gather_mask(mask_regs[i]);

		if(m_sel.tlu)
		{
			// FIXME can't use same dst and add register
			Gather4Texel(index, _m_local__gd__tex, index, gather_mask);
			// FIXME need a memory and could be faster
			vpslld(index, 24);
			vpsrld(index, 24);
			Gather4Texel(texel, _m_local__gd__clut, index, gather_mask);
		}
		else
		{
			Gather4Texel(texel, _m_local__gd__tex, index, gather_mask);
		}
	}
}

// Gathers four 32-bit values from base + addr * 4 into dst; Mask is overwritten
// with all ones before the gather.
// NOTE(review): this is a free static function defined after its use, yet it calls
// the emitter's vpcmpeqd/vpgatherdd/ptr unqualified - presumably it has to become
// a member (or be declared earlier) before this block can be enabled; confirm.
static void Gather4Texel(const Xmm& dst, const Reg64& base, const Xmm& addr, const Xmm& Mask)
{
	//void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2)
	vpcmpeqd(Mask, Mask);
	vpgatherdd(dst, ptr[base + addr * 4], Mask);
}

#endif
|
|
|
|
#endif
|