pcsx2/pcsx2/x86/iFPUd.cpp
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "Common.h"
#include "R5900OpcodeTables.h"
#include "x86emitter/x86emitter.h"
#include "iR5900.h"
#include "iFPU.h"
#ifndef DISABLE_SVU
#include "sVU_Micro.h"
#endif
/* This is a version of the FPU that emulates an exponent of 0xff and overflow/underflow flags */
/* Can be made faster by not converting stuff back and forth between instructions. */
//----------------------------------------------------------------
// FPU emulation status:
// ADD, SUB (incl. accumulation stage of MADD/MSUB) - no known problems.
// Mul (incl. multiplication stage of MADD/MSUB) - incorrect. PS2's result mantissa is sometimes
// smaller by 0x1 than IEEE's result (with round to zero).
// DIV, SQRT, RSQRT - incorrect. PS2's result varies between IEEE's result with round to zero
// and IEEE's result with round to +/-infinity.
// other stuff - no known problems.
//----------------------------------------------------------------
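// For reference, a minimal scalar sketch (assuming the usual EE-FPU model: no
// Inf/NaN/denormals, exponent 0xff is an ordinary value, exponent 0 is zero) of
// what a PS2 single-precision value means.  Not used by the recompiler; would
// need <cmath> for std::ldexp if ever enabled.
#if 0
static double PS2FloatToReal(u32 bits)
{
	const u32 sign = bits >> 31;
	const int exp  = (int)((bits >> 23) & 0xff);
	const u32 mant = bits & 0x007fffff;
	if (exp == 0)
		return sign ? -0.0 : 0.0;                                  // "denormals" flush to zero
	const double val = std::ldexp(1.0 + mant / 8388608.0, exp - 127); // 8388608 = 2^23
	return sign ? -val : val;
}
#endif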
using namespace x86Emitter;
// Set overflow flag (define only if FPU_RESULT is 1)
#define FPU_FLAGS_OVERFLOW 1
// Set underflow flag (define only if FPU_RESULT is 1)
#define FPU_FLAGS_UNDERFLOW 1
// If 1, result is not clamped (Gives correct results as in PS2,
// but can cause problems due to insufficient clamping levels in the VUs)
#define FPU_RESULT 1
// Set I & D flags. Also impacts other aspects of DIV/SQRT/RSQRT correctness
#define FPU_FLAGS_ID 1
// Add/Sub opcodes produce the same results as the PS2
#define FPU_CORRECT_ADD_SUB 1
#ifdef FPU_RECOMPILE
//------------------------------------------------------------------
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace COP1 {
u32 __fastcall FPU_MUL_HACK(u32 s, u32 t);
namespace DOUBLE {
//------------------------------------------------------------------
// Helper Macros
//------------------------------------------------------------------
#define _Ft_ _Rt_
#define _Fs_ _Rd_
#define _Fd_ _Sa_
// FCR31 Flags
#define FPUflagC 0X00800000
#define FPUflagI 0X00020000
#define FPUflagD 0X00010000
#define FPUflagO 0X00008000
#define FPUflagU 0X00004000
#define FPUflagSI 0X00000040
#define FPUflagSD 0X00000020
#define FPUflagSO 0X00000010
#define FPUflagSU 0X00000008
#define REC_FPUBRANCH(f) \
void f(); \
void rec##f() { \
iFlushCall(FLUSH_INTERPRETER); \
CALLFunc((uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \
branch = 2; \
}
#define REC_FPUFUNC(f) \
void f(); \
void rec##f() { \
iFlushCall(FLUSH_INTERPRETER); \
CALLFunc((uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// *FPU Opcodes!*
//------------------------------------------------------------------
//------------------------------------------------------------------
// PS2 -> DOUBLE
//------------------------------------------------------------------
#define SINGLE(sign, exp, mant) (((u32)(sign)<<31) | ((u32)(exp)<<23) | (u32)(mant))
#define DOUBLE(sign, exp, mant) (((sign ## ULL)<<63) | ((exp ## ULL)<<52) | (mant ## ULL))
struct FPUd_Globals
{
u32 neg[4], pos[4];
u32 pos_inf[4], neg_inf[4],
one_exp[4];
u64 dbl_one_exp[2];
u64 dbl_cvt_overflow, // needs special code if above or equal
dbl_ps2_overflow, // overflow & clamp if above or equal
dbl_underflow; // underflow if below
u64 padding;
u64 dbl_s_pos[2];
//u64 dlb_s_neg[2];
};
static const __aligned(32) FPUd_Globals s_const =
{
{ 0x80000000, 0xffffffff, 0xffffffff, 0xffffffff },
{ 0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff },
{SINGLE(0,0xff,0), 0, 0, 0},
{SINGLE(1,0xff,0), 0, 0, 0},
{SINGLE(0,1,0), 0, 0, 0},
{DOUBLE(0,1,0), 0},
DOUBLE(0,1151,0), // cvt_overflow
DOUBLE(0,1152,0), // ps2_overflow
DOUBLE(0,897,0), // underflow
0, // Padding!!
{0x7fffffffffffffffULL, 0},
//{0x8000000000000000ULL, 0},
};
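// For reference: with the IEEE-754 double bias of 1023, the thresholds above work
// out to 2^(1151-1023) = 2^128 (the largest IEEE single is just below this, so
// values at or above it need the exponent trick), 2^(1152-1023) = 2^129 (the
// largest PS2 single is just below this, so values at or above it overflow), and
// 2^(897-1023) = 2^-126 (the smallest normal single; anything smaller underflows).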
// ToDouble : converts single-precision PS2 float to double-precision IEEE float
void ToDouble(int reg)
{
SSE_UCOMISS_M32_to_XMM(reg, (uptr)s_const.pos_inf); // Sets ZF if reg is equal or incomparable to pos_inf
u8 *to_complex = JE8(0); // Complex conversion if positive infinity or NaN
SSE_UCOMISS_M32_to_XMM(reg, (uptr)s_const.neg_inf);
u8 *to_complex2 = JE8(0); // Complex conversion if negative infinity
SSE2_CVTSS2SD_XMM_to_XMM(reg, reg); // Simply convert
u8 *end = JMP8(0);
x86SetJ8(to_complex);
x86SetJ8(to_complex2);
// Special conversion for when IEEE sees the value in reg as an INF/NaN
SSE2_PSUBD_M128_to_XMM(reg, (uptr)s_const.one_exp); // Lower exponent by one
SSE2_CVTSS2SD_XMM_to_XMM(reg, reg);
SSE2_PADDQ_M128_to_XMM(reg, (uptr)s_const.dbl_one_exp); // Raise exponent by one
x86SetJ8(end);
}
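// For reference, a scalar sketch of the same conversion (assuming the input's
// exponent field is what decides the "complex" path, as the UCOMISS checks above
// effectively do).  Not used by the recompiler; would need <cstring> for memcpy.
#if 0
static double ToDoubleScalar(u32 ps2bits)
{
	if (((ps2bits >> 23) & 0xff) == 0xff)          // IEEE would read this as Inf/NaN
	{
		float halved;
		const u32 lowered = ps2bits - (1u << 23);  // lower the single's exponent by one
		memcpy(&halved, &lowered, sizeof(halved));
		double d = (double)halved;                 // now an ordinary conversion
		u64 dbits;
		memcpy(&dbits, &d, sizeof(dbits));
		dbits += 1ull << 52;                       // raise the double's exponent by one
		memcpy(&d, &dbits, sizeof(d));
		return d;
	}
	float f;
	memcpy(&f, &ps2bits, sizeof(f));
	return (double)f;                              // plain CVTSS2SD
}
#endif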
//------------------------------------------------------------------
// DOUBLE -> PS2
//------------------------------------------------------------------
/*
if FPU_RESULT is defined, results are more like the real PS2's FPU, but new issues may appear if
the VU isn't clamping all operands, since games may transfer FPU results into the VU.
(Ar tonelico 1 does this with the result of DIV/RSQRT when a division by zero occurs.)
Otherwise, results are still usually better than iFPU.cpp's.
*/
// ToPS2FPU_Full - converts double-precision IEEE float to single-precision PS2 float
// converts small normal numbers to PS2 equivalent
// converts large normal numbers to PS2 equivalent (which represent NaN/inf in IEEE)
// converts really large normal numbers to PS2 signed max
// converts really small normal numbers to zero (flush)
// doesn't handle inf/nan/denormal
void ToPS2FPU_Full(int reg, bool flags, int absreg, bool acc, bool addsub)
{
if (flags)
AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagO | FPUflagU));
if (flags && acc)
AND32ItoM((uptr)&fpuRegs.ACCflag, ~1);
SSE_MOVAPS_XMM_to_XMM(absreg, reg);
SSE2_ANDPD_M128_to_XMM(absreg, (uptr)&s_const.dbl_s_pos);
SSE2_UCOMISD_M64_to_XMM(absreg, (uptr)&s_const.dbl_cvt_overflow);
u8 *to_complex = JAE8(0);
SSE2_UCOMISD_M64_to_XMM(absreg, (uptr)&s_const.dbl_underflow);
u8 *to_underflow = JB8(0);
SSE2_CVTSD2SS_XMM_to_XMM(reg, reg); //simply convert
u8 *end = JMP8(0);
x86SetJ8(to_complex);
SSE2_UCOMISD_M64_to_XMM(absreg, (uptr)&s_const.dbl_ps2_overflow);
u8 *to_overflow = JAE8(0);
SSE2_PSUBQ_M128_to_XMM(reg, (uptr)&s_const.dbl_one_exp); //lower exponent
SSE2_CVTSD2SS_XMM_to_XMM(reg, reg); //convert
SSE2_PADDD_M128_to_XMM(reg, (uptr)s_const.one_exp); //raise exponent
u8 *end2 = JMP8(0);
x86SetJ8(to_overflow);
SSE2_CVTSD2SS_XMM_to_XMM(reg, reg);
SSE_ORPS_M128_to_XMM(reg, (uptr)&s_const.pos); //clamp
if (flags && FPU_FLAGS_OVERFLOW)
OR32ItoM((uptr)&fpuRegs.fprc[31], (FPUflagO | FPUflagSO));
if (flags && FPU_FLAGS_OVERFLOW && acc)
OR32ItoM((uptr)&fpuRegs.ACCflag, 1);
u8 *end3 = JMP8(0);
x86SetJ8(to_underflow);
u8 *end4;
if (flags && FPU_FLAGS_UNDERFLOW) //set underflow flags if not zero
{
SSE2_XORPD_XMM_to_XMM(absreg, absreg);
SSE2_UCOMISD_XMM_to_XMM(reg, absreg);
u8 *is_zero = JE8(0);
OR32ItoM((uptr)&fpuRegs.fprc[31], (FPUflagU | FPUflagSU));
if (addsub)
{
//On ADD/SUB, the PS2 simply leaves the mantissa bits as they are (after normalization)
//IEEE either clears them (FtZ) or returns the denormalized result.
//not thoroughly tested : other operations such as MUL and DIV seem to clear all mantissa bits?
SSE_MOVAPS_XMM_to_XMM(absreg, reg);
SSE2_PSLLQ_I8_to_XMM(reg, 12); //mantissa bits
SSE2_PSRLQ_I8_to_XMM(reg, 41);
SSE2_PSRLQ_I8_to_XMM(absreg, 63); //sign bit
SSE2_PSLLQ_I8_to_XMM(absreg, 31);
SSE2_POR_XMM_to_XMM(reg, absreg);
end4 = JMP8(0);
}
x86SetJ8(is_zero);
}
SSE2_CVTSD2SS_XMM_to_XMM(reg, reg);
SSE_ANDPS_M128_to_XMM(reg, (uptr)s_const.neg); //flush to zero
x86SetJ8(end);
x86SetJ8(end2);
x86SetJ8(end3);
if (flags && FPU_FLAGS_UNDERFLOW && addsub)
x86SetJ8(end4);
}
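// For reference, a scalar sketch of the branches above (flag updates are noted in
// comments rather than performed; the ADD/SUB underflow special case is omitted).
// Not used by the recompiler; would need <cmath> and <cstring> if ever enabled.
#if 0
static u32 ToPS2FPUScalar(double d)
{
	const double a    = std::fabs(d);
	const u32    sign = std::signbit(d) ? 0x80000000u : 0;
	u32 bits;
	if (a >= std::ldexp(1.0, 129)) // dbl_ps2_overflow: clamp to +/-MAX, set O/SO
		return sign | 0x7fffffff;
	if (a >= std::ldexp(1.0, 128)) // dbl_cvt_overflow: too big for an IEEE single,
	{                              // so halve, convert, then bump the exponent back up
		const float f = (float)(d * 0.5);
		memcpy(&bits, &f, sizeof(bits));
		return bits + (1u << 23);
	}
	if (a < std::ldexp(1.0, -126)) // dbl_underflow: flush to zero, set U/SU if nonzero
		return sign;
	const float f = (float)d;      // plain CVTSD2SS
	memcpy(&bits, &f, sizeof(bits));
	return bits;
}
#endif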
void ToPS2FPU(int reg, bool flags, int absreg, bool acc, bool addsub = false)
{
if (FPU_RESULT)
ToPS2FPU_Full(reg, flags, absreg, acc, addsub);
else
{
SSE2_CVTSD2SS_XMM_to_XMM(reg, reg); //clamp
SSE_MINSS_M32_to_XMM(reg, (uptr)&g_maxvals[0]);
SSE_MAXSS_M32_to_XMM(reg, (uptr)&g_minvals[0]);
}
}
//sets the maximum (positive or negative) value into regd.
void SetMaxValue(int regd)
{
if (FPU_RESULT)
SSE_ORPS_M128_to_XMM(regd, (uptr)&s_const.pos[0]); // set regd to maximum
else
{
SSE_ANDPS_M128_to_XMM(regd, (uptr)&s_const.neg[0]); // Get the sign bit
SSE_ORPS_M128_to_XMM(regd, (uptr)&g_maxvals[0]); // regd = +/- Maximum (CLAMP)!
}
}
#define GET_S(sreg) { \
if( info & PROCESS_EE_S ) SSE_MOVSS_XMM_to_XMM(sreg, EEREC_S); \
else SSE_MOVSS_M32_to_XMM(sreg, (uptr)&fpuRegs.fpr[_Fs_]); }
#define ALLOC_S(sreg) { sreg = _allocTempXMMreg(XMMT_FPS, -1); GET_S(sreg); }
#define GET_T(treg) { \
if( info & PROCESS_EE_T ) SSE_MOVSS_XMM_to_XMM(treg, EEREC_T); \
else SSE_MOVSS_M32_to_XMM(treg, (uptr)&fpuRegs.fpr[_Ft_]); }
#define ALLOC_T(treg) { treg = _allocTempXMMreg(XMMT_FPS, -1); GET_T(treg); }
#define GET_ACC(areg) { \
if( info & PROCESS_EE_ACC ) SSE_MOVSS_XMM_to_XMM(areg, EEREC_ACC); \
else SSE_MOVSS_M32_to_XMM(areg, (uptr)&fpuRegs.ACC); }
#define ALLOC_ACC(areg) { areg = _allocTempXMMreg(XMMT_FPS, -1); GET_ACC(areg); }
#define CLEAR_OU_FLAGS { AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagO | FPUflagU)); }
//------------------------------------------------------------------
// ABS XMM
//------------------------------------------------------------------
void recABS_S_xmm(int info)
{
GET_S(EEREC_D);
CLEAR_OU_FLAGS;
SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)s_const.pos);
}
FPURECOMPILE_CONSTCODE(ABS_S, XMMINFO_WRITED|XMMINFO_READS);
//------------------------------------------------------------------
//------------------------------------------------------------------
// FPU_ADD_SUB (Used to mimic PS2's FPU add/sub behavior)
//------------------------------------------------------------------
// A compliant IEEE FPU uses additional "guard" bits to the right of the mantissa in its computations,
// but the EE's FPU doesn't. Subtraction (and addition of a positive and a negative) may shift the mantissa
// left, causing those bits to appear in the result; this function masks out the bits of the mantissa that
// would get shifted right into the guard bits, ensuring the guard bits stay empty.
// The difference of the exponents = the amount that the smaller operand will be shifted right by.
// Modification - the PS2 uses a single guard bit? (Coded by Nneeve)
//------------------------------------------------------------------
void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they are floats
{
int tempecx = _allocX86reg(ECX, X86TYPE_TEMP, 0, 0); //receives regd
int temp2 = _allocX86reg(-1, X86TYPE_TEMP, 0, 0); //receives regt
int xmmtemp = _allocTempXMMreg(XMMT_FPS, -1); //temporary for anding with regd/regt
if (tempecx != ECX) { Console.Error("FPU: ADD/SUB Allocation Error!"); tempecx = ECX;}
if (temp2 == -1) { Console.Error("FPU: ADD/SUB Allocation Error!"); temp2 = EAX;}
SSE2_MOVD_XMM_to_R(tempecx, tempd);
SSE2_MOVD_XMM_to_R(temp2, tempt);
//mask the exponents
SHR32ItoR(tempecx, 23);
SHR32ItoR(temp2, 23);
AND32ItoR(tempecx, 0xff);
AND32ItoR(temp2, 0xff);
SUB32RtoR(tempecx, temp2); //tempecx = exponent difference
CMP32ItoR(tempecx, 25);
j8Ptr[0] = JGE8(0);
CMP32ItoR(tempecx, 0);
j8Ptr[1] = JG8(0);
j8Ptr[2] = JE8(0);
CMP32ItoR(tempecx, -25);
j8Ptr[3] = JLE8(0);
//diff = -24 .. -1 , expd < expt
NEG32R(tempecx);
DEC32R(tempecx);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2); //temp2 = 0xffffffff << tempecx
SSE2_MOVD_R_to_XMM(xmmtemp, temp2);
SSE_ANDPS_XMM_to_XMM(tempd, xmmtemp);
j8Ptr[4] = JMP8(0);
x86SetJ8(j8Ptr[0]);
//diff = 25 .. 255 , expt < expd
SSE_ANDPS_M128_to_XMM(tempt, (uptr)s_const.neg);
j8Ptr[5] = JMP8(0);
x86SetJ8(j8Ptr[1]);
//diff = 1 .. 24, expt < expd
DEC32R(tempecx);
MOV32ItoR(temp2, 0xffffffff);
SHL32CLtoR(temp2); //temp2 = 0xffffffff << tempecx
SSE2_MOVD_R_to_XMM(xmmtemp, temp2);
SSE_ANDPS_XMM_to_XMM(tempt, xmmtemp);
j8Ptr[6] = JMP8(0);
x86SetJ8(j8Ptr[3]);
//diff = -255 .. -25, expd < expt
SSE_ANDPS_M128_to_XMM(tempd, (uptr)s_const.neg);
j8Ptr[7] = JMP8(0);
x86SetJ8(j8Ptr[2]);
//diff == 0
x86SetJ8(j8Ptr[4]);
x86SetJ8(j8Ptr[5]);
x86SetJ8(j8Ptr[6]);
x86SetJ8(j8Ptr[7]);
_freeXMMreg(xmmtemp);
_freeX86reg(temp2);
_freeX86reg(tempecx);
}
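// For reference, the masking above in scalar form: the operand with the smaller
// exponent loses its low (|diff| - 1) mantissa bits (all of them once |diff| >= 25),
// which is how the single guard bit mentioned above is emulated.  A sketch only,
// not used by the recompiler.
#if 0
static void MaskForAddSub(u32& d, u32& t)
{
	const int diff = (int)((d >> 23) & 0xff) - (int)((t >> 23) & 0xff);
	if (diff >= 25)        t &= 0x80000000;                    // t reduced to +/-0
	else if (diff > 0)     t &= 0xffffffffu << (diff - 1);
	else if (diff == 0)    { /* same exponent: nothing to mask */ }
	else if (diff >= -24)  d &= 0xffffffffu << (-diff - 1);
	else                   d &= 0x80000000;                    // d reduced to +/-0
}
#endif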
void FPU_MUL(int info, int regd, int sreg, int treg, bool acc)
{
u8 *noHack;
u32 *endMul;
if (CHECK_FPUMULHACK)
{
SSE2_MOVD_XMM_to_R(ECX, sreg);
SSE2_MOVD_XMM_to_R(EDX, treg);
CALLFunc( (uptr)&FPU_MUL_HACK ); //returns the hacked result or 0
TEST32RtoR(EAX, EAX);
noHack = JZ8(0);
SSE2_MOVD_R_to_XMM(regd, EAX);
endMul = JMP32(0);
x86SetJ8(noHack);
}
ToDouble(sreg); ToDouble(treg);
SSE2_MULSD_XMM_to_XMM(sreg, treg);
ToPS2FPU(sreg, true, treg, acc);
SSE_MOVSS_XMM_to_XMM(regd, sreg);
if (CHECK_FPUMULHACK)
x86SetJ32(endMul);
}
//------------------------------------------------------------------
// CommutativeOp XMM (used for ADD and SUB opcodes. that's it.)
//------------------------------------------------------------------
static void (*recFPUOpXMM_to_XMM[] )(x86SSERegType, x86SSERegType) = {
SSE2_ADDSD_XMM_to_XMM, SSE2_SUBSD_XMM_to_XMM };
void recFPUOp(int info, int regd, int op, bool acc)
{
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
if (FPU_CORRECT_ADD_SUB)
FPU_ADD_SUB(sreg, treg);
ToDouble(sreg); ToDouble(treg);
recFPUOpXMM_to_XMM[op](sreg, treg);
ToPS2FPU(sreg, true, treg, acc, true);
SSE_MOVSS_XMM_to_XMM(regd, sreg);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// ADD XMM
//------------------------------------------------------------------
void recADD_S_xmm(int info)
{
recFPUOp(info, EEREC_D, 0, false);
}
FPURECOMPILE_CONSTCODE(ADD_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
void recADDA_S_xmm(int info)
{
recFPUOp(info, EEREC_ACC, 0, true);
}
FPURECOMPILE_CONSTCODE(ADDA_S, XMMINFO_WRITEACC|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
void recCMP(int info)
{
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
ToDouble(sreg); ToDouble(treg);
SSE2_UCOMISD_XMM_to_XMM(sreg, treg);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
//------------------------------------------------------------------
// C.x.S XMM
//------------------------------------------------------------------
void recC_EQ_xmm(int info)
{
recCMP(info);
j8Ptr[0] = JZ8(0);
AND32ItoM( (uptr)&fpuRegs.fprc[31], ~FPUflagC );
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagC);
x86SetJ8(j8Ptr[1]);
}
FPURECOMPILE_CONSTCODE(C_EQ, XMMINFO_READS|XMMINFO_READT);
void recC_LE_xmm(int info )
{
recCMP(info);
j8Ptr[0] = JBE8(0);
AND32ItoM( (uptr)&fpuRegs.fprc[31], ~FPUflagC );
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagC);
x86SetJ8(j8Ptr[1]);
}
FPURECOMPILE_CONSTCODE(C_LE, XMMINFO_READS|XMMINFO_READT);
void recC_LT_xmm(int info)
{
recCMP(info);
j8Ptr[0] = JB8(0);
AND32ItoM( (uptr)&fpuRegs.fprc[31], ~FPUflagC );
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagC);
x86SetJ8(j8Ptr[1]);
}
FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// CVT.x XMM
//------------------------------------------------------------------
void recCVT_S_xmm(int info)
{
if( !(info&PROCESS_EE_S) || (EEREC_D != EEREC_S && !(info&PROCESS_EE_MODEWRITES)) ) {
SSE_CVTSI2SS_M32_to_XMM(EEREC_D, (uptr)&fpuRegs.fpr[_Fs_]);
}
else {
SSE2_CVTDQ2PS_XMM_to_XMM(EEREC_D, EEREC_S);
}
}
FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED|XMMINFO_READS);
void recCVT_W() //called from iFPU.cpp's recCVT_W
{
int regs = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
if( regs >= 0 )
{
SSE_CVTTSS2SI_XMM_to_R32(EAX, regs);
SSE_MOVMSKPS_XMM_to_R32(EDX, regs); //extract the signs
AND32ItoR(EDX,1); //keep only LSB
}
else
{
SSE_CVTTSS2SI_M32_to_R32(EAX, (uptr)&fpuRegs.fpr[ _Fs_ ]);
MOV32MtoR(EDX, (uptr)&fpuRegs.fpr[ _Fs_ ]);
SHR32ItoR(EDX, 31); //mov sign to lsb
}
//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
_deleteFPtoXMMreg(_Fd_, 2);
ADD32ItoR(EDX, 0x7FFFFFFF); //0x7FFFFFFF if positive, 0x80000000 if negative
CMP32ItoR(EAX, 0x80000000); //If the result is the integer-indefinite value (0x80000000)
CMOVE32RtoR(EAX, EDX); //Saturate it
//Write the result
MOV32RtoM((uptr)&fpuRegs.fpr[_Fd_], EAX);
}
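// For reference, the saturation above in scalar form: CVTTSS2SI returns
// 0x80000000 ("integer indefinite") when the value is out of range, and in that
// case the PS2 result is picked from the float's sign bit.  A sketch only.
#if 0
static u32 CvtWSaturate(u32 fsbits, s32 truncated /* raw CVTTSS2SI result */)
{
	if ((u32)truncated == 0x80000000u)
		return (fsbits & 0x80000000u) ? 0x80000000u : 0x7fffffffu;
	return (u32)truncated;
}
#endif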
//------------------------------------------------------------------
//------------------------------------------------------------------
// DIV XMM
//------------------------------------------------------------------
void recDIVhelper1(int regd, int regt) // Sets flags
{
u8 *pjmp1, *pjmp2;
u32 *ajmp32, *bjmp32;
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
int tempReg = _allocX86reg(-1, X86TYPE_TEMP, 0, 0);
if (t1reg == -1) {Console.Error("FPU: DIV Allocation Error!"); return;}
if (tempReg == -1) {Console.Error("FPU: DIV Allocation Error!"); tempReg = EAX;}
AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagI|FPUflagD)); // Clear I and D flags
//--- Check for divide by zero ---
SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
SSE_CMPEQSS_XMM_to_XMM(t1reg, regt);
SSE_MOVMSKPS_XMM_to_R32(tempReg, t1reg);
AND32ItoR(tempReg, 1); //Check sign (if regt == zero, sign will be set)
ajmp32 = JZ32(0); //Skip if not set
//--- Check for 0/0 ---
SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
SSE_CMPEQSS_XMM_to_XMM(t1reg, regd);
SSE_MOVMSKPS_XMM_to_R32(tempReg, t1reg);
AND32ItoR(tempReg, 1); //Check sign (if regd == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags ( 0/0 )
pjmp2 = JMP8(0);
x86SetJ8(pjmp1); //x/0 but not 0/0
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagD|FPUflagSD); // Set D and SD flags ( x/0 )
x86SetJ8(pjmp2);
//--- Make regd +/- Maximum ---
SSE_XORPS_XMM_to_XMM(regd, regt); // Make regd Positive or Negative
SetMaxValue(regd); //clamp to max
bjmp32 = JMP32(0);
x86SetJ32(ajmp32);
//--- Normal Divide ---
ToDouble(regd); ToDouble(regt);
SSE2_DIVSD_XMM_to_XMM(regd, regt);
ToPS2FPU(regd, false, regt, false);
x86SetJ32(bjmp32);
_freeXMMreg(t1reg);
_freeX86reg(tempReg);
}
void recDIVhelper2(int regd, int regt) // Doesn't set flags
{
ToDouble(regd); ToDouble(regt);
SSE2_DIVSD_XMM_to_XMM(regd, regt);
ToPS2FPU(regd, false, regt, false);
}
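// For reference, the divide-by-zero handling recDIVhelper1 adds on top of the
// plain divide, in scalar form.  A sketch only; the normal path is just noted
// in a comment, and recDIVhelper2 skips these checks entirely.
#if 0
static bool Ps2DivSpecialCase(u32 s, u32 t, u32& fcr31, u32& result)
{
	fcr31 &= ~(FPUflagI | FPUflagD);
	if ((t & 0x7fffffff) != 0)
		return false;                             // not dividing by zero: ToDouble both,
	                                              // DIVSD, ToPS2FPU the quotient
	if ((s & 0x7fffffff) == 0)
		fcr31 |= FPUflagI | FPUflagSI;            // 0/0
	else
		fcr31 |= FPUflagD | FPUflagSD;            // x/0
	result = ((s ^ t) & 0x80000000) | 0x7fffffff; // +/-MAX, sign of s XOR t
	return true;
}
#endif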
static __aligned16 SSE_MXCSR roundmode_nearest, roundmode_neg;
void recDIV_S_xmm(int info)
{
bool roundmodeFlag = false;
//if (t0reg == -1) {Console.Error("FPU: DIV Allocation Error!");}
//Console.WriteLn("DIV");
if( CHECK_FPUNEGDIVHACK )
{
if (g_sseMXCSR.GetRoundMode() != SSEround_NegInf)
{
// Set roundmode to negative infinity if it isn't already
//Console.WriteLn("div to negative inf");
roundmode_neg = g_sseMXCSR;
roundmode_neg.SetRoundMode( SSEround_NegInf );
xLDMXCSR( roundmode_neg );
roundmodeFlag = true;
}
}
else
{
if (g_sseMXCSR.GetRoundMode() != SSEround_Nearest)
{
// Set roundmode to nearest since it isn't already
//Console.WriteLn("div to nearest");
roundmode_nearest = g_sseMXCSR;
roundmode_nearest.SetRoundMode( SSEround_Nearest );
xLDMXCSR( roundmode_nearest );
roundmodeFlag = true;
}
}
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
if (FPU_FLAGS_ID)
recDIVhelper1(sreg, treg);
else
recDIVhelper2(sreg, treg);
SSE_MOVSS_XMM_to_XMM(EEREC_D, sreg);
if (roundmodeFlag) xLDMXCSR (g_sseMXCSR);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
FPURECOMPILE_CONSTCODE(DIV_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MADD/MSUB XMM
//------------------------------------------------------------------
// Unlike what the documentation implies, it seems that MADD/MSUB support all numbers just like other operations
// The complex overflow conditions the document describes apparently test whether the multiplication's result
// has overflowed and whether the last operation that used ACC as a destination has overflowed.
// For example, { adda.s -MAX, 0.0 ; madd.s fd, MAX, 1.0 } -> fd = 0
// while { adda.s -MAX, -MAX ; madd.s fd, MAX, 1.0 } -> fd = -MAX
// (where MAX is 0x7fffffff and -MAX is 0xffffffff)
void recMaddsub(int info, int regd, int op, bool acc)
{
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
FPU_MUL(info, sreg, sreg, treg, false);
GET_ACC(treg);
if (FPU_CORRECT_ADD_SUB)
FPU_ADD_SUB(treg, sreg); //might be problematic for something!!!!
// TEST FOR ACC/MUL OVERFLOWS, PROPAGATE THEM IF THEY OCCUR
TEST32ItoM((uptr)&fpuRegs.fprc[31], FPUflagO);
u8 *mulovf = JNZ8(0);
ToDouble(sreg); //else, convert
TEST32ItoM((uptr)&fpuRegs.ACCflag, 1);
u8 *accovf = JNZ8(0);
ToDouble(treg); //else, convert
u8 *operation = JMP8(0);
x86SetJ8(mulovf);
if (op == 1) //sub
SSE_XORPS_M128_to_XMM(sreg, (uptr)s_const.neg);
SSE_MOVAPS_XMM_to_XMM(treg, sreg); //fall through below
x86SetJ8(accovf);
SetMaxValue(treg); //just in case... I think it has to be a MaxValue already here
CLEAR_OU_FLAGS; //clear U flag
if (FPU_FLAGS_OVERFLOW)
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagO | FPUflagSO);
if (FPU_FLAGS_OVERFLOW && acc)
OR32ItoM((uptr)&fpuRegs.ACCflag, 1);
u32 *skipall = JMP32(0);
// PERFORM THE ACCUMULATION AND TEST RESULT. CONVERT TO SINGLE
x86SetJ8(operation);
if (op == 1)
SSE2_SUBSD_XMM_to_XMM(treg, sreg);
else
SSE2_ADDSD_XMM_to_XMM(treg, sreg);
ToPS2FPU(treg, true, sreg, acc, true);
x86SetJ32(skipall);
SSE_MOVSS_XMM_to_XMM(regd, treg);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
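// For reference, the overflow-propagation choice above in scalar form: 'mul' is
// the (already clamped) product from FPU_MUL, 'acc' the current accumulator,
// op == 1 for MSUB.  A sketch only; the normal accumulate path is just noted in
// a comment.
#if 0
static bool MaddsubPropagateOverflow(u32 mul, u32 acc, int op, bool mulOverflowed, bool accOverflowed, u32& result)
{
	if (op == 1)
		mul ^= 0x80000000;               // MSUB accumulates -mul
	if (mulOverflowed)
		result = mul | 0x7fffffff;       // +/-MAX with the product's sign (O/SO set)
	else if (accOverflowed)
		result = acc | 0x7fffffff;       // +/-MAX with the accumulator's sign (O/SO set)
	else
		return false;                    // no overflow to propagate: ToDouble both,
	                                     // acc +/- mul, ToPS2FPU the sum
	return true;
}
#endif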
void recMADD_S_xmm(int info)
{
recMaddsub(info, EEREC_D, 0, false);
}
FPURECOMPILE_CONSTCODE(MADD_S, XMMINFO_WRITED|XMMINFO_READACC|XMMINFO_READS|XMMINFO_READT);
void recMADDA_S_xmm(int info)
{
recMaddsub(info, EEREC_ACC, 0, true);
}
FPURECOMPILE_CONSTCODE(MADDA_S, XMMINFO_WRITEACC|XMMINFO_READACC|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MAX / MIN XMM
//------------------------------------------------------------------
static const __aligned16 u32 minmax_mask[8] =
{
0xffffffff, 0x80000000, 0, 0,
0, 0x40000000, 0, 0
};
// FPU's MAX/MIN work with all numbers (including "denormals"). Check VU's logical min max for more info.
void recMINMAX(int info, bool ismin)
{
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
CLEAR_OU_FLAGS;
SSE2_PSHUFD_XMM_to_XMM(sreg, sreg, 0x00);
SSE2_PAND_M128_to_XMM(sreg, (uptr)minmax_mask);
SSE2_POR_M128_to_XMM(sreg, (uptr)&minmax_mask[4]);
SSE2_PSHUFD_XMM_to_XMM(treg, treg, 0x00);
SSE2_PAND_M128_to_XMM(treg, (uptr)minmax_mask);
SSE2_POR_M128_to_XMM(treg, (uptr)&minmax_mask[4]);
if (ismin)
SSE2_MINSD_XMM_to_XMM(sreg, treg);
else
SSE2_MAXSD_XMM_to_XMM(sreg, treg);
SSE_MOVSS_XMM_to_XMM(EEREC_D, sreg);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
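// For reference, one way to express the same ordering directly on the raw 32-bit
// patterns instead of via the double-embedding trick above: map each value to
// key = sign ? ~bits : bits | 0x80000000, then compare the keys as unsigned
// integers.  A sketch only, not used by the recompiler.
#if 0
static u32 LogicalMax(u32 a, u32 b)
{
	const u32 ka = (a & 0x80000000) ? ~a : (a | 0x80000000);
	const u32 kb = (b & 0x80000000) ? ~b : (b | 0x80000000);
	return (ka >= kb) ? a : b;
}
#endif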
void recMAX_S_xmm(int info)
{
recMINMAX(info, false);
}
FPURECOMPILE_CONSTCODE(MAX_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
void recMIN_S_xmm(int info)
{
recMINMAX(info, true);
}
FPURECOMPILE_CONSTCODE(MIN_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MOV XMM
//------------------------------------------------------------------
void recMOV_S_xmm(int info)
{
GET_S(EEREC_D);
}
FPURECOMPILE_CONSTCODE(MOV_S, XMMINFO_WRITED|XMMINFO_READS);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MSUB XMM
//------------------------------------------------------------------
void recMSUB_S_xmm(int info)
{
recMaddsub(info, EEREC_D, 1, false);
}
FPURECOMPILE_CONSTCODE(MSUB_S, XMMINFO_WRITED|XMMINFO_READACC|XMMINFO_READS|XMMINFO_READT);
void recMSUBA_S_xmm(int info)
{
recMaddsub(info, EEREC_ACC, 1, true);
}
FPURECOMPILE_CONSTCODE(MSUBA_S, XMMINFO_WRITEACC|XMMINFO_READACC|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MUL XMM
//------------------------------------------------------------------
void recMUL_S_xmm(int info)
{
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
FPU_MUL(info, EEREC_D, sreg, treg, false);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
FPURECOMPILE_CONSTCODE(MUL_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
void recMULA_S_xmm(int info)
{
int sreg, treg;
ALLOC_S(sreg); ALLOC_T(treg);
FPU_MUL(info, EEREC_ACC, sreg, treg, true);
_freeXMMreg(sreg); _freeXMMreg(treg);
}
FPURECOMPILE_CONSTCODE(MULA_S, XMMINFO_WRITEACC|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// NEG XMM
//------------------------------------------------------------------
void recNEG_S_xmm(int info) {
GET_S(EEREC_D);
CLEAR_OU_FLAGS;
SSE_XORPS_M128_to_XMM(EEREC_D, (uptr)&s_const.neg[0]);
}
FPURECOMPILE_CONSTCODE(NEG_S, XMMINFO_WRITED|XMMINFO_READS);
//------------------------------------------------------------------
//------------------------------------------------------------------
// SUB XMM
//------------------------------------------------------------------
void recSUB_S_xmm(int info)
{
recFPUOp(info, EEREC_D, 1, false);
}
FPURECOMPILE_CONSTCODE(SUB_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
void recSUBA_S_xmm(int info)
{
recFPUOp(info, EEREC_ACC, 1, true);
}
FPURECOMPILE_CONSTCODE(SUBA_S, XMMINFO_WRITEACC|XMMINFO_READS|XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// SQRT XMM
//------------------------------------------------------------------
void recSQRT_S_xmm(int info)
{
u8 *pjmp;
int roundmodeFlag = 0;
int tempReg = _allocX86reg(-1, X86TYPE_TEMP, 0, 0);
if (tempReg == -1) {Console.Error("FPU: SQRT Allocation Error!"); tempReg = EAX;}
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
if (t1reg == -1) {Console.Error("FPU: SQRT Allocation Error!"); return;}
//Console.WriteLn("FPU: SQRT");
if (g_sseMXCSR.GetRoundMode() != SSEround_Nearest)
{
// Set roundmode to nearest if it isn't already
//Console.WriteLn("sqrt to nearest");
roundmode_nearest = g_sseMXCSR;
roundmode_nearest.SetRoundMode( SSEround_Nearest );
xLDMXCSR (roundmode_nearest);
roundmodeFlag = 1;
}
GET_T(EEREC_D);
if (FPU_FLAGS_ID) {
AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagI|FPUflagD)); // Clear I and D flags
//--- Check for negative SQRT --- (sqrt(-0) = 0, unlike what the docs say)
SSE_MOVMSKPS_XMM_to_R32(tempReg, EEREC_D);
AND32ItoR(tempReg, 1); //Check sign
pjmp = JZ8(0); //Skip if none are
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags
SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&s_const.pos[0]); // Make EEREC_D Positive
x86SetJ8(pjmp);
}
else
{
SSE_ANDPS_M128_to_XMM(EEREC_D, (uptr)&s_const.pos[0]); // Make EEREC_D Positive
}
ToDouble(EEREC_D);
SSE2_SQRTSD_XMM_to_XMM(EEREC_D, EEREC_D);
ToPS2FPU(EEREC_D, false, t1reg, false);
if (roundmodeFlag == 1) {
xLDMXCSR (g_sseMXCSR);
}
_freeX86reg(tempReg);
_freeXMMreg(t1reg);
}
FPURECOMPILE_CONSTCODE(SQRT_S, XMMINFO_WRITED|XMMINFO_READT);
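// For reference, the negative-operand handling above in scalar form (FPU_FLAGS_ID
// path): any set sign bit, including -0, raises I/SI, and the square root is then
// taken of |Ft|, which is why sqrt(-0) comes out as 0.  A sketch only.
#if 0
static u32 SqrtInputFixup(u32 ftbits, u32& fcr31)
{
	fcr31 &= ~(FPUflagI | FPUflagD);
	if (ftbits & 0x80000000)
	{
		fcr31 |= FPUflagI | FPUflagSI;   // negative operand
		ftbits &= 0x7fffffff;            // operate on the absolute value
	}
	return ftbits;
}
#endif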
//------------------------------------------------------------------
//------------------------------------------------------------------
// RSQRT XMM
//------------------------------------------------------------------
void recRSQRThelper1(int regd, int regt) // Performs the RSQRT function when regd <- Fs and regt <- Ft (Sets correct flags)
{
u8 *pjmp1, *pjmp2;
u8 *qjmp1, *qjmp2;
u32 *pjmp32;
int t1reg = _allocTempXMMreg(XMMT_FPS, -1);
int tempReg = _allocX86reg(-1, X86TYPE_TEMP, 0, 0);
if (t1reg == -1) {Console.Error("FPU: RSQRT Allocation Error!"); return;}
if (tempReg == -1) {Console.Error("FPU: RSQRT Allocation Error!"); tempReg = EAX;}
AND32ItoM((uptr)&fpuRegs.fprc[31], ~(FPUflagI|FPUflagD)); // Clear I and D flags
//--- (first) Check for negative SQRT ---
SSE_MOVMSKPS_XMM_to_R32(tempReg, regt);
AND32ItoR(tempReg, 1); //Check sign
pjmp2 = JZ8(0); //Skip if not set
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags
SSE_ANDPS_M128_to_XMM(regt, (uptr)&s_const.pos[0]); // Make regt Positive
x86SetJ8(pjmp2);
//--- Check for zero ---
SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
SSE_CMPEQSS_XMM_to_XMM(t1reg, regt);
SSE_MOVMSKPS_XMM_to_R32(tempReg, t1reg);
AND32ItoR(tempReg, 1); //Check sign (if regt == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
//--- Check for 0/0 ---
SSE_XORPS_XMM_to_XMM(t1reg, t1reg);
SSE_CMPEQSS_XMM_to_XMM(t1reg, regd);
SSE_MOVMSKPS_XMM_to_R32(tempReg, t1reg);
AND32ItoR(tempReg, 1); //Check sign (if regd == zero, sign will be set)
qjmp1 = JZ8(0); //Skip if not set
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagI|FPUflagSI); // Set I and SI flags ( 0/0 )
qjmp2 = JMP8(0);
x86SetJ8(qjmp1); //x/0 but not 0/0
OR32ItoM((uptr)&fpuRegs.fprc[31], FPUflagD|FPUflagSD); // Set D and SD flags ( x/0 )
x86SetJ8(qjmp2);
SetMaxValue(regd); //clamp to max
pjmp32 = JMP32(0);
x86SetJ8(pjmp1);
ToDouble(regt); ToDouble(regd);
SSE2_SQRTSD_XMM_to_XMM(regt, regt);
SSE2_DIVSD_XMM_to_XMM(regd, regt);
ToPS2FPU(regd, false, regt, false);
x86SetJ32(pjmp32);
_freeXMMreg(t1reg);
_freeX86reg(tempReg);
}
void recRSQRThelper2(int regd, int regt) // Performs the RSQRT function when regd <- Fs and regt <- Ft (Doesn't set flags)
{
SSE_ANDPS_M128_to_XMM(regt, (uptr)&s_const.pos[0]); // Make regt Positive
ToDouble(regt); ToDouble(regd);
SSE2_SQRTSD_XMM_to_XMM(regt, regt);
SSE2_DIVSD_XMM_to_XMM(regd, regt);
ToPS2FPU(regd, false, regt, false);
}
void recRSQRT_S_xmm(int info)
{
int sreg, treg;
// iFPU (regular FPU) doesn't touch roundmode for rSQRT.
// Should this do the same? or is changing the roundmode to nearest the better
// behavior for both recs? --air
bool roundmodeFlag = false;
if (g_sseMXCSR.GetRoundMode() != SSEround_Nearest)
{
// Set roundmode to nearest if it isn't already
//Console.WriteLn("sqrt to nearest");
roundmode_nearest = g_sseMXCSR;
roundmode_nearest.SetRoundMode( SSEround_Nearest );
xLDMXCSR (roundmode_nearest);
roundmodeFlag = true;
}
ALLOC_S(sreg); ALLOC_T(treg);
if (FPU_FLAGS_ID)
recRSQRThelper1(sreg, treg);
else
recRSQRThelper2(sreg, treg);
SSE_MOVSS_XMM_to_XMM(EEREC_D, sreg);
_freeXMMreg(treg); _freeXMMreg(sreg);
if (roundmodeFlag) xLDMXCSR (g_sseMXCSR);
}
FPURECOMPILE_CONSTCODE(RSQRT_S, XMMINFO_WRITED|XMMINFO_READS|XMMINFO_READT);
} } } } }
#endif