From 281ab5f9cb9553375d5a020b13d48e3dcf38aa32 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 9 Oct 2014 20:01:47 +0200
Subject: [PATCH 1/3] Sync x64 emitter to Dolphin's.

---
 Common/x64Emitter.cpp | 296 +++++++++++++++++++++++++-----------------
 Common/x64Emitter.h   | 237 +++++++++++++++++----------------
 2 files changed, 301 insertions(+), 232 deletions(-)

diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index 4151ef1518..f454296470 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -23,6 +23,11 @@
 #include "MemoryUtil.h"
 #include "MsgHandler.h"
 
+#define PRIx64 "llx"
+
+// Minimize the diff against Dolphin
+#define DYNA_REC JIT
+
 namespace Gen
 {
 
@@ -32,7 +37,7 @@ struct NormalOpDef
 	u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, ext;
 };
 
-static const NormalOpDef nops[11] = 
+static const NormalOpDef nops[11] =
 {
 	{0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0}, //ADD
 	{0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 2}, //ADC
@@ -54,30 +59,30 @@ static const NormalOpDef nops[11] =
 
 enum NormalSSEOps
 {
-	sseCMP =         0xC2, 
-	sseADD =         0x58, //ADD
-	sseSUB =		 0x5C, //SUB
-	sseAND =		 0x54, //AND
-	sseANDN =		 0x55, //ANDN
-	sseOR  =         0x56, 
-	sseXOR  =        0x57,
-	sseMUL =		 0x59, //MUL,
-	sseDIV =		 0x5E, //DIV
-	sseMIN =		 0x5D, //MIN
-	sseMAX =		 0x5F, //MAX
-	sseCOMIS =		 0x2F, //COMIS
-	sseUCOMIS =		 0x2E, //UCOMIS
-	sseSQRT =		 0x51, //SQRT
-	sseRSQRT =		 0x52, //RSQRT (NO DOUBLE PRECISION!!!)
+	sseCMP         = 0xC2,
+	sseADD         = 0x58, //ADD
+	sseSUB         = 0x5C, //SUB
+	sseAND         = 0x54, //AND
+	sseANDN        = 0x55, //ANDN
+	sseOR          = 0x56,
+	sseXOR         = 0x57,
+	sseMUL         = 0x59, //MUL
+	sseDIV         = 0x5E, //DIV
+	sseMIN         = 0x5D, //MIN
+	sseMAX         = 0x5F, //MAX
+	sseCOMIS       = 0x2F, //COMIS
+	sseUCOMIS      = 0x2E, //UCOMIS
+	sseSQRT        = 0x51, //SQRT
+	sseRSQRT       = 0x52, //RSQRT (NO DOUBLE PRECISION!!!)
 	sseMOVAPfromRM = 0x28, //MOVAP from RM
-	sseMOVAPtoRM =	 0x29, //MOVAP to RM
-	sseMOVUPfromRM = 0x10, //MOVUP from RM
-	sseMOVUPtoRM =	 0x11, //MOVUP to RM
+	sseMOVAPtoRM   = 0x29, //MOVAP to RM
+	sseMOVUPfromRM = 0x10, //MOVUP from RM
 	sseMOVDQfromRM = 0x6F,
 	sseMOVDQtoRM   = 0x7F,
-	sseMASKMOVDQU =  0xF7,
-	sseLDDQU      =  0xF0,
-	sseSHUF       =  0xC6,
+	sseMOVUPtoRM   = 0x11, //MOVUP to RM
+	sseMASKMOVDQU  = 0xF7,
+	sseLDDQU       = 0xF0,
+	sseSHUF        = 0xC6,
 	sseMOVNTDQ     = 0xE7,
 	sseMOVNTP      = 0x2B,
 };
@@ -128,9 +133,9 @@ const u8 *XEmitter::AlignCodePage()
 	return code;
 }
 
-void XEmitter::WriteModRM(int mod, int rm, int reg)
+void XEmitter::WriteModRM(int mod, int reg, int rm)
 {
-	Write8((u8)((mod << 6) | ((rm & 7) << 3) | (reg & 7)));
+	Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7))); // one byte: mod in bits 7:6, reg in bits 5:3, rm in bits 2:0
 }
 
 void XEmitter::WriteSIB(int scale, int index, int base)
@@ -148,32 +153,66 @@ void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const
 	if (indexReg & 8)         op |= 2;
 	if (offsetOrBaseReg & 8)  op |= 1; //TODO investigate if this is dangerous
 	if (op != 0x40 ||
-	    (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
-	    (opBits == 8 && (customOp & 0x10c) == 4)) {
+		(bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
+		(opBits == 8 && (customOp & 0x10c) == 4)) {
 		emit->Write8(op);
-		_dbg_assert_(JIT, (offsetOrBaseReg & 0x100) == 0 || bits != 8);
-		_dbg_assert_(JIT, (customOp & 0x100) == 0 || opBits != 8);
+		_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 0x100) == 0 || bits != 8);
+		_dbg_assert_(DYNA_REC, (customOp & 0x100) == 0 || opBits != 8);
 	} else {
-		_dbg_assert_(JIT, (offsetOrBaseReg & 0x10c) == 0 ||
-			     (offsetOrBaseReg & 0x10c) == 0x104 ||
-			     bits != 8);
-		_dbg_assert_(JIT, (customOp & 0x10c) == 0 ||
-			     (customOp & 0x10c) == 0x104 ||
-			     opBits != 8);
+		_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 0x10c) == 0 ||
+				(offsetOrBaseReg & 0x10c) == 0x104 ||
+				bits != 8);
+		_dbg_assert_(DYNA_REC, (customOp & 0x10c) == 0 ||
+				(customOp & 0x10c) == 0x104 ||
+				opBits != 8);
 	}
 
 #else
-	_dbg_assert_(JIT, opBits != 64);
-	_dbg_assert_(JIT, (customOp & 8) == 0 || customOp == -1);
-	_dbg_assert_(JIT, (indexReg & 8) == 0);
-	_dbg_assert_(JIT, (offsetOrBaseReg & 8) == 0);
-	_dbg_assert_(JIT, opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1);
-	_dbg_assert_(JIT, scale == SCALE_ATREG || bits != 8 || (offsetOrBaseReg & 0x10c) != 4);
+	_dbg_assert_(DYNA_REC, opBits != 64);
+	_dbg_assert_(DYNA_REC, (customOp & 8) == 0 || customOp == -1);
+	_dbg_assert_(DYNA_REC, (indexReg & 8) == 0);
+	_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 8) == 0);
+	_dbg_assert_(DYNA_REC, opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1);
+	_dbg_assert_(DYNA_REC, bits != 8 || (offsetOrBaseReg & 0x10c) != 4);
 #endif
 }
 
+// Writes a VEX prefix (2-byte 0xC5 form when possible, else 3-byte 0xC4). R/X/B and vvvv are stored inverted.
+void OpArg::WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, Gen::X64Reg regOp2) const
+{
+	int R = !(regOp1 & 8);
+	int X = !(indexReg & 8);
+	int B = !(offsetOrBaseReg & 8);
+
+	// not so sure about this one...
+	int W = 0;
+
+	// aka map_select in AMD manuals; only support VEX opcode map 1 for now (analog to secondary opcode map)
+	int mmmmm = 1;
+
+	int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
+	int L = size == 256;
+	// pp stands in for the legacy prefix WriteSSEOp2 would emit: 0 = none, 1 = 0x66, 2 = 0xF3, 3 = 0xF2
+	int pp = packed ? (size == 64 ? 1 : 0) : (size == 64 ? 3 : 2);
+	// do we need any VEX fields that only appear in the three-byte form?
+	if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
+	{
+		u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp; // L is bit 2; pp occupies bits 1:0
+		emit->Write8(0xC5);
+		emit->Write8(RvvvvLpp);
+	}
+	else
+	{
+		u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
+		u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp;
+		emit->Write8(0xC4);
+		emit->Write8(RXBmmmmm);
+		emit->Write8(WvvvvLpp);
+	}
+}
+
 void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
-    bool warn_64bit_offset) const
+	bool warn_64bit_offset) const
 {
 	if (_operandReg == 0xff)
 		_operandReg = (X64Reg)this->operandReg;
@@ -191,10 +230,10 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 #ifdef _M_X64
 		u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
 		s64 distance = (s64)offset - (s64)ripAddr;
-		_assert_msg_(JIT, (distance < 0x80000000LL
+		_assert_msg_(DYNA_REC, (distance < 0x80000000LL
 					&& distance >=  -0x80000000LL) ||
 			     !warn_64bit_offset,
-			     "WriteRest: op out of range (0x%llx uses 0x%llx)",
+			     "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")",
 			     ripAddr, offset);
 		s32 offs = (s32)distance;
 		emit->Write32((u32)offs);
@@ -248,7 +287,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 				SIB = true;
 			}
 
-			if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4)) 
+			if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4))
 			{
 				SIB = true;
 				ireg = _offsetOrBaseReg;
@@ -273,7 +312,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	int oreg = _offsetOrBaseReg;
 	if (SIB)
 		oreg = 4;
-	
+
 	// TODO(ector): WTF is this if about? I don't remember writing it :-)
 	//if (RIP)
 	//    oreg = 5;
@@ -286,7 +325,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 		int ss;
 		switch (scale)
 		{
-		case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP 
+		case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP
 		case SCALE_1: ss = 0; break;
 		case SCALE_2: ss = 1; break;
 		case SCALE_4: ss = 2; break;
@@ -295,7 +334,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 		case SCALE_NOBASE_4: ss = 2; break;
 		case SCALE_NOBASE_8: ss = 3; break;
 		case SCALE_ATREG: ss = 0; break;
-		default: _assert_msg_(JIT, 0, "Invalid scale for SIB byte"); ss = 0; break;
+		default: _assert_msg_(DYNA_REC, 0, "Invalid scale for SIB byte"); ss = 0; break;
 		}
 		emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7)));
 	}
@@ -317,7 +356,7 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 // B = base register# upper bit
 void XEmitter::Rex(int w, int r, int x, int b)
 {
-	w = w ? 1 : 0;		
+	w = w ? 1 : 0;
 	r = r ? 1 : 0;
 	x = x ? 1 : 0;
 	b = b ? 1 : 0;
@@ -332,7 +371,7 @@ void XEmitter::JMP(const u8 *addr, bool force5Bytes)
 	if (!force5Bytes)
 	{
 		s64 distance = (s64)(fn - ((u64)code + 2));
-		_assert_msg_(JIT, distance >= -0x80 && distance < 0x80,
+		_assert_msg_(DYNA_REC, distance >= -0x80 && distance < 0x80,
 			     "Jump target too far away, needs force5Bytes = true");
 		//8 bits will do
 		Write8(0xEB);
@@ -342,7 +381,7 @@ void XEmitter::JMP(const u8 *addr, bool force5Bytes)
 	{
 		s64 distance = (s64)(fn - ((u64)code + 5));
 
-		_assert_msg_(JIT, distance >= -0x80000000LL
+		_assert_msg_(DYNA_REC, distance >= -0x80000000LL
 			     && distance < 0x80000000LL,
 			     "Jump target too far away, needs indirect register");
 		Write8(0xE9);
@@ -353,7 +392,7 @@ void XEmitter::JMP(const u8 *addr, bool force5Bytes)
 void XEmitter::JMPptr(const OpArg &arg2)
 {
 	OpArg arg = arg2;
-	if (arg.IsImm()) _assert_msg_(JIT, 0, "JMPptr - Imm argument");
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "JMPptr - Imm argument");
 	arg.operandReg = 4;
 	arg.WriteRex(this, 0, 0);
 	Write8(0xFF);
@@ -370,7 +409,7 @@ void XEmitter::JMPself()
 
 void XEmitter::CALLptr(OpArg arg)
 {
-	if (arg.IsImm()) _assert_msg_(JIT, 0, "CALLptr - Imm argument");
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "CALLptr - Imm argument");
 	arg.operandReg = 2;
 	arg.WriteRex(this, 0, 0);
 	Write8(0xFF);
@@ -380,7 +419,7 @@ void XEmitter::CALLptr(OpArg arg)
 void XEmitter::CALL(const void *fnptr)
 {
 	u64 distance = u64(fnptr) - (u64(code) + 5);
-	_assert_msg_(JIT, distance < 0x0000000080000000ULL
+	_assert_msg_(DYNA_REC, distance < 0x0000000080000000ULL
 		     || distance >=  0xFFFFFFFF80000000ULL,
 		     "CALL out of range (%p calls %p)", code, fnptr);
 	Write8(0xE8);
@@ -432,7 +471,7 @@ void XEmitter::J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes)
 	if (!force5Bytes)
 	{
 		s64 distance = (s64)(fn - ((u64)code + 2));
-		_assert_msg_(JIT, distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
+		_assert_msg_(DYNA_REC, distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
 		//8 bits will do
 		Write8(0x70 + conditionCode);
 		Write8((u8)(s8)distance);
@@ -440,7 +479,7 @@ void XEmitter::J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes)
 	else
 	{
 		s64 distance = (s64)(fn - ((u64)code + 6));
-		_assert_msg_(JIT, distance >= -0x80000000LL
+		_assert_msg_(DYNA_REC, distance >= -0x80000000LL
 			     && distance < 0x80000000LL,
 			     "Jump target too far away, needs indirect register");
 		Write8(0x0F);
@@ -454,13 +493,13 @@ void XEmitter::SetJumpTarget(const FixupBranch &branch)
 	if (branch.type == 0)
 	{
 		s64 distance = (s64)(code - branch.ptr);
-		_assert_msg_(JIT, distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
+		_assert_msg_(DYNA_REC, distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
 		branch.ptr[-1] = (u8)(s8)distance;
 	}
 	else if (branch.type == 1)
 	{
 		s64 distance = (s64)(code - branch.ptr);
-		_assert_msg_(JIT, distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register");
+		_assert_msg_(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register");
 		((s32*)branch.ptr)[-1] = (s32)distance;
 	}
 }
@@ -491,9 +530,7 @@ void XEmitter::DEC(int bits, OpArg arg)
 
 //Single byte opcodes
 //There is no PUSHAD/POPAD in 64-bit mode.
-void XEmitter::INT3() {
-	Write8(0xCC);
-}
+void XEmitter::INT3() {Write8(0xCC);}
 void XEmitter::RET()  {Write8(0xC3);}
 void XEmitter::RET_FAST()  {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret
 
@@ -515,7 +552,7 @@ void XEmitter::NOP(int count)
 		}
 		break;
 	}
-} 
+}
 
 void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu
 void XEmitter::CLC()  {Write8(0xF8);} //clear carry
@@ -577,8 +614,8 @@ void XEmitter::CBW(int bits)
 void XEmitter::PUSH(X64Reg reg) {WriteSimple1Byte(32, 0x50, reg);}
 void XEmitter::POP(X64Reg reg)  {WriteSimple1Byte(32, 0x58, reg);}
 
-void XEmitter::PUSH(int bits, const OpArg &reg) 
-{ 
+void XEmitter::PUSH(int bits, const OpArg &reg)
+{
 	if (reg.IsSimpleReg())
 		PUSH(reg.GetSimpleReg());
 	else if (reg.IsImm())
@@ -599,7 +636,7 @@ void XEmitter::PUSH(int bits, const OpArg &reg)
 			Write32((u32)reg.offset);
 			break;
 		default:
-			_assert_msg_(JIT, 0, "PUSH - Bad imm bits");
+			_assert_msg_(DYNA_REC, 0, "PUSH - Bad imm bits");
 			break;
 		}
 	}
@@ -614,7 +651,7 @@ void XEmitter::PUSH(int bits, const OpArg &reg)
 }
 
 void XEmitter::POP(int /*bits*/, const OpArg &reg)
-{ 
+{
 	if (reg.IsSimpleReg())
 		POP(reg.GetSimpleReg());
 	else
@@ -637,7 +674,7 @@ void XEmitter::BSWAP(int bits, X64Reg reg)
 	}
 	else
 	{
-		_assert_msg_(JIT, 0, "BSWAP - Wrong number of bits");
+		_assert_msg_(DYNA_REC, 0, "BSWAP - Wrong number of bits");
 	}
 }
 
@@ -651,7 +688,7 @@ void XEmitter::UD2()
 
 void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
 {
-	if (arg.IsImm()) _assert_msg_(JIT, 0, "PREFETCH - Imm argument");
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "PREFETCH - Imm argument");
 	arg.operandReg = (u8)level;
 	arg.WriteRex(this, 0, 0);
 	Write8(0x0F);
@@ -661,7 +698,7 @@ void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
 
 void XEmitter::SETcc(CCFlags flag, OpArg dest)
 {
-	if (dest.IsImm()) _assert_msg_(JIT, 0, "SETcc - Imm argument");
+	if (dest.IsImm()) _assert_msg_(DYNA_REC, 0, "SETcc - Imm argument");
 	dest.operandReg = 0;
 	dest.WriteRex(this, 0, 0);
 	Write8(0x0F);
@@ -671,7 +708,7 @@ void XEmitter::SETcc(CCFlags flag, OpArg dest)
 
 void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
 {
-	if (src.IsImm()) _assert_msg_(JIT, 0, "CMOVcc - Imm argument");
+	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "CMOVcc - Imm argument");
 	src.operandReg = dest;
 	src.WriteRex(this, bits, bits);
 	Write8(0x0F);
@@ -681,7 +718,7 @@ void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
 
 void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
 {
-	if (src.IsImm()) _assert_msg_(JIT, 0, "WriteMulDivType - Imm argument");
+	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "WriteMulDivType - Imm argument");
 	src.operandReg = ext;
 	if (bits == 16) Write8(0x66);
 	src.WriteRex(this, bits, bits);
@@ -705,7 +742,7 @@ void XEmitter::NOT(int bits, OpArg src)  {WriteMulDivType(bits, src, 2);}
 
 void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2)
 {
-	if (src.IsImm()) _assert_msg_(JIT, 0, "WriteBitSearchType - Imm argument");
+	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "WriteBitSearchType - Imm argument");
 	src.operandReg = (u8)dest;
 	if (bits == 16) Write8(0x66);
 	src.WriteRex(this, bits, bits);
@@ -716,7 +753,7 @@ void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2)
 
 void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src)
 {
-	if (bits <= 16) _assert_msg_(JIT, 0, "MOVNTI - bits<=16");
+	if (bits <= 16) _assert_msg_(DYNA_REC, 0, "MOVNTI - bits<=16");
 	WriteBitSearchType(bits, src, dest, 0xC3);
 }
 
@@ -725,7 +762,7 @@ void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,de
 
 void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
 {
-	if (src.IsImm()) _assert_msg_(JIT, 0, "MOVSX - Imm argument");
+	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "MOVSX - Imm argument");
 	if (dbits == sbits) {
 		MOV(dbits, R(dest), src);
 		return;
@@ -756,7 +793,7 @@ void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
 
 void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 {
-	if (src.IsImm()) _assert_msg_(JIT, 0, "MOVZX - Imm argument");
+	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "MOVZX - Imm argument");
 	if (dbits == sbits) {
 		MOV(dbits, R(dest), src);
 		return;
@@ -775,6 +812,10 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 		Write8(0x0F);
 		Write8(0xB7);
 	}
+	else if (sbits == 32 && dbits == 64)
+	{
+		Write8(0x8B);
+	}
 	else
 	{
 		Crash();
@@ -785,7 +826,7 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 
 void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
 {
-	if (src.IsImm()) _assert_msg_(JIT, 0, "LEA - Imm argument");
+	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "LEA - Imm argument");
 	src.operandReg = (u8)dest;
 	if (bits == 16) Write8(0x66); //TODO: performance warning
 	src.WriteRex(this, bits, bits);
@@ -799,11 +840,11 @@ void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
 	bool writeImm = false;
 	if (dest.IsImm())
 	{
-		_assert_msg_(JIT, 0, "WriteShift - can't shift imms");
+		_assert_msg_(DYNA_REC, 0, "WriteShift - can't shift imms");
 	}
 	if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8))
 	{
-		_assert_msg_(JIT, 0, "WriteShift - illegal argument"); 
+		_assert_msg_(DYNA_REC, 0, "WriteShift - illegal argument");
 	}
 	dest.operandReg = ext;
 	if (bits == 16) Write8(0x66);
@@ -846,11 +887,11 @@ void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
 {
 	if (dest.IsImm())
 	{
-		_assert_msg_(JIT, 0, "WriteBitTest - can't test imms");
+		_assert_msg_(DYNA_REC, 0, "WriteBitTest - can't test imms");
 	}
 	if ((index.IsImm() && index.GetImmBits() != 8))
 	{
-		_assert_msg_(JIT, 0, "WriteBitTest - illegal argument"); 
+		_assert_msg_(DYNA_REC, 0, "WriteBitTest - illegal argument");
 	}
 	if (bits == 16) Write8(0x66);
 	if (index.IsImm())
@@ -879,15 +920,15 @@ void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
 {
 	if (dest.IsImm())
 	{
-		_assert_msg_(JIT, 0, "SHRD - can't use imms as destination");
+		_assert_msg_(DYNA_REC, 0, "SHRD - can't use imms as destination");
 	}
 	if (!src.IsSimpleReg())
 	{
-		_assert_msg_(JIT, 0, "SHRD - must use simple register as source");
+		_assert_msg_(DYNA_REC, 0, "SHRD - must use simple register as source");
 	}
 	if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8))
 	{
-		_assert_msg_(JIT, 0, "SHRD - illegal shift"); 
+		_assert_msg_(DYNA_REC, 0, "SHRD - illegal shift");
 	}
 	if (bits == 16) Write8(0x66);
 	X64Reg operand = src.GetSimpleReg();
@@ -909,15 +950,15 @@ void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift)
 {
 	if (dest.IsImm())
 	{
-		_assert_msg_(JIT, 0, "SHLD - can't use imms as destination");
+		_assert_msg_(DYNA_REC, 0, "SHLD - can't use imms as destination");
 	}
 	if (!src.IsSimpleReg())
 	{
-		_assert_msg_(JIT, 0, "SHLD - must use simple register as source");
+		_assert_msg_(DYNA_REC, 0, "SHLD - must use simple register as source");
 	}
 	if ((shift.IsSimpleReg() && shift.GetSimpleReg() != ECX) || (shift.IsImm() && shift.GetImmBits() != 8))
 	{
-		_assert_msg_(JIT, 0, "SHLD - illegal shift"); 
+		_assert_msg_(DYNA_REC, 0, "SHLD - illegal shift");
 	}
 	if (bits == 16) Write8(0x66);
 	X64Reg operand = src.GetSimpleReg();
@@ -952,7 +993,7 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 	X64Reg _operandReg = (X64Reg)this->operandReg;
 	if (IsImm())
 	{
-		_assert_msg_(JIT, 0, "WriteNormalOp - Imm argument, wrong order");
+		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order");
 	}
 
 	if (bits == 16)
@@ -967,24 +1008,24 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 
 		if (!toRM)
 		{
-			_assert_msg_(JIT, 0, "WriteNormalOp - Writing to Imm (!toRM)");
+			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)");
 		}
 
-		if (operand.scale == SCALE_IMM8 && bits == 8) 
+		if (operand.scale == SCALE_IMM8 && bits == 8)
 		{
 			emit->Write8(nops[op].imm8);
 			immToWrite = 8;
 		}
 		else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
-                             (operand.scale == SCALE_IMM32 && bits == 32) || 
-                             (operand.scale == SCALE_IMM32 && bits == 64))
+				 (operand.scale == SCALE_IMM32 && bits == 32) ||
+				 (operand.scale == SCALE_IMM32 && bits == 64))
 		{
 			emit->Write8(nops[op].imm32);
 			immToWrite = bits == 16 ? 16 : 32;
 		}
 		else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
-                             (operand.scale == SCALE_IMM8 && bits == 32) ||
-                             (operand.scale == SCALE_IMM8 && bits == 64))
+				 (operand.scale == SCALE_IMM8 && bits == 32) ||
+				 (operand.scale == SCALE_IMM8 && bits == 64))
 		{
 			emit->Write8(nops[op].simm8);
 			immToWrite = 8;
@@ -997,11 +1038,11 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 				emit->Write64((u64)operand.offset);
 				return;
 			}
-			_assert_msg_(JIT, 0, "WriteNormalOp - Only MOV can take 64-bit imm");
+			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm");
 		}
 		else
 		{
-			_assert_msg_(JIT, 0, "WriteNormalOp - Unhandled case");
+			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
 		}
 		_operandReg = (X64Reg)nops[op].ext; //pass extension in REG of ModRM
 	}
@@ -1036,7 +1077,7 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 		emit->Write32((u32)operand.offset);
 		break;
 	default:
-		_assert_msg_(JIT, 0, "WriteNormalOp - Unhandled case");
+		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
 	}
 }
 
@@ -1045,7 +1086,7 @@ void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg
 	if (a1.IsImm())
 	{
 		//Booh! Can't write to an imm
-		_assert_msg_(JIT, 0, "WriteNormalOp - a1 cannot be imm");
+		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm");
 		return;
 	}
 	if (a2.IsImm())
@@ -1072,11 +1113,11 @@ void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(t
 void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmAND, a1, a2);}
 void XEmitter::OR  (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmOR , a1, a2);}
 void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXOR, a1, a2);}
-void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2) 
+void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2)
 {
 #ifdef _DEBUG
-	_assert_msg_(JIT, !a1.IsSimpleReg() || !a2.IsSimpleReg() || a1.GetSimpleReg() != a2.GetSimpleReg(), "Redundant MOV @ %p - bug in JIT?", 
-				 code); 
+	_assert_msg_(DYNA_REC, !a1.IsSimpleReg() || !a2.IsSimpleReg() || a1.GetSimpleReg() != a2.GetSimpleReg(), "Redundant MOV @ %p - bug in DYNA_REC?",
+				 code);
 #endif
 	WriteNormalOp(this, bits, nrmMOV, a1, a2);
 }
@@ -1087,16 +1128,16 @@ void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(t
 void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
 {
 	if (bits == 8) {
-		_assert_msg_(JIT, 0, "IMUL - illegal bit size!");
+		_assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!");
 		return;
 	}
 	if (a1.IsImm()) {
-		_assert_msg_(JIT, 0, "IMUL - second arg cannot be imm!");
+		_assert_msg_(DYNA_REC, 0, "IMUL - second arg cannot be imm!");
 		return;
 	}
 	if (!a2.IsImm())
 	{
-		_assert_msg_(JIT, 0, "IMUL - third arg must be imm!");
+		_assert_msg_(DYNA_REC, 0, "IMUL - third arg must be imm!");
 		return;
 	}
 
@@ -1118,7 +1159,7 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
 				a1.WriteRest(this, 4, regOp);
 				Write32((u32)a2.offset);
 		} else {
-			_assert_msg_(JIT, 0, "IMUL - unhandled case!");
+			_assert_msg_(DYNA_REC, 0, "IMUL - unhandled case!");
 		}
 	}
 }
@@ -1126,7 +1167,7 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
 void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
 {
 	if (bits == 8) {
-		_assert_msg_(JIT, 0, "IMUL - illegal bit size!");
+		_assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!");
 		return;
 	}
 	if (a.IsImm())
@@ -1160,7 +1201,7 @@ void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg a
 void XEmitter::WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
 {
 	if (size == 64 && packed)
-		Write8(0x66); //this time, override goes upwards
+		Write8(0x66); //this time, override goes upwards
 	if (!packed)
 		Write8(size == 64 ? 0xF2 : 0xF3);
 	arg.operandReg = regOp;
@@ -1171,6 +1212,18 @@ void XEmitter::WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg
 	arg.WriteRest(this, extrabytes);
 }
 
+void XEmitter::WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	WriteAVXOp(size, sseOp, packed, regOp, X64Reg::INVALID_REG, arg, extrabytes); // no second register operand: WriteVex encodes INVALID_REG as vvvv = 0xf
+}
+
+void XEmitter::WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	arg.WriteVex(this, size, packed, regOp1, regOp2); // VEX prefix, built from size/packed and both registers
+	Write8(sseOp); // opcode byte
+	arg.WriteRest(this, extrabytes, regOp1); // remaining operand bytes for arg, with regOp1 passed as the operand register
+}
+
 void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
 void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}
 
@@ -1218,8 +1271,8 @@ void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) {
 
 void XEmitter::WriteMXCSR(OpArg arg, int ext)
 {
-	if (arg.IsImm() || arg.IsSimpleReg()) 
-		_assert_msg_(JIT, 0, "MXCSR - invalid operand");
+	if (arg.IsImm() || arg.IsSimpleReg())
+		_assert_msg_(DYNA_REC, 0, "MXCSR - invalid operand");
 
 	arg.operandReg = ext;
 	arg.WriteRex(this, 0, 0);
@@ -1278,8 +1331,8 @@ void XEmitter::MAXPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMAX, true, re
 void XEmitter::SQRTPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseSQRT, true, regOp, arg);}
 void XEmitter::SQRTPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseSQRT, true, regOp, arg);}
 void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);}
-void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);} 
-void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);} 
+void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
+void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
 
 void XEmitter::COMISS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed
 void XEmitter::COMISD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered
@@ -1287,13 +1340,13 @@ void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true,
 void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);}
 
 void XEmitter::MOVAPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);}
-void XEmitter::MOVAPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);}
-void XEmitter::MOVUPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);}
-void XEmitter::MOVUPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);}
-
 void XEmitter::MOVAPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);}
+void XEmitter::MOVAPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);}
 void XEmitter::MOVAPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);}
+
+void XEmitter::MOVUPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);}
 void XEmitter::MOVUPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);}
+void XEmitter::MOVUPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);}
 void XEmitter::MOVUPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);}
 
 void XEmitter::MOVDQA(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseMOVDQfromRM, true, regOp, arg);}
@@ -1311,7 +1364,7 @@ void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, reg
 
 void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);}
 void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);}
-void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xF2, false, regOp, arg);}
+void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2D, false, regOp, arg);}
 
 void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);}
 void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);}
@@ -1339,7 +1392,7 @@ void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x15, true, dest
 void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);}
 void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);}
 
-void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg) 
+void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
 {
 	if (cpu_info.bSSE3)
 	{
@@ -1356,7 +1409,7 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
 
 //There are a few more left
 
-// Also some integer instrucitons are missing
+// Also some integer instructions are missing
 void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);}
 void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);}
 //void PACKUSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);} // WRONG
@@ -1515,8 +1568,8 @@ void XEmitter::PCMPGTB(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x64, true, dest
 void XEmitter::PCMPGTW(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x65, true, dest, arg);}
 void XEmitter::PCMPGTD(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x66, true, dest, arg);}
 
-void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)    { WriteSSEOp(64, 0xC5, true, dest, arg); Write8(subreg); }
-void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)    { WriteSSEOp(64, 0xC4, true, dest, arg); Write8(subreg); }
+void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(64, 0xC5, true, dest, arg); Write8(subreg);}
+void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(64, 0xC4, true, dest, arg); Write8(subreg);}
 
 void XEmitter::PMADDWD(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0xF5, true, dest, arg); }
 void XEmitter::PSADBW(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xF6, true, dest, arg);}
@@ -1531,6 +1584,13 @@ void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xD7, true, d
 void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle)    {WriteSSEOp(64, 0x70, true, regOp, arg, 1); Write8(shuffle);}
 void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);}
 
+// VEX-encoded scalar double ops (size 64, packed = false); WriteAVXOp/WriteVex put regOp2 in VEX.vvvv
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseADD, false, regOp1, regOp2, arg);}
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseSUB, false, regOp1, regOp2, arg);}
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
+
 // Prefixes
 
 void XEmitter::LOCK()  { Write8(0xF0); }
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index cf739c7a30..2b163ff52a 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -22,10 +22,6 @@
 
 #include "Common.h"
 
-#if !defined(_M_IX86) && !defined(_M_X64)
-#error "Don't build this on arm."
-#endif
-
 namespace Gen
 {
 
@@ -33,7 +29,7 @@ enum X64Reg
 {
 	EAX = 0, EBX = 3, ECX = 1, EDX = 2,
 	ESI = 6, EDI = 7, EBP = 5, ESP = 4,
-	
+
 	RAX = 0, RBX = 3, RCX = 1, RDX = 2,
 	RSI = 6, RDI = 7, RBP = 5, RSP = 4,
 	R8  = 8, R9  = 9, R10 = 10,R11 = 11,
@@ -46,9 +42,12 @@ enum X64Reg
 	AX = 0, BX = 3, CX = 1, DX = 2,
 	SI = 6, DI = 7, BP = 5, SP = 4,
 
-	XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, 
+	XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
 	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
 
+	YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+	YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,
+
 	INVALID_REG = 0xFFFFFFFF
 };
 
@@ -59,7 +58,7 @@ enum CCFlags
 	CC_B   = 2, CC_C  = 2, CC_NAE = 2,
 	CC_NB  = 3, CC_NC = 3, CC_AE  = 3,
 	CC_Z   = 4, CC_E   = 4,
-	CC_NZ  = 5,	CC_NE  = 5, 
+	CC_NZ  = 5,	CC_NE  = 5,
 	CC_BE  = 6, CC_NA  = 6,
 	CC_NBE = 7, CC_A   = 7,
 	CC_S   = 8,
@@ -111,8 +110,7 @@ enum NormalOp {
 	nrmXCHG,
 };
 
-enum
-{
+enum {
 	CMP_EQ = 0,
 	CMP_LT = 1,
 	CMP_LE = 2,
@@ -125,6 +123,7 @@ enum
 
 class XEmitter;
 
+// RIP addressing does not benefit from micro op fusion on Core arch
 struct OpArg
 {
 	OpArg() {}  // dummy op arg, used for storage
@@ -134,10 +133,11 @@ struct OpArg
 		scale = (u8)_scale;
 		offsetOrBaseReg = (u16)rmReg;
 		indexReg = (u16)scaledReg;
-		//if scale == 0 never mind offseting
+		//if scale == 0 never mind offsetting
 		offset = _offset;
 	}
 	void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
+	void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const;
 	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const;
 	void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
 	// This one is public - must be written to
@@ -148,6 +148,8 @@ struct OpArg
 	bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
 	bool IsSimpleReg() const {return scale == SCALE_NONE;}
 	bool IsSimpleReg(X64Reg reg) const {
+		if (!IsSimpleReg())
+			return false;
 		return GetSimpleReg() == reg;
 	}
 
@@ -186,16 +188,17 @@ struct OpArg
 	void IncreaseOffset(int sz) {
 		offset += sz;
 	}
+
 private:
 	u8 scale;
 	u16 offsetOrBaseReg;
 	u16 indexReg;
 };
 
-inline OpArg M(const void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
+inline OpArg M(void *ptr)	    {return OpArg((u64)ptr, (int)SCALE_RIP);}
 template <typename T>
 inline OpArg M(const T *ptr)    {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);}
-inline OpArg R(X64Reg value)    {return OpArg(0, SCALE_NONE, value);}
+inline OpArg R(X64Reg value)	{return OpArg(0, SCALE_NONE, value);}
 inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
 inline OpArg MDisp(X64Reg value, int offset) {
 	return OpArg((u32)offset, SCALE_ATREG, value);
@@ -224,11 +227,11 @@ inline OpArg SImmAuto(s32 imm) {
 }
 
 #ifdef _M_X64
-inline OpArg ImmPtr(const void *imm) {return Imm64((u64)imm);}
+inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);}
 #else
-inline OpArg ImmPtr(const void *imm) {return Imm32((u32)imm);}
+inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);}
 #endif
-inline u32 PtrOffset(const void *ptr, const void *base) {
+inline u32 PtrOffset(const void* ptr, const void* base) {
 #ifdef _M_X64
 	s64 distance = (s64)ptr-(s64)base;
 	if (distance >= 0x80000000LL ||
@@ -253,6 +256,18 @@ struct FixupBranch
 	int type; //0 = 8bit 1 = 32bit
 };
 
+enum SSECompare
+{
+	EQ = 0,
+	LT,
+	LE,
+	UNORD,
+	NEQ,
+	NLT,
+	NLE,
+	ORD,
+};
+
 typedef const u8* JumpTarget;
 
 class XEmitter
@@ -271,15 +286,12 @@ private:
 	void WriteMXCSR(OpArg arg, int ext);
 	void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
 	void WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
 
 protected:
-	inline void Write8(u8 value)   {
-		//if (value == 0xcc) {
-		//	value = 0xcc;   // set breakpoint here to find where mysterious 0xcc are written
-		//}
-		*code++ = value;
-	}
+	inline void Write8(u8 value)   {*code++ = value;}
 	inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
 	inline void Write32(u32 value) {*(u32*)code = (value); code += 4;}
 	inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
@@ -301,7 +313,7 @@ public:
 	u8 *GetWritableCodePtr();
 
 	// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
-	// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., 
+	// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
 	// INC and DEC are slow on Intel Core, but not on AMD. They create a
 	// false flag dependency because they only update a subset of the flags.
 	// XCHG is SLOW and should be avoided.
@@ -390,7 +402,7 @@ public:
 	void DIV(int bits, OpArg src);
 	void IDIV(int bits, OpArg src);
 
-	// Shift 
+	// Shift
 	void ROL(int bits, OpArg dest, OpArg shift);
 	void ROR(int bits, OpArg dest, OpArg shift);
 	void RCL(int bits, OpArg dest, OpArg shift);
@@ -445,7 +457,7 @@ public:
 
 	// Sign/zero extension
 	void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
-	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); 
+	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
 	// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
 	void STMXCSR(OpArg memloc);
@@ -459,25 +471,33 @@ public:
 	void FWAIT();
 
 	// SSE/SSE2: Floating point arithmetic
-	void ADDSS(X64Reg regOp, OpArg arg);  
-	void ADDSD(X64Reg regOp, OpArg arg);  
-	void SUBSS(X64Reg regOp, OpArg arg);  
-	void SUBSD(X64Reg regOp, OpArg arg);  
-	void MULSS(X64Reg regOp, OpArg arg);  
-	void MULSD(X64Reg regOp, OpArg arg);  
-	void DIVSS(X64Reg regOp, OpArg arg);  
-	void DIVSD(X64Reg regOp, OpArg arg);  
-	void MINSS(X64Reg regOp, OpArg arg);  
-	void MINSD(X64Reg regOp, OpArg arg);  
-	void MAXSS(X64Reg regOp, OpArg arg);  
-	void MAXSD(X64Reg regOp, OpArg arg);  
-	void SQRTSS(X64Reg regOp, OpArg arg); 
-	void SQRTSD(X64Reg regOp, OpArg arg); 
+	void ADDSS(X64Reg regOp, OpArg arg);
+	void ADDSD(X64Reg regOp, OpArg arg);
+	void SUBSS(X64Reg regOp, OpArg arg);
+	void SUBSD(X64Reg regOp, OpArg arg);
+	void MULSS(X64Reg regOp, OpArg arg);
+	void MULSD(X64Reg regOp, OpArg arg);
+	void DIVSS(X64Reg regOp, OpArg arg);
+	void DIVSD(X64Reg regOp, OpArg arg);
+	void MINSS(X64Reg regOp, OpArg arg);
+	void MINSD(X64Reg regOp, OpArg arg);
+	void MAXSS(X64Reg regOp, OpArg arg);
+	void MAXSD(X64Reg regOp, OpArg arg);
+	void SQRTSS(X64Reg regOp, OpArg arg);
+	void SQRTSD(X64Reg regOp, OpArg arg);
 	void RSQRTSS(X64Reg regOp, OpArg arg);
 
 	// SSE/SSE2: Floating point bitwise (yes)
-	void CMPSS(X64Reg regOp, OpArg arg, u8 compare);  
-	void CMPSD(X64Reg regOp, OpArg arg, u8 compare);  
+	void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
+	void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
+	void ANDSS(X64Reg regOp, OpArg arg);
+	void ANDSD(X64Reg regOp, OpArg arg);
+	void ANDNSS(X64Reg regOp, OpArg arg);
+	void ANDNSD(X64Reg regOp, OpArg arg);
+	void ORSS(X64Reg regOp, OpArg arg);
+	void ORSD(X64Reg regOp, OpArg arg);
+	void XORSS(X64Reg regOp, OpArg arg);
+	void XORSD(X64Reg regOp, OpArg arg);
 
 	inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); }
 	inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); }
@@ -487,24 +507,12 @@ public:
 	inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); }
 	inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); }
 
-
-	// I don't think these exist
-	/*
-	void ANDSD(X64Reg regOp, OpArg arg);  
-	void ANDNSS(X64Reg regOp, OpArg arg); 
-	void ANDNSD(X64Reg regOp, OpArg arg); 
-	void ORSS(X64Reg regOp, OpArg arg);   
-	void ORSD(X64Reg regOp, OpArg arg);   
-	void XORSS(X64Reg regOp, OpArg arg);   
-	void XORSD(X64Reg regOp, OpArg arg);   
-	*/
-
 	// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
-	void ADDPS(X64Reg regOp, OpArg arg); 
-	void ADDPD(X64Reg regOp, OpArg arg); 
-	void SUBPS(X64Reg regOp, OpArg arg); 
-	void SUBPD(X64Reg regOp, OpArg arg); 
-	void CMPPS(X64Reg regOp, OpArg arg, u8 compare);  
+	void ADDPS(X64Reg regOp, OpArg arg);
+	void ADDPD(X64Reg regOp, OpArg arg);
+	void SUBPS(X64Reg regOp, OpArg arg);
+	void SUBPD(X64Reg regOp, OpArg arg);
+	void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
 	void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
 	void MULPS(X64Reg regOp, OpArg arg);
 	void MULPD(X64Reg regOp, OpArg arg);
@@ -519,8 +527,8 @@ public:
 	void RSQRTPS(X64Reg regOp, OpArg arg);
 
 	// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
-	void ANDPS(X64Reg regOp, OpArg arg); 
-	void ANDPD(X64Reg regOp, OpArg arg); 
+	void ANDPS(X64Reg regOp, OpArg arg);
+	void ANDPD(X64Reg regOp, OpArg arg);
 	void ANDNPS(X64Reg regOp, OpArg arg);
 	void ANDNPD(X64Reg regOp, OpArg arg);
 	void ORPS(X64Reg regOp, OpArg arg);
@@ -529,9 +537,9 @@ public:
 	void XORPD(X64Reg regOp, OpArg arg);
 
 	// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
-	void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);  
-	void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);  
-	
+	void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
+	void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
+
 	// SSE/SSE2: Useful alternative to shuffle in some cases.
 	void MOVDDUP(X64Reg regOp, OpArg arg);
 
@@ -549,18 +557,17 @@ public:
 	void UCOMISS(X64Reg regOp, OpArg arg);
 	void UCOMISD(X64Reg regOp, OpArg arg);
 
-	// SSE/SSE2: Moves. Use the right data type for your data to avoid slight penalties on some CPUs.
-
-	// Singles
+	// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
 	void MOVAPS(X64Reg regOp, OpArg arg);
-	void MOVAPS(OpArg arg, X64Reg regOp);
-	void MOVUPS(X64Reg regOp, OpArg arg);
-	void MOVUPS(OpArg arg, X64Reg regOp);
-	// Doubles
 	void MOVAPD(X64Reg regOp, OpArg arg);
+	void MOVAPS(OpArg arg, X64Reg regOp);
 	void MOVAPD(OpArg arg, X64Reg regOp);
+
+	void MOVUPS(X64Reg regOp, OpArg arg);
 	void MOVUPD(X64Reg regOp, OpArg arg);
+	void MOVUPS(OpArg arg, X64Reg regOp);
 	void MOVUPD(OpArg arg, X64Reg regOp);
+
 	// Integers (NOTE: untested - I added these then it turned out I didn't have a use for them after all).
 	void MOVDQA(X64Reg regOp, OpArg arg);
 	void MOVDQA(OpArg arg, X64Reg regOp);
@@ -596,11 +603,11 @@ public:
 	void CVTDQ2PS(X64Reg regOp, OpArg arg);
 	void CVTPS2DQ(X64Reg regOp, OpArg arg);
 
+	void CVTTSS2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
+	void CVTTPS2DQ(X64Reg regOp, OpArg arg);
 	void CVTSI2SS(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
 	void CVTSS2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
-	void CVTTSS2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
 	void CVTTSD2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
-	void CVTTPS2DQ(X64Reg regOp, OpArg arg);
 	void CVTTPD2DQ(X64Reg xregdest, OpArg arg);
 
 	// SSE2: Packed integer instructions
@@ -621,57 +628,57 @@ public:
 	void PMOVZXWD(X64Reg dest, const OpArg &arg);
 
 	void PAND(X64Reg dest, OpArg arg);
-	void PANDN(X64Reg dest, OpArg arg);   
-	void PXOR(X64Reg dest, OpArg arg);    
-	void POR(X64Reg dest, OpArg arg);     
+	void PANDN(X64Reg dest, OpArg arg);
+	void PXOR(X64Reg dest, OpArg arg);
+	void POR(X64Reg dest, OpArg arg);
 
 	void PADDB(X64Reg dest, OpArg arg);
-	void PADDW(X64Reg dest, OpArg arg);   
-	void PADDD(X64Reg dest, OpArg arg);   
-	void PADDQ(X64Reg dest, OpArg arg);   
+	void PADDW(X64Reg dest, OpArg arg);
+	void PADDD(X64Reg dest, OpArg arg);
+	void PADDQ(X64Reg dest, OpArg arg);
 
-	void PADDSB(X64Reg dest, OpArg arg);  
-	void PADDSW(X64Reg dest, OpArg arg);  
-	void PADDUSB(X64Reg dest, OpArg arg); 
-	void PADDUSW(X64Reg dest, OpArg arg); 
+	void PADDSB(X64Reg dest, OpArg arg);
+	void PADDSW(X64Reg dest, OpArg arg);
+	void PADDUSB(X64Reg dest, OpArg arg);
+	void PADDUSW(X64Reg dest, OpArg arg);
 
-	void PSUBB(X64Reg dest, OpArg arg);   
-	void PSUBW(X64Reg dest, OpArg arg);   
-	void PSUBD(X64Reg dest, OpArg arg);   
-	void PSUBQ(X64Reg dest, OpArg arg);   
+	void PSUBB(X64Reg dest, OpArg arg);
+	void PSUBW(X64Reg dest, OpArg arg);
+	void PSUBD(X64Reg dest, OpArg arg);
+	void PSUBQ(X64Reg dest, OpArg arg);
 
-	void PSUBSB(X64Reg dest, OpArg arg);  
-	void PSUBSW(X64Reg dest, OpArg arg);  
-	void PSUBUSB(X64Reg dest, OpArg arg); 
-	void PSUBUSW(X64Reg dest, OpArg arg); 
+	void PSUBSB(X64Reg dest, OpArg arg);
+	void PSUBSW(X64Reg dest, OpArg arg);
+	void PSUBUSB(X64Reg dest, OpArg arg);
+	void PSUBUSW(X64Reg dest, OpArg arg);
 
-	void PAVGB(X64Reg dest, OpArg arg);   
-	void PAVGW(X64Reg dest, OpArg arg);   
+	void PAVGB(X64Reg dest, OpArg arg);
+	void PAVGW(X64Reg dest, OpArg arg);
 
-	void PCMPEQB(X64Reg dest, OpArg arg); 
-	void PCMPEQW(X64Reg dest, OpArg arg); 
-	void PCMPEQD(X64Reg dest, OpArg arg); 
+	void PCMPEQB(X64Reg dest, OpArg arg);
+	void PCMPEQW(X64Reg dest, OpArg arg);
+	void PCMPEQD(X64Reg dest, OpArg arg);
 
-	void PCMPGTB(X64Reg dest, OpArg arg); 
-	void PCMPGTW(X64Reg dest, OpArg arg); 
-	void PCMPGTD(X64Reg dest, OpArg arg); 
+	void PCMPGTB(X64Reg dest, OpArg arg);
+	void PCMPGTW(X64Reg dest, OpArg arg);
+	void PCMPGTD(X64Reg dest, OpArg arg);
 
 	void PEXTRW(X64Reg dest, OpArg arg, u8 subreg);
 	void PINSRW(X64Reg dest, OpArg arg, u8 subreg);
 
-	void PMADDWD(X64Reg dest, OpArg arg); 
-	void PSADBW(X64Reg dest, OpArg arg);  
+	void PMADDWD(X64Reg dest, OpArg arg);
+	void PSADBW(X64Reg dest, OpArg arg);
 
-	void PMAXSW(X64Reg dest, OpArg arg);  
-	void PMAXUB(X64Reg dest, OpArg arg);  
-	void PMINSW(X64Reg dest, OpArg arg);  
-	void PMINUB(X64Reg dest, OpArg arg);  
+	void PMAXSW(X64Reg dest, OpArg arg);
+	void PMAXUB(X64Reg dest, OpArg arg);
+	void PMINSW(X64Reg dest, OpArg arg);
+	void PMINUB(X64Reg dest, OpArg arg);
 	// SSE4 has PMAXSB and PMINSB and PMAXUW and PMINUW too if we need them.
-	
+
 	void PMOVMSKB(X64Reg dest, OpArg arg);
+	void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
 	void PSHUFB(X64Reg dest, OpArg arg);
 
-	void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
 	void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
 
 	void PSRLW(X64Reg reg, int shift);
@@ -688,13 +695,19 @@ public:
 	void PSRAW(X64Reg reg, int shift);
 	void PSRAD(X64Reg reg, int shift);
 
+	// AVX
+	void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
 	void RTDSC();
 
 	// Utility functions
 	// The difference between this and CALL is that this aligns the stack
 	// where appropriate.
 	void ABI_CallFunction(const void *func);
-
 	template <typename T>
 	void ABI_CallFunction(T (*func)()) {
 		ABI_CallFunction((const void *)func);
@@ -703,10 +716,9 @@ public:
 	void ABI_CallFunction(const u8 *func) {
 		ABI_CallFunction((const void *)func);
 	}
-
 	void ABI_CallFunctionC16(const void *func, u16 param1);
 	void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2);
-	
+
 	// These only support u32 parameters, but that's enough for a lot of uses.
 	// These will destroy the 1 or 2 first "parameter regs".
 	void ABI_CallFunctionC(const void *func, u32 param1);
@@ -783,8 +795,7 @@ public:
 	// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
 	void FreeCodeSpace();
 
-	bool IsInSpace(const u8 *ptr) const
-	{
+	bool IsInSpace(const u8 *ptr) const {
 		return ptr >= region && ptr < region + region_size;
 	}
 
@@ -792,13 +803,11 @@ public:
 	// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
 	void WriteProtect();
 
-	void ResetCodePtr()
-	{
+	void ResetCodePtr() {
 		SetCodePtr(region);
 	}
 
-	size_t GetSpaceLeft() const
-	{
+	size_t GetSpaceLeft() const {
 		return region_size - (GetCodePtr() - region);
 	}
 

From 3b1476c8ecb59982b0349cd85e75a0a2777cd777 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 9 Oct 2014 21:38:25 +0200
Subject: [PATCH 2/3] MIPSTables: Annotate fp and hi/lo in/out more accurately
 than just "other"

Some typo fixes
---
 Core/Debugger/DisassemblyManager.cpp |   4 +-
 Core/MIPS/MIPS.h                     |  10 +--
 Core/MIPS/MIPSAnalyst.cpp            |   8 +-
 Core/MIPS/MIPSAnalyst.h              |   2 +-
 Core/MIPS/MIPSInt.cpp                |  36 +++------
 Core/MIPS/MIPSTables.cpp             | 106 +++++++++++++--------------
 Core/MIPS/MIPSTables.h               |  89 ++++++++++++----------
 Windows/Debugger/CtrlDisAsmView.cpp  |   2 +-
 8 files changed, 124 insertions(+), 133 deletions(-)

diff --git a/Core/Debugger/DisassemblyManager.cpp b/Core/Debugger/DisassemblyManager.cpp
index 874207b2f6..fa0203878d 100644
--- a/Core/Debugger/DisassemblyManager.cpp
+++ b/Core/Debugger/DisassemblyManager.cpp
@@ -772,7 +772,7 @@ bool DisassemblyMacro::disassemble(u32 address, DisassemblyLineInfo& dest, bool
 		dest.params = buffer;
 		
 		dest.info.hasRelevantAddress = true;
-		dest.info.releventAddress = immediate;
+		dest.info.relevantAddress = immediate;
 		break;
 	case MACRO_MEMORYIMM:
 		dest.name = name;
@@ -792,7 +792,7 @@ bool DisassemblyMacro::disassemble(u32 address, DisassemblyLineInfo& dest, bool
 		dest.info.dataSize = dataSize;
 
 		dest.info.hasRelevantAddress = true;
-		dest.info.releventAddress = immediate;
+		dest.info.relevantAddress = immediate;
 		break;
 	default:
 		return false;
diff --git a/Core/MIPS/MIPS.h b/Core/MIPS/MIPS.h
index 4a69dcd345..61ccc639ff 100644
--- a/Core/MIPS/MIPS.h
+++ b/Core/MIPS/MIPS.h
@@ -26,8 +26,7 @@ class PointerWrap;
 
 typedef Memory::Opcode MIPSOpcode;
 
-enum MIPSGPReg
-{
+enum MIPSGPReg {
 	MIPS_REG_ZERO=0,
 	MIPS_REG_COMPILER_SCRATCH=1,
 
@@ -65,17 +64,16 @@ enum MIPSGPReg
 	MIPS_REG_FP=30,
 	MIPS_REG_RA=31,
 
-	MIPS_REG_INVALID=-1,
-
 	// Not real regs, just for convenience/jit mapping.
 	MIPS_REG_HI = 32,
 	MIPS_REG_LO = 33,
 	MIPS_REG_FPCOND = 34,
 	MIPS_REG_VFPUCC = 35,
+
+	MIPS_REG_INVALID=-1,
 };
 
-enum
-{
+enum {
 	VFPU_CTRL_SPREFIX,
 	VFPU_CTRL_TPREFIX,
 	VFPU_CTRL_DPREFIX,
diff --git a/Core/MIPS/MIPSAnalyst.cpp b/Core/MIPS/MIPSAnalyst.cpp
index 626931a650..9dc3ea4cee 100644
--- a/Core/MIPS/MIPSAnalyst.cpp
+++ b/Core/MIPS/MIPSAnalyst.cpp
@@ -1204,19 +1204,19 @@ skip:
 			case 0x20:	// add
 			case 0x21:	// addu
 				info.hasRelevantAddress = true;
-				info.releventAddress = cpu->GetRegValue(0,MIPS_GET_RS(op))+cpu->GetRegValue(0,MIPS_GET_RT(op));
+				info.relevantAddress = cpu->GetRegValue(0,MIPS_GET_RS(op))+cpu->GetRegValue(0,MIPS_GET_RT(op));
 				break;
 			case 0x22:	// sub
 			case 0x23:	// subu
 				info.hasRelevantAddress = true;
-				info.releventAddress = cpu->GetRegValue(0,MIPS_GET_RS(op))-cpu->GetRegValue(0,MIPS_GET_RT(op));
+				info.relevantAddress = cpu->GetRegValue(0,MIPS_GET_RS(op))-cpu->GetRegValue(0,MIPS_GET_RT(op));
 				break;
 			}
 			break;
 		case 0x08:	// addi
 		case 0x09:	// adiu
 			info.hasRelevantAddress = true;
-			info.releventAddress = cpu->GetRegValue(0,MIPS_GET_RS(op))+((s16)(op & 0xFFFF));
+			info.relevantAddress = cpu->GetRegValue(0,MIPS_GET_RS(op))+((s16)(op & 0xFFFF));
 			break;
 		}
 
@@ -1323,7 +1323,7 @@ skip:
 			info.dataAddress = rs + imm16;
 
 			info.hasRelevantAddress = true;
-			info.releventAddress = info.dataAddress;
+			info.relevantAddress = info.dataAddress;
 		}
 
 		return info;
diff --git a/Core/MIPS/MIPSAnalyst.h b/Core/MIPS/MIPSAnalyst.h
index f296e88a66..4eaefbb91b 100644
--- a/Core/MIPS/MIPSAnalyst.h
+++ b/Core/MIPS/MIPSAnalyst.h
@@ -154,7 +154,7 @@ namespace MIPSAnalyst
 		u32 dataAddress;
 
 		bool hasRelevantAddress;
-		u32 releventAddress;
+		u32 relevantAddress;
 	} MipsOpcodeInfo;
 
 	MipsOpcodeInfo GetOpcodeInfo(DebugInterface* cpu, u32 address);
diff --git a/Core/MIPS/MIPSInt.cpp b/Core/MIPS/MIPSInt.cpp
index 330f29a054..792f85a0ae 100644
--- a/Core/MIPS/MIPSInt.cpp
+++ b/Core/MIPS/MIPSInt.cpp
@@ -74,29 +74,13 @@ int MIPS_SingleStep()
 #else
 	MIPSOpcode op = Memory::Read_Opcode_JIT(mipsr4k.pc);
 #endif
-	/*
-	// Choke on VFPU
-	MIPSInfo info = MIPSGetInfo(op);
-	if (info & IS_VFPU)
-	{
-		if (!Core_IsStepping() && !GetAsyncKeyState(VK_LSHIFT))
-		{
-			Core_EnableStepping(true);
-			return;
-		}
-	}*/
-
-	if (mipsr4k.inDelaySlot)
-	{
+	if (mipsr4k.inDelaySlot) {
 		MIPSInterpret(op);
-		if (mipsr4k.inDelaySlot)
-		{
+		if (mipsr4k.inDelaySlot) {
 			mipsr4k.pc = mipsr4k.nextPC;
 			mipsr4k.inDelaySlot = false;
 		}
-	}
-	else
-	{
+	} else {
 		MIPSInterpret(op);
 	}
 	return 1;
@@ -872,14 +856,12 @@ namespace MIPSInt
 		int pos = _POS;
 
 		// Don't change $zr.
-		if (rt == 0)
-		{
+		if (rt == 0) {
 			PC += 4;
 			return;
 		}
 
-		switch (op & 0x3f)
-		{
+		switch (op & 0x3f) {
 		case 0x0: //ext
 			{
 				int size = _SIZE + 1;
@@ -1025,10 +1007,10 @@ namespace MIPSInt
 
 		switch (op & 0x3f)
 		{
-		case 0: F(fd) = F(fs) + F(ft); break; //add
-		case 1: F(fd) = F(fs) - F(ft); break; //sub
-		case 2: F(fd) = F(fs) * F(ft); break; //mul
-		case 3: F(fd) = F(fs) / F(ft); break; //div
+		case 0: F(fd) = F(fs) + F(ft); break; // add.s
+		case 1: F(fd) = F(fs) - F(ft); break; // sub.s
+		case 2: F(fd) = F(fs) * F(ft); break; // mul.s
+		case 3: F(fd) = F(fs) / F(ft); break; // div.s
 		default:
 			_dbg_assert_msg_(CPU,0,"Trying to interpret FPU3Op instruction that can't be interpreted");
 			break;
diff --git a/Core/MIPS/MIPSTables.cpp b/Core/MIPS/MIPSTables.cpp
index fe5e269904..ff03a09fbc 100644
--- a/Core/MIPS/MIPSTables.cpp
+++ b/Core/MIPS/MIPSTables.cpp
@@ -31,8 +31,7 @@
 
 #include "JitCommon/JitCommon.h"
 
-enum MipsEncoding
-{
+enum MipsEncoding {
 	Imme,
 	Spec,
 	Spe2,
@@ -66,8 +65,7 @@ enum MipsEncoding
 	Inval = -2,
 };
 
-struct MIPSInstruction
-{
+struct MIPSInstruction {
 	MipsEncoding altEncoding;
 	const char *name;
 	MIPSComp::MIPSCompileFunc compile;
@@ -152,7 +150,7 @@ const MIPSInstruction tableImmediate[64] = // xxxxxx ..... ..... ...............
 	INVALID,
 	INVALID,
 	INSTR("swr", &Jit::Comp_ITypeMem, Dis_ITypeMem, Int_ITypeMem, IN_IMM16|IN_RS_ADDR|IN_RT|OUT_MEM|MEMTYPE_WORD),
-	INSTR("cache", &Jit::Comp_Cache, Dis_Cache, Int_Cache, IN_MEM|IN_IMM16|IN_RS_ADDR|IN_OTHER|OUT_OTHER),
+	INSTR("cache", &Jit::Comp_Cache, Dis_Cache, Int_Cache, IN_MEM|IN_IMM16|IN_RS_ADDR),
 	//48
 	INSTR("ll", &Jit::Comp_Generic, Dis_Generic, Int_StoreSync, IN_MEM|IN_IMM16|IN_RS_ADDR|OUT_RT|OUT_OTHER|MEMTYPE_WORD),
 	INSTR("lwc1", &Jit::Comp_FPULS, Dis_FPULS, Int_FPULS, IN_MEM|IN_IMM16|IN_RS_ADDR|OUT_OTHER|MEMTYPE_FLOAT),
@@ -198,22 +196,22 @@ const MIPSInstruction tableSpecial[64] = // 000000 ..... ..... ..... ..... xxxxx
 	INSTR("sync",  &Jit::Comp_DoNothing, Dis_Generic, Int_Sync, 0),
 
 	//16
-	INSTR("mfhi",  &Jit::Comp_MulDivType, Dis_FromHiloTransfer, Int_MulDivType, OUT_RD|IN_OTHER),
-	INSTR("mthi",  &Jit::Comp_MulDivType, Dis_ToHiloTransfer,   Int_MulDivType, IN_RS|OUT_OTHER),
-	INSTR("mflo",  &Jit::Comp_MulDivType, Dis_FromHiloTransfer, Int_MulDivType, OUT_RD|IN_OTHER),
-	INSTR("mtlo",  &Jit::Comp_MulDivType, Dis_ToHiloTransfer,   Int_MulDivType, IN_RS|OUT_OTHER),
+	INSTR("mfhi",  &Jit::Comp_MulDivType, Dis_FromHiloTransfer, Int_MulDivType, OUT_RD|IN_HI),
+	INSTR("mthi",  &Jit::Comp_MulDivType, Dis_ToHiloTransfer,   Int_MulDivType, IN_RS|OUT_HI),
+	INSTR("mflo",  &Jit::Comp_MulDivType, Dis_FromHiloTransfer, Int_MulDivType, OUT_RD|IN_LO),
+	INSTR("mtlo",  &Jit::Comp_MulDivType, Dis_ToHiloTransfer,   Int_MulDivType, IN_RS|OUT_LO),
 	INVALID,
 	INVALID,
 	INSTR("clz",   &Jit::Comp_RType2, Dis_RType2, Int_RType2, OUT_RD|IN_RS),
 	INSTR("clo",   &Jit::Comp_RType2, Dis_RType2, Int_RType2, OUT_RD|IN_RS),
 
 	//24
-	INSTR("mult",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_OTHER),
-	INSTR("multu", &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_OTHER),
-	INSTR("div",   &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_OTHER),
-	INSTR("divu",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_OTHER),
-	INSTR("madd",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_OTHER|OUT_OTHER),
-	INSTR("maddu", &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_OTHER|OUT_OTHER),
+	INSTR("mult",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_HI|OUT_LO),
+	INSTR("multu", &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_HI|OUT_LO),
+	INSTR("div",   &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_HI|OUT_LO),
+	INSTR("divu",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|OUT_HI|OUT_LO),
+	INSTR("madd",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_HI|IN_LO|OUT_HI|OUT_LO),
+	INSTR("maddu", &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_HI|IN_LO|OUT_HI|OUT_LO),
 	INVALID,
 	INVALID,
 
@@ -234,8 +232,8 @@ const MIPSInstruction tableSpecial[64] = // 000000 ..... ..... ..... ..... xxxxx
 	INSTR("sltu", &Jit::Comp_RType3, Dis_RType3, Int_RType3, IN_RS|IN_RT|OUT_RD),
 	INSTR("max",  &Jit::Comp_RType3, Dis_RType3, Int_RType3, IN_RS|IN_RT|OUT_RD),
 	INSTR("min",  &Jit::Comp_RType3, Dis_RType3, Int_RType3, IN_RS|IN_RT|OUT_RD),
-	INSTR("msub",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_OTHER|OUT_OTHER),
-	INSTR("msubu", &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_OTHER|OUT_OTHER),
+	INSTR("msub",  &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_HI|IN_LO|OUT_HI|OUT_LO),
+	INSTR("msubu", &Jit::Comp_MulDivType, Dis_MulDivType, Int_MulDivType, IN_RS|IN_RT|IN_HI|IN_LO|OUT_HI|OUT_LO),
 
 	//48
 	INSTR("tge",  &Jit::Comp_Generic, Dis_RType3, 0, 0),
@@ -262,9 +260,9 @@ const MIPSInstruction tableSpecial2[64] = // 011100 ..... ..... ..... ..... xxxx
 	INVALID_X_8,
 	//32
 	INVALID, INVALID, INVALID, INVALID,
-	INSTR("mfic", &Jit::Comp_Generic, Dis_Generic, Int_Special2, 0),
+	INSTR("mfic", &Jit::Comp_Generic, Dis_Generic, Int_Special2, OUT_OTHER),
 	INVALID,
-	INSTR("mtic", &Jit::Comp_Generic, Dis_Generic, Int_Special2, 0),
+	INSTR("mtic", &Jit::Comp_Generic, Dis_Generic, Int_Special2, OUT_OTHER),
 	INVALID,
 	//40
 	INVALID_X_8,
@@ -369,11 +367,11 @@ const MIPSInstruction tableCop2BC2[4] = // 010010 01000 ...xx ................
 
 const MIPSInstruction tableCop0[32] = // 010000 xxxxx ..... ................
 {
-	INSTR("mfc0", &Jit::Comp_Generic, Dis_Generic, 0, OUT_RT),
+	INSTR("mfc0", &Jit::Comp_Generic, Dis_Generic, 0, OUT_RT),  // unused
 	INVALID,
 	INVALID,
 	INVALID,
-	INSTR("mtc0", &Jit::Comp_Generic, Dis_Generic, 0, IN_RT),
+	INSTR("mtc0", &Jit::Comp_Generic, Dis_Generic, 0, IN_RT),  // unused
 	INVALID,
 	INVALID,
 	INVALID,
@@ -423,11 +421,11 @@ const MIPSInstruction tableCop0CO[64] = // 010000 1.... ..... ..... ..... xxxxxx
 
 const MIPSInstruction tableCop1[32] = // 010001 xxxxx ..... ..... ...........
 {
-	INSTR("mfc1", &Jit::Comp_mxc1, Dis_mxc1, Int_mxc1, IN_OTHER|OUT_RT),
+	INSTR("mfc1", &Jit::Comp_mxc1, Dis_mxc1, Int_mxc1, IN_FS|OUT_RT),
 	INVALID,
 	INSTR("cfc1", &Jit::Comp_mxc1, Dis_mxc1, Int_mxc1, IN_OTHER|IN_FPUFLAG|OUT_RT),
 	INVALID,
-	INSTR("mtc1", &Jit::Comp_mxc1, Dis_mxc1, Int_mxc1, IN_RT|OUT_OTHER),
+	INSTR("mtc1", &Jit::Comp_mxc1, Dis_mxc1, Int_mxc1, IN_RT|OUT_FS),
 	INVALID,
 	INSTR("ctc1", &Jit::Comp_mxc1, Dis_mxc1, Int_mxc1, IN_RT|OUT_FPUFLAG|OUT_OTHER),
 	INVALID,
@@ -455,20 +453,20 @@ const MIPSInstruction tableCop1BC[32] = // 010001 01000 xxxxx ................
 
 const MIPSInstruction tableCop1S[64] = // 010001 10000 ..... ..... ..... xxxxxx
 {
-	INSTR("add.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, IN_OTHER|OUT_OTHER),
-	INSTR("sub.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, IN_OTHER|OUT_OTHER),
-	INSTR("mul.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, IN_OTHER|OUT_OTHER),
-	INSTR("div.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, IN_OTHER|OUT_OTHER),
-	INSTR("sqrt.s", &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
-	INSTR("abs.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
-	INSTR("mov.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
-	INSTR("neg.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
+	INSTR("add.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, OUT_FD|IN_FS|IN_FT),
+	INSTR("sub.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, OUT_FD|IN_FS|IN_FT),
+	INSTR("mul.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, OUT_FD|IN_FS|IN_FT),
+	INSTR("div.s",  &Jit::Comp_FPU3op, Dis_FPU3op, Int_FPU3op, OUT_FD|IN_FS|IN_FT),
+	INSTR("sqrt.s", &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
+	INSTR("abs.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
+	INSTR("mov.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
+	INSTR("neg.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
 	//8
 	INVALID, INVALID, INVALID, INVALID,
-	INSTR("round.w.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
-	INSTR("trunc.w.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
-	INSTR("ceil.w.s",   &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
-	INSTR("floor.w.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
+	INSTR("round.w.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
+	INSTR("trunc.w.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
+	INSTR("ceil.w.s",   &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
+	INSTR("floor.w.s",  &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
 	//16
 	INVALID_X_8,
 	//24
@@ -476,29 +474,29 @@ const MIPSInstruction tableCop1S[64] = // 010001 10000 ..... ..... ..... xxxxxx
 	//32
 	INVALID, INVALID, INVALID, INVALID,
 	//36
-	INSTR("cvt.w.s", &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
+	INSTR("cvt.w.s", &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
 	INVALID,
 	INSTR("dis.int", &Jit::Comp_Generic, Dis_Generic, Int_Interrupt, 0),
 	INVALID,
 	//40
 	INVALID_X_8,
 	//48 - 010001 10000 ..... ..... ..... 11xxxx
-	INSTR("c.f",   &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.un",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.eq",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ueq", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.olt", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ult", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ole", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ule", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.sf",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ngle",&Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.seq", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ngl", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.lt",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.nge", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.le",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
-	INSTR("c.ngt", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_OTHER|OUT_FPUFLAG),
+	INSTR("c.f",   &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, OUT_FPUFLAG),
+	INSTR("c.un",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.eq",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.ueq", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.olt", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.ult", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.ole", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.ule", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.sf",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, OUT_FPUFLAG),
+	INSTR("c.ngle",&Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.seq", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.ngl", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.lt",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.nge", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.le",  &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
+	INSTR("c.ngt", &Jit::Comp_FPUComp, Dis_FPUComp, Int_FPUComp, IN_FS|IN_FT|OUT_FPUFLAG),
 };
 
 const MIPSInstruction tableCop1W[64] = // 010001 10100 ..... ..... ..... xxxxxx
@@ -511,7 +509,7 @@ const MIPSInstruction tableCop1W[64] = // 010001 10100 ..... ..... ..... xxxxxx
 	//24
 	INVALID_X_8,
 	//32
-	INSTR("cvt.s.w", &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, IN_OTHER|OUT_OTHER),
+	INSTR("cvt.s.w", &Jit::Comp_FPU2op, Dis_FPU2op, Int_FPU2op, OUT_FD|IN_FS),
 	INVALID, INVALID, INVALID,
 	//36
 	INVALID,
@@ -890,8 +888,6 @@ const MIPSInstruction *mipsTables[NumEncodings] =
 	0,
 };
 
-
-
 //arm encoding table
 //const MIPSInstruction mipsinstructions[] = 
 //{
diff --git a/Core/MIPS/MIPSTables.h b/Core/MIPS/MIPSTables.h
index b05e04e199..1f70366eb2 100644
--- a/Core/MIPS/MIPSTables.h
+++ b/Core/MIPS/MIPSTables.h
@@ -25,14 +25,14 @@ struct MIPSInfo {
 		value = 0;
 	}
 
-	explicit MIPSInfo(u32 v) : value(v) {
+	explicit MIPSInfo(u64 v) : value(v) {
 	}
 
-	u32 operator & (const u32 &arg) const {
+	u64 operator & (const u64 &arg) const { // 64-bit mask: IN_FS and the other new flags sit above bit 31
 		return value & arg;
 	}
 
-	u32 value;
+	u64 value;
 };
 
 #define CONDTYPE_MASK   0x00000007
@@ -49,44 +49,59 @@ struct MIPSInfo {
 // as long as the other flags are checked,
 // there is no way to misinterpret these
 // as CONDTYPE_X
-#define MEMTYPE_MASK    0x00000007
-#define MEMTYPE_BYTE    0x00000001
-#define MEMTYPE_HWORD   0x00000002
-#define MEMTYPE_WORD    0x00000003
-#define MEMTYPE_FLOAT   0x00000004
-#define MEMTYPE_VQUAD   0x00000005
+#define MEMTYPE_MASK    0x00000007ULL
+#define MEMTYPE_BYTE    0x00000001ULL
+#define MEMTYPE_HWORD   0x00000002ULL
+#define MEMTYPE_WORD    0x00000003ULL
+#define MEMTYPE_FLOAT   0x00000004ULL
+#define MEMTYPE_VQUAD   0x00000005ULL
 
-#define IS_CONDMOVE     0x00000008
-#define DELAYSLOT       0x00000010
-#define BAD_INSTRUCTION 0x00000020
-#define LIKELY          0x00000040
-#define IS_CONDBRANCH   0x00000080
-#define IS_JUMP         0x00000100
+#define IS_CONDMOVE     0x00000008ULL
+#define DELAYSLOT       0x00000010ULL
+#define BAD_INSTRUCTION 0x00000020ULL
+#define LIKELY          0x00000040ULL
+#define IS_CONDBRANCH   0x00000080ULL
+#define IS_JUMP         0x00000100ULL
 
-#define IN_RS           0x00000200
-#define IN_RS_ADDR      (0x00000400 | IN_RS)
-#define IN_RS_SHIFT     (0x00000800 | IN_RS)
-#define IN_RT           0x00001000
-#define IN_SA           0x00002000
-#define IN_IMM16        0x00004000
-#define IN_IMM26        0x00008000
-#define IN_MEM          0x00010000
-#define IN_OTHER        0x00020000
-#define IN_FPUFLAG      0x00040000
-#define IN_VFPU_CC      0x00080000
+#define IN_RS           0x00000200ULL
+#define IN_RS_ADDR      (0x00000400ULL | IN_RS)
+#define IN_RS_SHIFT     (0x00000800ULL | IN_RS)
+#define IN_RT           0x00001000ULL
+#define IN_SA           0x00002000ULL
+#define IN_IMM16        0x00004000ULL
+#define IN_IMM26        0x00008000ULL
+#define IN_MEM          0x00010000ULL
+#define IN_OTHER        0x00020000ULL
+#define IN_FPUFLAG      0x00040000ULL
+#define IN_VFPU_CC      0x00080000ULL
 
-#define OUT_RT          0x00100000
-#define OUT_RD          0x00200000
-#define OUT_RA          0x00400000
-#define OUT_MEM         0x00800000
-#define OUT_OTHER       0x01000000
-#define OUT_FPUFLAG     0x02000000
-#define OUT_VFPU_CC     0x04000000
-#define OUT_EAT_PREFIX  0x08000000
+#define OUT_RT          0x00100000ULL
+#define OUT_RD          0x00200000ULL
+#define OUT_RA          0x00400000ULL
+#define OUT_MEM         0x00800000ULL
+#define OUT_OTHER       0x01000000ULL
+#define OUT_FPUFLAG     0x02000000ULL
+#define OUT_VFPU_CC     0x04000000ULL
+#define OUT_EAT_PREFIX  0x08000000ULL
 
-#define VFPU_NO_PREFIX  0x10000000
-#define IS_VFPU         0x20000000
-#define IS_FPU          0x40000000
+#define VFPU_NO_PREFIX  0x10000000ULL
+#define IS_VFPU         0x20000000ULL
+#define IS_FPU          0x40000000ULL
+
+#define IN_FS           0x000100000000ULL
+#define IN_FT           0x000200000000ULL
+#define IN_LO           0x000400000000ULL
+#define IN_HI           0x000800000000ULL
+
+#define OUT_FD          0x001000000000ULL
+#define OUT_FS          0x002000000000ULL
+#define OUT_LO          0x004000000000ULL
+#define OUT_HI          0x008000000000ULL
+
+#define IN_VS           0x010000000000ULL
+#define IN_VT           0x020000000000ULL
+
+#define OUT_VD          0x100000000000ULL
 
 #ifndef CDECL
 #define CDECL
diff --git a/Windows/Debugger/CtrlDisAsmView.cpp b/Windows/Debugger/CtrlDisAsmView.cpp
index ec90a340e4..3fe114c135 100644
--- a/Windows/Debugger/CtrlDisAsmView.cpp
+++ b/Windows/Debugger/CtrlDisAsmView.cpp
@@ -643,7 +643,7 @@ void CtrlDisAsmView::followBranch()
 		} else if (line.info.hasRelevantAddress)
 		{
 			// well, not  exactly a branch, but we can do something anyway
-			SendMessage(GetParent(wnd),WM_DEB_GOTOHEXEDIT,line.info.releventAddress,0);
+			SendMessage(GetParent(wnd),WM_DEB_GOTOHEXEDIT,line.info.relevantAddress,0);
 			SetFocus(wnd);
 		}
 	} else if (line.type == DISTYPE_DATA)

From 7bde97606919f8a012b5991645903e21d906daf1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 10 Oct 2014 20:41:00 +0200
Subject: [PATCH 3/3] Merge x64 emitter from a newer Dolphin version.

This one can generate slightly smaller code by exploiting some EAX-only
encoding and various other short forms, and adds support for many newer
CPU instructions.
---
 Common/CPUDetect.cpp  |   43 ++
 Common/CPUDetect.h    |    5 +
 Common/x64Emitter.cpp | 1086 ++++++++++++++++++++++++++---------------
 Common/x64Emitter.h   |  233 ++++++---
 4 files changed, 912 insertions(+), 455 deletions(-)

diff --git a/Common/CPUDetect.cpp b/Common/CPUDetect.cpp
index cb2e72eb23..188fbdaf28 100644
--- a/Common/CPUDetect.cpp
+++ b/Common/CPUDetect.cpp
@@ -49,6 +49,17 @@ void do_cpuid(u32 regs[4], u32 cpuid_leaf) {
 
 #ifdef _M_SSE
 #include <xmmintrin.h>
+
+#define _XCR_XFEATURE_ENABLED_MASK 0
+static unsigned long long _xgetbv(unsigned int index)
+{
+	unsigned int eax, edx;
+	__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+	return ((unsigned long long)edx << 32) | eax;
+}
+
+#else
+#define _XCR_XFEATURE_ENABLED_MASK 0
 #endif
 
 #if defined __FreeBSD__
@@ -172,6 +183,38 @@ void CPUInfo::Detect() {
 				bFMA = true;
 		}
 		if ((cpu_id[2] >> 25) & 1) bAES = true;
+
+		if ((cpu_id[3] >> 24) & 1)
+		{
+			// We can use FXSAVE.
+			bFXSR = true;
+		}
+
+		// AVX support requires 3 separate checks:
+		//  - Is the AVX bit set in CPUID?
+		//  - Is the XSAVE bit set in CPUID?
+		//  - Does XGETBV(XCR0) show bits 1 and 2 set, i.e. the OS saves XMM and YMM state?
+		if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1))
+		{
+			if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6)
+			{
+				bAVX = true;
+				if ((cpu_id[2] >> 12) & 1)
+					bFMA = true;
+			}
+		}
+
+		if (max_std_fn >= 7)
+		{
+			do_cpuid(cpu_id, 0x00000007);
+			// careful; we can't enable AVX2 unless the XSAVE/XGETBV checks above passed
+			if ((cpu_id[1] >> 5) & 1)
+				bAVX2 = bAVX;
+			if ((cpu_id[1] >> 3) & 1)
+				bBMI1 = true;
+			if ((cpu_id[1] >> 8) & 1)
+				bBMI2 = true;
+		}
 	}
 	if (max_ex_fn >= 0x80000004) {
 		// Extract brand string
diff --git a/Common/CPUDetect.h b/Common/CPUDetect.h
index 04c615b412..091e8f9713 100644
--- a/Common/CPUDetect.h
+++ b/Common/CPUDetect.h
@@ -56,10 +56,15 @@ struct CPUInfo {
 	bool bLZCNT;
 	bool bSSE4A;
 	bool bAVX;
+	bool bAVX2;
 	bool bFMA;
 	bool bAES;
 	bool bLAHFSAHF64;
 	bool bLongMode;
+	bool bBMI1;
+	bool bBMI2;
+	bool bMOVBE;
+	bool bFXSR;
 
 	// ARM specific CPUInfo
 	bool bSwp;
diff --git a/Common/x64Emitter.cpp b/Common/x64Emitter.cpp
index f454296470..c4455067e4 100644
--- a/Common/x64Emitter.cpp
+++ b/Common/x64Emitter.cpp
@@ -34,27 +34,28 @@ namespace Gen
 // TODO(ector): Add EAX special casing, for ever so slightly smaller code.
 struct NormalOpDef
 {
-	u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, ext;
+	u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
 };
 
-static const NormalOpDef nops[11] =
+// 0xCC is code for invalid combination of immediates
+static const NormalOpDef normalops[11] =
 {
-	{0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0}, //ADD
-	{0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 2}, //ADC
+	{0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, //ADD
+	{0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, //ADC
 
-	{0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 5}, //SUB
-	{0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 3}, //SBB
+	{0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, //SUB
+	{0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, //SBB
 
-	{0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 4}, //AND
-	{0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 1}, //OR
+	{0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, //AND
+	{0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, //OR
 
-	{0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 6}, //XOR
-	{0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0}, //MOV
+	{0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, //XOR
+	{0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, //MOV
 
-	{0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0}, //TEST (to == from)
-	{0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 7}, //CMP
+	{0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, //TEST (to == from)
+	{0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, //CMP
 
-	{0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 7}, //XCHG
+	{0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, //XCHG
 };
 
 enum NormalSSEOps
@@ -76,10 +77,16 @@ enum NormalSSEOps
 	sseRSQRT       = 0x52, //RSQRT (NO DOUBLE PRECISION!!!)
 	sseMOVAPfromRM = 0x28, //MOVAP from RM
 	sseMOVAPtoRM   = 0x29, //MOVAP to RM
-	sseMOVUPfromRM = 0x10, //MOVUP from RM	
+	sseMOVUPfromRM = 0x10, //MOVUP from RM
+	sseMOVUPtoRM   = 0x11, //MOVUP to RM
+	sseMOVLPDfromRM= 0x12,
+	sseMOVLPDtoRM  = 0x13,
+	sseMOVHPDfromRM= 0x16,
+	sseMOVHPDtoRM  = 0x17,
+	sseMOVHLPS     = 0x12,
+	sseMOVLHPS     = 0x16,
 	sseMOVDQfromRM = 0x6F,
 	sseMOVDQtoRM   = 0x7F,
-	sseMOVUPtoRM   = 0x11, //MOVUP to RM
 	sseMASKMOVDQU  = 0xF7,
 	sseLDDQU       = 0xF0,
 	sseSHUF        = 0xC6,
@@ -133,6 +140,14 @@ const u8 *XEmitter::AlignCodePage()
 	return code;
 }
 
+// This operation modifies flags; check whether the flags are locked.
+// If the flags are locked, we should immediately and loudly fail before
+// causing a subtle JIT bug.
+void XEmitter::CheckFlags()
+{
+	_assert_msg_(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!");
+}
+
 void XEmitter::WriteModRM(int mod, int reg, int rm)
 {
 	Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7)));
@@ -148,51 +163,42 @@ void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const
 	if (customOp == -1)       customOp = operandReg;
 #ifdef _M_X64
 	u8 op = 0x40;
+	// REX.W (whether operation is a 64-bit operation)
 	if (opBits == 64)         op |= 8;
+	// REX.R (whether ModR/M reg field refers to R8-R15)
 	if (customOp & 8)         op |= 4;
+	// REX.X (whether ModR/M SIB index field refers to R8-R15)
 	if (indexReg & 8)         op |= 2;
-	if (offsetOrBaseReg & 8)  op |= 1; //TODO investigate if this is dangerous
+	// REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15)
+	if (offsetOrBaseReg & 8)  op |= 1;
+	// Write REX if we have REX bits to write, or if the operation accesses
+	// SIL, DIL, BPL, or SPL.
 	if (op != 0x40 ||
-		(bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
-		(opBits == 8 && (customOp & 0x10c) == 4)) {
+	    (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
+	    (opBits == 8 && (customOp & 0x10c) == 4))
+	{
 		emit->Write8(op);
-		_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 0x100) == 0 || bits != 8);
-		_dbg_assert_(DYNA_REC, (customOp & 0x100) == 0 || opBits != 8);
-	} else {
-		_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 0x10c) == 0 ||
-				(offsetOrBaseReg & 0x10c) == 0x104 ||
-				bits != 8);
-		_dbg_assert_(DYNA_REC, (customOp & 0x10c) == 0 ||
-				(customOp & 0x10c) == 0x104 ||
-				opBits != 8);
+		// Check the operation doesn't access AH, BH, CH, or DH.
+		_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 0x100) == 0);
+		_dbg_assert_(DYNA_REC, (customOp & 0x100) == 0);
 	}
-
 #else
 	_dbg_assert_(DYNA_REC, opBits != 64);
 	_dbg_assert_(DYNA_REC, (customOp & 8) == 0 || customOp == -1);
 	_dbg_assert_(DYNA_REC, (indexReg & 8) == 0);
 	_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 8) == 0);
 	_dbg_assert_(DYNA_REC, opBits != 8 || (customOp & 0x10c) != 4 || customOp == -1);
-	_dbg_assert_(DYNA_REC, bits != 8 || (offsetOrBaseReg & 0x10c) != 4);
+	_dbg_assert_(DYNA_REC, scale == SCALE_ATREG || bits != 8 || (offsetOrBaseReg & 0x10c) != 4);
 #endif
 }
 
-void OpArg::WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, Gen::X64Reg regOp2) const
+void OpArg::WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W) const
 {
 	int R = !(regOp1 & 8);
 	int X = !(indexReg & 8);
 	int B = !(offsetOrBaseReg & 8);
 
-	// not so sure about this one...
-	int W = 0;
-
-	// aka map_select in AMD manuals
-	// only support VEX opcode map 1 for now (analog to secondary opcode map)
-	int mmmmm = 1;
-
 	int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
-	int L = size == 256;
-	int pp = (packed << 1) | (size == 64);
 
 	// do we need any VEX fields that only appear in the three-byte form?
 	if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
@@ -214,7 +220,7 @@ void OpArg::WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, G
 void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	bool warn_64bit_offset) const
 {
-	if (_operandReg == 0xff)
+	if (_operandReg == INVALID_REG)
 		_operandReg = (X64Reg)this->operandReg;
 	int mod = 0;
 	int ireg = indexReg;
@@ -225,16 +231,17 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	{
 		// Oh, RIP addressing.
 		_offsetOrBaseReg = 5;
-		emit->WriteModRM(0, _operandReg&7, 5);
+		emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
 		//TODO : add some checks
 #ifdef _M_X64
 		u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
 		s64 distance = (s64)offset - (s64)ripAddr;
-		_assert_msg_(DYNA_REC, (distance < 0x80000000LL
-					&& distance >=  -0x80000000LL) ||
-			     !warn_64bit_offset,
-			     "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")",
-			     ripAddr, offset);
+		_assert_msg_(DYNA_REC,
+		             (distance < 0x80000000LL &&
+		              distance >=  -0x80000000LL) ||
+		             !warn_64bit_offset,
+		             "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")",
+		             ripAddr, offset);
 		s32 offs = (s32)distance;
 		emit->Write32((u32)offs);
 #else
@@ -349,7 +356,6 @@ void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
 	}
 }
 
-
 // W = operand extended width (1 if 64-bit)
 // R = register# upper bit
 // X = scale amnt upper bit
@@ -381,9 +387,9 @@ void XEmitter::JMP(const u8 *addr, bool force5Bytes)
 	{
 		s64 distance = (s64)(fn - ((u64)code + 5));
 
-		_assert_msg_(DYNA_REC, distance >= -0x80000000LL
-			     && distance < 0x80000000LL,
-			     "Jump target too far away, needs indirect register");
+		_assert_msg_(DYNA_REC,
+		             distance >= -0x80000000LL && distance < 0x80000000LL,
+		             "Jump target too far away, needs indirect register");
 		Write8(0xE9);
 		Write32((u32)(s32)distance);
 	}
@@ -419,9 +425,10 @@ void XEmitter::CALLptr(OpArg arg)
 void XEmitter::CALL(const void *fnptr)
 {
 	u64 distance = u64(fnptr) - (u64(code) + 5);
-	_assert_msg_(DYNA_REC, distance < 0x0000000080000000ULL
-		     || distance >=  0xFFFFFFFF80000000ULL,
-		     "CALL out of range (%p calls %p)", code, fnptr);
+	_assert_msg_(DYNA_REC,
+	             distance < 0x0000000080000000ULL ||
+	             distance >=  0xFFFFFFFF80000000ULL,
+	             "CALL out of range (%p calls %p)", code, fnptr);
 	Write8(0xE8);
 	Write32(u32(distance));
 }
@@ -465,27 +472,25 @@ FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes)
 	return branch;
 }
 
-void XEmitter::J_CC(CCFlags conditionCode, const u8 * addr, bool force5Bytes)
+void XEmitter::J_CC(CCFlags conditionCode, const u8* addr, bool force5bytes)
 {
 	u64 fn = (u64)addr;
-	if (!force5Bytes)
+	s64 distance = (s64)(fn - ((u64)code + 2));
+	if (distance < -0x80 || distance >= 0x80 || force5bytes)
 	{
-		s64 distance = (s64)(fn - ((u64)code + 2));
-		_assert_msg_(DYNA_REC, distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
-		//8 bits will do
-		Write8(0x70 + conditionCode);
-		Write8((u8)(s8)distance);
-	}
-	else
-	{
-		s64 distance = (s64)(fn - ((u64)code + 6));
-		_assert_msg_(DYNA_REC, distance >= -0x80000000LL
-			     && distance < 0x80000000LL,
-			     "Jump target too far away, needs indirect register");
+		distance = (s64)(fn - ((u64)code + 6));
+		_assert_msg_(DYNA_REC,
+		             distance >= -0x80000000LL && distance < 0x80000000LL,
+		             "Jump target too far away, needs indirect register");
 		Write8(0x0F);
 		Write8(0x80 + conditionCode);
 		Write32((u32)(s32)distance);
 	}
+	else
+	{
+		Write8(0x70 + conditionCode);
+		Write8((u8)(s8)distance);
+	}
 }
 
 void XEmitter::SetJumpTarget(const FixupBranch &branch)
@@ -534,30 +539,71 @@ void XEmitter::INT3() {Write8(0xCC);}
 void XEmitter::RET()  {Write8(0xC3);}
 void XEmitter::RET_FAST()  {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret
 
-void XEmitter::NOP(int count)
+// The first sign of decadence: optimized NOPs.
+void XEmitter::NOP(size_t size)
 {
-	// TODO: look up the fastest nop sleds for various sizes
-	int i;
-	switch (count) {
-	case 1:
-		Write8(0x90);
-		break;
-	case 2:
-		Write8(0x66);
-		Write8(0x90);
-		break;
-	default:
-		for (i = 0; i < count; i++) {
+	_dbg_assert_(DYNA_REC, (int)size > 0);
+	while (true)
+	{
+		switch (size)
+		{
+		case 0:
+			return;
+		case 1:
 			Write8(0x90);
+			return;
+		case 2:
+			Write8(0x66); Write8(0x90);
+			return;
+		case 3:
+			Write8(0x0F); Write8(0x1F); Write8(0x00);
+			return;
+		case 4:
+			Write8(0x0F); Write8(0x1F); Write8(0x40); Write8(0x00);
+			return;
+		case 5:
+			Write8(0x0F); Write8(0x1F); Write8(0x44); Write8(0x00);
+			Write8(0x00);
+			return;
+		case 6:
+			Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x44);
+			Write8(0x00); Write8(0x00);
+			return;
+		case 7:
+			Write8(0x0F); Write8(0x1F); Write8(0x80); Write8(0x00);
+			Write8(0x00); Write8(0x00); Write8(0x00);
+			return;
+		case 8:
+			Write8(0x0F); Write8(0x1F); Write8(0x84); Write8(0x00);
+			Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00);
+			return;
+		case 9:
+			Write8(0x66); Write8(0x0F); Write8(0x1F); Write8(0x84);
+			Write8(0x00); Write8(0x00); Write8(0x00); Write8(0x00);
+			Write8(0x00);
+			return;
+		case 10:
+			Write8(0x66); Write8(0x66); Write8(0x0F); Write8(0x1F);
+			Write8(0x84); Write8(0x00); Write8(0x00); Write8(0x00);
+			Write8(0x00); Write8(0x00);
+			return;
+		default:
+			// Even though x86 instructions are allowed to be up to 15 bytes long,
+			// AMD advises against using NOPs longer than 11 bytes because they
+			// carry a performance penalty on CPUs older than AMD family 16h.
+			Write8(0x66); Write8(0x66); Write8(0x66); Write8(0x0F);
+			Write8(0x1F); Write8(0x84); Write8(0x00); Write8(0x00);
+			Write8(0x00); Write8(0x00); Write8(0x00);
+			size -= 11;
+			continue;
 		}
-		break;
 	}
 }
 
 void XEmitter::PAUSE() {Write8(0xF3); NOP();} //use in tight spinloops for energy saving on some cpu
-void XEmitter::CLC()  {Write8(0xF8);} //clear carry
-void XEmitter::CMC()  {Write8(0xF5);} //flip carry
-void XEmitter::STC()  {Write8(0xF9);} //set carry
+void XEmitter::CLC()  {CheckFlags(); Write8(0xF8);} //clear carry
+void XEmitter::CMC()  {CheckFlags(); Write8(0xF5);} //flip carry
+void XEmitter::STC()  {CheckFlags(); Write8(0xF9);} //set carry
 
 //TODO: xchg ah, al ???
 void XEmitter::XCHG_AHAL()
@@ -569,10 +615,10 @@ void XEmitter::XCHG_AHAL()
 
 //These two can not be executed on early Intel 64-bit CPU:s, only on AMD!
 void XEmitter::LAHF() {Write8(0x9F);}
-void XEmitter::SAHF() {Write8(0x9E);}
+void XEmitter::SAHF() {CheckFlags(); Write8(0x9E);}
 
 void XEmitter::PUSHF() {Write8(0x9C);}
-void XEmitter::POPF()  {Write8(0x9D);}
+void XEmitter::POPF()  {CheckFlags(); Write8(0x9D);}
 
 void XEmitter::LFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xE8);}
 void XEmitter::MFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF0);}
@@ -580,14 +626,16 @@ void XEmitter::SFENCE() {Write8(0x0F); Write8(0xAE); Write8(0xF8);}
 
 void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg)
 {
-	if (bits == 16) {Write8(0x66);}
+	if (bits == 16)
+		Write8(0x66);
 	Rex(bits == 64, 0, 0, (int)reg >> 3);
 	Write8(byte + ((int)reg & 7));
 }
 
 void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
 {
-	if (bits == 16) {Write8(0x66);}
+	if (bits == 16)
+		Write8(0x66);
 	Rex(bits==64, 0, 0, (int)reg >> 3);
 	Write8(byte1);
 	Write8(byte2 + ((int)reg & 7));
@@ -595,14 +643,16 @@ void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
 
 void XEmitter::CWD(int bits)
 {
-	if (bits == 16) {Write8(0x66);}
+	if (bits == 16)
+		Write8(0x66);
 	Rex(bits == 64, 0, 0, 0);
 	Write8(0x99);
 }
 
 void XEmitter::CBW(int bits)
 {
-	if (bits == 8) {Write8(0x66);}
+	if (bits == 8)
+		Write8(0x66);
 	Rex(bits == 32, 0, 0, 0);
 	Write8(0x98);
 }
@@ -655,7 +705,7 @@ void XEmitter::POP(int /*bits*/, const OpArg &reg)
 	if (reg.IsSimpleReg())
 		POP(reg.GetSimpleReg());
 	else
-		INT3();
+		_assert_msg_(DYNA_REC, 0, "POP - Unsupported encoding");
 }
 
 void XEmitter::BSWAP(int bits, X64Reg reg)
@@ -688,7 +738,7 @@ void XEmitter::UD2()
 
 void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
 {
-	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "PREFETCH - Imm argument");;
+	_assert_msg_(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument");
 	arg.operandReg = (u8)level;
 	arg.WriteRex(this, 0, 0);
 	Write8(0x0F);
@@ -698,9 +748,9 @@ void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
 
 void XEmitter::SETcc(CCFlags flag, OpArg dest)
 {
-	if (dest.IsImm()) _assert_msg_(DYNA_REC, 0, "SETcc - Imm argument");
+	_assert_msg_(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument");
 	dest.operandReg = 0;
-	dest.WriteRex(this, 0, 0);
+	dest.WriteRex(this, 0, 8);
 	Write8(0x0F);
 	Write8(0x90 + (u8)flag);
 	dest.WriteRest(this);
@@ -708,7 +758,10 @@ void XEmitter::SETcc(CCFlags flag, OpArg dest)
 
 void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
 {
-	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "CMOVcc - Imm argument");
+	_assert_msg_(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument");
+	_assert_msg_(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported");
+	if (bits == 16)
+		Write8(0x66);
 	src.operandReg = dest;
 	src.WriteRex(this, bits, bits);
 	Write8(0x0F);
@@ -718,10 +771,12 @@ void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
 
 void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
 {
-	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "WriteMulDivType - Imm argument");
+	_assert_msg_(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument");
+	CheckFlags();
 	src.operandReg = ext;
-	if (bits == 16) Write8(0x66);
-	src.WriteRex(this, bits, bits);
+	if (bits == 16)
+		Write8(0x66);
+	src.WriteRex(this, bits, bits, 0);
 	if (bits == 8)
 	{
 		Write8(0xF6);
@@ -740,11 +795,15 @@ void XEmitter::IDIV(int bits, OpArg src) {WriteMulDivType(bits, src, 7);}
 void XEmitter::NEG(int bits, OpArg src)  {WriteMulDivType(bits, src, 3);}
 void XEmitter::NOT(int bits, OpArg src)  {WriteMulDivType(bits, src, 2);}
 
-void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2)
+void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep)
 {
-	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "WriteBitSearchType - Imm argument");
+	_assert_msg_(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument");
+	CheckFlags();
 	src.operandReg = (u8)dest;
-	if (bits == 16) Write8(0x66);
+	if (bits == 16)
+		Write8(0x66);
+	if (rep)
+		Write8(0xF3);
 	src.WriteRex(this, bits, bits);
 	Write8(0x0F);
 	Write8(byte2);
@@ -753,22 +812,40 @@ void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2)
 
 void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src)
 {
-	if (bits <= 16) _assert_msg_(DYNA_REC, 0, "MOVNTI - bits<=16");
+	if (bits <= 16)
+		_assert_msg_(DYNA_REC, 0, "MOVNTI - bits<=16");
 	WriteBitSearchType(bits, src, dest, 0xC3);
 }
 
 void XEmitter::BSF(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBC);} //bottom bit to top bit
 void XEmitter::BSR(int bits, X64Reg dest, OpArg src) {WriteBitSearchType(bits,dest,src,0xBD);} //top bit to bottom bit
 
+void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src)
+{
+	CheckFlags();
+	if (!cpu_info.bBMI1)
+		PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+	WriteBitSearchType(bits, dest, src, 0xBC, true);
+}
+void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src)
+{
+	CheckFlags();
+	if (!cpu_info.bLZCNT)
+		PanicAlert("Trying to use LZCNT on a system that doesn't support it. Bad programmer.");
+	WriteBitSearchType(bits, dest, src, 0xBD, true);
+}
+
 void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
 {
-	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "MOVSX - Imm argument");
-	if (dbits == sbits) {
+	_assert_msg_(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument");
+	if (dbits == sbits)
+	{
 		MOV(dbits, R(dest), src);
 		return;
 	}
 	src.operandReg = (u8)dest;
-	if (dbits == 16) Write8(0x66);
+	if (dbits == 16)
+		Write8(0x66);
 	src.WriteRex(this, dbits, sbits);
 	if (sbits == 8)
 	{
@@ -793,13 +870,15 @@ void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
 
 void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 {
-	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "MOVZX - Imm argument");
-	if (dbits == sbits) {
+	_assert_msg_(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument");
+	if (dbits == sbits)
+	{
 		MOV(dbits, R(dest), src);
 		return;
 	}
 	src.operandReg = (u8)dest;
-	if (dbits == 16) Write8(0x66);
+	if (dbits == 16)
+		Write8(0x66);
 	//the 32bit result is automatically zero extended to 64bit
 	src.WriteRex(this, dbits == 64 ? 32 : dbits, sbits);
 	if (sbits == 8)
@@ -818,25 +897,59 @@ void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
 	}
 	else
 	{
-		Crash();
+		_assert_msg_(DYNA_REC, 0, "MOVZX - Invalid size");
 	}
 	src.WriteRest(this);
 }
 
+void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+{
+	_assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+	if (bits == 8)
+	{
+		MOV(bits, dest, src);
+		return;
+	}
+
+	if (bits == 16)
+		Write8(0x66);
+
+	if (dest.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, !src.IsSimpleReg() && !src.IsImm(), "MOVBE: Loading from !mem");
+		src.WriteRex(this, bits, bits, dest.GetSimpleReg());
+		Write8(0x0F); Write8(0x38); Write8(0xF0);
+		src.WriteRest(this, 0, dest.GetSimpleReg());
+	}
+	else if (src.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, !dest.IsSimpleReg() && !dest.IsImm(), "MOVBE: Storing to !mem");
+		dest.WriteRex(this, bits, bits, src.GetSimpleReg());
+		Write8(0x0F); Write8(0x38); Write8(0xF1);
+		dest.WriteRest(this, 0, src.GetSimpleReg());
+	}
+	else
+	{
+		_assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+	}
+}
+
 
 void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
 {
-	if (src.IsImm()) _assert_msg_(DYNA_REC, 0, "LEA - Imm argument");
+	_assert_msg_(DYNA_REC, !src.IsImm(), "LEA - Imm argument");
 	src.operandReg = (u8)dest;
-	if (bits == 16) Write8(0x66); //TODO: performance warning
+	if (bits == 16)
+		Write8(0x66); //TODO: performance warning
 	src.WriteRex(this, bits, bits);
 	Write8(0x8D);
-	src.WriteRest(this, 0, (X64Reg)0xFF, bits == 64);
+	src.WriteRest(this, 0, INVALID_REG, bits == 64);
 }
 
 //shift can be either imm8 or cl
 void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
 {
+	CheckFlags();
 	bool writeImm = false;
 	if (dest.IsImm())
 	{
@@ -847,7 +960,8 @@ void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
 		_assert_msg_(DYNA_REC, 0, "WriteShift - illegal argument");
 	}
 	dest.operandReg = ext;
-	if (bits == 16) Write8(0x66);
+	if (bits == 16)
+		Write8(0x66);
 	dest.WriteRex(this, bits, bits, 0);
 	if (shift.GetImmBits() == 8)
 	{
@@ -885,6 +999,7 @@ void XEmitter::SAR(int bits, OpArg dest, OpArg shift) {WriteShift(bits, dest, sh
 // index can be either imm8 or register, don't use memory destination because it's slow
 void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
 {
+	CheckFlags();
 	if (dest.IsImm())
 	{
 		_assert_msg_(DYNA_REC, 0, "WriteBitTest - can't test imms");
@@ -893,7 +1008,8 @@ void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
 	{
 		_assert_msg_(DYNA_REC, 0, "WriteBitTest - illegal argument");
 	}
-	if (bits == 16) Write8(0x66);
+	if (bits == 16)
+		Write8(0x66);
 	if (index.IsImm())
 	{
 		dest.WriteRex(this, bits, bits);
@@ -918,6 +1034,7 @@ void XEmitter::BTC(int bits, OpArg dest, OpArg index) {WriteBitTest(bits, dest,
 //shift can be either imm8 or cl
 void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
 {
+	CheckFlags();
 	if (dest.IsImm())
 	{
 		_assert_msg_(DYNA_REC, 0, "SHRD - can't use imms as destination");
@@ -930,7 +1047,8 @@ void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
 	{
 		_assert_msg_(DYNA_REC, 0, "SHRD - illegal shift");
 	}
-	if (bits == 16) Write8(0x66);
+	if (bits == 16)
+		Write8(0x66);
 	X64Reg operand = src.GetSimpleReg();
 	dest.WriteRex(this, bits, bits, operand);
 	if (shift.GetImmBits() == 8)
@@ -948,6 +1066,7 @@ void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
 
 void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift)
 {
+	CheckFlags();
 	if (dest.IsImm())
 	{
 		_assert_msg_(DYNA_REC, 0, "SHLD - can't use imms as destination");
@@ -960,7 +1079,8 @@ void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift)
 	{
 		_assert_msg_(DYNA_REC, 0, "SHLD - illegal shift");
 	}
-	if (bits == 16) Write8(0x66);
+	if (bits == 16)
+		Write8(0x66);
 	X64Reg operand = src.GetSimpleReg();
 	dest.WriteRex(this, bits, bits, operand);
 	if (shift.GetImmBits() == 8)
@@ -990,7 +1110,7 @@ void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bit
 //operand can either be immediate or register
 void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const
 {
-	X64Reg _operandReg = (X64Reg)this->operandReg;
+	X64Reg _operandReg;
 	if (IsImm())
 	{
 		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order");
@@ -1003,7 +1123,6 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 
 	if (operand.IsImm())
 	{
-		_operandReg = (X64Reg)0;
 		WriteRex(emit, bits, bits);
 
 		if (!toRM)
@@ -1013,26 +1132,81 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 
 		if (operand.scale == SCALE_IMM8 && bits == 8)
 		{
-			emit->Write8(nops[op].imm8);
+			// op al, imm8
+			if (!scale && offsetOrBaseReg == AL && normalops[op].eaximm8 != 0xCC)
+			{
+				emit->Write8(normalops[op].eaximm8);
+				emit->Write8((u8)operand.offset);
+				return;
+			}
+			// mov reg, imm8
+			if (!scale && op == nrmMOV)
+			{
+				emit->Write8(0xB0 + (offsetOrBaseReg & 7));
+				emit->Write8((u8)operand.offset);
+				return;
+			}
+			// op r/m8, imm8
+			emit->Write8(normalops[op].imm8);
 			immToWrite = 8;
 		}
 		else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
 				 (operand.scale == SCALE_IMM32 && bits == 32) ||
 				 (operand.scale == SCALE_IMM32 && bits == 64))
 		{
-			emit->Write8(nops[op].imm32);
-			immToWrite = bits == 16 ? 16 : 32;
+			// Try to save immediate size if we can, but first check to see
+			// if the instruction supports simm8.
+			// op r/m, imm8
+			if (normalops[op].simm8 != 0xCC &&
+			    ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) ||
+			     (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset)))
+			{
+				emit->Write8(normalops[op].simm8);
+				immToWrite = 8;
+			}
+			else
+			{
+				// mov reg, imm
+				if (!scale && op == nrmMOV && bits != 64)
+				{
+					emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+					if (bits == 16)
+						emit->Write16((u16)operand.offset);
+					else
+						emit->Write32((u32)operand.offset);
+					return;
+				}
+				// op eax, imm
+				if (!scale && offsetOrBaseReg == EAX && normalops[op].eaximm32 != 0xCC)
+				{
+					emit->Write8(normalops[op].eaximm32);
+					if (bits == 16)
+						emit->Write16((u16)operand.offset);
+					else
+						emit->Write32((u32)operand.offset);
+					return;
+				}
+				// op r/m, imm
+				emit->Write8(normalops[op].imm32);
+				immToWrite = bits == 16 ? 16 : 32;
+			}
 		}
 		else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
 				 (operand.scale == SCALE_IMM8 && bits == 32) ||
 				 (operand.scale == SCALE_IMM8 && bits == 64))
 		{
-			emit->Write8(nops[op].simm8);
+			// op r/m, imm8
+			emit->Write8(normalops[op].simm8);
 			immToWrite = 8;
 		}
 		else if (operand.scale == SCALE_IMM64 && bits == 64)
 		{
-			if (op == nrmMOV)
+			if (scale)
+			{
+				_assert_msg_(DYNA_REC, 0, "WriteNormalOp - MOV with 64-bit imm requres register destination");
+			}
+			// mov reg64, imm64
+			else if (op == nrmMOV)
 			{
 				emit->Write8(0xB8 + (offsetOrBaseReg & 7));
 				emit->Write64((u64)operand.offset);
@@ -1044,25 +1218,24 @@ void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &o
 		{
 			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
 		}
-		_operandReg = (X64Reg)nops[op].ext; //pass extension in REG of ModRM
+		_operandReg = (X64Reg)normalops[op].ext; //pass extension in REG of ModRM
 	}
 	else
 	{
 		_operandReg = (X64Reg)operand.offsetOrBaseReg;
 		WriteRex(emit, bits, bits, _operandReg);
-		// mem/reg or reg/reg op
+		// op r/m, reg
 		if (toRM)
 		{
-			emit->Write8(bits == 8 ? nops[op].toRm8 : nops[op].toRm32);
-			// _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH4");
+			emit->Write8(bits == 8 ? normalops[op].toRm8 : normalops[op].toRm32);
 		}
+		// op reg, r/m
 		else
 		{
-			emit->Write8(bits == 8 ? nops[op].fromRm8 : nops[op].fromRm32);
-			// _assert_msg_(DYNA_REC, code[-1] != 0xCC, "ARGH5");
+			emit->Write8(bits == 8 ? normalops[op].fromRm8 : normalops[op].fromRm32);
 		}
 	}
-	WriteRest(emit, immToWrite>>3, _operandReg);
+	WriteRest(emit, immToWrite >> 3, _operandReg);
 	switch (immToWrite)
 	{
 	case 0:
@@ -1101,40 +1274,44 @@ void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg
 		}
 		else
 		{
+			_assert_msg_(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(), "WriteNormalOp - a1 and a2 cannot both be memory");
 			a1.WriteNormalOp(emit, true, op, a2, bits);
 		}
 	}
 }
 
-void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmADD, a1, a2);}
-void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmADC, a1, a2);}
-void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmSUB, a1, a2);}
-void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmSBB, a1, a2);}
-void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmAND, a1, a2);}
-void XEmitter::OR  (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmOR , a1, a2);}
-void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXOR, a1, a2);}
+void XEmitter::ADD (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2);}
+void XEmitter::ADC (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2);}
+void XEmitter::SUB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2);}
+void XEmitter::SBB (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2);}
+void XEmitter::AND (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2);}
+void XEmitter::OR  (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmOR , a1, a2);}
+void XEmitter::XOR (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2);}
 void XEmitter::MOV (int bits, const OpArg &a1, const OpArg &a2)
 {
-#ifdef _DEBUG
-	_assert_msg_(DYNA_REC, !a1.IsSimpleReg() || !a2.IsSimpleReg() || a1.GetSimpleReg() != a2.GetSimpleReg(), "Redundant MOV @ %p - bug in DYNA_REC?",
-				 code);
-#endif
+	if (a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg())
+		ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code);
 	WriteNormalOp(this, bits, nrmMOV, a1, a2);
 }
-void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmTEST, a1, a2);}
-void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmCMP, a1, a2);}
+void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2);}
+void XEmitter::CMP (int bits, const OpArg &a1, const OpArg &a2) {CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2);}
 void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) {WriteNormalOp(this, bits, nrmXCHG, a1, a2);}
 
 void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
 {
-	if (bits == 8) {
+	CheckFlags();
+	if (bits == 8)
+	{
 		_assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!");
 		return;
 	}
-	if (a1.IsImm()) {
+
+	if (a1.IsImm())
+	{
 		_assert_msg_(DYNA_REC, 0, "IMUL - second arg cannot be imm!");
 		return;
 	}
+
 	if (!a2.IsImm())
 	{
 		_assert_msg_(DYNA_REC, 0, "IMUL - third arg must be imm!");
@@ -1145,20 +1322,29 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
 		Write8(0x66);
 	a1.WriteRex(this, bits, bits, regOp);
 
-	if (a2.GetImmBits() == 8) {
+	if (a2.GetImmBits() == 8 ||
+	    (a2.GetImmBits() == 16 && (s8)a2.offset == (s16)a2.offset) ||
+	    (a2.GetImmBits() == 32 && (s8)a2.offset == (s32)a2.offset))
+	{
 		Write8(0x6B);
 		a1.WriteRest(this, 1, regOp);
 		Write8((u8)a2.offset);
-	} else {
+	}
+	else
+	{
 		Write8(0x69);
-		if (a2.GetImmBits() == 16 && bits == 16) {
+		if (a2.GetImmBits() == 16 && bits == 16)
+		{
 			a1.WriteRest(this, 2, regOp);
 			Write16((u16)a2.offset);
-		} else if (a2.GetImmBits() == 32 &&
-			(bits == 32 || bits == 64)) {
-				a1.WriteRest(this, 4, regOp);
-				Write32((u32)a2.offset);
-		} else {
+		}
+		else if (a2.GetImmBits() == 32 && (bits == 32 || bits == 64))
+		{
+			a1.WriteRest(this, 4, regOp);
+			Write32((u32)a2.offset);
+		}
+		else
+		{
 			_assert_msg_(DYNA_REC, 0, "IMUL - unhandled case!");
 		}
 	}
@@ -1166,10 +1352,13 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
 
 void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
 {
-	if (bits == 8) {
+	CheckFlags();
+	if (bits == 8)
+	{
 		_assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!");
 		return;
 	}
+
 	if (a.IsImm())
 	{
 		IMUL(bits, regOp, R(regOp), a) ;
@@ -1185,49 +1374,92 @@ void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
 }
 
 
-void XEmitter::WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
+void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
 {
-	if (size == 64 && packed)
-		Write8(0x66); //this time, override goes upwards
-	if (!packed)
-		Write8(size == 64 ? 0xF2 : 0xF3);
+	if (opPrefix)
+		Write8(opPrefix);
 	arg.operandReg = regOp;
 	arg.WriteRex(this, 0, 0);
 	Write8(0x0F);
-	Write8(sseOp);
+	if (op > 0xFF)
+		Write8((op >> 8) & 0xFF);
+	Write8(op & 0xFF);
 	arg.WriteRest(this, extrabytes);
 }
 
-void XEmitter::WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
 {
-	if (size == 64 && packed)
-	Write8(0x66); //this time, override goes upwards
-	if (!packed)
-		Write8(size == 64 ? 0xF2 : 0xF3);
-	arg.operandReg = regOp;
-	arg.WriteRex(this, 0, 0);
-	Write8(0x0F);
-	Write8(0x38);
-	Write8(sseOp);
-	arg.WriteRest(this, extrabytes);
+	WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, extrabytes);
 }
 
-void XEmitter::WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes)
+static int GetVEXmmmmm(u16 op)
 {
-	WriteAVXOp(size, sseOp, packed, regOp, X64Reg::INVALID_REG, arg, extrabytes);
+	// Currently, only 0x38 and 0x3A are used as secondary escape byte.
+	if ((op >> 8) == 0x3A)
+		return 3;
+	else if ((op >> 8) == 0x38)
+		return 2;
+	else
+		return 1;
 }
 
-void XEmitter::WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+static int GetVEXpp(u8 opPrefix)
 {
-	arg.WriteVex(this, size, packed, regOp1, regOp2);
-	Write8(sseOp);
+	if (opPrefix == 0x66)
+		return 1;
+	else if (opPrefix == 0xF3)
+		return 2;
+	else if (opPrefix == 0xF2)
+		return 3;
+	else
+		return 0;
+}
+
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	if (!cpu_info.bAVX)
+		PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
+	int mmmmm = GetVEXmmmmm(op);
+	int pp = GetVEXpp(opPrefix);
+	// FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
+	arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm);
+	Write8(op & 0xFF);
 	arg.WriteRest(this, extrabytes, regOp1);
 }
 
-void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x6E, true, dest, arg, 0);}
-void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(64, 0x7E, true, src, arg, 0);}
+// Like the above, but more general; covers GPR-based VEX operations, like BMI1/2
+void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	if (size != 32 && size != 64)
+		PanicAlert("VEX GPR instructions only support 32-bit and 64-bit modes!");
+	int mmmmm = GetVEXmmmmm(op);
+	int pp = GetVEXpp(opPrefix);
+	arg.WriteVex(this, regOp1, regOp2, 0, pp, mmmmm, size == 64);
+	Write8(op & 0xFF);
+	arg.WriteRest(this, extrabytes, regOp1);
+}
 
-void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) {
+void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	CheckFlags();
+	if (!cpu_info.bBMI1)
+		PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+	WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	CheckFlags();
+	if (!cpu_info.bBMI2)
+		PanicAlert("Trying to use BMI2 on a system that doesn't support it. Bad programmer.");
+	WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6E, dest, arg, 0);}
+void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) {WriteSSEOp(0x66, 0x7E, src, arg, 0);}
+
+void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
+{
 #ifdef _M_X64
 		// Alternate encoding
 		// This does not display correctly in MSVC's debugger, it thinks it's a MOVD
@@ -1246,10 +1478,9 @@ void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg) {
 #endif
 }
 
-void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) {
-	if (arg.IsSimpleReg())
-		PanicAlert("Emitter: MOVQ_xmm doesn't support single registers as destination");
-	if (src > 7)
+void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src)
+{
+	if (src > 7 || arg.IsSimpleReg())
 	{
 		// Alternate encoding
 		// This does not display correctly in MSVC's debugger, it thinks it's a MOVD
@@ -1259,7 +1490,9 @@ void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src) {
 		Write8(0x0f);
 		Write8(0x7E);
 		arg.WriteRest(this, 0);
-	} else {
+	}
+	else
+	{
 		arg.operandReg = src;
 		arg.WriteRex(this, 0, 0);
 		Write8(0x66);
@@ -1284,119 +1517,128 @@ void XEmitter::WriteMXCSR(OpArg arg, int ext)
 void XEmitter::STMXCSR(OpArg memloc) {WriteMXCSR(memloc, 3);}
 void XEmitter::LDMXCSR(OpArg memloc) {WriteMXCSR(memloc, 2);}
 
-void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp)   {WriteSSEOp(64, sseMOVNTDQ, true, regOp, arg);}
-void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp)   {WriteSSEOp(32, sseMOVNTP, true, regOp, arg);}
-void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp)   {WriteSSEOp(64, sseMOVNTP, true, regOp, arg);}
+void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg);}
+void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) {WriteSSEOp(0x00, sseMOVNTP, regOp, arg);}
+void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) {WriteSSEOp(0x66, sseMOVNTP, regOp, arg);}
 
-void XEmitter::ADDSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseADD, false, regOp, arg);}
-void XEmitter::ADDSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseADD, false, regOp, arg);}
-void XEmitter::SUBSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseSUB, false, regOp, arg);}
-void XEmitter::SUBSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseSUB, false, regOp, arg);}
-void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(32, sseCMP, false, regOp, arg,1); Write8(compare);}
-void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(64, sseCMP, false, regOp, arg,1); Write8(compare);}
-void XEmitter::MULSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMUL, false, regOp, arg);}
-void XEmitter::MULSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMUL, false, regOp, arg);}
-void XEmitter::DIVSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseDIV, false, regOp, arg);}
-void XEmitter::DIVSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseDIV, false, regOp, arg);}
-void XEmitter::MINSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMIN, false, regOp, arg);}
-void XEmitter::MINSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMIN, false, regOp, arg);}
-void XEmitter::MAXSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMAX, false, regOp, arg);}
-void XEmitter::MAXSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMAX, false, regOp, arg);}
-void XEmitter::SQRTSS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseSQRT, false, regOp, arg);}
-void XEmitter::SQRTSD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseSQRT, false, regOp, arg);}
-void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, false, regOp, arg);}
+void XEmitter::ADDSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseADD, regOp, arg);}
+void XEmitter::ADDSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseADD, regOp, arg);}
+void XEmitter::SUBSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseSUB, regOp, arg);}
+void XEmitter::SUBSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseSUB, regOp, arg);}
+void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::MULSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMUL, regOp, arg);}
+void XEmitter::MULSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMUL, regOp, arg);}
+void XEmitter::DIVSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseDIV, regOp, arg);}
+void XEmitter::DIVSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseDIV, regOp, arg);}
+void XEmitter::MINSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMIN, regOp, arg);}
+void XEmitter::MINSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMIN, regOp, arg);}
+void XEmitter::MAXSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMAX, regOp, arg);}
+void XEmitter::MAXSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMAX, regOp, arg);}
+void XEmitter::SQRTSS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0xF3, sseSQRT, regOp, arg);}
+void XEmitter::SQRTSD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0xF2, sseSQRT, regOp, arg);}
+void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, sseRSQRT, regOp, arg);}
 
-void XEmitter::ADDPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseADD, true, regOp, arg);}
-void XEmitter::ADDPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseADD, true, regOp, arg);}
-void XEmitter::SUBPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseSUB, true, regOp, arg);}
-void XEmitter::SUBPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseSUB, true, regOp, arg);}
-void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(32, sseCMP, true, regOp, arg,1); Write8(compare);}
-void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(64, sseCMP, true, regOp, arg,1); Write8(compare);}
-void XEmitter::ANDPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseAND, true, regOp, arg);}
-void XEmitter::ANDPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseAND, true, regOp, arg);}
-void XEmitter::ANDNPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseANDN, true, regOp, arg);}
-void XEmitter::ANDNPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseANDN, true, regOp, arg);}
-void XEmitter::ORPS(X64Reg regOp, OpArg arg)    {WriteSSEOp(32, sseOR, true, regOp, arg);}
-void XEmitter::ORPD(X64Reg regOp, OpArg arg)    {WriteSSEOp(64, sseOR, true, regOp, arg);}
-void XEmitter::XORPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseXOR, true, regOp, arg);}
-void XEmitter::XORPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseXOR, true, regOp, arg);}
-void XEmitter::MULPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMUL, true, regOp, arg);}
-void XEmitter::MULPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMUL, true, regOp, arg);}
-void XEmitter::DIVPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseDIV, true, regOp, arg);}
-void XEmitter::DIVPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseDIV, true, regOp, arg);}
-void XEmitter::MINPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMIN, true, regOp, arg);}
-void XEmitter::MINPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMIN, true, regOp, arg);}
-void XEmitter::MAXPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMAX, true, regOp, arg);}
-void XEmitter::MAXPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMAX, true, regOp, arg);}
-void XEmitter::SQRTPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseSQRT, true, regOp, arg);}
-void XEmitter::SQRTPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseSQRT, true, regOp, arg);}
-void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseRSQRT, true, regOp, arg);}
-void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(32, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
-void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(64, sseSHUF, true, regOp, arg,1); Write8(shuffle);}
+void XEmitter::ADDPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseADD, regOp, arg);}
+void XEmitter::ADDPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseADD, regOp, arg);}
+void XEmitter::SUBPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseSUB, regOp, arg);}
+void XEmitter::SUBPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseSUB, regOp, arg);}
+void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare)   {WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare);}
+void XEmitter::ANDPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseAND, regOp, arg);}
+void XEmitter::ANDPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseAND, regOp, arg);}
+void XEmitter::ANDNPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseANDN, regOp, arg);}
+void XEmitter::ANDNPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseANDN, regOp, arg);}
+void XEmitter::ORPS(X64Reg regOp, OpArg arg)    {WriteSSEOp(0x00, sseOR, regOp, arg);}
+void XEmitter::ORPD(X64Reg regOp, OpArg arg)    {WriteSSEOp(0x66, sseOR, regOp, arg);}
+void XEmitter::XORPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseXOR, regOp, arg);}
+void XEmitter::XORPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseXOR, regOp, arg);}
+void XEmitter::MULPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseMUL, regOp, arg);}
+void XEmitter::MULPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseMUL, regOp, arg);}
+void XEmitter::DIVPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseDIV, regOp, arg);}
+void XEmitter::DIVPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseDIV, regOp, arg);}
+void XEmitter::MINPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseMIN, regOp, arg);}
+void XEmitter::MINPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseMIN, regOp, arg);}
+void XEmitter::MAXPS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x00, sseMAX, regOp, arg);}
+void XEmitter::MAXPD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0x66, sseMAX, regOp, arg);}
+void XEmitter::SQRTPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseSQRT, regOp, arg);}
+void XEmitter::SQRTPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseSQRT, regOp, arg);}
+void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseRSQRT, regOp, arg);}
+void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x00, sseSHUF, regOp, arg,1); Write8(shuffle);}
+void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) {WriteSSEOp(0x66, sseSHUF, regOp, arg,1); Write8(shuffle);}
 
-void XEmitter::COMISS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseCOMIS, true, regOp, arg);} //weird that these should be packed
-void XEmitter::COMISD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseCOMIS, true, regOp, arg);} //ordered
-void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, sseUCOMIS, true, regOp, arg);} //unordered
-void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(64, sseUCOMIS, true, regOp, arg);}
+void XEmitter::COMISS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseCOMIS, regOp, arg);} //weird that these should be packed
+void XEmitter::COMISD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseCOMIS, regOp, arg);} //ordered
+void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, sseUCOMIS, regOp, arg);} //unordered
+void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, sseUCOMIS, regOp, arg);}
 
-void XEmitter::MOVAPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseMOVAPfromRM, true, regOp, arg);}
-void XEmitter::MOVAPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseMOVAPfromRM, true, regOp, arg);}
-void XEmitter::MOVAPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVAPtoRM, true, regOp, arg);}
-void XEmitter::MOVAPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(64, sseMOVAPtoRM, true, regOp, arg);}
+void XEmitter::MOVAPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg);}
+void XEmitter::MOVAPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg);}
+void XEmitter::MOVAPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg);}
+void XEmitter::MOVAPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg);}
 
-void XEmitter::MOVUPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseMOVUPfromRM, true, regOp, arg);}
-void XEmitter::MOVUPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseMOVUPfromRM, true, regOp, arg);}
-void XEmitter::MOVUPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVUPtoRM, true, regOp, arg);}
-void XEmitter::MOVUPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(64, sseMOVUPtoRM, true, regOp, arg);}
+void XEmitter::MOVUPS(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVUPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVUPS(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg);}
+void XEmitter::MOVUPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg);}
 
-void XEmitter::MOVDQA(X64Reg regOp, OpArg arg)  {WriteSSEOp(64, sseMOVDQfromRM, true, regOp, arg);}
-void XEmitter::MOVDQA(OpArg arg, X64Reg regOp)  {WriteSSEOp(64, sseMOVDQtoRM, true, regOp, arg);}
-void XEmitter::MOVDQU(X64Reg regOp, OpArg arg)  {WriteSSEOp(32, sseMOVDQfromRM, false, regOp, arg);}
-void XEmitter::MOVDQU(OpArg arg, X64Reg regOp)  {WriteSSEOp(32, sseMOVDQtoRM, false, regOp, arg);}
+void XEmitter::MOVDQA(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg);}
+void XEmitter::MOVDQA(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg);}
+void XEmitter::MOVDQU(X64Reg regOp, OpArg arg)  {WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg);}
+void XEmitter::MOVDQU(OpArg arg, X64Reg regOp)  {WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg);}
 
-void XEmitter::MOVSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(32, sseMOVUPfromRM, false, regOp, arg);}
-void XEmitter::MOVSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(64, sseMOVUPfromRM, false, regOp, arg);}
-void XEmitter::MOVSS(OpArg arg, X64Reg regOp)   {WriteSSEOp(32, sseMOVUPtoRM, false, regOp, arg);}
-void XEmitter::MOVSD(OpArg arg, X64Reg regOp)   {WriteSSEOp(64, sseMOVUPtoRM, false, regOp, arg);}
+void XEmitter::MOVSS(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVSD(X64Reg regOp, OpArg arg)   {WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg);}
+void XEmitter::MOVSS(OpArg arg, X64Reg regOp)   {WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg);}
+void XEmitter::MOVSD(OpArg arg, X64Reg regOp)   {WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg);}
 
-void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, true, regOp, arg);}
-void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, true, regOp, arg);}
+void XEmitter::MOVLPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVLPDfromRM, regOp, arg);} // load low qword: 66 0F 12 /r (an F2 prefix would encode MOVDDUP instead)
+void XEmitter::MOVHPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVHPDfromRM, regOp, arg);} // load high qword: 66 0F 16 /r
+void XEmitter::MOVLPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVLPDtoRM, regOp, arg);} // store low qword: 66 0F 13 /r
+void XEmitter::MOVHPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVHPDtoRM, regOp, arg);} // store high qword: 66 0F 17 /r
 
-void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5A, false, regOp, arg);}
-void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5A, false, regOp, arg);}
-void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x2D, false, regOp, arg);}
+void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2));}
+void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) {WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2));}
 
-void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0xE6, false, regOp, arg);}
-void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(32, 0x5B, true, regOp, arg);}
-void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0xE6, false, regOp, arg);}
-void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(64, 0x5B, true, regOp, arg);}
+void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5A, regOp, arg);}
+void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5A, regOp, arg);}
 
-void XEmitter::CVTSI2SS(X64Reg xregdest, OpArg arg) {WriteSSEOp(32, 0x2A, false, xregdest, arg);}
-void XEmitter::CVTSS2SI(X64Reg xregdest, OpArg arg) {WriteSSEOp(32, 0x2D, false, xregdest, arg);}
-void XEmitter::CVTTSS2SI(X64Reg xregdest, OpArg arg) {WriteSSEOp(32, 0x2C, false, xregdest, arg);}
-void XEmitter::CVTTPS2DQ(X64Reg xregdest, OpArg arg) {WriteSSEOp(32, 0x5B, false, xregdest, arg);}
-void XEmitter::CVTTSD2SI(X64Reg xregdest, OpArg arg) {WriteSSEOp(64, 0x2C, false, xregdest, arg);}
-void XEmitter::CVTTPD2DQ(X64Reg xregdest, OpArg arg) {WriteSSEOp(64, 0xE6, true, xregdest, arg); }
+void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x5A, regOp, arg);}
+void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5A, regOp, arg);}
+void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2D, regOp, arg);}
+void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2D, regOp, arg);}
+void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2A, regOp, arg);}
+void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2A, regOp, arg);}
 
-void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)  {WriteSSEOp(64, sseMASKMOVDQU, true, dest, R(src));}
+void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0xE6, regOp, arg);}
+void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) {WriteSSEOp(0x00, 0x5B, regOp, arg);}
+void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0xE6, regOp, arg);}
+void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0x5B, regOp, arg);}
 
-void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x50, true, dest, arg);}
-void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x50, true, dest, arg);}
+void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF2, 0x2C, regOp, arg);}
+void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x2C, regOp, arg);}
+void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0xF3, 0x5B, regOp, arg);}
+void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) {WriteSSEOp(0x66, 0xE6, regOp, arg);}
 
-void XEmitter::LDDQU(X64Reg dest, OpArg arg)    {WriteSSEOp(64, sseLDDQU, false, dest, arg);} // For integer data only
+void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)  {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
+
+void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
+void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
+
+void XEmitter::LDDQU(X64Reg dest, OpArg arg)    {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
 
 // THESE TWO ARE UNTESTED.
-void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x14, true, dest, arg);}
-void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(32, 0x15, true, dest, arg);}
+void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
+void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
 
-void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x14, true, dest, arg);}
-void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x15, true, dest, arg);}
+void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
+void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
 
 void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
 {
 	if (cpu_info.bSSE3)
 	{
-		WriteSSEOp(64, 0x12, false, regOp, arg); //SSE3 movddup
+		WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup
 	}
 	else
 	{
@@ -1410,101 +1652,69 @@ void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
 //There are a few more left
 
 // Also some integer instructions are missing
-void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x6B, true, dest, arg);}
-void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x63, true, dest, arg);}
-//void PACKUSDW(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x66, true, dest, arg);} // WRONG
-void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x67, true, dest, arg);}
+void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);}
+void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);}
+void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);}
 
-void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
-void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x61, true, dest, arg);}
-void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(64, 0x62, true, dest, arg);}
-//void PUNPCKLQDQ(X64Reg dest, OpArg arg) {WriteSSEOp(64, 0x60, true, dest, arg);}
+void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);}
+void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);}
+void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);}
 
-void XEmitter::PMOVSXBW(X64Reg dest, const OpArg &arg) {
-	if (!cpu_info.bSSE4_1) {
-		PanicAlert("Trying to use PMOVSXBW on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x20, true, dest, arg);
-}
-
-void XEmitter::PMOVSXBD(X64Reg dest, const OpArg &arg) {
-	if (!cpu_info.bSSE4_1) {
-		PanicAlert("Trying to use PMOVSXBD on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x21, true, dest, arg);
-}
-
-void XEmitter::PMOVSXWD(X64Reg dest, const OpArg &arg) {
-	if (!cpu_info.bSSE4_1) {
-		PanicAlert("Trying to use PMOVSXWD on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x23, true, dest, arg);
-}
-
-void XEmitter::PMOVZXBW(X64Reg dest, const OpArg &arg) {
-	if (!cpu_info.bSSE4_1) {
-		PanicAlert("Trying to use PMOVSXBW on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x30, true, dest, arg);
-}
-
-void XEmitter::PMOVZXBD(X64Reg dest, const OpArg &arg) {
-	if (!cpu_info.bSSE4_1) {
-		PanicAlert("Trying to use PMOVSXBD on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x31, true, dest, arg);
-}
-
-void XEmitter::PMOVZXWD(X64Reg dest, const OpArg &arg) {
-	if (!cpu_info.bSSE4_1) {
-		PanicAlert("Trying to use PMOVSXWD on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x33, true, dest, arg);
-}
-
-void XEmitter::PSRLW(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x71, true, (X64Reg)2, R(reg));
+void XEmitter::PSRLW(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg));
 	Write8(shift);
 }
 
-void XEmitter::PSRLD(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x72, true, (X64Reg)2, R(reg));
+void XEmitter::PSRLD(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg));
 	Write8(shift);
 }
 
-void XEmitter::PSRLQ(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x73, true, (X64Reg)2, R(reg));
+void XEmitter::PSRLQ(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg));
 	Write8(shift);
 }
 
-void XEmitter::PSLLW(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x71, true, (X64Reg)6, R(reg));
+void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
+{
+	WriteSSEOp(0x66, 0xd3, reg, arg);
+}
+
+void XEmitter::PSRLDQ(X64Reg reg, int shift) {
+	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg));
 	Write8(shift);
 }
 
-void XEmitter::PSLLD(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x72, true, (X64Reg)6, R(reg));
+void XEmitter::PSLLW(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg));
 	Write8(shift);
 }
 
-void XEmitter::PSLLQ(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x73, true, (X64Reg)6, R(reg));
+void XEmitter::PSLLD(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg));
+	Write8(shift);
+}
+
+void XEmitter::PSLLQ(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg));
 	Write8(shift);
 }
 
 void XEmitter::PSLLDQ(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x73, true, (X64Reg)7, R(reg));
-	Write8(shift);
-}
-
-void XEmitter::PSRLDQ(X64Reg reg, int shift) {
-	WriteSSEOp(64, 0x73, true, (X64Reg)3, R(reg));
+	WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg));
 	Write8(shift);
 }
 
 
 // WARNING not REX compatible
-void XEmitter::PSRAW(X64Reg reg, int shift) {
+void XEmitter::PSRAW(X64Reg reg, int shift)
+{
 	if (reg > 7)
 		PanicAlert("The PSRAW-emitter does not support regs above 7");
 	Write8(0x66);
@@ -1515,7 +1725,8 @@ void XEmitter::PSRAW(X64Reg reg, int shift) {
 }
 
 // WARNING not REX compatible
-void XEmitter::PSRAD(X64Reg reg, int shift) {
+void XEmitter::PSRAD(X64Reg reg, int shift)
+{
 	if (reg > 7)
 		PanicAlert("The PSRAD-emitter does not support regs above 7");
 	Write8(0x66);
@@ -1525,83 +1736,163 @@ void XEmitter::PSRAD(X64Reg reg, int shift) {
 	Write8(shift);
 }
 
-void XEmitter::PSHUFB(X64Reg dest, OpArg arg) {
-	if (!cpu_info.bSSSE3) {
-		PanicAlert("Trying to use PSHUFB on a system that doesn't support it. Bad programmer.");
-	}
-	WriteSSEOp2(64, 0x00, true, dest, arg);
+void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	if (!cpu_info.bSSSE3)
+		PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
+	WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
 }
 
-void XEmitter::PAND(X64Reg dest, OpArg arg)     {WriteSSEOp(64, 0xDB, true, dest, arg);}
-void XEmitter::PANDN(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xDF, true, dest, arg);}
-void XEmitter::PXOR(X64Reg dest, OpArg arg)     {WriteSSEOp(64, 0xEF, true, dest, arg);}
-void XEmitter::POR(X64Reg dest, OpArg arg)      {WriteSSEOp(64, 0xEB, true, dest, arg);}
+void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	if (!cpu_info.bSSE4_1)
+		PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
+	WriteSSEOp(opPrefix, op, regOp, arg, extrabytes);
+}
 
-void XEmitter::PADDB(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xFC, true, dest, arg);}
-void XEmitter::PADDW(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xFD, true, dest, arg);}
-void XEmitter::PADDD(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xFE, true, dest, arg);}
-void XEmitter::PADDQ(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xD4, true, dest, arg);}
+void XEmitter::PSHUFB(X64Reg dest, OpArg arg)   {WriteSSSE3Op(0x66, 0x3800, dest, arg);}
+void XEmitter::PTEST(X64Reg dest, OpArg arg)    {WriteSSE41Op(0x66, 0x3817, dest, arg);}
+void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
 
-void XEmitter::PADDSB(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xEC, true, dest, arg);}
-void XEmitter::PADDSW(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xED, true, dest, arg);}
-void XEmitter::PADDUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0xDC, true, dest, arg);}
-void XEmitter::PADDUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0xDD, true, dest, arg);}
+void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);}
+void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
+void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
+void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
+void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
+void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
+void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);}
+void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
+void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
+void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
+void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
+void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}
 
-void XEmitter::PSUBB(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xF8, true, dest, arg);}
-void XEmitter::PSUBW(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xF9, true, dest, arg);}
-void XEmitter::PSUBD(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xFA, true, dest, arg);}
-void XEmitter::PSUBQ(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xDB, true, dest, arg);}
+void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);}
+void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
+void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
 
-void XEmitter::PSUBSB(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xE8, true, dest, arg);}
-void XEmitter::PSUBSW(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xE9, true, dest, arg);}
-void XEmitter::PSUBUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0xD8, true, dest, arg);}
-void XEmitter::PSUBUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0xD9, true, dest, arg);}
+void XEmitter::PAND(X64Reg dest, OpArg arg)     {WriteSSEOp(0x66, 0xDB, dest, arg);}
+void XEmitter::PANDN(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xDF, dest, arg);}
+void XEmitter::PXOR(X64Reg dest, OpArg arg)     {WriteSSEOp(0x66, 0xEF, dest, arg);}
+void XEmitter::POR(X64Reg dest, OpArg arg)      {WriteSSEOp(0x66, 0xEB, dest, arg);}
 
-void XEmitter::PAVGB(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xE0, true, dest, arg);}
-void XEmitter::PAVGW(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xE3, true, dest, arg);}
+void XEmitter::PADDB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFC, dest, arg);}
+void XEmitter::PADDW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFD, dest, arg);}
+void XEmitter::PADDD(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFE, dest, arg);}
+void XEmitter::PADDQ(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xD4, dest, arg);}
 
-void XEmitter::PCMPEQB(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x74, true, dest, arg);}
-void XEmitter::PCMPEQW(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x75, true, dest, arg);}
-void XEmitter::PCMPEQD(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x76, true, dest, arg);}
+void XEmitter::PADDSB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEC, dest, arg);}
+void XEmitter::PADDSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xED, dest, arg);}
+void XEmitter::PADDUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xDC, dest, arg);}
+void XEmitter::PADDUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xDD, dest, arg);}
 
-void XEmitter::PCMPGTB(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x64, true, dest, arg);}
-void XEmitter::PCMPGTW(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x65, true, dest, arg);}
-void XEmitter::PCMPGTD(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0x66, true, dest, arg);}
+void XEmitter::PSUBB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xF8, dest, arg);}
+void XEmitter::PSUBW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xF9, dest, arg);}
+void XEmitter::PSUBD(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFA, dest, arg);}
+void XEmitter::PSUBQ(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFB, dest, arg);}
 
-void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(64, 0xC5, true, dest, arg); Write8(subreg);}
-void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(64, 0xC4, true, dest, arg); Write8(subreg);}
+void XEmitter::PSUBSB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xE8, dest, arg);}
+void XEmitter::PSUBSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xE9, dest, arg);}
+void XEmitter::PSUBUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xD8, dest, arg);}
+void XEmitter::PSUBUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xD9, dest, arg);}
 
-void XEmitter::PMADDWD(X64Reg dest, OpArg arg)  {WriteSSEOp(64, 0xF5, true, dest, arg); }
-void XEmitter::PSADBW(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xF6, true, dest, arg);}
+void XEmitter::PAVGB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xE0, dest, arg);}
+void XEmitter::PAVGW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xE3, dest, arg);}
 
-void XEmitter::PMAXSW(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xEE, true, dest, arg); }
-void XEmitter::PMAXUB(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xDE, true, dest, arg); }
-void XEmitter::PMINSW(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xEA, true, dest, arg); }
-void XEmitter::PMINUB(X64Reg dest, OpArg arg)   {WriteSSEOp(64, 0xDA, true, dest, arg); }
+void XEmitter::PCMPEQB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x74, dest, arg);}
+void XEmitter::PCMPEQW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x75, dest, arg);}
+void XEmitter::PCMPEQD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x76, dest, arg);}
 
-void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)    {WriteSSEOp(64, 0xD7, true, dest, arg); }
+void XEmitter::PCMPGTB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x64, dest, arg);}
+void XEmitter::PCMPGTW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x65, dest, arg);}
+void XEmitter::PCMPGTD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x66, dest, arg);}
 
-void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle)    {WriteSSEOp(64, 0x70, true, regOp, arg, 1); Write8(shuffle);}
-void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(64, 0x70, false, regOp, arg, 1); Write8(shuffle);}
+void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} // extrabytes = 1: an imm8 (subreg) is written after the operand
+void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} // extrabytes = 1, as for PSHUFD: keeps RIP-relative displacements correct
+
+void XEmitter::PMADDWD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xF5, dest, arg); }
+void XEmitter::PSADBW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xF6, dest, arg);}
+
+void XEmitter::PMAXSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEE, dest, arg); }
+void XEmitter::PMAXUB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xDE, dest, arg); }
+void XEmitter::PMINSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEA, dest, arg); }
+void XEmitter::PMINUB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xDA, dest, arg); }
+
+void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xD7, dest, arg); }
+void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle)    {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);}
+void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);}
+void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);}
 
 // VEX
-void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseADD, false, regOp1, regOp2, arg);}
-void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseSUB, false, regOp1, regOp2, arg);}
-void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseMUL, false, regOp1, regOp2, arg);}
-void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(64, sseDIV, false, regOp1, regOp2, arg);}
-void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(64, sseSQRT, false, regOp1, regOp2, arg);}
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);}
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);}
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
+void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
+void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
+void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
+void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg);} // same opcode byte as PAND; sseAND (0x54) would encode VANDPD
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg);} // same opcode byte as PANDN; sseANDN (0x55) would encode VANDNPD
+void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg)     {WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg);} // same opcode byte as POR; sseOR (0x56) would encode VORPD
+void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);} // same opcode byte as PXOR; sseXOR (0x57) would encode VXORPD
+void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 1); Write8(shuffle);}
+void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
+void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
+
+void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate)      {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);}
+void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);}
+void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg)                 {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);}
+void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg)               {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);}
+void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg)                 {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);}
+void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);}
 
 // Prefixes
 
 void XEmitter::LOCK()  { Write8(0xF0); }
 void XEmitter::REP()   { Write8(0xF3); }
 void XEmitter::REPNE() { Write8(0xF2); }
+void XEmitter::FSOverride() { Write8(0x64); }
+void XEmitter::GSOverride() { Write8(0x65); }
 
-void XEmitter::FWAIT() {
+void XEmitter::FWAIT()
+{
 	Write8(0x9B);
 }
 
-void XEmitter::RTDSC() { Write8(0x0F); Write8(0x31); }
+// Emits the opcode byte and operand for an x87 load/store of a 32/64/80-bit value. TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg)
+{
+	int mf = 0; // bits OR'ed into the 0xD9 base opcode, chosen by operand size below
+	_assert_msg_(DYNA_REC, !(bits == 80 && op_80b == floatINVALID), "WriteFloatLoadStore: 80 bits not supported for this instruction");
+	switch (bits)
+	{
+	case 32: mf = 0; break; // opcode byte stays 0xD9
+	case 64: mf = 4; break; // opcode byte becomes 0xDD
+	case 80: mf = 2; break; // opcode byte becomes 0xDB
+	default: _assert_msg_(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)");
+	}
+	Write8(0xd9 | mf);
+	// x87 instructions use the reg field of the ModR/M byte as opcode:
+	if (bits == 80)
+		op = op_80b; // the 80-bit form uses its own reg-field value, supplied by the caller
+	arg.WriteRest(this, 0, (X64Reg) op); // op is passed where WriteRest expects the register operand
+}
+
+void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);}
+void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);}
+void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);}
+void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); }
+
+void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); }
 
 void XCodeBlock::AllocCodeSpace(int size) {
 	region_size = size;
@@ -1625,5 +1916,4 @@ void XCodeBlock::WriteProtect() {
 	WriteProtectMemory(region, region_size, true);
 }
 
-} // Gen
-
+}
diff --git a/Common/x64Emitter.h b/Common/x64Emitter.h
index 2b163ff52a..3af96eea84 100644
--- a/Common/x64Emitter.h
+++ b/Common/x64Emitter.h
@@ -22,6 +22,10 @@
 
 #include "Common.h"
 
+#ifdef _M_X64
+#define _ARCH_64
+#endif
+
 namespace Gen
 {
 
@@ -55,10 +59,10 @@ enum CCFlags
 {
 	CC_O   = 0,
 	CC_NO  = 1,
-	CC_B   = 2, CC_C  = 2, CC_NAE = 2,
-	CC_NB  = 3, CC_NC = 3, CC_AE  = 3,
+	CC_B   = 2, CC_C   = 2, CC_NAE = 2,
+	CC_NB  = 3, CC_NC  = 3, CC_AE  = 3,
 	CC_Z   = 4, CC_E   = 4,
-	CC_NZ  = 5,	CC_NE  = 5,
+	CC_NZ  = 5, CC_NE  = 5,
 	CC_BE  = 6, CC_NA  = 6,
 	CC_NBE = 7, CC_A   = 7,
 	CC_S   = 8,
@@ -121,6 +125,16 @@ enum {
 	CMP_ORD = 7,
 };
 
+enum FloatOp {
+	floatLD = 0,
+	floatST = 2,
+	floatSTP = 3,
+	floatLD80 = 5,
+	floatSTP80 = 7,
+
+	floatINVALID = -1,
+};
+
 class XEmitter;
 
 // RIP addressing does not benefit from micro op fusion on Core arch
@@ -136,9 +150,15 @@ struct OpArg
 		//if scale == 0 never mind offsetting
 		offset = _offset;
 	}
+	bool operator==(const OpArg &b) const // field-wise equality; const so const operands can be compared
+	{
+		return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
+		       indexReg == b.indexReg && offset == b.offset;
+	}
 	void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
-	void WriteVex(XEmitter* emit, int size, int packed, Gen::X64Reg regOp1, X64Reg regOp2) const;
-	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=(X64Reg)0xFF, bool warn_64bit_offset = true) const;
+	void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
+	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
+	void WriteFloatModRM(XEmitter *emit, FloatOp op);
 	void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
 	// This one is public - must be written to
 	u64 offset;  // use RIP-relative as much as possible - 64-bit immediates are not available.
@@ -147,7 +167,8 @@ struct OpArg
 	void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
 	bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
 	bool IsSimpleReg() const {return scale == SCALE_NONE;}
-	bool IsSimpleReg(X64Reg reg) const {
+	bool IsSimpleReg(X64Reg reg) const
+	{
 		if (!IsSimpleReg())
 			return false;
 		return GetSimpleReg() == reg;
@@ -195,26 +216,35 @@ private:
 	u16 indexReg;
 };
 
-inline OpArg M(void *ptr)	    {return OpArg((u64)ptr, (int)SCALE_RIP);}
+inline OpArg M(const void *ptr) {return OpArg((u64)ptr, (int)SCALE_RIP);}
 template <typename T>
 inline OpArg M(const T *ptr)    {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);}
-inline OpArg R(X64Reg value)	{return OpArg(0, SCALE_NONE, value);}
+inline OpArg R(X64Reg value)    {return OpArg(0, SCALE_NONE, value);}
 inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);}
-inline OpArg MDisp(X64Reg value, int offset) {
+
+inline OpArg MDisp(X64Reg value, int offset)
+{
 	return OpArg((u32)offset, SCALE_ATREG, value);
 }
-inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) {
+
+inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
+{
 	return OpArg(offset, scale, base, scaled);
 }
-inline OpArg MScaled(X64Reg scaled, int scale, int offset) {
+
+inline OpArg MScaled(X64Reg scaled, int scale, int offset)
+{
 	if (scale == SCALE_1)
 		return OpArg(offset, SCALE_ATREG, scaled);
 	else
 		return OpArg(offset, scale | 0x20, RAX, scaled);
 }
-inline OpArg MRegSum(X64Reg base, X64Reg offset) {
+
+inline OpArg MRegSum(X64Reg base, X64Reg offset)
+{
 	return MComplex(base, offset, 1, 0);
 }
+
 inline OpArg Imm8 (u8 imm)  {return OpArg(imm, SCALE_IMM8);}
 inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
 inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
@@ -226,19 +256,23 @@ inline OpArg SImmAuto(s32 imm) {
 	return OpArg(imm, (imm >= 128 || imm < -128) ? SCALE_IMM32 : SCALE_IMM8);
 }
 
-#ifdef _M_X64
+#ifdef _ARCH_64
 inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);}
 #else
 inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);}
 #endif
-inline u32 PtrOffset(const void* ptr, const void* base) {
-#ifdef _M_X64
+
+inline u32 PtrOffset(const void* ptr, const void* base)
+{
+#ifdef _ARCH_64
 	s64 distance = (s64)ptr-(s64)base;
 	if (distance >= 0x80000000LL ||
-	    distance < -0x80000000LL) {
-		_assert_msg_(JIT, 0, "pointer offset out of range");
+	    distance < -0x80000000LL)
+	{
+		_assert_msg_(DYNA_REC, 0, "pointer offset out of range");
 		return 0;
 	}
+
 	return (u32)distance;
 #else
 	return (u32)ptr-(u32)base;
@@ -275,21 +309,31 @@ class XEmitter
 	friend struct OpArg;  // for Write8 etc
 private:
 	u8 *code;
+	bool flags_locked;
+
+	void CheckFlags();
 
 	void Rex(int w, int r, int x, int b);
 	void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
 	void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
 	void WriteMulDivType(int bits, OpArg src, int ext);
-	void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2);
+	void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false);
 	void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
 	void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
 	void WriteMXCSR(OpArg arg, int ext);
-	void WriteSSEOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
-	void WriteSSEOp2(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
-	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp, OpArg arg, int extrabytes = 0);
-	void WriteAVXOp(int size, u8 sseOp, bool packed, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
 	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
 
+	void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+
 protected:
 	inline void Write8(u8 value)   {*code++ = value;}
 	inline void Write16(u16 value) {*(u16*)code = (value); code += 2;}
@@ -297,8 +341,8 @@ protected:
 	inline void Write64(u64 value) {*(u64*)code = (value); code += 8;}
 
 public:
-	XEmitter() { code = NULL; }
-	XEmitter(u8 *code_ptr) { code = code_ptr; }
+	XEmitter() { code = nullptr; flags_locked = false; }
+	XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; }
 	virtual ~XEmitter() {}
 
 	void WriteModRM(int mod, int rm, int reg);
@@ -312,6 +356,9 @@ public:
 	const u8 *GetCodePtr() const;
 	u8 *GetWritableCodePtr();
 
+	void LockFlags() { flags_locked = true; }
+	void UnlockFlags() { flags_locked = false; }
+
 	// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
 	// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
 	// INC and DEC are slow on Intel Core, but not on AMD. They create a
@@ -322,7 +369,7 @@ public:
 	void INT3();
 
 	// Do nothing
-	void NOP(int count = 1); //nop padding - TODO: fast nop slides, for amd and intel (check their manuals)
+	void NOP(size_t count = 1);
 
 	// Save energy in wait-loops on P4 only. Probably not too useful.
 	void PAUSE();
@@ -459,6 +506,14 @@ public:
 	void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
 	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
+	// Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+	void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
+
+	// Available only on AMD >= Phenom or Intel >= Haswell
+	void LZCNT(int bits, X64Reg dest, OpArg src);
+	// Note: this one is actually part of BMI1
+	void TZCNT(int bits, X64Reg dest, OpArg src);
+
 	// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
 	void STMXCSR(OpArg memloc);
 	void LDMXCSR(OpArg memloc);
@@ -467,7 +522,31 @@ public:
 	void LOCK();
 	void REP();
 	void REPNE();
+	void FSOverride();
+	void GSOverride();
 
+	// x87
+	enum x87StatusWordBits {
+		x87_InvalidOperation = 0x1,
+		x87_DenormalizedOperand = 0x2,
+		x87_DivisionByZero = 0x4,
+		x87_Overflow = 0x8,
+		x87_Underflow = 0x10,
+		x87_Precision = 0x20,
+		x87_StackFault = 0x40,
+		x87_ErrorSummary = 0x80,
+		x87_C0 = 0x100,
+		x87_C1 = 0x200,
+		x87_C2 = 0x400,
+		x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
+		x87_C3 = 0x4000,
+		x87_FPUBusy = 0x8000,
+	};
+
+	void FLD(int bits, OpArg src);
+	void FST(int bits, OpArg dest);
+	void FSTP(int bits, OpArg dest);
+	void FNSTSW_AX();
 	void FWAIT();
 
 	// SSE/SSE2: Floating point arithmetic
@@ -490,14 +569,6 @@ public:
 	// SSE/SSE2: Floating point bitwise (yes)
 	void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
 	void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
-	void ANDSS(X64Reg regOp, OpArg arg);
-	void ANDSD(X64Reg regOp, OpArg arg);
-	void ANDNSS(X64Reg regOp, OpArg arg);
-	void ANDNSD(X64Reg regOp, OpArg arg);
-	void ORSS(X64Reg regOp, OpArg arg);
-	void ORSD(X64Reg regOp, OpArg arg);
-	void XORSS(X64Reg regOp, OpArg arg);
-	void XORSD(X64Reg regOp, OpArg arg);
 
 	inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); }
 	inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); }
@@ -543,11 +614,8 @@ public:
 	// SSE/SSE2: Useful alternative to shuffle in some cases.
 	void MOVDDUP(X64Reg regOp, OpArg arg);
 
-	// THESE TWO ARE NEW AND UNTESTED
 	void UNPCKLPS(X64Reg dest, OpArg src);
 	void UNPCKHPS(X64Reg dest, OpArg src);
-
-	// These are OK.
 	void UNPCKLPD(X64Reg dest, OpArg src);
 	void UNPCKHPD(X64Reg dest, OpArg src);
 
@@ -568,7 +636,6 @@ public:
 	void MOVUPS(OpArg arg, X64Reg regOp);
 	void MOVUPD(OpArg arg, X64Reg regOp);
 
-	// Integers (NOTE: untested - I added these then it turned out I didn't have a use for them after all).
 	void MOVDQA(X64Reg regOp, OpArg arg);
 	void MOVDQA(OpArg arg, X64Reg regOp);
 	void MOVDQU(X64Reg regOp, OpArg arg);
@@ -579,6 +646,14 @@ public:
 	void MOVSS(OpArg arg, X64Reg regOp);
 	void MOVSD(OpArg arg, X64Reg regOp);
 
+	void MOVLPD(X64Reg regOp, OpArg arg);
+	void MOVHPD(X64Reg regOp, OpArg arg);
+	void MOVLPD(OpArg arg, X64Reg regOp);
+	void MOVHPD(OpArg arg, X64Reg regOp);
+
+	void MOVHLPS(X64Reg regOp1, X64Reg regOp2);
+	void MOVLHPS(X64Reg regOp1, X64Reg regOp2);
+
 	void MOVD_xmm(X64Reg dest, const OpArg &arg);
 	void MOVQ_xmm(X64Reg dest, OpArg arg);
 	void MOVD_xmm(const OpArg &arg, X64Reg src);
@@ -596,37 +671,34 @@ public:
 	void CVTPS2PD(X64Reg dest, OpArg src);
 	void CVTPD2PS(X64Reg dest, OpArg src);
 	void CVTSS2SD(X64Reg dest, OpArg src);
+	void CVTSI2SS(X64Reg dest, OpArg src);
 	void CVTSD2SS(X64Reg dest, OpArg src);
-	void CVTSD2SI(X64Reg dest, OpArg src);
+	void CVTSI2SD(X64Reg dest, OpArg src);
 	void CVTDQ2PD(X64Reg regOp, OpArg arg);
 	void CVTPD2DQ(X64Reg regOp, OpArg arg);
 	void CVTDQ2PS(X64Reg regOp, OpArg arg);
 	void CVTPS2DQ(X64Reg regOp, OpArg arg);
 
-	void CVTTSS2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
 	void CVTTPS2DQ(X64Reg regOp, OpArg arg);
-	void CVTSI2SS(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
-	void CVTSS2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
-	void CVTTSD2SI(X64Reg xregdest, OpArg arg);  // Yeah, destination really is a GPR like EAX!
-	void CVTTPD2DQ(X64Reg xregdest, OpArg arg);
+	void CVTTPD2DQ(X64Reg regOp, OpArg arg);
+
+	// Destinations are general-purpose registers (rax, rbx, ...), not XMM registers, for these instructions.
+	void CVTSS2SI(X64Reg xregdest, OpArg src);
+	void CVTSD2SI(X64Reg xregdest, OpArg src);
+	void CVTTSS2SI(X64Reg xregdest, OpArg arg);
+	void CVTTSD2SI(X64Reg xregdest, OpArg arg);
 
 	// SSE2: Packed integer instructions
 	void PACKSSDW(X64Reg dest, OpArg arg);
 	void PACKSSWB(X64Reg dest, OpArg arg);
-	//void PACKUSDW(X64Reg dest, OpArg arg);
+	void PACKUSDW(X64Reg dest, OpArg arg);
 	void PACKUSWB(X64Reg dest, OpArg arg);
 
 	void PUNPCKLBW(X64Reg dest, const OpArg &arg);
 	void PUNPCKLWD(X64Reg dest, const OpArg &arg);
 	void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
 
-	void PMOVSXBW(X64Reg dest, const OpArg &arg);
-	void PMOVSXBD(X64Reg dest, const OpArg &arg);
-	void PMOVSXWD(X64Reg dest, const OpArg &arg);
-	void PMOVZXBW(X64Reg dest, const OpArg &arg);
-	void PMOVZXBD(X64Reg dest, const OpArg &arg);
-	void PMOVZXWD(X64Reg dest, const OpArg &arg);
-
+	void PTEST(X64Reg dest, OpArg arg);
 	void PAND(X64Reg dest, OpArg arg);
 	void PANDN(X64Reg dest, OpArg arg);
 	void PXOR(X64Reg dest, OpArg arg);
@@ -680,29 +752,75 @@ public:
 	void PSHUFB(X64Reg dest, OpArg arg);
 
 	void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
+	void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle);
 
 	void PSRLW(X64Reg reg, int shift);
 	void PSRLD(X64Reg reg, int shift);
 	void PSRLQ(X64Reg reg, int shift);
+	void PSRLQ(X64Reg reg, OpArg arg);
+	void PSRLDQ(X64Reg reg, int shift);
 
 	void PSLLW(X64Reg reg, int shift);
 	void PSLLD(X64Reg reg, int shift);
 	void PSLLQ(X64Reg reg, int shift);
-
-	void PSRLDQ(X64Reg reg, int shift);
 	void PSLLDQ(X64Reg reg, int shift);
 
 	void PSRAW(X64Reg reg, int shift);
 	void PSRAD(X64Reg reg, int shift);
 
+	// SSE4.1: packed integer sign-extension (PMOVSX*) and zero-extension (PMOVZX*)
+	void PMOVSXBW(X64Reg dest, OpArg arg);
+	void PMOVSXBD(X64Reg dest, OpArg arg);
+	void PMOVSXBQ(X64Reg dest, OpArg arg);
+	void PMOVSXWD(X64Reg dest, OpArg arg);
+	void PMOVSXWQ(X64Reg dest, OpArg arg);
+	void PMOVSXDQ(X64Reg dest, OpArg arg);
+	void PMOVZXBW(X64Reg dest, OpArg arg);
+	void PMOVZXBD(X64Reg dest, OpArg arg);
+	void PMOVZXBQ(X64Reg dest, OpArg arg);
+	void PMOVZXWD(X64Reg dest, OpArg arg);
+	void PMOVZXWQ(X64Reg dest, OpArg arg);
+	void PMOVZXDQ(X64Reg dest, OpArg arg);
+
+	// SSE4.1: variable blend instructions (XMM0 is the implicit mask operand)
+	void PBLENDVB(X64Reg dest, OpArg arg);
+	void BLENDVPS(X64Reg dest, OpArg arg);
+	void BLENDVPD(X64Reg dest, OpArg arg);
+
 	// AVX
 	void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 	void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle);
+	void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
 
-	void RTDSC();
+	// VEX-encoded general-purpose register instructions (BMI1/BMI2)
+	void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate);
+	void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void BLSR(int bits, X64Reg regOp, OpArg arg);
+	void BLSMSK(int bits, X64Reg regOp, OpArg arg);
+	void BLSI(int bits, X64Reg regOp, OpArg arg);
+	void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
+	void RDTSC();
 
 	// Utility functions
 	// The difference between this and CALL is that this aligns the stack
@@ -719,6 +837,7 @@ public:
 	void ABI_CallFunctionC16(const void *func, u16 param1);
 	void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2);
 
+
 	// These only support u32 parameters, but that's enough for a lot of uses.
 	// These will destroy the 1 or 2 first "parameter regs".
 	void ABI_CallFunctionC(const void *func, u32 param1);
@@ -736,8 +855,8 @@ public:
 	void ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2);
 
 	// Pass a register as a parameter.
-	void ABI_CallFunctionR(const void *func, Gen::X64Reg reg1);
-	void ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2);
+	void ABI_CallFunctionR(const void *func, X64Reg reg1);
+	void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2);
 
 	template <typename Tr, typename T1>
 	void ABI_CallFunctionC(Tr (*func)(T1), u32 param1) {
@@ -822,4 +941,4 @@ public:
 
 }  // namespace
 
-#endif // _DOLPHIN_INTEL_CODEGEN_
+#endif