From e270b955bb04b422e7edf4666fc48272a0839b13 Mon Sep 17 00:00:00 2001
From: Henrik Rydgard <hrydgard@gmail.com>
Date: Tue, 22 Mar 2016 18:36:08 +0100
Subject: [PATCH] x86/x64: Minor vertex decoder optimization

---
 GPU/Common/VertexDecoderX86.cpp | 49 ++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 949bb25d44..fd87d565d6 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -379,13 +379,48 @@ void VertexDecoderJitCache::Jit_WeightsU16ToFloat() {
 
 void VertexDecoderJitCache::Jit_WeightsFloat() {
 	int j;
-	for (j = 0; j < dec_->nweights; j++) {
-		MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
-		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
-	}
-	while (j & 3) {  // Zero additional weights rounding up to 4.
-		MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
-		j++;
+	switch (dec_->nweights) {  // Emits code copying nweights 32-bit weights from srcReg + weightoff to dstReg + w0off, zero-filling up to a multiple of 4.
+	case 1:
+		// MOVSS with a memory source clears the three high-order doublewords of the destination XMM register, so the single 16-byte store below writes the weight followed by three zeros.
+		MOVSS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+
+	// TODO: nweights == 2 could load both weights with MOVQ; left disabled until it is confirmed that the emitter provides MOVQ. Until then 2 weights take the default path.
+	//case 2:
+	//	MOVQ(XMM3, MDisp(srcReg, dec_->weightoff));
+	//	MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+	//	break;
+
+	case 4:  // Exactly one 16-byte vector, so no zero padding is needed.
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		break;
+
+	case 5:  // Weights 0-3 via MOVUPS; weight 4 via MOVSS, which zeroes the other three lanes of XMM4, giving the padding up to 8.
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVSS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
+		break;
+
+	case 8:  // Two full 16-byte vectors; both loads are issued before the stores.
+		MOVUPS(XMM3, MDisp(srcReg, dec_->weightoff));
+		MOVUPS(XMM4, MDisp(srcReg, dec_->weightoff + 16));
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off), XMM3);
+		MOVUPS(MDisp(dstReg, dec_->decFmt.w0off + 16), XMM4);
+		break;
+
+	default:  // Every other count (including 2 while the MOVQ case is disabled): copy one 32-bit value at a time through tempReg1.
+		for (j = 0; j < dec_->nweights; j++) {
+			MOV(32, R(tempReg1), MDisp(srcReg, dec_->weightoff + j * 4));
+			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), R(tempReg1));
+		}
+		while (j & 3) {  // Store zeros for the remaining slots until the number written is a multiple of 4.
+			MOV(32, MDisp(dstReg, dec_->decFmt.w0off + j * 4), Imm32(0));
+			j++;
+		}
+		break;
 	}
 }