samplerjit: Better vectorize UV linear calc.

Gives about a 1-2% speedup when mips are used.
Unknown W. Brackets 2022-01-24 20:14:00 -08:00
parent 733046962f
commit c1e657ed47
5 changed files with 124 additions and 135 deletions
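
For a rough picture of the change: instead of scaling S and T one at a time with scalar MULSS, the sampler now keeps U/V (and, when mips are enabled, the next level's U1/V1) packed in a single XMM register and scales them with one MULPS. A hedged intrinsics sketch of the simplest case, the no-mips path; this is an illustration, not the emitted JIT code, and the names are invented:

	// Illustrative only: roughly what the "easy mode" (no mips) path computes,
	// written with SSE intrinsics rather than the JIT emitter.
	#include <immintrin.h>

	// s, t are 0..1 texture coordinates; width/height are the level-0 size.
	static inline __m128i PackedUV256(float s, float t, int width, int height) {
		// Pack S and T into one register (UNPCKLPS in the JIT); the upper
		// two lanes don't matter for the no-mips case.
		__m128 st = _mm_set_ps(0.0f, 0.0f, t, s);
		// One MULPS against the new constWidthHeight256f_ constant,
		// laid out as [W*256, H*256, W*256, H*256].
		__m128 wh256 = _mm_set_ps(height * 256.0f, width * 256.0f,
		                          height * 256.0f, width * 256.0f);
		// CVTPS2DQ: lanes 0 and 1 now hold U and V in 24.8 fixed point.
		return _mm_cvtps_epi32(_mm_mul_ps(st, wh256));
	}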

View file

@@ -1920,8 +1920,30 @@ void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg) {WriteSSEOp(0x66, 0x
void XEmitter::PEXTRB(OpArg dest, X64Reg arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A14, arg, dest, 1); Write8(subreg);}
void XEmitter::PEXTRW(OpArg dest, X64Reg arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A15, arg, dest, 1); Write8(subreg);}
void XEmitter::PEXTRD(OpArg dest, X64Reg arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A16, arg, dest, 1); Write8(subreg);}
void XEmitter::PEXTRQ(OpArg dest, X64Reg arg, u8 subreg) {
_assert_msg_(cpu_info.bSSE4_1, "Trying to use SSE4.1 on a system that doesn't support it.");
Write8(0x66);
dest.operandReg = arg;
dest.WriteRex(this, 64, 0);
Write8(0x0F);
Write8(0x3A);
Write8(0x16);
dest.WriteRest(this, 1);
Write8(subreg);
}
void XEmitter::PINSRB(X64Reg dest, OpArg arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A20, dest, arg, 1); Write8(subreg);}
void XEmitter::PINSRD(X64Reg dest, OpArg arg, u8 subreg) {WriteSSE41Op(0x66, 0x3A22, dest, arg, 1); Write8(subreg);}
void XEmitter::PINSRQ(X64Reg dest, OpArg arg, u8 subreg) {
_assert_msg_(cpu_info.bSSE4_1, "Trying to use SSE4.1 on a system that doesn't support it.");
Write8(0x66);
arg.operandReg = dest;
arg.WriteRex(this, 64, 0);
Write8(0x0F);
Write8(0x3A);
Write8(0x22);
arg.WriteRest(this, 1);
Write8(subreg);
}
void XEmitter::PMADDWD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0xF5, dest, arg); }
void XEmitter::PMADDUBSW(X64Reg dest, OpArg arg) {WriteSSSE3Op(0x66, 0x3804, dest, arg);}
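
The two emitter additions above provide the memory-operand forms of SSE4.1's PEXTRQ and PINSRQ (66 REX.W 0F 3A 16 and 66 REX.W 0F 3A 22, matching the bytes written), so a 64-bit XMM lane can be moved to or from memory in one instruction; the extract form is used later in this commit to spill the level-1 fraction values to the stack. A rough intrinsics picture of the extract side (illustration only; the helper name is invented):

	#include <immintrin.h>
	#include <cstdint>

	// Store the upper 64-bit lane of an XMM register straight to memory,
	// which is what the new memory-destination PEXTRQ is used for below.
	static inline void StoreHigh64(uint64_t *dst, __m128i v) {
	#if defined(__SSE4_1__)
		*dst = (uint64_t)_mm_extract_epi64(v, 1);              // pextrq [dst], xmm, 1
	#else
		// Pre-SSE4.1 fallback, in the spirit of the PSRLDQ + MOVQ path below.
		_mm_storel_epi64((__m128i *)dst, _mm_srli_si128(v, 8));
	#endif
	}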

View file

@@ -817,8 +817,10 @@ public:
void PEXTRB(OpArg dest, X64Reg arg, u8 subreg);
void PEXTRW(OpArg dest, X64Reg arg, u8 subreg);
void PEXTRD(OpArg dest, X64Reg arg, u8 subreg);
void PEXTRQ(OpArg dest, X64Reg arg, u8 subreg);
void PINSRB(X64Reg dest, OpArg arg, u8 subreg);
void PINSRD(X64Reg dest, OpArg arg, u8 subreg);
void PINSRQ(X64Reg dest, OpArg arg, u8 subreg);
void PMADDWD(X64Reg dest, OpArg arg);
void PMADDUBSW(X64Reg dest, OpArg arg);

View file

@@ -105,7 +105,7 @@ void SamplerJitCache::Clear() {
const10Low_ = nullptr;
const10All8_ = nullptr;
constWidth256f_ = nullptr;
constWidthHeight256f_ = nullptr;
constHeight256f_ = nullptr;
constWidthMinus1i_ = nullptr;
constHeightMinus1i_ = nullptr;

View file

@@ -104,7 +104,7 @@ private:
int stackFracUV1Offset_ = 0;
#endif
const u8 *constWidth256f_ = nullptr;
const u8 *constWidthHeight256f_ = nullptr;
const u8 *constHeight256f_ = nullptr;
const u8 *constWidthMinus1i_ = nullptr;
const u8 *constHeightMinus1i_ = nullptr;

View file

@@ -871,14 +871,18 @@ void SamplerJitCache::WriteConstantPool(const SamplerID &id) {
// These are unique to the sampler ID.
if (!id.hasAnyMips) {
float w256f = (1 << id.width0Shift) * 256;
WriteDynamicConst4x32(constWidth256f_, *(uint32_t *)&w256f);
float h256f = (1 << id.height0Shift) * 256;
constWidthHeight256f_ = AlignCode16();
Write32(*(uint32_t *)&w256f);
Write32(*(uint32_t *)&h256f);
Write32(*(uint32_t *)&w256f);
Write32(*(uint32_t *)&h256f);
WriteDynamicConst4x32(constHeight256f_, *(uint32_t *)&h256f);
WriteDynamicConst4x32(constWidthMinus1i_, (1 << id.width0Shift) - 1);
WriteDynamicConst4x32(constHeightMinus1i_, (1 << id.height0Shift) - 1);
} else {
constWidth256f_ = nullptr;
constWidthHeight256f_ = nullptr;
constHeight256f_ = nullptr;
constWidthMinus1i_ = nullptr;
constHeightMinus1i_ = nullptr;
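
The new constWidthHeight256f_ pool entry above packs the level-0 width and height, times 256, as [W, H, W, H]. That lets the quad path scale a packed [s, t, s, t] vector with one MULPS, while the scalar path can still MULSS against element 0 for the width alone. A host-side sketch of the layout (illustrative only; the struct and helper names are invented, and the values mirror the Write32 calls):

	// Sketch of the constant emitted above; not the JIT pool code itself.
	struct alignas(16) WidthHeight256f {
		float v[4];  // { W*256, H*256, W*256, H*256 }
	};

	static WidthHeight256f MakeWidthHeight256f(int width0Shift, int height0Shift) {
		float w256f = (1 << width0Shift) * 256.0f;
		float h256f = (1 << height0Shift) * 256.0f;
		// W, H, W, H: one MULPS scales a packed [s, t, s, t] vector, and the
		// scalar path can still MULSS against element 0 (width only).
		return WidthHeight256f{{ w256f, h256f, w256f, h256f }};
	}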
@@ -2394,7 +2398,7 @@ bool SamplerJitCache::Jit_GetTexelCoords(const SamplerID &id) {
X64Reg vReg = regCache_.Alloc(RegCache::GEN_ARG_V);
X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
if (constWidth256f_ == nullptr) {
if (constWidthHeight256f_ == nullptr) {
// We have to figure out levels and the proper width, ugh.
X64Reg idReg = GetSamplerID();
X64Reg tempReg = regCache_.Alloc(RegCache::GEN_TEMP0);
@@ -2458,7 +2462,7 @@ bool SamplerJitCache::Jit_GetTexelCoords(const SamplerID &id) {
regCache_.Release(tempVecReg, RegCache::VEC_TEMP0);
} else {
// Multiply, then convert to integer...
MULSS(sReg, M(constWidth256f_));
MULSS(sReg, M(constWidthHeight256f_));
MULSS(tReg, M(constHeight256f_));
CVTTPS2DQ(sReg, R(sReg));
CVTTPS2DQ(tReg, R(tReg));
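
A small detail worth noting from the hunk above: MULSS only reads lane 0, so replacing constWidth256f_ with the combined constant still scales S by the width alone in this scalar path. Illustrative intrinsics version (names invented):

	#include <immintrin.h>

	// Lane 0 of wh256 holds W*256, so a scalar multiply picks up only the width.
	static inline float ScaleSByWidth256(float s, const float wh256[4]) {
		__m128 v = _mm_mul_ss(_mm_set_ss(s), _mm_loadu_ps(wh256));  // MULSS: lane 0 only
		return _mm_cvtss_f32(v);
	}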
@@ -2517,13 +2521,11 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
X64Reg sReg = regCache_.Find(RegCache::VEC_ARG_S);
X64Reg tReg = regCache_.Find(RegCache::VEC_ARG_T);
// Start by multiplying with the width/height... which might be complex with mips.
X64Reg width0VecReg = INVALID_REG;
X64Reg height0VecReg = INVALID_REG;
X64Reg width1VecReg = INVALID_REG;
X64Reg height1VecReg = INVALID_REG;
// We use this later, if there are mips, to apply wrap/clamp.
X64Reg sizesReg = INVALID_REG;
if (constWidth256f_ == nullptr) {
// Start by multiplying with the width/height... which might be complex with mips.
if (id.hasAnyMips) {
// We have to figure out levels and the proper width, ugh.
X64Reg idReg = GetSamplerID();
@@ -2539,8 +2541,16 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
}
// This will load the current and next level's sizes, 16x4.
X64Reg sizesReg = regCache_.Alloc(RegCache::VEC_TEMP5);
MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
sizesReg = regCache_.Alloc(RegCache::VEC_TEMP5);
// We actually want this in 32-bit, though, so extend.
if (cpu_info.bSSE4_1) {
PMOVZXWD(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
} else {
MOVQ_xmm(sizesReg, MComplex(idReg, levelReg, SCALE_4, offsetof(SamplerID, cached.sizes[0].w)));
X64Reg zeroReg = GetZeroVec();
PUNPCKLWD(sizesReg, R(zeroReg));
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
}
if (releaseLevelReg)
regCache_.Release(levelReg, RegCache::GEN_ARG_LEVEL);
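
The block above loads the current and next mip level's width and height as four u16 values and widens them to 32-bit lanes, preferring PMOVZXWD on SSE4.1 and falling back to MOVQ plus PUNPCKLWD against zero. An intrinsics sketch of the same widen (illustrative; the JIT chooses at runtime via cpu_info.bSSE4_1, the sketch uses a compile-time switch, and 'sizes' stands in for the cached.sizes data):

	#include <immintrin.h>
	#include <cstdint>

	// sizes points at {w0, h0, w1, h1} as packed u16; the result has one
	// 32-bit lane per value.
	static inline __m128i LoadSizes32(const uint16_t *sizes) {
	#if defined(__SSE4_1__)
		// PMOVZXWD zero-extends four u16 values straight from memory.
		return _mm_cvtepu16_epi32(_mm_loadl_epi64((const __m128i *)sizes));
	#else
		// Otherwise load the 64 bits and interleave with zeros (PUNPCKLWD).
		__m128i packed = _mm_loadl_epi64((const __m128i *)sizes);
		return _mm_unpacklo_epi16(packed, _mm_setzero_si128());
	#endif
	}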
@@ -2548,75 +2558,31 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
regCache_.Unlock(levelReg, RegCache::GEN_ARG_LEVEL);
UnlockSamplerID(idReg);
X64Reg tempVecReg = regCache_.Alloc(RegCache::VEC_TEMP0);
auto loadSizeAndMul = [&](X64Reg dest, X64Reg size, bool isY, bool isLevel1) {
// Grab the size and mask out 16 bits using walls.
if (cpu_info.bSSE4_1) {
int lane = (isY ? 1 : 0) + (isLevel1 ? 2 : 0);
PMOVZXWD(size, R(sizesReg));
PSHUFD(size, R(size), _MM_SHUFFLE(lane, lane, lane, lane));
} else {
if (size != sizesReg)
MOVDQA(size, R(sizesReg));
PSLLQ(size, 48 - (isY ? 16 : 0) - (isLevel1 ? 32 : 0));
PSRLQ(size, 48);
PSHUFD(size, R(size), _MM_SHUFFLE(0, 0, 0, 0));
}
// Now make a float version of sizesReg, times 256.
X64Reg sizes256Reg = regCache_.Alloc(RegCache::VEC_TEMP0);
PSLLD(sizes256Reg, sizesReg, 8);
CVTDQ2PS(sizes256Reg, R(sizes256Reg));
PSLLD(tempVecReg, size, 8);
CVTDQ2PS(tempVecReg, R(tempVecReg));
// And then multiply.
MULPS(dest, R(tempVecReg));
};
// Next off, move S and T into a single reg, which will become U0 V0 U1 V1.
UNPCKLPS(sReg, R(tReg));
SHUFPS(sReg, R(sReg), _MM_SHUFFLE(1, 0, 1, 0));
// And multiply by the sizes, all lined up already.
MULPS(sReg, R(sizes256Reg));
regCache_.Release(sizes256Reg, RegCache::VEC_TEMP0);
// Copy out S and T so we can multiply.
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
MOVAPS(u1Reg, R(sReg));
MOVAPS(v1Reg, R(tReg));
// Load width and height for the given level, and multiply sReg/tReg meanwhile.
width0VecReg = regCache_.Alloc(RegCache::VEC_TEMP2);
loadSizeAndMul(sReg, width0VecReg, false, false);
height0VecReg = regCache_.Alloc(RegCache::VEC_TEMP3);
loadSizeAndMul(tReg, height0VecReg, true, false);
// And same for the next level, but with u1Reg/v1Reg.
width1VecReg = regCache_.Alloc(RegCache::VEC_TEMP4);
loadSizeAndMul(u1Reg, width1VecReg, false, true);
// We reuse this one we allocated above.
height1VecReg = sizesReg;
loadSizeAndMul(v1Reg, height1VecReg, true, true);
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
// Now just subtract one. We use this later for clamp/wrap.
MOVDQA(tempVecReg, M(constOnes32_));
PSUBD(width0VecReg, R(tempVecReg));
PSUBD(height0VecReg, R(tempVecReg));
PSUBD(width1VecReg, R(tempVecReg));
PSUBD(height1VecReg, R(tempVecReg));
regCache_.Release(tempVecReg, RegCache::VEC_TEMP0);
// For wrap/clamp purposes, we want width or height minus one. Do that now.
PSUBD(sizesReg, M(constOnes32_));
} else {
// Easy mode.
MULSS(sReg, M(constWidth256f_));
MULSS(tReg, M(constHeight256f_));
UNPCKLPS(sReg, R(tReg));
MULPS(sReg, M(constWidthHeight256f_));
}
// And now, convert to integers for all later processing.
CVTPS2DQ(sReg, R(sReg));
CVTPS2DQ(tReg, R(tReg));
if (regCache_.Has(RegCache::VEC_U1)) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
CVTPS2DQ(u1Reg, R(u1Reg));
CVTPS2DQ(v1Reg, R(v1Reg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
// Now adjust X and Y...
// TODO: Could we cache this? Should only vary on offset, maybe?
X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);
X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
NEG(32, R(xReg));
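
Taken together, the replacement in this hunk turns four scalar multiplies into one packed path: S and T are interleaved and duplicated into [s, t, s, t], the u16 sizes are shifted left by 8 and converted to float, and a single MULPS plus CVTPS2DQ yields [U0, V0, U1, V1] in 24.8 fixed point in one register. A minimal sketch under those assumptions (intrinsics illustration, not the emitted code; see LoadSizes32 above for the sizes vector):

	#include <immintrin.h>

	// sizes32 holds {w0, h0, w1, h1} as 32-bit ints.
	static inline __m128i TexelCoordsQuadUV256(float s, float t, __m128i sizes32) {
		// UNPCKLPS + SHUFPS in the JIT: build [s, t, s, t] from the two scalars.
		__m128 st = _mm_set_ps(t, s, t, s);
		// PSLLD by 8, then CVTDQ2PS: sizes * 256 as floats, already lane-aligned.
		__m128 sizes256 = _mm_cvtepi32_ps(_mm_slli_epi32(sizes32, 8));
		// One MULPS scales all four lanes: U0, V0, U1, V1.
		__m128 uv = _mm_mul_ps(st, sizes256);
		// CVTPS2DQ gives 24.8 fixed-point coordinates for both mip levels.
		return _mm_cvtps_epi32(uv);
	}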
@@ -2629,19 +2595,9 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
// Add them in. We do this in the SSE because we have more to do there...
X64Reg tempXYReg = regCache_.Alloc(RegCache::VEC_TEMP0);
MOVQ_xmm(tempXYReg, R(xReg));
if (id.hasAnyMips)
PSHUFD(tempXYReg, R(tempXYReg), _MM_SHUFFLE(1, 0, 1, 0));
PADDD(sReg, R(tempXYReg));
if (regCache_.Has(RegCache::VEC_U1)) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
PADDD(u1Reg, R(tempXYReg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
}
PSHUFD(tempXYReg, R(tempXYReg), _MM_SHUFFLE(1, 1, 1, 1));
PADDD(tReg, R(tempXYReg));
if (regCache_.Has(RegCache::VEC_V1)) {
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
PADDD(v1Reg, R(tempXYReg));
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
regCache_.Release(tempXYReg, RegCache::VEC_TEMP0);
regCache_.Unlock(xReg, RegCache::GEN_ARG_X);
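
With the coordinates packed, the X/Y screen offsets can also be folded in with a single PADDD: the 64-bit MOVQ loads the packed x/y pair (built in GPR code partly outside this hunk), and with mips it is broadcast to [x, y, x, y] so the one add covers U0, V0, U1 and V1 at once. Small sketch (illustrative; 'xy' holds x in lane 0 and y in lane 1):

	#include <immintrin.h>

	static inline __m128i AddXYOffsets(__m128i uv, __m128i xy, bool hasAnyMips) {
		if (hasAnyMips) {
			// PSHUFD(1, 0, 1, 0): broadcast [x, y] to [x, y, x, y].
			xy = _mm_shuffle_epi32(xy, _MM_SHUFFLE(1, 0, 1, 0));
		}
		return _mm_add_epi32(uv, xy);  // one PADDD for every lane
	}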
@@ -2652,54 +2608,54 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
// We do want the fraction, though, so extract that.
X64Reg fracUReg = regCache_.Find(RegCache::GEN_ARG_FRAC_U);
X64Reg fracVReg = regCache_.Find(RegCache::GEN_ARG_FRAC_V);
if (regCache_.Has(RegCache::VEC_U1)) {
// Start with the next level so we end with current in the regs.
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
MOVD_xmm(R(fracUReg), u1Reg);
MOVD_xmm(R(fracVReg), v1Reg);
SHR(32, R(fracUReg), Imm8(4));
AND(32, R(fracUReg), Imm8(0x0F));
SHR(32, R(fracVReg), Imm8(4));
AND(32, R(fracVReg), Imm8(0x0F));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
// Store them on the stack for now.
MOV(32, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_), R(fracUReg));
MOV(32, MDisp(RSP, stackArgPos_ + stackFracUV1Offset_ + 4), R(fracVReg));
X64Reg fracExtractReg = regCache_.Alloc(RegCache::VEC_TEMP0);
// We only want the four bits after the first four, though.
PSLLD(fracExtractReg, sReg, 24);
PSRLD(fracExtractReg, 28);
// Okay, grab the regs.
if (cpu_info.bSSE4_1) {
MOVD_xmm(R(fracUReg), fracExtractReg);
PEXTRD(R(fracVReg), fracExtractReg, 1);
} else {
MOVD_xmm(R(fracUReg), fracExtractReg);
PSRLDQ(fracExtractReg, 4);
MOVD_xmm(R(fracVReg), fracExtractReg);
}
MOVD_xmm(R(fracUReg), sReg);
MOVD_xmm(R(fracVReg), tReg);
SHR(32, R(fracUReg), Imm8(4));
AND(32, R(fracUReg), Imm8(0x0F));
SHR(32, R(fracVReg), Imm8(4));
AND(32, R(fracVReg), Imm8(0x0F));
// And store frac U1/V1 on the stack, if we have mips.
if (id.hasAnyMips) {
if (cpu_info.bSSE4_1) {
// They're next to each other, so one extract is enough.
PEXTRQ(MDisp(RSP, stackArgPos_ + stackFracUV1Offset_), fracExtractReg, 1);
} else {
// We already shifted 4 for fracVReg, shift again and store.
PSRLDQ(fracExtractReg, 4);
MOVQ_xmm(MDisp(RSP, stackArgPos_ + stackFracUV1Offset_), fracExtractReg);
}
}
regCache_.Release(fracExtractReg, RegCache::VEC_TEMP0);
regCache_.Unlock(fracUReg, RegCache::GEN_ARG_FRAC_U);
regCache_.Unlock(fracVReg, RegCache::GEN_ARG_FRAC_V);
// Get rid of the fractional bits, and spread out.
PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(tReg, R(tReg), _MM_SHUFFLE(0, 0, 0, 0));
// With those extracted, we can now get rid of the fractional bits.
PSRAD(sReg, 8);
PSRAD(tReg, 8);
// Add U/V values for the next coords.
PADDD(sReg, M(constUNext_));
PADDD(tReg, M(constVNext_));
if (regCache_.Has(RegCache::VEC_U1)) {
// Now it's time to separate the lanes into separate registers and add next UV offsets.
if (id.hasAnyMips) {
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
PSHUFD(u1Reg, R(u1Reg), _MM_SHUFFLE(0, 0, 0, 0));
PSHUFD(v1Reg, R(v1Reg), _MM_SHUFFLE(0, 0, 0, 0));
PSRAD(u1Reg, 8);
PSRAD(v1Reg, 8);
PSHUFD(u1Reg, R(sReg), _MM_SHUFFLE(2, 2, 2, 2));
PSHUFD(v1Reg, R(sReg), _MM_SHUFFLE(3, 3, 3, 3));
PADDD(u1Reg, M(constUNext_));
PADDD(v1Reg, M(constVNext_));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
}
PSHUFD(tReg, R(sReg), _MM_SHUFFLE(1, 1, 1, 1));
PSHUFD(sReg, R(sReg), _MM_SHUFFLE(0, 0, 0, 0));
PADDD(tReg, M(constVNext_));
PADDD(sReg, M(constUNext_));
X64Reg temp0ClampReg = regCache_.Alloc(RegCache::VEC_TEMP0);
bool temp0ClampZero = false;
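
This hunk extracts the 4-bit filter fractions for all lanes at once (PSLLD by 24 then PSRLD by 28 keeps bits 4..7), pulls frac U/V into GPRs with MOVD/PEXTRD, spills the level-1 fractions with the new memory-form PEXTRQ when mips are present, and only then shifts the fractional bits away and fans the lanes back out into separate registers. An intrinsics sketch of the extract-and-split, assuming uv holds [U0, V0, U1, V1] in 24.8 fixed point (names invented; the level-1 fraction spill is omitted):

	#include <immintrin.h>
	#include <cstdint>

	struct QuadCoords {
		__m128i u0, v0, u1, v1;   // integer texel coords, lane-broadcast
		uint32_t fracU0, fracV0;  // 4-bit linear filter fractions
	};

	static inline QuadCoords SplitUVLanes(__m128i uv) {
		QuadCoords out;
		// PSLLD 24 then PSRLD 28: keep bits 4..7 of each lane, the filter fraction.
		__m128i frac = _mm_srli_epi32(_mm_slli_epi32(uv, 24), 28);
		out.fracU0 = (uint32_t)_mm_cvtsi128_si32(frac);              // MOVD
	#if defined(__SSE4_1__)
		out.fracV0 = (uint32_t)_mm_extract_epi32(frac, 1);           // PEXTRD
	#else
		out.fracV0 = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(frac, 4));
	#endif
		// Drop the 8 fractional bits, then fan each lane out (PSRAD + PSHUFD).
		__m128i texels = _mm_srai_epi32(uv, 8);
		out.u0 = _mm_shuffle_epi32(texels, _MM_SHUFFLE(0, 0, 0, 0));
		out.v0 = _mm_shuffle_epi32(texels, _MM_SHUFFLE(1, 1, 1, 1));
		out.u1 = _mm_shuffle_epi32(texels, _MM_SHUFFLE(2, 2, 2, 2));
		out.v1 = _mm_shuffle_epi32(texels, _MM_SHUFFLE(3, 3, 3, 3));
		return out;
	}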
@@ -2724,8 +2680,12 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
PANDN(temp0ClampReg, R(stReg));
// Now make a mask where bound is greater than the ST value in temp0ClampReg.
MOVDQA(stReg, bound);
PCMPGTD(stReg, R(temp0ClampReg));
if (cpu_info.bAVX && bound.IsSimpleReg()) {
VPCMPGTD(128, stReg, bound.GetSimpleReg(), R(temp0ClampReg));
} else {
MOVDQA(stReg, bound);
PCMPGTD(stReg, R(temp0ClampReg));
}
// Throw away the values that are greater in our temp0ClampReg in progress result.
PAND(temp0ClampReg, R(stReg));
@@ -2736,26 +2696,31 @@ bool SamplerJitCache::Jit_GetTexelCoordsQuad(const SamplerID &id) {
}
};
doClamp(id.clampS, sReg, width0VecReg == INVALID_REG ? M(constWidthMinus1i_) : R(width0VecReg));
doClamp(id.clampT, tReg, height0VecReg == INVALID_REG ? M(constHeightMinus1i_) : R(height0VecReg));
if (width1VecReg != INVALID_REG) {
if (id.hasAnyMips) {
// We'll spread sizes out into a temp.
X64Reg spreadSizeReg = regCache_.Alloc(RegCache::VEC_TEMP1);
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(0, 0, 0, 0));
doClamp(id.clampS, sReg, R(spreadSizeReg));
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(1, 1, 1, 1));
doClamp(id.clampT, tReg, R(spreadSizeReg));
X64Reg u1Reg = regCache_.Find(RegCache::VEC_U1);
X64Reg v1Reg = regCache_.Find(RegCache::VEC_V1);
doClamp(id.clampS, u1Reg, R(width1VecReg));
doClamp(id.clampT, v1Reg, R(height1VecReg));
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(2, 2, 2, 2));
doClamp(id.clampS, u1Reg, R(spreadSizeReg));
PSHUFD(spreadSizeReg, R(sizesReg), _MM_SHUFFLE(3, 3, 3, 3));
doClamp(id.clampT, v1Reg, R(spreadSizeReg));
regCache_.Unlock(u1Reg, RegCache::VEC_U1);
regCache_.Unlock(v1Reg, RegCache::VEC_V1);
regCache_.Release(spreadSizeReg, RegCache::VEC_TEMP1);
} else {
doClamp(id.clampS, sReg, M(constWidthMinus1i_));
doClamp(id.clampT, tReg, M(constHeightMinus1i_));
}
if (width0VecReg != INVALID_REG)
regCache_.Release(width0VecReg, RegCache::VEC_TEMP2);
if (height0VecReg != INVALID_REG)
regCache_.Release(height0VecReg, RegCache::VEC_TEMP3);
if (width1VecReg != INVALID_REG)
regCache_.Release(width1VecReg, RegCache::VEC_TEMP4);
if (height1VecReg != INVALID_REG)
regCache_.Release(height1VecReg, RegCache::VEC_TEMP5);
if (sizesReg != INVALID_REG)
regCache_.Release(sizesReg, RegCache::VEC_TEMP5);
regCache_.Release(temp0ClampReg, RegCache::VEC_TEMP0);
regCache_.Unlock(sReg, RegCache::VEC_ARG_S);
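
Finally, the clamp path now spreads the packed sizes-minus-one value out per lane with PSHUFD instead of keeping four separate width/height registers, and on AVX uses the non-destructive three-operand VPCMPGTD to skip a MOVDQA copy. The general idea of the branchless clamp, as a sketch rather than a transcription of doClamp:

	#include <immintrin.h>

	// Clamp each 32-bit lane of 'coord' to [0, sizeMinus1] without branches.
	static inline __m128i ClampCoord(__m128i coord, __m128i sizeMinus1) {
		// Zero out negative lanes: keep coord only where coord > -1.
		__m128i positive = _mm_and_si128(coord, _mm_cmpgt_epi32(coord, _mm_set1_epi32(-1)));
		// Where sizeMinus1 > coord, keep coord; otherwise take sizeMinus1.
		__m128i keep = _mm_cmpgt_epi32(sizeMinus1, positive);
		return _mm_or_si128(_mm_and_si128(positive, keep),
		                    _mm_andnot_si128(keep, sizeMinus1));
	}

(Wrap mode can instead mask with the same size-minus-one value, since the width/height shifts above imply power-of-two dimensions.)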