WIP: vfpu_dot experiments

2025-04-02 11:01:50 -04:00 · 2019-08-06 12:16:44 +02:00 · 2019-08-06 12:16:44 +02:00 · 35f848423c
commit 35f848423c
parent 2ba998d007
3 changed files with 100 additions and 6 deletions
--- a/Core/MIPS/MIPSIntVFPU.cpp
+++ b/Core/MIPS/MIPSIntVFPU.cpp
@ -481,7 +481,8 @@ namespace MIPSInt
 				}

 				if (useAccurateDot) {
-					sum.f = vfpu_dot(&s[b * 4], &t[a * 4]);
+					// sum.f = vfpu_dot(&s[b * 4], &t[a * 4]);
+					sum.f = vfpu_dot_tekken_approx(&s[b * 4], &t[a * 4]);
 					if (my_isnan(sum.f)) {
 						sum.u = 0x7f800001;
 					} else if ((sum.u & 0x7F800000) == 0) {
--- a/Core/MIPS/MIPSVFPUUtils.cpp
+++ b/Core/MIPS/MIPSVFPUUtils.cpp
@ -17,8 +17,12 @@

 #include <cstdint>
 #include <limits>
-#include <stdio.h>
+#include <cstdio>

+#include "Common.h"
+#ifdef _M_SSE
+#include <emmintrin.h>
+#endif
 #include "Common/BitScan.h"
 #include "Common/CommonFuncs.h"
 #include "Core/Reporting.h"
@ -610,20 +614,20 @@ float Float16ToFloat32(unsigned short l)
 	return f;
 }

-static uint32_t get_uexp(uint32_t x) {
+inline uint32_t get_uexp(uint32_t x) {
 	return (x >> 23) & 0xFF;
 }

-int32_t get_exp(uint32_t x) {
+inline int32_t get_exp(uint32_t x) {
 	return get_uexp(x) - 127;
 }

-static int32_t get_mant(uint32_t x) {
+inline int32_t get_mant(uint32_t x) {
 	// Note: this returns the hidden 1.
 	return (x & 0x007FFFFF) | 0x00800000;
 }

-static int32_t get_sign(uint32_t x) {
+inline int32_t get_sign(uint32_t x) {
 	return x & 0x80000000;
 }

@ -744,6 +748,94 @@ float vfpu_dot(float a[4], float b[4]) {
 	return result.f;
 }

+// A cut-down version of vfpu_dot that works for Tekken VMMUL (no leg shaking).
+// The aim is to get this down to something that can be practically JIT'd.
+//
+// How it works: each product a[i] * b[i] is computed as a regular float and
+// split into sign / biased exponent / 24-bit mantissa. The mantissas are then
+// aligned to the largest exponent by truncating right shifts, summed as signed
+// integers, and the sum is renormalized (again by truncation, no rounding).
+//
+// Edge cases:
+//  - Products with a zero exponent field (zero or denormal) contribute nothing.
+//  - A sum of zero, or a result below the normal range, returns +0.0f.
+//  - A finite sum that carries past the largest normal exponent saturates to a
+//    signed infinity.
+//  - Inf/NaN products get no special treatment beyond the above; the caller is
+//    expected to canonicalize NaN results.
+float vfpu_dot_tekken_approx(float a[4], float b[4]) {
+	union float2int {
+		uint32_t i;
+		float f;
+	};
+	float2int result;
+	int32_t exps[4];
+	int32_t mants[4];
+	int32_t signs[4];
+	int32_t max_exp = 0;

+	// This can be SSE'd or NEON'd.
+#if 0 && defined(_M_SSE)
+	const __m128i mask_exp = _mm_set1_epi32(0xFF);
+	const __m128i mask_sign = _mm_set1_epi32(0x80000000);
+	const __m128i mask_mant = _mm_set1_epi32(0x007FFFFF);
+	const __m128i or_mask_mant = _mm_set1_epi32(0x00800000);

+	__m128i mm_prod = _mm_castps_si128(_mm_mul_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
+	__m128i mm_exp = _mm_and_si128(_mm_srli_epi32(mm_prod, 23), mask_exp);
+	__m128i mm_sign = _mm_and_si128(mm_prod, mask_sign);
+	__m128i mm_mant = _mm_or_si128(_mm_and_si128(mm_prod, mask_mant), or_mask_mant);

+	_mm_storeu_si128((__m128i *)exps, mm_exp);
+	_mm_storeu_si128((__m128i *)signs, mm_sign);
+	_mm_storeu_si128((__m128i *)mants, mm_mant);

+	// TODO: This in SSE... gah. There's an SSE4 one here: https://stackoverflow.com/questions/9877700/getting-max-value-in-a-m128i-vector-with-sse
+	for (int i = 0; i < 4; i++) {
+		if (exps[i] > max_exp) {
+			max_exp = exps[i];
+		}
+	}
+#else
+	for (int i = 0; i < 4; i++) {
+		float2int prod;
+		prod.f = a[i] * b[i];
+		exps[i] = get_uexp(prod.i);
+		signs[i] = get_sign(prod.i);
+		mants[i] = get_mant(prod.i);
+		if (exps[i] > max_exp) {
+			max_exp = exps[i];
+		}
+	}
+#endif

+	// This can be NEON'd (or AVX'd - SSE doesn't do per-lane variable shifts).
+	int32_t mant_sum = 0;
+	for (int i = 0; i < 4; i++) {
+		int exp = max_exp - exps[i];
+		if (exp >= 32 || exps[i] == 0) {
+			// Either too small to reach the sum, or a zero/denormal product. In the
+			// latter case the mantissa carries a hidden 1 that the value does not
+			// actually have, so it must not be added in.
+			mants[i] = 0;
+		} else {
+			mants[i] >>= exp;
+		}
+		if (signs[i]) {
+			mants[i] = -mants[i];
+		}
+		mant_sum += mants[i];
+	}

+	// Convert the signed sum back to sign + magnitude.
+	uint32_t sign_sum = 0;
+	if (mant_sum < 0) {
+		sign_sum = 0x80000000;
+		mant_sum = -mant_sum;
+	}

+	if (mant_sum == 0 || max_exp <= 0) {
+		return 0.0f;
+	}

+	// Renormalize so the leading 1 of the sum sits at bit 23 again.
+	int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
+	if (shift < 0) {
+		// Dropped the rounding.
+		mant_sum >>= -shift;
+		max_exp += -shift;
+	} else {
+		mant_sum <<= shift;
+		max_exp -= shift;
+	}

+	if (max_exp <= 0) {
+		// Cancellation pushed the result below the normal range. Shifting a
+		// non-positive exponent into place would corrupt the sign/exponent bits,
+		// so flush to zero like the early-out above.
+		return 0.0f;
+	}
+	if (shift < 0 && max_exp >= 0xFF) {
+		// The carry out of the sum ran the exponent into (or past) the inf/NaN
+		// encoding. Saturate to a signed infinity instead of emitting a stray
+		// NaN pattern or overflowing into the sign bit.
+		result.i = sign_sum | 0x7F800000;
+		return result.f;
+	}

+	result.i = sign_sum | ((uint32_t)max_exp << 23) | ((uint32_t)mant_sum & 0x007FFFFF);
+	return result.f;
+}
+
 // TODO: This is still not completely accurate compared to the PSP's vsqrt.
 float vfpu_sqrt(float a) {
 	union float2int {
--- a/Core/MIPS/MIPSVFPUUtils.h
+++ b/Core/MIPS/MIPSVFPUUtils.h
@ -98,6 +98,7 @@ inline float vfpu_clamp(float v, float min, float max) {
 }

 float vfpu_dot(float a[4], float b[4]);
+float vfpu_dot_tekken_approx(float a[4], float b[4]);
 float vfpu_sqrt(float a);
 float vfpu_rsqrt(float a);