diff --git a/Common/Common.h b/Common/Common.h
index c24342f6a2..e1049f2dda 100644
--- a/Common/Common.h
+++ b/Common/Common.h
@@ -87,17 +87,3 @@
 
 #define __forceinline inline __attribute__((always_inline))
 #endif
-
-#if defined __SSE4_2__
-# define _M_SSE 0x402
-#elif defined __SSE4_1__
-# define _M_SSE 0x401
-#elif defined __SSSE3__
-# define _M_SSE 0x301
-#elif defined __SSE3__
-# define _M_SSE 0x300
-#elif defined __SSE2__
-# define _M_SSE 0x200
-#elif !defined(__GNUC__) && (defined(_M_X64) || defined(_M_IX86))
-# define _M_SSE 0x402
-#endif
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 61e8b6fd17..8e812a7819 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -67,6 +67,20 @@ static inline uint32x4_t vcgezq_f32(float32x4_t v) {
 
 #if PPSSPP_ARCH(SSE2)
 
+#if defined __SSE4_2__  // Checked newest to oldest, so _M_SSE reflects the highest level whose macro is defined.
+# define _M_SSE 0x402  // SSE4.2
+#elif defined __SSE4_1__
+# define _M_SSE 0x401  // SSE4.1
+#elif defined __SSSE3__
+# define _M_SSE 0x301  // SSSE3
+#elif defined __SSE3__
+# define _M_SSE 0x300  // SSE3
+#elif defined __SSE2__
+# define _M_SSE 0x200  // SSE2
+#elif !defined(__GNUC__) && (defined(_M_X64) || defined(_M_IX86))  // Non-GNUC x86/x64 target with none of the macros above defined.
+# define _M_SSE 0x402  // Same value as the SSE4.2 branch.
+#endif  // If no branch was taken, this block leaves _M_SSE undefined.
+
 // These are SSE2 versions of SSE4.1 instructions, for compatibility and ease of
 // writing code.
 // May later figure out how to use the appropriate ones depending on compile flags.
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index 13d202e5e7..0402f36629 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -4,9 +4,7 @@
 
 #include "fast_matrix.h"
 
-#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
-
-#include <emmintrin.h>
+#if PPSSPP_ARCH(SSE2)
 
 #include "fast_matrix.h"
 
@@ -28,12 +26,6 @@ void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 
 #elif PPSSPP_ARCH(ARM_NEON)
 
-#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
-#include <arm64_neon.h>
-#else
-#include <arm_neon.h>
-#endif
-
 #if PPSSPP_ARCH(ARM)
 static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane) {
 	if (lane == 0)      return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0);
diff --git a/Core/MIPS/x86/CompVFPU.cpp b/Core/MIPS/x86/CompVFPU.cpp
index cbb678e2b4..1fe06707e4 100644
--- a/Core/MIPS/x86/CompVFPU.cpp
+++ b/Core/MIPS/x86/CompVFPU.cpp
@@ -23,11 +23,10 @@
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 #include <cmath>
 #include <limits>
-#include <emmintrin.h>
-
 #include "Common/Math/math_util.h"
 
 #include "Common/CPUDetect.h"
+#include "Common/Math/SIMDHeaders.h"
 #include "Common/Log.h"
 #include "Core/Compatibility.h"
 #include "Core/Config.h"
diff --git a/Core/MIPS/x86/RegCacheFPU.cpp b/Core/MIPS/x86/RegCacheFPU.cpp
index de2ef1e7ff..44c0e27ca6 100644
--- a/Core/MIPS/x86/RegCacheFPU.cpp
+++ b/Core/MIPS/x86/RegCacheFPU.cpp
@@ -19,8 +19,7 @@
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 
 #include <cstring>
-#include <emmintrin.h>
-
+#include "Common/Math/SIMDHeaders.h"
 #include "Common/Log.h"
 #include "Common/x64Emitter.h"
 #include "Core/MIPS/MIPSAnalyst.h"
diff --git a/GPU/Common/IndexGenerator.cpp b/GPU/Common/IndexGenerator.cpp
index 0488ab5cc9..2c7ce5975c 100644
--- a/GPU/Common/IndexGenerator.cpp
+++ b/GPU/Common/IndexGenerator.cpp
@@ -19,7 +19,6 @@
 
 #include "ppsspp_config.h"
 
-#include "Common/Common.h"
 #include "Common/Math/SIMDHeaders.h"
 #include "GPU/Common/IndexGenerator.h"
 
diff --git a/GPU/Common/TextureScalerCommon.cpp b/GPU/Common/TextureScalerCommon.cpp
index 4603638af9..43363c688f 100644
--- a/GPU/Common/TextureScalerCommon.cpp
+++ b/GPU/Common/TextureScalerCommon.cpp
@@ -25,12 +25,10 @@
 #include "Core/Config.h"
 #include "Common/Common.h"
 #include "Common/Log.h"
+#include "Common/Math/SIMDHeaders.h"
 #include "Common/Thread/ParallelLoop.h"
 #include "ext/xbrz/xbrz.h"
 
-#include "Common/Math/SIMDHeaders.h"
-
-
 // Report the time and throughput for each larger scaling operation in the log
 //#define SCALING_MEASURE_TIME
 #include "Common/TimeUtil.h"
diff --git a/GPU/Common/VertexDecoderX86.cpp b/GPU/Common/VertexDecoderX86.cpp
index 1df3be8388..5c29bbcab2 100644
--- a/GPU/Common/VertexDecoderX86.cpp
+++ b/GPU/Common/VertexDecoderX86.cpp
@@ -16,12 +16,12 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include "ppsspp_config.h"
-#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 
-#include <emmintrin.h>
+#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 
 #include "Common/CPUDetect.h"
 #include "Common/Data/Convert/ColorConv.h"
+#include "Common/Math/SIMDHeaders.h"
 #include "Core/Config.h"
 #include "GPU/GPUState.h"
 #include "GPU/Common/VertexDecoderCommon.h"
diff --git a/GPU/GPUState.cpp b/GPU/GPUState.cpp
index 03b7722730..1049b8be96 100644
--- a/GPU/GPUState.cpp
+++ b/GPU/GPUState.cpp
@@ -17,13 +17,13 @@
 
 #include "ppsspp_config.h"
 #include "Common/Common.h"
+#include "Common/Math/SIMDHeaders.h"
 #include "Common/Serialize/Serializer.h"
 #include "Common/Serialize/SerializeFuncs.h"
 #include "Core/MemMap.h"
 #include "GPU/ge_constants.h"
 #include "GPU/GPUCommon.h"
 #include "GPU/GPUState.h"
-#include "Common/Math/SIMDHeaders.h"
 
 // This must be aligned so that the matrices within are aligned.
 alignas(16) GPUgstate gstate;
diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp
index c380a72022..2fa2caba40 100644
--- a/GPU/Math3D.cpp
+++ b/GPU/Math3D.cpp
@@ -15,8 +15,9 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
-#include "Common/Common.h"
 #include "GPU/Math3D.h"
+#include "Common/Common.h"
+#include "Common/Math/SIMDHeaders.h"
 
 #if PPSSPP_ARCH(SSE2)
 // For the SSE4 stuff.
diff --git a/GPU/Software/DrawPixelX86.cpp b/GPU/Software/DrawPixelX86.cpp
index 2c887f44f1..187c3b2a0e 100644
--- a/GPU/Software/DrawPixelX86.cpp
+++ b/GPU/Software/DrawPixelX86.cpp
@@ -16,12 +16,13 @@
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
 #include "ppsspp_config.h"
+
 #if PPSSPP_ARCH(AMD64)
 
-#include <emmintrin.h>
 #include "Common/x64Emitter.h"
 #include "Common/CPUDetect.h"
 #include "Common/LogReporting.h"
+#include "Common/Math/SIMDHeaders.h"
 #include "GPU/GPUState.h"
 #include "GPU/Software/DrawPixel.h"
 #include "GPU/Software/SoftGpu.h"
diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index 8613a92d6b..30ce034924 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -38,6 +38,11 @@
 
 #include "Common/Math/SIMDHeaders.h"
 
+// For the SSE4.1 intrinsics declared in <smmintrin.h>.
+#if PPSSPP_ARCH(SSE2)
+#include <smmintrin.h>
+#endif
+
 namespace Rasterizer {
 
 // Only OK on x64 where our stack is aligned
diff --git a/GPU/Software/RasterizerRegCache.h b/GPU/Software/RasterizerRegCache.h
index 4b8ada5007..20c9263687 100644
--- a/GPU/Software/RasterizerRegCache.h
+++ b/GPU/Software/RasterizerRegCache.h
@@ -25,9 +25,8 @@
 #include <vector>
 
 #include "Common/Common.h"
-#if defined(_M_SSE)
-#include <emmintrin.h>
-#endif
+#include "Common/Math/SIMDHeaders.h"
+
 #if PPSSPP_ARCH(ARM64_NEON)
 #if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
 #include <arm64_neon.h>
diff --git a/GPU/Software/SamplerX86.cpp b/GPU/Software/SamplerX86.cpp
index 3b3d432210..31a607ef64 100644
--- a/GPU/Software/SamplerX86.cpp
+++ b/GPU/Software/SamplerX86.cpp
@@ -18,7 +18,7 @@
 #include "ppsspp_config.h"
 #if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
 
-#include <emmintrin.h>
+#include "Common/Math/SIMDHeaders.h"
 #include "Common/x64Emitter.h"
 #include "Common/BitScan.h"
 #include "Common/CPUDetect.h"
diff --git a/GPU/Software/TransformUnit.cpp b/GPU/Software/TransformUnit.cpp
index d6dde859e1..63b4ae54d4 100644
--- a/GPU/Software/TransformUnit.cpp
+++ b/GPU/Software/TransformUnit.cpp
@@ -15,7 +15,10 @@
 // Official git repository and contact information can be found at
 // https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
 
+#include "ppsspp_config.h"
+
 #include <cmath>
+
 #include "Common/Common.h"
 #include "Common/CPUDetect.h"
 #include "Common/Math/math_util.h"
@@ -25,12 +28,18 @@
 #include "GPU/Common/DrawEngineCommon.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/Common/SoftwareTransformCommon.h"
+#include "Common/Math/SIMDHeaders.h"
 #include "GPU/Software/BinManager.h"
 #include "GPU/Software/Clipper.h"
 #include "GPU/Software/Lighting.h"
 #include "GPU/Software/RasterizerRectangle.h"
 #include "GPU/Software/TransformUnit.h"
 
+// For the SSE4.1 intrinsics declared in <smmintrin.h>.
+#if PPSSPP_ARCH(SSE2)
+#include <smmintrin.h>
+#endif
+
 #define TRANSFORM_BUF_SIZE (65536 * 48)
 
 TransformUnit::TransformUnit() {
diff --git a/ext/at3_standalone/atrac3plusdsp.cpp b/ext/at3_standalone/atrac3plusdsp.cpp
index a070148e0e..63d4d7c57f 100644
--- a/ext/at3_standalone/atrac3plusdsp.cpp
+++ b/ext/at3_standalone/atrac3plusdsp.cpp
@@ -659,7 +659,7 @@ void ff_atrac3p_ipqf(FFTContext *dct_ctx, Atrac3pIPQFChannelCtx *hist,
             const float *coeffs2 = ipqf_coeffs2[t];
 
             float *outp = out + s * 16;
-#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
+#if PPSSPP_ARCH(SSE2)
             auto _mm_reverse = [](__m128 x) -> __m128 {
                 return _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3));
             };