/* ScummVM - Graphic Adventure Engine
 *
 * ScummVM is the legal property of its developers, whose names
 * are too numerous to list here. Please refer to the COPYRIGHT
 * file distributed with this source distribution.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include "common/scummsys.h"
#include "graphics/blit/blit-alpha.h"
#include "graphics/pixelformat.h"

// SSE2 intrinsics (__m128i and the _mm_* functions used below).
#include <emmintrin.h>

// When not building for x86-64, explicitly enable SSE2 code generation for the
// functions in this file (x86-64 builds are left untouched by these pragmas).
#if !defined(__x86_64__)

#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("sse2"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("sse2")
#endif

#endif // !defined(__x86_64__)

namespace Graphics {

/**
 * Lane-wise 32-bit multiply built from SSE2 primitives.
 *
 * _mm_mul_epu32 only multiplies lanes 0 and 2 (producing two 64-bit products),
 * so the odd lanes are handled by first shifting both operands right by 4 bytes.
 * The shuffles move the low 32 bits of each 64-bit product into the two lowest
 * lanes, and the final unpack interleaves even/odd results back into lane order.
 *
 * @param a  four unsigned 32-bit factors
 * @param b  four unsigned 32-bit factors
 * @return   the low 32 bits of a[i] * b[i] for each of the four lanes
 */
static FORCEINLINE __m128i sse2_mul32(__m128i a, __m128i b) {
	__m128i even = _mm_shuffle_epi32(_mm_mul_epu32(a, b), _MM_SHUFFLE(0, 0, 2, 0));
	__m128i odd = _mm_shuffle_epi32(_mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4)), _MM_SHUFFLE(0, 0, 2, 0));
	return _mm_unpacklo_epi32(even, odd);
}

/**
 * SSE2 pixel functors for BlendBlit.
 *
 * Each nested functor derives from the BlendBlitImpl_Base functor of the same
 * name and adds a simd() method that processes four 32-bit pixels held in one
 * __m128i. The members this->ca / this->cr / this->cg / this->cb are inherited
 * from the base functors (declared outside this file; presumably the components
 * of the `color` passed to the constructor -- confirm in blit-alpha.h).
 *
 * Common template parameters:
 *  - rgbmod:   when true, simd() takes the path that applies this->cr/cg/cb.
 *  - alphamod: when true, simd() scales the source alpha by this->ca.
 */
class BlendBlitImpl_SSE2 : public BlendBlitImpl_Base {
	friend class BlendBlit;

	/**
	 * Alpha blend: result = dst * (255 - a) / 256 + src * a / 256 per channel,
	 * where a is the (optionally modulated) source alpha. The result's alpha
	 * bits are all set. Lanes whose effective alpha is zero return dst unchanged.
	 */
	template<bool rgbmod, bool alphamod>
	struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
	public:
		constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}

		inline __m128i simd(__m128i src, __m128i dst) const {
			// Effective source alpha per lane.
			__m128i ina;
			if (alphamod)
				ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
			else
				ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
			// All-ones in lanes where the effective alpha is zero.
			__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());

			if (rgbmod) {
				// Work on each channel separately so the colour factors can be applied.
				__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
				__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
				__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);

				// The products carry 8 extra fractional bits; shifting by (channel shift - 8)
				// both drops them and moves the value back to the channel's bit position.
				dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
				dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
				dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina));
				srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
				srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
				srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(this->cb)));
				// Reassemble the pixel with every alpha bit set.
				src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
				src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
			} else {
				// No colour factors: red and blue are processed together in one lane.
				__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);

				dstRB = _mm_srli_epi32(sse2_mul32(dstRB, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
				dstG = _mm_srli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
				srcRB = _mm_slli_epi32(_mm_add_epi32(dstRB, _mm_srli_epi32(sse2_mul32(srcRB, ina), 8)), BlendBlit::kBModShift);
				srcG = _mm_slli_epi32(_mm_add_epi32(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
				src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(_mm_and_si128(srcRB, _mm_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
			}

			// Per-lane select: dst where alpha was zero, blended value elsewhere.
			dst = _mm_and_si128(alphaMask, dst);
			src = _mm_andnot_si128(alphaMask, src);
			return _mm_or_si128(dst, src);
		}
	};

	/**
	 * Multiply blend: each colour channel becomes dst * (src scaled by alpha and,
	 * with rgbmod, by the colour factor). The alpha bits of the computed pixel
	 * are taken from src before the final select.
	 */
	template<bool rgbmod, bool alphamod>
	struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
	public:
		constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}

		inline __m128i simd(__m128i src, __m128i dst) const {
			__m128i ina, alphaMask;
			if (alphamod) {
				ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
				// All-ones in lanes where the modulated alpha is zero: those keep dst.
				alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
			} else {
				ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
				// Only the alpha bits are selected from dst in the final merge;
				// the colour bits always come from the computed value.
				alphaMask = _mm_set1_epi32(BlendBlit::kAModMask);
			}

			if (rgbmod) {
				__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
				__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);

				// (src * colour factor * alpha) >> 16 gives an 8-bit factor, which then scales dst.
				srcB = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstB, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcB, _mm_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
				srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcG, _mm_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
				srcR = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcR, _mm_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));

				src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(src, _mm_or_si128(srcB, _mm_or_si128(srcG, srcR)));
			} else {
				constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
				__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(rbMask)), BlendBlit::kBModShift);
				__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(rbMask)), BlendBlit::kBModShift);

				srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), 8), _mm_set1_epi32(BlendBlit::kGModMask));
				srcRB = _mm_and_si128(_mm_mullo_epi16(dstRB, _mm_srli_epi32(_mm_and_si128(sse2_mul32(srcRB, ina), _mm_set1_epi32(rbMask)), 8)), _mm_set1_epi32(rbMask));

				src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(src, _mm_or_si128(srcRB, srcG));
			}

			// Bitwise select between dst and the computed pixel according to alphaMask.
			dst = _mm_and_si128(alphaMask, dst);
			src = _mm_andnot_si128(alphaMask, src);
			return _mm_or_si128(dst, src);
		}
	};

	/**
	 * Opaque copy: returns src with every alpha bit set. dst is not read.
	 */
	template<bool rgbmod, bool alphamod>
	struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
	public:
		constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}

		inline __m128i simd(__m128i src, __m128i dst) const {
			return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
		}
	};

	/**
	 * Binary (on/off) transparency: lanes whose source alpha is zero keep dst,
	 * every other lane takes src with all alpha bits set.
	 */
	template<bool rgbmod, bool alphamod>
	struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
	public:
		constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}

		inline __m128i simd(__m128i src, __m128i dst) const {
			// All-ones in lanes where the source alpha is zero.
			__m128i alphaMask = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_setzero_si128());
			dst = _mm_and_si128(dst, alphaMask);
			src = _mm_andnot_si128(alphaMask, _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)));
			return _mm_or_si128(src, dst);
		}
	};

	/**
	 * Additive blend: adds the (optionally alpha/colour scaled) source channels
	 * to dst. The result keeps dst's alpha bits; lanes whose effective alpha is
	 * zero return dst unchanged. Channel sums are masked back to their own bits,
	 * no saturation is applied here.
	 */
	template<bool rgbmod, bool alphamod>
	struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
	public:
		constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}

		inline __m128i simd(__m128i src, __m128i dst) const {
			__m128i ina;
			if (alphamod)
				ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
			else
				ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
			// All-ones in lanes where the effective alpha is zero.
			__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0));

			if (rgbmod) {
				// NOTE(review): blue is left in place while green/red are shifted down, and
				// the three channels use different post-multiply shifts. Reproduced exactly
				// as found -- confirm against the scalar implementation before touching.
				__m128i srcb = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask));
				__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
				__m128i dstb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask));
				__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
				__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);

				srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(this->cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
				srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(this->cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
				srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));

				// Alpha bits come from dst.
				src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
			} else if (alphamod) {
				// Alpha scaling only: green stays in place, red/blue are handled together.
				__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
				__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
				__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);

				srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
				srcrb = _mm_and_si128(_mm_add_epi32(dstrb, sse2_mul32(srcrb, ina)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));

				src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
			} else {
				// No modulation at all: plain per-channel addition.
				__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
				__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
				__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
				__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);

				srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
				srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));

				src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
				src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
			}

			// Per-lane select: dst where alpha was zero, summed value elsewhere.
			dst = _mm_and_si128(alphaMask, dst);
			src = _mm_andnot_si128(alphaMask, src);
			return _mm_or_si128(dst, src);
		}
	};

	/**
	 * Subtractive blend: per channel,
	 *   result = max(dst - ((src * colour factor * dst * alpha) >> 24), 0)
	 * with all alpha bits set in the output. Neither rgbmod nor alphamod is
	 * consulted in this body: the colour factors are always applied and the raw
	 * source alpha is always used.
	 */
	template<bool rgbmod, bool alphamod>
	struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
	public:
		constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}

		inline __m128i simd(__m128i src, __m128i dst) const {
			__m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
			__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
			__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
			__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
			__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
			__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
			__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);

			// The max against zero clamps lanes where the subtraction went negative.
			srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(this->cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
			srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(this->cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
			srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(this->cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));

			return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
		}
	};

public:
	template