// // arch/x86_64/rsp/vmadl.h // // This file is subject to the terms and conditions defined in // 'LICENSE', which is part of this source code package. // #include "common.h" static inline __m128i rsp_vmadl_vmudl(uint32_t iw, __m128i vs, __m128i vt, __m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) { __m128i hi, overflow_mask; hi = _mm_mulhi_epu16(vs, vt); // VMADL if (iw & 0x8) { // Tricky part: start accumulate everything. // Get/keep the carry as we'll add it in later. overflow_mask = _mm_adds_epu16(*acc_lo, hi); *acc_lo = _mm_add_epi16(*acc_lo, hi); overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask); overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero); hi = _mm_sub_epi16(zero, overflow_mask); // Check for overflow of the upper sum. // // TODO: Since hi can only be {0,1}, we should // be able to generalize this for performance. overflow_mask = _mm_adds_epu16(*acc_md, hi); *acc_md = _mm_add_epi16(*acc_md, hi); overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask); overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero); // Finish up the accumulation of the... accumulator. // Since the product was unsigned, only worry about // positive overflow (i.e.: borrowing not possible). *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask); return rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero); } // VMUDL else { *acc_lo = hi; *acc_md = zero; *acc_hi = zero; return hi; } }