cen64/arch/x86_64/rsp/vmull.h
Tyler Stachecki 7262516636 Start merging RSP vector functions.
No need to separate all these functions when they contain so
much common code, so start combining things for the sake of
locality and predictor effectiveness (and size). In addition
to these benefits, the CPU backend is usually busy during the
execution of these functions, so suffering a misprediction
isn't as painful (especially seeing as we can potentially
improve the prediction from the indirect branch).
2015-01-02 22:17:41 -05:00
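
The "indirect branch" referred to above is the per-opcode dispatch into
these vector handlers. A minimal sketch of the idea, using hypothetical
typedef/table names rather than CEN64's actual ones:

  typedef __m128i (*vector_op)(uint32_t iw, __m128i vs, __m128i vt,
      __m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi);

  // With VMUDL and VMADL merged, both opcode slots resolve to the same
  // handler, so the indirect call has fewer distinct targets to predict
  // and the shared code stays hotter in the instruction cache.
  static const vector_op vector_op_table[64] = {
    [0x04] = rsp_vmadl_vmudl,  // VMUDL
    [0x0C] = rsp_vmadl_vmudl,  // VMADL
    // ... remaining vector opcodes ...
  };

  // Dispatch on the low six bits (the funct field) of the instruction:
  // vector_op_table[iw & 0x3F](iw, vs, vt, zero, &acc_lo, &acc_md, &acc_hi);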

//
// arch/x86_64/rsp/vmadl.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "common.h"
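
// Merged handler for the RSP VMUDL and VMADL opcodes. Both start from the
// high 16 bits of each lane's unsigned 16x16-bit product: VMUDL loads that
// value into the low accumulator slice and returns it, while VMADL
// (selected by bit 3 of the instruction word) adds it into the 48-bit
// accumulator and returns the unsigned-clamped accumulator (rsp_uclamp_acc).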
static inline __m128i rsp_vmadl_vmudl(uint32_t iw, __m128i vs, __m128i vt,
    __m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
  __m128i hi, overflow_mask;

  hi = _mm_mulhi_epu16(vs, vt);

  // VMADL
  if (iw & 0x8) {

    // Tricky part: start accumulating everything.
    // Get/keep the carry as we'll add it in later.
    overflow_mask = _mm_adds_epu16(*acc_lo, hi);
    *acc_lo = _mm_add_epi16(*acc_lo, hi);

    overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
    overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
    hi = _mm_sub_epi16(zero, overflow_mask);
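
    // (The saturating and wrapping sums above differ only in lanes that
    // carried out of 16 bits; the two compares leave 0xFFFF in exactly
    // those lanes, and subtracting that from zero turns it into a
    // per-lane carry of 1 in 'hi' for the next accumulator slice.)
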
    // Check for overflow of the upper sum.
    //
    // TODO: Since hi can only be {0,1}, we should
    // be able to generalize this for performance.
    overflow_mask = _mm_adds_epu16(*acc_md, hi);
    *acc_md = _mm_add_epi16(*acc_md, hi);

    overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
    overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);

    // Finish up the accumulation of the... accumulator.
    // Since the product was unsigned, only worry about
    // positive overflow (i.e.: borrowing not possible).
    *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);

    return rsp_uclamp_acc(*acc_lo, *acc_md, *acc_hi, zero);
  }

  // VMUDL
  else {
    *acc_lo = hi;
    *acc_md = zero;
    *acc_hi = zero;

    return hi;
  }
}
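
A minimal usage sketch (not CEN64 code; it assumes this header and the
rsp_uclamp_acc helper are already visible, and it passes only the funct
field of each opcode as iw, which is enough since the helper only tests
bit 3):

  static __m128i vmudl_then_vmadl(__m128i vs, __m128i vt) {
    __m128i zero = _mm_setzero_si128();
    __m128i acc_lo = zero, acc_md = zero, acc_hi = zero;

    // VMUDL (funct 0x04, bit 3 clear): seed ACC with the high half of
    // the unsigned product.
    rsp_vmadl_vmudl(0x04, vs, vt, zero, &acc_lo, &acc_md, &acc_hi);

    // VMADL (funct 0x0C, bit 3 set): fold another partial product into
    // ACC and return the unsigned-clamped result.
    return rsp_vmadl_vmudl(0x0C, vs, vt, zero, &acc_lo, &acc_md, &acc_hi);
  }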