//
// arch/x86_64/rsp/vmulm.h
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//

#include "common.h"

static inline __m128i rsp_vmadm_vmudm(uint32_t iw, __m128i vs, __m128i vt,
  __m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
  __m128i lo, hi, sign, overflow_mask;

  lo = _mm_mullo_epi16(vs, vt);
  hi = _mm_mulhi_epu16(vs, vt);

  // What we really want to do is signed vs * unsigned vt.
  // However, we have no such instruction to do so.
  //
  // There's a trick to "fix" an unsigned product, though:
  // if vs was negative, take the upper 16 bits of the product
  // and subtract vt.
  sign = _mm_srai_epi16(vs, 15);
  vt = _mm_and_si128(vt, sign);
  hi = _mm_sub_epi16(hi, vt);

  // VMADM
  if (iw & 0x8) {

    // Tricky part: start accumulating everything.
    // Get/keep the carry as we'll add it in later.
    overflow_mask = _mm_adds_epu16(*acc_lo, lo);
    *acc_lo = _mm_add_epi16(*acc_lo, lo);

    overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
    overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);

    // This is REALLY clever. Since the product results from
    // two 16-bit components, one positive and one negative,
    // we don't have to worry about carrying the 1 (we can
    // only borrow) past 32 bits. So we can just add it here.
    hi = _mm_sub_epi16(hi, overflow_mask);

    // Check for overflow of the upper sum.
    overflow_mask = _mm_adds_epu16(*acc_md, hi);
    *acc_md = _mm_add_epi16(*acc_md, hi);

    overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
    overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);

    // Finish up the accumulation of the... accumulator.
    *acc_hi = _mm_add_epi16(*acc_hi, _mm_srai_epi16(hi, 15));
    *acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);

    return rsp_sclamp_acc_tomd(*acc_md, *acc_hi);
  }

  // VMUDM
  else {
    *acc_lo = lo;
    *acc_md = hi;
    *acc_hi = _mm_srai_epi16(hi, 15);

    return hi;
  }
}
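
// Illustration only: a minimal scalar sketch of the sign fix-up above for a
// single 16-bit lane. It is not part of the emulator and the function name
// below is ours, not cen64's. The idea: reinterpreting a negative vs as
// unsigned adds an extra (vt << 16) to the product, so subtracting vt from
// the unsigned high half recovers the high half of signed(vs) * unsigned(vt).
#include <stdint.h>

static inline uint16_t scalar_mulhi_s16_u16(int16_t vs, uint16_t vt) {
  // Unsigned 16x16 -> 32 product, as _mm_mulhi_epu16/_mm_mullo_epi16 see it.
  uint32_t uhi = ((uint32_t) (uint16_t) vs * vt) >> 16;

  // If vs was negative, the unsigned product is too large by (vt << 16);
  // subtracting vt from the high half corrects it (mod 2^16).
  return (uint16_t) (uhi - (vs < 0 ? vt : 0));
}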