mirror of
https://github.com/n64dev/cen64.git
synced 2025-04-02 10:31:54 -04:00
No need to separate all these functions when they contain so much common code, so start combining things for the sake of locality and predictor effectiveness (and size). In addition to these benefits, the CPU backend is usually busy during the execution of these functions, so suffering a misprediction isn't as painful (especially seeing as we can potentially improve the prediction from the indirect branch).
65 lines
2 KiB
C
65 lines
2 KiB
C
//
|
|
// arch/x86_64/rsp/vmacf.h
|
|
//
|
|
// This file is subject to the terms and conditions defined in
|
|
// 'LICENSE', which is part of this source code package.
|
|
//
|
|
|
|
static inline __m128i rsp_vmacf_vmacu(uint32_t iw, __m128i vs, __m128i vt,
|
|
__m128i zero, __m128i *acc_lo, __m128i *acc_md, __m128i *acc_hi) {
|
|
__m128i overflow_hi_mask, overflow_md_mask;
|
|
__m128i lo, md, hi, carry, overflow_mask;
|
|
|
|
// Get the product and shift it over
|
|
// being sure to save the carries.
|
|
lo = _mm_mullo_epi16(vs, vt);
|
|
hi = _mm_mulhi_epi16(vs, vt);
|
|
|
|
md = _mm_slli_epi16(hi, 1);
|
|
carry = _mm_srli_epi16(lo, 15);
|
|
hi = _mm_srai_epi16(hi, 15);
|
|
md = _mm_or_si128(md, carry);
|
|
lo = _mm_slli_epi16(lo, 1);
|
|
|
|
// Tricky part: start accumulating everything.
|
|
// Get/keep the carry as we'll add it in later.
|
|
overflow_mask = _mm_adds_epu16(*acc_lo, lo);
|
|
*acc_lo = _mm_add_epi16(*acc_lo, lo);
|
|
|
|
overflow_mask = _mm_cmpeq_epi16(*acc_lo, overflow_mask);
|
|
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
|
|
|
|
// Add in the carry. If the middle portion is
|
|
// already 0xFFFF and we have a carry, we have
|
|
// to carry the all the way up to hi.
|
|
md = _mm_sub_epi16(md, overflow_mask);
|
|
carry = _mm_cmpeq_epi16(md, zero);
|
|
carry = _mm_and_si128(carry, overflow_mask);
|
|
hi = _mm_sub_epi16(hi, carry);
|
|
|
|
// Accumulate the middle portion.
|
|
overflow_mask = _mm_adds_epu16(*acc_md, md);
|
|
*acc_md = _mm_add_epi16(*acc_md, md);
|
|
|
|
overflow_mask = _mm_cmpeq_epi16(*acc_md, overflow_mask);
|
|
overflow_mask = _mm_cmpeq_epi16(overflow_mask, zero);
|
|
|
|
// Finish up the accumulation of the... accumulator.
|
|
*acc_hi = _mm_add_epi16(*acc_hi, hi);
|
|
*acc_hi = _mm_sub_epi16(*acc_hi, overflow_mask);
|
|
|
|
// VMACU
|
|
if (iw & 0x1) {
|
|
overflow_hi_mask = _mm_srai_epi16(*acc_hi, 15);
|
|
overflow_md_mask = _mm_srai_epi16(*acc_md, 15);
|
|
md = _mm_or_si128(overflow_md_mask, *acc_md);
|
|
overflow_mask = _mm_cmpgt_epi16(*acc_hi, zero);
|
|
md = _mm_andnot_si128(overflow_hi_mask, md);
|
|
return _mm_or_si128(overflow_mask, md);
|
|
}
|
|
|
|
// VMACF
|
|
else
|
|
return rsp_sclamp_acc_tomd(*acc_md, *acc_hi);
|
|
}
|
|
|