//
// arch/x86_64/rsp/rsp.c
//
// Declarations for host RSP functions.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "common.h"
#include "os/dynarec.h"
#include "rsp/cpu.h"
#include "rsp/pipeline.h"
#include "rsp/rsp.h"
//
// Masks for AND/OR/XOR and NAND/NOR/NXOR.
//
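// Row 0 (all zeros) leaves a result unchanged when XORed in; row 1 (all
// ones) flips every bit, turning AND/OR/XOR into NAND/NOR/NXOR. The row
// applied is presumably selected from the decoded opcode.
//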
cen64_align(const uint16_t rsp_vlogic_mask[2][8], 32) = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0}
};
#ifdef __SSSE3__
//
// This table is used to "shuffle" the RSP vector after loading it.
//
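// Rows 0-1 leave the vector unchanged, rows 2-3 broadcast quarters (0q/1q),
// rows 4-7 broadcast halves (0h..3h), and rows 8-15 broadcast a single
// element (0w..7w). Each 16-bit key packs the two source byte indices that
// pshufb reads for one 16-bit lane; a sketch of the presumed SSSE3 usage:
//
//   __m128i key = _mm_load_si128((__m128i *) shuffle_keys[element]);
//   __m128i shuffled = _mm_shuffle_epi8(operand, key);
//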
cen64_align(const uint16_t shuffle_keys[16][8], CACHE_LINE_SIZE) = {
/* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
/* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
/* 0q */ {0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0D0C, 0x0D0C},
/* 1q */ {0x0302, 0x0302, 0x0706, 0x0706, 0x0B0A, 0x0B0A, 0x0F0E, 0x0F0E},
/* 0h */ {0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908},
/* 1h */ {0x0302, 0x0302, 0x0302, 0x0302, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A},
/* 2h */ {0x0504, 0x0504, 0x0504, 0x0504, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C},
/* 3h */ {0x0706, 0x0706, 0x0706, 0x0706, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E},
/* 0w */ {0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100},
/* 1w */ {0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302},
/* 2w */ {0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504},
/* 3w */ {0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706},
/* 4w */ {0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908},
/* 5w */ {0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A},
/* 6w */ {0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C},
/* 7w */ {0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E},
};
#endif
//
// These tables are used to shift data loaded from DMEM.
// In addition to shifting, they also take into account that
// DMEM uses big-endian byte ordering, whereas vectors are
// 2-byte little-endian.
//
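// Each 16-bit key packs two pshufb byte selectors (destination low byte
// first); a selector of 0x80 zero-fills that byte, which is how the shift
// variants insert zeros. For example, sll_b2l_keys[1][0] == 0x8000 places
// source byte 0 in the low byte of the first lane and zeroes its high byte.
//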
// Shift left LUT; shifts in zeros from the right, one byte at a time.
cen64_align(const uint16_t sll_b2l_keys[16][8], CACHE_LINE_SIZE) = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E},
{0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C},
{0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A},
{0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000},
};
// Shift left LUT; shifts low order to high order, inserting 0x00s.
cen64_align(const uint16_t sll_l2b_keys[16][8], CACHE_LINE_SIZE) = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
{0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
{0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
{0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607},
{0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001},
{0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180},
};
// Shift right LUT; shifts in zeros from the left, one byte at a time.
cen64_align(const uint16_t srl_b2l_keys[16][8], CACHE_LINE_SIZE) = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080},
{0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080},
{0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080},
{0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080},
{0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
{0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
};
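// Rotate right LUT; like the shift right LUT above, but wraps bytes around
// instead of inserting zeros.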
cen64_align(const uint16_t ror_b2l_keys[16][8], CACHE_LINE_SIZE) = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
{0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
{0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
{0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
{0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708},
{0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A},
{0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C},
{0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E},
};
// Rotate left LUT; rotates high order bytes back to low order.
cen64_align(const uint16_t rol_l2b_keys[16][8], CACHE_LINE_SIZE) = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
{0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
{0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
{0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
{0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
{0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
{0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
{0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E},
};
// Rotate right LUT; rotates low order bytes back to high order.
cen64_align(const uint16_t ror_l2b_keys[16][8], CACHE_LINE_SIZE) = {
{0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
{0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E},
{0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
{0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300},
{0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
{0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502},
{0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
{0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704},
{0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
{0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906},
{0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
{0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
{0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
{0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
{0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
{0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
};
// TODO: Not highly optimized. More of a stopgap measure.
#ifndef __SSSE3__
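// Scalar stand-in for SSSE3 pshufb using the same key tables: each 16-bit
// key supplies the source byte indices for one 16-bit lane, and the 0x80
// sentinel (the only "zero" selector these tables use) yields 0x00, just
// as _mm_shuffle_epi8 would.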
static inline __m128i sse2_pshufb(__m128i v, const uint16_t *keys) {
  uint8_t dest[16];
  uint8_t temp[16];
  unsigned j;

  _mm_storeu_si128((__m128i *) temp, v);

  for (j = 0; j < 8; j++) {
    uint16_t key = keys[j];
    uint8_t key_hi = key >> 8;
    uint8_t key_lo = key >> 0;
    dest[(j << 1) + 1] = key_hi == 0x80 ? 0x00 : temp[key_hi];
    dest[(j << 1) + 0] = key_lo == 0x80 ? 0x00 : temp[key_lo];
  }

  return _mm_loadu_si128((__m128i *) dest);
}
#endif
// Deallocates dynarec buffers for SSE2 (currently a no-op).
void arch_rsp_destroy(struct rsp *rsp) {}

// Allocates dynarec buffers for SSE2 (currently a no-op).
int arch_rsp_init(struct rsp *rsp) { return 0; }
#ifndef __SSSE3__
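// SSE2 fallback for the shuffle_keys path above: reproduces the same
// element broadcasts (whole 0w..7w, half 0h..3h, quarter 0q/1q, or the
// unmodified vector) using scalar loads and SSE2 shuffles.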
__m128i rsp_vect_load_and_shuffle_operand(
    const uint16_t *src, unsigned element) {
  uint16_t word_lo, word_hi;
  uint64_t dword;

  // element => 0w ... 7w
  if (element >= 8) {
    memcpy(&word_lo, src + (element - 8), sizeof(word_lo));
    dword = word_lo | ((uint32_t) word_lo << 16);

    return _mm_shuffle_epi32(_mm_loadl_epi64((__m128i *) &dword),
      _MM_SHUFFLE(0,0,0,0));
  }

  // element => 0h ... 3h
  else if (element >= 4) {
    __m128i v;

    memcpy(&word_hi, src + element - 0, sizeof(word_hi));
    memcpy(&word_lo, src + element - 4, sizeof(word_lo));
    dword = word_lo | ((uint32_t) word_hi << 16);

    v = _mm_loadl_epi64((__m128i *) &dword);
    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(1,1,0,0));
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(1,1,0,0));
  }

  // element => 0q ... 1q
  else if (element >= 2) {
    __m128i v = rsp_vect_load_unshuffled_operand(src);

    if (element == 2) {
      v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2,2,0,0));
      v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(2,2,0,0));
    }

    else {
      v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3,3,1,1));
      v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(3,3,1,1));
    }

    return v;
  }

  return rsp_vect_load_unshuffled_operand(src);
}
#endif
//
// SSSE3+ accelerated loads for group I. Byteswap big-endian to 2-byte
// little-endian vector. Start at vector element offset, discarding any
// wraparound as necessary.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
// wrap around. Do we just discard the data, as below, or does the
// data effectively get rotated around the edge of the vector?
//
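// The DMEM data is fetched in 8-byte halves when unaligned, rotated into
// element alignment, and blended into the register wherever the byte mask
// (dqm) is set. Judging by the mask-driven write width, this presumably
// services the LBV/LSV/LLV/LDV family of loads.
//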
void rsp_vload_group1(struct rsp *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  __m128i ekey, data;

  unsigned offset = addr & 0x7;
  unsigned ror = offset - element;

  // Always load in 8-byte chunks to emulate wraparound.
  if (offset) {
    uint32_t aligned_addr_lo = addr & ~0x7;
    uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
    __m128i temp;

    data = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_lo));
    temp = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_hi));
    data = _mm_unpacklo_epi64(data, temp);
  }

  else
    data = _mm_loadl_epi64((__m128i *) (rsp->mem + addr));

  // Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
  dqm = sse2_pshufb(dqm, sll_b2l_keys[element]);
#else
  ekey = _mm_load_si128((__m128i *) (sll_b2l_keys[element]));
  dqm = _mm_shuffle_epi8(dqm, ekey);
#endif

  // Align the data to the DQM so we can mask it in.
#ifndef __SSSE3__
  data = sse2_pshufb(data, ror_b2l_keys[ror & 0xF]);
#else
  ekey = _mm_load_si128((__m128i *) (ror_b2l_keys[ror & 0xF]));
  data = _mm_shuffle_epi8(data, ekey);
#endif

  // Mask and mux in the data.
#ifdef __SSE4_1__
  reg = _mm_blendv_epi8(reg, data, dqm);
#else
  data = _mm_and_si128(dqm, data);
  reg = _mm_andnot_si128(dqm, reg);
  reg = _mm_or_si128(data, reg);
#endif

  _mm_store_si128((__m128i *) regp, reg);
}
//
// SSSE3+ accelerated loads for group II.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
// wrap around. Do we just discard the data, as below, or does the
// data effectively get rotated around the edge of the vector?
//
// TODO: Reverse-engineer what happens when element != 0.
//
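// The unpack below places each DMEM byte in the upper half of its 16-bit
// lane (value << 8); non-PACK requests then shift right by one, leaving
// the byte at bits 14..7. This matches the packed/unsigned-packed load
// pair (presumably LPV vs. LUV), selected via the exdf latch's request type.
//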
void rsp_vload_group2(struct rsp *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  unsigned offset = addr & 0x7;
  __m128i data, zero;

  // Always load in 8-byte chunks to emulate wraparound.
  if (offset) {
    uint32_t aligned_addr_lo = addr & ~0x7;
    uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
    uint64_t datalow, datahigh;

    memcpy(&datalow, rsp->mem + aligned_addr_lo, sizeof(datalow));
    memcpy(&datahigh, rsp->mem + aligned_addr_hi, sizeof(datahigh));

    // TODO: Test for endian issues?
    datahigh >>= ((8 - offset) << 3);
    datalow <<= (offset << 3);
    datalow = datahigh | datalow;
    data = _mm_loadl_epi64((__m128i *) &datalow);
  }

  else
    data = _mm_loadl_epi64((__m128i *) (rsp->mem + addr));

  // "Unpack" the data.
  zero = _mm_setzero_si128();
  data = _mm_unpacklo_epi8(zero, data);

  if (rsp->pipeline.exdf_latch.request.type != RSP_MEM_REQUEST_PACK)
    data = _mm_srli_epi16(data, 1);

  _mm_store_si128((__m128i *) regp, data);
}
//
// SSSE3+ accelerated loads for group IV. Byteswap big-endian to 2-byte
// little-endian vector. Stop loading at quadword boundaries.
//
// TODO: Reverse-engineer what happens when loads from vector elements
// must wrap around (i.e., the address offset is small, starting
// element is large).
//
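// QUAD requests rotate the data so the load starts at the addressed byte
// and runs to the end of the 16-byte line; the other path (presumably the
// LRV-style "rest" load) instead inverts the byte mask with a compare
// against zero so the complementary portion of the register is written.
//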
void rsp_vload_group4(struct rsp *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  uint32_t aligned_addr = addr & 0xFF0;
  unsigned offset = addr & 0xF;
  unsigned ror;

  __m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
  __m128i dkey;

  if (rsp->pipeline.exdf_latch.request.type == RSP_MEM_REQUEST_QUAD)
    ror = 16 - element + offset;

  // TODO: How is this adjusted for LRV when e != 0?
  else {
    dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
    ror = 16 - offset;
  }

#ifndef __SSSE3__
  data = sse2_pshufb(data, ror_b2l_keys[ror & 0xF]);
  dqm = sse2_pshufb(dqm, ror_b2l_keys[ror & 0xF]);
#else
  dkey = _mm_load_si128((__m128i *) (ror_b2l_keys[ror & 0xF]));
  data = _mm_shuffle_epi8(data, dkey);
  dqm = _mm_shuffle_epi8(dqm, dkey);
#endif

  // Mask and mux in the data.
#ifdef __SSE4_1__
  data = _mm_blendv_epi8(reg, data, dqm);
#else
  data = _mm_and_si128(dqm, data);
  reg = _mm_andnot_si128(dqm, reg);
  data = _mm_or_si128(data, reg);
#endif

  _mm_store_si128((__m128i *) regp, data);
}
//
// SSSE3+ accelerated stores for group I. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
// must wrap around. Do we just stop storing the data, or do we
// continue storing from the front of the vector, as below?
//
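// Stores are a read-modify-write of one or two 8-byte DMEM chunks: the
// register is rotated into DMEM alignment, the byte mask (dqm) is shifted
// to the store offset, and only the masked bytes are replaced before the
// chunks are written back. Presumably the SBV/SSV/SLV/SDV counterpart of
// the group I loads above.
//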
void rsp_vstore_group1(struct rsp *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  unsigned offset = addr & 0x7;
  unsigned ror = element - offset;
  __m128i ekey, data;

  // Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
  dqm = sse2_pshufb(dqm, sll_l2b_keys[offset]);
#else
  ekey = _mm_load_si128((__m128i *) (sll_l2b_keys[offset]));
  dqm = _mm_shuffle_epi8(dqm, ekey);
#endif

  // Rotate the reg to align with the DQM.
#ifndef __SSSE3__
  reg = sse2_pshufb(reg, ror_l2b_keys[ror & 0xF]);
#else
  ekey = _mm_load_si128((__m128i *) (ror_l2b_keys[ror & 0xF]));
  reg = _mm_shuffle_epi8(reg, ekey);
#endif

  // Always load in 8-byte chunks to emulate wraparound.
  if (offset) {
    uint32_t aligned_addr_lo = addr & ~0x7;
    uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
    __m128i temp;

    data = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_lo));
    temp = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_hi));
    data = _mm_unpacklo_epi64(data, temp);

    // Mask and mux in the data.
#ifdef __SSE4_1__
    data = _mm_blendv_epi8(data, reg, dqm);
#else
    data = _mm_andnot_si128(dqm, data);
    reg = _mm_and_si128(dqm, reg);
    data = _mm_or_si128(data, reg);
#endif

    _mm_storel_epi64((__m128i *) (rsp->mem + aligned_addr_lo), data);

    data = _mm_srli_si128(data, 8);
    _mm_storel_epi64((__m128i *) (rsp->mem + aligned_addr_hi), data);
  }

  else {
    data = _mm_loadl_epi64((__m128i *) (rsp->mem + addr));

    // Mask and mux in the data.
#ifdef __SSE4_1__
    data = _mm_blendv_epi8(data, reg, dqm);
#else
    data = _mm_andnot_si128(dqm, data);
    reg = _mm_and_si128(dqm, reg);
    data = _mm_or_si128(data, reg);
#endif

    _mm_storel_epi64((__m128i *) (rsp->mem + addr), data);
  }
}
//
// SSSE3+ accelerated stores for group II. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
// must wrap around. Do we just stop storing the data, or do we
// continue storing from the front of the vector, as below?
//
// TODO: Reverse-engineer what happens when element != 0.
//
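// The pack below mirrors the group II load: non-PACK requests first shift
// left by one so the byte of interest sits in bits 15..8 of each lane, and
// the arithmetic shift plus signed pack then extract one byte per lane
// (presumably the SPV/SUV pair).
//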
void rsp_vstore_group2(struct rsp *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  // "Pack" the data.
  if (rsp->pipeline.exdf_latch.request.type != RSP_MEM_REQUEST_PACK)
    reg = _mm_slli_epi16(reg, 1);

  reg = _mm_srai_epi16(reg, 8);
  reg = _mm_packs_epi16(reg, reg);

  // TODO: Always store in 8-byte chunks to emulate wraparound.
  _mm_storel_epi64((__m128i *) (rsp->mem + addr), reg);
}
//
// SSSE3+ accelerated stores for group IV. Byteswap 2-byte little-endian
// vector back to big-endian. Stop storing at quadword boundaries.
//
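// As with the group IV load, QUAD requests rotate the register so the store
// starts at the addressed byte and stops at the 16-byte boundary, while the
// other path (presumably SRV) inverts the byte mask instead.
//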
void rsp_vstore_group4(struct rsp *rsp, uint32_t addr, unsigned element,
    uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  uint32_t aligned_addr = addr & 0xFF0;
  unsigned offset = addr & 0xF;
  unsigned rol = offset;

  __m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
  __m128i ekey;

  if (rsp->pipeline.exdf_latch.request.type == RSP_MEM_REQUEST_QUAD)
    rol -= element;

  // TODO: How is this adjusted for SRV when e != 0?
  else
    dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);

#ifndef __SSSE3__
  reg = sse2_pshufb(reg, rol_l2b_keys[rol & 0xF]);
#else
  ekey = _mm_load_si128((__m128i *) (rol_l2b_keys[rol & 0xF]));
  reg = _mm_shuffle_epi8(reg, ekey);
#endif

  // Mask and mux out the data, write.
#ifdef __SSE4_1__
  data = _mm_blendv_epi8(data, reg, dqm);
#else
  reg = _mm_and_si128(dqm, reg);
  data = _mm_andnot_si128(dqm, data);
  data = _mm_or_si128(data, reg);
#endif

  _mm_store_si128((__m128i *) (rsp->mem + aligned_addr), data);
}