//
// arch/x86_64/rsp/rsp.c
//
// Definitions for host RSP functions.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//

#include "common.h"
#include "os/dynarec.h"
#include "rsp/cpu.h"
#include "rsp/pipeline.h"
#include "rsp/rsp.h"

//
// Masks for AND/OR/XOR and NAND/NOR/NXOR.
//
cen64_align(const uint16_t rsp_vlogic_mask[2][8], 32) = {
  { 0,  0,  0,  0,  0,  0,  0,  0},
  {~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0}
};

#ifdef __SSSE3__
//
// This table is used to "shuffle" the RSP vector after loading it.
//
cen64_align(const uint16_t shuffle_keys[16][8], CACHE_LINE_SIZE) = {
  /* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
  /* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
  /* 0q */ {0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0D0C, 0x0D0C},
  /* 1q */ {0x0302, 0x0302, 0x0706, 0x0706, 0x0B0A, 0x0B0A, 0x0F0E, 0x0F0E},
  /* 0h */ {0x0100, 0x0100, 0x0100, 0x0100, 0x0908, 0x0908, 0x0908, 0x0908},
  /* 1h */ {0x0302, 0x0302, 0x0302, 0x0302, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A},
  /* 2h */ {0x0504, 0x0504, 0x0504, 0x0504, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C},
  /* 3h */ {0x0706, 0x0706, 0x0706, 0x0706, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E},
  /* 0w */ {0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100, 0x0100},
  /* 1w */ {0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302, 0x0302},
  /* 2w */ {0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504, 0x0504},
  /* 3w */ {0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706, 0x0706},
  /* 4w */ {0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908, 0x0908},
  /* 5w */ {0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A, 0x0B0A},
  /* 6w */ {0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C, 0x0D0C},
  /* 7w */ {0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E, 0x0F0E},
};
#endif

//
// These tables are used to shift data loaded from DMEM.
// In addition to shifting, they also take into account that
// DMEM uses big-endian byte ordering, whereas vectors are
// 2-byte little-endian.
//

// Shift left LUT; shifts in zeros from the right, one byte at a time.
cen64_align(const uint16_t sll_b2l_keys[16][8], CACHE_LINE_SIZE) = {
  {0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
  {0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E},
  {0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
  {0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C},
  {0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
  {0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A},
  {0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
  {0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506, 0x0708},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304, 0x0506},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102, 0x0304},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000, 0x0102},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8000},
};
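
//
// A note on how these shift/rotate key tables are consumed: each 16-bit
// entry packs two _mm_shuffle_epi8 selector bytes. The low byte picks the
// source byte for the low half of a vector lane, the high byte picks the
// source byte for the high half. A selector of 0x80 has its most significant
// bit set, so pshufb writes 0x00 for that byte; that is how the shift tables
// pull in zeros. The "b2l" tables read big-endian DMEM bytes into 2-byte
// little-endian lanes (loads), while the "l2b" tables go the other way
// (stores). The SSSE3 paths below use them roughly as:
//
//   ekey = _mm_load_si128((__m128i *) (sll_b2l_keys[element]));
//   dqm = _mm_shuffle_epi8(dqm, ekey);
//
// and sse2_pshufb() further down emulates the same behavior on plain SSE2.
//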

// Shift left LUT; shifts low order to high order, inserting 0x00s.
cen64_align(const uint16_t sll_l2b_keys[16][8], CACHE_LINE_SIZE) = {
  {0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
  {0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
  {0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
  {0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
  {0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
  {0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
  {0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
  {0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704, 0x0906},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405, 0x0607},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502, 0x0704},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203, 0x0405},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300, 0x0502},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001, 0x0203},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180, 0x0300},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0001},
  {0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x0180},
};

// Shift right LUT; shifts in zeros from the left, one byte at a time.
cen64_align(const uint16_t srl_b2l_keys[16][8], CACHE_LINE_SIZE) = {
  {0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
  {0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80},
  {0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080},
  {0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080},
  {0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080},
  {0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080},
  {0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080},
  {0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080},
  {0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x090A, 0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x0A0B, 0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x0B0C, 0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x0C0D, 0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x0D0E, 0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x0E0F, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
  {0x0F80, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080},
};

// Rotate right LUT; like the shift right LUT above, but wraps bytes
// around instead of shifting in zeros.
cen64_align(const uint16_t ror_b2l_keys[16][8], CACHE_LINE_SIZE) = {
  {0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
  {0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00},
  {0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
  {0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102},
  {0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
  {0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304},
  {0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
  {0x0708, 0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506},
  {0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
  {0x090A, 0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708},
  {0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
  {0x0B0C, 0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A},
  {0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
  {0x0D0E, 0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C},
  {0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
  {0x0F00, 0x0102, 0x0304, 0x0506, 0x0708, 0x090A, 0x0B0C, 0x0D0E},
};
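
//
// Example: ror_b2l_keys[1] builds lane 0 from DMEM bytes 1 and 2, lane 1
// from bytes 3 and 4, and so on, with lane 7 wrapping around to bytes 15
// and 0. The loaders below use these keys to rotate the loaded DMEM bytes
// into position relative to the starting element and address offset.
//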

// Rotate left LUT; rotates high order bytes back to low order.
cen64_align(const uint16_t rol_l2b_keys[16][8], CACHE_LINE_SIZE) = {
  {0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
  {0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
  {0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
  {0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
  {0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
  {0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
  {0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
  {0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906},
  {0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
  {0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704},
  {0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
  {0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502},
  {0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
  {0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300},
  {0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
  {0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E},
};

// Rotate right LUT; rotates low order bytes back to high order.
cen64_align(const uint16_t ror_l2b_keys[16][8], CACHE_LINE_SIZE) = {
  {0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F},
  {0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E},
  {0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001},
  {0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300},
  {0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203},
  {0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502},
  {0x0607, 0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405},
  {0x0906, 0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704},
  {0x0809, 0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607},
  {0x0B08, 0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906},
  {0x0A0B, 0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809},
  {0x0D0A, 0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08},
  {0x0C0D, 0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B},
  {0x0F0C, 0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A},
  {0x0E0F, 0x0001, 0x0203, 0x0405, 0x0607, 0x0809, 0x0A0B, 0x0C0D},
  {0x010E, 0x0300, 0x0502, 0x0704, 0x0906, 0x0B08, 0x0D0A, 0x0F0C},
};

// TODO: Not highly optimized. More of a stopgap measure.
#ifndef __SSSE3__
static inline __m128i sse2_pshufb(__m128i v, const uint16_t *keys) {
  uint8_t dest[16];
  uint8_t temp[16];
  unsigned j;

  _mm_storeu_si128((__m128i *) temp, v);

  for (j = 0; j < 8; j++) {
    uint16_t key = keys[j];
    uint8_t key_hi = key >> 8;
    uint8_t key_lo = key >> 0;

    dest[(j << 1) + 1] = key_hi == 0x80 ? 0x00 : temp[key_hi];
    dest[(j << 1) + 0] = key_lo == 0x80 ? 0x00 : temp[key_lo];
  }

  return _mm_loadu_si128((__m128i *) dest);
}
#endif

// Deallocates dynarec buffers for SSE2.
void arch_rsp_destroy(struct rsp *rsp) {}

// Allocates dynarec buffers for SSE2.
int arch_rsp_init(struct rsp *rsp) { return 0; }

#ifndef __SSSE3__
__m128i rsp_vect_load_and_shuffle_operand(
  const uint16_t *src, unsigned element) {
  uint16_t word_lo, word_hi;
  uint64_t dword;

  // element => 0w ... 7w
  if (element >= 8) {
    memcpy(&word_lo, src + (element - 8), sizeof(word_lo));
    dword = word_lo | ((uint32_t) word_lo << 16);

    return _mm_shuffle_epi32(_mm_loadl_epi64((__m128i *) &dword),
      _MM_SHUFFLE(0,0,0,0));
  }
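  // The remaining branches emulate the SSSE3 shuffle_keys entries with
  // plain SSE2 shuffles: the half specifiers (0h ... 3h) replicate one
  // element across each 64-bit half of the vector, and the quarter
  // specifiers (0q/1q) duplicate every second element into the adjacent
  // slot.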
  // element => 0h ... 3h
  else if (element >= 4) {
    __m128i v;

    memcpy(&word_hi, src + element - 0, sizeof(word_hi));
    memcpy(&word_lo, src + element - 4, sizeof(word_lo));
    dword = word_lo | ((uint32_t) word_hi << 16);

    v = _mm_loadl_epi64((__m128i *) &dword);
    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(1,1,0,0));
    return _mm_shuffle_epi32(v, _MM_SHUFFLE(1,1,0,0));
  }

  // element => 0q ... 1q
  else if (element >= 2) {
    __m128i v = rsp_vect_load_unshuffled_operand(src);

    if (element == 2) {
      v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2,2,0,0));
      v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(2,2,0,0));
    }

    else {
      v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3,3,1,1));
      v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(3,3,1,1));
    }

    return v;
  }

  return rsp_vect_load_unshuffled_operand(src);
}
#endif

//
// SSSE3+ accelerated loads for group I. Byteswap big-endian to 2-byte
// little-endian vector. Start at vector element offset, discarding any
// wraparound as necessary.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
//       wraparound. Do we just discard the data, as below, or does the
//       data effectively get rotated around the edge of the vector?
//
void rsp_vload_group1(struct rsp *rsp, uint32_t addr, unsigned element,
  uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  __m128i ekey, data;

  unsigned offset = addr & 0x7;
  unsigned ror = offset - element;

  // Always load in 8-byte chunks to emulate wraparound.
  if (offset) {
    uint32_t aligned_addr_lo = addr & ~0x7;
    uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
    __m128i temp;

    data = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_lo));
    temp = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_hi));
    data = _mm_unpacklo_epi64(data, temp);
  }

  else
    data = _mm_loadl_epi64((__m128i *) (rsp->mem + addr));

  // Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
  dqm = sse2_pshufb(dqm, sll_b2l_keys[element]);
#else
  ekey = _mm_load_si128((__m128i *) (sll_b2l_keys[element]));
  dqm = _mm_shuffle_epi8(dqm, ekey);
#endif

  // Align the data to the DQM so we can mask it in.
#ifndef __SSSE3__
  data = sse2_pshufb(data, ror_b2l_keys[ror & 0xF]);
#else
  ekey = _mm_load_si128((__m128i *) (ror_b2l_keys[ror & 0xF]));
  data = _mm_shuffle_epi8(data, ekey);
#endif

  // Mask and mux in the data.
#ifdef __SSE4_1__
  reg = _mm_blendv_epi8(reg, data, dqm);
#else
  data = _mm_and_si128(dqm, data);
  reg = _mm_andnot_si128(dqm, reg);
  reg = _mm_or_si128(data, reg);
#endif

  _mm_store_si128((__m128i *) regp, reg);
}

//
// SSSE3+ accelerated loads for group II.
//
// TODO: Reverse-engineer what happens when loads to vector elements must
//       wraparound. Do we just discard the data, as below, or does the
//       data effectively get rotated around the edge of the vector?
//
// TODO: Reverse-engineer what happens when element != 0.
//
void rsp_vload_group2(struct rsp *rsp, uint32_t addr, unsigned element,
  uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  unsigned offset = addr & 0x7;
  __m128i data, zero;

  // Always load in 8-byte chunks to emulate wraparound.
  if (offset) {
    uint32_t aligned_addr_lo = addr & ~0x7;
    uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
    uint64_t datalow, datahigh;

    memcpy(&datalow, rsp->mem + aligned_addr_lo, sizeof(datalow));
    memcpy(&datahigh, rsp->mem + aligned_addr_hi, sizeof(datahigh));

    // TODO: Test for endian issues?
    datahigh >>= ((8 - offset) << 3);
    datalow <<= (offset << 3);
    datalow = datahigh | datalow;

    data = _mm_loadl_epi64((__m128i *) &datalow);
  }

  else
    data = _mm_loadl_epi64((__m128i *) (rsp->mem + addr));
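
  // Note: _mm_unpacklo_epi8(zero, data) below places each loaded DMEM byte
  // in the high half of its 16-bit lane (byte << 8); for non-PACK requests,
  // the shift right by one then leaves the byte in bits 14..7.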
  // "Unpack" the data.
  zero = _mm_setzero_si128();
  data = _mm_unpacklo_epi8(zero, data);

  if (rsp->pipeline.exdf_latch.request.type != RSP_MEM_REQUEST_PACK)
    data = _mm_srli_epi16(data, 1);

  _mm_store_si128((__m128i *) regp, data);
}

//
// SSSE3+ accelerated loads for group IV. Byteswap big-endian to 2-byte
// little-endian vector. Stop loading at quadword boundaries.
//
// TODO: Reverse-engineer what happens when loads from vector elements
//       must wraparound (i.e., the address offset is small, starting
//       element is large).
//
void rsp_vload_group4(struct rsp *rsp, uint32_t addr, unsigned element,
  uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  uint32_t aligned_addr = addr & 0xFF0;
  unsigned offset = addr & 0xF;
  unsigned ror;

  __m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
  __m128i dkey;

  if (rsp->pipeline.exdf_latch.request.type == RSP_MEM_REQUEST_QUAD)
    ror = 16 - element + offset;

  // TODO: How is this adjusted for LRV when e != 0?
  else {
    dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);
    ror = 16 - offset;
  }

#ifndef __SSSE3__
  data = sse2_pshufb(data, ror_b2l_keys[ror & 0xF]);
  dqm = sse2_pshufb(dqm, ror_b2l_keys[ror & 0xF]);
#else
  dkey = _mm_load_si128((__m128i *) (ror_b2l_keys[ror & 0xF]));
  data = _mm_shuffle_epi8(data, dkey);
  dqm = _mm_shuffle_epi8(dqm, dkey);
#endif

  // Mask and mux in the data.
#ifdef __SSE4_1__
  data = _mm_blendv_epi8(reg, data, dqm);
#else
  data = _mm_and_si128(dqm, data);
  reg = _mm_andnot_si128(dqm, reg);
  data = _mm_or_si128(data, reg);
#endif

  _mm_store_si128((__m128i *) regp, data);
}

//
// SSSE3+ accelerated stores for group I. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
//       must wraparound. Do we just stop storing the data, or do we
//       continue storing from the front of the vector, as below?
//
void rsp_vstore_group1(struct rsp *rsp, uint32_t addr, unsigned element,
  uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  unsigned offset = addr & 0x7;
  unsigned ror = element - offset;
  __m128i ekey, data;

  // Shift the DQM up to the point where we mux in the data.
#ifndef __SSSE3__
  dqm = sse2_pshufb(dqm, sll_l2b_keys[offset]);
#else
  ekey = _mm_load_si128((__m128i *) (sll_l2b_keys[offset]));
  dqm = _mm_shuffle_epi8(dqm, ekey);
#endif

  // Rotate the reg to align with the DQM.
#ifndef __SSSE3__
  reg = sse2_pshufb(reg, ror_l2b_keys[ror & 0xF]);
#else
  ekey = _mm_load_si128((__m128i *) (ror_l2b_keys[ror & 0xF]));
  reg = _mm_shuffle_epi8(reg, ekey);
#endif

  // Always load in 8-byte chunks to emulate wraparound.
  if (offset) {
    uint32_t aligned_addr_lo = addr & ~0x7;
    uint32_t aligned_addr_hi = (aligned_addr_lo + 8) & 0xFFF;
    __m128i temp;

    data = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_lo));
    temp = _mm_loadl_epi64((__m128i *) (rsp->mem + aligned_addr_hi));
    data = _mm_unpacklo_epi64(data, temp);

    // Mask and mux in the data.
#ifdef __SSE4_1__
    data = _mm_blendv_epi8(data, reg, dqm);
#else
    data = _mm_andnot_si128(dqm, data);
    reg = _mm_and_si128(dqm, reg);
    data = _mm_or_si128(data, reg);
#endif

    _mm_storel_epi64((__m128i *) (rsp->mem + aligned_addr_lo), data);

    data = _mm_srli_si128(data, 8);
    _mm_storel_epi64((__m128i *) (rsp->mem + aligned_addr_hi), data);
  }

  else {
    data = _mm_loadl_epi64((__m128i *) (rsp->mem + addr));
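
    // Note: _mm_blendv_epi8(data, reg, dqm) takes bytes from reg wherever
    // the corresponding dqm byte has its most significant bit set; the SSE2
    // fallback computes (reg & dqm) | (data & ~dqm), which is equivalent
    // for the all-zeros/all-ones byte masks used here.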
    // Mask and mux in the data.
#ifdef __SSE4_1__
    data = _mm_blendv_epi8(data, reg, dqm);
#else
    data = _mm_andnot_si128(dqm, data);
    reg = _mm_and_si128(dqm, reg);
    data = _mm_or_si128(data, reg);
#endif

    _mm_storel_epi64((__m128i *) (rsp->mem + addr), data);
  }
}

//
// SSSE3+ accelerated stores for group II. Byteswap 2-byte little-endian
// vector back to big-endian. Start at vector element offset, wrapping
// around the edge of the vector as necessary.
//
// TODO: Reverse-engineer what happens when stores from vector elements
//       must wraparound. Do we just stop storing the data, or do we
//       continue storing from the front of the vector, as below?
//
// TODO: Reverse-engineer what happens when element != 0.
//
void rsp_vstore_group2(struct rsp *rsp, uint32_t addr, unsigned element,
  uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {

  // "Pack" the data.
  if (rsp->pipeline.exdf_latch.request.type != RSP_MEM_REQUEST_PACK)
    reg = _mm_slli_epi16(reg, 1);

  reg = _mm_srai_epi16(reg, 8);
  reg = _mm_packs_epi16(reg, reg);

  // TODO: Always store in 8-byte chunks to emulate wraparound.
  _mm_storel_epi64((__m128i *) (rsp->mem + addr), reg);
}

//
// SSSE3+ accelerated stores for group IV. Byteswap 2-byte little-endian
// vector back to big-endian. Stop storing at quadword boundaries.
//
void rsp_vstore_group4(struct rsp *rsp, uint32_t addr, unsigned element,
  uint16_t *regp, rsp_vect_t reg, rsp_vect_t dqm) {
  uint32_t aligned_addr = addr & 0xFF0;
  unsigned offset = addr & 0xF;
  unsigned rol = offset;

  __m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
  __m128i ekey;

  if (rsp->pipeline.exdf_latch.request.type == RSP_MEM_REQUEST_QUAD)
    rol -= element;

  // TODO: How is this adjusted for SRV when e != 0?
  else
    dqm = _mm_cmpeq_epi8(_mm_setzero_si128(), dqm);

#ifndef __SSSE3__
  reg = sse2_pshufb(reg, rol_l2b_keys[rol & 0xF]);
#else
  ekey = _mm_load_si128((__m128i *) (rol_l2b_keys[rol & 0xF]));
  reg = _mm_shuffle_epi8(reg, ekey);
#endif

  // Mask and mux out the data, write.
#ifdef __SSE4_1__
  data = _mm_blendv_epi8(data, reg, dqm);
#else
  reg = _mm_and_si128(dqm, reg);
  data = _mm_andnot_si128(dqm, data);
  data = _mm_or_si128(data, reg);
#endif

  _mm_store_si128((__m128i *) (rsp->mem + aligned_addr), data);
}