(Finally) permit SSE2-only builds.

Add SSE2 codepaths where necessary (even if not complete), while
still allowing the project to be compiled with SSSE3+ intrinsics.
This commit is contained in:
Tyler Stachecki 2014-11-10 14:29:13 -05:00
parent 3a24a67f1f
commit 316214d82d
12 changed files with 291 additions and 38 deletions

View file

@ -41,7 +41,7 @@ if (${CMAKE_C_COMPILER_ID} MATCHES GNU)
endif()
if (NOT NATIVE_BUILD)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
endif (NOT NATIVE_BUILD)
include_directories(${PROJECT_SOURCE_DIR}/os/unix/x86_64)
@ -108,7 +108,7 @@ if (${CMAKE_C_COMPILER_ID} MATCHES Clang)
set(CEN64_ARCH_DIR "x86_64")
if (NOT NATIVE_BUILD)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
endif (NOT NATIVE_BUILD)
include_directories(${PROJECT_SOURCE_DIR}/os/unix/x86_64)

View file

@ -9,8 +9,51 @@
#include "common.h"
#include "arch/x86_64/rsp/rsp.h"
#include "os/dynarec.h"
#include "rsp/cpu.h"
// Releases the dynarec buffers acquired by arch_rsp_init.
//
// Only builds without __SSSE3__ allocate these buffers, so this is a
// no-op otherwise. arch_rsp_init allocates one slab for vector loads
// and one for vector stores; both must be returned here.
void arch_rsp_destroy(struct rsp *rsp) {
#ifndef __SSSE3__
  free_dynarec_slab(&rsp->vload_dynarec);
  free_dynarec_slab(&rsp->vstore_dynarec);
#endif
}
// Prepares the architecture-specific state of the RSP.
//
// Builds without __SSSE3__ acquire two executable slabs of
// CACHE_LINE_SIZE bytes each and copy the vector-store code template
// into the vstore slab. Builds with __SSSE3__ have nothing to set up.
//
// Returns 0 on success, or 1 if either slab could not be allocated
// (in which case no slab is left allocated by this function).
int arch_rsp_init(struct rsp *rsp) {
#ifndef __SSSE3__
  // Code template used by rsp_vstore_dmem (see that function for the
  // full description). The zero immediates at byte offsets 4, 9 and 14
  // are overwritten by rsp_vstore_dmem before every call.
  static const uint8_t vstore_code[] = {
    0x66, 0x0F, 0x73, 0xF8, 0x00, // pslldq $0x0,%xmm0
    0x66, 0x0F, 0x73, 0xD9, 0x00, // psrldq $0x0,%xmm1
    0x66, 0x0F, 0x73, 0xFA, 0x00, // pslldq $0x0,%xmm2
    0x66, 0x0F, 0xEB, 0xCA,       // por    %xmm2,%xmm1
    0x66, 0x0F, 0xDB, 0xC8,       // pand   %xmm0,%xmm1
    0x66, 0x0F, 0xDF, 0xC3,       // pandn  %xmm3,%xmm0
    0x66, 0x0F, 0xEB, 0xC1,       // por    %xmm1,%xmm0
    0xC3                          // retq
  };

  void *load_slab, *store_slab;

  load_slab = alloc_dynarec_slab(&rsp->vload_dynarec, CACHE_LINE_SIZE);

  if (load_slab == NULL)
    return 1;

  store_slab = alloc_dynarec_slab(&rsp->vstore_dynarec, CACHE_LINE_SIZE);

  if (store_slab == NULL) {
    // Do not leak the slab that was already acquired.
    free_dynarec_slab(&rsp->vload_dynarec);
    return 1;
  }

  memcpy(store_slab, vstore_code, sizeof(vstore_code));
#endif

  return 0;
}
#ifdef __SSSE3__
cen64_align(const uint16_t shuffle_keys[16][8], CACHE_LINE_SIZE) = {
/* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
@ -66,7 +109,9 @@ __m128i rsp_vect_load_and_shuffle_operand(
__m128i vlo, vhi;
int i;
for (i = -2; i < 6; i += 2)
dword = src[element - 2];
for (i = -1; i < 6; i += 2)
dword = (dword << 16) | src[element + i];
vlo = _mm_loadl_epi64((__m128i *) &dword);
@ -133,7 +178,7 @@ cen64_align(const uint16_t srl_b2l_keys[16][8], CACHE_LINE_SIZE) = {
};
//
// Accelerated loads. Byteswap big-endian to 2-byte little-endian
// SSSE3+ accelerated loads. Byteswap big-endian to 2-byte little-endian
// vector. Start at vector element offset, discarding any wraparound
// as necessary. Lastly, don't load across cacheline boundary.
//
@ -181,6 +226,31 @@ __m128i rsp_vload_dmem(struct rsp *rsp,
return data;
}
#else
//
// SSE2 accelerated loads. Byteswap big-endian to 2-byte little-endian
// vector. The intended behavior is to start at the vector element
// offset, discard any wraparound as necessary, and not load across the
// cacheline boundary.
//
// Current state: only the byteswap is implemented. The 16 bytes at
// (addr & 0xFF0) in rsp->mem are loaded and the two bytes of every
// 16-bit lane are exchanged; element, reg and dqm are ignored.
//
// TODO: Implement the element offset/merge with reg under dqm.
// TODO: Verify wraparound behavior.
// TODO: Only tested for L{B/S/L/D/Q}V
//
__m128i rsp_vload_dmem(struct rsp *rsp,
  uint32_t addr, unsigned element, __m128i reg, __m128i dqm) {
  const uint32_t aligned_addr = addr & 0xFF0;
  __m128i data, datah, datal;

  // Not consulted until the TODOs above are addressed.
  (void) element;
  (void) reg;
  (void) dqm;

  data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));

  // Swap the bytes of each 16-bit lane: (x << 8) | (x >> 8).
  datah = _mm_slli_epi16(data, 8);
  datal = _mm_srli_epi16(data, 8);

  return _mm_or_si128(datah, datal);
}
#endif
#ifdef __SSSE3__
@ -236,7 +306,7 @@ cen64_align(const uint16_t sll_l2b_keys[16][8], CACHE_LINE_SIZE) = {
};
//
// Accelerated stores. Byteswap 2-byte little-endian vector back
// SSSE3+ accelerated stores. Byteswap 2-byte little-endian vector back
// to big-endian. Start at vector element offset, wrapping around
// as necessary. Lastly, only store up to the cacheline boundary.
//
@ -268,5 +338,70 @@ void rsp_vstore_dmem(struct rsp *rsp,
_mm_store_si128((__m128i *) (rsp->mem + aligned_addr), data);
}
#else
//
// SSE2 accelerated stores. Byteswap 2-byte little-endian vector back
// to big-endian. Start at vector element offset, wrapping around
// as necessary. Lastly, only store up to the cacheline boundary.
//
// TODO: Verify wraparound behavior.
// TODO: Only tested for S{B/S/L/D/Q}V
//
void rsp_vstore_dmem(struct rsp *rsp,
uint32_t addr, unsigned element, __m128i reg, __m128i dqm) {
// rsp:     RSP whose mem array is updated and whose vstore_dynarec
//          buffer is patched and executed.
// addr:    byte address of the store; only bits 0-11 are used.
// element: element offset; subtracted from the byte offset below.
// reg:     vector to store (2-byte little-endian lanes).
// dqm:     mask selecting which bytes are taken from reg; the
//          remaining bytes keep the value already in memory.
__m128i dqmh, dqml, regh, regl;
// Byte offset of addr within its 16-byte line.
unsigned doffset = addr & 0xF;
// Rotation amount for reg, modulo 16.
unsigned eoffset = (doffset - element) & 0xF;
// 16-byte aligned address, as required by the aligned load/store.
uint32_t aligned_addr = addr & 0xFF0;
__m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
// Byteswap both vectors, first.
// Each 16-bit lane becomes (x << 8) | (x >> 8).
dqmh = _mm_slli_epi16(dqm, 8);
dqml = _mm_srli_epi16(dqm, 8);
dqm = _mm_or_si128(dqmh, dqml);
regh = _mm_slli_epi16(reg, 8);
regl = _mm_srli_epi16(reg, 8);
reg = _mm_or_si128(regh, regl);
//
// Since SSE2 only provides "fixed immediate" shuffles:
// Patch/call a dynarec buffer that does the following:
//
// Given:
// xmm0 = byteswapped dqm
// xmm1 = byteswapped reg
// xmm2 = byteswapped reg
// xmm3 = data [dmem]
//
// Perform:
// 1) [xmm0 -> xmm0] Shift left (# = doffset):
// 66 0f 73 f8 0# pslldq $0x#,%xmm0
//
// 2) [xmm1,xmm2 -> xmm1] Rotate by eoffset bytes
//    (psrldq # = eoffset, pslldq # = 16 - eoffset):
// 66 0F 73 D9 0# psrldq $0x#,%xmm1
// 66 0F 73 FA 0# pslldq $0x#,%xmm2
// 66 0F EB CA por %xmm2,%xmm1
//
// 3) Mask/mux reg and data:
// 66 0F DB C8 pand %xmm0,%xmm1
// 66 0F DF C3 pandn %xmm3,%xmm0
// 66 0F EB C1 por %xmm1,%xmm0
//
// 4) Return from dynarec:
// C3 retq
//
// Bytes 4, 9 and 14 of the buffer are the immediate operands of the
// three byte-shift instructions listed above.
rsp->vstore_dynarec.ptr[4] = doffset;
rsp->vstore_dynarec.ptr[9] = eoffset;
rsp->vstore_dynarec.ptr[14] = 16 - eoffset;
// NOTE(review): the call relies on the calling convention placing the
// four __m128i arguments in xmm0-xmm3 and returning the result in xmm0,
// as the System V x86-64 ABI does -- confirm for other targets (e.g.
// the default Win64 convention).
data = ((__m128i (*)(__m128i, __m128i, __m128i, __m128i))
rsp->vstore_dynarec.ptr)(dqm, reg, reg, data);
// Write the merged line back to where it was loaded from.
_mm_store_si128((__m128i *) (rsp->mem + aligned_addr), data);
}
#endif

View file

@ -26,6 +26,10 @@
struct rsp;
typedef __m128i rsp_vect_t;
// Gives the architecture backend a chance to initialize the RSP.
void arch_rsp_destroy(struct rsp *rsp);
int arch_rsp_init(struct rsp *rsp);
// Loads and shuffles a 16x8 vector according to element.
#ifdef __SSSE3__
extern const uint16_t shuffle_keys[16][8];

View file

@ -11,15 +11,6 @@
#ifndef __common_h__
#define __common_h__
// TODO: Remove me after SSE2 support is added.
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#define tostring(s) #s
#define stringify(s) tostring(s)

View file

@ -24,6 +24,7 @@
#include "vi/controller.h"
#include "vr4300/cpu.h"
cen64_cold static void device_destroy(struct cen64_device *device);
cen64_cold static struct cen64_device *device_create(struct cen64_device *device,
uint8_t *ram, const struct rom_file *pifrom, const struct rom_file *cart);
@ -102,6 +103,11 @@ struct cen64_device *device_create(struct cen64_device *device,
return device;
}
// Cleans up memory allocated for the device.
// Currently this only tears down the RSP component via rsp_destroy.
void device_destroy(struct cen64_device *device) {
rsp_destroy(&device->rsp);
}
// Called when we should (probably?) leave simulation.
// After calling this function, we return to device_runmode_*.
void device_request_exit(struct bus_controller *bus) {
@ -202,6 +208,8 @@ int device_run(struct cen64_device *device, struct cen64_options *options,
status = unlikely(options->extra_mode)
? device_runmode_extra(device)
: device_runmode_fast(device);
device_destroy(device);
}
return status;

24
os/dynarec.h Normal file
View file

@ -0,0 +1,24 @@
//
// os/dynarec.h
//
// Functions for allocating executable code buffers.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#ifndef __os_dynarec_h__
#define __os_dynarec_h__
#include "common.h"
#include <stddef.h>
// Bookkeeping for one executable code buffer. Filled in by
// alloc_dynarec_slab and consumed by free_dynarec_slab.
struct dynarec_slab {
// Size of the buffer in bytes, as passed to alloc_dynarec_slab.
size_t size;
// Start of the buffer, addressed bytewise so code can be patched.
uint8_t *ptr;
};
cen64_cold void *alloc_dynarec_slab(struct dynarec_slab *slab, size_t size);
cen64_cold void free_dynarec_slab(struct dynarec_slab *slab);
#endif

30
os/unix/dynarec.c Normal file
View file

@ -0,0 +1,30 @@
//
// os/unix/dynarec.c
//
// Functions for allocating executable code buffers.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "common.h"
#include "os/dynarec.h"
#include <sys/mman.h>
extern const int zero_page_fd;
// Allocates memory with execute permissions set.
//
// Maps `size` bytes readable, writable and executable, backed by
// zero_page_fd, and records the mapping in `slab`.
//
// Returns the start of the mapping, or NULL on failure. On failure the
// slab is left empty (ptr == NULL, size == 0) rather than holding
// MAP_FAILED, so it can never be mistaken for a live mapping.
void *alloc_dynarec_slab(struct dynarec_slab *slab, size_t size) {
  void *mapping = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE,
    MAP_PRIVATE, zero_page_fd, 0);

  if (mapping == MAP_FAILED) {
    slab->ptr = NULL;
    slab->size = 0;
    return NULL;
  }

  slab->ptr = mapping;
  slab->size = size;
  return slab->ptr;
}
// Frees memory acquired for a dynarec buffer.
//
// A slab that holds no mapping (ptr is NULL, or MAP_FAILED left behind
// by a failed allocation) is ignored. After unmapping, the slab is
// reset so that releasing it a second time is harmless.
void free_dynarec_slab(struct dynarec_slab *slab) {
  if (slab->ptr == NULL || (void *) slab->ptr == MAP_FAILED)
    return;

  munmap(slab->ptr, slab->size);
  slab->ptr = NULL;
  slab->size = 0;
}

View file

@ -14,48 +14,59 @@
#include "os/main.h"
#include "os/unix/glx_window.h"
#include <fcntl.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
struct ram_hunk {
size_t size;
void *ptr;
int fd;
};
cen64_cold static uint8_t *allocate_ram(struct ram_hunk *ram, size_t size);
cen64_cold static void deallocate_ram(struct ram_hunk *ram, size_t size);
cen64_cold static void deallocate_ram(struct ram_hunk *ram);
// Global file descriptor for allocations.
#ifdef __linux__
const char *zero_page_path = "/dev/zero";
#else
const char *zero_page_path = "/dev/null";
#endif
int zero_page_fd;
// Allocates a large hunk of zeroed RAM.
uint8_t *allocate_ram(struct ram_hunk *ram, size_t size) {
#ifdef __linux__
if ((ram->fd = open("/dev/zero", O_RDWR)) < 0)
#else
if ((ram->fd = open("/dev/null", O_RDWR)) < 0)
#endif
return NULL;
if ((ram->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE, ram->fd, 0)) == MAP_FAILED) {
close(ram->fd);
MAP_PRIVATE, zero_page_fd, 0)) == MAP_FAILED)
return NULL;
}
#ifndef __linux__
memset(ram->ptr, 0, size);
#endif
ram->size = size;
return ram->ptr;
}
// Allocates a large hunk of RAM.
void deallocate_ram(struct ram_hunk *ram, size_t size) {
munmap(ram->ptr, size);
close(ram->fd);
void deallocate_ram(struct ram_hunk *ram) {
munmap(ram->ptr, ram->size);
}
// Unix application entry point.
int main(int argc, const char *argv[]) {
return cen64_cmdline_main(argc, argv);
cen64_cold int main(int argc, const char *argv[]) {
int status;
if ((zero_page_fd = open(zero_page_path, O_RDWR)) < 0) {
printf("Failed to open: %s\n", zero_page_path);
return EXIT_FAILURE;
}
status = cen64_cmdline_main(argc, argv);
close(zero_page_fd);
return status;
}
// Informs the simulation thread if an exit was requested.
@ -92,14 +103,13 @@ int os_main(struct cen64_options *options,
if (create_gl_window(&device.bus, &device.vi.gl_window, &hints)) {
printf("Failed to create a window.\n");
deallocate_ram(&hunk, DEVICE_RAMSIZE);
deallocate_ram(&hunk);
return 1;
}
status = device_run(&device, options, ram, pifrom, cart);
destroy_gl_window(&device.vi.gl_window);
deallocate_ram(&hunk, DEVICE_RAMSIZE);
deallocate_ram(&hunk);
return status;
}

29
os/windows/dynarec.c Normal file
View file

@ -0,0 +1,29 @@
//
// os/windows/dynarec.c
//
// Functions for allocating executable code buffers.
//
// This file is subject to the terms and conditions defined in
// 'LICENSE', which is part of this source code package.
//
#include "common.h"
#include "os/dynarec.h"
#include <windows.h>
extern HANDLE dynarec_heap;
// Allocates memory with execute permissions set.
//
// Takes `size` zero-filled bytes from dynarec_heap and records them in
// `slab`. Returns the start of the buffer, or NULL if the heap could
// not satisfy the request (slab->size is left untouched in that case).
void *alloc_dynarec_slab(struct dynarec_slab *slab, size_t size) {
  slab->ptr = HeapAlloc(dynarec_heap, HEAP_ZERO_MEMORY, size);

  if (slab->ptr != NULL)
    slab->size = size;

  return slab->ptr;
}
// Frees memory acquired for a dynarec buffer.
//
// An empty slab (ptr == NULL) is ignored. After the buffer is handed
// back to dynarec_heap the slab is reset, so a second release cannot
// pass a dangling pointer to HeapFree.
void free_dynarec_slab(struct dynarec_slab *slab) {
  if (slab->ptr == NULL)
    return;

  HeapFree(dynarec_heap, 0, slab->ptr);
  slab->ptr = NULL;
  slab->size = 0;
}

View file

@ -24,6 +24,8 @@ static int load_roms(const char *pifrom_path, const char *cart_path,
static void hide_console(void);
static void show_console(void);
HANDLE dynarec_heap;
// Windows application entry point.
int WINAPI WinMain(HINSTANCE hInstance,
HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd) {
@ -37,9 +39,18 @@ int WINAPI WinMain(HINSTANCE hInstance,
return status;
}
status = cen64_win32_main(__argc, __argv);
if ((dynarec_heap = HeapCreate(HEAP_CREATE_ENABLE_EXECUTE, 0, 0)) == NULL) {
MessageBox(NULL, "Failed to create the dynarec heap.", "CEN64",
MB_OK | MB_ICONEXCLAMATION);
WSACleanup();
return EXIT_FAILURE;
}
status = cen64_win32_main(__argc, __argv);
HeapDestroy(dynarec_heap);
WSACleanup();
return status;
}

View file

@ -25,13 +25,18 @@ static void rsp_connect_bus(struct rsp *rsp, struct bus_controller *bus) {
rsp->bus = bus;
}
// Releases memory acquired for the RSP component.
// All of the work is delegated to the architecture backend through
// arch_rsp_destroy.
void rsp_destroy(struct rsp *rsp) {
arch_rsp_destroy(rsp);
}
// Initializes the RSP component.
cen64_cold int rsp_init(struct rsp *rsp, struct bus_controller *bus) {
int rsp_init(struct rsp *rsp, struct bus_controller *bus) {
rsp_connect_bus(rsp, bus);
rsp_cp0_init(rsp);
rsp_pipeline_init(&rsp->pipeline);
return 0;
return arch_rsp_init(rsp);
}

View file

@ -11,6 +11,7 @@
#ifndef __rsp_cpu_h__
#define __rsp_cpu_h__
#include "common.h"
#include "os/dynarec.h"
#include "rsp/cp2.h"
#include "rsp/pipeline.h"
@ -57,9 +58,14 @@ struct rsp {
uint8_t mem[0x2000];
struct bus_controller *bus;
// TODO: Only for IA32/x86_64 SSE2; sloppy?
struct dynarec_slab vload_dynarec;
struct dynarec_slab vstore_dynarec;
};
cen64_cold int rsp_init(struct rsp *rsp, struct bus_controller *bus);
cen64_cold void rsp_destroy(struct rsp *rsp);
void rsp_cycle(struct rsp *rsp);