mirror of
https://github.com/n64dev/cen64.git
synced 2024-06-21 13:32:40 -04:00
(Finally) permit SSE2-only builds.
Add SSE2 codepaths where necessary (even if not complete), while still allowing the project to be compiled with SSSE3+ intrinsics.
This commit is contained in:
parent
3a24a67f1f
commit
316214d82d
|
@ -41,7 +41,7 @@ if (${CMAKE_C_COMPILER_ID} MATCHES GNU)
|
|||
endif()
|
||||
|
||||
if (NOT NATIVE_BUILD)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
|
||||
endif (NOT NATIVE_BUILD)
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/os/unix/x86_64)
|
||||
|
@ -108,7 +108,7 @@ if (${CMAKE_C_COMPILER_ID} MATCHES Clang)
|
|||
set(CEN64_ARCH_DIR "x86_64")
|
||||
|
||||
if (NOT NATIVE_BUILD)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mssse3")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse2")
|
||||
endif (NOT NATIVE_BUILD)
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/os/unix/x86_64)
|
||||
|
|
|
@ -9,8 +9,51 @@
|
|||
|
||||
#include "common.h"
|
||||
#include "arch/x86_64/rsp/rsp.h"
|
||||
#include "os/dynarec.h"
|
||||
#include "rsp/cpu.h"
|
||||
|
||||
// Deallocates dynarec buffers for SSE2.
|
||||
void arch_rsp_destroy(struct rsp *rsp) {
|
||||
#ifndef __SSSE3__
|
||||
free_dynarec_slab(&rsp->vstore_dynarec);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Allocates dynarec buffers for SSE2.
|
||||
int arch_rsp_init(struct rsp *rsp) {
|
||||
#ifndef __SSSE3__
|
||||
void *vload_buffer, *vstore_buffer;
|
||||
|
||||
// See rsp_vstore_dmem for code description.
|
||||
static const uint8_t vstore_code[] = {
|
||||
0x66, 0x0F, 0x73, 0xF8, 0x00,
|
||||
|
||||
0x66, 0x0F, 0x73, 0xD9, 0x00,
|
||||
0x66, 0x0F, 0x73, 0xFA, 0x00,
|
||||
0x66, 0x0F, 0xEB, 0xCA,
|
||||
|
||||
0x66, 0x0F, 0xDB, 0xC8,
|
||||
0x66, 0x0F, 0xDF, 0xC3,
|
||||
0x66, 0x0F, 0xEB, 0xC1,
|
||||
|
||||
0xC3
|
||||
};
|
||||
|
||||
if ((vload_buffer = alloc_dynarec_slab(
|
||||
&rsp->vload_dynarec, CACHE_LINE_SIZE)) == NULL)
|
||||
return 1;
|
||||
|
||||
if ((vstore_buffer = alloc_dynarec_slab(
|
||||
&rsp->vstore_dynarec, CACHE_LINE_SIZE)) == NULL) {
|
||||
free_dynarec_slab(&rsp->vload_dynarec);
|
||||
return 1;
|
||||
}
|
||||
|
||||
memcpy(vstore_buffer, vstore_code, sizeof(vstore_code));
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __SSSE3__
|
||||
cen64_align(const uint16_t shuffle_keys[16][8], CACHE_LINE_SIZE) = {
|
||||
/* -- */ {0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E},
|
||||
|
@ -66,7 +109,9 @@ __m128i rsp_vect_load_and_shuffle_operand(
|
|||
__m128i vlo, vhi;
|
||||
int i;
|
||||
|
||||
for (i = -2; i < 6; i += 2)
|
||||
dword = src[element - 2];
|
||||
|
||||
for (i = -1; i < 6; i += 2)
|
||||
dword = (dword << 16) | src[element + i];
|
||||
|
||||
vlo = _mm_loadl_epi64((__m128i *) &dword);
|
||||
|
@ -133,7 +178,7 @@ cen64_align(const uint16_t srl_b2l_keys[16][8], CACHE_LINE_SIZE) = {
|
|||
};
|
||||
|
||||
//
|
||||
// Accelerated loads. Byteswap big-endian to 2-byte little-endian
|
||||
// SSSE3+ accelerated loads. Byteswap big-endian to 2-byte little-endian
|
||||
// vector. Start at vector element offset, discarding any wraparound
|
||||
// as necessary. Lastly, don't load across cacheline boundary.
|
||||
//
|
||||
|
@ -181,6 +226,31 @@ __m128i rsp_vload_dmem(struct rsp *rsp,
|
|||
|
||||
return data;
|
||||
}
|
||||
#else
|
||||
//
|
||||
// SSE2 accelerated loads. Byteswap big-endian to 2-byte little-endian
|
||||
// vector. Start at vector element offset, discarding any wraparound
|
||||
// as necessary. Lastly, don't load across cacheline boundary.
|
||||
//
|
||||
// TODO: Verify wraparound behavior.
|
||||
// TODO: Only tested for L{B/S/L/D/Q}V
|
||||
//
|
||||
__m128i rsp_vload_dmem(struct rsp *rsp,
|
||||
uint32_t addr, unsigned element, __m128i reg, __m128i dqm) {
|
||||
__m128i datah, datal;
|
||||
|
||||
unsigned doffset = addr & 0xF;
|
||||
uint32_t aligned_addr = addr & 0xFF0;
|
||||
|
||||
__m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
|
||||
|
||||
// TODO: Implement this correctly.
|
||||
datah = _mm_slli_epi16(data, 8);
|
||||
datal = _mm_srli_epi16(data, 8);
|
||||
data = _mm_or_si128(datah, datal);
|
||||
|
||||
return data;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __SSSE3__
|
||||
|
@ -236,7 +306,7 @@ cen64_align(const uint16_t sll_l2b_keys[16][8], CACHE_LINE_SIZE) = {
|
|||
};
|
||||
|
||||
//
|
||||
// Accelerated stores. Byteswap 2-byte little-endian vector back
|
||||
// SSSE3+ accelerated stores. Byteswap 2-byte little-endian vector back
|
||||
// to big-endian. Start at vector element offset, wrapping around
|
||||
// as necessary. Lastly, only store up to the cacheline boundary.
|
||||
//
|
||||
|
@ -268,5 +338,70 @@ void rsp_vstore_dmem(struct rsp *rsp,
|
|||
|
||||
_mm_store_si128((__m128i *) (rsp->mem + aligned_addr), data);
|
||||
}
|
||||
|
||||
#else
|
||||
//
|
||||
// SSE2 accelerated stores. Byteswap 2-byte little-endian vector back
|
||||
// to big-endian. Start at vector element offset, wrapping around
|
||||
// as necessary. Lastly, only store up to the cacheline boundary.
|
||||
//
|
||||
// TODO: Verify wraparound behavior.
|
||||
// TODO: Only tested for L{B/S/L/D/Q}V
|
||||
//
|
||||
void rsp_vstore_dmem(struct rsp *rsp,
|
||||
uint32_t addr, unsigned element, __m128i reg, __m128i dqm) {
|
||||
__m128i dqmh, dqml, regh, regl;
|
||||
|
||||
unsigned doffset = addr & 0xF;
|
||||
unsigned eoffset = (doffset - element) & 0xF;
|
||||
uint32_t aligned_addr = addr & 0xFF0;
|
||||
|
||||
__m128i data = _mm_load_si128((__m128i *) (rsp->mem + aligned_addr));
|
||||
|
||||
// Byteswap both vectors, first.
|
||||
dqmh = _mm_slli_epi16(dqm, 8);
|
||||
dqml = _mm_srli_epi16(dqm, 8);
|
||||
dqm = _mm_or_si128(dqmh, dqml);
|
||||
|
||||
regh = _mm_slli_epi16(reg, 8);
|
||||
regl = _mm_srli_epi16(reg, 8);
|
||||
reg = _mm_or_si128(regh, regl);
|
||||
|
||||
//
|
||||
// Since SSE2 only provides "fixed immediate" shuffles:
|
||||
// Patch/call a dynarec buffer that does the following:
|
||||
//
|
||||
// Given:
|
||||
// xmm0 = byteswapped dqm
|
||||
// xmm1 = byteswapped reg
|
||||
// xmm2 = byteswapped reg
|
||||
// xmm3 = data [dmem]
|
||||
//
|
||||
// Perform:
|
||||
// 1) [xmm0 -> xmm0] Shift left:
|
||||
// 66 0f 73 f8 0# pslldq $0x#,%xmm0
|
||||
//
|
||||
// 2) [xmm1,xmm2 -> xmm1] Rotate left:
|
||||
// 66 0F 73 D9 0# psrldq $0x#,%xmm1
|
||||
// 66 0F 73 FA 0# pslldq $0x#,%xmm2
|
||||
// 66 0F EB CA por %xmm2,%xmm1
|
||||
//
|
||||
// 3) Mask/mux reg and data:
|
||||
// 66 0F DB C8 pand %xmm0,%xmm1
|
||||
// 66 0F DF C3 pandn %xmm3,%xmm0
|
||||
// 66 0F EB C1 por %xmm1,%xmm0
|
||||
//
|
||||
// 4) Return from dynarec:
|
||||
// C3 retq
|
||||
//
|
||||
rsp->vstore_dynarec.ptr[4] = doffset;
|
||||
rsp->vstore_dynarec.ptr[9] = eoffset;
|
||||
rsp->vstore_dynarec.ptr[14] = 16 - eoffset;
|
||||
|
||||
data = ((__m128i (*)(__m128i, __m128i, __m128i, __m128i))
|
||||
rsp->vstore_dynarec.ptr)(dqm, reg, reg, data);
|
||||
|
||||
_mm_store_si128((__m128i *) (rsp->mem + aligned_addr), data);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -26,6 +26,10 @@
|
|||
struct rsp;
|
||||
typedef __m128i rsp_vect_t;
|
||||
|
||||
// Gives the architecture backend a chance to initialize the RSP.
|
||||
void arch_rsp_destroy(struct rsp *rsp);
|
||||
int arch_rsp_init(struct rsp *rsp);
|
||||
|
||||
// Loads and shuffles a 16x8 vector according to element.
|
||||
#ifdef __SSSE3__
|
||||
extern const uint16_t shuffle_keys[16][8];
|
||||
|
|
|
@ -11,15 +11,6 @@
|
|||
#ifndef __common_h__
|
||||
#define __common_h__
|
||||
|
||||
// TODO: Remove me after SSE2 support is added.
|
||||
#ifndef __SSE3__
|
||||
#define __SSE3__
|
||||
#endif
|
||||
|
||||
#ifndef __SSSE3__
|
||||
#define __SSSE3__
|
||||
#endif
|
||||
|
||||
#define tostring(s) #s
|
||||
#define stringify(s) tostring(s)
|
||||
|
||||
|
|
8
device.c
8
device.c
|
@ -24,6 +24,7 @@
|
|||
#include "vi/controller.h"
|
||||
#include "vr4300/cpu.h"
|
||||
|
||||
cen64_cold static void device_destroy(struct cen64_device *device);
|
||||
cen64_cold static struct cen64_device *device_create(struct cen64_device *device,
|
||||
uint8_t *ram, const struct rom_file *pifrom, const struct rom_file *cart);
|
||||
|
||||
|
@ -102,6 +103,11 @@ struct cen64_device *device_create(struct cen64_device *device,
|
|||
return device;
|
||||
}
|
||||
|
||||
// Cleans up memory allocated for the device.
|
||||
void device_destroy(struct cen64_device *device) {
|
||||
// The RSP is the only component torn down here.
rsp_destroy(&device->rsp);
|
||||
}
|
||||
|
||||
// Called when we should (probably?) leave simulation.
|
||||
// After calling this function, we return to device_runmode_*.
|
||||
void device_request_exit(struct bus_controller *bus) {
|
||||
|
@ -202,6 +208,8 @@ int device_run(struct cen64_device *device, struct cen64_options *options,
|
|||
status = unlikely(options->extra_mode)
|
||||
? device_runmode_extra(device)
|
||||
: device_runmode_fast(device);
|
||||
|
||||
device_destroy(device);
|
||||
}
|
||||
|
||||
return status;
|
||||
|
|
24
os/dynarec.h
Normal file
24
os/dynarec.h
Normal file
|
@ -0,0 +1,24 @@
|
|||
//
|
||||
// os/dynarec.h
|
||||
//
|
||||
// Functions for allocating executable code buffers.
|
||||
//
|
||||
// This file is subject to the terms and conditions defined in
|
||||
// 'LICENSE', which is part of this source code package.
|
||||
//
|
||||
|
||||
#ifndef __os_dynarec_h__
|
||||
#define __os_dynarec_h__
|
||||
#include "common.h"
|
||||
#include <stddef.h>
|
||||
|
||||
// Describes one executable code buffer handed out by alloc_dynarec_slab
// and released by free_dynarec_slab.
struct dynarec_slab {
|
||||
// Length of the buffer in bytes, as requested at allocation time.
size_t size;
|
||||
// Base address of the buffer.
uint8_t *ptr;
|
||||
};
|
||||
|
||||
cen64_cold void *alloc_dynarec_slab(struct dynarec_slab *slab, size_t size);
|
||||
cen64_cold void free_dynarec_slab(struct dynarec_slab *slab);
|
||||
|
||||
#endif
|
||||
|
30
os/unix/dynarec.c
Normal file
30
os/unix/dynarec.c
Normal file
|
@ -0,0 +1,30 @@
|
|||
//
|
||||
// os/unix/dynarec.c
|
||||
//
|
||||
// Functions for allocating executable code buffers.
|
||||
//
|
||||
// This file is subject to the terms and conditions defined in
|
||||
// 'LICENSE', which is part of this source code package.
|
||||
//
|
||||
|
||||
#include "common.h"
|
||||
#include "os/dynarec.h"
|
||||
#include <sys/mman.h>
|
||||
|
||||
extern const int zero_page_fd;
|
||||
|
||||
// Allocates memory with execute permissions set.
|
||||
void *alloc_dynarec_slab(struct dynarec_slab *slab, size_t size) {
|
||||
if ((slab->ptr = mmap(NULL, size, PROT_EXEC | PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE, zero_page_fd, 0)) == MAP_FAILED)
|
||||
return NULL;
|
||||
|
||||
slab->size = size;
|
||||
return slab->ptr;
|
||||
}
|
||||
|
||||
// Frees memory acquired for a dynarec buffer.
|
||||
void free_dynarec_slab(struct dynarec_slab *slab) {
|
||||
munmap(slab->ptr, slab->size);
|
||||
}
|
||||
|
|
@ -14,48 +14,59 @@
|
|||
#include "os/main.h"
|
||||
#include "os/unix/glx_window.h"
|
||||
#include <fcntl.h>
|
||||
#include <stddef.h>
|
||||
#include <sys/mman.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
|
||||
struct ram_hunk {
|
||||
size_t size;
|
||||
void *ptr;
|
||||
int fd;
|
||||
};
|
||||
|
||||
cen64_cold static uint8_t *allocate_ram(struct ram_hunk *ram, size_t size);
|
||||
cen64_cold static void deallocate_ram(struct ram_hunk *ram, size_t size);
|
||||
cen64_cold static void deallocate_ram(struct ram_hunk *ram);
|
||||
|
||||
// Global file descriptor for allocations.
|
||||
#ifdef __linux__
|
||||
const char *zero_page_path = "/dev/zero";
|
||||
#else
|
||||
const char *zero_page_path = "/dev/null";
|
||||
#endif
|
||||
|
||||
int zero_page_fd;
|
||||
|
||||
// Allocates a large hunk of zeroed RAM.
|
||||
uint8_t *allocate_ram(struct ram_hunk *ram, size_t size) {
|
||||
#ifdef __linux__
|
||||
if ((ram->fd = open("/dev/zero", O_RDWR)) < 0)
|
||||
#else
|
||||
if ((ram->fd = open("/dev/null", O_RDWR)) < 0)
|
||||
#endif
|
||||
return NULL;
|
||||
|
||||
if ((ram->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE, ram->fd, 0)) == MAP_FAILED) {
|
||||
close(ram->fd);
|
||||
MAP_PRIVATE, zero_page_fd, 0)) == MAP_FAILED)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifndef __linux__
|
||||
memset(ram->ptr, 0, size);
|
||||
#endif
|
||||
ram->size = size;
|
||||
return ram->ptr;
|
||||
}
|
||||
|
||||
// Deallocates a large hunk of RAM.
|
||||
void deallocate_ram(struct ram_hunk *ram, size_t size) {
|
||||
munmap(ram->ptr, size);
|
||||
close(ram->fd);
|
||||
void deallocate_ram(struct ram_hunk *ram) {
|
||||
munmap(ram->ptr, ram->size);
|
||||
}
|
||||
|
||||
// Unix application entry point.
|
||||
int main(int argc, const char *argv[]) {
|
||||
return cen64_cmdline_main(argc, argv);
|
||||
cen64_cold int main(int argc, const char *argv[]) {
|
||||
int status;
|
||||
|
||||
if ((zero_page_fd = open(zero_page_path, O_RDWR)) < 0) {
|
||||
printf("Failed to open: %s\n", zero_page_path);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
status = cen64_cmdline_main(argc, argv);
|
||||
|
||||
close(zero_page_fd);
|
||||
return status;
|
||||
}
|
||||
|
||||
// Informs the simulation thread if an exit was requested.
|
||||
|
@ -92,14 +103,13 @@ int os_main(struct cen64_options *options,
|
|||
if (create_gl_window(&device.bus, &device.vi.gl_window, &hints)) {
|
||||
printf("Failed to create a window.\n");
|
||||
|
||||
deallocate_ram(&hunk, DEVICE_RAMSIZE);
|
||||
deallocate_ram(&hunk);
|
||||
return 1;
|
||||
}
|
||||
|
||||
status = device_run(&device, options, ram, pifrom, cart);
|
||||
destroy_gl_window(&device.vi.gl_window);
|
||||
deallocate_ram(&hunk, DEVICE_RAMSIZE);
|
||||
|
||||
deallocate_ram(&hunk);
|
||||
return status;
|
||||
}
|
||||
|
||||
|
|
29
os/windows/dynarec.c
Normal file
29
os/windows/dynarec.c
Normal file
|
@ -0,0 +1,29 @@
|
|||
//
|
||||
// os/windows/dynarec.c
|
||||
//
|
||||
// Functions for allocating executable code buffers.
|
||||
//
|
||||
// This file is subject to the terms and conditions defined in
|
||||
// 'LICENSE', which is part of this source code package.
|
||||
//
|
||||
|
||||
#include "common.h"
|
||||
#include "os/dynarec.h"
|
||||
#include <windows.h>
|
||||
|
||||
extern HANDLE dynarec_heap;
|
||||
|
||||
// Allocates memory with execute permissions set.
|
||||
void *alloc_dynarec_slab(struct dynarec_slab *slab, size_t size) {
|
||||
if ((slab->ptr = HeapAlloc(dynarec_heap, HEAP_ZERO_MEMORY, size)) == NULL)
|
||||
return NULL;
|
||||
|
||||
slab->size = size;
|
||||
return slab->ptr;
|
||||
}
|
||||
|
||||
// Frees memory acquired for a dynarec buffer.
|
||||
void free_dynarec_slab(struct dynarec_slab *slab) {
|
||||
// Return the block to the heap it was allocated from; slab itself is
// left untouched.
HeapFree(dynarec_heap, 0, slab->ptr);
|
||||
}
|
||||
|
|
@ -24,6 +24,8 @@ static int load_roms(const char *pifrom_path, const char *cart_path,
|
|||
static void hide_console(void);
|
||||
static void show_console(void);
|
||||
|
||||
HANDLE dynarec_heap;
|
||||
|
||||
// Windows application entry point.
|
||||
int WINAPI WinMain(HINSTANCE hInstance,
|
||||
HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nShowCmd) {
|
||||
|
@ -37,9 +39,18 @@ int WINAPI WinMain(HINSTANCE hInstance,
|
|||
return status;
|
||||
}
|
||||
|
||||
status = cen64_win32_main(__argc, __argv);
|
||||
if ((dynarec_heap = HeapCreate(HEAP_CREATE_ENABLE_EXECUTE, 0, 0)) == NULL) {
|
||||
MessageBox(NULL, "Failed to create the dynarec heap.", "CEN64",
|
||||
MB_OK | MB_ICONEXCLAMATION);
|
||||
|
||||
WSACleanup();
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
status = cen64_win32_main(__argc, __argv);
|
||||
HeapDestroy(dynarec_heap);
|
||||
WSACleanup();
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
|
11
rsp/cpu.c
11
rsp/cpu.c
|
@ -25,13 +25,18 @@ static void rsp_connect_bus(struct rsp *rsp, struct bus_controller *bus) {
|
|||
rsp->bus = bus;
|
||||
}
|
||||
|
||||
// Releases memory acquired for the RSP component.
|
||||
void rsp_destroy(struct rsp *rsp) {
|
||||
// Teardown is delegated to the architecture-specific backend.
arch_rsp_destroy(rsp);
|
||||
}
|
||||
|
||||
// Initializes the RSP component.
|
||||
cen64_cold int rsp_init(struct rsp *rsp, struct bus_controller *bus) {
|
||||
int rsp_init(struct rsp *rsp, struct bus_controller *bus) {
|
||||
rsp_connect_bus(rsp, bus);
|
||||
|
||||
rsp_cp0_init(rsp);
|
||||
|
||||
rsp_pipeline_init(&rsp->pipeline);
|
||||
return 0;
|
||||
|
||||
return arch_rsp_init(rsp);
|
||||
}
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#ifndef __rsp_cpu_h__
|
||||
#define __rsp_cpu_h__
|
||||
#include "common.h"
|
||||
#include "os/dynarec.h"
|
||||
#include "rsp/cp2.h"
|
||||
#include "rsp/pipeline.h"
|
||||
|
||||
|
@ -57,9 +58,14 @@ struct rsp {
|
|||
uint8_t mem[0x2000];
|
||||
|
||||
struct bus_controller *bus;
|
||||
|
||||
// TODO: Only for IA32/x86_64 SSE2; sloppy?
|
||||
struct dynarec_slab vload_dynarec;
|
||||
struct dynarec_slab vstore_dynarec;
|
||||
};
|
||||
|
||||
cen64_cold int rsp_init(struct rsp *rsp, struct bus_controller *bus);
|
||||
cen64_cold void rsp_destroy(struct rsp *rsp);
|
||||
|
||||
void rsp_cycle(struct rsp *rsp);
|
||||
|
||||
|
|
Loading…
Reference in a new issue