Apple M1 Support for MacOS

This commit adds support for compiling Dolphin for ARM on MacOS so that it can
run natively on the M1 processors without running through Rosetta 2 emulation,
providing a 30-50% performance speedup and fewer hitches from Rosetta 2.

It consists of several key changes:

- Adding support for W^X allocation(MAP_JIT) for the ARM JIT
- Adding the machine context and config info to identify the M1 processor
- Additions to the build system and docs to support building universal binaries
- Adding code signing entitlements to access the MAP_JIT functionality
- Updating the MoltenVK libvulkan.dylib to a newer version with M1 support
This commit is contained in:
Skyler Saleh 2021-01-13 06:23:57 -08:00
parent f9b0225b69
commit 4ecb3084b7
18 changed files with 284 additions and 21 deletions

146
BuildMacOSUniveralBinary.py Normal file
View file

@ -0,0 +1,146 @@
"""
The current tooling supported in CMake, Homebrew, and QT5 are insufficient for creating
MacOSX universal binaries automatically for applications like Dolphin which have more
complicated build requirements (like different libraries, build flags and source files
for each target architecture).
So instead, this script manages the configuration and compilation of distinct builds
and project files for each target architecture and then merges the two binaries into
a single universal binary.
Running this script will:
1) Generate Xcode project files for the ARM build (if project files don't already exist)
2) Generate Xcode project files for the x64 build (if project files don't already exist)
3) Build the ARM project for the selected build_target
4) Build the x64 project for the selected build_target
5) Generates universal .app packages combining the ARM and x64 packages
6) Utilizes the lipo tool to combine the binary objects inside each of the packages
into universal binaries
7) Code signs the final universal binaries using the specified codesign_identity
"""
##BEGIN CONFIG##

# Directory that receives the merged universal binaries
dst_app = "universal/"

# Build target ("dolphin-emu" builds just the emulator and skips the tests)
build_target = "ALL_BUILD"

# pkg-config search paths for the ARM and x64 libraries.
# The defaults are the paths Homebrew uses by default.
arm_pkg_config_path = "/opt/homebrew/lib/pkgconfig"
x64_pkg_config_path = "/usr/local/lib/pkgconfig"

# Qt5 directories for the ARM and x64 libraries.
# The defaults are the paths Homebrew uses by default.
arm_qt5_path = "/opt/homebrew/opt/qt5"
x64_qt5_path = "/usr/local/opt/qt5"

# Identity used for code signing. "-" means the app is not cryptographically
# signed/notarized and only carries a SHA checksum that verifies its
# integrity. That does not protect against malicious actors, but it does
# protect against running corrupted binaries and grants access to the
# extended permissions needed for ARM builds.
# The value keeps its own double quotes because it is pasted into a shell command.
codesign_identity = '"-"'

##END CONFIG##
import glob
import sys
import os
import shutil
import filecmp
def _run_or_exit(cmd):
    """Run cmd through the shell and abort the script if it exits with a non-zero status."""
    if os.system(cmd) != 0:
        sys.exit("ERROR: command failed: " + cmd)


def _configure(build_dir, pkg_config_path, qt5_path, arch):
    """Generate Xcode project files inside build_dir unless that directory already exists.

    build_dir       -- directory (relative to the current one) that holds the project files
    pkg_config_path -- value exported as PKG_CONFIG_PATH for the cmake run
    qt5_path        -- value exported as Qt5_DIR for the cmake run
    arch            -- architecture name, used for CMAKE_OSX_ARCHITECTURES and `arch -<arch>`
    """
    if os.path.exists(build_dir):
        return
    os.mkdir(build_dir)
    os.chdir(build_dir)
    try:
        # A failed configure must stop the script: building and merging
        # without project files would only produce confusing follow-up errors.
        # The directory is left in place for inspection; delete it to retry.
        _run_or_exit('PKG_CONFIG_PATH="' + pkg_config_path + '" Qt5_DIR="' + qt5_path
                     + '" CMAKE_OSX_ARCHITECTURES=' + arch
                     + ' arch -' + arch + ' cmake ../../ -G Xcode')
    finally:
        os.chdir("..")


# Configure ARM and x64 project files if they don't exist
_configure("arm", arm_pkg_config_path, arm_qt5_path, "arm64")
_configure("x64", x64_pkg_config_path, x64_qt5_path, "x86_64")

# Build ARM and x64 projects; stop on the first failure so that stale or
# missing binaries are never merged and signed
for _build_dir in ("arm", "x64"):
    _run_or_exit('xcodebuild -project ' + _build_dir + '/dolphin-emu.xcodeproj -target "'
                 + build_target + '" -configuration Release')

# Merge ARM and x64 binaries into universal binaries
# Source binaries to merge together
src_app0 = "arm/Binaries/release"
src_app1 = "x64/Binaries/release"

# Start from an empty destination directory
if os.path.exists(dst_app):
    shutil.rmtree(dst_app)
os.mkdir(dst_app)
def lipo(path0, path1, dst):
    """Combine two single-architecture files into one universal file using lipo.

    path0, path1 -- the per-architecture input files
    dst          -- path of the universal output file

    The lipo command line is printed before it runs. If lipo exits with a
    non-zero status (for example because the inputs are not binaries it can
    merge), a warning is printed and path0 is copied to dst unmodified so the
    file is not silently missing from the merged tree.
    """
    cmd = 'lipo -create -output "' + dst + '" "' + path0 + '" "' + path1 + '"'
    print(cmd)
    if os.system(cmd) != 0:
        print("WARNING: lipo failed for " + path0 + " and " + path1
              + "; copying " + path0 + " unmodified")
        shutil.copy(path0, dst)
def _copy_entry(src, dst):
    """Copy src to dst, using copytree for directories and copy for everything else."""
    if os.path.isdir(src):
        shutil.copytree(src, dst)
    else:
        shutil.copy(src, dst)


def recursiveMergeBinaries(src0, src1, dst):
    """Merge the build trees src0 and src1 into the already existing directory dst.

    Every entry matched by "*" in the two directories is handled as follows:
      - directories present in both trees are created in dst and merged recursively
      - files present in both trees are copied when identical, otherwise combined with lipo()
      - entries present in only one tree are copied over as they are
      - symlinks are recreated in dst, pointing at the same location relative to their tree
    """
    # Resolve the tree roots once so the relative symlink targets computed
    # below are measured against the same (fully resolved) base as realpath()
    real_src0 = os.path.realpath(src0)
    real_src1 = os.path.realpath(src1)

    # Loop over all entries in src0
    for newpath0 in glob.glob(src0 + "/*"):
        filename = os.path.basename(newpath0)
        newpath1 = os.path.join(src1, filename)
        new_dst_path = os.path.join(dst, filename)
        if os.path.islink(newpath0):
            # Symlinks are recreated once all regular entries are in place
            continue
        if not os.path.exists(newpath1):
            # Copy entries (files or whole directories) that don't exist in src1
            _copy_entry(newpath0, new_dst_path)
        elif os.path.isdir(newpath1):
            # Recurse into directories
            os.mkdir(new_dst_path)
            recursiveMergeBinaries(newpath0, newpath1, new_dst_path)
        elif filecmp.cmp(newpath0, newpath1):
            # Copy files that are the same
            shutil.copy(newpath0, new_dst_path)
        else:
            # lipo together files that are different
            lipo(newpath0, newpath1, new_dst_path)

    # Loop over entries in src1 and copy missing things over to dst
    for newpath1 in glob.glob(src1 + "/*"):
        # The name must come from the src1 entry being visited
        filename = os.path.basename(newpath1)
        newpath0 = os.path.join(src0, filename)
        new_dst_path = os.path.join(dst, filename)
        if not os.path.exists(newpath0) and not os.path.islink(newpath1):
            _copy_entry(newpath1, new_dst_path)

    # Fix up symlinks for src0
    for newpath0 in glob.glob(src0 + "/*"):
        filename = os.path.basename(newpath0)
        new_dst_path = os.path.join(dst, filename)
        if os.path.islink(newpath0):
            relative_path = os.path.relpath(os.path.realpath(newpath0), real_src0)
            print(relative_path, new_dst_path)
            os.symlink(relative_path, new_dst_path)

    # Fix up symlinks for src1 (only those src0 did not already provide)
    for newpath1 in glob.glob(src1 + "/*"):
        filename = os.path.basename(newpath1)
        new_dst_path = os.path.join(dst, filename)
        newpath0 = os.path.join(src0, filename)
        if os.path.islink(newpath1) and not os.path.exists(newpath0):
            relative_path = os.path.relpath(os.path.realpath(newpath1), real_src1)
            print(relative_path, new_dst_path)
            os.symlink(relative_path, new_dst_path)
# Create the universal binary by merging the ARM and x64 build outputs
recursiveMergeBinaries(src_app0, src_app1, dst_app)

# Code sign everything directly inside the universal output directory
codesign_cmd = "codesign --deep --force -s " + codesign_identity + " " + dst_app + "/*"
os.system(codesign_cmd)

View file

@ -2,13 +2,13 @@
# General setup
#
cmake_minimum_required(VERSION 3.10)
set(CMAKE_OSX_ARCHITECTURES "x86_64")
# Minimum OS X version.
# This is inserted into the Info.plist as well.
# MacOS prior to 10.12 did not fully support C++17, which is used to
# handle configuration options
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.12.0" CACHE STRING "")
# MacOS prior to 10.14 did not fully support C++17, which is used to
# handle configuration options and aligned alloc
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14.0" CACHE STRING "")
set(CMAKE_USER_MAKE_RULES_OVERRIDE "CMake/FlagsOverride.cmake")

Binary file not shown.

View file

@ -64,6 +64,19 @@ will inform you if a bundled library is used or if you need to install any
missing packages yourself.
### macOS Build Steps:
A script is provided to build Universal binaries supporting both x64 and ARM in the same
application bundle using the following steps:
1. `mkdir build`
2. `cd build`
3. `python ../BuildMacOSUniveralBinary.py`
4. Universal binaries will be available in the `universal` folder
Doing this requires installation of library dependencies for both x64 and ARM (or universal library
equivalents) and may require modification of the config portion of the script to point to the
library locations.
A binary supporting a single architecture can be built as well using the following steps:
1. `mkdir build`
2. `cd build`

View file

@ -21,6 +21,9 @@
#ifdef _WIN32
#include <Windows.h>
#endif
#ifdef __APPLE__
#include <libkern/OSCacheControl.h>
#endif
namespace Arm64Gen
{
@ -342,7 +345,7 @@ void ARM64XEmitter::FlushIcacheSection(u8* start, u8* end)
if (start == end)
return;
#if defined(IOS)
#if defined(IOS) ||defined(__APPLE__)
// Header file says this is equivalent to: sys_icache_invalidate(start, end - start);
sys_cache_control(kCacheFunctionPrepareForExecution, start, end - start);
#elif defined(WIN32)

View file

@ -8,6 +8,7 @@
#include <string>
#include <thread>
#ifndef __APPLE__
#ifndef _WIN32
#ifndef __FreeBSD__
#include <asm/hwcap.h>
@ -15,6 +16,7 @@
#include <sys/auxv.h>
#include <unistd.h>
#endif
#endif
#include <fmt/format.h>
@ -71,7 +73,17 @@ void CPUInfo::Detect()
vendor = CPUVendor::ARM;
bFlushToZero = true;
#ifdef _WIN32
#ifdef __APPLE__
num_cores = std::thread::hardware_concurrency();
// M-series CPUs have all of these
bFP = true;
bASIMD = true;
bAES = true;
bSHA1 = true;
bSHA2 = true;
bCRC32 = true;
#elif defined(_WIN32)
num_cores = std::thread::hardware_concurrency();
// Windows does not provide any mechanism for querying the system registers on ARMv8, unlike Linux

View file

@ -16,6 +16,7 @@
#include <windows.h>
#include "Common/StringUtil.h"
#else
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
@ -38,9 +39,12 @@ void* AllocateExecutableMemory(size_t size)
#if defined(_WIN32)
void* ptr = VirtualAlloc(nullptr, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
#else
int map_flags =MAP_ANON | MAP_PRIVATE;
#if defined(_M_ARM_64) && defined(__APPLE__)
map_flags |= MAP_JIT;
#endif
void* ptr =
mmap(nullptr, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANON | MAP_PRIVATE, -1, 0);
mmap(nullptr, size, PROT_READ | PROT_WRITE |PROT_EXEC , map_flags, -1, 0);
if (ptr == MAP_FAILED)
ptr = nullptr;
#endif
@ -50,6 +54,35 @@ void* AllocateExecutableMemory(size_t size)
return ptr;
}
// Certain platforms (Mac OS X on ARM) enforce that a single thread can only have write or
// execute permissions to pages at any given point of time. The two below functions
// are used to toggle between having write permissions or execute permissions.
//
// The default state of these allocations in Dolphin is for them to be executable,
// but not writeable. So, functions that are updating these pages should wrap their
// writes like below:
// JITPageWriteEnableExecuteDisable();
// PrepareInstructionStreamForJIT();
// JITPageWriteDisableExecuteEnable();
// Lets a thread write to memory allocated for execution; while this is in effect the
// thread cannot execute that memory.
// Only does anything on ARM64 macOS builds running on macOS 11.0 or later; on every
// other configuration this function is empty.
void JITPageWriteEnableExecuteDisable()
{
#if defined(__APPLE__) && defined(_M_ARM_64)
  if (__builtin_available(macOS 11.0, *))
  {
    // 0 = JIT write protection off
    pthread_jit_write_protect_np(0);
  }
#endif
}
// Lets a thread execute memory allocated for execution; while this is in effect the
// thread cannot write to that memory.
// Only does anything on ARM64 macOS builds running on macOS 11.0 or later; on every
// other configuration this function is empty.
void JITPageWriteDisableExecuteEnable()
{
#if defined(__APPLE__) && defined(_M_ARM_64)
  if (__builtin_available(macOS 11.0, *))
  {
    // 1 = JIT write protection on
    pthread_jit_write_protect_np(1);
  }
#endif
}
void* AllocateMemoryPages(size_t size)
{

View file

@ -10,6 +10,10 @@
namespace Common
{
void* AllocateExecutableMemory(size_t size);
//Allows a thread to write to executable memory, but not execute the data.
void JITPageWriteEnableExecuteDisable();
//Allows a thread to execute memory allocated for execution, but not write to it.
void JITPageWriteDisableExecuteEnable();
void* AllocateMemoryPages(size_t size);
void FreeMemoryPages(void* ptr, size_t size);
void* AllocateAlignedMemory(size_t size, size_t alignment);

View file

@ -277,7 +277,8 @@ void DolphinAnalytics::MakeBaseBuilder()
builder.AddData("android-version", s_get_val_func("DEVICE_OS"));
#elif defined(__APPLE__)
builder.AddData("os-type", "osx");
//objc_msgSend is only available on x86
#ifndef _M_ARM_64
// id processInfo = [NSProcessInfo processInfo]
id processInfo = reinterpret_cast<id (*)(Class, SEL)>(objc_msgSend)(
objc_getClass("NSProcessInfo"), sel_getUid("processInfo"));
@ -298,6 +299,8 @@ void DolphinAnalytics::MakeBaseBuilder()
builder.AddData("osx-ver-minor", version.minor_version);
builder.AddData("osx-ver-bugfix", version.patch_version);
}
#endif
#elif defined(__linux__)
builder.AddData("os-type", "linux");
#elif defined(__FreeBSD__)

View file

@ -67,6 +67,12 @@ typedef x86_thread_state64_t SContext;
#define CTX_R14 __r14
#define CTX_R15 __r15
#define CTX_RIP __rip
#elif _M_ARM_64
typedef arm_thread_state64_t SContext;
#define CTX_REG(x) __x[x]
#define CTX_LR __x[30]
#define CTX_SP __sp
#define CTX_PC __pc
#else
#error No context definition for architecture
#endif

View file

@ -25,6 +25,20 @@
#include <unistd.h> // Needed for _POSIX_VERSION
#endif
// On Apple platforms the 64-bit thread-state type, its flavor constant and its word
// count carry architecture-specific names. Alias them to architecture-neutral names so
// the code using THREAD_STATE64, THREAD_STATE64_COUNT and thread_state64_t is shared
// between x86-64 and ARM64 builds. Any other architecture is rejected at compile time.
#if defined(__APPLE__)
#ifdef _M_X86_64
#define THREAD_STATE64_COUNT x86_THREAD_STATE64_COUNT
#define THREAD_STATE64 x86_THREAD_STATE64
#define thread_state64_t x86_thread_state64_t
#elif defined(_M_ARM_64)
#define THREAD_STATE64_COUNT ARM_THREAD_STATE64_COUNT
#define THREAD_STATE64 ARM_THREAD_STATE64
#define thread_state64_t arm_thread_state64_t
#else
#error Unsupported architecture
#endif
#endif
namespace EMM
{
#ifdef _WIN32
@ -123,7 +137,7 @@ static void ExceptionThread(mach_port_t port)
int64_t code[2];
int flavor;
mach_msg_type_number_t old_stateCnt;
natural_t old_state[x86_THREAD_STATE64_COUNT];
natural_t old_state[THREAD_STATE64_COUNT];
mach_msg_trailer_t trailer;
} msg_in;
@ -134,7 +148,7 @@ static void ExceptionThread(mach_port_t port)
kern_return_t RetCode;
int flavor;
mach_msg_type_number_t new_stateCnt;
natural_t new_state[x86_THREAD_STATE64_COUNT];
natural_t new_state[THREAD_STATE64_COUNT];
} msg_out;
#pragma pack()
memset(&msg_in, 0xee, sizeof(msg_in));
@ -165,13 +179,13 @@ static void ExceptionThread(mach_port_t port)
return;
}
if (msg_in.flavor != x86_THREAD_STATE64)
if (msg_in.flavor != THREAD_STATE64)
{
PanicAlertFmt("unknown flavor {} (expected {})", msg_in.flavor, x86_THREAD_STATE64);
PanicAlertFmt("unknown flavor {} (expected {})", msg_in.flavor, THREAD_STATE64);
return;
}
x86_thread_state64_t* state = (x86_thread_state64_t*)msg_in.old_state;
thread_state64_t* state = (thread_state64_t*)msg_in.old_state;
bool ok = JitInterface::HandleFault((uintptr_t)msg_in.code[1], state);
@ -184,9 +198,9 @@ static void ExceptionThread(mach_port_t port)
if (ok)
{
msg_out.RetCode = KERN_SUCCESS;
msg_out.flavor = x86_THREAD_STATE64;
msg_out.new_stateCnt = x86_THREAD_STATE64_COUNT;
memcpy(msg_out.new_state, msg_in.old_state, x86_THREAD_STATE64_COUNT * sizeof(natural_t));
msg_out.flavor = THREAD_STATE64;
msg_out.new_stateCnt = THREAD_STATE64_COUNT;
memcpy(msg_out.new_state, msg_in.old_state, THREAD_STATE64_COUNT * sizeof(natural_t));
}
else
{
@ -218,7 +232,7 @@ void InstallExceptionHandler()
// Debuggers set the task port, so we grab the thread port.
CheckKR("thread_set_exception_ports",
thread_set_exception_ports(mach_thread_self(), EXC_MASK_BAD_ACCESS, port,
EXCEPTION_STATE | MACH_EXCEPTION_CODES, x86_THREAD_STATE64));
EXCEPTION_STATE | MACH_EXCEPTION_CODES, THREAD_STATE64));
// ...and get rid of our copy so that MACH_NOTIFY_NO_SENDERS works.
CheckKR("mach_port_mod_refs",
mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, -1));

View file

@ -73,6 +73,8 @@ void JitArm64::Init()
bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
{
//Ifdef this since the exception handler runs on a separate thread on Mac OS X (ARM)
#if !defined(__APPLE__) && !defined(_M_ARM_64)
// We can't handle any fault from other threads.
if (!Core::IsCPUThread())
{
@ -80,6 +82,7 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
DoBacktrace(access_address, ctx);
return false;
}
#endif
bool success = false;
@ -124,11 +127,13 @@ void JitArm64::ClearCache()
m_handler_to_loc.clear();
blocks.Clear();
Common::JITPageWriteEnableExecuteDisable();
ClearCodeSpace();
farcode.ClearCodeSpace();
UpdateMemoryOptions();
GenerateAsm();
Common::JITPageWriteDisableExecuteEnable();
}
void JitArm64::Shutdown()
@ -596,6 +601,7 @@ void JitArm64::Jit(u32)
{
ClearCache();
}
Common::JITPageWriteEnableExecuteDisable();
std::size_t block_size = m_code_buffer.size();
const u32 em_address = PowerPC::ppcState.pc;
@ -624,6 +630,8 @@ void JitArm64::Jit(u32)
JitBlock* b = blocks.AllocateBlock(em_address);
DoJit(em_address, b, nextPC);
blocks.FinalizeBlock(*b, jo.enableBlocklink, code_block.m_physical_addresses);
Common::JITPageWriteDisableExecuteEnable();
}
void JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

View file

@ -59,11 +59,12 @@ void JitArm64BlockCache::WriteLinkBlock(Arm64Gen::ARM64XEmitter& emit,
void JitArm64BlockCache::WriteLinkBlock(const JitBlock::LinkData& source, const JitBlock* dest)
{
Common::JITPageWriteEnableExecuteDisable();
u8* location = source.exitPtrs;
ARM64XEmitter emit(location);
WriteLinkBlock(emit, source, dest);
Common::JITPageWriteDisableExecuteEnable();
emit.FlushIcache();
}
@ -71,9 +72,9 @@ void JitArm64BlockCache::WriteDestroyBlock(const JitBlock& block)
{
// Only clear the entry points as we might still be within this block.
ARM64XEmitter emit(block.checkedEntry);
Common::JITPageWriteEnableExecuteDisable();
while (emit.GetWritableCodePtr() <= block.normalEntry)
emit.BRK(0x123);
Common::JITPageWriteDisableExecuteEnable();
emit.FlushIcache();
}

View file

@ -289,6 +289,7 @@ bool JitArm64::HandleFastmemFault(uintptr_t access_address, SContext* ctx)
if ((const u8*)ctx->CTX_PC - fault_location > fastmem_area_length)
return false;
Common::JITPageWriteEnableExecuteDisable();
ARM64XEmitter emitter((u8*)fault_location);
emitter.BL(slow_handler_iter->second.slowmem_code);
@ -300,6 +301,8 @@ bool JitArm64::HandleFastmemFault(uintptr_t access_address, SContext* ctx)
m_fault_to_handler.erase(slow_handler_iter);
emitter.FlushIcache();
Common::JITPageWriteDisableExecuteEnable();
ctx->CTX_PC = reinterpret_cast<std::uintptr_t>(fault_location);
return true;
}

View file

@ -25,6 +25,8 @@ using namespace Arm64Gen;
void JitArm64::GenerateAsm()
{
Common::JITPageWriteEnableExecuteDisable();
// This value is all of the callee saved registers that we are required to save.
// According to the AAPCS64 we need to save R19 ~ R30 and Q8 ~ Q15.
const u32 ALL_CALLEE_SAVED = 0x7FF80000;
@ -197,6 +199,7 @@ void JitArm64::GenerateAsm()
GenerateCommonAsm();
FlushIcache();
Common::JITPageWriteDisableExecuteEnable();
}
void JitArm64::GenerateCommonAsm()

View file

@ -473,6 +473,8 @@ if(APPLE)
set_target_properties(dolphin-emu PROPERTIES
MACOSX_BUNDLE true
MACOSX_BUNDLE_INFO_PLIST ${CMAKE_CURRENT_SOURCE_DIR}/Info.plist.in
XCODE_ATTRIBUTE_CODE_SIGN_ENTITLEMENTS "${CMAKE_CURRENT_SOURCE_DIR}/DolphinEmu.entitlements"
XCODE_ATTRIBUTE_OTHER_CODE_SIGN_FLAGS "--deep"
OUTPUT_NAME Dolphin
)

View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.cs.allow-jit</key>
<true/>
<key>com.apple.security.cs.disable-library-validation</key>
<true/>
</dict>
</plist>

View file

@ -54,8 +54,10 @@ VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_at
: VertexLoaderBase(vtx_desc, vtx_att), m_float_emit(this)
{
AllocCodeSpace(4096);
Common::JITPageWriteEnableExecuteDisable();
ClearCodeSpace();
GenerateVertexLoader();
Common::JITPageWriteDisableExecuteEnable();
WriteProtect();
}