/******************************************************************************\ * Project: Basic MIPS R4000 Instruction Set for Scalar Unit Operations * * Authors: Iconoclast * * Release: 2018.03.17 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * * and related and neighboring rights to this software to the public domain * * worldwide. This software is distributed without any warranty. * * * * You should have received a copy of the CC0 Public Domain Dedication along * * with this software. * * If not, see . * \******************************************************************************/ #ifndef _SU_H_ #define _SU_H_ #include #include #include "my_types.h" #include "rsp.h" #define SEMAPHORE_LOCK_CORRECTIONS #define WAIT_FOR_CPU_HOST #if (0) #define SP_EXECUTE_LOG #define VU_EMULATE_SCALAR_ACCUMULATOR_READ #endif /* * Currently, the plugin system this module is written for doesn't notify us * of how much RDRAM is installed to the system, so we'll use signal handlers * to catch memory segment access faults in the trial search to find it out. */ extern unsigned long su_max_address; /* * Interact with memory using server-side byte order (MIPS big-endian) or * client-side (VM host's) native byte order on a 32-bit boundary. * * Unfortunately, most op-codes are optimized to require this to be TRUE. */ #if (ENDIAN_M == 0) #define USE_CLIENT_ENDIAN 0 #else #define USE_CLIENT_ENDIAN 1 #endif /* * Always keep this enabled for faster interpreter CPU. * * If you disable this, the branch delay slot algorithm will match the * documentation found in the MIPS manuals (which is not entirely accurate). * * Enabled: * while (CPU_running) { * PC = static_delay_slot_adjustments(); * switch (opcode) { ... continue; } * Disabled: * while (CPU_running) { * switch (opcode) { ... break; } * PC = documented_branch_delay_slot(); * continue; */ #if 1 #define EMULATE_STATIC_PC #endif #if (0 != 0) #define PROFILE_MODE static NOINLINE #else #define PROFILE_MODE static INLINE #endif typedef enum { zero = 0, at = 1, #ifdef TRUE_MIPS_AND_NOT_JUST_THE_RSP_SUBSET v0 = 2, v1 = 3, a0 = 4, a1 = 5, a2 = 6, a3 = 7, t0 = 8, t1 = 9, t2 = 10, t3 = 11, t4 = 12, t5 = 13, t6 = 14, t7 = 15, t8 = 24, t9 = 25, s0 = 16, s1 = 17, s2 = 18, s3 = 19, s4 = 20, s5 = 21, s6 = 22, s7 = 23, k0 = 26, k1 = 27, gp = 28, #endif sp = 29, fp = 30, /* new, official MIPS name for it: "frame pointer" */ ra = 31, NUMBER_OF_SCALAR_REGISTERS, S8 = fp /* older name for GPR $fp as of the R4000 ISA */ } GPR_specifier; extern RSP_INFO RSP_INFO_NAME; extern pu8 DRAM; extern pu8 DMEM; extern pu8 IMEM; extern u8 conf[]; /* * general-purpose scalar registers * * based on the MIPS instruction set architecture but without most of the * original register names (for example, no kernel-reserved registers) */ extern u32 SR[]; #define FIT_IMEM(PC) ((PC) & 0xFFFu & 0xFFCu) #ifdef EMULATE_STATIC_PC #define JUMP goto set_branch_delay #else #define JUMP break #endif #ifdef EMULATE_STATIC_PC #define BASE_OFF 0x000 #else #define BASE_OFF 0x004 #endif #ifndef EMULATE_STATIC_PC int stage; #endif #ifdef WAIT_FOR_CPU_HOST extern short MFC0_count[]; /* Keep one C0 MF status read count for each scalar register. */ #endif /* * The number of times to tolerate executing `MFC0 $at, $c4`. * Replace $at with any register--the timeout limit is per each. * * Set to a higher value to avoid prematurely quitting the interpreter. * Set to a lower value for speed...you could get away with 10 sometimes. */ extern int MF_SP_STATUS_TIMEOUT; #define SLOT_OFF ((BASE_OFF) + 0x000) #define LINK_OFF ((BASE_OFF) + 0x004) extern void set_PC(unsigned int address); /* * If the client CPU's shift amount is exactly 5 bits for a 32-bit source, * then omit emulating (sa & 31) in the SLL/SRL/SRA interpreter steps. * (Additionally, omit doing (GPR[rs] & 31) in SLLV/SRLV/SRAV.) * * As C pre-processor logic seems incapable of interpreting type storage, * stuff like #if (1U << 31 == 1U << ~0U) will generally just fail. * * Some of these also will only work assuming 2's complement (e.g., Intel). */ #if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON) #define MASK_SA(sa) (sa) #define IW_RD(inst) ((u16)(inst) >> 11) #define SIGNED_IMM16(imm) (s16)(imm) #else #define MASK_SA(sa) ((sa) & 31) #define IW_RD(inst) (u8)(((inst) >> 11) % (1 << 5)) #define SIGNED_IMM16(imm) (s16)(((imm) & 0x8000u) ? -(~(imm) + 1) : (imm)) #endif /* * If primary op-code is SPECIAL (000000), we could skip ANDing the rs shift. * Shifts losing precision are undefined, so don't assume that (1 >> 1 == 0). */ #if (0xFFFFFFFFul >> 31 != 0x000000001ul) || defined(_DEBUG) #define SPECIAL_DECODE_RS(inst) (((inst) & 0x03E00000UL) >> 21) #else #define SPECIAL_DECODE_RS(inst) ((inst) >> 21) #endif /* * Try to stick to (unsigned char) to conform to strict aliasing rules. * * Do not say `u8`. My custom type definitions are minimum-size types. * Do not say `uint8_t`. Exact-width types are not portable/universal. */ #if (CHAR_BIT != 8) #error Non-POSIX-compliant (char) storage width. #endif /* * RSP general-purpose registers (GPRs) are always 32-bit scalars (SRs). * SR_B(gpr, 0) is SR[gpr]31..24, and SR_B(gpr, 3) is SR[gpr]7..0. */ #define SR_B(scalar, i) *((unsigned char *)&(SR[scalar]) + BES(i)) /* * Universal byte-access macro for 8-element vectors of 16-bit halfwords. * Use this macro if you are not sure whether the element is odd or even. * * Maybe a typedef union{} can be better, but it's less readable for RSP * vector registers. Only 16-bit element computations exist, so the correct * allocation of the register file is int16_t v[32][8], not a_union v[32]. * * Either method--dynamic union reads or special aliasing--is undefined * behavior and will not truly be portable code anyway, so it hardly matters. */ #define VR_B(vt, element) *((unsigned char *)&(VR[vt][0]) + MES(element)) /* * Optimized byte-access macros for the vector registers. * Use these ONLY if you know the element is even (VR_A) or odd (VR_U). * * They are faster because LEA PTR [offset +/- 1] means fewer CPU * instructions generated than (offset ^ 1) does, in most cases. */ #define VR_A(vt, e) *((unsigned char *)&(VR[vt][0]) + e + MES(0)) #define VR_U(vt, e) *((unsigned char *)&(VR[vt][0]) + e - MES(0)) /* * Use this ONLY if you know the element is even, not odd. * * This is only provided for purposes of consistency with VR_B() and friends. * Saying `VR[vt][1] = x;` instead of `VR_S(vt, 2) = x` works as well. */ #define VR_S(vt, element) *(pi16)((unsigned char *)&(VR[vt][0]) + element) /*** Scalar, Coprocessor Operations (system control) ***/ #define SP_STATUS_HALT (0x00000001ul << 0) #define SP_STATUS_BROKE (0x00000001ul << 1) #define SP_STATUS_DMA_BUSY (0x00000001ul << 2) #define SP_STATUS_DMA_FULL (0x00000001ul << 3) #define SP_STATUS_IO_FULL (0x00000001ul << 4) #define SP_STATUS_SSTEP (0x00000001ul << 5) #define SP_STATUS_INTR_BREAK (0x00000001ul << 6) #define SP_STATUS_SIG0 (0x00000001ul << 7) #define SP_STATUS_SIG1 (0x00000001ul << 8) #define SP_STATUS_SIG2 (0x00000001ul << 9) #define SP_STATUS_SIG3 (0x00000001ul << 10) #define SP_STATUS_SIG4 (0x00000001ul << 11) #define SP_STATUS_SIG5 (0x00000001ul << 12) #define SP_STATUS_SIG6 (0x00000001ul << 13) #define SP_STATUS_SIG7 (0x00000001ul << 14) typedef enum { RCP_SP_MEM_ADDR_REG, RCP_SP_DRAM_ADDR_REG, RCP_SP_RD_LEN_REG, RCP_SP_WR_LEN_REG, RCP_SP_STATUS_REG, RCP_SP_DMA_FULL_REG, RCP_SP_DMA_BUSY_REG, RCP_SP_SEMAPHORE_REG, RCP_DPC_START_REG, RCP_DPC_END_REG, RCP_DPC_CURRENT_REG, RCP_DPC_STATUS_REG, RCP_DPC_CLOCK_REG, RCP_DPC_BUFBUSY_REG, RCP_DPC_PIPEBUSY_REG, RCP_DPC_TMEM_REG, NUMBER_OF_CP0_REGISTERS } CPR_specifier; extern pu32 CR[]; extern void SP_DMA_READ(void); extern void SP_DMA_WRITE(void); extern u16 rwR_VCE(void); extern void rwW_VCE(u16 VCE); extern void MFC2(unsigned int rt, unsigned int vs, unsigned int e); extern void MTC2(unsigned int rt, unsigned int vd, unsigned int e); extern void CFC2(unsigned int rt, unsigned int rd); extern void CTC2(unsigned int rt, unsigned int rd); /*** Modern pseudo-operations (not real instructions, but nice shortcuts) ***/ extern void ULW(unsigned int rd, u32 addr); extern void USW(unsigned int rs, u32 addr); /* * The scalar unit controls the primary R4000 operations implementation, * which inherently includes interfacing with the vector unit under COP2. * * Although no scalar unit operations are computational vector operations, * several of them will access machine states shared with the vector unit. * * We will need access to the vector unit's vector register file and its * vector control register file used mainly for vector select instructions. */ #include "vu/select.h" NOINLINE extern void res_S(void); extern void SP_CP0_MF(unsigned int rt, unsigned int rd); /* * example syntax (basically the same for all LWC2/SWC2 ops): * LTWV $v0[0], -64($at) * SBV $v0[9], 0xFFE($0) */ typedef void(*mwc2_func)( unsigned int vt, unsigned int element, signed int offset, unsigned int base ); extern mwc2_func LWC2[2 * 8*2]; extern mwc2_func SWC2[2 * 8*2]; extern void res_lsw( unsigned int vt, unsigned int element, signed int offset, unsigned int base ); /*** Scalar, Coprocessor Operations (vector unit, scalar cache transfers) ***/ extern void LBV(unsigned vt, unsigned element, signed offset, unsigned base); extern void LSV(unsigned vt, unsigned element, signed offset, unsigned base); extern void LLV(unsigned vt, unsigned element, signed offset, unsigned base); extern void LDV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SBV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SSV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SLV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SDV(unsigned vt, unsigned element, signed offset, unsigned base); /* * Group II vector loads and stores: * PV and UV (As of RCP implementation, XV and ZV are reserved opcodes.) */ extern void LPV(unsigned vt, unsigned element, signed offset, unsigned base); extern void LUV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SPV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SUV(unsigned vt, unsigned element, signed offset, unsigned base); /* * Group III vector loads and stores: * HV, FV, and AV (As of RCP implementation, AV opcodes are reserved.) */ extern void LHV(unsigned vt, unsigned element, signed offset, unsigned base); extern void LFV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SHV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SFV(unsigned vt, unsigned element, signed offset, unsigned base); /* * Group IV vector loads and stores: * QV and RV */ extern void LQV(unsigned vt, unsigned element, signed offset, unsigned base); extern void LRV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SQV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SRV(unsigned vt, unsigned element, signed offset, unsigned base); /* * Group V vector loads and stores * TV and SWV (As of RCP implementation, LTWV opcode was undesired.) */ extern void LTV(unsigned vt, unsigned element, signed offset, unsigned base); extern void SWV(unsigned vt, unsigned element, signed offset, unsigned base); extern void STV(unsigned vt, unsigned element, signed offset, unsigned base); NOINLINE extern void run_task(void); #endif