# orbital/tools/ida/analyze_user.py

#!/usr/bin/env python

import idaapi
import idautils
import json
import re
import sys

# Regex patterns
# Group 1 of every pattern captures the function name embedded in the string.
# Raw strings are used so that regex escapes such as \( and \) are not
# processed as (invalid) Python string escapes.
#
# "outside" patterns are consumed by rename_function_outside(): the string is
# referenced by code that runs after a call to the named function.
patterns_function_outside = [
    # Found in safemode.elf
    r"^([0-9A-Za-z:_]+) failed! \(result:%#x\)\n$",
    r"^([0-9A-Za-z:_]+)\([0-9A-Za-z_, ]*\) : %x\n$",
]
# "inside" patterns are consumed by rename_function_inside(): the string is
# referenced from within the named function itself.
patterns_function_inside = [
    # Found in safemode.elf
    r"^([0-9A-Za-z:_]+) failed! \(standby page is NULL\)\n$",
]

# Binary patterns
# Byte sequences in the textual form passed to idc.FindBinary;
# "??" stands for a wildcard byte.
patterns_syscall = [
    # mov rax, 0xXXXX; mov r10, rcx; syscall; jb $+0x5
    "48 C7 C0 ?? ?? 00 00 49 89 CA 0F 05",
]

### Utilities ###

def get_file_path(name):
    """
    Build the path of a file that lives next to the running script.
    The script directory is taken from sys.argv[0] with symlinks resolved,
    and `name` is appended to it.
    """
    # Imported here because the file-level imports do not include `os`
    import os
    script_path = os.path.realpath(sys.argv[0])
    script_base = os.path.dirname(script_path)
    return os.path.join(script_base, name)

### Helpers ###

def get_last_direct_call(block, strict=True):
    """
    Iterate the instructions of a block backwards until the first call.
    Return value:
    - Target address, if the call found has a second code xref.
    - 0x0 (NULL), in strict-mode, if the last call of the block has no such
      xref. This distinguishes it from the no-call scenario.
    - BADADDR, if no call was found in the block.
    Without strict-mode, calls lacking a second code xref are skipped and
    the scan continues with the preceding instructions.
    """
    for head in reversed(list(Heads(block.startEA, block.endEA))):
        instr = idautils.DecodeInstruction(head)
        # A head that fails to decode cannot be a call; skip it instead of
        # dereferencing None
        if instr is None:
            continue
        if instr.get_canon_mnem() != "call":
            continue
        # NOTE(review): with flow=1, refs[0] is presumably the ordinary flow
        # to the next instruction, so the call target shows up as refs[1].
        # Verify against idautils.CodeRefsFrom.
        refs = list(CodeRefsFrom(head, 1))
        if len(refs) > 1:
            return refs[1]
        if strict:
            return 0x0
    return BADADDR

def get_predecessors(blocks, blacklist=None):
    """
    Get set of predecessor blocks of a given collection of blocks.
    Each block must provide a preds() method returning an iterable.
    Optionally, the elements of `blacklist` (any iterable) are removed from
    the result. The default is None rather than a shared mutable set.
    """
    preds = set()
    for block in blocks:
        preds.update(block.preds())
    if blacklist:
        preds.difference_update(blacklist)
    return preds

def _take_unvisited(blocks, visited):
    """
    Return the blocks whose start address is not in `visited` yet, and add
    their start addresses to `visited`.
    """
    fresh = []
    for block in blocks:
        if block.startEA in visited:
            continue
        visited.add(block.startEA)
        fresh.append(block)
    return fresh

def rename_function_outside(name, string_ea):
    """
    Rename the function that is called before the code referencing a string.
    For each xref to `string_ea` located inside a function, the predecessors
    of the referencing basic block are scanned level by level (direct
    predecessors first) until one level contains a call. Targets reported by
    get_last_direct_call() at that level become candidates.
    The rename is only performed if all xrefs together yield exactly one
    candidate; otherwise a message is printed and nothing is renamed.
    """
    functions = set()
    # Get set of functions called prior to basic blocks that xref the string
    for instr_xref in XrefsTo(string_ea):
        instr_ea = instr_xref.frm
        func = idaapi.get_func(instr_ea)
        if not func:
            continue
        cfg = idaapi.FlowChart(func, flags=ida_gdl.FC_PREDS)
        # Visited blocks are tracked by start address.
        # NOTE(review): FlowChart appears to create a new block object on
        # every lookup, in which case set membership of block objects would
        # never recognise a revisited block and a call-free cycle in the CFG
        # would loop forever. Confirm against ida_gdl.BasicBlock.
        visited = set()
        preds = []
        # Get predecessor block(s) of the block containing the xref
        for block in cfg:
            if block.startEA <= instr_ea < block.endEA:
                visited.add(block.startEA)
                preds = _take_unvisited(get_predecessors([block]), visited)
                break
        # Scan predecessors level by level for last direct calls
        found = False
        while preds:
            for pred in preds:
                target_ea = get_last_direct_call(pred)
                if target_ea != BADADDR:
                    # 0x0 marks a call without direct target: it ends the
                    # search but contributes no candidate
                    if target_ea != 0x0:
                        functions.add(target_ea)
                    found = True
            # Exit on candidates; the loop condition handles "no predecessors"
            if found:
                break
            # Update predecessors
            preds = _take_unvisited(get_predecessors(preds), visited)

    # Ensure we only have exactly one candidate function
    if len(functions) != 1:
        print("None or multiple candidates detected @ string:0x%X" % (string_ea))
        return
    # Rename the candidate function
    func_ea = next(iter(functions))
    print("Renaming function 0x%X to %s" % (func_ea, name))
    idc.MakeNameEx(func_ea, name, idc.SN_NOWARN)

def rename_function_inside(name, string_ea):
    """
    Rename the function that contains the code referencing a string.
    All xrefs to `string_ea` are mapped to the start address of their
    enclosing function. The rename is only performed if exactly one function
    results; otherwise a message is printed and nothing is renamed.
    """
    functions = set()
    # Get set of functions that contain xrefs the string
    for instr_xref in XrefsTo(string_ea):
        instr_ea = instr_xref.frm
        func = idaapi.get_func(instr_ea)
        # Xrefs that do not belong to any function are ignored, same as in
        # rename_function_outside
        if not func:
            continue
        functions.add(func.startEA)
    # Ensure we only have exactly one candidate function
    if len(functions) != 1:
        print("None or multiple candidates detected @ string:0x%X" % (string_ea))
        return
    # Rename the candidate function
    func_ea = next(iter(functions))
    print("Renaming function 0x%X to %s" % (func_ea, name))
    idc.MakeNameEx(func_ea, name, idc.SN_NOWARN)


### Analysis ###

def analyze_functions():
    """
    Reconstruct function names from strings.
    Every string reported by idautils.Strings() is matched against the
    "outside" patterns first, then against the "inside" patterns. On a match,
    group 1 and the string address are handed to the corresponding rename
    helper.
    """
    stages = (
        (patterns_function_outside, rename_function_outside),
        (patterns_function_inside, rename_function_inside),
    )
    for patterns, rename in stages:
        for pattern in patterns:
            regex = re.compile(pattern)
            # The string list is enumerated anew for every pattern
            for item in idautils.Strings():
                hit = regex.match(str(item))
                if not hit:
                    continue
                rename(hit.group(1), item.ea)

def analyze_syscalls():
    """
    Detect and rename syscall wrappers.
    Loads db_syscalls.json from the script directory, then searches for each
    pattern in patterns_syscall. A match is only considered when it is
    exactly the start of a function. The dword at offset 3 of the match is
    looked up (as a decimal string key) in the database; when an entry
    exists, the function is renamed to it.
    """
    with open(get_file_path('db_syscalls.json'), 'r') as handle:
        names_by_id = json.load(handle)
    for pattern in patterns_syscall:
        start = 0x0
        cursor = idc.FindBinary(start + 1, idc.SEARCH_DOWN, pattern)
        while cursor != BADADDR:
            wrapper = idaapi.get_func(cursor)
            if wrapper and wrapper.startEA == cursor:
                # Offset 3 skips the three leading bytes "48 C7 C0"
                number = Dword(cursor + 0x3)
                label = names_by_id.get(str(number), None)
                if label:
                    idc.MakeNameEx(cursor, str(label), idc.SN_NOWARN)
            # Continue right behind the current match
            cursor = idc.FindBinary(cursor + 1, idc.SEARCH_DOWN, pattern)

def analyze_nids():
    """
    Placeholder stage that does nothing.
    TODO: Not yet implemented, boot userland executables don't use them.
    """
    return None

def analyze_qwords():
    """
    Transform every potential user pointer to a qword.
    The user range spans from the lowest segment start to the highest
    segment end. Each segment is then walked in steps of 8 bytes; locations
    whose item is smaller than 8 bytes and whose qword value lies within the
    user range are turned into qwords.
    """
    # Get user boundaries
    bases = list(Segments())
    user_start = min([BADADDR] + [SegStart(base) for base in bases])
    user_stop = max([0x0] + [SegEnd(base) for base in bases])

    # Segments are fetched again by index, one per enumerated segment
    for index, _ in enumerate(bases):
        seg = idaapi.getnseg(index)
        for ea in range(seg.startEA, seg.endEA, 8):
            # Items of 8 bytes or more are left untouched
            if get_item_size(ea) < 8:
                candidate = get_qword(ea)
                if user_start <= candidate < user_stop:
                    create_qword(ea)

def analyze_prologues():
    """
    Create functions at solitary prologues.
    Every code segment is searched for the prologue byte pattern; a function
    is created at each match that is not already part of a function.
    Matches are considered within [segment start, segment end).
    """
    # Target prologue: push rbp; mov rbp, rsp
    # Only the first three bytes of that sequence are matched.
    pattern = "55 48 89"
    # For each user code segment
    for seg_ea in Segments():
        if ida_segment.segtype(seg_ea) != SEG_CODE:
            continue
        user_start = SegStart(seg_ea)
        user_stop = SegEnd(seg_ea)
        # Find solitary prologues. The first search starts at the segment
        # start itself so that a prologue on the very first byte is not
        # skipped.
        # NOTE(review): assumes FindBinary includes its start address in the
        # search; verify against the idc.FindBinary documentation.
        ea = idc.FindBinary(user_start, idc.SEARCH_DOWN, pattern)
        # Stop on search failure or on the first match outside the segment;
        # the segment end itself is already outside
        while ea != BADADDR and ea < user_stop:
            if idaapi.get_func(ea) is None:
                idc.MakeFunction(ea)
            ea = idc.FindBinary(ea + 1, idc.SEARCH_DOWN, pattern)

def analyze_types():
    """
    Apply function types from db_types.json.
    For every function in a code segment whose name does not start with
    'sub_', the declaration stored under that name is parsed and applied to
    the function. Declarations that fail to parse are reported and skipped.
    """
    with open(get_file_path('db_types.json'), 'r') as handle:
        decls = json.load(handle)
    parse_flags = 1 | 2 | 4 # PT_SIL | PT_NDC | PT_TYP
    for seg_ea in Segments():
        if ida_segment.segtype(seg_ea) != SEG_CODE:
            continue
        for func_addr in Functions(SegStart(seg_ea), SegEnd(seg_ea)):
            func_name = GetFunctionName(func_addr)
            # Names starting with 'sub_' are never looked up
            if func_name.startswith('sub_'):
                continue
            decl = decls.get(func_name, None)
            if not decl:
                continue
            decl = str(decl)
            parsed = parse_decl(decl, parse_flags)
            if parsed:
                # Elements 1 and 2 of the parse result are passed on
                ida_typeinf.apply_type(
                    None, parsed[1], parsed[2], func_addr, TINFO_DEFINITE)
            else:
                print("Failed to apply type: %s" % decl)

### Main ###

def main():
    """
    Run all analysis stages in sequence.
    The order of the stages is not arbitrary, it follows two rules:
    1. Analysis dependencies must be considered,
       e.g. first detect functions, then process functions.
    2. Analysis with higher success rate go last.
       e.g. first do pattern based search, then rename syscalls.
    """
    stages = (
        # Detection stage
        analyze_qwords,
        analyze_prologues,
        # Assigning names
        analyze_functions,
        analyze_syscalls,
        analyze_nids,
        # Assigning types
        analyze_types,
    )
    for stage in stages:
        stage()

if __name__ == '__main__':
    main()