基于 Unicorn 实现一个轻量级的 ARM64 模拟器

版权归作者所有,如有转发,请注明文章出处:cyrus-studio.github.io/blog/

基于 Unicorn 实现一个轻量级的 ARM64 模拟器,具备代码加载、内存映射、指令执行、反汇编、寄存器监控、Hook、Patch、字符串处理等功能,适合用于逆向分析或调试 ARM64 代码。

初始化与内存管理

  • 代码加载:_load_binary() 将 so 文件的原始字节读入 self.CODE,随后由 _setup_memory() 写入模拟器的代码区。

  • 内存映射:在 _setup_memory() 中分配 10MB 的代码区和 1MB 的栈区。

  • 寄存器初始化:在 _setup_registers() 中设置栈指针(SP)和程序计数器(PC)。

  • 寄存器设置:提供了 set_x0()、set_x1() 和 set_x2() 等方法,用于直接设置寄存器值。

python 复制代码
import capstone
from unicorn import *
from unicorn.arm64_const import *


class ARM64Emulator:
    """Minimal Unicorn-based ARM64 emulator: load a file, map memory, set registers.

    The file named by ``so_file`` is read verbatim and copied to ``CODE_BASE``
    inside the code region; a stack region is mapped directly above it.
    """

    def __init__(self, so_file: str):
        """Create the Unicorn instance and prepare memory and registers.

        :param so_file: path of the binary whose raw bytes are loaded
        """
        self.so_file = so_file

        # Code region (TEXT segment)
        self.CODE_BASE = 0x000000  # assumed start address of the code
        self.CODE_SIZE = 1024 * 1024 * 10  # 10MB

        # Stack region, placed right after the code region
        self.STACK_BASE = self.CODE_BASE + self.CODE_SIZE
        self.STACK_SIZE = 1024 * 1024 * 1  # 1MB

        # Unicorn engine instance
        self.mu = Uc(UC_ARCH_ARM64, UC_MODE_ARM)

        self._load_binary()
        self._setup_memory()
        self._setup_registers()
        # _setup_hooks() is not defined in this excerpt; the article adds it
        # in the hook-management section.
        self._setup_hooks()

    def _load_binary(self):
        """Read the whole file at ``self.so_file`` into ``self.CODE``."""
        with open(self.so_file, "rb") as f:
            self.CODE = f.read()

    def _setup_memory(self):
        """Map the code and stack regions and copy ``self.CODE`` to ``CODE_BASE``."""
        self.mu.mem_map(self.CODE_BASE, self.CODE_SIZE)
        self.mu.mem_map(self.STACK_BASE, self.STACK_SIZE)
        # Copy the loaded bytes into the code region
        self.mu.mem_write(self.CODE_BASE, self.CODE)

    def _setup_registers(self):
        """Point SP near the top of the stack region and PC at ``CODE_BASE``."""
        stack_top = self.STACK_BASE + self.STACK_SIZE
        # AArch64 requires SP to be 16-byte aligned, so reserve one aligned
        # 16-byte slot below the top instead of an unaligned 4 bytes.
        self.mu.reg_write(UC_ARM64_REG_SP, (stack_top - 16) & ~0xF)
        self.mu.reg_write(UC_ARM64_REG_PC, self.CODE_BASE)

    def set_x0(self, value):
        """Write ``value`` to register X0."""
        self.mu.reg_write(UC_ARM64_REG_X0, value)

    def set_x1(self, value):
        """Write ``value`` to register X1."""
        self.mu.reg_write(UC_ARM64_REG_X1, value)

    def set_x2(self, value):
        """Write ``value`` to register X2."""
        self.mu.reg_write(UC_ARM64_REG_X2, value)

打印寄存器

dump_registers() 打印 X0–X30 以及 SP、PC 的当前值。

python 复制代码
def dump_registers(self):
    """Print X0-X30, SP and PC of the emulated CPU as zero-padded 64-bit hex."""
    emu = self.mu
    print("\n====== Registers Dump ======")

    # General-purpose registers, resolved by constant name on arm64_const.
    for index in range(31):
        current = emu.reg_read(getattr(arm64_const, f'UC_ARM64_REG_X{index}'))
        print(f"X{index:02}: 0x{current:016x}")

    # Stack pointer and program counter are read together, then printed.
    stack_pointer = emu.reg_read(UC_ARM64_REG_SP)
    program_counter = emu.reg_read(UC_ARM64_REG_PC)

    print(f"\nSP:  0x{stack_pointer:016x}")
    print(f"PC:  0x{program_counter:016x}")
    print("============================\n")

运行程序

run() 使用 emu_start() 从 CODE_BASE + start_address 开始执行,直到 PC 到达 CODE_BASE + end_address 时停止(end_address 处的指令不会被执行),并在执行前后各打印一次寄存器。

python 复制代码
def run(self, start_address, end_address):
    """Emulate a code range, dumping the registers before and after.

    :param start_address: offset added to ``self.CODE_BASE`` to get the first address
    :param end_address: offset added to ``self.CODE_BASE`` to get the address
        handed to Unicorn as the stop address
    """
    print("\nBefore execution:")
    self.dump_registers()

    begin = self.CODE_BASE + start_address
    until = self.CODE_BASE + end_address
    self.mu.emu_start(begin, until)

    print("\nAfter execution:")
    self.dump_registers()

反汇编

disassembly() 使用 Capstone 对 self.CODE(从文件读入的原始字节)中 [start_address, end_address) 范围的数据进行反汇编;注意它读取的不是模拟器内存,因此不会反映 patch 之后的内容。

python 复制代码
class ARM64Emulator:
    """Excerpt showing the Capstone setup and the ``disassembly`` helper."""

    def __init__(self, so_file: str):
        # Capstone handle used for ARM64 disassembly.
        self.cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)

    def disassembly(self, start_address, end_address):
        """
        Print a disassembly listing of a slice of the loaded file bytes.

        The bytes come from ``self.CODE`` (not from emulator memory) and are
        decoded as if located at ``start_address``.

        :param start_address: index of the first byte to decode
        :param end_address: index one past the last byte to decode
        """
        raw = self.CODE[start_address:end_address]
        print("Disassembly:")
        decoded = self.cs.disasm(raw, start_address)
        for insn in decoded:
            print(f"0x{insn.address:x}:\t{insn.mnemonic}\t{insn.op_str}")
            

Hook 管理

  • 代码 Hook:在 _setup_hooks() 中设置 UC_HOOK_CODE 钩子,每次执行到一条指令时触发 hook_code()。

  • 注册 Hook:register_hook() 允许用户在特定地址注册自定义的 Hook 函数。

  • 取消 Hook:unregister_hook() 提供取消 Hook 的功能。

python 复制代码
class ARM64Emulator:
    """Excerpt showing instruction tracing and user hooks keyed by address."""

    def __init__(self, so_file: str):

        self._hooks = []  # (address, callable) pairs added by register_hook()

        self._setup_hooks()

    def _setup_hooks(self):
        """Install ``hook_code`` as a Unicorn ``UC_HOOK_CODE`` callback."""
        self.mu.hook_add(UC_HOOK_CODE, self.hook_code)

    def hook_code(self, mu, address, size, user_data):
        """Trace the instruction at ``address`` and run the hooks registered for it.

        :param mu: Unicorn instance that triggered the callback
        :param address: address of the instruction being traced
        :param size: size in bytes of that instruction
        :param user_data: opaque value passed through by Unicorn (unused)
        """
        code = mu.mem_read(address, size)
        # Disassemble at the real address (not 0) so PC-relative operands such
        # as branch targets are printed with their actual destination.
        for i in self.cs.disasm(code, address):
            print("[addr:%x;code:%s]:%s %s" % (address, code.hex(), i.mnemonic, i.op_str))

        # Iterate over a snapshot so a hook may register or unregister hooks
        # without affecting the dispatch that is already in progress.
        for hook_addr, hook_fn in tuple(self._hooks):
            if address == hook_addr:
                hook_fn()

    def register_hook(self, address: int, hook_fn):
        """
        Register a hook.

        :param address: address at which the hook should fire
        :param hook_fn: zero-argument callable invoked when that address is reached
        """
        self._hooks.append((address, hook_fn))
        print(f"Hook registered at {hex(address)}")

    def unregister_hook(self, address: int):
        """
        Remove every hook registered for an address.

        :param address: address whose hooks should be removed
        """
        self._hooks = [(addr, fn) for addr, fn in self._hooks if addr != address]
        print(f"Hook unregistered at {hex(address)}")

寄存器监控

  • 监控寄存器变更:watch_registers() 支持监控特定寄存器的变化,并在变化时打印相关信息。

  • 自动更新寄存器值:在 hook_code() 中检测变化,并输出变化信息。

python 复制代码
class ARM64Emulator:
    """Excerpt showing register watching from the per-instruction hook."""

    def __init__(self, so_file: str):

        self._last_registers = {}  # Unicorn register id -> last observed value
        self._watch_registers = set()  # Unicorn register ids being watched
        self._watch_names = {}  # Unicorn register id -> name shown in reports

    def hook_code(self, mu, address, size, user_data):
        """Report every watched register whose value differs from the last one seen.

        NOTE: Unicorn calls UC_HOOK_CODE callbacks before the instruction at
        ``address`` executes, so a change reported here was produced by
        something that ran earlier, not by the instruction printed alongside it.

        :param mu: Unicorn instance that triggered the callback
        :param address: address of the instruction about to run
        :param size: size in bytes of that instruction
        :param user_data: opaque value passed through by Unicorn (unused)
        """
        code = mu.mem_read(address, size)

        # Decode at the real address so PC-relative operands read correctly.
        insn = next(iter(self.cs.disasm(code, address, 1)), None)
        if insn is None:
            return

        for reg in self._watch_registers:
            new_value = mu.reg_read(reg)
            old_value = self._last_registers[reg]
            if old_value != new_value:
                # Show the register's name rather than its numeric Unicorn id.
                name = self._watch_names.get(reg, reg)
                print(f">> PC: 0x{address:X}, {insn.mnemonic} {insn.op_str}, {name} changed: 0x{old_value:X} -> 0x{new_value:X}")
                self._last_registers[reg] = new_value

    def watch_registers(self, *regs):
        """
        Start watching the named registers.

        Names are matched case-insensitively; X0-X30, FP, LR, SP and PC are
        accepted.  Unknown names are skipped with a printed notice.

        Example: emu.watch_registers("X4", "X8")  # watch X4 and X8

        :param regs: register names to watch
        """
        reg_map = {
            "X0": UC_ARM64_REG_X0, "X1": UC_ARM64_REG_X1, "X2": UC_ARM64_REG_X2, "X3": UC_ARM64_REG_X3,
            "X4": UC_ARM64_REG_X4, "X5": UC_ARM64_REG_X5, "X6": UC_ARM64_REG_X6, "X7": UC_ARM64_REG_X7,
            "X8": UC_ARM64_REG_X8, "X9": UC_ARM64_REG_X9, "X10": UC_ARM64_REG_X10, "X11": UC_ARM64_REG_X11,
            "X12": UC_ARM64_REG_X12, "X13": UC_ARM64_REG_X13, "X14": UC_ARM64_REG_X14, "X15": UC_ARM64_REG_X15,
            "X16": UC_ARM64_REG_X16, "X17": UC_ARM64_REG_X17, "X18": UC_ARM64_REG_X18, "X19": UC_ARM64_REG_X19,
            "X20": UC_ARM64_REG_X20, "X21": UC_ARM64_REG_X21, "X22": UC_ARM64_REG_X22, "X23": UC_ARM64_REG_X23,
            "X24": UC_ARM64_REG_X24, "X25": UC_ARM64_REG_X25, "X26": UC_ARM64_REG_X26, "X27": UC_ARM64_REG_X27,
            "X28": UC_ARM64_REG_X28, "X29": UC_ARM64_REG_X29, "X30": UC_ARM64_REG_X30,
            "FP": UC_ARM64_REG_FP, "LR": UC_ARM64_REG_LR, "SP": UC_ARM64_REG_SP,
            "PC": UC_ARM64_REG_PC,
        }
        for reg in regs:
            name = str(reg).upper()
            reg_id = reg_map.get(name)
            if reg_id is None:
                print(f"Unknown register name ignored: {reg}")
                continue
            self._watch_registers.add(reg_id)
            self._watch_names[reg_id] = name
            # Seed with the current value so the first hook does not report a
            # spurious change from 0.
            self._last_registers[reg_id] = self.mu.reg_read(reg_id)

Patch NOP

  • patch_nop():将给定地址列表中的指令替换为 NOP(0xD503201F)。

  • patch_nop_range():将指定地址范围内的所有指令替换为 NOP。

python 复制代码
def patch_nop_range(self, start_addr: int, end_addr: int):
    """
    Overwrite every instruction slot in an address range with NOP (0xD503201F).

    The range is inclusive: the slot at ``end_addr`` is patched as well.

    :param start_addr: first address to patch (must be 4-byte aligned)
    :param end_addr: last address to patch (must be 4-byte aligned, inclusive)
    :raises ValueError: if either address is misaligned or the range is reversed
    """
    if any(bound % 4 != 0 for bound in (start_addr, end_addr)):
        raise ValueError("Start and end addresses must be 4-byte aligned.")

    if start_addr > end_addr:
        raise ValueError("End address must be greater than or equal to start address.")

    # Little-endian encoding of the AArch64 NOP instruction.
    nop = b'\x1F\x20\x03\xD5'

    # One slot per 4 bytes of distance, plus the slot at end_addr itself.
    nop_count = (end_addr - start_addr) // 4 + 1

    # A single write covers the whole range.
    self.mu.mem_write(start_addr, nop * nop_count)

    print(f"Patched {nop_count} instructions to NOP from {hex(start_addr)} to {hex(end_addr)} (inclusive)")

def patch_nop(self, addr_list: list):
    """
    Overwrite the instruction at each listed address with NOP (0xD503201F).

    All addresses are validated before anything is written, so a misaligned
    entry leaves emulator memory untouched instead of partially patched.

    :param addr_list: addresses to patch (each must be 4-byte aligned)
    :raises ValueError: if any address is not 4-byte aligned
    """
    # Little-endian encoding of the AArch64 NOP instruction.
    NOP_INSTRUCTION = b'\x1F\x20\x03\xD5'  # 0xD503201F

    # Materialise once so any iterable (including a generator) can be
    # validated first and then written.
    addresses = list(addr_list)

    for addr in addresses:
        if addr % 4 != 0:
            raise ValueError(f"Address {hex(addr)} is not 4-byte aligned.")

    for addr in addresses:
        self.mu.mem_write(addr, NOP_INSTRUCTION)
        print(f"Patched NOP at {hex(addr)}")

字符串操作

  • get_string_utf_chars() 模拟了 GetStringUTFChars():在指定内存地址写入以 NULL 结尾的 UTF-8 字符串,并把该地址写入 X0 作为返回值(方法本身不返回任何值)。

  • read_c_string() 从仿真器内存中读取以 NULL 结尾的 C 语言字符串。

python 复制代码
def get_string_utf_chars(self, input_str: str, str_addr: int):
    """
    Emulate GetStringUTFChars by placing ``input_str`` in emulator memory.

    The string is UTF-8 encoded, NUL-terminated and written at ``str_addr``;
    X0 is then set to ``str_addr``.

    :param input_str: Python string to expose to the emulated code
    :param str_addr: emulator address that receives the encoded bytes
    """
    emu = self.mu

    # Encoded text followed by the terminating NUL byte.
    payload = b"".join((input_str.encode("utf-8"), b"\x00"))
    emu.mem_write(str_addr, payload)

    # X0 carries the address of the string.
    emu.reg_write(UC_ARM64_REG_X0, str_addr)

    print(f"GetStringUTFChars Hooked: '{input_str}' -> {hex(str_addr)}")

def read_c_string(self, addr, max_len=256):
    """Read a NUL-terminated string from emulator memory, one byte at a time.

    Reading stops at the first NUL byte or after ``max_len`` bytes; the bytes
    are decoded as UTF-8 and undecodable bytes are dropped.
    """
    collected = bytearray()
    offset = 0
    while offset < max_len:
        current = self.mu.mem_read(addr + offset, 1)
        if current == b"\x00":  # terminator reached
            break
        collected.extend(current)
        offset += 1
    return collected.decode("utf-8", errors="ignore")

完整源码

项目地址:github.com/CYRUS-STUDI...

python 复制代码
import capstone
from unicorn import *
from unicorn.arm64_const import *


class ARM64Emulator:
    """Lightweight ARM64 emulator built on Unicorn, with Capstone disassembly.

    The file at ``so_file`` is read verbatim and copied to ``CODE_BASE`` inside
    a 10 MB code region; a 1 MB stack region is mapped directly above it.  A
    per-instruction hook prints a trace line, dispatches user hooks registered
    by address and reports changes of watched registers.
    """

    def __init__(self, so_file: str):
        """Create the Unicorn/Capstone instances and prepare memory, registers and hooks.

        :param so_file: path of the binary whose raw bytes are loaded
        """
        self.so_file = so_file

        self._hooks = []  # (address, callable) pairs added by register_hook()
        self._last_registers = {}  # Unicorn register id -> last observed value
        self._watch_registers = set()  # Unicorn register ids being watched
        self._watch_names = {}  # Unicorn register id -> name shown in reports

        # Code region (TEXT segment)
        self.CODE_BASE = 0x000000  # assumed start address of the code
        self.CODE_SIZE = 1024 * 1024 * 10  # 10MB

        # Stack region, placed right after the code region
        self.STACK_BASE = self.CODE_BASE + self.CODE_SIZE
        self.STACK_SIZE = 1024 * 1024 * 1  # 1MB

        # Unicorn engine instance
        self.mu = Uc(UC_ARCH_ARM64, UC_MODE_ARM)
        # Capstone disassembler for ARM64
        self.cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)

        self._load_binary()
        self._setup_memory()
        self._setup_registers()
        self._setup_hooks()

    def _load_binary(self):
        """Read the whole file at ``self.so_file`` into ``self.CODE``."""
        with open(self.so_file, "rb") as f:
            self.CODE = f.read()

    def _setup_memory(self):
        """Map the code and stack regions and copy ``self.CODE`` to ``CODE_BASE``."""
        self.mu.mem_map(self.CODE_BASE, self.CODE_SIZE)
        self.mu.mem_map(self.STACK_BASE, self.STACK_SIZE)
        # Copy the loaded bytes into the code region
        self.mu.mem_write(self.CODE_BASE, self.CODE)

    def _setup_registers(self):
        """Point SP near the top of the stack region and PC at ``CODE_BASE``."""
        stack_top = self.STACK_BASE + self.STACK_SIZE
        # AArch64 requires SP to be 16-byte aligned, so reserve one aligned
        # 16-byte slot below the top instead of an unaligned 4 bytes.
        self.mu.reg_write(UC_ARM64_REG_SP, (stack_top - 16) & ~0xF)
        self.mu.reg_write(UC_ARM64_REG_PC, self.CODE_BASE)

    def set_x0(self, value):
        """Write ``value`` to register X0."""
        self.mu.reg_write(UC_ARM64_REG_X0, value)

    def set_x1(self, value):
        """Write ``value`` to register X1."""
        self.mu.reg_write(UC_ARM64_REG_X1, value)

    def set_x2(self, value):
        """Write ``value`` to register X2."""
        self.mu.reg_write(UC_ARM64_REG_X2, value)

    def _setup_hooks(self):
        """Install ``hook_code`` as a Unicorn ``UC_HOOK_CODE`` callback."""
        self.mu.hook_add(UC_HOOK_CODE, self.hook_code)

    def dump_registers(self):
        """Print X0-X30, SP and PC as zero-padded 64-bit hex values."""
        print("\n====== Registers Dump ======")

        # X0 ~ X30, resolved by constant name on arm64_const
        for i in range(31):
            reg_id = getattr(arm64_const, f'UC_ARM64_REG_X{i}')
            value = self.mu.reg_read(reg_id)
            print(f"X{i:02}: 0x{value:016x}")

        # Stack pointer and program counter
        sp = self.mu.reg_read(UC_ARM64_REG_SP)
        pc = self.mu.reg_read(UC_ARM64_REG_PC)

        print(f"\nSP:  0x{sp:016x}")
        print(f"PC:  0x{pc:016x}")
        print("============================\n")

    def run(self, start_address, end_address):
        """Emulate a code range, dumping the registers before and after.

        :param start_address: offset added to ``CODE_BASE`` to get the first address
        :param end_address: offset added to ``CODE_BASE`` to get the address
            handed to Unicorn as the stop address
        """
        print("\nBefore execution:")
        self.dump_registers()
        self.mu.emu_start(self.CODE_BASE + start_address, self.CODE_BASE + end_address)
        print("\nAfter execution:")
        self.dump_registers()

    def disassembly(self, start_address, end_address):
        """
        Print a disassembly listing of a slice of the loaded file bytes.

        The bytes come from ``self.CODE`` (not from emulator memory), so
        patches written to the emulator are not reflected here.

        :param start_address: index of the first byte to decode
        :param end_address: index one past the last byte to decode
        """
        target_data = self.CODE[start_address:end_address]
        print("Disassembly:")
        for instruction in self.cs.disasm(target_data, start_address):
            print(f"0x{instruction.address:x}:\t{instruction.mnemonic}\t{instruction.op_str}")

    def hook_code(self, mu, address, size, user_data):
        """Trace one instruction, run matching user hooks and report register changes.

        NOTE: Unicorn calls UC_HOOK_CODE callbacks before the instruction at
        ``address`` executes, so a register change reported here was produced
        by something that ran earlier (a previous instruction, a user hook or
        a setter), not by the instruction printed alongside it.

        :param mu: Unicorn instance that triggered the callback
        :param address: address of the instruction about to run
        :param size: size in bytes of that instruction
        :param user_data: opaque value passed through by Unicorn (unused)
        """
        code = mu.mem_read(address, size)

        # Decode once, at the real address, so PC-relative operands such as
        # branch targets are printed with their actual destination.
        insn = next(iter(self.cs.disasm(code, address, 1)), None)
        if insn is not None:
            print("[addr:%x;code:%s]:%s %s" % (address, code.hex(), insn.mnemonic, insn.op_str))

        # Iterate over a snapshot so a hook may register or unregister hooks
        # without affecting the dispatch that is already in progress.
        for hook_addr, hook_fn in tuple(self._hooks):
            if address == hook_addr:
                hook_fn()

        if insn is None:
            return

        # Compare each watched register with the value recorded last time.
        for reg in self._watch_registers:
            new_value = mu.reg_read(reg)
            old_value = self._last_registers[reg]
            if old_value != new_value:
                # Show the register's name rather than its numeric Unicorn id.
                name = self._watch_names.get(reg, reg)
                print(f">> PC: 0x{address:X}, {insn.mnemonic} {insn.op_str}, {name} changed: 0x{old_value:X} -> 0x{new_value:X}")
                self._last_registers[reg] = new_value

    def register_hook(self, address: int, hook_fn):
        """
        Register a hook.

        :param address: address at which the hook should fire
        :param hook_fn: zero-argument callable invoked when that address is reached
        """
        self._hooks.append((address, hook_fn))
        print(f"Hook registered at {hex(address)}")

    def unregister_hook(self, address: int):
        """
        Remove every hook registered for an address.

        :param address: address whose hooks should be removed
        """
        self._hooks = [(addr, fn) for addr, fn in self._hooks if addr != address]
        print(f"Hook unregistered at {hex(address)}")

    def watch_registers(self, *regs):
        """
        Start watching the named registers.

        Names are matched case-insensitively; X0-X30, FP, LR, SP and PC are
        accepted.  Unknown names are skipped with a printed notice.

        Example: emu.watch_registers("X4", "X8")  # watch X4 and X8

        :param regs: register names to watch
        """
        reg_map = {
            "X0": UC_ARM64_REG_X0, "X1": UC_ARM64_REG_X1, "X2": UC_ARM64_REG_X2, "X3": UC_ARM64_REG_X3,
            "X4": UC_ARM64_REG_X4, "X5": UC_ARM64_REG_X5, "X6": UC_ARM64_REG_X6, "X7": UC_ARM64_REG_X7,
            "X8": UC_ARM64_REG_X8, "X9": UC_ARM64_REG_X9, "X10": UC_ARM64_REG_X10, "X11": UC_ARM64_REG_X11,
            "X12": UC_ARM64_REG_X12, "X13": UC_ARM64_REG_X13, "X14": UC_ARM64_REG_X14, "X15": UC_ARM64_REG_X15,
            "X16": UC_ARM64_REG_X16, "X17": UC_ARM64_REG_X17, "X18": UC_ARM64_REG_X18, "X19": UC_ARM64_REG_X19,
            "X20": UC_ARM64_REG_X20, "X21": UC_ARM64_REG_X21, "X22": UC_ARM64_REG_X22, "X23": UC_ARM64_REG_X23,
            "X24": UC_ARM64_REG_X24, "X25": UC_ARM64_REG_X25, "X26": UC_ARM64_REG_X26, "X27": UC_ARM64_REG_X27,
            "X28": UC_ARM64_REG_X28, "X29": UC_ARM64_REG_X29, "X30": UC_ARM64_REG_X30,
            "FP": UC_ARM64_REG_FP, "LR": UC_ARM64_REG_LR, "SP": UC_ARM64_REG_SP,
            "PC": UC_ARM64_REG_PC,
        }
        for reg in regs:
            name = str(reg).upper()
            reg_id = reg_map.get(name)
            if reg_id is None:
                print(f"Unknown register name ignored: {reg}")
                continue
            self._watch_registers.add(reg_id)
            self._watch_names[reg_id] = name
            # Seed with the current value so the first hook does not report a
            # spurious change from 0.
            self._last_registers[reg_id] = self.mu.reg_read(reg_id)

    def patch_nop_range(self, start_addr: int, end_addr: int):
        """
        Overwrite every instruction slot in an address range with NOP (0xD503201F).

        The range is inclusive: the slot at ``end_addr`` is patched as well.

        :param start_addr: first address to patch (must be 4-byte aligned)
        :param end_addr: last address to patch (must be 4-byte aligned, inclusive)
        :raises ValueError: if either address is misaligned or the range is reversed
        """
        if start_addr % 4 != 0 or end_addr % 4 != 0:
            raise ValueError("Start and end addresses must be 4-byte aligned.")

        if end_addr < start_addr:
            raise ValueError("End address must be greater than or equal to start address.")

        # Little-endian encoding of the AArch64 NOP instruction
        NOP_INSTRUCTION = b'\x1F\x20\x03\xD5'  # 0xD503201F

        # Number of slots, counting the one at end_addr
        nop_count = ((end_addr - start_addr) // 4) + 1

        # A single write covers the whole range
        self.mu.mem_write(start_addr, NOP_INSTRUCTION * nop_count)

        print(f"Patched {nop_count} instructions to NOP from {hex(start_addr)} to {hex(end_addr)} (inclusive)")

    def patch_nop(self, addr_list: list):
        """
        Overwrite the instruction at each listed address with NOP (0xD503201F).

        All addresses are validated before anything is written, so a
        misaligned entry leaves emulator memory untouched instead of
        partially patched.

        :param addr_list: addresses to patch (each must be 4-byte aligned)
        :raises ValueError: if any address is not 4-byte aligned
        """
        # Little-endian encoding of the AArch64 NOP instruction
        NOP_INSTRUCTION = b'\x1F\x20\x03\xD5'  # 0xD503201F

        # Materialise once so any iterable can be validated, then written.
        addresses = list(addr_list)

        for addr in addresses:
            if addr % 4 != 0:
                raise ValueError(f"Address {hex(addr)} is not 4-byte aligned.")

        for addr in addresses:
            self.mu.mem_write(addr, NOP_INSTRUCTION)
            print(f"Patched NOP at {hex(addr)}")

    def get_string_utf_chars(self, input_str: str, str_addr: int):
        """
        Emulate GetStringUTFChars by placing ``input_str`` in emulator memory.

        The string is UTF-8 encoded, NUL-terminated and written at
        ``str_addr``; X0 is then set to ``str_addr``.

        :param input_str: Python string to expose to the emulated code
        :param str_addr: emulator address that receives the encoded bytes
        """
        utf8_str = input_str.encode("utf-8") + b"\x00"  # encoded text plus NUL terminator

        self.mu.mem_write(str_addr, utf8_str)

        # X0 carries the address of the string
        self.mu.reg_write(UC_ARM64_REG_X0, str_addr)

        print(f"GetStringUTFChars Hooked: '{input_str}' -> {hex(str_addr)}")

    def read_c_string(self, addr, max_len=256):
        """Read a NUL-terminated string from emulator memory, one byte at a time.

        Reading stops at the first NUL byte or after ``max_len`` bytes; the
        bytes are decoded as UTF-8 and undecodable bytes are dropped.
        """
        result = bytearray()
        for i in range(max_len):
            byte = self.mu.mem_read(addr + i, 1)
            if byte == b"\x00":  # terminator reached
                break
            result += byte
        return result.decode("utf-8", errors="ignore")
相关推荐
还鮟2 小时前
CTF Web的数组巧用
android
小蜜蜂嗡嗡3 小时前
Android Studio flutter项目运行、打包时间太长
android·flutter·android studio
aqi003 小时前
FFmpeg开发笔记(七十一)使用国产的QPlayer2实现双播放器观看视频
android·ffmpeg·音视频·流媒体
zhangphil5 小时前
Android理解onTrimMemory中ComponentCallbacks2的内存警戒水位线值
android
你过来啊你5 小时前
Android View的绘制原理详解
android
移动开发者1号8 小时前
使用 Android App Bundle 极致压缩应用体积
android·kotlin
移动开发者1号8 小时前
构建高可用线上性能监控体系:从原理到实战
android·kotlin
ii_best13 小时前
按键精灵支持安卓14、15系统,兼容64位环境开发辅助工具
android
美狐美颜sdk13 小时前
跨平台直播美颜SDK集成实录:Android/iOS如何适配贴纸功能
android·人工智能·ios·架构·音视频·美颜sdk·第三方美颜sdk
恋猫de小郭18 小时前
Meta 宣布加入 Kotlin 基金会,将为 Kotlin 和 Android 生态提供全新支持
android·开发语言·ios·kotlin