minos 2.2 中断虚拟化——异常处理流程

首发微信公号：Rand_cs

上一节讲述了 ARMv8 异常模型，很多理论，这一节来看一个实际的例子，来看看 minos 中的异常处理流程

异常向量表

直接来看 minos 的异常向量表，很多事情就明了了：

C 复制代码

elx_vectors:
c0sync:     // Current EL with SP0
    BAD_MODE VECTOR_C0_SYNC
    .balign 0x80
c0irq:
    BAD_MODE VECTOR_C0_IRQ
    .balign 0x80
c0fiq:
    BAD_MODE VECTOR_C0_FIQ
    .balign 0x80
c0serr:
    BAD_MODE VECTOR_C0_SERR
    .balign 0x80    // Current EL with SPx
cxsync:
    b __sync_exception_from_current_el
    .balign 0x80
cxirq:
    b __irq_exception_from_current_el
    .balign 0x80
cxfiq:
    BAD_MODE VECTOR_CX_FIQ
    .balign 0x80
cxserr:
    BAD_MODE VECTOR_CX_SERR
    .balign 0x80    //Lower EL using AArch64
l64sync:
    b __sync_exception_from_lower_el
    .balign 0x80
l64irq:
    b __irq_exception_from_lower_el
    .balign 0x80
l64fiq:
    BAD_MODE VECTOR_L64_FIQ
    .balign 0x80
l64serr:
    BAD_MODE VECTOR_L64_SERR
    .balign 0x80    // Lower EL using AArch32
l32sync:
    b __sync_exception_from_lower_el
    .balign 0x80
l32irq:
    b __irq_exception_from_lower_el
    .balign 0x80
l32fiq:
    BAD_MODE VECTOR_L32_FIQ
    .balign 0x80
l32serr:
    BAD_MODE VECTOR_L32_SERR
    .balign 0x80

如果异常来自当前特权级 && 使用 SP_EL0，目前不支持
所有 fiq 和 serror 目前不支持

再排除掉 32 位的情况，总共还剩下 4 个向量：__sync_exception_from_current_el、__irq_exception_from_current_el、__sync_exception_from_lower_el、__irq_exception_from_lower_el，顾名思义，我们一个个来看它们是如何处理的

__sync_exception_from_current_el

C 复制代码

vfunc __sync_exception_from_current_el
    SAVE_GP_REGS        // 保存通用寄存器

    mov x0, sp
    str x0, [x18, #TASK_STACK_OFFSET]

    // use SVC for sched() , other type will
    // go to the exception handler.
    mrs x1, ESR_EL2
    ubfx    x2, x1, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH
    cmp x2, #ESR_ELx_EC_SVC64
    b.eq    __sync_current_out

    bl  sync_exception_from_current_el  // go to the c handler, will die.

__sync_current_out:
    b   exception_return

SAVE_GP_REGS

不论啥异常处理，第一步都是保存现场，保存一系列的寄存器到 SP_ELx

C 复制代码

.macro SAVE_GP_REGS
    stp x29, x30, [sp, #-16]!
    __SAVE_GP_REGS
.endm

.macro __SAVE_GP_REGS
    stp x27, x28, [sp, #-16]!
    stp x25, x26, [sp, #-16]!
    stp x23, x24, [sp, #-16]!
    stp x21, x22, [sp, #-16]!
    stp x19, x20, [sp, #-16]!
    stp x17, x18, [sp, #-16]!
    stp     x15, x16, [sp, #-16]!
    stp     x13, x14, [sp, #-16]!
    stp     x11, x12, [sp, #-16]!
    stp     x9, x10, [sp, #-16]!
    stp     x7, x8, [sp, #-16]!
    stp     x5, x6, [sp, #-16]!
    stp     x3, x4, [sp, #-16]!
    stp     x1, x2, [sp, #-16]!
    str x0, [sp, #-8]!
    mrs x0, SP_EL0
    str x0, [sp, #-8]!
    mrs x0, ARM64_SPSR
    str x0, [sp, #-8]!
    mrs x0, ARM64_ELR
    str x0, [sp, #-8]!
    dsb nsh
.endm

执行完上述操作后，minos EL2 栈（线程在 EL2 的栈）布局如下所示：

C 复制代码

    mov x0, sp       
    str x0, [x18, #TASK_STACK_OFFSET]   // 将保存了现场后的栈指针赋值给 task.stack_base
    
// .....................................................
#define current_regs        (gp_regs *)current->stack_base

这两句汇编将上图中的 SP_EL2 的值赋值给了当前线程 task->stack_base 字段，minos 中定义了一个获取当前线程保存在 SP_EL2 栈里面寄存器集合的宏，其定义如下

C 复制代码

struct aarch64_regs {
    uint64_t pc;        // elr_el2
    uint64_t pstate;    // spsr_el2
    uint64_t sp;        // sp_el0
    uint64_t x0;
    uint64_t x1;
    uint64_t x2;
    uint64_t x3;
    uint64_t x4;
    uint64_t x5;
    uint64_t x6;
    uint64_t x7;
    uint64_t x8;
    uint64_t x9;
    uint64_t x10;
    uint64_t x11;
    uint64_t x12;
    uint64_t x13;
    uint64_t x14;
    uint64_t x15;
    uint64_t x16;
    uint64_t x17;
    uint64_t x18;
    uint64_t x19;
    uint64_t x20;
    uint64_t x21;
    uint64_t x22;
    uint64_t x23;
    uint64_t x24;
    uint64_t x25;
    uint64_t x26;
    uint64_t x27;
    uint64_t x28;
    uint64_t x29;
    uint64_t lr;
}__packed;

可以看出，跟咱们图中结构布局一模一样，有时我们需要从栈里面获取低特权级的一些寄存器信息。

C 复制代码

    mrs x1, ESR_EL2         // 获取异常原因
    // x2=(x1>>ESR_ELx_EC_SHIFT)&ESR_ELx_EC_WIDTH = x1 >> 26 & 6
    ubfx    x2, x1, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH 
    cmp x2, #ESR_ELx_EC_SVC64
    b.eq    __sync_current_out
// ...............................................
#define ESR_ELx_EC_SHIFT    (26)
#define ESR_ELx_EC_WIDTH    (6)
#define ESR_ELx_EC_SVC64    (0x15)

这段代码意思是查看 ESR 中显示的异常原因，看看是不是 ESR_ELx_EC_SVC64，如果是跳转到 __sync_current_out 进行异常返回阶段的处理

NOTE：这部分可以查手册验证，developer.arm.com/documentati...

__sync_current_out

exception_return

C 复制代码

__sync_current_out:
    b   exception_return

vfunc exception_return
    LOAD_PCPU_STACK x1          // load percpu stack, need ensure the irq is off.

    bl  exception_return_handler    // check whether need to resched. x18 will the next task.

    ldr x1, [x18, #TASK_STACK_OFFSET]   // load the running task's stack
    mov sp, x1              // change to the new stack address

    ldr x1, [sp, #8]            // load spsr
    and x1, x1, #0x0f
    cmp x1, #9              // whether the task will return to user
    b.eq    __do_exception_return

    mov x0, sp
    bl  task_return_to_user
.macro LOAD_PCPU_STACK, tmp0
    mrs \tmp0, ARM64_TPIDR
    ldr \tmp0, [\tmp0, #PCPU_STACK_OFFSET]
    mov sp, \tmp0
.endm

#define ARM64_TPIDR     TPIDR_EL1    // 无虚拟化，minos 作为 el1 kernel
#define ARM64_TPIDR     TPIDR_EL2    // 有虚拟化，minos 作为 el2 hypervisor

每个物理 cpu，在 minos 中都定义了一个 struct pcpu，在启动期间，每个 pcpu 指针都被存放到了 ARM64_TPIDR 寄存里面。同样是启动期间，就划分了一份内存，作为 pcpu 的栈。LOAD_PCPU_STACK 执行的操作就是换栈，让 SP_EL2 指向 pcpu 栈

exception_return_handler

C 复制代码

// 异常返回时，检查是否需要 resched
void exception_return_handler(void)
{
    int ret = __exception_return_handler();

    // 只要不是执行出错，那么这里都会重新开始执行一个 task，所以这里重启 sched_timer
    if ((ret == 0) || (ret == -EAGAIN))
        sched_update_sched_timer();
}

// 在异常返回的时候做 resched 操作
static inline int __exception_return_handler(void)
{
    struct task *next, *task = current;
    struct task_info *ti = to_task_info(task);
    struct pcpu *pcpu = get_pcpu();

    /*
     * if the task is suspend state, means next the cpu
     * will call sched directly, so do not sched out here
     *
     * 1 - when preempt_count > 0, the scheduler whill try
     *     to shced() when preempt_enable.
     * 2 - __TIF_DONOT_PREEMPT is set, it will call sched() at
     *    once.
     */
    // 如果不需要 resched 或者 不允许抢占 或者 不要抢占，那么再次执行该 task
    if (!(ti->flags & __TIF_NEED_RESCHED) || (ti->preempt_count > 0) ||
            (ti->flags & __TIF_DONOT_PREEMPT))
        // 那么就再 run 一下
        goto task_run_again;

    // 否则先清除 __TIF_NEED_RESCHED（因为马上就要 resched，所以不需要该标志了）
    ti->flags &= ~__TIF_NEED_RESCHED;

    // 然后挑选 next task
    next = pick_next_task(pcpu);
    // 如果挑选的就是当前 task
    if ((next == task))
        goto task_run_again;
    
    // 切换 task
    switch_to_task(task, next);

    return 0;

task_run_again:
    // 清除掉当前 task 时间片已经到了的标志 TIF_TICK_EXHAUST
    if (test_and_clear_bit(TIF_TICK_EXHAUST, &ti->flags))
        return -EAGAIN;
    else
        return -EACCES;
}

终于来到熟悉的 C 界面，会发现 exception_return_handler 这个函数就是调度操作，这在 minos 3.1 CPU虚拟化讲过，这下就连起来了，minos 是在异常返回的时候会执行（后面可以看到其他异常在返回的时候也会执行 exception_return_handler）调度操作

TODO：为什么要换成 pcpu 栈

step out

回到 __sync_current_out->exception_return

C 复制代码

    // 切换为 task 栈
    ldr x1, [x18, #TASK_STACK_OFFSET]   // load the running task's stack
    mov sp, x1              // change to the new stack address

    ldr x1, [sp, #8]            // load spsr
    and x1, x1, #0x0f
    cmp x1, #9              // whether the task will return to user
    b.eq    __do_exception_return

首先是切换为 task 栈，然后获取 task 栈里面的 SPSR_EL2 的值，再次查手册，查看 SPSR_EL2 字段的后 4 位什么含义：

可以看出，#9(1001) 表示异常是来自 EL2 with SP_EL2，如果该字段是 9，说明我们将同级返回，否则返回到 EL1（返回到虚机）

__do_exception_return

C 复制代码

__do_exception_return:
    LOAD_GP_REGS
    eret

返回很简单，LOAD_GP_REGS 就是 SAVE_GP_REGS 的逆操作，eret 指令就是恢复 SPSR_EL2 的值到 PSTATE，恢复 ELR_EL2 的值到 PC

C 复制代码

    mov x0, sp
    bl  task_return_to_user

如果 SPSR_EL2 最后 4 位不是 9，那么会调用 task_return_to_user 返回到虚拟机中，本文中不做详细说明

sync_exception_from_current_el

回到 __sync_exception_from_current_el，如果不是 ESR_ELx_EC_SVC64 调用

C 复制代码

    bl  sync_exception_from_current_el  // go to the c handler, will die.

__sync_current_out:
    b   exception_return
    
// 处理同步异常
static void handle_sync_exception(gp_regs *regs)
{
    uint32_t esr_value;
    uint32_t ec_type;
    struct sync_desc *ec;
    // 获取异常原因
    esr_value = read_esr();
    ec_type = ESR_ELx_EC(esr_value);
    if (ec_type >= ESR_ELx_EC_MAX)
        panic("unknown sync exception type from current EL %d\n", ec_type);

    /*
     * for normal userspace process the return address shall
     * be adjust
     */
    // 获取该异常的描述符
    ec = process_sync_descs[ec_type];
    // 执行该异常处理程序，并且修正 elr_el2 的值
    regs->pc += ec->ret_addr_adjust;
    ec->handler(regs, ec_type, esr_value);
}

void sync_exception_from_current_el(gp_regs *regs)
{
    handle_sync_exception(regs);
}

如果不是 svc 调用（在 minos 里面只有 sched->svc #0 这一种情况）那么跳去 sync_exception_from_current_el 处理同步异常

这里我们就只是先简单看一下，每个同步异常都有个描述符，里面有相关回调 handler，这里调用相关 handler 来处理同步异常

__irq_exception_from_current_el

来自当前特权级的 irq，比如说 cpu 之间通信，发送 SGI 类型的中断信号给某个 cpu，就会调用 __irq_exception_from_current_el

C 复制代码

vfunc __irq_exception_from_current_el
    SAVE_GP_REGS

    // Set the irq flags into ti->flags.
    ldr x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    orr x1, x1, #__TIF_HARDIRQ_MASK
    str x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    dsb sy

    mov x0, sp
    str     x0, [x18, #TASK_STACK_OFFSET]   // store the stack in case this task will scheded out.
    bl  irq_from_current_el     // irq is disabled all the time

    // clear the irq flags into ti->flags.
    ldr x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    and x1, x1, #(~__TIF_HARDIRQ_MASK)
    str x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    dsb sy

    b   exception_return

首先仍然是 SAVE_GP_REGS 来保存上下文

C 复制代码

    // Set the irq flags into ti->flags.
    // 设置中断上下文标志
    ldr x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    orr x1, x1, #__TIF_HARDIRQ_MASK
    str x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    dsb sy
    
//......................................
DEFINE(TASK_INFO_FLAGS_OFFSET, offsetof(struct task_info, flags));
#define __TIF_IN_INTERRUPT  (__TIF_HARDIRQ_MASK | __TIF_SOFTIRQ_MASK)
struct task_info {
    int preempt_count;
    unsigned long flags;
};

这一段汇编就是在当前线程 task->task_info.flags 中设置 __TIF_HARDIRQ_MASK，表示当前线程处于中断上下文了，而且是硬中断上下文，这里作者应该是想像 Linux 那样设计，但是目前相关功能还不是很完善，设置该标志就是表示处于中断上下文，没有区分软硬。

C 复制代码

    // 保存当前栈地址到 task.stack_base，然后跳去 irq_from_current_el
    // 当前栈地址就是 gp_regs 结构指针
    mov x0, sp
    str     x0, [x18, #TASK_STACK_OFFSET]   // store the stack in case this task will scheded out.
    bl  irq_from_current_el     // irq is disabled all the time

irq_from_current_el

C 复制代码

void irq_from_current_el(gp_regs *regs)
{
    irq_handler(regs);
}

static inline void irq_handler(gp_regs *regs)
{
    do_irq_handler();
}

// irq 的 handler 函数
int do_irq_handler(void)
{
    uint32_t irq;
    struct irq_desc *irq_desc;
    int cpuid = smp_processor_id();  // 当前 pcpuid

    // 遍历当前所有 pending 等待的 irq
    while (1) {
        // 获取中断号
        irq = irq_chip->get_pending_irq();
        if (irq >= BAD_IRQ)
            return 0;
        // 中断号对应的中断描述符
        irq_desc = get_irq_desc_cpu(cpuid, irq);
        if (unlikely(!irq_desc)) {
            pr_err("irq is not actived %d\n", irq);
            irq_chip->irq_eoi(irq);
            irq_chip->irq_dir(irq);
            continue;
        }
        // 执行中断描述符中注册的回调 handler
        do_handle_host_irq(cpuid, irq_desc);
    }

    return 0;
}

此函数就是去执行实际的中断处理函数，不同中断都有注册一个中断描述符，里面有对应的回调，这里就是去调用这个回调来处理中断。具体的中断子系统下一节 4.3 讲述，侧重与硬件相关的流程。

step out

C 复制代码

    // clear the irq flags into ti->flags.
    ldr x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    and x1, x1, #(~__TIF_HARDIRQ_MASK)
    str x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    dsb sy

    b   exception_return

这部分就是清除硬中断 __TIF_HARDIRQ_MASK 标志，然后跳去 exception_return 执行中断返回，这个在上面提到过，会先去看是否需要调度，随后异常返回

__sync_exception_from_lower_el

来自低特权级的同步异常，典型的情况就是低特权级调用 hvc、smc 指令，然后 trap 到 EL2，来看看 minos 是怎么处理的

C 复制代码

vfunc __sync_exception_from_lower_el
    SAVE_GP_REGS

    PCPU_LOAD_CURRENT_TASK          // x18 will be the current task.

    bl  task_exit_from_user

    mov x0, sp
    bl  sync_exception_from_lower_el    // go to the c handler.

    mov x0, sp
    bl      task_return_to_user

    LOAD_GP_REGS
    eret

同样的，首先通过 SAVE_GP_REGS 保存通用寄存器

C 复制代码

// 将当前 pcpu 上记录的 current running task 指针记录到 x18
.macro PCPU_LOAD_CURRENT_TASK
    mrs x18, ARM64_TPIDR
    ldr x18, [x18, #PCPU_CURRENT_TASK] // x18=pcpu->running_task
.endm

这一步将当前 task 指针记录到 x18 寄存器，因为在低特权级 x18 可不一定表示 task 指针，可能就是普通的通用寄存器，也可能被 guest OS 留作他用，我们不得而知。但是 minos 里面 x18 是作为当前 task 指针使用的，这在进入高异常等级的时候设置，返回低异常等级的时候从栈里面恢复。只要在 minos EL2 级别，x18 就表示当前 task 指针

task_exit_from_user 从 guest OS 退出到 hypervisor，细节先略过

C 复制代码

    mov x0, sp
    bl  sync_exception_from_lower_el    // go to the c handler.

类似 sync_exception_from_current_el，从 ESR 里面获取异常原因，然后获取相关的同步异常描述符，执行里面的 handler 来处理同步异常

C 复制代码

    mov x0, sp
    bl      task_return_to_user

    LOAD_GP_REGS
    eret

随后执行 task_return_to_user 进入虚机，LOAD_GP_REGS 恢复异常上下文，eret 异常返回，都是一样的操作

__irq_exception_from_lower_el

来自低特权级的 irq，典型的是虚机的 vtimer 中断，其处理方式基本和 irq_from_current_el 一样，只是增加了设置 x18 为当前 task 这个步骤，具体不说明了，下面的汇编应该能看懂

C 复制代码

vfunc __irq_exception_from_lower_el
    SAVE_GP_REGS

    PCPU_LOAD_CURRENT_TASK          // x18 will store the current task

    // Set the irq flags into ti->flags.
    ldr x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    orr x1, x1, #__TIF_HARDIRQ_MASK
    str x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    dsb sy

    mov x0, sp              // x0 is the gp_regs pass to irq_c_handler
    str x0, [x18, #TASK_STACK_OFFSET]   // save the current task's stack to task
    bl  task_exit_from_user

    mov x0, sp
    bl  irq_from_lower_el       // call the c irq handler

    // clear the irq flags into ti->flags.
    ldr x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    and x1, x1, #(~__TIF_HARDIRQ_MASK)
    str x1, [x18, #TASK_INFO_FLAGS_OFFSET]
    dsb sy

    b   exception_return
vfunc_end __irq_exception_from_lower_el

首发微信公号：Rand_cs