Introduction
Whenever I dug through the Linux kernel source I could never find a good analysis of this part of the code: what exists is either vague, or quite professional but scattered across many places, and it seems nobody really enjoys working on it. It's the weekend, and I've been learning AArch64 assembly recently, so I decided to take this part of the kernel as my study target.
Source version: Linux 5.0
Architecture information
- CPU architecture: ARM64
- Memory architecture: UMA
- CONFIG_ARM64_VA_BITS: 39
- CONFIG_ARM64_PAGE_SHIFT: 12
- CONFIG_PGTABLE_LEVELS: 3
Some related articles I've written before:
Booting in assembly!!!
Where does the Linux kernel start executing? Let's look at the linker script:
shell
# arch/arm64/kernel/vmlinux.lds.S
SECTIONS
{
. = KIMAGE_VADDR + TEXT_OFFSET;
.head.text : {
_text = .;
HEAD_TEXT
}
c
// include/linux/init.h
#define __HEAD .section ".head.text","ax"
c
// arch/arm64/kernel/head.S
__HEAD
_head:
/*
* DO NOT MODIFY. Image header expected by Linux boot-loaders.
*/
b stext // branch to kernel start, magic
KIMAGE_VADDR is the start of the vmalloc region (the kernel image is mapped at the beginning of it), and TEXT_OFFSET is the offset of the kernel image from the start of RAM (the memory layout changes a little with every kernel release, so I won't dwell on the differences):
c
// arch/arm64/include/asm/memory.h
#define VA_BITS (39)
#define VA_START (UL(0xffffffffffffffff) - \
(UL(1) << VA_BITS) + 1)
#define PAGE_OFFSET (UL(0xffffffffffffffff) - \
(UL(1) << (VA_BITS - 1)) + 1)
#define KIMAGE_VADDR (MODULES_END)
#define BPF_JIT_REGION_START (VA_START + KASAN_SHADOW_SIZE)
#define BPF_JIT_REGION_SIZE (SZ_128M)
#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE)
#define MODULES_END (MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR (BPF_JIT_REGION_END)
#define MODULES_VSIZE (SZ_128M)
#define VMEMMAP_START (PAGE_OFFSET - VMEMMAP_SIZE)
#define PCI_IO_END (VMEMMAP_START - SZ_2M)
#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE)
#define FIXADDR_TOP (PCI_IO_START - SZ_2M)
The linear mapping region is not in a fixed place either; in some kernel versions it sits at the VA_START end of the kernel address space instead.
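Plugging the configuration above into these formulas makes the layout concrete. Here is a small standalone C sketch of my own (KASAN is assumed to be disabled, so KASAN_SHADOW_SIZE is taken as 0):
c
#include <stdio.h>

#define UL(x)             x##UL
#define SZ_128M           0x08000000UL
#define VA_BITS           39
#define KASAN_SHADOW_SIZE 0UL   /* assumption: CONFIG_KASAN=n */

#define VA_START     (UL(0xffffffffffffffff) - (UL(1) << VA_BITS) + 1)
#define PAGE_OFFSET  (UL(0xffffffffffffffff) - (UL(1) << (VA_BITS - 1)) + 1)
#define BPF_JIT_REGION_START (VA_START + KASAN_SHADOW_SIZE)
#define BPF_JIT_REGION_END   (BPF_JIT_REGION_START + SZ_128M)
#define MODULES_VADDR        (BPF_JIT_REGION_END)
#define MODULES_END          (MODULES_VADDR + SZ_128M)
#define KIMAGE_VADDR         (MODULES_END)

int main(void)
{
	printf("VA_START      = 0x%016lx\n", VA_START);      /* 0xffffff8000000000 */
	printf("MODULES_VADDR = 0x%016lx\n", MODULES_VADDR); /* VA_START + 128M    */
	printf("KIMAGE_VADDR  = 0x%016lx\n", KIMAGE_VADDR);  /* VA_START + 256M    */
	printf("PAGE_OFFSET   = 0x%016lx\n", PAGE_OFFSET);   /* 0xffffffc000000000 */
	return 0;
}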
The branch above jumps to the stext symbol; this is where things really begin.
c
ENTRY(stext)
bl preserve_boot_args
bl el2_setup // Drop to EL1, w0=cpu_boot_mode
adrp x23, __PHYS_OFFSET
and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
bl set_cpu_boot_mode_flag
bl __create_page_tables
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
bl __cpu_setup // initialise processor
b __primary_switch
ENDPROC(stext)
First comes preserve_boot_args. On entry x0 holds the physical address of the FDT (the device-tree blob), and it is copied into x21:
c
/*
* Preserve the arguments passed by the bootloader in x0 .. x3
*/
preserve_boot_args:
mov x21, x0 // x21=FDT
adr_l x0, boot_args // record the contents of
stp x21, x1, [x0] // x0 .. x3 at kernel entry
stp x2, x3, [x0, #16]
dmb sy // needed before dc ivac with
// MMU off
mov x1, #0x20 // 4 x 8 bytes
b __inval_dcache_area // tail call
ENDPROC(preserve_boot_args)
The values of x21 (the original x0), x1, x2 and x3 are stored into the boot_args array:
c
// arch/arm64/kernel/setup.c
/*
* The recorded values of x0 .. x3 upon kernel entry.
*/
u64 __cacheline_aligned boot_args[4];
__inval_dcache_area invalidates 32 bytes of data cache: x0 is the address of boot_args and x1 is the size, 32 bytes, i.e. the four array elements (this routine is analysed in detail at the end of this article).
Next comes el2_setup; the excerpt below shows the path taken when we are already in EL1:
c
ENTRY(el2_setup)
msr SPsel, #1 // We want to use SP_EL{1,2}
mrs x0, CurrentEL
cmp x0, #CurrentEL_EL2
b.eq 1f
mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)
msr sctlr_el1, x0
mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1
isb
ret
SPsel selects between SP_EL0 and SP_ELn as the stack pointer; here the SP of the current exception level is chosen.
CurrentEL reports the exception level we are running at and is compared against CurrentEL_EL2; we assume we are not running under a hypervisor and booted straight into the kernel's exception level, EL1.
sctlr_el1 is the system control register; here it is used to set the RES1 bits and the endianness (little-endian by default, as shown below).
c
// arch/arm64/include/asm/sysreg.h
#define SCTLR_EL1_RES1 ((_BITUL(11)) | (_BITUL(20)) | (_BITUL(22)) | (_BITUL(28)) | \
(_BITUL(29)))
#ifdef CONFIG_CPU_BIG_ENDIAN
#define ENDIAN_SET_EL1 (SCTLR_EL1_E0E | SCTLR_ELx_EE)
#define ENDIAN_CLEAR_EL1 0
#else
#define ENDIAN_SET_EL1 0
#define ENDIAN_CLEAR_EL1 (SCTLR_EL1_E0E | SCTLR_ELx_EE)
#endif
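As a quick sanity check (my own sketch, not kernel code), the value written to sctlr_el1 on the EL1 path is just the RES1 bits, since ENDIAN_SET_EL1 is 0 for a little-endian kernel:
c
#include <stdio.h>

#define _BITUL(n)       (1UL << (n))
#define SCTLR_EL1_RES1  (_BITUL(11) | _BITUL(20) | _BITUL(22) | _BITUL(28) | _BITUL(29))
#define ENDIAN_SET_EL1  0UL     /* CONFIG_CPU_BIG_ENDIAN=n */

int main(void)
{
	/* The value loaded by "mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1)". */
	printf("sctlr_el1 = 0x%lx\n", SCTLR_EL1_RES1 | ENDIAN_SET_EL1); /* 0x30500800 */
	return 0;
}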
BOOT_CPU_MODE_EL1 is simply returned in w0 here; it is consumed a little later by set_cpu_boot_mode_flag. Its definition:
c
// arch/arm64/include/asm/virt.h
#define BOOT_CPU_MODE_EL1 (0xe11)
KASLR is assumed to be disabled, so that part is skipped.
Straight on to set_cpu_boot_mode_flag:
c
/*
* Sets the __boot_cpu_mode flag depending on the CPU boot mode passed
* in w0. See arch/arm64/include/asm/virt.h for more info.
*/
set_cpu_boot_mode_flag:
adr_l x1, __boot_cpu_mode
cmp w0, #BOOT_CPU_MODE_EL2
b.ne 1f
add x1, x1, #4
1: str w0, [x1] // This CPU has booted in EL1
dmb sy
dc ivac, x1 // Invalidate potentially stale cache line
ret
ENDPROC(set_cpu_boot_mode_flag)
__boot_cpu_mode is an array of two integers:
c
ENTRY(__boot_cpu_mode)
.long BOOT_CPU_MODE_EL2
.long BOOT_CPU_MODE_EL1
c
// arch/arm64/include/asm/virt.h
extern u32 __boot_cpu_mode[2];
Earlier, w0 was set to BOOT_CPU_MODE_EL1, so "This CPU has booted in EL1": the compare against BOOT_CPU_MODE_EL2 fails and we branch to label 1, skipping the add.
__boot_cpu_mode[0] is therefore set to BOOT_CPU_MODE_EL1, and the corresponding cache line is invalidated.
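A C rendering of that branch (an illustrative sketch of mine, not kernel code) makes the indexing easier to see; the constants follow virt.h:
c
#include <stdint.h>

#define BOOT_CPU_MODE_EL1 0x0e11u
#define BOOT_CPU_MODE_EL2 0x0e12u

/* Initial values, as in head.S: { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 } */
static uint32_t __boot_cpu_mode[2] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

/* w0 carries the mode that el2_setup returned */
static void set_cpu_boot_mode_flag(uint32_t w0)
{
	if (w0 == BOOT_CPU_MODE_EL2)
		__boot_cpu_mode[1] = w0;   /* booted in EL2: slot 1 stays EL2 */
	else
		__boot_cpu_mode[0] = w0;   /* booted in EL1: slot 0 becomes EL1 */
	/* head.S additionally issues "dmb sy" + "dc ivac" so the store is
	 * visible with the MMU and caches still off. */
}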
Then comes __cpu_setup, which starts by invalidating the local TLB:
c
.pushsection ".idmap.text", "awx"
ENTRY(__cpu_setup)
tlbi vmalle1 // Invalidate local TLB
dsb nsh
cpacr_el1 controls access to the floating-point and SIMD registers: it can trap instructions that touch FP/SIMD state executed at EL0 or EL1 and route them to EL1. Writing 3 to FPEN (bits [21:20]) disables that trapping, i.e. enables FP/ASIMD.
mdscr_el1 (Monitor Debug System Control Register) belongs to the debug facilities, which I won't go into here.
c
mov x0, #3 << 20
msr cpacr_el1, x0 // Enable FP/ASIMD
mov x0, #1 << 12 // Reset mdscr_el1 and disable
msr mdscr_el1, x0 // access to the DCC from EL0
isb // Unmask debug exceptions now,
enable_dbg // since this is per-cpu
reset_pmuserenr_el0 x0 // Disable PMU access from EL0
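For reference, a tiny sketch of my own showing what mov x0, #3 << 20 means for cpacr_el1 (assuming FPEN sits at bits [21:20]):
c
#include <stdio.h>

int main(void)
{
	/* mov x0, #3 << 20: CPACR_EL1.FPEN (bits [21:20]) = 0b11,
	 * i.e. FP/SIMD accesses from EL0 and EL1 are not trapped. */
	unsigned long cpacr = 3UL << 20;

	printf("cpacr_el1 = 0x%lx, FPEN = %lu\n", cpacr, (cpacr >> 20) & 3);
	return 0;
}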
mair_el1 holds the encodings of the memory attributes: it is split into eight 8-bit fields, each describing one kind of memory, which page-table entries later select through AttrIndx[2:0].
ARMv8 allows up to eight different memory attributes to be defined; the Linux kernel only defines six of them:
c
ldr x5, =MAIR(0x00, MT_DEVICE_nGnRnE) | \
MAIR(0x04, MT_DEVICE_nGnRE) | \
MAIR(0x0c, MT_DEVICE_GRE) | \
MAIR(0x44, MT_NORMAL_NC) | \
MAIR(0xff, MT_NORMAL) | \
MAIR(0xbb, MT_NORMAL_WT)
msr mair_el1, x5
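To see how that single 64-bit MAIR value is assembled, here is a small sketch of my own; the MAIR() definition and the MT_* index values are assumptions matching the usual arm64 definitions:
c
#include <stdio.h>

/* Assumed to match arch/arm64: MAIR(attr, mt) places an 8-bit attribute at
 * byte "mt" of MAIR_EL1; AttrIndx[2:0] in each PTE later selects that byte. */
#define MAIR(attr, mt)   ((unsigned long)(attr) << ((mt) * 8))

#define MT_DEVICE_nGnRnE 0
#define MT_DEVICE_nGnRE  1
#define MT_DEVICE_GRE    2
#define MT_NORMAL_NC     3
#define MT_NORMAL        4
#define MT_NORMAL_WT     5

int main(void)
{
	unsigned long mair = MAIR(0x00, MT_DEVICE_nGnRnE) |
			     MAIR(0x04, MT_DEVICE_nGnRE)  |
			     MAIR(0x0c, MT_DEVICE_GRE)    |
			     MAIR(0x44, MT_NORMAL_NC)     |
			     MAIR(0xff, MT_NORMAL)        |
			     MAIR(0xbb, MT_NORMAL_WT);

	printf("mair_el1 = 0x%016lx\n", mair);   /* 0x0000bbff440c0400 */
	return 0;
}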
The TCR register mainly holds the control bits for address translation plus cache-related configuration:
c
/*
* Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
* both user and kernel.
*/
ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
TCR_TG_FLAGS | TCR_KASLR_FLAGS | TCR_ASID16 | \
TCR_TBI0 | TCR_A1 | TCR_KASAN_FLAGS
ldr_l x9, idmap_t0sz
tcr_set_t0sz x10, x9
/*
* Set the IPS bits in TCR_EL1.
*/
tcr_compute_pa_size x10, #TCR_IPS_SHIFT, x5, x6
msr tcr_el1, x10
ret // return to head.S
ENDPROC(__cpu_setup)
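The TxSZ fields encode 64 minus the virtual address width. A quick sketch of mine for this 39-bit configuration (the field offsets 0 and 16 are my assumption about the TCR_EL1 layout):
c
#include <stdio.h>

#define VA_BITS          39
#define TCR_T0SZ_OFFSET  0    /* assumption: TCR_EL1.T0SZ at bits [5:0]   */
#define TCR_T1SZ_OFFSET  16   /* assumption: TCR_EL1.T1SZ at bits [21:16] */

#define TCR_T0SZ(x)  ((unsigned long)(64 - (x)) << TCR_T0SZ_OFFSET)
#define TCR_T1SZ(x)  ((unsigned long)(64 - (x)) << TCR_T1SZ_OFFSET)
#define TCR_TxSZ(x)  (TCR_T0SZ(x) | TCR_T1SZ(x))

int main(void)
{
	/* 64 - 39 = 25: both TTBR0 and TTBR1 translate a 512GB (2^39) range. */
	printf("TxSZ fields = 0x%lx\n", TCR_TxSZ(VA_BITS)); /* 0x190019 */
	return 0;
}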
That concludes the memory-related CPU initialisation.
The most important step is __primary_switch:
c
__primary_switch:
adrp x1, init_pg_dir
bl __enable_mmu
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
br x8
ENDPROC(__primary_switch)
__enable_mmu does exactly what the name says: it turns the MMU on.
__create_page_tables builds two page tables, init_pg_dir and idmap_pg_dir, and has to run with the MMU off. (The snippet below, which disables the MMU before recreating the kernel mapping, is from the KASLR/relocation path of __primary_switch; in the flow assumed here it is simply called from stext, as shown earlier.)
c
msr sctlr_el1, x20 // disable the MMU
isb
bl __create_page_tables // recreate kernel mapping
First, invalidate the cache lines covering the init_pg_dir page table:
c
__create_page_tables:
mov x28, lr
/*
* Invalidate the init page tables to avoid potential dirty cache lines
* being evicted. Other page tables are allocated in rodata as part of
* the kernel image, and thus are clean to the PoC per the boot
* protocol.
*/
adrp x0, init_pg_dir // adrp yields a physical address
adrp x1, init_pg_end
sub x1, x1, x0
bl __inval_dcache_area // invalidate the cache lines
Then zero the init_pg_dir page-table memory (xzr is the zero register):
c
adrp x0, init_pg_dir
adrp x1, init_pg_end
sub x1, x1, x0 // x1 = size of init_pg_dir in bytes
1: stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
stp xzr, xzr, [x0], #16
subs x1, x1, #64 // clear 64 bytes per iteration
b.ne 1b
vabits_user records the number of virtual address bits in use:
c
/*
* Create the identity mapping.
*/
adrp x0, idmap_pg_dir // physical address of idmap_pg_dir
adrp x3, __idmap_text_start // __pa(__idmap_text_start)
mov x5, #VA_BITS
1:
adr_l x6, vabits_user
str x5, [x6]
dmb sy
dc ivac, x6 // Invalidate potentially stale cache line
idmap_ptrs_per_pgd records the number of PGD entries in the identity-map PGD (idmap_pg_dir); PGDIR_SHIFT is the bit position at which the PGD index starts:
c
/*
* If VA_BITS == 48, we don't have to configure an additional
* translation level, but the top-level table has more entries.
*/
mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
str_l x4, idmap_ptrs_per_pgd, x5
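For the configuration used in this article (4KB pages, 3 levels, VA_BITS = 39), the relevant shifts work out as in this small sketch:
c
#include <stdio.h>

#define PAGE_SHIFT       12
#define PGTABLE_LEVELS   3
#define VA_BITS          39

/* Each level resolves PAGE_SHIFT - 3 = 9 bits (512 eight-byte entries per page). */
#define PGDIR_SHIFT  (PAGE_SHIFT + (PAGE_SHIFT - 3) * (PGTABLE_LEVELS - 1))
#define PTRS_PER_PGD (1 << (VA_BITS - PGDIR_SHIFT))

int main(void)
{
	printf("PGDIR_SHIFT  = %d\n", PGDIR_SHIFT);   /* 30: each PGD entry covers 1GB */
	printf("PTRS_PER_PGD = %d\n", PTRS_PER_PGD);  /* 512 entries in init_pg_dir   */
	/* The identity map may need to cover more physical-address bits than
	 * VA_BITS, which is why head.S stores idmap_ptrs_per_pgd at run time. */
	return 0;
}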
Then the idmap_pg_dir page table is created and populated:
c
ldr_l x4, idmap_ptrs_per_pgd
mov x5, x3 // __pa(__idmap_text_start)
adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
// build each level of page table
map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
What exactly is the region being mapped here?
shell
# arch/arm64/kernel/vmlinux.lds.S
#define IDMAP_TEXT \
. = ALIGN(SZ_4K); \
__idmap_text_start = .; \
*(.idmap.text) \
__idmap_text_end = .;
and .idmap.text is populated like this:
c
.pushsection ".idmap.text", "awx"
// ...
.popsection
map_memory is used to build the page tables and create the mapping; the macro is defined as follows:
c
/*
* Map memory for specified virtual address range. Each level of page table needed supports
* multiple entries. If a level requires n entries the next page table level is assumed to be
* formed from n pages.
*
* tbl: location of page table
* rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE)
* vstart: start address to map
* vend: end address to map - we map [vstart, vend]
* flags: flags to use to map last level entries
* phys: physical address corresponding to vstart - physical memory is contiguous
* pgds: the number of pgd entries
*
* Temporaries: istart, iend, tmp, count, sv - these need to be different registers
* Preserves: vstart, vend, flags
* Corrupts: tbl, rtbl, istart, iend, tmp, count, sv
*/
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
add \rtbl, \tbl, #PAGE_SIZE
mov \sv, \rtbl
mov \count, #0
compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
mov \sv, \rtbl
#if SWAPPER_PGTABLE_LEVELS > 2
compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
#endif
compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
.endm
Next the init_pg_dir entries are created and populated, mapping the kernel image:
c
/*
* Map the kernel image (starting with PHYS_OFFSET).
*/
adrp x0, init_pg_dir
mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text)
add x5, x5, x23 // add KASLR displacement
mov x4, PTRS_PER_PGD
adrp x6, _end // runtime __pa(_end)
adrp x3, _text // runtime __pa(_text)
sub x6, x6, x3 // _end - _text
add x6, x6, x5 // runtime __va(_end)
map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
As shown above, both the identity mapping and the kernel mapping are now in place, so the MMU is turned on and execution moves on to __primary_switched:
c
msr sctlr_el1, x19 // re-enable the MMU
isb
ic iallu // flush instructions fetched
dsb nsh // via old mapping
isb
ldr x8, =__primary_switched
adrp x0, __PHYS_OFFSET
br x8
ENDPROC(__primary_switch)
c
__primary_switched:
adrp x4, init_thread_union
add sp, x4, #THREAD_SIZE
adr_l x5, init_task
msr sp_el0, x5 // Save thread_info
// set up the exception vector table
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
isb
stp xzr, x30, [sp, #-16]!
mov x29, sp
str_l x21, __fdt_pointer, x5 // Save FDT pointer
ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings
// Clear BSS
adr_l x0, __bss_start
mov x1, xzr
adr_l x2, __bss_stop
sub x2, x2, x0
bl __pi_memset
dsb ishst // Make zero page visible to PTW
add sp, sp, #16
mov x29, #0
mov x30, #0
b start_kernel
ENDPROC(__primary_switched)
init_thread_union marks the start of the init stack; as the linker script below shows, __start_init_task = init_thread_union = init_stack. The instruction add sp, x4, #THREAD_SIZE points sp at the top of that stack, and the address of the init_task process descriptor is stored in the sp_el0 register.
c
// include/asm-generic/vmlinux.lds.h
#define INIT_TASK_DATA(align) \
. = ALIGN(align); \
__start_init_task = .; \
init_thread_union = .; \
init_stack = .; \
KEEP(*(.data..init_task)) \
KEEP(*(.data..init_thread_info)) \
. = __start_init_task + THREAD_SIZE; \
__end_init_task = .;
c
// init/init_task.c
/*
* Set up the first task table, touch at your own risk!. Base=0,
* limit=0x1fffff (=2MB)
*/
struct task_struct init_task
#ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK
__init_task_data
#endif
= {
//
};
EXPORT_SYMBOL(init_task);
Set up the exception vector table (vectors itself is dissected later in this article):
c
adr_l x8, vectors // load VBAR_EL1 with virtual
msr vbar_el1, x8 // vector table address
isb
Store the FDT physical address (saved into x21 at the very beginning) into __fdt_pointer:
c
str_l x21, __fdt_pointer, x5 // Save FDT pointer
Then kimage_voffset is saved: kimage_vaddr is the kernel's virtual (link-time) address and x0 holds the physical address the kernel was loaded at, so their difference is the offset between the virtual and physical mappings:
c
ldr_l x4, kimage_vaddr // Save the offset between
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings
Finally the .bss section is cleared, getting ready to run kernel C code, and control jumps to start_kernel.
Detailed analysis
The map_memory macro
From the idmap_pg_dir creation above, the registers passed to map_memory are:

| Register | Value |
| --- | --- |
| x0 | idmap_pg_dir |
| x3 | __idmap_text_start |
| x6 | __idmap_text_end |
| x7 | SWAPPER_MM_MMUFLAGS |
| x4 | idmap_ptrs_per_pgd |
c
map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
From the macro we can see that the PGD occupies exactly one page (PAGE_SIZE), and rtbl (tbl + PAGE_SIZE) holds the base address of the next-level table:
c
.macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
add \rtbl, \tbl, #PAGE_SIZE
mov \sv, \rtbl
mov \count, #0
The compute_indices macro computes, for the given virtual address range, the range of table indices touched at one page-table level:
c
compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
The populate_entries macro fills in the page-table entries for that index range. Here it sets up PGD/PUD/PMD entries only; no PTE level is created, because section (block) mappings are used, normally 2MB each:
c
// arch/arm64/include/asm/pgtable-hwdef.h
/* Initial memory map size */
#if ARM64_SWAPPER_USES_SECTION_MAPS
#define SWAPPER_BLOCK_SHIFT SECTION_SHIFT
#define SWAPPER_BLOCK_SIZE SECTION_SIZE
#define SWAPPER_TABLE_SHIFT PUD_SHIFT
#else
#define SWAPPER_BLOCK_SHIFT PAGE_SHIFT
#define SWAPPER_BLOCK_SIZE PAGE_SIZE
#define SWAPPER_TABLE_SHIFT PMD_SHIFT
#endif
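With 4KB pages ARM64_SWAPPER_USES_SECTION_MAPS is true, so the last level written here is the PMD and each entry maps one section. A quick sketch of mine for the size:
c
#include <stdio.h>

#define PAGE_SHIFT     12
/* With a 4KB granule, a PMD entry covers PAGE_SHIFT + 9 = 21 address bits. */
#define SECTION_SHIFT  (PAGE_SHIFT + 9)
#define SECTION_SIZE   (1UL << SECTION_SHIFT)

int main(void)
{
	/* Each last-level entry written by map_memory maps one 2MB block. */
	printf("SECTION_SIZE = %lu MB\n", SECTION_SIZE >> 20);  /* 2 */
	return 0;
}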
c
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
How does it move down to the next page-table level? One level is advanced per round:
c
mov \tbl, \sv
mov \sv, \rtbl
Let's look at the concrete macro definition:
c
.macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
lsr \iend, \vend, \shift // index of the last entry at this level
mov \istart, \ptrs
sub \istart, \istart, #1 // istart = ptrs - 1, the per-level index mask
and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1), masked into range
mov \istart, \ptrs
mul \istart, \istart, \count
add \iend, \iend, \istart // iend += (count - 1) * ptrs
// our entries span multiple tables
lsr \istart, \vstart, \shift
mov \count, \ptrs
sub \count, \count, #1
and \istart, \istart, \count
sub \count, \iend, \istart
.endm
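Here is a C model of what compute_indices computes (my own illustrative sketch, not kernel code):
c
#include <stdio.h>
#include <stdint.h>

/*
 * Model of the compute_indices macro: for one page-table level, return the
 * first and last entry index touched by [vstart, vend], plus how many extra
 * tables the previous level said are needed (count).
 */
static void compute_indices(uint64_t vstart, uint64_t vend, unsigned shift,
			    uint64_t ptrs, uint64_t count_in,
			    uint64_t *istart, uint64_t *iend, uint64_t *count_out)
{
	*iend = (vend >> shift) & (ptrs - 1);
	*iend += ptrs * count_in;        /* entries may span multiple tables    */
	*istart = (vstart >> shift) & (ptrs - 1);
	*count_out = *iend - *istart;    /* tables needed at the next level - 1 */
}

int main(void)
{
	/* Example: a 4MB identity-mapped range starting at 0x40000000,
	 * PGD level for VA_BITS = 39 (shift 30, 512 entries, count 0). */
	uint64_t istart, iend, count;

	compute_indices(0x40000000, 0x40400000, 30, 512, 0, &istart, &iend, &count);
	printf("PGD: istart=%llu iend=%llu count=%llu\n",
	       (unsigned long long)istart, (unsigned long long)iend,
	       (unsigned long long)count);   /* istart=1 iend=1 count=0 */
	return 0;
}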
populate_entries: I'll fill this part in next time.
The __inval_dcache_area routine
The dc instruction operates on the data cache:
- civac: clean and invalidate, to the Point of Coherency (PoC), the cache line holding the given virtual address
- ivac: invalidate, to the PoC, the cache line holding the given virtual address
c
ENTRY(__inval_dcache_area)
/* FALLTHROUGH */
/*
* __dma_inv_area(start, size)
* - start - virtual start address of region x0
* - size - size in question x1
*/
__dma_inv_area:
add x1, x1, x0 // x1 = x0 (start) + x1 (size) = end address
dcache_line_size x2, x3 // x2 = cache line size
sub x3, x2, #1
tst x1, x3 // end cache line aligned?
bic x1, x1, x3 // align end down (moves it earlier, never past the requested range)
b.eq 1f
dc civac, x1 // clean & invalidate D / U line
1: tst x0, x3 // start cache line aligned?
bic x0, x0, x3
b.eq 2f
dc civac, x0 // clean & invalidate D / U line
b 3f
2: dc ivac, x0 // invalidate D / U line
3: add x0, x0, x2
cmp x0, x1
b.lo 2b
dsb sy
ret
ENDPIPROC(__inval_dcache_area)
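A C model of that boundary handling (an illustrative sketch of mine; it only prints which dc operations the assembly would issue):
c
#include <stdio.h>
#include <stdint.h>

/*
 * Model of __inval_dcache_area / __dma_inv_area: invalidate [start, start+size).
 * Cache lines only partially covered at either boundary are clean+invalidated
 * (civac) so that neighbouring data sharing the line is written back first;
 * fully covered lines are simply invalidated (ivac).
 */
static void inval_dcache_area(uint64_t start, uint64_t size, uint64_t line)
{
	uint64_t end = start + size;

	if (end & (line - 1))
		printf("dc civac, 0x%llx\n", (unsigned long long)(end & ~(line - 1)));
	if (start & (line - 1)) {
		start &= ~(line - 1);
		printf("dc civac, 0x%llx\n", (unsigned long long)start);
		start += line;
	}
	for (; start < (end & ~(line - 1)); start += line)
		printf("dc ivac,  0x%llx\n", (unsigned long long)start);
}

int main(void)
{
	/* A 256-byte region unaligned at both ends, with a 64-byte cache line. */
	inval_dcache_area(0x1010, 0x100, 64);
	return 0;
}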
The vectors exception table
The ARM64 exception vector table occupies 2048 bytes: four groups of four entries, each entry 128 bytes long. The four groups are:
- EL1t: taken in EL1 while using a stack pointer other than SP_ELx (i.e. SP_EL0)
- EL1h: taken in EL1 while using the current SP_ELx (i.e. SP_EL1)
- taken from a lower exception level (EL0) into the current level EL1, with the lower level running AArch64 (Lower EL, AArch64)
- taken from a lower exception level (EL0) into the current level EL1, with the lower level running AArch32 (Lower EL, AArch32)
c
/*
* Exception vectors.
*/
.pushsection ".entry.text", "ax"
.align 11
ENTRY(vectors)
kernel_ventry 1, sync_invalid // Synchronous EL1t
kernel_ventry 1, irq_invalid // IRQ EL1t
kernel_ventry 1, fiq_invalid // FIQ EL1t
kernel_ventry 1, error_invalid // Error EL1t
kernel_ventry 1, sync // Synchronous EL1h
kernel_ventry 1, irq // IRQ EL1h
kernel_ventry 1, fiq_invalid // FIQ EL1h
kernel_ventry 1, error // Error EL1h
kernel_ventry 0, sync // Synchronous 64-bit EL0
kernel_ventry 0, irq // IRQ 64-bit EL0
kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0
kernel_ventry 0, error // Error 64-bit EL0
kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0
kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0
kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0
kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0
END(vectors)
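Each entry is 128 (0x80) bytes, so a handler's offset inside the 2KB table is fixed by its group and type; a small sketch of the arithmetic (the group/type numbering is my own, for illustration only):
c
#include <stdio.h>

/* Each vector entry is 128 (0x80) bytes; 4 groups x 4 types = 2048 bytes,
 * which is why the table is aligned with ".align 11" (2^11). */
static unsigned vector_offset(unsigned group, unsigned type)
{
	return (group * 4 + type) * 0x80;
}

int main(void)
{
	/* group 1 = "current EL with SP_ELx" (EL1h), type 1 = IRQ */
	printf("EL1h IRQ vector at VBAR_EL1 + 0x%x\n", vector_offset(1, 1)); /* 0x280 */
	/* group 2 = "lower EL, AArch64", type 0 = synchronous (e.g. SVC from EL0) */
	printf("EL0 sync vector at VBAR_EL1 + 0x%x\n", vector_offset(2, 0)); /* 0x400 */
	return 0;
}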
The four exception types map onto the following codes:
c
#define BAD_SYNC 0
#define BAD_IRQ 1
#define BAD_FIQ 2
#define BAD_ERROR 3