Linux 内存 domain 管理

文章目录

  • [1. 前言](#1. 前言)
  • [2. 内存 domain](#2. 内存 domain)
    • [2.1 domain 硬件支持](#2.1 domain 硬件支持)
    • [2.2 Linux 的 domain 管理](#2.2 Linux 的 domain 管理)
      • [2.2.1 定义](#2.2.1 定义)
      • [2.2.2 初始化](#2.2.2 初始化)
    • [2.3 Domain fault](#2.3 Domain fault)
  • [3. 参考资料](#3. 参考资料)

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 内存 domain

本文以 ARMv7 架构 + Linux 4.14.x 为例,说明 Linux 内存管理的 domain

2.1 domain 硬件支持

ARMv7 支持最多 16 个内存 domain

32-bit 的 DACR 寄存器定义了 16 个内存 domain(每个 domain 占 2 bit),每个内存 domain 可以是以下 4 种类型之一:

  • 0b00:No access
    任何对标记为 No access 类型内存 domain 的访问,会生成 Domain fault
  • 0b01:Client
    对 Client 类型内存 domain 的访问,会做权限检查,不满足权限的访问会触发 Permission fault
  • 0b10: Reserved
    保留类型。
  • 0b11: Manager
    对 Manager 类型内存 domain 的访问,不做权限检查。

2.2 Linux 的 domain 管理

2.2.1 定义

首先看 Linux 下对内存 domain 编号定义:

c 复制代码
/* arch/arm/include/asm/domain.h */

/*
 * Domain numbers
 *
 *  DOMAIN_IO     - domain 2 includes all IO only
 *  DOMAIN_USER   - domain 1 includes all user memory only
 *  DOMAIN_KERNEL - domain 0 includes all kernel memory only
 *
 * The domain numbering depends on whether we support 36 physical
 * address for I/O or not.  Addresses above the 32 bit boundary can
 * only be mapped using supersections and supersections can only
 * be set for domain 0.  We could just default to DOMAIN_IO as zero,
 * but there may be systems with supersection support and no 36-bit
 * addressing.  In such cases, we want to map system memory with
 * supersections to reduce TLB misses and footprint.
 *
 * 36-bit addressing and supersections are only available on
 * CPUs based on ARMv6+ or the Intel XSC3 core.
 */
#ifndef CONFIG_IO_36
#define DOMAIN_KERNEL	0
#define DOMAIN_USER	1
#define DOMAIN_IO	2
#else
#define DOMAIN_KERNEL	2
#define DOMAIN_USER	1
#define DOMAIN_IO	0
#endif
#define DOMAIN_VECTORS	3

从上可见,Linux 使用了 DOMAIN_KERNEL,DOMAIN_USER,DOMAIN_IO,DOMAIN_VECTORS 这 4 个内存 domain。

再来看下 Linux 对 domain 类型的定义:

c 复制代码
/* arch/arm/include/asm/domain.h */

/*
 * Domain types
 *
 * Access-type values for a domain: 0 = no access, 1 = client,
 * 3 = manager (when CONFIG_CPU_USE_DOMAINS is set).
 */
#define DOMAIN_NOACCESS	0
#define DOMAIN_CLIENT	1
#ifdef CONFIG_CPU_USE_DOMAINS
#define DOMAIN_MANAGER	3
#else
/*
 * Without CONFIG_CPU_USE_DOMAINS, DOMAIN_MANAGER has the same value as
 * DOMAIN_CLIENT, so a "manager" domain is programmed with the client
 * encoding.
 */
#define DOMAIN_MANAGER	1
#endif

2.2.2 初始化

c 复制代码
/* arch/arm/kernel/head.S */

__enable_mmu:
	...
#ifdef CONFIG_ARM_LPAE
	...
#else
	/*
	 * Program the memory domain access control register.
	 * DDI0406C_d_armv7ar_arm.pdf, P554
	 * B4.1.43 DACR, Domain Access Control Register, VMSA
	 */
	mov	r5, #DACR_INIT
	mcr	p15, 0, r5, c3, c0, 0		@ load domain access register (DACR = DACR_INIT)
#endif

DACR_INIT 的定义如下:

c 复制代码
/* arch/arm/include/asm/domain.h */

/* Mask covering the 2-bit field that belongs to domain number `dom`. */
#define domain_mask(dom)	((3) << (2 * (dom)))
/* Access type `type` shifted into the 2-bit field of domain number `dom`. */
#define domain_val(dom,type)	((type) << (2 * (dom)))

/*
 * Initial DACR value, built from one domain_val() per domain in use.
 * The two variants differ only in DOMAIN_USER: DOMAIN_NOACCESS when
 * CONFIG_CPU_SW_DOMAIN_PAN is set, DOMAIN_CLIENT otherwise. The other
 * three domains get the same access type in both cases.
 */
#ifdef CONFIG_CPU_SW_DOMAIN_PAN
#define DACR_INIT \
	(domain_val(DOMAIN_USER, DOMAIN_NOACCESS) | \
	 domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
	 domain_val(DOMAIN_IO, DOMAIN_CLIENT) | \
	 domain_val(DOMAIN_VECTORS, DOMAIN_CLIENT))
#else
#define DACR_INIT \
	(domain_val(DOMAIN_USER, DOMAIN_CLIENT) | \
	 domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \
	 domain_val(DOMAIN_IO, DOMAIN_CLIENT) | \
	 domain_val(DOMAIN_VECTORS, DOMAIN_CLIENT))
#endif

所以,__enable_mmu 启用 MMU 时,把 DOMAIN_KERNEL,DOMAIN_USER,DOMAIN_IO,DOMAIN_VECTORS 这 4 个内存 domain 的类型初始化到 DACR 寄存器。CONFIG_CPU_SW_DOMAIN_PAN 开启与关闭两种情形的区别仅在于 DOMAIN_USER 的类型:开启时为 DOMAIN_NOACCESS,关闭时为 DOMAIN_CLIENT;其余 3 个 domain 的类型相同,具体见上面 DACR_INIT 的定义。

这里只是初始设定 Linux 的 DOMAIN_KERNEL,DOMAIN_USER,DOMAIN_IO,DOMAIN_VECTORS 这 4 个内存 domain 的类型,那么,怎么决定哪块内存属于哪个 domain?答案是页表(这里讨论的是 Short-descriptor 格式页表项):

mem_types[] 定义了各种类型内存所属的 domain 编号(注意不是 domain 类型),而后创建页表时,按 mem_types[] 设置各类型内存的第一级页表项的 domain 域:

c 复制代码
/* arch/arm/mm/mmu.c */

/*
 * Per-memory-type mapping attributes, indexed by MT_* type.
 *
 * Each entry gives the PTE protection bits (.prot_pte), the first-level
 * table and section descriptor templates (.prot_l1 / .prot_sect) and the
 * domain number (.domain) the mapping belongs to. Device-style types use
 * DOMAIN_IO, the vector pages use DOMAIN_VECTORS, and the remaining
 * types use DOMAIN_KERNEL.
 */
static struct mem_type mem_types[] __ro_after_init = {
	[MT_DEVICE] = {		  /* Strongly ordered / ARMv6 shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
				  L_PTE_SHARED,
		.prot_pte_s2	= s2_policy(PROT_PTE_S2_DEVICE) |
				  s2_policy(L_PTE_S2_MT_DEV_SHARED) |
				  L_PTE_SHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_S,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_CACHED] = {	  /* ioremap_cached */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE | PMD_SECT_WB,
		.domain		= DOMAIN_IO,
	},
	[MT_DEVICE_WC] = {	/* ioremap_wc */
		.prot_pte	= PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PROT_SECT_DEVICE,
		.domain		= DOMAIN_IO,
	},
	[MT_UNCACHED] = {
		.prot_pte	= PROT_PTE_DEVICE,
		.prot_l1	= PMD_TYPE_TABLE,
		.prot_sect	= PMD_TYPE_SECT | PMD_SECT_XN,
		.domain		= DOMAIN_IO,
	},
	[MT_CACHECLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
#ifndef CONFIG_ARM_LPAE
	[MT_MINICLEAN] = {
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
		.domain    = DOMAIN_KERNEL,
	},
#endif
	[MT_LOW_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_RDONLY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_VECTORS,
	},
	[MT_HIGH_VECTORS] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_USER | L_PTE_RDONLY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_VECTORS,
	},
	[MT_MEMORY_RWX] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RW] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
			     L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_ROM] = {
		.prot_sect = PMD_TYPE_SECT,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RWX_NONCACHED] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_MT_BUFFERABLE,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RW_DTCM] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RWX_ITCM] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_RW_SO] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_MT_UNCACHED | L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE | PMD_SECT_S |
				PMD_SECT_UNCACHED | PMD_SECT_XN,
		.domain    = DOMAIN_KERNEL,
	},
	[MT_MEMORY_DMA_READY] = {
		.prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
				L_PTE_XN,
		.prot_l1   = PMD_TYPE_TABLE,
		.domain    = DOMAIN_KERNEL,
	},
};

build_mem_type_table() 初始化各类型内存的 domain number(DOMAIN_KERNEL, ...)

c 复制代码
start_kernel()
	setup_arch()
		early_mm_init()
			build_mem_type_table()

/*
 * Excerpt: the tail of build_mem_type_table(), which folds each memory
 * type's domain number into its first-level descriptor templates.
 * Earlier parts of the function are elided ("...").
 */
static void __init build_mem_type_table(void)
{
	...

	/*
	 * OR PMD_DOMAIN(domain number) into every non-zero prot_l1 and
	 * prot_sect template. Entries whose template is zero are left
	 * untouched. These templates are consumed later when the page
	 * tables are created.
	 */
	for (i = 0; i < ARRAY_SIZE(mem_types); i++) {
		struct mem_type *t = &mem_types[i];
		if (t->prot_l1)
			t->prot_l1 |= PMD_DOMAIN(t->domain);
		if (t->prot_sect)
			t->prot_sect |= PMD_DOMAIN(t->domain);
	}
}

创建页表时将预设的 domain number(DOMAIN_KERNEL, ...) 填充到页目录表项,如 BOOT 阶段创建 lowmem 页表的 create_mapping() 调用:

c 复制代码
create_mapping()
	__create_mapping(&init_mm, md, early_alloc, false);

/*
 * Excerpt: create the mapping described by @md in @mm. Only the parts
 * relevant to domain handling are shown; the rest is elided ("...").
 */
static void __init __create_mapping(struct mm_struct *mm, struct map_desc *md,
				    void *(*alloc)(unsigned long sz),
				    bool ng)
{
	...
	const struct mem_type *type;
	...

	/*
	 * Look up the attributes for memory type md->type. The domain
	 * number was already folded into this entry's prot_l1/prot_sect
	 * by build_mem_type_table().
	 */
	type = &mem_types[md->type];

	do {
		...
		alloc_init_pud(pgd, addr, next, phys, type, alloc, ng);
		...
	} while (pgd++, addr != end);
}

alloc_init_pud()
	alloc_init_pmd()

/*
 * Map [addr, end) to physical memory starting at @phys, one PMD-sized
 * step at a time, using the attributes in @type.
 *
 * Each step is mapped as a section when @type has a section template
 * and addr, next and phys are all section-aligned; otherwise it is
 * mapped with individual PTEs.
 */
static void __init alloc_init_pmd(pud_t *pud, unsigned long addr,
				      unsigned long end, phys_addr_t phys,
				      const struct mem_type *type,
				      void *(*alloc)(unsigned long sz), bool ng)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);

		if (type->prot_sect &&
				((addr | next | phys) & ~SECTION_MASK) == 0) {
			/* section mapping */
			__map_init_section(pmd, addr, next, phys, type, ng);
		} else {
			/* page mapping */
			alloc_init_pte(pmd, addr, next,
				       __phys_to_pfn(phys), type, alloc, ng);
		}

		/*
		 * Advance the physical address by the span just mapped so
		 * that a following iteration maps the next physical range
		 * instead of reusing the same one.
		 */
		phys += next - addr;

	} while (pmd++, addr = next, addr != end);
}

/*
 * Fill first-level section entries covering [addr, end), mapping them to
 * consecutive SECTION_SIZE chunks of physical memory starting at @phys.
 */
static void __init __map_init_section(pmd_t *pmd, unsigned long addr,
			unsigned long end, phys_addr_t phys,
			const struct mem_type *type, bool ng)
{
	pmd_t *p = pmd;

#ifndef CONFIG_ARM_LPAE
	/*
	 * In classic MMU format, puds and pmds are folded in to
	 * the pgds. pmd_offset gives the PGD entry. PGDs refer to a
	 * group of L1 entries making up one logical pointer to
	 * an L2 table (2MB), where as PMDs refer to the individual
	 * L1 entries (1MB). Hence increment to get the correct
	 * offset for odd 1MB sections.
	 * (See arch/arm/include/asm/pgtable-2level.h)
	 */
	if (addr & SECTION_SIZE)
		pmd++;
#endif
	do {
		/*
		 * type->prot_sect already contains PMD_DOMAIN(type->domain),
		 * ORed in by build_mem_type_table(), so writing it here tags
		 * the section with its domain number. The access type of
		 * that domain is not stored in the descriptor: it is the
		 * 2-bit DACR field selected by domain_mask(domain), i.e.
		 * (DACR >> (2 * domain)) & 3, as loaded from DACR_INIT in
		 * __enable_mmu.
		 */
		*pmd = __pmd(phys | type->prot_sect | (ng ? PMD_SECT_nG : 0));
		phys += SECTION_SIZE;
	} while (pmd++, addr += SECTION_SIZE, addr != end);

	flush_pmd_entry(p);
}

/*
 * Fill the PTEs covering [addr, end) with consecutive page frames
 * starting at @pfn, using the protection bits in type->prot_pte.
 */
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
				  unsigned long end, unsigned long pfn,
				  const struct mem_type *type,
				  void *(*alloc)(unsigned long sz),
				  bool ng)
{
	/*
	 * For a page mapping the domain number is carried by type->prot_l1
	 * (build_mem_type_table() ORs PMD_DOMAIN() into prot_l1, not into
	 * prot_pte). It is handed to arm_pte_alloc() here, which presumably
	 * installs it in the first-level table entry - confirm against
	 * arm_pte_alloc(), which is not shown in this excerpt.
	 */
	pte_t *pte = arm_pte_alloc(pmd, addr, type->prot_l1, alloc);
	do {
		/*
		 * type->prot_pte holds only the page protection bits; no
		 * domain number is written at this level. The access type
		 * of the domain is the 2-bit DACR field selected by
		 * domain_mask(domain), i.e. (DACR >> (2 * domain)) & 3.
		 */
		set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)),
			    ng ? PTE_EXT_NG : 0);
		pfn++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

上面的代码分析的是 BOOT 阶段内存映射时 domain number 设置过程,那内核空间其它动态内存映射,以及用户空间内存映射,又是如何配置内存的 domain number 呢?

先看下内核空间其它动态内存映射 domain number 的设置:

c 复制代码
/* arch/arm/include/asm/pgalloc.h */

/* [kernel space] PMD table entry; its domain is DOMAIN_KERNEL */
#define _PAGE_KERNEL_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_KERNEL))

/*
 * Populate the pmdp entry with a pointer to the pte.  This pmd is part
 * of the mm address space.
 *
 * Ensure that we always set both PMD entries.
 */
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
{
	/*
	 * The pmd must be loaded with the physical address of the PTE table
	 */
	__pmd_populate(pmdp, __pa(ptep), _PAGE_KERNEL_TABLE);
}

可以看到,内核空间动态内存映射,将内存的 domain number 设置为 DOMAIN_KERNEL

再来看看用户空间内存映射 domain number 的设置:

c 复制代码
/* arch/arm/include/asm/pgalloc.h */

/* [user space] PMD table entry; its domain is DOMAIN_USER */
#define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
c 复制代码
/* arch/arm/mm/mmu.c */

/* PMD table-entry bits for user page tables, initialised to _PAGE_USER_TABLE. */
pmdval_t user_pmd_table = _PAGE_USER_TABLE;
c 复制代码
/* arch/arm/include/asm/pgalloc.h */

/*
 * Point the user PMD entry @pmdp at the PTE table page @ptep.
 *
 * The table-entry bits come from the user_pmd_table variable on ARMv6+
 * without LPAE, and from the _PAGE_USER_TABLE constant otherwise. The
 * constant carries PMD_DOMAIN(DOMAIN_USER).
 */
static inline void
pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
{
	extern pmdval_t user_pmd_table;
	pmdval_t prot;

	if (__LINUX_ARM_ARCH__ >= 6 && !IS_ENABLED(CONFIG_ARM_LPAE))
		prot = user_pmd_table;
	else
		prot = _PAGE_USER_TABLE;

	/* Load the entry with the physical address of the PTE table page. */
	__pmd_populate(pmdp, page_to_phys(ptep), prot);
}

可见,用户空间内存映射,将内存的 domain number 设置为 DOMAIN_USER

2.3 Domain fault

看一个 domain 访问触发的 Domain fault 例子:

c 复制代码
Unhandled fault: page domain fault (0x01b) at 0x00000000
pgd = 9ea5c000
[00000000] *pgd=7eedc831, *pte=00000000, *ppte=00000000
Internal error: : 1b [#1] SMP ARM
Modules linked in:
CPU: 1 PID: 982 Comm: ping Not tainted 4.14.111 #21
Hardware name: ARM-Versatile Express
task: 9ee44800 task.stack: 9ea36000
PC is at ___bpf_prog_run+0x1440/0x19f8
LR is at irq_work_queue+0x14/0x78
pc : [<801eec58>]    lr : [<801eae7c>]    psr: 800e0013
sp : 9ea37978  ip : 00000000  fp : 9ea379dc
r10: 801eb234  r9 : 8080c668  r8 : 9ea37a00
r7 : 00000000  r6 : 9ef94c00  r5 : 801fa818  r4 : a12be094
r3 : 00000001  r2 : 9ea37a08  r1 : 00000000  r0 : 00000000
Flags: Nzcv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 7ea5c06a  DAC: 00000051
Process ping (pid: 982, stack limit = 0x9ea36210)
Stack: (0x9ea37978 to 0x9ea38000)
7960:                                                       0000000c 00000000
7980: 9ef94c00 00000000 9ef94c00 00000011 600e0013 00000000 00000000 00000000
79a0: 808ce7e4 00000000 9ea379f4 9ea379b8 80169f5c a12be000 9ef94c00 00000011
79c0: 8092230c 9f411000 00000062 00000062 9ea37a64 9ea379e0 801ef648 801ed824
79e0: 00000062 00000062 9ea37a0c 9ea379f8 8016a258 80169e7c 80926de4 00000001
7a00: 9f7b7098 00000000 00000000 00000000 9ea379fc 00000000 00000001 00000000
7a20: 9ea37a4c 9f60c5c0 9ef94c00 00000011 9ef94c00 00000000 8016af94 8016b5e0
7a40: 9ea37a6c 9ea37a5c 9ea37a9c 9ea37a68 9ea37a00 00000000 9ea37a9c 9ea37a68
7a60: 806c0c64 801ef618 00000000 00000000 00000000 00000000 00000000 00000000
7a80: 9ef94c00 9ee1dc00 9f411000 9efe6202 9ea37ad4 9ea37aa0 806c0d98 806c0bf4
7aa0: c3db3c00 1889692d 9ef949c0 806c0d10 9ef94c00 9ee1df80 9ef949c0 9f411054
7ac0: 9f411000 80921294 9ea37b14 9ea37ad8 8061dd40 806c0d1c 80b04e7c 9f411000
7ae0: 01080020 00000000 401d7c69 9ef949c0 80a86204 00000000 9f411000 9f5fec00
7b00: 9f411000 00000000 9ea37b6c 9ea37b18 806237ec 8061dadc 9ef949c0 80a86204
7b20: 9ea37b88 80b02d00 80b04e7c 80b03c6c 9ea36000 80b65244 80b65230 9f5fec00
7b40: 80710b34 9ef949c0 80a86204 00000001 00000000 9f5fec00 9f411000 8064c65c
7b60: 9ea37bcc 9ea37b70 806240f0 8062374c 80632de4 9ef94900 9f6cdec8 9f411000
7b80: 9ea37bac 00000200 fffffff4 ffffe000 80634030 9ef94914 00000008 8064c688
7ba0: 00000800 9ef949c0 9ef94900 9f411000 00000000 00000000 9ef9495c 8064c65c
7bc0: 9ea37bdc 9ea37bd0 80624194 80623aa8 9ea37c14 9ea37be0 80633fcc 80624184
7be0: 00000000 00000054 9ea37c14 9f411000 9ef949c0 9ef94900 9f5d9980 80b54740
7c00: 0000ffff 00000000 9ea37c44 9ea37c18 8065e668 80633ec8 00000008 0100007f
7c20: 9ef949c0 9eeeec00 9ef949c0 9f411000 80b54740 0000ffff 9ea37c7c 9ea37c48
7c40: 80660524 8065e464 9ea37d14 00000000 9ea37c7c 9ef949c0 80b54740 9eeeec00
7c60: 9f411000 9ea37d14 00000000 00000000 9ea37c9c 9ea37c80 8066089c 80660348
7c80: 9ef949c0 80b54740 9eeeec00 00000000 9ea37cbc 9ea37ca0 8065e1e4 806607f8
7ca0: 00000040 80b54740 9eeeec00 00000000 9ea37cd4 9ea37cc0 8066116c 8065e184
7cc0: 00000040 9ea37f48 9ea37ce4 9ea37cd8 80661220 80661154 9ea37dec 9ea37ce8
7ce0: 8068a2e0 806611f8 00000040 00000000 9ea37d2c 9ea37d14 00000000 80b03cac
7d00: 9ea37d7c 9ea37d10 801562cc 80151a94 80a868c0 00000000 9fbc38c0 9ea37f48
7d20: 80700008 ffffe000 00000002 9ea37d38 80140000 0100007f 00000000 00000000
7d40: ffff0000 00000200 00000001 00000001 00000000 00010000 00000000 9ea37e48
7d60: 00000000 00000000 00000000 802767f4 0100007f 0100007f 00000008 9ea37e24
7d80: 00000001 00000001 9ea36000 00000128 9ea37dd4 9ea37da0 8042a2d4 8027676c
7da0: 9ea37e48 9ea37dac 00000010 9ea37e48 00000010 9ea37f50 00000000 9ea37e24
7dc0: 00000051 9eeeec00 00000000 00000000 9f2d6480 801080a4 9ea37e28 00000000
7de0: 9ea37e0c 9ea37df0 8069842c 80689e5c 00000001 0001814c 9ea37f48 00000000
7e00: 9ea37e1c 9ea37e10 8060397c 806983e0 9ea37f34 9ea37e20 8060444c 80603964
7e20: 9efae120 00000000 9ea37ea4 9ea37e38 8024ec74 80217d8c 9f2d6320 fffff000
7e40: 9efae120 00000054 00019314 00000040 76f08000 9ea5ddb8 9ea5ddb8 00000000
7e60: 00000000 00000000 00000000 00000000 9ffdad5c 00000000 9ea37ea4 9ea37fb0
7e80: 9ee44800 80000017 00000002 0100007f 00000000 00000000 9ea37efc 9ea37ea8
7ea0: 80711608 80161aa4 00000000 80609330 00001180 00000001 00000000 00000000
7ec0: 00000000 00000000 00000100 00000054 9f2d6480 80b0885c 00000017 807113ac
7ee0: 76f08314 9ea37fb0 00000000 802891f4 7edad114 9f2fb428 9ef57600 80295de4
7f00: 9ea37f1c 9ea37f10 80295de4 00018164 00000000 9f2d6480 00000128 801080a4
7f20: 9ea36000 00000128 9ea37f94 9ea37f38 806052e4 8060426c 00000000 00000000
7f40: 00000000 fffffff7 9ea37e88 00000010 00000001 00000000 00000000 9ea37e50
7f60: 00000000 0000004e 0001814c 00000000 00000000 00000000 7edac150 00019304
7f80: 00000000 00000040 9ea37fa4 9ea37f98 80605328 806052a0 00000000 9ea37fa8
7fa0: 80107ec0 8060531c 00019304 00000000 00000003 00018164 00000000 0001813c
7fc0: 00019304 00000000 00000040 00000128 0002b350 00018230 000192f0 10624dd3
7fe0: 00000000 7edac14c 0000af4b 76f36ad8 400e0030 00000003 00000000 00000000

[<801eec58>] (___bpf_prog_run) from [<801ef648>] (__bpf_prog_run32+0x3c/0x44)
[<801ef648>] (__bpf_prog_run32) from [<806c0c64>] (run_filter+0x7c/0x128)
[<806c0c64>] (run_filter) from [<806c0d98>] (packet_rcv+0x88/0x38c)
[<806c0d98>] (packet_rcv) from [<8061dd40>] (dev_queue_xmit_nit+0x270/0x298)
[<8061dd40>] (dev_queue_xmit_nit) from [<806237ec>] (dev_hard_start_xmit+0xac/0x258)
[<806237ec>] (dev_hard_start_xmit) from [<806240f0>] (__dev_queue_xmit+0x654/0x6dc)
[<806240f0>] (__dev_queue_xmit) from [<80624194>] (dev_queue_xmit+0x1c/0x20)
[<80624194>] (dev_queue_xmit) from [<80633fcc>] (neigh_resolve_output+0x110/0x198)
[<80633fcc>] (neigh_resolve_output) from [<8065e668>] (ip_finish_output2+0x210/0x44c)
[<8065e668>] (ip_finish_output2) from [<80660524>] (ip_finish_output+0x1e8/0x260)
[<80660524>] (ip_finish_output) from [<8066089c>] (ip_output+0xb0/0xbc)
[<8066089c>] (ip_output) from [<8065e1e4>] (ip_local_out+0x6c/0x78)
[<8065e1e4>] (ip_local_out) from [<8066116c>] (ip_send_skb+0x24/0xa4)
[<8066116c>] (ip_send_skb) from [<80661220>] (ip_push_pending_frames+0x34/0x40)
[<80661220>] (ip_push_pending_frames) from [<8068a2e0>] (raw_sendmsg+0x490/0x83c)
[<8068a2e0>] (raw_sendmsg) from [<8069842c>] (inet_sendmsg+0x58/0xf4)
[<8069842c>] (inet_sendmsg) from [<8060397c>] (sock_sendmsg+0x24/0x34)
[<8060397c>] (sock_sendmsg) from [<8060444c>] (___sys_sendmsg+0x1ec/0x214)
[<8060444c>] (___sys_sendmsg) from [<806052e4>] (__sys_sendmsg+0x50/0x7c)
[<806052e4>] (__sys_sendmsg) from [<80605328>] (SyS_sendmsg+0x18/0x1c)
[<80605328>] (SyS_sendmsg) from [<80107ec0>] (ret_fast_syscall+0x0/0x28)

Code: e2844008 e798c182 e3a01000 e0882183 (e79c0000) 

---[ end trace b3bef05bd8168206 ]---
Kernel panic - not syncing: Fatal exception in interrupt

CPU2: stopping
CPU: 2 PID: 0 Comm: swapper/2 Tainted: G      D         4.14.111 #21
Hardware name: ARM-Versatile Express
[<80110660>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f4e18>] (dump_stack+0xac/0xd8)
[<806f4e18>] (dump_stack) from [<8010ea44>] (handle_IPI+0x2d0/0x34c)
[<8010ea44>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<80710b30>] (__irq_svc+0x70/0x98)
Exception stack(0x9f509f28 to 0x9f509f70)
9f20:                   00000001 00000000 00000000 80b0402c 9f508000 00000000
9f40: 00000000 80b03cb8 80b03c6c 80a85c08 9f509f98 9f509f84 9f509f78 9f509f78
9f60: 801088c8 801088cc 60070013 ffffffff
[<80710b30>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<807102b4>] (default_idle_call+0x34/0x48)
[<807102b4>] (default_idle_call) from [<8015e9b4>] (do_idle+0x16c/0x218)
[<8015e9b4>] (do_idle) from [<8015ed0c>] (cpu_startup_entry+0x28/0x2c)
[<8015ed0c>] (cpu_startup_entry) from [<8010e504>] (secondary_start_kernel+0x168/0x170)
[<8010e504>] (secondary_start_kernel) from [<60101a0c>] (0x60101a0c)
CPU3: stopping
CPU: 3 PID: 0 Comm: swapper/3 Tainted: G      D         4.14.111 #21
Hardware name: ARM-Versatile Express
[<80110660>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f4e18>] (dump_stack+0xac/0xd8)
[<806f4e18>] (dump_stack) from [<8010ea44>] (handle_IPI+0x2d0/0x34c)
[<8010ea44>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<80710b30>] (__irq_svc+0x70/0x98)
Exception stack(0x9f50bf28 to 0x9f50bf70)
bf20:                   00000001 00000000 00000000 80b0402c 9f50a000 00000000
bf40: 00000000 80b03cb8 80b03c6c 80a85c08 9f50bf98 9f50bf84 9f50bf78 9f50bf78
bf60: 801088c8 801088cc 600e0013 ffffffff
[<80710b30>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<807102b4>] (default_idle_call+0x34/0x48)
[<807102b4>] (default_idle_call) from [<8015e9b4>] (do_idle+0x16c/0x218)
[<8015e9b4>] (do_idle) from [<8015ed0c>] (cpu_startup_entry+0x28/0x2c)
[<8015ed0c>] (cpu_startup_entry) from [<8010e504>] (secondary_start_kernel+0x168/0x170)
[<8010e504>] (secondary_start_kernel) from [<60101a0c>] (0x60101a0c)
CPU0: stopping
CPU: 0 PID: 0 Comm: swapper/0 Tainted: G      D         4.14.111 #21
Hardware name: ARM-Versatile Express
[<80110660>] (unwind_backtrace) from [<8010c3e0>] (show_stack+0x20/0x24)
[<8010c3e0>] (show_stack) from [<806f4e18>] (dump_stack+0xac/0xd8)
[<806f4e18>] (dump_stack) from [<8010ea44>] (handle_IPI+0x2d0/0x34c)
[<8010ea44>] (handle_IPI) from [<80101570>] (gic_handle_irq+0x9c/0xa0)
[<80101570>] (gic_handle_irq) from [<80710b30>] (__irq_svc+0x70/0x98)
Exception stack(0x80b01ee0 to 0x80b01f28)
1ee0: 00000001 00000000 00000000 80b0402c 80b00000 00000000 00000000 80b03cb8
1f00: 80b03c6c 80a85c08 80b01f50 80b01f3c 80b01f30 80b01f30 801088c8 801088cc
1f20: 60070013 ffffffff
[<80710b30>] (__irq_svc) from [<801088cc>] (arch_cpu_idle+0x34/0x4c)
[<801088cc>] (arch_cpu_idle) from [<807102b4>] (default_idle_call+0x34/0x48)
[<807102b4>] (default_idle_call) from [<8015e9b4>] (do_idle+0x16c/0x218)
[<8015e9b4>] (do_idle) from [<8015ed0c>] (cpu_startup_entry+0x28/0x2c)
[<8015ed0c>] (cpu_startup_entry) from [<80709ce0>] (rest_init+0xbc/0xc0)
[<80709ce0>] (rest_init) from [<80a00dec>] (start_kernel+0x3a8/0x3b4)
---[ end Kernel panic - not syncing: Fatal exception in interrupt

这里是一个 eBPF 测试程序导致的 Domain fault 异常。简略分析下日志:

c 复制代码
Unhandled fault: page domain fault (0x01b) at 0x00000000

日志显示,当前 DFSR = 0x01b

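按 Short-descriptor 格式解析 DFSR = 0x01b = 0b0000_0001_1011:bits[7:4] 为 domain number,其值为 0x1(即 DOMAIN_USER);fault 类型 FS[4:0](由 bit[10] 与 bits[3:0] 组成)为 0b01011(0xb),即 Second level(page)页表访问引发的 Domain fault(编码表见 ARMv7 架构手册对 DFSR 的说明)。日志中的 DAC: 00000051 与此一致:DACR 中 domain 1 对应的 bits[3:2] 为 0b00(No access,正是 CONFIG_CPU_SW_DOMAIN_PAN 开启时 DACR_INIT 对 DOMAIN_USER 的设置),所以内核态访问属于 DOMAIN_USER 的地址 0x00000000 时触发了 Domain fault。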

page domain fault 信息来自 arch/arm/mm/fsr-2level.c 中的 fsr_info[] 表(上下文使用 2 级分页):

c 复制代码
static struct fsr_info fsr_info[] = {
	...
	{ do_bad,		SIGSEGV, SEGV_ACCERR,	"page domain fault"		   }, 
	...
};

Unhandled fault: page domain fault (0x01b) at 0x00000000 这行日志由 data abort 异常的 C 入口函数 do_DataAbort() 打印:

c 复制代码
/* arch/arm/mm/fault.c */

/*
 * Fault handler that performs no recovery. It always returns 1, which
 * makes do_DataAbort() continue to its "Unhandled fault" reporting path.
 */
static int
do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	return 1;
}

/*
 * C entry point for data aborts.
 *
 * Selects the fsr_info[] entry indexed by fsr_fs(fsr) and calls its
 * handler. A zero return means the fault was handled and nothing more is
 * done. Otherwise the fault is logged as "Unhandled fault", the page
 * table entries for @addr are dumped with show_pte(), and a siginfo built
 * from the entry's signal and code is passed to arm_notify_die().
 */
asmlinkage void __exception
do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
	struct siginfo info;

	if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs)) /* a domain fault dispatches to do_bad() */
		return;

	pr_alert("Unhandled fault: %s (0x%03x) at 0x%08lx\n",
		inf->name, fsr, addr);
	show_pte(current->mm, addr);

	info.si_signo = inf->sig;
	info.si_errno = 0;
	info.si_code  = inf->code;
	info.si_addr  = (void __user *)addr;
	arm_notify_die("", regs, &info, fsr, 0);
}

另外,从页表项的打印日志

c 复制代码
pgd = 9ea5c000
[00000000] *pgd=7eedc831, *pte=00000000, *ppte=00000000

*pgd=7eedc831,即第一级页表项(Short-descriptor 格式的 Page table 描述符)的值为 0x7eedc831,由它也可以推导出是对 DOMAIN_USER 的访问出错:0x7eedc831 = 0b0111111011101101110010 0 0001 1 0 0 01,其中 bits[8:5] 是 domain 域,其值为 0b0001,即 domain number = 1,也就是 DOMAIN_USER。

3. 参考资料

[1] DDI0406C_d_armv7ar_arm.pdf

相关推荐
warton882 小时前
ubuntu24下操作配置mysql8相关目录到指定地址
linux·运维·mysql
小亮亮虫2 小时前
linux-交叉编译链安装
linux
学Linux的语莫2 小时前
本地部署ollama
linux·服务器·langchain
`林中水滴`2 小时前
Linux系列:Linux 安装 MySQL 5.7.27 教程
linux·mysql
m0_738120722 小时前
应急响应——知攻善防蓝队溯源靶机Linux-2详细流程
linux·服务器·网络·安全·web安全·php
散人10242 小时前
Linux(Ubuntu)RIME 中文输入法-朙月拼音
linux·ubuntu·rime
草莓熊Lotso2 小时前
Linux 2.6 内核 O(1) 调度队列深度解析:为什么它能实现常数时间调度?
linux·运维·服务器·数据结构·人工智能·哈希算法·散列表
不染尘.2 小时前
Linux的基本管理及命令(上)
linux·windows·ssh
米高梅狮子2 小时前
12. SELinux 加固 Linux 安全
linux·运维·安全