linux tag: v6.8-rc1
kvm performance optimization technologies, part one
IPI: inter-processor interrupts, 即处理器间中断。
pv-IPI 是针对处理器间中断的半虚拟化解决方案,当 guest 发送 IPI 时,它可以将目标 vCPU 的 apicid 记录到 bitmap 中,然后通过 kvm_hypercall
触发一次 VM-exit 就可以把 IPI 发送给所有目标 vCPU。而无需针对每个目标 vCPU 分别写 ICR,大大减少 VM-exit 的触发。
使用时,只需要在 qemu 命令行的 -cpu 参数中加上 +kvm-pv-ipi
(不过该特性默认就是打开的)。例如:
c
qemu-system-x86_64 --no-reboot -nodefaults -device pc-testdev -device isa-debug-exit,iobase=0xf4,iosize=0x4 -vnc none -serial stdio -device pci-testdev -machine accel=kvm -bios /usr/share/qemu/OVMF.fd -object tdx-guest,id=tdx0 -machine q35,kernel_irqchip=split,confidential-guest-support=tdx0 -kernel x86/smptest.efi -net none -nographic -m 256 -smp 2 -cpu host,+kvm-pv-ipi
为测试该特性是否生效,可以在 guest 运行时,在 host 中跟踪 kvm_hypercall KVM_HC_SEND_IPI 是否被调用
。具体步骤:
bash
# host
cd /sys/kernel/debug/tracing
echo kvm:kvm_hypercall > set_event
echo 1 > tracing_on
# 启动 guest。为更好地观察到现象,可以将多个 vCPU 绑定到2个 pCPU 上启动 guest
numactl -C 2-3 qemu-system-x86_64 -smp 6 -cpu host,+kvm-pv-ipi ...
# guest 中检查 eax[11]是否为 1
[root@guest a]# cpuid -l 0x40000001 -r1
CPU:
0x40000001 0x00: eax=0x0100fadb ebx=0x00000000 ecx=0x00000000 edx=0x00000000
# host。查看是否跟踪到 kvm_hypercall 被调用,并且 nr 为 0xa, 对应 `KVM_HC_SEND_IPI`
[root@pc tracing]# cat trace
CPU 2/KVM-855680 [002] ..... 103623.618837: kvm_hypercall: nr 0xa a0 0x13 a1 0x0 a2 0x0 a3 0xfc
CPU 2/KVM-855680 [002] ..... 103623.620278: kvm_hypercall: nr 0xa a0 0xb a1 0x0 a2 0x0 a3 0xfc
# 到此可以看出 pv-IPI 功能生效了
pv-IPI 的代码实现逻辑如下。
1. KVM side
KVM 暴露 KVM_FEATURE_PV_SEND_IPI
给 guest,表示支持 pv-IPI。
c
__do_cpuid_func
case KVM_CPUID_FEATURES:
entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
// ...
(1 << KVM_FEATURE_PV_SEND_IPI) |
当 Guest 调用 kvm_hypercall KVM_HC_SEND_IPI
发送 IPI 时,会退出到 KVM 中处理。
c
#define KVM_HC_SEND_IPI 10
kvm_emulate_hypercall
__kvm_emulate_hypercall
trace_kvm_hypercall(nr, a0, a1, a2, a3); // kvm_hypercall 跟踪点
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
/*
 * Host-side handler for the KVM_HC_SEND_IPI hypercall: deliver the IPI
 * described by @icr to every vCPU whose APIC ID is marked in the two
 * guest-supplied bitmaps.
 *
 * @ipi_bitmap_low:  bitmap of target APIC IDs starting at @min
 * @ipi_bitmap_high: bitmap of target APIC IDs starting at @min + cluster_size
 * @min:             lowest APIC ID representable by the bitmaps
 * @icr:             ICR-style value encoding vector, delivery mode, level
 *                   and trigger mode
 * @op_64_bit:       non-zero for a 64-bit hypercall; each bitmap argument
 *                   then carries 64 bits instead of 32
 *
 * Returns the count accumulated from __pv_send_ipi() (number of IPIs
 * delivered), or a negative KVM error code.
 */
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit)
{
struct kvm_apic_map *map;
struct kvm_lapic_irq irq = {0};
/* Bits usable per bitmap argument: the guest's "unsigned long" width. */
int cluster_size = op_64_bit ? 64 : 32;
int count;
/* Destination-mode / shorthand bits in the ICR are not supported here. */
if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
return -KVM_EINVAL;
/* Decode the interrupt attributes out of the guest-provided ICR value. */
irq.vector = icr & APIC_VECTOR_MASK;
irq.delivery_mode = icr & APIC_MODE_MASK;
irq.level = (icr & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr & APIC_INT_LEVELTRIG;
/* apic_map is RCU-protected; hold the read lock while it is in use. */
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
count = -EOPNOTSUPP;
if (likely(map)) {
/* Low bitmap covers APIC IDs [min, min + cluster_size). */
count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
/* High bitmap covers the next cluster_size APIC IDs. */
min += cluster_size;
count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
}
rcu_read_unlock();
return count;
}
2. Guest side
kernel 中为 apic 建立的数据结构,并定义了默认回调函数。
c
/*
 * Per-driver APIC method table (excerpt). pv-IPI hooks in by replacing
 * the send_IPI_mask / send_IPI_mask_allbutself callbacks below.
 */
struct apic {
// ...
/* IPI related functions */
void (*wait_icr_idle)(void);
u32 (*safe_wait_icr_idle)(void);
void (*send_IPI)(int cpu, int vector);
void (*send_IPI_mask)(const struct cpumask *mask, int vector);
void (*send_IPI_mask_allbutself)(const struct cpumask *msk, int vec);
void (*send_IPI_allbutself)(int vector);
void (*send_IPI_all)(int vector);
void (*send_IPI_self)(int vector);
// ...
char *name;
};
/* Default APIC callbacks: the x2APIC cluster-mode driver. */
static struct apic apic_x2apic_cluster __ro_after_init = {
.name = "cluster x2apic",
// ...
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
};
apic_driver(apic_x2apic_cluster);
2.1 kvm_setup_pv_ipi ()
Guest 会通过 CPUID.0x40000001:EAX[11]
, 检查是否可以使用 pv-IPI。如果可以,执行 kvm_setup_pv_ipi
配置发送 IPI 的回调函数。
c
// 调用顺序
apic_intr_mode_init
if (x86_platform.apic_post_init)
x86_platform.apic_post_init();
==> kvm_apic_init
/* Guest-side APIC post-init hook: install the pv-IPI callbacks if usable. */
static void __init kvm_apic_init(void)
{
#ifdef CONFIG_SMP
// Configure pv-IPI when it is supported.
if (pv_ipi_supported())
kvm_setup_pv_ipi();
#endif
}
// Check that feature bit `KVM_FEATURE_PV_SEND_IPI` is set,
// and that the VM has more than one CPU.
static bool pv_ipi_supported(void)
{
return (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI) &&
(num_possible_cpus() != 1));
}
/* CPUID.0x40000001:EAX bit advertising PV IPI support. */
#define KVM_FEATURE_PV_SEND_IPI 11
设置 apic 的回调函数为 pv-IPI 的处理逻辑。 主要为 apic->send_IPI_mask
和 apic->send_IPI_mask_allbutself
设置了回调函数。
kvm_send_ipi_mask
kvm_send_ipi_mask_allbutself
c
/* Override the APIC IPI-sending callbacks with the pv-IPI implementations. */
static __init void kvm_setup_pv_ipi(void)
{
apic_update_callback(send_IPI_mask, kvm_send_ipi_mask);
// Equivalent to:
// __x86_apic_override.send_IPI_mask = kvm_send_ipi_mask;
// apic->send_IPI_mask = kvm_send_ipi_mask
// static_call_update(apic_call_send_IPI_mask, kvm_send_ipi_mask)
apic_update_callback(send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself);
// Equivalent to:
// __x86_apic_override.send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
// apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself
// static_call_update(apic_call_send_IPI_mask_allbutself, kvm_send_ipi_mask_allbutself)
pr_info("setup PV IPIs\n");
}
// arch/x86/include/asm/apic.h
extern struct apic *apic;
/*
 * Update one APIC callback in all three places it is cached:
 * the override table, the live apic struct, and the static call site.
 */
#define apic_update_callback(_callback, _fn) { \
__x86_apic_override._callback = _fn; \
apic->_callback = _fn; \
static_call_update(apic_call_##_callback, _fn); \
pr_info("APIC: %s() replaced with %ps()\n", #_callback, _fn); \
}
apic->send_IPI_mask*()
最终被封装为:
__apic_send_IPI_mask
__apic_send_IPI_mask_allbutself
c
// arch/x86/include/asm/apic.h
/* Dispatch to the installed send_IPI_mask callback via a static call. */
static __always_inline void __apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
static_call_mod(apic_call_send_IPI_mask)(mask, vector);
}
/* Same, for the "everyone in mask but self" variant. */
static __always_inline void __apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
static_call(apic_call_send_IPI_mask_allbutself)(mask, vector);
}
2.2 kvm_send_ipi_mask*()
Guest 通过 kvm_send_ipi_mask
/ kvm_send_ipi_mask_allbutself
发送 IPI 的主要逻辑在 __send_ipi_mask
中。
guest 会将目标 vCPU 记录到 bitmap 中,然后通过 kvm_hypercall KVM_HC_SEND_IPI
交给 KVM 处理。64 位模式下,bitmap 可以记录 128 个目标 vCPU;32 位模式下可记录 64 个 vCPU。
c
kvm_send_ipi_mask / kvm_send_ipi_mask_allbutself
kvm_send_ipi_mask
__send_ipi_mask // 使用 kvm_hypercall4(KVM_HC_SEND_IPI, ...);
// Range of APIC IDs that a single hypercall's bitmap pair can cover.
#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
/*
 * Send IPI @vector to every CPU in @mask via the KVM_HC_SEND_IPI
 * hypercall, batching targets into a bitmap so that one VM-exit can
 * deliver to many vCPUs at once.
 */
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
unsigned long flags;
int cpu, min = 0, max = 0;
#ifdef CONFIG_X86_64
/* 64-bit: a 128-bit bitmap, i.e. up to 128 APIC IDs per hypercall. */
__uint128_t ipi_bitmap = 0;
#else
u64 ipi_bitmap = 0;
#endif
u32 apic_id, icr;
long ret;
if (cpumask_empty(mask))
return;
local_irq_save(flags);
// Record the interrupt vector in `icr` and set the delivery mode.
switch (vector) {
default:
icr = APIC_DM_FIXED | vector;
break;
case NMI_VECTOR:
icr = APIC_DM_NMI;
break;
}
// Walk every target vCPU in the mask.
// min/max track the smallest/largest APIC ID seen so far, so the send
// can be split across several hypercalls once the range a single
// bitmap can hold is exceeded.
for_each_cpu(cpu, mask) {
apic_id = per_cpu(x86_cpu_to_apicid, cpu);
if (!ipi_bitmap) {
min = max = apic_id;
} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
/* New low end: shift existing bits up and lower `min`. */
ipi_bitmap <<= min - apic_id;
min = apic_id;
} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
max = apic_id < max ? max : apic_id;
} else {
// APIC ID is outside the range one hypercall can
// cover: flush the current batch first.
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
ret);
min = max = apic_id;
ipi_bitmap = 0;
}
// In the bitmap, bit `x` set means APIC ID `x + min` is a target.
__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
}
if (ipi_bitmap) {
// Flush the final batch: the hypercall passes ipi_bitmap_low,
// ipi_bitmap_high, min and icr to KVM.
ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", ret);
}
local_irq_restore(flags);
}
3. kvm_hypercall
kvm_hypercall
不同于普通的 hypercall, 本质上它会调用 vmcall
或 vmmcall
指令触发 VM-exit, 以请求 KVM 的处理。Intel CPU 中使用的是 vmcall
, 对于该指令,除了触发 VM-exit, 它不会做其他事情。所以 host 端需要定义相关的处理逻辑。
c
// arch/x86/include/asm/kvm_para.h
/*
 * Four-argument KVM hypercall: nr goes in RAX, arguments in
 * RBX/RCX/RDX/RSI, and the return value comes back in RAX.
 */
static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
unsigned long p2, unsigned long p3,
unsigned long p4)
{
long ret;
asm volatile(KVM_HYPERCALL
: "=a"(ret)
: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
: "memory");
return ret;
}
/* Patched at boot: vmcall by default, vmmcall on CPUs with X86_FEATURE_VMMCALL. */
#define KVM_HYPERCALL \
ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL)
KVM 中定义了由 vmcall caused VM-exit 的处理逻辑,见前文 kvm_emulate_hypercall
部分。
c
/* VMX exit dispatch table: VMCALL exits are routed to the hypercall emulator. */
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
// ...
[EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
// ...
}
本文作者:文七安
本文链接:虚拟机中的 IPI 优化: pv-IPI - 掘金 (juejin.cn)
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!