kvm驱动调用大致流程
open("/dev/kvm") → KVM_CREATE_VM → KVM_CREATE_VCPU → KVM_RUN
kvm内核源码框架设计

1、kvm设备文件操作接口
cpp
static struct file_operations kvm_chardev_ops = {
.unlocked_ioctl = kvm_dev_ioctl,
.llseek = noop_llseek,
KVM_COMPAT(kvm_dev_ioctl),
};
/dev/kvm支持的命令
static long kvm_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
long r = -EINVAL;
switch (ioctl) {
case KVM_GET_API_VERSION:
if (arg)
goto out;
r = KVM_API_VERSION;
break;
case KVM_CREATE_VM:
r = kvm_dev_ioctl_create_vm(arg);
break;
case KVM_CHECK_EXTENSION:
r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
break;
case KVM_GET_VCPU_MMAP_SIZE:
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */
#ifdef CONFIG_X86
r += PAGE_SIZE; /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
r += PAGE_SIZE; /* coalesced mmio ring page */
#endif
break;
case KVM_TRACE_ENABLE:
case KVM_TRACE_PAUSE:
case KVM_TRACE_DISABLE:
r = -EOPNOTSUPP;
break;
default:
return kvm_arch_dev_ioctl(filp, ioctl, arg);
}
out:
return r;
}
2、虚拟机操作接口
cpp
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
.llseek = noop_llseek,
KVM_COMPAT(kvm_vm_compat_ioctl),
};
vm支持的命令
cpp
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
if (kvm->mm != current->mm || kvm->vm_bugged)
return -EIO;
switch (ioctl) {
case KVM_CREATE_VCPU:
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
break;
case KVM_ENABLE_CAP: {
struct kvm_enable_cap cap;
r = -EFAULT;
if (copy_from_user(&cap, argp, sizeof(cap)))
goto out;
r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
break;
}
case KVM_SET_USER_MEMORY_REGION: {
struct kvm_userspace_memory_region kvm_userspace_mem;
r = -EFAULT;
if (copy_from_user(&kvm_userspace_mem, argp,
sizeof(kvm_userspace_mem)))
goto out;
r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
break;
}
case KVM_GET_DIRTY_LOG: {
struct kvm_dirty_log log;
r = -EFAULT;
if (copy_from_user(&log, argp, sizeof(log)))
goto out;
r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
break;
}
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
case KVM_CLEAR_DIRTY_LOG: {
struct kvm_clear_dirty_log log;
r = -EFAULT;
if (copy_from_user(&log, argp, sizeof(log)))
goto out;
r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
break;
}
#endif
#ifdef CONFIG_KVM_MMIO
case KVM_REGISTER_COALESCED_MMIO: {
struct kvm_coalesced_mmio_zone zone;
r = -EFAULT;
if (copy_from_user(&zone, argp, sizeof(zone)))
goto out;
r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
break;
}
case KVM_UNREGISTER_COALESCED_MMIO: {
struct kvm_coalesced_mmio_zone zone;
r = -EFAULT;
if (copy_from_user(&zone, argp, sizeof(zone)))
goto out;
r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
break;
}
#endif
case KVM_IRQFD: {
struct kvm_irqfd data;
r = -EFAULT;
if (copy_from_user(&data, argp, sizeof(data)))
goto out;
r = kvm_irqfd(kvm, &data);
break;
}
case KVM_IOEVENTFD: {
struct kvm_ioeventfd data;
r = -EFAULT;
if (copy_from_user(&data, argp, sizeof(data)))
goto out;
r = kvm_ioeventfd(kvm, &data);
break;
}
#ifdef CONFIG_HAVE_KVM_MSI
case KVM_SIGNAL_MSI: {
struct kvm_msi msi;
r = -EFAULT;
if (copy_from_user(&msi, argp, sizeof(msi)))
goto out;
r = kvm_send_userspace_msi(kvm, &msi);
break;
}
#endif
#ifdef __KVM_HAVE_IRQ_LINE
case KVM_IRQ_LINE_STATUS:
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
r = -EFAULT;
if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
goto out;
r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
ioctl == KVM_IRQ_LINE_STATUS);
if (r)
goto out;
r = -EFAULT;
if (ioctl == KVM_IRQ_LINE_STATUS) {
if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
goto out;
}
r = 0;
break;
}
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
case KVM_SET_GSI_ROUTING: {
struct kvm_irq_routing routing;
struct kvm_irq_routing __user *urouting;
struct kvm_irq_routing_entry *entries = NULL;
r = -EFAULT;
if (copy_from_user(&routing, argp, sizeof(routing)))
goto out;
r = -EINVAL;
if (!kvm_arch_can_set_irq_routing(kvm))
goto out;
if (routing.nr > KVM_MAX_IRQ_ROUTES)
goto out;
if (routing.flags)
goto out;
if (routing.nr) {
r = -ENOMEM;
entries = vmalloc(array_size(sizeof(*entries),
routing.nr));
if (!entries)
goto out;
r = -EFAULT;
urouting = argp;
if (copy_from_user(entries, urouting->entries,
routing.nr * sizeof(*entries)))
goto out_free_irq_routing;
}
r = kvm_set_irq_routing(kvm, entries, routing.nr,
routing.flags);
out_free_irq_routing:
vfree(entries);
break;
}
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
case KVM_CREATE_DEVICE: {
struct kvm_create_device cd;
r = -EFAULT;
if (copy_from_user(&cd, argp, sizeof(cd)))
goto out;
r = kvm_ioctl_create_device(kvm, &cd);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &cd, sizeof(cd)))
goto out;
r = 0;
break;
}
case KVM_CHECK_EXTENSION:
r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
break;
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
out:
return r;
}
3、vcpu操作接口
cpp
static struct file_operations kvm_vcpu_fops = {
.release = kvm_vcpu_release,
.unlocked_ioctl = kvm_vcpu_ioctl,
.mmap = kvm_vcpu_mmap,
.llseek = noop_llseek,
KVM_COMPAT(kvm_vcpu_compat_ioctl),
};
vcpu支持的命令
cpp
static long kvm_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;
if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged)
return -EIO;
if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
return -EINVAL;
/*
* Some architectures have vcpu ioctls that are asynchronous to vcpu
* execution; mutex_lock() would break them.
*/
r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
if (r != -ENOIOCTLCMD)
return r;
if (mutex_lock_killable(&vcpu->mutex))
return -EINTR;
switch (ioctl) {
case KVM_RUN: {
struct pid *oldpid;
r = -EINVAL;
if (arg)
goto out;
oldpid = rcu_access_pointer(vcpu->pid);
if (unlikely(oldpid != task_pid(current))) {
/* The thread running this VCPU changed. */
struct pid *newpid;
r = kvm_arch_vcpu_run_pid_change(vcpu);
if (r)
break;
newpid = get_task_pid(current, PIDTYPE_PID);
rcu_assign_pointer(vcpu->pid, newpid);
if (oldpid)
synchronize_rcu();
put_pid(oldpid);
}
r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
case KVM_GET_REGS: {
struct kvm_regs *kvm_regs;
r = -ENOMEM;
kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
if (!kvm_regs)
goto out;
r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
if (r)
goto out_free1;
r = -EFAULT;
if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
goto out_free1;
r = 0;
out_free1:
kfree(kvm_regs);
break;
}
case KVM_SET_REGS: {
struct kvm_regs *kvm_regs;
r = -ENOMEM;
kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
if (IS_ERR(kvm_regs)) {
r = PTR_ERR(kvm_regs);
goto out;
}
r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
kfree(kvm_regs);
break;
}
case KVM_GET_SREGS: {
kvm_sregs = kzalloc(sizeof(struct kvm_sregs),
GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!kvm_sregs)
goto out;
r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
goto out;
r = 0;
break;
}
case KVM_SET_SREGS: {
kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
if (IS_ERR(kvm_sregs)) {
r = PTR_ERR(kvm_sregs);
kvm_sregs = NULL;
goto out;
}
r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
break;
}
case KVM_GET_MP_STATE: {
struct kvm_mp_state mp_state;
r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
goto out;
r = 0;
break;
}
case KVM_SET_MP_STATE: {
struct kvm_mp_state mp_state;
r = -EFAULT;
if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
goto out;
r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
break;
}
case KVM_TRANSLATE: {
struct kvm_translation tr;
r = -EFAULT;
if (copy_from_user(&tr, argp, sizeof(tr)))
goto out;
r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, &tr, sizeof(tr)))
goto out;
r = 0;
break;
}
case KVM_SET_GUEST_DEBUG: {
struct kvm_guest_debug dbg;
r = -EFAULT;
if (copy_from_user(&dbg, argp, sizeof(dbg)))
goto out;
r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
break;
}
case KVM_SET_SIGNAL_MASK: {
struct kvm_signal_mask __user *sigmask_arg = argp;
struct kvm_signal_mask kvm_sigmask;
sigset_t sigset, *p;
p = NULL;
if (argp) {
r = -EFAULT;
if (copy_from_user(&kvm_sigmask, argp,
sizeof(kvm_sigmask)))
goto out;
r = -EINVAL;
if (kvm_sigmask.len != sizeof(sigset))
goto out;
r = -EFAULT;
if (copy_from_user(&sigset, sigmask_arg->sigset,
sizeof(sigset)))
goto out;
p = &sigset;
}
r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
break;
}
case KVM_GET_FPU: {
fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!fpu)
goto out;
r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
if (r)
goto out;
r = -EFAULT;
if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
goto out;
r = 0;
break;
}
case KVM_SET_FPU: {
fpu = memdup_user(argp, sizeof(*fpu));
if (IS_ERR(fpu)) {
r = PTR_ERR(fpu);
fpu = NULL;
goto out;
}
r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
break;
}
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
out:
mutex_unlock(&vcpu->mutex);
kfree(fpu);
kfree(kvm_sregs);
return r;
}
内存映射: 用户态对 vcpu fd 调用 mmap(), 把内核中 vcpu 共享页(vcpu run structure) 映射到 QEMU 用户态地址空间 (HVA)。示例代码:
cpp
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
vma->vm_ops = &kvm_vcpu_vm_ops;
return 0;
}
代码示例:
cpp
int kvm_init_vcpu(CPUState *env)
{
KVMState *s = kvm_state;
long mmap_size;
int ret;
// 1. 创建 vcpu,得到 vcpu_fd
ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
if (ret < 0) {
return -1;
}
env->kvm_fd = ret; // 保存 vcpu_fd
// 2. 获取需要 mmap 的大小(内核里 struct kvm_run 大小)
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
return -1;
}
// 3. 关键:调用 mmap(2),触发内核 kvm_vcpu_mmap
env->kvm_run = mmap(NULL, mmap_size,
PROT_READ | PROT_WRITE,
MAP_SHARED,
env->kvm_fd, 0);
if (env->kvm_run == MAP_FAILED) {
return -1;
}
return 0;
}