一、NVMe驱动
1、dma_alloc_coherent
nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
- 核心概念:两个地址,同一块内存
CPU 视角的虚拟地址(如 nvmeq->cqes、nvmeq->sq_cmds):
由 dma_alloc_coherent 返回,供驱动程序写入命令或读取完成状态。CPU 通过常规的访存指令操作这个地址。
设备视角的 DMA 地址(如 sq_dma_addr):
由 dma_alloc_coherent 通过 dma_handle 参数返回,是设备(NVMe 控制器)可以识别的总线地址。驱动将该地址写入设备寄存器(如 ASQ 寄存器),设备通过 DMA 引擎从该地址读取数据。
关键点:这两个地址对应的是同一块物理内存。dma_alloc_coherent 保证了这块内存在 CPU 与设备之间的访问一致性(在硬件不能自动维护缓存一致性的平台上,通常映射为非缓存(non-cacheable)内存;在缓存一致的平台上则依赖硬件自动维护一致性。写合并(write-combine)映射由 dma_alloc_wc 提供,不属于一致性映射本身的保证)。因此 CPU 写入的命令不需要显式的缓存刷新操作,设备就能看到正确的数据;但在通知设备(写门铃)之前仍需保证写入顺序,例如依靠 writel() 自带的写屏障。
2、dev->queues[0]和dev->ctrl.admin_q 关系
dev->queues[0]是Admin Queue的物理实体,而dev->ctrl.admin_q是它在内核I/O栈中的逻辑接口。
在 Linux NVMe 驱动中,hctx->driver_data 指向 dev->queues[0],而 hctx 本身是 dev->ctrl.admin_q(一个 request_queue)的内部组成部分。hctx->driver_data 并不是直接赋值给 dev->ctrl.admin_q ,而是通过 blk-mq 框架将 hctx 嵌入到 request_queue 中,并在初始化时将 hctx->driver_data 设置为底层硬件队列。后续通过 dev->ctrl.admin_q 提交请求时,blk-mq 会自动找到对应的 hctx,并从中取出 driver_data 来操作硬件。具体分析如下:
1、在nvme probe中为dev->queues申请资源,如下dev->queues
cpp
/*
 * PCI probe entry point for an NVMe device.
 *
 * Allocates the per-device nvme_dev and its array of nvme_queue structures,
 * calls nvme_dev_map(), sets up the PRP pools and the iod mempool,
 * initializes the controller object via nvme_init_ctrl(), then triggers a
 * controller reset and schedules nvme_async_probe().
 *
 * Returns 0 on success or a negative errno. On failure the resources
 * acquired so far are released in reverse order through the labels at the
 * bottom of the function.
 */
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
size_t alloc_size;
node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, first_memory_node);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);// allocate the nvme_dev itself (the queue array is allocated below)
if (!dev)
return -ENOMEM;
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
/* one slot per possible I/O queue plus one for the admin queue (queues[0]) */
dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
dev->queues = kcalloc_node(dev->nr_allocated_queues,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto free;
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
result = nvme_dev_map(dev);
if (result)
goto put_pci;
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
result = nvme_setup_prp_pools(dev);
if (result)
goto unmap;
quirks |= check_vendor_combination_bug(pdev);
if (!noacpi && nvme_acpi_storage_d3(pdev)) {
/*
* Some systems use a bios work around to ask for D3 on
* platforms that support kernel managed suspend.
*/
dev_info(&pdev->dev,
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
/*
* Double check that our mempool alloc size will cover the biggest
* command we support.
*/
alloc_size = nvme_pci_iod_alloc_size();
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
mempool_kfree,
(void *) alloc_size,
GFP_KERNEL, node);
if (!dev->iod_mempool) {
result = -ENOMEM;
goto release_pools;
}
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
if (result)
goto release_mempool;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
nvme_reset_ctrl(&dev->ctrl);
async_schedule(nvme_async_probe, dev);
return 0;
release_mempool:
mempool_destroy(dev->iod_mempool);
release_pools:
nvme_release_prp_pools(dev);
unmap:
nvme_dev_unmap(dev);
put_pci:
put_device(dev->dev);
free:
kfree(dev->queues);
kfree(dev);
return result;
}
2、在blk_mq_init_queue分配了ctrl.admin_q,是一个request_queue类型,分析如下:
cpp
/*
 * Set up the admin tag set and create the admin request_queue.
 *
 * On the first call (dev->ctrl.admin_q is NULL) this fills in
 * dev->admin_tagset with a single hardware queue, allocates the tag set and
 * creates dev->ctrl.admin_q from it. When the admin queue already exists it
 * is only unquiesced.
 *
 * Returns 0 on success, -ENOMEM if the tag set or queue cannot be allocated,
 * or -ENODEV if a reference on the new queue cannot be taken.
 */
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
if (!dev->ctrl.admin_q) {
dev->admin_tagset.ops = &nvme_mq_admin_ops;
dev->admin_tagset.nr_hw_queues = 1;
dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev->ctrl.numa_node;
dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
/* handed back to ops->init_hctx() as its 'data' argument by blk_mq_init_hctx() */
dev->admin_tagset.driver_data = dev;
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
return -ENOMEM;
dev->ctrl.admin_tagset = &dev->admin_tagset;
dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);// create the admin request_queue
if (IS_ERR(dev->ctrl.admin_q)) {
blk_mq_free_tag_set(&dev->admin_tagset);
return -ENOMEM;
}
if (!blk_get_queue(dev->ctrl.admin_q)) {
nvme_dev_remove_admin(dev);
dev->ctrl.admin_q = NULL;
return -ENODEV;
}
} else
blk_mq_unquiesce_queue(dev->ctrl.admin_q);
return 0;
}
blk_mq_init_queue(struct blk_mq_tag_set *set)返回类型是struct request_queue 指针:
cpp
/*
 * Create a blk-mq request_queue for the given tag set with no queuedata.
 * Thin wrapper around blk_mq_init_queue_data(); returns whatever that
 * function returns (a queue pointer or an ERR_PTR).
 */
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
blk_mq_init_queue_data函数如下:
cpp
/*
 * Allocate a request_queue on the tag set's NUMA node, attach @queuedata to
 * it and initialize it as a blk-mq queue (without an elevator).
 *
 * Returns the initialized queue, ERR_PTR(-ENOMEM) if the queue cannot be
 * allocated, or the ERR_PTR from blk_mq_init_allocated_queue() after the
 * partially set up queue has been cleaned up.
 */
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
{
struct request_queue *uninit_q, *q;
uninit_q = blk_alloc_queue(set->numa_node);
if (!uninit_q)
return ERR_PTR(-ENOMEM);
uninit_q->queuedata = queuedata;
/*
* Initialize the queue without an elevator. device_add_disk() will do
* the initialization.
*/
q = blk_mq_init_allocated_queue(set, uninit_q, false);
if (IS_ERR(q))
blk_cleanup_queue(uninit_q);
return q;
}
blk_mq_init_allocated_queue如下:
cpp
/*
 * Turn an already allocated request_queue @q into a blk-mq queue bound to
 * tag set @set: installs set->ops, allocates the poll-stats callback and the
 * software contexts, creates the hardware contexts, applies the queue
 * defaults and maps software queues to hardware queues. When @elevator_init
 * is true an elevator is initialized as well.
 *
 * Returns @q on success or ERR_PTR(-ENOMEM) on any failure.
 */
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q,
bool elevator_init)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
blk_mq_poll_stats_bkt,
BLK_MQ_POLL_STATS_BKTS, q);
if (!q->poll_cb)
goto err_exit;
if (blk_mq_alloc_ctxs(q))
goto err_poll;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
blk_mq_realloc_hw_ctxs(set, q);// allocate and initialize the hardware contexts (hctxs)
/* blk_mq_realloc_hw_ctxs() returns void; failure shows up as nr_hw_queues == 0 */
if (!q->nr_hw_queues)
goto err_hctxs;
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
/* fall back to 30 seconds when the tag set does not specify a timeout */
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
if (set->nr_maps > HCTX_TYPE_POLL &&
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
q->sg_reserved_size = INT_MAX;
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth;
/*
* Default to classic polling
*/
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
if (elevator_init)
elevator_init_mq(q);
return q;
err_hctxs:
kfree(q->queue_hw_ctx);
q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
blk_stat_free_callback(q->poll_cb);
q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
return ERR_PTR(-ENOMEM);
}
blk_mq_realloc_hw_ctxs:
cpp
/*
 * (Re)build q->queue_hw_ctx so that it holds set->nr_hw_queues hardware
 * contexts.
 *
 * The pointer array is grown when needed, each slot is (re)allocated if it
 * is empty or sits on the wrong NUMA node, and surplus or partially created
 * contexts are torn down again. The function returns void: on success
 * q->nr_hw_queues is updated to set->nr_hw_queues, on failure the previous
 * q->nr_hw_queues is kept, so callers detect failure through that field.
 */
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
						struct request_queue *q)
{
	int i, j, end;
	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;

	if (q->nr_hw_queues < set->nr_hw_queues) {
		struct blk_mq_hw_ctx **new_hctxs;

		new_hctxs = kcalloc_node(set->nr_hw_queues,
				       sizeof(*new_hctxs), GFP_KERNEL,
				       set->numa_node);
		if (!new_hctxs)
			return;
		/* carry the existing contexts over into the larger array */
		if (hctxs)
			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
			       sizeof(*hctxs));
		q->queue_hw_ctx = new_hctxs;
		kfree(hctxs);
		hctxs = new_hctxs;
	}

	/* protect against switching io scheduler */
	mutex_lock(&q->sysfs_lock);
	for (i = 0; i < set->nr_hw_queues; i++) {
		int node;
		struct blk_mq_hw_ctx *hctx;

		node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
		/*
		 * If the hw queue has been mapped to another numa node,
		 * we need to realloc the hctx. If allocation fails, fallback
		 * to use the previous one.
		 */
		if (hctxs[i] && (hctxs[i]->numa_node == node))
			continue;

		hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
		if (hctx) {
			if (hctxs[i])
				blk_mq_exit_hctx(q, set, hctxs[i], i);
			hctxs[i] = hctx;
		} else {
			/*
			 * Keep the message in a single string literal: a
			 * backslash line continuation inside the literal
			 * would glue the words together or embed the source
			 * indentation in the log output.
			 */
			if (hctxs[i])
				pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
					node, hctxs[i]->numa_node);
			else
				break;
		}
	}
	/*
	 * Increasing nr_hw_queues fails. Free the newly allocated
	 * hctxs and keep the previous q->nr_hw_queues.
	 */
	if (i != set->nr_hw_queues) {
		j = q->nr_hw_queues;
		end = i;
	} else {
		/* success: drop any contexts beyond the new (smaller) count */
		j = i;
		end = q->nr_hw_queues;
		q->nr_hw_queues = set->nr_hw_queues;
	}

	for (; j < end; j++) {
		struct blk_mq_hw_ctx *hctx = hctxs[j];

		if (hctx) {
			if (hctx->tags)
				blk_mq_free_map_and_requests(set, j);
			blk_mq_exit_hctx(q, set, hctx, j);
			hctxs[j] = NULL;
		}
	}
	mutex_unlock(&q->sysfs_lock);
}
blk_mq_alloc_and_init_hctx:
cpp
/*
 * Obtain a hardware context for index @hctx_idx on NUMA node @node and
 * initialize it.
 *
 * A context from q->unused_hctx_list with a matching node is reused when
 * available; otherwise a new one is allocated. Returns the initialized
 * context, or NULL if allocation or initialization fails.
 */
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
int hctx_idx, int node)
{
struct blk_mq_hw_ctx *hctx = NULL, *tmp;
/* reuse dead hctx first */
spin_lock(&q->unused_hctx_lock);
list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
if (tmp->numa_node == node) {
hctx = tmp;
break;
}
}
if (hctx)
list_del_init(&hctx->hctx_list);
spin_unlock(&q->unused_hctx_lock);
if (!hctx)
hctx = blk_mq_alloc_hctx(q, set, node);
if (!hctx)
goto fail;
if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
goto free_hctx;
return hctx;
free_hctx:
/* drop the reference on the context's kobject */
kobject_put(&hctx->kobj);
fail:
return NULL;
}
blk_mq_init_hctx:
cpp
/*
 * Initialize hardware context @hctx as queue number @hctx_idx: registers the
 * CPU hotplug instances, binds the tag set's tags for this index, invokes
 * the driver's ->init_hctx() callback (passing set->driver_data) and
 * initializes the flush request.
 *
 * Returns 0 on success and -1 on failure, undoing the steps already taken.
 */
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
hctx->queue_num = hctx_idx;
if (!(hctx->flags & BLK_MQ_F_STACKING))
cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
&hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
/* driver hook: this is where a driver can set hctx->driver_data */
if (set->ops->init_hctx &&
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
goto unregister_cpu_notifier;
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
return 0;
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
unregister_cpu_notifier:
blk_mq_remove_cpuhp(hctx);
return -1;
}
set->ops->init_hctx(hctx, set->driver_data, hctx_idx))进行初始化,如下nvme_admin_init_hctx:
cpp
// blk-mq operations installed on the admin tag set (see nvme_alloc_admin_tags)
static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq= nvme_queue_rq,
.complete= nvme_pci_complete_rq,
.init_hctx= nvme_admin_init_hctx,
.init_request= nvme_init_request,
.timeout= nvme_timeout,
};
/*
 * ->init_hctx() callback for the admin queue.
 *
 * @data is the nvme_dev. The only admin hardware context (index 0) is bound
 * to dev->queues[0] by storing that nvme_queue in hctx->driver_data.
 * Always returns 0.
 */
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
}
nvme_admin_init_hctx中,hctx->driver_data被设置为&dev->queues[0]。在后续admin命令执行时,会通过hctx->driver_data取回dev->queues[0],如下nvme_queue_rq函数。该函数由block层在blk_mq_dispatch_rq_list或者__blk_mq_issue_directly中通过.queue_rq回调调用,负责命令的调度分发。
cpp
/*
 * ->queue_rq() callback: build an NVMe command from the block request in
 * @bd, map its data and metadata if present, and submit it on the
 * nvme_queue stored in hctx->driver_data.
 *
 * Returns BLK_STS_OK once the command has been handed to nvme_submit_cmd(),
 * BLK_STS_IOERR if the queue is not enabled, or the error from command
 * setup / mapping (with earlier steps undone).
 *
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_queue *nvmeq = hctx->driver_data;// nvme_queue bound to this hctx (dev->queues[0] for the admin queue)
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_command cmnd;
blk_status_t ret;
iod->aborted = 0;
iod->npages = -1;
iod->nents = 0;
/*
* We should not need to do this, but we're still using this to
* ensure we can drain requests on a dying queue.
*/
if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
return BLK_STS_IOERR;
ret = nvme_setup_cmd(ns, req, &cmnd);
if (ret)
return ret;
if (blk_rq_nr_phys_segments(req)) {
ret = nvme_map_data(dev, req, &cmnd);
if (ret)
goto out_free_cmd;
}
if (blk_integrity_rq(req)) {
ret = nvme_map_metadata(dev, req, &cmnd);
if (ret)
goto out_unmap_data;
}
blk_mq_start_request(req);
nvme_submit_cmd(nvmeq, &cmnd, bd->last);
return BLK_STS_OK;
out_unmap_data:
nvme_unmap_data(dev, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
}
3、关于IO queue申请个数的分析
IO queue真实创建的数量,受MSI-X中断数量、CPU个数、以及Set Features(Number of Queues)命令返回的控制器支持的queue num共同决定,即
IO_Queue_Num = min(MSI_X, CPU, Queue_Num)
分析流程如下:
nvme_setup_io_queues中,分析IO queue实际创建的个数
cpp
/*
 * Decide how many I/O queues to use and create them.
 *
 * Steps visible here: pick the wanted queue count, negotiate it with the
 * controller via nvme_set_queue_count(), remap the BAR so the doorbells for
 * that many queues fit (shrinking the count until it does), re-allocate
 * interrupt vectors with nvme_setup_irqs(), derive dev->max_qid from the
 * vector count plus poll queues, and create the queues. If fewer queues
 * come online than max_qid, the I/O queues are torn down and the sequence
 * is retried with the smaller count.
 *
 * Returns 0 on success (including the case where no I/O queues are used)
 * or a negative errno.
 */
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
unsigned int nr_io_queues;
unsigned long size;
int result;
/*
* Sample the module parameters once at reset time so that we have
* stable values to work with.
*/
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
/*
* If tags are shared with admin queue (Apple bug), then
* make sure we only use one IO queue.
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
nr_io_queues = 1;
else
nr_io_queues = min(nvme_max_io_queues(dev),
dev->nr_allocated_queues - 1);
/* may lower nr_io_queues to what the controller grants, or set it to 0 */
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
if (nr_io_queues == 0)
return 0;
clear_bit(NVMEQ_ENABLED, &adminq->flags);
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0)
dev->q_depth = result;
else
dev->cmb_use_sqes = false;
}
/* shrink the queue count until the doorbell region can be remapped */
do {
size = db_bar_size(dev, nr_io_queues);
result = nvme_remap_bar(dev, size);
if (!result)
break;
if (!--nr_io_queues)
return -ENOMEM;
} while (1);
adminq->q_db = dev->dbs;
retry:
/* Deregister the admin queue's interrupt */
pci_free_irq(pdev, 0, adminq);
/*
* If we enable msix early due to not intx, disable it again before
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0)
return -EIO;
dev->num_vecs = result;
/* one vector is used by the admin queue; keep at least one interrupt-driven I/O queue */
result = max(result - 1, 1);
dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
/*
* Should investigate if there's a performance win from allocating
* more queues than interrupt vectors; it might allow the submission
* path to scale better, even if the receive path is limited by the
* number of interrupts.
*/
result = queue_request_irq(adminq);
if (result)
return result;
set_bit(NVMEQ_ENABLED, &adminq->flags);
result = nvme_create_io_queues(dev);
if (result || dev->online_queues < 2)
return result;
/* fewer I/O queues online than planned: tear down and retry with that number */
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
nvme_disable_io_queues(dev);
nvme_suspend_io_queues(dev);
goto retry;
}
dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
dev->io_queues[HCTX_TYPE_DEFAULT],
dev->io_queues[HCTX_TYPE_READ],
dev->io_queues[HCTX_TYPE_POLL]);
return 0;
}
3.1 nr_io_queues
cpp
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
nr_io_queues = 1;
else
nr_io_queues = min(nvme_max_io_queues(dev),
dev->nr_allocated_queues - 1);
如下函数nvme_max_io_queues
cpp
/*
 * Upper bound on the number of I/O queues: the number of possible CPUs plus
 * the configured write queues and poll queues.
 */
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
}
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
即通过参数可以设置 没设置就是默认值0
设置方式 modprobe nvme write_queues=4 poll_queues=2
cpp
dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
在probe函数中,取决于nvme_max_io_queues
因此在默认参数(write_queues=0、poll_queues=0)下,nr_io_queues目前就是possible CPU个数;
3.2 nvme_setup_io_queues
分析nvme_setup_io_queues函数,此函数中会处理两个方面来获取queue num,如下:
一方面,nvme_set_queue_count函数获取result,即主机驱动通过Set Features命令向NVMe控制器请求IO queue个数,请求值count为nr_io_queues。
cpp
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
/*
 * Ask the controller for *count I/O queues via Set Features
 * (NVME_FEAT_NUM_QUEUES) and write back the number that can be used.
 *
 * The request encodes (*count - 1) in both the low and the high 16 bits.
 * On success *count becomes the minimum of the requested value and what the
 * controller reports (the smaller of the two 16-bit result fields, plus 1).
 * If the controller answers with a positive status, *count is set to 0 and
 * 0 is still returned so the device can be brought up with the admin queue
 * only.
 *
 * Returns 0, or a negative errno if the command itself failed.
 */
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
u32 q_count = (*count - 1) | ((*count - 1) << 16);
u32 result;
int status, nr_io_queues;
status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
&result);
if (status < 0)
return status;
/*
* Degraded controllers might return an error when setting the queue
* count. We still want to be able to bring them online and offer
* access to the admin queue, as that might be only way to fix them up.
*/
if (status > 0) {
dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
*count = 0;
} else {
nr_io_queues = min(result & 0xffff, result >> 16) + 1;
*count = min(*count, nr_io_queues);
}
return 0;
}
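代码中返回的count取请求值count与控制器实际允许值nr_io_queues(返回结果低16位与高16位中的较小者加1)的最小值。
另一方面,nvme_setup_irqs的返回值是实际分配到的中断向量个数(其中1个给admin queue使用),由它推出可创建的IO queue个数:max_qid = max(向量数 - 1, 1) + poll queue个数。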
cpp
result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0)
return -EIO;
dev->num_vecs = result;
result = max(result - 1, 1);
dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
分析nvme_setup_irqs
cpp
/*
 * Allocate interrupt vectors for the admin queue and the non-polled I/O
 * queues.
 *
 * Records the number of poll queues in dev->io_queues[HCTX_TYPE_POLL]
 * (always leaving at least one non-polled I/O queue) and asks the PCI core
 * for between 1 and irq_queues vectors.
 *
 * Returns the value of pci_alloc_irq_vectors_affinity(): the number of
 * vectors allocated, or a negative errno.
 */
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
struct irq_affinity affd = {
/* the first vector is excluded from affinity spreading (used for the admin queue) */
.pre_vectors = 1,
.calc_sets = nvme_calc_irq_sets,
.priv = dev,
};
unsigned int irq_queues, poll_queues;
/*
* Poll queues don't need interrupts, but we need at least one I/O queue
* left over for non-polled I/O.
*/
poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
/*
* Initialize for the single interrupt case, will be updated in
* nvme_calc_irq_sets().
*/
dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
dev->io_queues[HCTX_TYPE_READ] = 0;
/*
* We need interrupts for the admin queue and each non-polled I/O queue,
* but some Apple controllers require all queues to use the first
* vector.
*/
irq_queues = 1;
if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
irq_queues += (nr_io_queues - poll_queues);
return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
cpp
/*
 * Allocate between @min_vecs and @max_vecs interrupt vectors for @dev.
 *
 * The interrupt types permitted by @flags are tried in order: MSI-X, then
 * MSI, then a single legacy interrupt (only when @min_vecs is 1 and the
 * device has an IRQ). @affd describes affinity spreading and is only used
 * when PCI_IRQ_AFFINITY is set in @flags.
 *
 * Returns the number of vectors allocated, or a negative errno (-ENOSPC if
 * no allowed type was attempted successfully).
 */
int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
unsigned int max_vecs, unsigned int flags,
struct irq_affinity *affd)
{
struct irq_affinity msi_default_affd = {0};
int nvecs = -ENOSPC;
if (flags & PCI_IRQ_AFFINITY) {
if (!affd)
affd = &msi_default_affd;
} else {
/* an affinity descriptor without PCI_IRQ_AFFINITY is a caller bug; ignore it */
if (WARN_ON(affd))
affd = NULL;
}
if (flags & PCI_IRQ_MSIX) {
nvecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
affd, flags);
if (nvecs > 0)
return nvecs;
}
if (flags & PCI_IRQ_MSI) {
nvecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, affd);
if (nvecs > 0)
return nvecs;
}
/* use legacy IRQ if allowed */
if (flags & PCI_IRQ_LEGACY) {
if (min_vecs == 1 && dev->irq) {
/*
* Invoke the affinity spreading logic to ensure that
* the device driver can adjust queue configuration
* for the single interrupt case.
*/
if (affd)
irq_create_affinity_masks(1, affd);
pci_intx(dev, 1);
return 1;
}
}
return nvecs;
}
该函数的返回值就是为PCIe设备实际分配到的中断向量个数(优先MSI-X,其次MSI,最后是单个legacy中断)。
综合上面分析,函数中先使用Set Features命令向NVMe控制器请求queue num,结果写回nr_io_queues;
再根据实际申请到的中断向量数,得到result并据此计算max_qid。
最后综合取最小值,按该数量创建queue。
cpp
/*
 * Allocate and create the I/O queues up to dev->max_qid.
 *
 * The first loop allocates the host-side queue structures for every qid
 * that does not exist yet; the second loop creates each queue, marking
 * those with an index above rw_queues as polled.
 *
 * Returns 0 unless the last recorded error is negative, in which case that
 * error is returned; positive results from nvme_create_queue() are ignored
 * so the device can run with fewer queues.
 */
static int nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max, rw_queues;
int ret = 0;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
if (nvme_alloc_queue(dev, i, dev->q_depth)) {
ret = -ENOMEM;
break;
}
}
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
dev->io_queues[HCTX_TYPE_READ];
} else {
rw_queues = max;
}
for (i = dev->online_queues; i <= max; i++) {// presumably dev->online_queues advances as queues come online, so I/O queues are created in qid order
bool polled = i > rw_queues;
ret = nvme_create_queue(&dev->queues[i], i, polled);
if (ret)
break;
}
/*
* Ignore failing Create SQ/CQ commands, we can continue with less
* than the desired amount of queues, and even a controller without
* I/O queues can still be used to issue admin commands. This might
* be useful to upgrade a buggy firmware for example.
*/
return ret >= 0 ? 0 : ret;
}
4、nvme admin命令执行流程
nvme admin命令的执行过程,以scan ns流程中的Identify命令为例:
cpp
/*
 * Scan namespaces using the Identify "active namespace list" command.
 *
 * Each iteration sends an Identify command on the admin queue with nsid set
 * to the last namespace ID processed (prev) and walks the returned list of
 * little-endian 32-bit IDs. Every listed namespace is validated or
 * allocated, and the IDs skipped between two listed entries are removed.
 * A zero entry ends the scan.
 *
 * Returns 0 on success, -EOPNOTSUPP if nvme_ctrl_limited_cns() is true,
 * -ENOMEM if the list buffer cannot be allocated, or the result of the
 * failed Identify command.
 */
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
{
const int nr_entries = NVME_IDENTIFY_DATA_SIZE / sizeof(__le32);
__le32 *ns_list;
u32 prev = 0;
int ret = 0, i;
if (nvme_ctrl_limited_cns(ctrl))
return -EOPNOTSUPP;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
for (;;) {
struct nvme_command cmd = {
.identify.opcode = nvme_admin_identify,
.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST,
.identify.nsid = cpu_to_le32(prev),
};
/* synchronous admin command: goes through ctrl->admin_q */
ret = nvme_submit_sync_cmd(ctrl->admin_q, &cmd, ns_list,
NVME_IDENTIFY_DATA_SIZE);
if (ret)
goto free;
for (i = 0; i < nr_entries; i++) {
u32 nsid = le32_to_cpu(ns_list[i]);
if (!nsid) /* end of the list? */
goto out;
nvme_validate_or_alloc_ns(ctrl, nsid);
/* IDs between the previous and the current entry are not in the list: remove them */
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
kfree(ns_list);
return ret;
}
nvme_submit_sync_cmd
cpp
/*
 * Submit @cmd on request queue @q and wait for it to finish.
 *
 * Convenience wrapper around __nvme_submit_sync_cmd() with no result
 * pointer, default timeout (0), NVME_QID_ANY, no at_head insertion, no
 * request flags and no polling. Returns what __nvme_submit_sync_cmd()
 * returns.
 */
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
NVME_QID_ANY, 0, 0, false);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
cpp
/*
 * Allocate a request for @cmd on @q, optionally map the kernel @buffer of
 * @bufflen bytes to it, execute it (polled or via blk_execute_rq()) and
 * report the outcome. When @result is non-NULL the command result is copied
 * to it. A @timeout of 0 selects ADMIN_TIMEOUT.
 *
 * Returns 0 on success. If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
blk_mq_req_flags_t flags, bool poll)
{
struct request *req;
int ret;
req = nvme_alloc_request(q, cmd, flags, qid);
if (IS_ERR(req))
return PTR_ERR(req);
req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
if (buffer && bufflen) {
ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
if (poll)
nvme_execute_rq_polled(req->q, NULL, req, at_head);
else
blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
/* a cancelled request is reported as -EINTR instead of its status */
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
else
ret = nvme_req(req)->status;
out:
blk_mq_free_request(req);
return ret;
}
EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd);
cpp
/*
 * Insert @rq into @q and sleep until it has completed.
 *
 * An on-stack completion is stored in rq->end_io_data and the request is
 * queued with blk_execute_rq_nowait(), passing blk_end_sync_rq as the
 * end_io callback (presumably the callback signals that completion). The
 * caller then waits on it.
 */
void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head)
{
DECLARE_COMPLETION_ONSTACK(wait);
unsigned long hang_check;
rq->end_io_data = &wait;
blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
/* wake up every half hung-task timeout and wait again until completed */
while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
else
wait_for_completion_io(&wait);
}
EXPORT_SYMBOL(blk_execute_rq);
准备好 request 后,驱动通过 blk_execute_rq() 函数将请求正式提交给块层 。从这里开始,流程进入blk-mq框架:
块层调度 (blk_execute_rq_nowait -> blk_mq_sched_insert_request) :blk-mq 层收到请求后,进入blk_mq_sched_insert_request,会根据请求的队列映射,找到对应的硬件上下文(blk_mq_hw_ctx)。
驱动入口 (nvme_queue_rq) :blk-mq 会调用与该请求队列绑定的操作函数集中的 .queue_rq() 回调函数。对于 Admin 队列,这个回调就是 nvme_queue_rq() 。
cpp
/*
 * ->queue_rq() callback: build an NVMe command from the block request in
 * @bd, map its data and metadata if present, and submit it on the
 * nvme_queue stored in hctx->driver_data.
 *
 * Returns BLK_STS_OK once the command has been handed to nvme_submit_cmd(),
 * BLK_STS_IOERR if the queue is not enabled, or the error from command
 * setup / mapping (with earlier steps undone).
 *
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_queue *nvmeq = hctx->driver_data;// nvme_queue bound to this hctx (dev->queues[0] for the admin queue)
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_command cmnd;
blk_status_t ret;
iod->aborted = 0;
iod->npages = -1;
iod->nents = 0;
/*
* We should not need to do this, but we're still using this to
* ensure we can drain requests on a dying queue.
*/
if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
return BLK_STS_IOERR;
ret = nvme_setup_cmd(ns, req, &cmnd);
if (ret)
return ret;
if (blk_rq_nr_phys_segments(req)) {
ret = nvme_map_data(dev, req, &cmnd);
if (ret)
goto out_free_cmd;
}
if (blk_integrity_rq(req)) {
ret = nvme_map_metadata(dev, req, &cmnd);
if (ret)
goto out_unmap_data;
}
blk_mq_start_request(req);
nvme_submit_cmd(nvmeq, &cmnd, bd->last);
return BLK_STS_OK;
out_unmap_data:
nvme_unmap_data(dev, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
}
在这个函数中,关键的步骤是:struct nvme_queue *nvmeq = hctx->driver_data; 从硬件上下文中取出了真正的硬件队列指针 nvmeq(对Admin队列而言,即上面描述的dev->queues[0])。
写入硬件队列 (nvme_submit_cmd) :nvme_queue_rq() 会调用 nvme_submit_cmd() 。这个函数负责将命令实际投递到硬件的提交队列(Submission Queue)中。
cpp
/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 *
 * The copy, the tail update and the doorbell write are all done while
 * holding nvmeq->sq_lock.
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
/* slot address = sq_cmds + (tail index << sqes); sqes is presumably log2 of the entry size */
memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
cmd, sizeof(*cmd));
/* advance the tail and wrap around at the queue depth */
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
nvme_write_sq_db(nvmeq, write_sq);
spin_unlock(&nvmeq->sq_lock);
}
它通过 nvmeq->sq_cmds 找到提交队列在内存中的映射地址,并将命令数据复制过去 。
最后,它通过写门铃寄存器的方式通知NVMe控制器有新命令到达:writel(nvmeq->sq_tail, nvmeq->q_db); 。这个操作标志着命令从软件侧转移到了硬件侧。
5、nvme IO命令执行流程
- 命令发起与块层封装
发起者:用户态程序(通过 read/write 或 io_uring)或内核自身(如直接 I/O)向块设备下发 I/O 请求。会经过文件系统或者系统调用
通用块层:将 I/O 请求转化为 bio(Block I/O)结构,并调用提交函数(如 submit_bio)。blk-mq 接收 bio,为其分配 struct request(请求),并尝试合并或调度。
cpp
/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices. It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done. The
 * bio will be send to the device described by the bi_disk and bi_partno fields.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 */
blk_qc_t submit_bio(struct bio *bio)
{
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
unsigned int count;
/* count is in 512-byte sectors (logical block size >> 9 for WRITE_SAME) */
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
else
count = bio_sectors(bio);
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
op_is_write(bio_op(bio)) ? "WRITE" : "READ",
(unsigned long long)bio->bi_iter.bi_sector,
bio_devname(bio, b), count);
}
}
/*
* If we're reading data that is part of the userspace workingset, count
* submission time as memory stall. When the device is congested, or
* the submitting cgroup IO-throttled, submission can be a significant
* part of overall IO time.
*/
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
blk_qc_t ret;
psi_memstall_enter(&pflags);
ret = submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
return ret;
}
return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
- disk->fops是块设备的操作表(struct block_device_operations *fops)。对于大多数现代设备(包括 NVMe、virtio-blk 等使用 blk-mq 框架的设备),其fops中的submit_bio回调为NULL(驱动不会注册该回调,注册该回调是传统方式)。
- blk-mq 路径:如果submit_bio为 NULL,则调用blk_mq_submit_bio(bio),这是 blk-mq 框架的标准入口。它会将 bio 包装成struct request,选择合适的硬件队列,最终通过底层驱动的.queue_rq回调发送给设备。
- 传统(legacy)路径:如果驱动实现了submit_bio回调(例如某些老式块驱动或特殊设备),则直接调用该回调,由驱动自己处理 bio。
cpp
/**
 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * This is a version of submit_bio() that shall only be used for I/O that is
 * resubmitted to lower level drivers by stacking block drivers.  All file
 * systems and other upper level users of the block layer should use
 * submit_bio() instead.
 *
 * Returns BLK_QC_T_NONE when the bio fails the checks or is only queued on
 * current->bio_list; otherwise the cookie from the blk-mq or ->submit_bio
 * submission path.
 */
blk_qc_t submit_bio_noacct(struct bio *bio)
{
	if (!submit_bio_checks(bio))
		return BLK_QC_T_NONE;

	/*
	 * We only want one ->submit_bio to be active at a time, else stack
	 * usage with stacked devices could be a problem.  Use current->bio_list
	 * to collect a list of requests submitted by a ->submit_bio method
	 * while it is active, and then process them after it returned.
	 */
	if (current->bio_list) {
		bio_list_add(&current->bio_list[0], bio);
		return BLK_QC_T_NONE;
	}

	/* no ->submit_bio callback registered: take the blk-mq path */
	if (!bio->bi_disk->fops->submit_bio)
		return __submit_bio_noacct_mq(bio);
	return __submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);
/*
 * blk-mq variant of the submission loop.
 *
 * Installs an on-stack bio_list as current->bio_list so that bios submitted
 * while this function is active are queued there (see submit_bio_noacct())
 * instead of recursing, then submits @bio and every bio popped from
 * bio_list[0] through blk_mq_submit_bio().
 *
 * Returns the cookie of the last submission, or BLK_QC_T_NONE.
 */
static blk_qc_t __submit_bio_noacct_mq(struct bio *bio)
{
struct bio_list bio_list[2] = { };
blk_qc_t ret = BLK_QC_T_NONE;
current->bio_list = bio_list;
do {
struct gendisk *disk = bio->bi_disk;
/* skip this bio if the queue cannot be entered */
if (unlikely(bio_queue_enter(bio) != 0))
continue;
if (!blk_crypto_bio_prep(&bio)) {
blk_queue_exit(disk->queue);
ret = BLK_QC_T_NONE;
continue;
}
ret = blk_mq_submit_bio(bio);
} while ((bio = bio_list_pop(&bio_list[0])));
current->bio_list = NULL;
return ret;
}
IO如果走的是blk-mq(多队列)路径,那么分析blk_mq_submit_bio:
cpp
/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure from @q and @bio and send to the device. The
 * request may not be queued directly to hardware if:
 * * This request can be merged with another one
 * * We want to place request at plug queue for possible future merging
 * * There is an IO scheduler active at this queue
 *
 * It will not queue the request if there is an error with the bio, or at the
 * request creation.
 *
 * Returns: Request queue cookie.
 */
blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
.q = q,
};
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
unsigned int nr_segs;
blk_qc_t cookie;
blk_status_t ret;
blk_queue_bounce(q, &bio);
__blk_queue_split(&bio, &nr_segs);
if (!bio_integrity_prep(bio))
goto queue_exit;
/* try to merge the bio into an already existing request first */
if (!is_flush_fua && !blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
goto queue_exit;
if (blk_mq_sched_bio_merge(q, bio, nr_segs))
goto queue_exit;
rq_qos_throttle(q, bio);
data.cmd_flags = bio->bi_opf;
rq = __blk_mq_alloc_request(&data);// allocate the request; data.hctx is used below as the target hardware context
if (unlikely(!rq)) {
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
goto queue_exit;
}
trace_block_getrq(q, bio, bio->bi_opf);
rq_qos_track(q, rq, bio);
cookie = request_to_qc_t(data.hctx, rq);
blk_mq_bio_to_request(rq, bio, nr_segs);
ret = blk_crypto_init_request(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
blk_mq_free_request(rq);
return BLK_QC_T_NONE;
}
plug = blk_mq_plug(q, bio);
if (unlikely(is_flush_fua)) {
/* Bypass scheduler for flush requests */
blk_insert_flush(rq);
blk_mq_run_hw_queue(data.hctx, true);
} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
!blk_queue_nonrot(q))) {
/*
* Use plugging if we have a ->commit_rqs() hook as well, as
* we know the driver uses bd->last in a smart fashion.
*
* Use normal plugging if this disk is slow HDD, as sequential
* IO may benefit a lot from plug merging.
*/
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
if (!request_count)
trace_block_plug(q);
else
last = list_entry_rq(plug->mq_list.prev);
if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
blk_add_rq_to_plug(plug, rq);
} else if (q->elevator) {
/* Insert the request at the IO scheduler queue */
blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
/*
* We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
* issued. So the plug list will have one request at most
* The plug list might get flushed before this. If that happens,
* the plug list is empty, and same_queue_rq is invalid.
*/
if (list_empty(&plug->mq_list))
same_queue_rq = NULL;
if (same_queue_rq) {
list_del_init(&same_queue_rq->queuelist);
plug->rq_count--;
}
blk_add_rq_to_plug(plug, rq);
trace_block_plug(q);
if (same_queue_rq) {
data.hctx = same_queue_rq->mq_hctx;
trace_block_unplug(q, 1, true);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
} else if ((q->nr_hw_queues > 1 && is_sync) ||
!data.hctx->dispatch_busy) {
/*
* There is no scheduler and we can try to send directly
* to the hardware.
*/
blk_mq_try_issue_directly(data.hctx, rq, &cookie);// direct issue path analysed in these notes
} else {
/* Default case. */
blk_mq_sched_insert_request(rq, false, true, true);
}
return cookie;
queue_exit:
blk_queue_exit(q);
return BLK_QC_T_NONE;
}
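在没有IO调度器且硬件队列不忙(或多硬件队列下的同步IO)时,会调用blk_mq_try_issue_directly(data.hctx, rq, &cookie)直接下发,调用关系如下:
blk_mq_try_issue_directly->__blk_mq_try_issue_directly->__blk_mq_issue_directly->q->mq_ops->queue_rq(无法直接下发时,__blk_mq_try_issue_directly会退回到blk_mq_sched_insert_request把请求插入队列)
如果请求先进入了plug链表,则在flush plug时走另一条路径:blk_mq_sched_insert_requests->blk_mq_try_issue_list_directly->blk_mq_request_issue_directly->__blk_mq_try_issue_directly->__blk_mq_issue_directly->q->mq_ops->queue_rq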
cpp
/*
 * Try to hand @rq straight to the driver on @hctx, bypassing the
 * software/scheduler queues.
 *
 * The attempt runs under hctx_lock(). If the driver reports a resource
 * shortage the request is inserted for later dispatch via
 * blk_mq_request_bypass_insert(); any other error ends the request with
 * that status.
 */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
blk_status_t ret;
int srcu_idx;
might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
hctx_lock(hctx, &srcu_idx);
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);// bypass_insert = false, last = true
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
blk_mq_request_bypass_insert(rq, false, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
hctx_unlock(hctx, srcu_idx);
}
/*
 * Core of the direct-issue path.
 *
 * Issues @rq through __blk_mq_issue_directly() when the queue is running,
 * no elevator needs to see the request (or @bypass_insert is set), and both
 * a dispatch budget and a driver tag can be obtained. Otherwise the request
 * takes the "insert" exit: with @bypass_insert the caller gets
 * BLK_STS_RESOURCE and must deal with the request itself; without it the
 * request is inserted via blk_mq_sched_insert_request() and BLK_STS_OK is
 * returned.
 */
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie,
bool bypass_insert, bool last)
{
struct request_queue *q = rq->q;
bool run_queue = true;
/*
* RCU or SRCU read lock is needed before checking quiesced flag.
*
* When queue is stopped or quiesced, ignore 'bypass_insert' from
* blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
* and avoid driver to try to dispatch again.
*/
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
bypass_insert = false;
goto insert;
}
if (q->elevator && !bypass_insert)
goto insert;
if (!blk_mq_get_dispatch_budget(q))
goto insert;
if (!blk_mq_get_driver_tag(rq)) {
/* give the budget back before falling back to insertion */
blk_mq_put_dispatch_budget(q);
goto insert;
}
return __blk_mq_issue_directly(hctx, rq, cookie, last);
insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
blk_mq_sched_insert_request(rq, false, run_queue, false);
return BLK_STS_OK;
}
cpp
/*
 * Insert a list of requests for @hctx, either through the elevator's
 * insert_requests hook or, without an elevator, by trying direct issue
 * first and inserting whatever is left over.
 *
 * @hctx:            hardware context the requests belong to
 * @ctx:             software context, passed to blk_mq_insert_requests()
 * @list:            requests to insert
 * @run_queue_async: passed to blk_mq_run_hw_queue(); when set, the
 *                   direct-issue attempt is skipped as well
 */
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx,
struct list_head *list, bool run_queue_async)
{
struct elevator_queue *e;
struct request_queue *q = hctx->queue;
/*
* blk_mq_sched_insert_requests() is called from flush plug
* context only, and hold one usage counter to prevent queue
* from being released.
*/
percpu_ref_get(&q->q_usage_counter);
e = hctx->queue->elevator;
if (e && e->type->ops.insert_requests)
e->type->ops.insert_requests(hctx, list, false);
else {
/*
* try to issue requests directly if the hw queue isn't
* busy in case of 'none' scheduler, and this way may save
* us one extra enqueue & dequeue to sw queue.
*/
if (!hctx->dispatch_busy && !e && !run_queue_async) {
blk_mq_try_issue_list_directly(hctx, list); /* direct-issue attempt */
/* Everything was consumed: skip the insert and the queue run. */
if (list_empty(list))
goto out;
}
/* Insert whatever is still on the list. */
blk_mq_insert_requests(hctx, ctx, list);
}
blk_mq_run_hw_queue(hctx, run_queue_async);
out:
/* Drop the usage counter taken at the top of the function. */
percpu_ref_put(&q->q_usage_counter);
}
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
int queued = 0;
int errors = 0;
while (!list_empty(list)) {
blk_status_t ret;
struct request *rq = list_first_entry(list, struct request,
queuelist);
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq, list_empty(list));
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
blk_mq_request_bypass_insert(rq, false,
list_empty(list));
break;
}
blk_mq_end_request(rq, ret);
errors++;
} else
queued++;
}
/*
* If we didn't flush the entire list, we could have told
* the driver there was more coming, but that turned out to
* be a lie.
*/
if ((!list_empty(list) || errors) &&
hctx->queue->mq_ops->commit_rqs && queued)
hctx->queue->mq_ops->commit_rqs(hctx);
}
/*
 * Issue @rq directly on its own hardware context (rq->mq_hctx) with
 * bypass_insert enabled, so a request that cannot be issued is reported
 * to the caller instead of being inserted.
 *
 * @rq:   request to issue
 * @last: forwarded to __blk_mq_try_issue_directly()
 *
 * Returns the status of __blk_mq_try_issue_directly(). The cookie it
 * produces is discarded.
 */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	blk_qc_t discarded_cookie;
	blk_status_t status;
	int srcu_idx;

	hctx_lock(hctx, &srcu_idx);
	status = __blk_mq_try_issue_directly(hctx, rq, &discarded_cookie,
					     true, last);
	hctx_unlock(hctx, srcu_idx);

	return status;
}
/*
 * Attempt to issue @rq to the driver on @hctx, falling back to the
 * "insert" path when direct issue is not possible.
 *
 * @hctx:          hardware context to issue on
 * @rq:            request to issue
 * @cookie:        forwarded to __blk_mq_issue_directly()
 * @bypass_insert: when true, a request that cannot be issued is NOT
 *                 inserted here; BLK_STS_RESOURCE is returned instead
 *                 (unless the queue is stopped/quiesced, see below)
 * @last:          forwarded to __blk_mq_issue_directly()
 *
 * Returns the status of __blk_mq_issue_directly() when the request was
 * issued; otherwise BLK_STS_RESOURCE (bypass_insert still set) or
 * BLK_STS_OK after the request was handed to
 * blk_mq_sched_insert_request().
 */
static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq,
blk_qc_t *cookie,
bool bypass_insert, bool last)
{
struct request_queue *q = rq->q;
bool run_queue = true;
/*
* RCU or SRCU read lock is needed before checking quiesced flag.
*
* When queue is stopped or quiesced, ignore 'bypass_insert' from
* blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
* and avoid driver to try to dispatch again.
*/
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
bypass_insert = false;
goto insert;
}
/* An elevator is attached and the caller did not ask to bypass insertion. */
if (q->elevator && !bypass_insert)
goto insert;
/* No dispatch budget available: fall back to insert. */
if (!blk_mq_get_dispatch_budget(q))
goto insert;
/* No driver tag: give the budget taken above back before falling back. */
if (!blk_mq_get_driver_tag(rq)) {
blk_mq_put_dispatch_budget(q);
goto insert;
}
/* Budget and tag are held: hand the request to the driver. */
return __blk_mq_issue_directly(hctx, rq, cookie, last);
insert:
/* Caller handles re-insertion itself; just report the shortage. */
if (bypass_insert)
return BLK_STS_RESOURCE;
blk_mq_sched_insert_request(rq, false, run_queue, false);
return BLK_STS_OK;
}
/*
 * Call the driver's queue_rq hook for @rq and record the outcome.
 *
 * @hctx:   hardware context passed to queue_rq
 * @rq:     request to issue
 * @cookie: set to the request's cookie on BLK_STS_OK, to BLK_QC_T_NONE on
 *          a hard error, and left untouched on a resource shortage
 * @last:   stored in blk_mq_queue_data.last for the driver
 *
 * Returns the status reported by queue_rq.
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
					    struct request *rq,
					    blk_qc_t *cookie, bool last)
{
	struct request_queue *q = rq->q;
	struct blk_mq_queue_data bd = {
		.rq = rq,
		.last = last,
	};
	/* Computed before queue_rq is called, as the original code does. */
	blk_qc_t issued_cookie = request_to_qc_t(hctx, rq);
	blk_status_t status;

	status = q->mq_ops->queue_rq(hctx, &bd);

	/*
	 * Busy: mark the hctx as busy and requeue the request; the caller
	 * decides where it goes next.
	 */
	if (status == BLK_STS_RESOURCE || status == BLK_STS_DEV_RESOURCE) {
		blk_mq_update_dispatch_busy(hctx, true);
		__blk_mq_requeue_request(rq);
		return status;
	}

	/*
	 * Success or hard error: the hctx is not busy. On a hard error the
	 * caller may end the request.
	 */
	blk_mq_update_dispatch_busy(hctx, false);
	if (status == BLK_STS_OK)
		*cookie = issued_cookie;
	else
		*cookie = BLK_QC_T_NONE;

	return status;
}
最后q->mq_ops->queue_rq(hctx, &bd)调用**nvme_queue_rq,传入到nvme层,接下来就是nvme驱动处理,映射硬件队列。**
- 映射到硬件队列
选择硬件队列:blk-mq 根据请求的上下文(如 CPU 亲和性)或设备策略(如轮询)将请求映射到某个硬件队列(blk_mq_hw_ctx)。每个硬件队列对应一个 NVMe I/O 提交队列和完成队列。
调用 queue_rq:blk-mq 调用设备驱动注册的 .queue_rq 回调。对于 NVMe I/O 队列,该回调为 nvme_queue_rq(与 Admin 队列使用的是同一个函数名,但所属的 blk_mq_ops 不同:Admin 使用 nvme_mq_admin_ops,I/O 使用 nvme_mq_ops)。
- NVMe 驱动处理
获取硬件队列:nvme_queue_rq 的参数中包含 blk_mq_hw_ctx,其 driver_data 字段在初始化时已被设置为对应的 struct nvme_queue *(即 I/O 队列的底层表示)。通过 hctx->driver_data 拿到该队列。
设置命令:调用 nvme_setup_cmd(),根据请求类型(读/写/刷新等)填充 NVMe 命令(struct nvme_command)。关键步骤包括设置命令 opcode、namespace ID,以及把 command_id 设为 req->tag。注意:数据的物理地址(通过 PRP 或 SGL 列表描述)并不在 nvme_setup_cmd() 中填写,而是随后由 nvme_queue_rq 调用 nvme_map_data() 写入命令。
cpp
/*
 * Build the NVMe command for @req into @cmd.
 *
 * @ns:  namespace handed to the per-operation setup helpers
 * @req: block request being translated
 * @cmd: command buffer; zeroed here before being filled in
 *
 * Dispatches on req_op(@req) to the matching setup helper, then stores
 * req->tag as the command identifier and emits the setup trace event.
 *
 * Returns the helper's status (BLK_STS_OK for helpers that return
 * nothing), or BLK_STS_IOERR, after a one-time warning, for an
 * unrecognised operation.
 */
blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
			    struct nvme_command *cmd)
{
	blk_status_t status = BLK_STS_OK;

	nvme_clear_nvme_request(req);
	memset(cmd, 0, sizeof(*cmd));

	switch (req_op(req)) {
	/* Data transfer commands. */
	case REQ_OP_READ:
		status = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
		break;
	case REQ_OP_WRITE:
		status = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
		break;
	case REQ_OP_ZONE_APPEND:
		status = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
		break;

	/* Deallocation and zeroing. */
	case REQ_OP_DISCARD:
		status = nvme_setup_discard(ns, req, cmd);
		break;
	case REQ_OP_WRITE_ZEROES:
		status = nvme_setup_write_zeroes(ns, req, cmd);
		break;

	/* Zone management. */
	case REQ_OP_ZONE_RESET_ALL:
	case REQ_OP_ZONE_RESET:
		status = nvme_setup_zone_mgmt_send(ns, req, cmd,
						   NVME_ZONE_RESET);
		break;
	case REQ_OP_ZONE_OPEN:
		status = nvme_setup_zone_mgmt_send(ns, req, cmd,
						   NVME_ZONE_OPEN);
		break;
	case REQ_OP_ZONE_CLOSE:
		status = nvme_setup_zone_mgmt_send(ns, req, cmd,
						   NVME_ZONE_CLOSE);
		break;
	case REQ_OP_ZONE_FINISH:
		status = nvme_setup_zone_mgmt_send(ns, req, cmd,
						   NVME_ZONE_FINISH);
		break;

	/* Helpers without a return value: status stays BLK_STS_OK. */
	case REQ_OP_FLUSH:
		nvme_setup_flush(ns, cmd);
		break;
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		nvme_setup_passthrough(req, cmd);
		break;

	default:
		WARN_ON_ONCE(1);
		return BLK_STS_IOERR;
	}

	/* The command identifier is the request's tag. */
	cmd->common.command_id = req->tag;
	trace_nvme_setup_cmd(req, cmd);

	return status;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
提交到硬件:调用 nvme_submit_cmd() 将命令写入 I/O 提交队列的环形缓冲区,然后写相应的门铃寄存器通知控制器。(后面和admin命令处理一样)
cpp
/*
 * queue_rq handler: turn the block request in @bd into an NVMe command
 * and submit it on the nvme_queue stored in hctx->driver_data.
 *
 * @hctx: hardware context; driver_data holds the struct nvme_queue
 * @bd:   queue data carrying the request (bd->rq) and the "last" hint
 *
 * Returns BLK_STS_OK once the command has been passed to
 * nvme_submit_cmd(), BLK_STS_IOERR if the queue is not enabled, or the
 * error returned by command setup / data mapping / metadata mapping.
 *
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct nvme_ns *ns = hctx->queue->queuedata;
struct nvme_queue *nvmeq = hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_command cmnd;
blk_status_t ret;
/*
 * Reset the per-request driver state before use.
 * NOTE(review): npages = -1 presumably means "no PRP/SGL pages
 * allocated" - confirm against nvme_map_data()/nvme_unmap_data().
 */
iod->aborted = 0;
iod->npages = -1;
iod->nents = 0;
/*
* We should not need to do this, but we're still using this to
* ensure we can drain requests on a dying queue.
*/
if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
return BLK_STS_IOERR;
/* Build the command; on failure return directly, nothing is mapped yet. */
ret = nvme_setup_cmd(ns, req, &cmnd);
if (ret)
return ret;
/* Map the data buffer only when the request has physical segments. */
if (blk_rq_nr_phys_segments(req)) {
ret = nvme_map_data(dev, req, &cmnd);
if (ret)
goto out_free_cmd;
}
/* Map metadata only for integrity requests. */
if (blk_integrity_rq(req)) {
ret = nvme_map_metadata(dev, req, &cmnd);
if (ret)
goto out_unmap_data;
}
/* Mark the request started, then hand the command to the queue. */
blk_mq_start_request(req);
nvme_submit_cmd(nvmeq, &cmnd, bd->last);
return BLK_STS_OK;
/* Error unwinding: undo in reverse order of setup. */
out_unmap_data:
nvme_unmap_data(dev, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
}
- 硬件执行与完成
控制器处理:NVMe 控制器从提交队列取命令,执行 DMA 数据传输,完成后将完成项放入对应的 I/O 完成队列,并可能触发中断(每个队列可拥有独立的中断向量)。
中断处理:NVMe 驱动的中断处理函数(如 nvme_irq)调用 nvme_process_cq() 遍历完成队列,对每个完成项找到对应的 request 并触发其完成处理,最终进入 nvme_complete_rq()。后者根据命令状态决定重试还是结束请求,并通过 blk_mq_end_request() 回调块层,最终通知上层 I/O 结束。
总结
IO路径的两种方式如下:
块层路径(通用块层 + 驱动)
- 触发方式:文件系统、直接 I/O(O_DIRECT)或内核其他模块通过 submit_bio 将 struct bio 递交给通用块层。
- 处理流程:
  - 通用块层可能对 bio 进行合并、分区偏移转换、I/O 调度等。
  - 最终通过请求队列派发给底层驱动(如 NVMe 驱动的 nvme_queue_rq)。
  - 驱动将 bio 转换为硬件命令(如 NVMe 的 Submission Queue 条目),提交给设备,并在完成时通过中断回调通知上层。
- 特点:
  - 遵循内核统一的 I/O 栈,享受块层的功能(如 I/O 调度、限速、统计等)。
  - 支持页高速缓存(除非指定 O_DIRECT)、分区、设备映射等。
  - 适用于通用文件系统操作、标准读写等。
ioctl 路径(直接命令通道)
- 触发方式:用户程序通过 ioctl(fd, NVME_IOCTL_*, ...) 发起,驱动中的 nvme_ioctl 处理。
- 处理流程:
  - 用户传入 NVMe 命令结构体(如 struct nvme_passthru_cmd)。
  - 驱动直接构造命令提交给硬件(通常放入提交队列),等待完成并返回结果。
  - 可携带用户态数据缓冲区(通过 copy_from/to_user 或 DMA 映射)。
- 特点:
  - 绕过文件系统、bio 合并等通用块层处理,直接与硬件交互;但并非完全不经过块层:命令仍被封装为 passthrough request(REQ_OP_DRV_IN/REQ_OP_DRV_OUT),经 blk-mq 的 queue_rq 进入 nvme_queue_rq,由 nvme_setup_cmd 中的 nvme_setup_passthrough 分支处理后提交。
  - 允许发送任意 NVMe 命令(包括管理命令、自定义读写命令、固件下载等)。
  - 通常用于诊断、设备管理、特殊控制指令,也可实现高性能的直接读写(但需用户自行处理数据布局和对齐)。
  - 不经过 I/O 调度、分区表、页缓存等,更加"原始"。