在前面的文章我们介绍了GIC-V3 ITS驱动,本篇我们介绍怎么基于ITS驱动实现pcie msi功能。kernel版本为6.6。
我们前面文章中介绍到kernel 创建了一个MSI/MSIX domain。这部分代码在irq-gic-v3-its-pci-msi.c中实现的。执行流程如下:
its_pci_msi_init
->its_pci_of_msi_init
->its_pci_msi_init_one
->pci_msi_create_irq_domain
->msi_create_irq_domain
->__msi_create_irq_domain
可以看到__msi_create_irq_domain创建了一个hierarchy类型的domain,bus_token为DOMAIN_BUS_PCI_MSI。这部分代码实现比较简单,我们不过多介绍,下面从一个PCIe device驱动的角度看一下,如何申请并使用MSI中断。
cpp
808 static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
809 struct msi_domain_info *info,
810 unsigned int flags,
811 struct irq_domain *parent)
812 {
813 struct irq_domain *domain;
814
815 if (info->hwsize > MSI_XA_DOMAIN_SIZE)
816 return NULL;
817
818 /*
819 * Hardware size 0 is valid for backwards compatibility and for
820 * domains which are not backed by a hardware table. Grant the
821 * maximum index space.
822 */
823 if (!info->hwsize)
824 info->hwsize = MSI_XA_DOMAIN_SIZE;
825
826 msi_domain_update_dom_ops(info);
827 if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
828 msi_domain_update_chip_ops(info);
829
830 domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
831 fwnode, &msi_domain_ops, info);
832
833 if (domain)
834 irq_domain_update_bus_token(domain, info->bus_token);
835
836 return domain;
837 }
PCIE device是通过pci_alloc_irq_vectors申请msi vector的。执行流程如下:
pci_alloc_irq_vectors
->pci_alloc_irq_vectors_affinity
->__pci_enable_msi_range
->pci_setup_msi_context
->msi_setup_device_data
->msi_capability_init
->pci_msi_setup_msi_irqs
->msi_domain_alloc_irqs_all_locked
->msi_domain_alloc_locked
->__msi_domain_alloc_locked
先看一下msi_setup_device_data,319行这个dev->msi.domain是在设备的创建过程中添加的(pci_device_add->pci_set_msi_domain),就是我们在《Linux pcie【8】GIC-V3 ITS驱动》介绍的MSI/MSIX domain,而且不是一个msi parent。320行会把这个domain作为device的MSI_DEFAULT_DOMAIN使用,322-323行初始化mutex并把md保存到dev->msi.data。
cpp
282 /**
283 * msi_setup_device_data - Setup MSI device data
284 * @dev: Device for which MSI device data should be set up
285 *
286 * Return: 0 on success, appropriate error code otherwise
287 *
288 * This can be called more than once for @dev. If the MSI device data is
289 * already allocated the call succeeds. The allocated memory is
290 * automatically released when the device is destroyed.
291 */
292 int msi_setup_device_data(struct device *dev)
293 {
294 struct msi_device_data *md;
295 int ret, i;
296
297 if (dev->msi.data)
298 return 0;
299
300 md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
301 if (!md)
302 return -ENOMEM;
303
304 ret = msi_sysfs_create_group(dev);
305 if (ret) {
306 devres_free(md);
307 return ret;
308 }
309
310 for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
311 xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
312
313 /*
314 * If @dev::msi::domain is set and is a global MSI domain, copy the
315 * pointer into the domain array so all code can operate on domain
316 * ids. The NULL pointer check is required to keep the legacy
317 * architecture specific PCI/MSI support working.
318 */
319 if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
320 md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
321
322 mutex_init(&md->mutex);
323 dev->msi.data = md;
324 devres_add(dev, md);
325 return 0;
326 }
接着看__msi_domain_alloc_locked,1334行获取的domain就是我们前面赋值的MSI_DEFAULT_DOMAIN。1340行申请msi_descs,这里先不管。1345行的ops->domain_alloc_irqs没有配置,所以会走1348行的__msi_domain_alloc_irqs。
cpp
1324 static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
1325 {
1326 struct msi_domain_info *info;
1327 struct msi_domain_ops *ops;
1328 struct irq_domain *domain;
1329 int ret;
1330
1331 if (!msi_ctrl_valid(dev, ctrl))
1332 return -EINVAL;
1333
1334 domain = msi_get_device_domain(dev, ctrl->domid);
1335 if (!domain)
1336 return -ENODEV;
1337
1338 info = domain->host_data;
1339
1340 ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
1341 if (ret)
1342 return ret;
1343
1344 ops = info->ops;
1345 if (ops->domain_alloc_irqs)
1346 return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
1347
1348 return __msi_domain_alloc_irqs(dev, domain, ctrl);
1349 }
我们再看一下__msi_domain_alloc_irqs的调用流程:
__msi_domain_alloc_irqs
->msi_domain_prepare_irqs
->__irq_domain_alloc_irqs
->irq_domain_alloc_irqs_locked
->irq_domain_alloc_irqs_hierarchy
首先msi_domain_prepare_irqs最终会调用到its_pci_msi_prepare。主要是81行,会计算pci_device的requestId,requestId是以EP的busId+devId+funcId为输入通过msi-map得到的。
然后赋值给info->scratchpad[0].ul。94行,继续调用上一级的msi_prepare,即its_msi_prepare。
cpp
53 static int its_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
54 int nvec, msi_alloc_info_t *info)
55 {
56 struct pci_dev *pdev, *alias_dev;
57 struct msi_domain_info *msi_info;
58 int alias_count = 0, minnvec = 1;
59
60 if (!dev_is_pci(dev))
61 return -EINVAL;
62
63 msi_info = msi_get_domain_info(domain->parent);
64
65 pdev = to_pci_dev(dev);
66 /*
67 * If pdev is downstream of any aliasing bridges, take an upper
68 * bound of how many other vectors could map to the same DevID.
69 * Also tell the ITS that the signalling will come from a proxy
70 * device, and that special allocation rules apply.
71 */
72 pci_for_each_dma_alias(pdev, its_get_pci_alias, &alias_dev);
73 if (alias_dev != pdev) {
74 if (alias_dev->subordinate)
75 pci_walk_bus(alias_dev->subordinate,
76 its_pci_msi_vec_count, &alias_count);
77 info->flags |= MSI_ALLOC_FLAGS_PROXY_DEVICE;
78 }
79
80 /* ITS specific DeviceID, as the core ITS ignores dev. */
81 info->scratchpad[0].ul = pci_msi_domain_get_msi_rid(domain, pdev);
82
83 /*
84 * Always allocate a power of 2, and special case device 0 for
85 * broken systems where the DevID is not wired (and all devices
86 * appear as DevID 0). For that reason, we generously allocate a
87 * minimum of 32 MSIs for DevID 0. If you want more because all
88 * your devices are aliasing to DevID 0, consider fixing your HW.
89 */
90 nvec = max(nvec, alias_count);
91 if (!info->scratchpad[0].ul)
92 minnvec = 32;
93 nvec = max_t(int, minnvec, roundup_pow_of_two(nvec));
94 return msi_info->ops->msi_prepare(domain->parent, dev, nvec, info);
95 }
再看its_msi_prepare,3507行-3539行,会基于前面生成的requestId查找或者新创建一个its_device。3547行,将its_device赋值给info->scratchpad[0].ptr。
cpp
3492 static int its_msi_prepare(struct irq_domain *domain, struct device *dev,
3493 int nvec, msi_alloc_info_t *info)
3494 {
3495 struct its_node *its;
3496 struct its_device *its_dev;
3497 struct msi_domain_info *msi_info;
3498 u32 dev_id;
3499 int err = 0;
3500
3501 /*
3502 * We ignore "dev" entirely, and rely on the dev_id that has
3503 * been passed via the scratchpad. This limits this domain's
3504 * usefulness to upper layers that definitely know that they
3505 * are built on top of the ITS.
3506 */
3507 dev_id = info->scratchpad[0].ul;
3508
3509 msi_info = msi_get_domain_info(domain);
3510 its = msi_info->data;
3511
3512 if (!gic_rdists->has_direct_lpi &&
3513 vpe_proxy.dev &&
3514 vpe_proxy.dev->its == its &&
3515 dev_id == vpe_proxy.dev->device_id) {
3516 /* Bad luck. Get yourself a better implementation */
3517 WARN_ONCE(1, "DevId %x clashes with GICv4 VPE proxy device\n",
3518 dev_id);
3519 return -EINVAL;
3520 }
3521
3522 mutex_lock(&its->dev_alloc_lock);
3523 its_dev = its_find_device(its, dev_id);
3524 if (its_dev) {
3525 /*
3526 * We already have seen this ID, probably through
3527 * another alias (PCI bridge of some sort). No need to
3528 * create the device.
3529 */
3530 its_dev->shared = true;
3531 pr_debug("Reusing ITT for devID %x\n", dev_id);
3532 goto out;
3533 }
3534
3535 its_dev = its_create_device(its, dev_id, nvec, true);
3536 if (!its_dev) {
3537 err = -ENOMEM;
3538 goto out;
3539 }
3540
3541 if (info->flags & MSI_ALLOC_FLAGS_PROXY_DEVICE)
3542 its_dev->shared = true;
3543
3544 pr_debug("ITT %d entries, %d bits\n", nvec, ilog2(nvec));
3545 out:
3546 mutex_unlock(&its->dev_alloc_lock);
3547 info->scratchpad[0].ptr = its_dev;
3548 return err;
3549 }
3550
3551 static struct msi_domain_ops its_msi_domain_ops = {
3552 .msi_prepare = its_msi_prepare,
3553 };
我们仔细看一下its_create_device,3407行为requestId分配一个its device table。3423行,分配lpi_base和lpi_map。3421行分配一块itt空间。3459行建立requestId和itt的map。
cpp
3394 static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
3395 int nvecs, bool alloc_lpis)
3396 {
3397 struct its_device *dev;
3398 unsigned long *lpi_map = NULL;
3399 unsigned long flags;
3400 u16 *col_map = NULL;
3401 void *itt;
3402 int lpi_base;
3403 int nr_lpis;
3404 int nr_ites;
3405 int sz;
3406
3407 if (!its_alloc_device_table(its, dev_id))
3408 return NULL;
3409
3410 if (WARN_ON(!is_power_of_2(nvecs)))
3411 nvecs = roundup_pow_of_two(nvecs);
3412
3413 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
3414 /*
3415 * Even if the device wants a single LPI, the ITT must be
3416 * sized as a power of two (and you need at least one bit...).
3417 */
3418 nr_ites = max(2, nvecs);
3419 sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
3420 sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
3421 itt = kzalloc_node(sz, GFP_KERNEL, its->numa_node);
3422 if (alloc_lpis) {
3423 lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
3424 if (lpi_map)
3425 col_map = kcalloc(nr_lpis, sizeof(*col_map),
3426 GFP_KERNEL);
3427 } else {
3428 col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL);
3429 nr_lpis = 0;
3430 lpi_base = 0;
3431 }
3432
3433 if (!dev || !itt || !col_map || (!lpi_map && alloc_lpis)) {
3434 kfree(dev);
3435 kfree(itt);
3436 bitmap_free(lpi_map);
3437 kfree(col_map);
3438 return NULL;
3439 }
3440
3441 gic_flush_dcache_to_poc(itt, sz);
3442
3443 dev->its = its;
3444 dev->itt = itt;
3445 dev->nr_ites = nr_ites;
3446 dev->event_map.lpi_map = lpi_map;
3447 dev->event_map.col_map = col_map;
3448 dev->event_map.lpi_base = lpi_base;
3449 dev->event_map.nr_lpis = nr_lpis;
3450 raw_spin_lock_init(&dev->event_map.vlpi_lock);
3451 dev->device_id = dev_id;
3452 INIT_LIST_HEAD(&dev->entry);
3453
3454 raw_spin_lock_irqsave(&its->lock, flags);
3455 list_add(&dev->entry, &its->its_device_list);
3456 raw_spin_unlock_irqrestore(&its->lock, flags);
3457
3458 /* Map device to its ITT */
3459 its_send_mapd(dev, 1);
3460
3461 return dev;
3462 }
继续看irq alloc这一路。irq_domain_alloc_irqs_hierarchy最终调用domain的回调函数来分配virq,回调函数为msi_domain_alloc。696行会一路调用parent domain的alloc。所以最终调用的是its驱动里的its_irq_domain_alloc。
cpp
684 static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq,
685 unsigned int nr_irqs, void *arg)
686 {
687 struct msi_domain_info *info = domain->host_data;
688 struct msi_domain_ops *ops = info->ops;
689 irq_hw_number_t hwirq = ops->get_hwirq(info, arg);
690 int i, ret;
691
692 if (irq_find_mapping(domain, hwirq) > 0)
693 return -EEXIST;
694
695 if (domain->parent) {
696 ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
697 if (ret < 0)
698 return ret;
699 }
700
701 for (i = 0; i < nr_irqs; i++) {
702 ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg);
703 if (ret < 0) {
704 if (ops->msi_free) {
705 for (i--; i > 0; i--)
706 ops->msi_free(domain, info, virq + i);
707 }
708 irq_domain_free_irqs_top(domain, virq, nr_irqs);
709 return ret;
710 }
711 }
712
713 return 0;
714 }
我们再看下its_irq_domain_alloc,3590行获取hwirq,3599行通过调用gic domain的alloc 建立映射。
cpp
3579 static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
3580 unsigned int nr_irqs, void *args)
3581 {
3582 msi_alloc_info_t *info = args;
3583 struct its_device *its_dev = info->scratchpad[0].ptr;
3584 struct its_node *its = its_dev->its;
3585 struct irq_data *irqd;
3586 irq_hw_number_t hwirq;
3587 int err;
3588 int i;
3589
3590 err = its_alloc_device_irq(its_dev, nr_irqs, &hwirq);
3591 if (err)
3592 return err;
3593
3594 err = iommu_dma_prepare_msi(info->desc, its->get_msi_base(its_dev));
3595 if (err)
3596 return err;
3597
3598 for (i = 0; i < nr_irqs; i++) {
3599 err = its_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
3600 if (err)
3601 return err;
3602
3603 irq_domain_set_hwirq_and_chip(domain, virq + i,
3604 hwirq + i, &its_irq_chip, its_dev);
3605 irqd = irq_get_irq_data(virq + i);
3606 irqd_set_single_target(irqd);
3607 irqd_set_affinity_on_activate(irqd);
3608 irqd_set_resend_when_in_progress(irqd);
3609 pr_debug("ID:%d pID:%d vID:%d\n",
3610 (int)(hwirq + i - its_dev->event_map.lpi_base),
3611 (int)(hwirq + i), virq + i);
3612 }
3613
3614 return 0;
3615 }
再看一下hwirq的获取过程its_alloc_device_irq。就是从its_device分配一个idx,然后加上分配给这个设备的lpi base。
cpp
3476 static int its_alloc_device_irq(struct its_device *dev, int nvecs, irq_hw_number_t *hwirq)
3477 {
3478 int idx;
3479
3480 /* Find a free LPI region in lpi_map and allocate them. */
3481 idx = bitmap_find_free_region(dev->event_map.lpi_map,
3482 dev->event_map.nr_lpis,
3483 get_count_order(nvecs));
3484 if (idx < 0)
3485 return -ENOSPC;
3486
3487 *hwirq = dev->event_map.lpi_base + idx;
3488
3489 return 0;
3490 }
我们看一下requestId、eventId和INTID的关系。对于pcie来说requestId是固定的,我们会通过RC dts节点里的msi-map将它map成一个deviceId,一般都是一一映射,然后用deviceId和eventId在ITS table里创建一个到INTID的映射,这个INTID就是GIC-V3的LPI中断。我们给设备分配eventId时会配置成INTID - lpi_base,lpi_base是分配给设备专用的。这样设备发出一个msi,就会由ITS将对应的INTID发给GIC-V3的redistributor。
