一、硬件卸载
硬件加速,听名字就明白是利用硬件来加速。但这样说不太准确,硬件加速其实是更有效地进行硬件的分工,通过分工实现硬件整体效率的提升。其实硬件卸载就是硬件加速,而实现硬件加速就需要进行功能卸载,整体上就可以叫做硬件卸载。
硬件卸载是指将某些任务或计算从计算机的主处理器 (CPU) 或一些软件功能转移到专用硬件组件(例如网络接口卡 (NIC) 或图形处理单元 (GPU))的过程,以提高系统性能和效率。而提到硬件卸载就不得不提到软件定义网络:
软件定义网络的主流的解决方案是从传统的包含专用硬件与控制平面相结合并提供选定的功能的交换机等单个设备,转移到软件定义网络抽象出的控制平面、数据平面、管理平面三个不同的层。
第一层是管理层或管理平面。它包括像OpenStack这类应用所在的位置,可以将一些配置应用于网络并将其用于虚拟化等领域。下一层是控制平面。最下面是数据平面。它由硬件(例如白盒交换机)和软件(软件数据平面)组成,数据平面就是硬件卸载所在的地方。而软件定义网络一般是指通过某种技术将网络设备的控制面与数据面分离开,形成灵活智能的网络。
软件数据平面是一个常用的术语,用来描述处理网络数据包中用户数据的应用程序。DPDK,数据平面开发套件,这下明白了吧。
二、DPDK中的应用
DPDK主要是应对网络应用开发,所以其主要和网卡打交道。DPDK为了支持网卡的硬件卸载,需要提供相关的软件接口,具体到实际情况,网卡的硬件卸载可能是基于端口设置也可能是基于每个包设置使能。DPDK的mbuf可能是对应着一个包也可能多个mbuf对应着一个包,这在mbuf中都有对应的标识来标定(ol_flags)。在"\dpdk-stable-19.11.14\lib\librte_mbuf\rte_mbuf_core.h"中有这些具体的定义,如"PKT_RX_IP_CKSUM_NONE"等等。在相同文件夹下,还有其它一些具体的宏定义,大家也可以参考。
相关的硬件卸载功能主要是与实际的网卡的实现情况有关,其主要有以下几种情况:
1、计算及更新功能卸载
它根据实际情况又可以分为:
a.VLAN硬件卸载,通过报文中Tag标识来进行标识控制。通过硬件卸载可以直接使用硬件操作而非软件利用CPU来进行操作,减轻了CPU的负荷。
b.IEEE1588硬件卸载功能,PTP(Precision Time Protocol,精确时间同步协议),这个用硬件实现更便捷准确
c.IP TCP/UDP/SCTP checksum硬件卸载功能,这个就不用说了,硬件更快捷高效
d.Tunnel硬件卸载功能,这个更好理解,其实就是纯粹的协议的控制,隧道,就是一个安全通道,一个专门的通道,可以将不同的协议连接起来。类似于集装箱,直接把原始包扔进去做负载。这种简单的封装其实更适合用硬件来完成。
2、分片功能卸载
分片说得有点高级,其实也很好理解,一个大城市不好管理,划成一个个小片不就好管理了,这就是分片。在网络通信中,上层应用可能不会顾及底层的硬件和驱动的缓冲大小,向下输出大量数据,可实际的TCP传输是有大小限制的,这就需要将上层的数据分片成合适的大小。而这个分片的过程,软件其实不如硬件容易实现,因为这种机械的、固定的功能正是硬件最擅长的。
3、组包功能卸载
组包其实就是上面分片的逆操作。毕竟数据最终还是回流到上层应用,必须保持数据的一致性呈现,这就需要把在数据传输过程中的一系列辅助的动作恢复原样。这就和在网上购物一样,可能是一个很小的物件,但在实际传输中可能不断地被打包、验证,然后装箱、运输。等到达目的地时再逆向操作,最终原样送到购买者手中。
三、源码分析
DPDK中提供了对IEEE1588功能的支持:
c
/**
 * Invoke the driver's timesync_enable callback for an Ethernet port.
 *
 * @param port_id
 *   Index of the port in rte_eth_devices[].
 * @return
 *   -ENODEV when the port id fails validation, -ENOTSUP when the driver
 *   provides no timesync_enable callback, otherwise the callback's result
 *   as filtered by eth_err().
 */
int
rte_eth_timesync_enable(uint16_t port_id)
{
	struct rte_eth_dev *eth_dev;
	int status;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
	eth_dev = rte_eth_devices + port_id;

	RTE_FUNC_PTR_OR_ERR_RET(*eth_dev->dev_ops->timesync_enable, -ENOTSUP);
	status = eth_dev->dev_ops->timesync_enable(eth_dev);

	return eth_err(port_id, status);
}
/**
 * Invoke the driver's timesync_disable callback for an Ethernet port.
 *
 * @param port_id
 *   Index of the port in rte_eth_devices[].
 * @return
 *   -ENODEV when the port id fails validation, -ENOTSUP when the driver
 *   provides no timesync_disable callback, otherwise the callback's result
 *   as filtered by eth_err().
 */
int
rte_eth_timesync_disable(uint16_t port_id)
{
	struct rte_eth_dev *eth_dev;
	int status;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
	eth_dev = rte_eth_devices + port_id;

	RTE_FUNC_PTR_OR_ERR_RET(*eth_dev->dev_ops->timesync_disable, -ENOTSUP);
	status = eth_dev->dev_ops->timesync_disable(eth_dev);

	return eth_err(port_id, status);
}
/**
 * Read an RX timestamp from a port through the driver's
 * timesync_read_rx_timestamp callback.
 *
 * @param port_id
 *   Index of the port in rte_eth_devices[].
 * @param timestamp
 *   Output location handed to the driver callback. Must not be NULL.
 * @param flags
 *   Passed unchanged to the driver callback.
 * @return
 *   -ENODEV when the port id fails validation, -EINVAL when timestamp is
 *   NULL, -ENOTSUP when the driver provides no callback, otherwise the
 *   callback's result as filtered by eth_err().
 */
int
rte_eth_timesync_read_rx_timestamp(uint16_t port_id, struct timespec *timestamp,
				   uint32_t flags)
{
	struct rte_eth_dev *dev;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
	dev = &rte_eth_devices[port_id];

	/*
	 * Reject a NULL output pointer up front; the driver callback
	 * presumably writes through it without checking.
	 */
	if (timestamp == NULL)
		return -EINVAL;

	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_read_rx_timestamp, -ENOTSUP);
	return eth_err(port_id, (*dev->dev_ops->timesync_read_rx_timestamp)
				(dev, timestamp, flags));
}
/**
 * Read a TX timestamp from a port through the driver's
 * timesync_read_tx_timestamp callback.
 *
 * @param port_id
 *   Index of the port in rte_eth_devices[].
 * @param timestamp
 *   Output location handed to the driver callback. Must not be NULL.
 * @return
 *   -ENODEV when the port id fails validation, -EINVAL when timestamp is
 *   NULL, -ENOTSUP when the driver provides no callback, otherwise the
 *   callback's result as filtered by eth_err().
 */
int
rte_eth_timesync_read_tx_timestamp(uint16_t port_id,
				   struct timespec *timestamp)
{
	struct rte_eth_dev *dev;

	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
	dev = &rte_eth_devices[port_id];

	/*
	 * Reject a NULL output pointer up front; the driver callback
	 * presumably writes through it without checking.
	 */
	if (timestamp == NULL)
		return -EINVAL;

	RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->timesync_read_tx_timestamp, -ENOTSUP);
	return eth_err(port_id, (*dev->dev_ops->timesync_read_tx_timestamp)
				(dev, timestamp));
}
上面的代码很清晰,就是使能/关闭时间同步功能,以及读取接收和发送两个方向的时间戳。而在app/test-pmd中有计算checksum的方法:
c
/*
 * For each header of a packet, either request the checksum from hardware
 * (by returning the matching PKT_TX_* flag) or compute it here in software,
 * depending on the testpmd command line configuration.
 *
 * l3_hdr:      start of the L3 header; written in place whenever a checksum
 *              is computed or cleared in software.
 * info:        parsed packet description (header lengths, ethertype, L4
 *              protocol, TSO/GSO settings).
 * tx_offloads: DEV_TX_OFFLOAD_* bits consulted for each checksum.
 *
 * Returns the accumulated PKT_TX_* flags, or 0 when the ethertype is neither
 * IPv4 nor IPv6.
 */
static uint64_t
process_inner_cksums(void *l3_hdr, const struct testpmd_offload_info *info,
	uint64_t tx_offloads)
{
	struct rte_ipv4_hdr *ip4 = l3_hdr;
	uint64_t flags = 0;
	uint32_t seg_threshold;
	uint32_t seg_size = 0;

	/*
	 * TSO is only used when a segment size is configured and the packet
	 * is longer than all of its headers plus one segment.
	 */
	if (info->is_tunnel) {
		seg_threshold = info->outer_l2_len + info->outer_l3_len +
			info->l2_len + info->l3_len + info->l4_len +
			info->tunnel_tso_segsz;
		if (info->tunnel_tso_segsz != 0 &&
		    info->pkt_len > seg_threshold)
			seg_size = info->tunnel_tso_segsz;
	} else {
		seg_threshold = info->l2_len + info->l3_len + info->l4_len +
			info->tso_segsz;
		if (info->tso_segsz != 0 && info->pkt_len > seg_threshold)
			seg_size = info->tso_segsz;
	}

	/* L3: only IPv4 carries a header checksum. */
	if (info->ethertype == _htons(RTE_ETHER_TYPE_IPV6)) {
		flags |= PKT_TX_IPV6;
	} else if (info->ethertype == _htons(RTE_ETHER_TYPE_IPV4)) {
		flags |= PKT_TX_IPV4;
		/* TCP with TSO always offloads the IP checksum. */
		if ((info->l4_proto == IPPROTO_TCP && seg_size) ||
		    (tx_offloads & DEV_TX_OFFLOAD_IPV4_CKSUM)) {
			flags |= PKT_TX_IP_CKSUM;
		} else {
			ip4->hdr_checksum = 0;
			ip4->hdr_checksum = rte_ipv4_cksum(ip4);
		}
	} else {
		/* packet type not supported, nothing to do */
		return 0;
	}

	/* L4: the header starts l3_len bytes after the L3 header. */
	switch (info->l4_proto) {
	case IPPROTO_UDP: {
		struct rte_udp_hdr *udp =
			(struct rte_udp_hdr *)((char *)l3_hdr + info->l3_len);

		/* do not recalculate udp cksum if it was 0 */
		if (udp->dgram_cksum != 0) {
			if (tx_offloads & DEV_TX_OFFLOAD_UDP_CKSUM) {
				flags |= PKT_TX_UDP_CKSUM;
			} else {
				udp->dgram_cksum = 0;
				udp->dgram_cksum = get_udptcp_checksum(l3_hdr,
					udp, info->ethertype);
			}
		}
		if (info->gso_enable)
			flags |= PKT_TX_UDP_SEG;
		break;
	}
	case IPPROTO_TCP: {
		struct rte_tcp_hdr *tcp =
			(struct rte_tcp_hdr *)((char *)l3_hdr + info->l3_len);

		if (seg_size) {
			flags |= PKT_TX_TCP_SEG;
		} else if (tx_offloads & DEV_TX_OFFLOAD_TCP_CKSUM) {
			flags |= PKT_TX_TCP_CKSUM;
		} else {
			tcp->cksum = 0;
			tcp->cksum = get_udptcp_checksum(l3_hdr, tcp,
				info->ethertype);
		}
		if (info->gso_enable)
			flags |= PKT_TX_TCP_SEG;
		break;
	}
	case IPPROTO_SCTP: {
		struct rte_sctp_hdr *sctp =
			(struct rte_sctp_hdr *)((char *)l3_hdr + info->l3_len);

		/*
		 * sctp payload must be a multiple of 4 to be offloaded.
		 * NOTE(review): the length is read through the IPv4 header
		 * view even when the ethertype is IPv6, and total_length is
		 * presumably in network byte order - confirm this check tests
		 * the intended bits.
		 */
		if (!(tx_offloads & DEV_TX_OFFLOAD_SCTP_CKSUM) ||
		    (ip4->total_length & 0x3) != 0) {
			sctp->cksum = 0;
			/* XXX implement CRC32c, example available in
			 * RFC3309 */
		} else {
			flags |= PKT_TX_SCTP_CKSUM;
		}
		break;
	}
	default:
		break;
	}

	return flags;
}
看一下组包的支持:
c
/*
 * Initializes Receive Unit.
 */
/*
 * Program the receive side of an ixgbe port.
 *
 * With the RXEN bit cleared in RXCTRL, this writes the frame control
 * register (FCTRL), the CRC-strip / jumbo / loopback bits in HLREG0, then
 * each Rx queue's descriptor ring base, length, head/tail and SRRCTL buffer
 * size. It then calls ixgbe_dev_mq_rx_configure(), updates RXCSUM (and
 * RDRXCTL on 82599EB / X540), calls ixgbe_set_rsc() and finally
 * ixgbe_set_rx_function().
 *
 * RXEN is not set again anywhere in this function.
 *
 * @dev port handle
 *
 * Returns 0 on success, the negative value from
 * ixgbe_check_supported_loopback_mode() if loopback is rejected, or the
 * non-zero value from ixgbe_set_rsc() if RSC setup fails.
 */
int __attribute__((cold))
ixgbe_dev_rx_init(struct rte_eth_dev *dev)
{
struct ixgbe_hw *hw;
struct ixgbe_rx_queue *rxq;
uint64_t bus_addr;
uint32_t rxctrl;
uint32_t fctrl;
uint32_t hlreg0;
uint32_t maxfrs;
uint32_t srrctl;
uint32_t rdrxctl;
uint32_t rxcsum;
uint16_t buf_size;
uint16_t i;
struct rte_eth_rxmode *rx_conf = &dev->data->dev_conf.rxmode;
int rc;
PMD_INIT_FUNC_TRACE();
hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
/*
 * Make sure receives are disabled while setting
 * up the RX context (registers, descriptor rings, etc.).
 */
rxctrl = IXGBE_READ_REG(hw, IXGBE_RXCTRL);
IXGBE_WRITE_REG(hw, IXGBE_RXCTRL, rxctrl & ~IXGBE_RXCTRL_RXEN);
/*
 * Enable receipt of broadcasted frames.
 * NOTE(review): DPF and PMCF are also set here; they presumably control
 * pause-frame / MAC-control-frame handling - confirm against the datasheet.
 */
fctrl = IXGBE_READ_REG(hw, IXGBE_FCTRL);
fctrl |= IXGBE_FCTRL_BAM;
fctrl |= IXGBE_FCTRL_DPF;
fctrl |= IXGBE_FCTRL_PMCF;
IXGBE_WRITE_REG(hw, IXGBE_FCTRL, fctrl);
/*
 * Configure CRC stripping, if any.
 * hlreg0 is accumulated across the next three sections and written once.
 */
hlreg0 = IXGBE_READ_REG(hw, IXGBE_HLREG0);
if (rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC)
hlreg0 &= ~IXGBE_HLREG0_RXCRCSTRP;
else
hlreg0 |= IXGBE_HLREG0_RXCRCSTRP;
/*
 * Configure jumbo frame support, if any.
 * The low 16 bits of MAXFRS are preserved and max_rx_pkt_len is placed
 * in the upper 16 bits.
 */
if (rx_conf->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) {
hlreg0 |= IXGBE_HLREG0_JUMBOEN;
maxfrs = IXGBE_READ_REG(hw, IXGBE_MAXFRS);
maxfrs &= 0x0000FFFF;
maxfrs |= (rx_conf->max_rx_pkt_len << 16);
IXGBE_WRITE_REG(hw, IXGBE_MAXFRS, maxfrs);
} else
hlreg0 &= ~IXGBE_HLREG0_JUMBOEN;
/*
 * If loopback mode is configured, set LPBK bit.
 * An unsupported mode aborts initialization before HLREG0 is written.
 */
if (dev->data->dev_conf.lpbk_mode != 0) {
rc = ixgbe_check_supported_loopback_mode(dev);
if (rc < 0) {
PMD_INIT_LOG(ERR, "Unsupported loopback mode");
return rc;
}
hlreg0 |= IXGBE_HLREG0_LPBK;
} else {
hlreg0 &= ~IXGBE_HLREG0_LPBK;
}
IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0);
/*
 * Assume no header split and no VLAN strip support
 * on any Rx queue first.
 * Only the VLAN strip flag is cleared here; the queue loop below sets it
 * again at port level if any queue requests it.
 */
rx_conf->offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP;
/* Setup RX queues */
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
/*
 * Reset crc_len in case it was changed after queue setup by a
 * call to configure.
 */
if (rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
else
rxq->crc_len = 0;
/*
 * Setup the Base and Length of the Rx Descriptor Rings.
 * The 64-bit ring address is split into low/high 32-bit registers,
 * and head and tail are both reset to 0.
 */
bus_addr = rxq->rx_ring_phys_addr;
IXGBE_WRITE_REG(hw, IXGBE_RDBAL(rxq->reg_idx),
(uint32_t)(bus_addr & 0x00000000ffffffffULL));
IXGBE_WRITE_REG(hw, IXGBE_RDBAH(rxq->reg_idx),
(uint32_t)(bus_addr >> 32));
IXGBE_WRITE_REG(hw, IXGBE_RDLEN(rxq->reg_idx),
rxq->nb_rx_desc * sizeof(union ixgbe_adv_rx_desc));
IXGBE_WRITE_REG(hw, IXGBE_RDH(rxq->reg_idx), 0);
IXGBE_WRITE_REG(hw, IXGBE_RDT(rxq->reg_idx), 0);
/* Configure the SRRCTL register */
srrctl = IXGBE_SRRCTL_DESCTYPE_ADV_ONEBUF;
/* Set if packets are dropped when no descriptors available */
if (rxq->drop_en)
srrctl |= IXGBE_SRRCTL_DROP_EN;
/*
 * Configure the RX buffer size in the BSIZEPACKET field of
 * the SRRCTL register of the queue.
 * The value is in 1 KB resolution. Valid values can be from
 * 1 KB to 16 KB.
 */
buf_size = (uint16_t)(rte_pktmbuf_data_room_size(rxq->mb_pool) -
RTE_PKTMBUF_HEADROOM);
srrctl |= ((buf_size >> IXGBE_SRRCTL_BSIZEPKT_SHIFT) &
IXGBE_SRRCTL_BSIZEPKT_MASK);
IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rxq->reg_idx), srrctl);
/*
 * Recompute buf_size from the field just written, so the comparison
 * below uses the size the register actually holds, not the mbuf size.
 */
buf_size = (uint16_t) ((srrctl & IXGBE_SRRCTL_BSIZEPKT_MASK) <<
IXGBE_SRRCTL_BSIZEPKT_SHIFT);
/*
 * It adds dual VLAN length for supporting dual VLAN.
 * If the largest frame plus two VLAN tags does not fit in one buffer,
 * mark the port as using scattered Rx.
 */
if (dev->data->dev_conf.rxmode.max_rx_pkt_len +
2 * IXGBE_VLAN_TAG_SIZE > buf_size)
dev->data->scattered_rx = 1;
/* Any queue asking for VLAN strip turns the port-level flag back on. */
if (rxq->offloads & DEV_RX_OFFLOAD_VLAN_STRIP)
rx_conf->offloads |= DEV_RX_OFFLOAD_VLAN_STRIP;
}
/* An explicit scatter request also forces scattered Rx. */
if (rx_conf->offloads & DEV_RX_OFFLOAD_SCATTER)
dev->data->scattered_rx = 1;
/*
 * Device configured with multiple RX queues.
 */
ixgbe_dev_mq_rx_configure(dev);
/*
 * Setup the Checksum Register.
 * Disable Full-Packet Checksum which is mutually exclusive with RSS.
 * Enable IP/L4 checksum computation by hardware if requested to do so.
 */
rxcsum = IXGBE_READ_REG(hw, IXGBE_RXCSUM);
rxcsum |= IXGBE_RXCSUM_PCSD;
if (rx_conf->offloads & DEV_RX_OFFLOAD_CHECKSUM)
rxcsum |= IXGBE_RXCSUM_IPPCSE;
else
rxcsum &= ~IXGBE_RXCSUM_IPPCSE;
IXGBE_WRITE_REG(hw, IXGBE_RXCSUM, rxcsum);
/*
 * On 82599EB and X540 the CRC-strip choice is mirrored into RDRXCTL,
 * and the RSCFRSTSIZE field is cleared.
 */
if (hw->mac.type == ixgbe_mac_82599EB ||
hw->mac.type == ixgbe_mac_X540) {
rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
if (rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rdrxctl &= ~IXGBE_RDRXCTL_CRCSTRIP;
else
rdrxctl |= IXGBE_RDRXCTL_CRCSTRIP;
rdrxctl &= ~IXGBE_RDRXCTL_RSCFRSTSIZE;
IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
}
/* Configure RSC (LRO); any non-zero result aborts initialization. */
rc = ixgbe_set_rsc(dev);
if (rc)
return rc;
ixgbe_set_rx_function(dev);
return 0;
}
/**
 * ixgbe_set_rsc - configure RSC related port HW registers
 *
 * Configures the port's RSC related registers according to the 4.6.7.2 chapter
 * of 82599 Spec (x540 configuration is virtually the same).
 *
 * RFCTL is always written (RSC enabled or disabled, NFS filtering disabled).
 * The remaining global and per-queue RSC registers are only touched when
 * DEV_RX_OFFLOAD_TCP_LRO is requested, in which case dev->data->lro is set.
 *
 * @dev port handle
 *
 * Returns 0 in case of success, or -EINVAL when LRO is requested on hardware
 * that does not report the capability or together with
 * DEV_RX_OFFLOAD_KEEP_CRC.
 */
static int
ixgbe_set_rsc(struct rte_eth_dev *dev)
{
struct rte_eth_rxmode *rx_conf = &dev->data->dev_conf.rxmode;
struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
struct rte_eth_dev_info dev_info = { 0 };
bool rsc_capable = false;
uint16_t i;
uint32_t rdrxctl;
uint32_t rfctl;
/*
 * Sanity check: the device must advertise LRO if it is requested.
 * NOTE(review): any result of dev_infos_get() is discarded here.
 */
dev->dev_ops->dev_infos_get(dev, &dev_info);
if (dev_info.rx_offload_capa & DEV_RX_OFFLOAD_TCP_LRO)
rsc_capable = true;
if (!rsc_capable && (rx_conf->offloads & DEV_RX_OFFLOAD_TCP_LRO)) {
PMD_INIT_LOG(CRIT, "LRO is requested on HW that doesn't "
"support it");
return -EINVAL;
}
/* RSC global configuration (chapter 4.6.7.2.1 of 82599 Spec) */
if ((rx_conf->offloads & DEV_RX_OFFLOAD_KEEP_CRC) &&
(rx_conf->offloads & DEV_RX_OFFLOAD_TCP_LRO)) {
/*
 * According to chapter of 4.6.7.2.1 of the Spec Rev.
 * 3.0 RSC configuration requires HW CRC stripping being
 * enabled. If user requested both HW CRC stripping off
 * and RSC on - return an error.
 */
PMD_INIT_LOG(CRIT, "LRO can't be enabled when HW CRC "
"is disabled");
return -EINVAL;
}
/*
 * RFCTL configuration: clear RSC_DIS only when the hardware is capable
 * and LRO is requested; otherwise set it.
 */
rfctl = IXGBE_READ_REG(hw, IXGBE_RFCTL);
if ((rsc_capable) && (rx_conf->offloads & DEV_RX_OFFLOAD_TCP_LRO))
rfctl &= ~IXGBE_RFCTL_RSC_DIS;
else
rfctl |= IXGBE_RFCTL_RSC_DIS;
/* disable NFS filtering */
rfctl |= IXGBE_RFCTL_NFSW_DIS | IXGBE_RFCTL_NFSR_DIS;
IXGBE_WRITE_REG(hw, IXGBE_RFCTL, rfctl);
/* If LRO hasn't been requested - we are done here. */
if (!(rx_conf->offloads & DEV_RX_OFFLOAD_TCP_LRO))
return 0;
/* Set RDRXCTL.RSCACKC bit */
rdrxctl = IXGBE_READ_REG(hw, IXGBE_RDRXCTL);
rdrxctl |= IXGBE_RDRXCTL_RSCACKC;
IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
/* Per-queue RSC configuration (chapter 4.6.7.2.2 of 82599 Spec) */
for (i = 0; i < dev->data->nb_rx_queues; i++) {
struct ixgbe_rx_queue *rxq = dev->data->rx_queues[i];
/* Read-modify-write: start from the current value of each register. */
uint32_t srrctl =
IXGBE_READ_REG(hw, IXGBE_SRRCTL(rxq->reg_idx));
uint32_t rscctl =
IXGBE_READ_REG(hw, IXGBE_RSCCTL(rxq->reg_idx));
uint32_t psrtype =
IXGBE_READ_REG(hw, IXGBE_PSRTYPE(rxq->reg_idx));
uint32_t eitr =
IXGBE_READ_REG(hw, IXGBE_EITR(rxq->reg_idx));
/*
 * ixgbe PMD doesn't support header-split at the moment.
 *
 * Following the 4.6.7.2.1 chapter of the 82599/x540
 * Spec if RSC is enabled the SRRCTL[n].BSIZEHEADER
 * should be configured even if header split is not
 * enabled. We will configure it 128 bytes following the
 * recommendation in the spec.
 */
srrctl &= ~IXGBE_SRRCTL_BSIZEHDR_MASK;
srrctl |= (128 << IXGBE_SRRCTL_BSIZEHDRSIZE_SHIFT) &
IXGBE_SRRCTL_BSIZEHDR_MASK;
/*
 * TODO: Consider setting the Receive Descriptor Minimum
 * Threshold Size for an RSC case. This is not an obviously
 * beneficiary option but the one worth considering...
 */
/*
 * Enable RSC on the queue, OR in the value returned by
 * ixgbe_get_rscctl_maxdesc() for the queue's mbuf pool, and set the
 * TCPHDR bit in PSRTYPE.
 */
rscctl |= IXGBE_RSCCTL_RSCEN;
rscctl |= ixgbe_get_rscctl_maxdesc(rxq->mb_pool);
psrtype |= IXGBE_PSRTYPE_TCPHDR;
/*
 * RSC: Set ITR interval corresponding to 2K ints/s.
 *
 * Full-sized RSC aggregations for a 10Gb/s link will
 * arrive at about 20K aggregation/s rate.
 *
 * 2K ints/s rate will make only 10% of the
 * aggregations to be closed due to the interrupt timer
 * expiration for a streaming at wire-speed case.
 *
 * For a sparse streaming case this setting will yield
 * at most 500us latency for a single RSC aggregation.
 */
eitr &= ~IXGBE_EITR_ITR_INT_MASK;
eitr |= IXGBE_EITR_INTERVAL_US(IXGBE_QUEUE_ITR_INTERVAL_DEFAULT);
eitr |= IXGBE_EITR_CNT_WDIS;
/* Commit all four per-queue registers. */
IXGBE_WRITE_REG(hw, IXGBE_SRRCTL(rxq->reg_idx), srrctl);
IXGBE_WRITE_REG(hw, IXGBE_RSCCTL(rxq->reg_idx), rscctl);
IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(rxq->reg_idx), psrtype);
IXGBE_WRITE_REG(hw, IXGBE_EITR(rxq->reg_idx), eitr);
/*
 * RSC requires the mapping of the queue to the
 * interrupt vector.
 */
ixgbe_set_ivar(dev, rxq->reg_idx, i, 0);
}
/* Record that LRO is active on this port. */
dev->data->lro = 1;
PMD_INIT_LOG(DEBUG, "enabling LRO mode");
return 0;
}
ixgbe_set_rsc会对DEV_RX_OFFLOAD_TCP_LRO进行判断来决定是否组包,这个LRO(Large Receive Offload)其实是RSC(Receive Side Coalescing)的另外一种描述方式。
四、总结
将整个转发的流程相关内容搞清楚后,再去和代码匹配学习,就会发现学习起来相对容易了很多。其实还是推荐大家多看一下DPDK的官方文档,这样会更准确更清晰。但看官方文档有时对小白确实不是一个很友好的入门的方法。这就需要看一看别人学习和实践的心得了。
仁者见仁吧,目的只有一个,学会并能熟练使用DPDK。