Linux网络设备驱动底层原理与实现详解
1. Linux网络设备驱动架构
1.1 网络子系统整体架构
Linux网络子系统采用分层架构设计,以net_device结构体为核心,构建起协议栈与硬件设备之间的桥梁。
用户空间
↓
系统调用接口 (socket)
↓
协议栈 (TCP/IP)
↓
网络设备接口层
↓
驱动程序
↓
硬件设备
核心组件关系
网络子系统主要包含以下关键组件:
- Socket层:提供用户空间网络编程接口
- 协议栈:实现TCP/IP协议族
- 设备接口层:抽象网络设备操作
- 驱动层:具体硬件设备控制
- PHY层:物理层芯片管理
1.2 net_device结构体核心地位
net_device是Linux网络子系统的中枢数据结构,定义了网络设备的所有属性和操作接口:
c
/*
 * Condensed view of struct net_device as used in this text: identity,
 * state, addressing, operation tables, feature flags, counters, NAPI list
 * and driver-private data.
 * NOTE(review): this is an illustrative subset; the member set and layout of
 * the real definition depend on the kernel version - confirm before relying
 * on any individual field.
 */
struct net_device {
char name[IFNAMSIZ]; /* interface name */
struct hlist_node name_hlist; /* node in the name hash list */
struct hlist_node index_hlist; /* node in the index hash list */
int ifindex; /* interface index */
/* device state */
unsigned long state; /* device state bitmap */
unsigned long priv_flags; /* private flags */
/* network-layer information */
struct net *nd_net; /* network namespace */
struct in_device __rcu *ip_ptr; /* IPv4 configuration */
struct inet6_dev *ip6_ptr; /* IPv6 configuration */
/* hardware address */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hardware address */
unsigned char addr_assign_type; /* how the address was assigned */
/* device operation tables */
const struct net_device_ops *netdev_ops;
const struct ethtool_ops *ethtool_ops;
/* feature flags */
netdev_features_t features; /* currently active features */
netdev_features_t hw_features; /* features the hardware supports */
netdev_features_t wanted_features; /* requested features */
/* statistics */
struct net_device_stats stats; /* basic counters */
atomic_long_t rx_dropped; /* receive drop counter */
atomic_long_t tx_dropped; /* transmit drop counter */
/* NAPI */
struct napi_struct *napi_list; /* list of NAPI contexts */
/* private data */
void *priv; /* driver-private data */
};
1.3 协议栈与驱动交互机制
NAPI模式(New API)
NAPI是Linux内核为高性能网络处理设计的接口,采用中断+轮询的混合模式:
c
/*
 * Condensed view of the NAPI context: one instance describes a pollable
 * receive source together with its poll callback.
 * NOTE(review): illustrative subset - confirm against the target kernel.
 */
struct napi_struct {
struct list_head poll_list; /* entry in the poll list */
unsigned long state; /* NAPI state bits */
int weight; /* poll weight */
int (*poll)(struct napi_struct *, int); /* poll callback */
struct net_device *dev; /* owning device */
struct gro_list gro_list; /* GRO list */
};
NAPI工作流程:
- 中断触发:网卡接收到数据包时触发中断
- 屏蔽中断:驱动在中断处理函数中屏蔽网卡的接收中断,并调度NAPI切换到轮询模式
- 轮询处理:内核调用注册的poll函数批量处理数据包
- 重新启用:处理完成后重新启用中断
非NAPI模式
传统的中断驱动模式,每个数据包都触发中断处理:
c
// 中断处理函数示例
static irqreturn_t network_interrupt(int irq, void *dev_id)
{
struct net_device *dev = dev_id;
struct network_priv *priv = netdev_priv(dev);
/* 检查中断状态 */
status = readl(priv->base + STATUS_REG);
if (status & RX_INTERRUPT) {
/* 处理接收中断 */
network_rx(dev);
}
if (status & TX_INTERRUPT) {
/* 处理发送完成中断 */
network_tx_complete(dev);
}
return IRQ_HANDLED;
}
1.4 内核网络数据流路径
发送路径(Tx)
用户空间send() → 系统调用 → 协议栈 → dev_queue_xmit() →
驱动发送函数 → 硬件DMA → 网卡发送 → 网络介质
详细流程:
- 系统调用:用户空间调用send()或sendto()
- 协议栈处理:TCP/UDP层添加协议头
- 路由选择:根据路由表选择出口设备
- 设备队列:调用dev_queue_xmit()进入设备队列
- 驱动发送:驱动程序将数据包映射到DMA缓冲区
- 硬件发送:网卡通过DMA读取数据并发送到网络
接收路径(Rx)
网络介质 → 网卡接收 → DMA传输 → 中断触发 →
驱动接收函数 → netif_rx() → 协议栈 → 用户空间
详细流程:
- 硬件接收:网卡检测到数据包到达
- DMA传输:网卡将数据通过DMA写入内存
- 中断通知:网卡触发中断通知CPU
- 驱动处理:驱动分配sk_buff并填充数据
- 协议栈处理:非NAPI驱动调用netif_rx(),NAPI驱动在poll函数中调用netif_receive_skb()或napi_gro_receive(),将数据包送入协议栈
- 用户接收:数据最终到达用户空间socket缓冲区
2. 关键数据结构解析
2.1 net_device结构体字段详解
基本属性字段
c
/*
 * Second condensed view of struct net_device, focusing on identity, link
 * state bits, hardware addresses and per-protocol configuration pointers.
 * NOTE(review): illustrative subset - confirm against the target kernel.
 */
struct net_device {
/* device identity */
char name[IFNAMSIZ]; /* interface name, e.g. eth0 */
int ifindex; /* unique interface index */
/* device state */
unsigned long state; /* device state bits (bit numbers below) */
#define __LINK_STATE_START 0 /* device has been started */
#define __LINK_STATE_PRESENT 1 /* device is present */
#define __LINK_STATE_NOCARRIER 2 /* no carrier */
/* hardware addresses */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent MAC address */
unsigned char dev_addr[MAX_ADDR_LEN]; /* current MAC address */
unsigned char broadcast[MAX_ADDR_LEN]; /* broadcast address */
/* network-layer configuration */
struct in_device __rcu *ip_ptr; /* IPv4 configuration */
struct inet6_dev *ip6_ptr; /* IPv6 configuration */
};
操作函数集
c
/*
 * Condensed view of the driver callback table attached to a net_device via
 * its netdev_ops pointer.
 * NOTE(review): the available callbacks and their signatures vary between
 * kernel versions - confirm against the target kernel.
 */
struct net_device_ops {
/* device life cycle */
int (*ndo_init)(struct net_device *dev);
void (*ndo_uninit)(struct net_device *dev);
int (*ndo_open)(struct net_device *dev);
int (*ndo_stop)(struct net_device *dev);
/* packet transmission */
netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
struct net_device *dev);
/* device configuration */
int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
/* statistics */
struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
/* VLAN support */
int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
/* multi-queue support: note the (dev, skb, sb_dev) argument order */
u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
struct net_device *sb_dev);
};
2.2 sk_buff结构体内存布局
sk_buff是Linux网络子系统中最重要的数据结构,用于表示网络数据包:
c
/*
 * Condensed view of struct sk_buff, the packet descriptor: list linkage,
 * the four buffer pointers (head <= data <= tail <= end), lengths, header
 * offsets, device association, checksum state, timestamp, control block and
 * reference count.
 * NOTE(review): illustrative subset; member types and layout differ from the
 * real kernel definition - confirm against the target kernel.
 */
struct sk_buff {
/* list management */
struct sk_buff *next; /* next buffer */
struct sk_buff *prev; /* previous buffer */
/* data buffer management */
struct sk_buff_head *list; /* list head this buffer belongs to */
unsigned char *head; /* start of the buffer */
unsigned char *data; /* start of the data */
unsigned char *tail; /* end of the data */
unsigned char *end; /* end of the buffer */
/* data length information */
unsigned int len; /* total data length */
unsigned int data_len; /* length of the data part; presumably the paged (non-linear) part - TODO confirm */
__u16 mac_len; /* MAC header length */
__u16 hdr_len; /* header length when cloned */
/* protocol information */
__u16 protocol; /* protocol type */
__u16 transport_header; /* transport-layer header offset */
__u16 network_header; /* network-layer header offset */
__u16 mac_header; /* MAC-layer header offset */
/* device information */
struct net_device *dev; /* associated device */
struct net_device *input_dev; /* input device */
/* feature flags */
__u32 flags; /* buffer flags */
/* checksum information */
__u32 ip_summed; /* checksum status */
__u32 csum; /* checksum value */
__u32 csum_start; /* where checksumming starts */
__u32 csum_offset; /* checksum offset */
/* timestamp */
ktime_t tstamp; /* timestamp */
/* private data */
char cb[48] __aligned(8); /* control buffer */
/* reference counting */
atomic_t users; /* reference count */
};
sk_buff内存操作API
c
/*
 * Prototype summary of the sk_buff helpers discussed in this text.
 * NOTE(review): listed for reference only; the authoritative declarations
 * live in the kernel headers and may differ by version.
 */
/* allocate an sk_buff */
struct sk_buff *alloc_skb(unsigned int size, gfp_t priority);
struct sk_buff *dev_alloc_skb(unsigned int length);
/* release an sk_buff */
void kfree_skb(struct sk_buff *skb);
void dev_kfree_skb(struct sk_buff *skb);
/* headroom reservation and data-area adjustment */
static inline void skb_reserve(struct sk_buff *skb, int len);
static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len);
static inline unsigned char *skb_pull(struct sk_buff *skb, unsigned int len);
static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len);
/* clone and copy */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
/* protocol header accessors */
static inline struct ethhdr *eth_hdr(const struct sk_buff *skb);
static inline struct iphdr *ip_hdr(const struct sk_buff *skb);
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb);
2.3 网络设备操作集详解
基本操作实现
c
/*
 * Callback table for the sample driver. Entries are grouped by purpose;
 * designated initializers make the order irrelevant to the compiler.
 */
static const struct net_device_ops my_netdev_ops = {
	/* life cycle */
	.ndo_init		= my_init,
	.ndo_uninit		= my_uninit,
	.ndo_open		= my_open,
	.ndo_stop		= my_stop,

	/* data path */
	.ndo_start_xmit		= my_xmit,
	.ndo_tx_timeout		= my_tx_timeout,

	/* addressing and receive filtering */
	.ndo_set_mac_address	= my_set_mac,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_rx_mode	= my_set_multicast_list,

	/* VLAN filtering */
	.ndo_vlan_rx_add_vid	= my_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= my_vlan_rx_kill_vid,

	/* configuration and statistics */
	.ndo_change_mtu		= eth_change_mtu,
	.ndo_do_ioctl		= my_ioctl,
	.ndo_get_stats		= my_get_stats,
};
3. 驱动实现核心技术
3.1 物理层适配(PHY/MAC交互)
PHY设备管理
c
/*
 * Condensed view of the PHY device descriptor: bus linkage, identity, the
 * negotiated link parameters, the state-machine bookkeeping and the
 * capability masks.
 * NOTE(review): illustrative subset - other snippets in this text also use
 * members (lock, attached_dev) that are not listed here; confirm against the
 * target kernel.
 */
struct phy_device {
struct bus_type *bus; /* bus type */
struct device dev; /* embedded device structure */
u32 phy_id; /* PHY chip ID */
const char *drv_name; /* driver name */
/* PHY status */
int speed; /* link speed */
int duplex; /* duplex mode */
int link; /* link state */
int pause; /* flow-control state */
/* state machine */
enum phy_state state; /* PHY state */
struct delayed_work state_queue; /* delayed work that polls the state */
/* configuration */
struct phy_driver *drv; /* PHY driver */
u32 supported; /* supported features */
u32 advertising; /* advertised features */
u32 lp_advertising; /* features advertised by the link partner */
};
MAC层与PHY层交互
c
/* PHY状态变化通知 */
void phy_state_machine(struct work_struct *work)
{
struct phy_device *phydev =
container_of(work, struct phy_device, state_queue.work);
int old_state = phydev->state;
int needs_aneg = 0, err = 0;
mutex_lock(&phydev->lock);
switch (phydev->state) {
case PHY_UP:
needs_aneg = 1;
/* 配置自动协商 */
phydev->state = PHY_AN;
break;
case PHY_AN:
err = phy_read_status(phydev);
if (err)
break;
if (phydev->link) {
phydev->state = PHY_RUNNING;
netif_carrier_on(phydev->attached_dev);
} else {
phydev->state = PHY_NOLINK;
netif_carrier_off(phydev->attached_dev);
}
break;
case PHY_NOLINK:
case PHY_RUNNING:
err = phy_read_status(phydev);
if (err)
break;
if (phydev->link && phydev->state == PHY_NOLINK) {
phydev->state = PHY_RUNNING;
netif_carrier_on(phydev->attached_dev);
} else if (!phydev->link && phydev->state == PHY_RUNNING) {
phydev->state = PHY_NOLINK;
netif_carrier_off(phydev->attached_dev);
}
break;
default:
break;
}
mutex_unlock(&phydev->lock);
if (needs_aneg)
phy_start_aneg(phydev);
/* 调度下一次状态检查 */
schedule_delayed_work(&phydev->state_queue, HZ);
}
3.2 DMA环形缓冲区实现原理
环形缓冲区设计
c
/*
 * Bookkeeping for one DMA ring: parallel arrays holding the sk_buff and the
 * DMA address of every slot, plus producer/consumer indices.
 */
struct dma_ring {
dma_addr_t *dma_addr; /* array of DMA addresses, one per slot */
struct sk_buff **skb; /* array of sk_buff pointers, one per slot */
u32 size; /* ring size */
u32 count; /* number of buffers */
/* producer side */
u32 prod; /* producer index */
u32 prod_next; /* next producer index */
/* consumer side */
u32 cons; /* consumer index */
u32 cons_next; /* next consumer index */
/* synchronisation */
spinlock_t lock; /* spinlock */
atomic_t pending; /* number of pending entries */
};
DMA映射和内存管理
c
/*
 * dma_ring_alloc - allocate and DMA-map every receive buffer of a ring
 * @ring: ring to initialise
 * @size: number of slots
 * @dev:  device used for the DMA mappings
 *
 * Allocates the per-slot bookkeeping arrays, then one RX_BUF_SIZE sk_buff
 * per slot and maps it for device-to-memory DMA.
 *
 * The buffers are allocated without an owning net_device (@dev is a plain
 * struct device and must not be passed where a struct net_device pointer is
 * expected).
 * NOTE(review): skb->dev is expected to be set later on the receive path -
 * confirm for the consuming driver.
 *
 * On failure everything set up so far is undone and the array pointers in
 * @ring are reset to NULL so a later cleanup cannot free them twice.
 *
 * Return: 0 on success, -ENOMEM on allocation or mapping failure.
 */
static int dma_ring_alloc(struct dma_ring *ring, u32 size,
			  struct device *dev)
{
	u32 i;

	ring->size = size;
	ring->count = size;
	ring->prod = 0;
	ring->cons = 0;
	spin_lock_init(&ring->lock);
	atomic_set(&ring->pending, 0);

	/* kcalloc() checks the count * size multiplication for overflow */
	ring->dma_addr = kcalloc(size, sizeof(*ring->dma_addr), GFP_KERNEL);
	if (!ring->dma_addr)
		return -ENOMEM;

	ring->skb = kcalloc(size, sizeof(*ring->skb), GFP_KERNEL);
	if (!ring->skb)
		goto err_free_addr;

	for (i = 0; i < size; i++) {
		struct sk_buff *skb;
		dma_addr_t dma_addr;

		skb = netdev_alloc_skb_ip_align(NULL, RX_BUF_SIZE);
		if (!skb)
			goto err_unwind;

		dma_addr = dma_map_single(dev, skb->data,
					  RX_BUF_SIZE, DMA_FROM_DEVICE);
		if (dma_mapping_error(dev, dma_addr)) {
			dev_kfree_skb_any(skb);
			goto err_unwind;
		}

		ring->skb[i] = skb;
		ring->dma_addr[i] = dma_addr;
	}

	return 0;

err_unwind:
	/* slots [0, i) are fully set up; release them in reverse order */
	while (i--) {
		dma_unmap_single(dev, ring->dma_addr[i],
				 RX_BUF_SIZE, DMA_FROM_DEVICE);
		dev_kfree_skb_any(ring->skb[i]);
	}
	kfree(ring->skb);
	ring->skb = NULL;
err_free_addr:
	kfree(ring->dma_addr);
	ring->dma_addr = NULL;
	return -ENOMEM;
}
DMA接收流程
c
static int dma_rx_process(struct net_device *dev,
struct dma_ring *ring, int budget)
{
struct device *dma_dev = dev->dev.parent;
int processed = 0;
while (processed < budget && ring->cons != ring->prod) {
struct sk_buff *skb;
u32 cons = ring->cons;
int pkt_len;
/* 获取数据包长度 */
pkt_len = dma_get_pkt_len(dev, cons);
if (pkt_len < 0)
break;
skb = ring->skb[cons];
/* 取消DMA映射 */
dma_unmap_single(dma_dev, ring->dma_addr[cons],
RX_BUF_SIZE, DMA_FROM_DEVICE);
/* 设置数据包长度 */
skb_put(skb, pkt_len);
/* 设置协议类型 */
skb->protocol = eth_type_trans(skb, dev);
/* 更新统计信息 */
dev->stats.rx_packets++;
dev->stats.rx_bytes += pkt_len;
/* 提交给协议栈 */
netif_receive_skb(skb);
/* 分配新的sk_buff */
skb = netdev_alloc_skb_ip_align(dev, RX_BUF_SIZE);
if (skb) {
dma_addr_t dma_addr;
dma_addr = dma_map_single(dma_dev, skb->data,
RX_BUF_SIZE, DMA_FROM_DEVICE);
if (!dma_mapping_error(dma_dev, dma_addr)) {
ring->skb[cons] = skb;
ring->dma_addr[cons] = dma_addr;
/* 重新提交给硬件 */
dma_submit_rx_desc(dev, cons, dma_addr, RX_BUF_SIZE);
} else {
dev_kfree_skb_any(skb);
}
}
ring->cons = (cons + 1) % ring->count;
processed++;
}
return processed;
}
3.3 中断处理与轮询模式选择
中断处理优化
c
/* 中断处理函数 */
static irqreturn_t network_irq_handler(int irq, void *dev_id)
{
struct net_device *dev = dev_id;
struct network_priv *priv = netdev_priv(dev);
u32 status;
/* 读取中断状态 */
status = readl(priv->base + INTR_STATUS_REG);
/* 清除中断 */
writel(status, priv->base + INTR_ACK_REG);
if (status & RX_INTR) {
if (priv->use_napi) {
/* NAPI模式:关闭中断,调度NAPI */
writel(0, priv->base + INTR_MASK_REG);
napi_schedule(&priv->napi);
} else {
/* 传统中断模式:直接处理 */
network_rx(dev);
}
}
if (status & TX_INTR) {
/* 处理发送完成中断 */
network_tx_complete(dev);
}
return IRQ_HANDLED;
}
/*
 * network_poll - NAPI poll callback
 * @napi:   NAPI context embedded in struct network_priv
 * @budget: maximum number of packets to receive in this call
 *
 * Receives up to @budget packets and reaps transmit completions. When less
 * than the budget was used, polling is completed and the device interrupts
 * are unmasked - but only if napi_complete_done() reports that the context
 * was really released (it returns false when the poll must run again).
 *
 * Return: number of packets received.
 */
static int network_poll(struct napi_struct *napi, int budget)
{
	struct network_priv *priv = container_of(napi, struct network_priv, napi);
	struct net_device *dev = priv->dev;
	int work_done;

	work_done = network_rx_poll(dev, budget);
	network_tx_complete(dev);

	if (work_done < budget && napi_complete_done(napi, work_done))
		writel(INTR_MASK_DEFAULT, priv->base + INTR_MASK_REG);

	return work_done;
}
自适应中断调节
c
/* 根据负载动态调整中断模式 */
static void network_adaptive_irq(struct net_device *dev)
{
struct network_priv *priv = netdev_priv(dev);
unsigned long now = jiffies;
/* 统计最近时间窗口内的数据包数量 */
if (time_after(now, priv->stats_time + HZ)) {
u32 pkt_rate = priv->rx_packets - priv->last_rx_packets;
if (pkt_rate > HIGH_PKT_RATE_THRESHOLD) {
/* 高负载:切换到NAPI轮询模式 */
if (!priv->use_napi) {
priv->use_napi = 1;
/* 配置NAPI参数 */
priv->napi.weight = 64;
netif_napi_add(dev, &priv->napi, network_poll, 64);
napi_enable(&priv->napi);
}
} else if (pkt_rate < LOW_PKT_RATE_THRESHOLD) {
/* 低负载:切换到中断模式 */
if (priv->use_napi) {
napi_disable(&priv->napi);
netif_napi_del(&priv->napi);
priv->use_napi = 0;
}
}
/* 更新统计 */
priv->last_rx_packets = priv->rx_packets;
priv->stats_time = now;
}
}
3.4 流量控制(QoS)实现机制
流量控制框架
c
/*
 * Condensed view of a transmit queue descriptor.
 * NOTE(review): illustrative subset - confirm member names and types
 * against the target kernel.
 */
struct netdev_queue {
struct net_device *dev; /* owning device */
/* queue state */
spinlock_t _xmit_lock; /* transmit lock */
int xmit_lock_owner; /* current lock holder */
/* rate management */
unsigned long tx_maxrate; /* maximum transmit rate */
unsigned long tx_rate; /* current transmit rate */
/* qdisc configuration */
struct Qdisc *qdisc; /* queueing discipline */
struct Qdisc *qdisc_sleeping; /* sleeping qdisc */
/* statistics */
struct netdev_queue_stats *stats; /* counters */
};
QoS队列实现
c
/*
 * network_select_queue - choose the transmit queue for a packet
 * @skb:    packet to be transmitted
 * @dev:    network device
 * @sb_dev: subordinate device (unused)
 *
 * A packet whose priority is a valid queue number goes to that queue; every
 * other packet is spread over the queues by its flow hash. With no transmit
 * queue configured, queue 0 is returned instead of dividing by zero.
 *
 * NOTE(review): the ndo_select_queue callback shown elsewhere in this text
 * takes (dev, skb, sb_dev) and returns u16; this function's argument order
 * and return type differ and are kept unchanged for existing callers -
 * confirm before wiring it into a net_device_ops table.
 *
 * Return: selected queue index.
 */
static netdev_tx_t network_select_queue(struct sk_buff *skb,
					struct net_device *dev,
					struct net_device *sb_dev)
{
	struct network_priv *priv = netdev_priv(dev);
	u16 queue_index;

	if (!priv->num_tx_queues)
		return 0;

	if (skb->priority < priv->num_tx_queues)
		queue_index = skb->priority;
	else
		queue_index = skb_get_hash(skb) % priv->num_tx_queues;

	return queue_index;
}
/* defined further down in this file; declared here because it is used first */
static int network_setup_mqprio(struct net_device *dev, void *type_data);

/*
 * network_setup_tc - dispatch a traffic-control offload request
 * @dev:       network device
 * @type:      kind of offload being configured
 * @type_data: request payload, interpreted by the per-type handler
 *
 * Return: result of the per-type handler, or -EOPNOTSUPP for an offload
 * type this driver does not handle.
 */
static int network_setup_tc(struct net_device *dev, enum tc_setup_type type,
			    void *type_data)
{
	switch (type) {
	case TC_SETUP_QDISC_MQPRIO:
		return network_setup_mqprio(dev, type_data);
	case TC_SETUP_CLSFLOWER:
		return network_setup_clsflower(dev, type_data);
	case TC_SETUP_QDISC_TBF:
		return network_setup_tbf(dev, type_data);
	default:
		return -EOPNOTSUPP;
	}
}
/* 多队列优先级配置 */
static int network_setup_mqprio(struct net_device *dev, void *type_data)
{
struct tc_mqprio_qopt *qopt = type_data;
struct network_priv *priv = netdev_priv(dev);
int i;
/* 验证队列数量 */
if (qopt->num_tc > priv->max_tc || qopt->num_queues > priv->num_tx_queues)
return -EINVAL;
/* 配置流量类别 */
for (i = 0; i < qopt->num_tc; i++) {
priv->tc_to_queue[i] = qopt->offset[i];
priv->tc_to_count[i] = qopt->count[i];
}
priv->num_tc = qopt->num_tc;
/* 更新硬件寄存器 */
network_hw_setup_tc(priv, qopt);
return 0;
}
4. 性能优化要点
4.1 零拷贝技术实现
DMA零拷贝
c
/* 零拷贝发送实现 */
static netdev_tx_t network_xmit_zero_copy(struct sk_buff *skb,
struct net_device *dev)
{
struct network_priv *priv = netdev_priv(dev);
struct dma_tx_desc *desc;
dma_addr_t dma_addr;
int entry;
/* 获取发送描述符 */
entry = priv->tx_prod;
desc = &priv->tx_ring[entry];
/* 直接使用sk_buff数据,避免复制 */
dma_addr = dma_map_single(dev->dev.parent, skb->data,
skb->len, DMA_TO_DEVICE);
if (dma_mapping_error(dev->dev.parent, dma_addr)) {
dev_kfree_skb_any(skb);
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
/* 填充描述符 */
desc->addr = dma_addr;
desc->len = skb->len;
desc->cmd = TX_DESC_CMD_EOP | TX_DESC_CMD_RS;
/* 保存sk_buff指针用于完成处理 */
priv->tx_skb[entry] = skb;
/* 更新生产者指针 */
priv->tx_prod = (entry + 1) % TX_RING_SIZE;
/* 通知硬件 */
writel(TX_RING_TAIL(priv->tx_prod), priv->base + TX_TAIL_REG);
/* 检查是否需要停止队列 */
if ((priv->tx_prod + 1) % TX_RING_SIZE == priv->tx_cons) {
netif_stop_queue(dev);
}
return NETDEV_TX_OK;
}
页面零拷贝
c
/*
 * network_map_page - map part of a page for DMA and describe it to the HW
 * @dev:       network device
 * @page:      page holding the data
 * @offset:    offset of the data within @page
 * @size:      number of bytes to map
 * @direction: DMA direction passed through to dma_map_page()
 *
 * Return: 0 on success, -ENOMEM when the mapping fails.
 */
static int network_map_page(struct net_device *dev, struct page *page,
			    unsigned int offset, unsigned int size,
			    int direction)
{
	struct device *dma_dev = dev->dev.parent;
	dma_addr_t handle;

	handle = dma_map_page(dma_dev, page, offset, size, direction);
	if (dma_mapping_error(dma_dev, handle))
		return -ENOMEM;

	/* hand the mapping to the hardware descriptor */
	network_setup_page_desc(netdev_priv(dev), handle, size, offset);
	return 0;
}
4.2 多队列网卡驱动设计
RSS(接收端缩放)
c
/* Driver-side copy of the receive-side-scaling (RSS) configuration. */
struct rss_config {
u8 num_queues; /* number of RSS queues */
u8 hash_key[40]; /* RSS hash key */
u32 hash_types; /* hash types */
u16 indirection_table[128]; /* indirection table (hash bucket -> queue) */
};
/*
 * network_setup_rss - update the RSS indirection table and/or hash key
 * @dev:   network device
 * @indir: new indirection table with one entry per table slot, or NULL to
 *         keep the current table
 * @key:   new hash key of sizeof(hash_key) bytes, or NULL to keep the key
 * @hfunc: requested hash function
 *
 * ETH_RSS_HASH_NO_CHANGE is accepted in addition to ETH_RSS_HASH_TOP, since
 * callers pass it when only the table or key is being updated. The whole
 * indirection table is validated before any entry is stored, so an invalid
 * request leaves the configuration untouched.
 *
 * Return: 0 on success, -EOPNOTSUPP for an unsupported hash function,
 * -EINVAL when an indirection entry names a non-existent receive queue.
 */
static int network_setup_rss(struct net_device *dev, u32 *indir,
			     u8 *key, u8 hfunc)
{
	struct network_priv *priv = netdev_priv(dev);
	struct rss_config *rss = &priv->rss;
	int i;

	if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
		return -EOPNOTSUPP;

	if (indir) {
		for (i = 0; i < ARRAY_SIZE(rss->indirection_table); i++) {
			if (indir[i] >= priv->num_rx_queues)
				return -EINVAL;
		}

		for (i = 0; i < ARRAY_SIZE(rss->indirection_table); i++)
			rss->indirection_table[i] = indir[i];
	}

	if (key)
		memcpy(rss->hash_key, key, sizeof(rss->hash_key));

	/* push the configuration to the hardware */
	network_hw_setup_rss(priv, rss);

	return 0;
}
/*
 * network_rss_hash - compute the RSS hash of a packet
 * @dev: network device
 * @skb: packet to hash
 *
 * IPv4 and IPv6 packets use the driver's protocol-specific hash helpers;
 * every other protocol falls back to the hash already stored in the skb.
 *
 * Return: the hash value.
 */
static u32 network_rss_hash(struct net_device *dev, struct sk_buff *skb)
{
	struct network_priv *priv = netdev_priv(dev);

	if (skb->protocol == htons(ETH_P_IP))
		return network_rss_hash_ipv4(priv, skb);

	if (skb->protocol == htons(ETH_P_IPV6))
		return network_rss_hash_ipv6(priv, skb);

	return skb_get_hash_raw(skb);
}
RPS(接收包转向)
c
/*
 * network_setup_rps - enable or disable receive packet steering on a queue
 * @dev:      network device
 * @queue_id: index of the receive queue
 * @map:      CPU map to install, or NULL to disable steering
 *
 * NOTE(review): ownership and lifetime of @map are not visible here; the
 * pointer is stored as-is and is left untouched when steering is disabled -
 * confirm who frees it.
 *
 * Return: 0 on success, -EINVAL when @queue_id is negative or not below the
 * number of receive queues.
 */
static int network_setup_rps(struct net_device *dev, int queue_id,
			     struct rps_map *map)
{
	struct network_priv *priv = netdev_priv(dev);
	struct netdev_rx_queue *rx_queue;

	if (queue_id < 0 || queue_id >= priv->num_rx_queues)
		return -EINVAL;

	rx_queue = &priv->rx_queues[queue_id];

	if (map) {
		rx_queue->rps_map = map;
		rx_queue->rps_enabled = 1;
	} else {
		rx_queue->rps_enabled = 0;
	}

	return 0;
}
4.3 软中断负载均衡
软中断处理优化
c
/* 软中断处理函数 */
static void network_napi_complete(struct napi_struct *napi)
{
struct network_priv *priv = container_of(napi, struct network_priv, napi);
int cpu = smp_processor_id();
/* 动态调整NAPI权重 */
if (napi->weight < 64 && priv->rx_packets > 10000) {
napi->weight = 64;
} else if (napi->weight > 16 && priv->rx_packets < 1000) {
napi->weight = 16;
}
/* 重新调度到合适的CPU */
if (cpu != priv->preferred_cpu && priv->rx_packets > 5000) {
/* 高负载时迁移到首选CPU */
irq_set_affinity(priv->irq, cpumask_of(priv->preferred_cpu));
}
napi_complete(napi);
}
CPU亲和性管理
c
/*
 * network_set_cpu_affinity - bind the device's interrupt handling to CPUs
 * @dev:  network device
 * @mask: CPUs to use; every CPU in the mask must be online
 *
 * Applies @mask to the device interrupt and, in NAPI mode, as the XPS map
 * of transmit queue 0. The mask is remembered in the private data only when
 * every step succeeded.
 *
 * Return: 0 on success, -EINVAL when @mask contains an offline CPU, or the
 * error reported by irq_set_affinity() / netif_set_xps_queue().
 */
static int network_set_cpu_affinity(struct net_device *dev,
				    const struct cpumask *mask)
{
	struct network_priv *priv = netdev_priv(dev);
	int cpu;
	int ret;

	for_each_cpu(cpu, mask) {
		if (!cpu_online(cpu))
			return -EINVAL;
	}

	if (priv->irq >= 0) {
		ret = irq_set_affinity(priv->irq, mask);
		if (ret)
			return ret;
	}

	/* transmit packet steering for queue 0 follows the same CPUs */
	if (priv->use_napi) {
		ret = netif_set_xps_queue(dev, mask, 0);
		if (ret)
			return ret;
	}

	cpumask_copy(&priv->cpu_affinity, mask);

	return 0;
}
4.4 巨型帧(Jumbo Frame)支持
巨型帧配置
c
/* MTU变更处理 */
static int network_change_mtu(struct net_device *dev, int new_mtu)
{
struct network_priv *priv = netdev_priv(dev);
int max_mtu = priv->hw->max_mtu;
int min_mtu = ETH_MIN_MTU;
/* 验证MTU范围 */
if (new_mtu < min_mtu || new_mtu > max_mtu)
return -EINVAL;
/* 检查是否需要重新分配缓冲区 */
if (new_mtu > PAGE_SIZE) {
int needed_pages = (new_mtu + PAGE_SIZE - 1) / PAGE_SIZE;
/* 验证系统支持 */
if (!priv->hw->support_multi_page) {
netdev_err(dev, "Hardware doesn't support multi-page frames\n");
return -EOPNOTSUPP;
}
}
/* 停止设备 */
if (netif_running(dev))
network_close(dev);
/* 更新缓冲区大小 */
priv->rx_buf_size = new_mtu + ETH_HLEN + VLAN_HLEN + NET_IP_ALIGN;
priv->tx_buf_size = new_mtu + ETH_HLEN + VLAN_HLEN;
/* 重新分配环形缓冲区 */
network_free_rings(dev);
network_alloc_rings(dev);
/* 重新启动设备 */
if (priv->state == NETWORK_STATE_OPEN)
network_open(dev);
dev->mtu = new_mtu;
return 0;
}
巨型帧接收处理
c
/* 巨型帧接收处理 */
static int network_rx_jumbo(struct net_device *dev, int budget)
{
struct network_priv *priv = netdev_priv(dev);
int processed = 0;
while (processed < budget) {
struct sk_buff *skb;
u32 len, status;
int page_cnt;
/* 读取状态 */
status = readl(priv->base + RX_STATUS_REG);
if (!(status & RX_STATUS_VALID))
break;
len = (status >> 16) & 0x3FFF;
/* 计算需要的页面数 */
page_cnt = (len + PAGE_SIZE - 1) / PAGE_SIZE;
if (page_cnt == 1) {
/* 单页帧处理 */
skb = netdev_alloc_skb_ip_align(dev, len);
if (!skb) {
dev->stats.rx_errors++;
continue;
}
/* 复制数据 */
memcpy_fromio(skb->data, priv->rx_buf, len);
skb_put(skb, len);
} else {
/* 多页帧处理 */
skb = network_build_jumbo_skb(dev, len, page_cnt);
if (!skb) {
dev->stats.rx_errors++;
continue;
}
}
/* 设置协议类型 */
skb->protocol = eth_type_trans(skb, dev);
/* 更新统计 */
dev->stats.rx_packets++;
dev->stats.rx_bytes += len;
/* 提交给协议栈 */
napi_gro_receive(&priv->napi, skb);
processed++;
}
return processed;
}
/* bytes copied into the linear area so protocol headers can be parsed */
#define NETWORK_JUMBO_HDR_LEN	128

/*
 * network_build_jumbo_skb - assemble an sk_buff for a multi-page frame
 * @dev:      network device
 * @len:      frame length in bytes
 * @page_cnt: number of pages the frame spans; frames needing more than
 *            MAX_SKB_FRAGS pages are rejected
 *
 * The first NETWORK_JUMBO_HDR_LEN bytes (or the whole frame if shorter) are
 * copied into the linear area, because the caller's eth_type_trans() pulls
 * the Ethernet header from linear data. The remainder is copied into freshly
 * allocated pages attached as fragments. skb_add_rx_frag() maintains len,
 * data_len and truesize itself, so they are not adjusted by hand.
 *
 * Return: the assembled sk_buff, or NULL on allocation failure or when the
 * frame needs too many fragments. Pages already attached are released
 * together with the sk_buff.
 */
static struct sk_buff *network_build_jumbo_skb(struct net_device *dev,
					       u32 len, int page_cnt)
{
	struct network_priv *priv = netdev_priv(dev);
	u32 hdr_len = min_t(u32, len, NETWORK_JUMBO_HDR_LEN);
	struct sk_buff *skb;
	u32 offset;
	int frag = 0;

	if (page_cnt > MAX_SKB_FRAGS)
		return NULL;

	skb = netdev_alloc_skb_ip_align(dev, hdr_len);
	if (!skb)
		return NULL;

	memcpy_fromio(skb_put(skb, hdr_len), priv->rx_buf, hdr_len);

	for (offset = hdr_len; offset < len; frag++) {
		u32 chunk = min_t(u32, len - offset, PAGE_SIZE);
		struct page *page;

		if (frag >= MAX_SKB_FRAGS)
			goto err_free;

		page = alloc_page(GFP_ATOMIC);
		if (!page)
			goto err_free;

		memcpy_fromio(page_address(page),
			      priv->rx_buf + offset, chunk);

		skb_add_rx_frag(skb, frag, page, 0, chunk, PAGE_SIZE);
		offset += chunk;
	}

	return skb;

err_free:
	dev_kfree_skb_any(skb);
	return NULL;
}
5. 实际驱动开发示例
5.1 网络设备注册流程
设备初始化
c
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2024 Linux Network Driver Project */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/mii.h>
#include <linux/dma-mapping.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#define DRV_NAME "example_network"
#define DRV_VERSION "1.0.0"
MODULE_AUTHOR("Linux Network Driver Project");
MODULE_DESCRIPTION("Example Network Driver Implementation");
MODULE_LICENSE("GPL v2");
MODULE_VERSION(DRV_VERSION);
/*
 * Per-device private data of the example driver, stored in the area that
 * alloc_etherdev() reserves behind struct net_device (see netdev_priv()).
 *
 * Besides the original members, this definition carries the fields that the
 * transmit path, the ethtool handlers and the procfs output of this driver
 * read or write (tx_skb, tx_timeout, rx_jumbo_ring_size, fw_major, fw_minor,
 * link_speed, link_duplex).
 */
struct example_priv {
	struct net_device *dev;		/* back pointer to the net_device */
	void __iomem *base;		/* base of the register window */
	int irq;			/* interrupt number */

	/* PHY management */
	struct phy_device *phydev;	/* attached PHY */
	struct mii_bus *mii_bus;	/* MII bus */
	int link_speed;			/* link speed in Mbps */
	int link_duplex;		/* non-zero for full duplex */

	/* DMA management */
	dma_addr_t rx_dma;		/* RX DMA address */
	dma_addr_t tx_dma;		/* TX DMA address */
	void *rx_buf;			/* RX buffer */
	void *tx_buf;			/* TX buffer */

	/* transmit state: one packet is in flight at a time */
	struct sk_buff *tx_skb;		/* packet being sent, NULL when idle */
	unsigned long tx_timeout;	/* jiffies deadline of that packet */

	/* NAPI */
	struct napi_struct napi;	/* NAPI context */

	/* statistics */
	struct net_device_stats stats;	/* network counters */

	/* firmware version */
	int fw_major;
	int fw_minor;

	/* configuration */
	int rx_ring_size;		/* RX ring size */
	int rx_jumbo_ring_size;		/* RX jumbo ring size */
	int tx_ring_size;		/* TX ring size */
	int msg_enable;			/* message level */
};
/* 设备探测函数 */
static int example_probe(struct platform_device *pdev)
{
struct net_device *dev;
struct example_priv *priv;
struct resource *res;
int ret;
/* 分配网络设备 */
dev = alloc_etherdev(sizeof(struct example_priv));
if (!dev) {
dev_err(&pdev->dev, "Failed to allocate netdev\n");
return -ENOMEM;
}
priv = netdev_priv(dev);
priv->dev = dev;
/* 设置设备操作函数 */
dev->netdev_ops = &example_netdev_ops;
dev->ethtool_ops = &example_ethtool_ops;
/* 获取I/O资源 */
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
priv->base = devm_ioremap_resource(&pdev->dev, res);
if (IS_ERR(priv->base)) {
ret = PTR_ERR(priv->base);
goto err_free_netdev;
}
/* 获取中断资源 */
priv->irq = platform_get_irq(pdev, 0);
if (priv->irq < 0) {
ret = priv->irq;
goto err_free_netdev;
}
/* 设置默认配置 */
priv->rx_ring_size = 256;
priv->tx_ring_size = 256;
priv->msg_enable = NETIF_MSG_LINK | NETIF_MSG_IFUP;
/* 设置设备特性 */
dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
/* 注册网络设备 */
ret = register_netdev(dev);
if (ret) {
dev_err(&pdev->dev, "Failed to register netdev\n");
goto err_free_netdev;
}
platform_set_drvdata(pdev, dev);
dev_info(&pdev->dev, "%s: Example network driver initialized\n",
dev->name);
return 0;
err_free_netdev:
free_netdev(dev);
return ret;
}
/*
 * example_remove - unbind the driver from a platform device
 * @pdev: platform device being removed
 *
 * Unregisters the interface stored as driver data by the probe routine and
 * releases the net_device.
 *
 * Return: always 0.
 */
static int example_remove(struct platform_device *pdev)
{
	struct net_device *ndev = platform_get_drvdata(pdev);

	unregister_netdev(ndev);
	free_netdev(ndev);

	return 0;
}
/*
 * Platform driver descriptor: binds example_probe()/example_remove() to
 * devices named DRV_NAME or matched through the device-tree table.
 * NOTE(review): example_of_match is not defined in the visible part of this
 * file - confirm it is declared before this point.
 */
static struct platform_driver example_driver = {
.probe = example_probe,
.remove = example_remove,
.driver = {
.name = DRV_NAME,
.of_match_table = example_of_match,
},
};
/* registers the driver on module load and unregisters it on unload */
module_platform_driver(example_driver);
5.2 基本发送/接收函数实现
数据包发送函数
c
/* 数据包发送函数 */
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct example_priv *priv = netdev_priv(dev);
unsigned int len = skb->len;
dma_addr_t dma_addr;
int ret;
/* 检查设备状态 */
if (!(readl(priv->base + NET_CTRL_REG) & NET_CTRL_TX_ENABLE)) {
dev->stats.tx_dropped++;
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
/* 映射DMA缓冲区 */
dma_addr = dma_map_single(dev->dev.parent, skb->data, len, DMA_TO_DEVICE);
if (dma_mapping_error(dev->dev.parent, dma_addr)) {
dev->stats.tx_dropped++;
dev_kfree_skb_any(skb);
return NETDEV_TX_OK;
}
/* 保存sk_buff指针 */
priv->tx_skb = skb;
/* 配置发送描述符 */
writel(dma_addr, priv->base + TX_DESC_ADDR_REG);
writel(len, priv->base + TX_DESC_LEN_REG);
writel(TX_DESC_CMD_EOP | TX_DESC_CMD_RS, priv->base + TX_DESC_CTRL_REG);
/* 启动发送 */
writel(TX_START, priv->base + TX_START_REG);
/* 停止队列直到发送完成 */
netif_stop_queue(dev);
/* 设置超时 */
priv->tx_timeout = jiffies + TX_TIMEOUT;
return NETDEV_TX_OK;
}
/*
 * example_tx_complete - reap the in-flight transmit packet
 * @dev: network device
 *
 * Called when the hardware signals the end of a transmission. The packet is
 * unmapped and released and the queue is woken both on success and on error;
 * releasing it only on success would leak the packet and leave the queue
 * stopped after a failed transmission.
 *
 * NOTE(review): assumes the hardware no longer accesses the buffer once
 * TX_STATUS_ERROR is reported - confirm against the device documentation.
 * NOTE(review): the DMA address is read back from a 32-bit register; this is
 * only correct while DMA addresses fit in 32 bits.
 */
static void example_tx_complete(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);
	struct sk_buff *skb;
	dma_addr_t dma_addr;
	u32 status;

	status = readl(priv->base + TX_STATUS_REG);
	if (!(status & (TX_STATUS_COMPLETE | TX_STATUS_ERROR)))
		return;

	if (status & TX_STATUS_ERROR) {
		dev->stats.tx_errors++;

		/* FIFO underrun: raise the FIFO threshold */
		if (status & TX_STATUS_UNDERRUN)
			network_adjust_fifo_threshold(priv);
	}

	skb = priv->tx_skb;
	if (!skb)
		return;

	dma_addr = readl(priv->base + TX_DESC_ADDR_REG);
	dma_unmap_single(dev->dev.parent, dma_addr,
			 skb->len, DMA_TO_DEVICE);

	if (status & TX_STATUS_COMPLETE) {
		dev->stats.tx_packets++;
		dev->stats.tx_bytes += skb->len;
		dev_consume_skb_any(skb);
	} else {
		/* failed transmission: account the skb as dropped */
		dev_kfree_skb_any(skb);
	}

	priv->tx_skb = NULL;

	/* the single transmit slot is free again */
	netif_wake_queue(dev);
}
数据包接收函数
c
/*
 * example_rx - receive up to @budget frames
 * @dev:    network device
 * @budget: maximum number of frames to handle in this call
 *
 * For every frame announced in RX_STATUS_REG (length in bits 16..29) a
 * buffer is allocated and mapped, the hardware is told to transfer the frame
 * into it, and the result is handed to the stack through GRO.
 *
 * Every frame handled counts against @budget, including frames that are
 * dropped, so the loop always terminates within the budget. A frame that
 * cannot be buffered (allocation or mapping failure) is explicitly
 * discarded in hardware so that it is not presented again.
 *
 * Return: number of frames handled (delivered or dropped).
 */
static int example_rx(struct net_device *dev, int budget)
{
	struct example_priv *priv = netdev_priv(dev);
	struct device *dma_dev = dev->dev.parent;
	int received = 0;

	while (received < budget) {
		struct sk_buff *skb;
		dma_addr_t dma_addr;
		u32 len, status;

		status = readl(priv->base + RX_STATUS_REG);
		if (!(status & RX_STATUS_VALID))
			break;

		received++;

		len = (status >> 16) & 0x3FFF;

		/* the helper already reserves NET_IP_ALIGN bytes of headroom */
		skb = netdev_alloc_skb_ip_align(dev, len);
		if (!skb) {
			dev->stats.rx_dropped++;
			writel(RX_DISCARD, priv->base + RX_CTRL_REG);
			continue;
		}

		dma_addr = dma_map_single(dma_dev, skb->data,
					  len, DMA_FROM_DEVICE);
		if (dma_mapping_error(dma_dev, dma_addr)) {
			dev_kfree_skb_any(skb);
			dev->stats.rx_dropped++;
			writel(RX_DISCARD, priv->base + RX_CTRL_REG);
			continue;
		}

		/* program the receive descriptor registers */
		writel(dma_addr, priv->base + RX_DESC_ADDR_REG);
		writel(len, priv->base + RX_DESC_LEN_REG);
		writel(RX_DESC_CMD_OWN, priv->base + RX_DESC_CTRL_REG);

		/* start the transfer and wait for it to finish */
		writel(RX_START, priv->base + RX_START_REG);

		if (!network_wait_for_rx(priv)) {
			/* transfer timed out */
			dma_unmap_single(dma_dev, dma_addr,
					 len, DMA_FROM_DEVICE);
			dev_kfree_skb_any(skb);
			dev->stats.rx_errors++;
			continue;
		}

		dma_unmap_single(dma_dev, dma_addr, len, DMA_FROM_DEVICE);

		skb_put(skb, len);
		skb->protocol = eth_type_trans(skb, dev);

		dev->stats.rx_packets++;
		dev->stats.rx_bytes += len;

		/* trust the hardware checksum verdict when it reports success */
		if (status & RX_STATUS_CSUM_OK)
			skb->ip_summed = CHECKSUM_UNNECESSARY;
		else
			skb->ip_summed = CHECKSUM_NONE;

		napi_gro_receive(&priv->napi, skb);
	}

	return received;
}
/*
 * example_poll - NAPI poll callback of the example driver
 * @napi:   NAPI context embedded in struct example_priv
 * @budget: maximum number of frames to receive in this call
 *
 * Receives frames and reaps the transmit completion. When the budget was not
 * exhausted, polling is completed; interrupts are re-enabled only if
 * napi_complete_done() confirms that the context was released (it returns
 * false when another poll has been requested in the meantime).
 *
 * Return: number of frames handled by example_rx().
 */
static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	struct net_device *dev = priv->dev;
	int work_done;

	work_done = example_rx(dev, budget);
	example_tx_complete(dev);

	if (work_done < budget && napi_complete_done(napi, work_done))
		writel(INTR_ENABLE, priv->base + INTR_MASK_REG);

	return work_done;
}
5.3 统计计数器维护
硬件统计收集
c
/* 更新硬件统计信息 */
static void example_update_stats(struct net_device *dev)
{
struct example_priv *priv = netdev_priv(dev);
struct net_device_stats *stats = &dev->stats;
/* 读取硬件统计寄存器 */
stats->rx_packets += readl(priv->base + RX_PACKETS_REG);
stats->tx_packets += readl(priv->base + TX_PACKETS_REG);
stats->rx_bytes += readl(priv->base + RX_BYTES_REG);
stats->tx_bytes += readl(priv->base + TX_BYTES_REG);
stats->rx_errors += readl(priv->base + RX_ERRORS_REG);
stats->tx_errors += readl(priv->base + TX_ERRORS_REG);
stats->rx_dropped += readl(priv->base + RX_DROPPED_REG);
stats->tx_dropped += readl(priv->base + TX_DROPPED_REG);
stats->multicast += readl(priv->base + RX_MULTICAST_REG);
stats->collisions += readl(priv->base + COLLISIONS_REG);
/* 详细错误统计 */
stats->rx_length_errors += readl(priv->base + RX_LEN_ERRORS_REG);
stats->rx_over_errors += readl(priv->base + RX_OVER_ERRORS_REG);
stats->rx_crc_errors += readl(priv->base + RX_CRC_ERRORS_REG);
stats->rx_frame_errors += readl(priv->base + RX_FRAME_ERRORS_REG);
stats->rx_fifo_errors += readl(priv->base + RX_FIFO_ERRORS_REG);
stats->rx_missed_errors += readl(priv->base + RX_MISSED_ERRORS_REG);
stats->tx_aborted_errors += readl(priv->base + TX_ABORT_ERRORS_REG);
stats->tx_carrier_errors += readl(priv->base + TX_CARRIER_ERRORS_REG);
stats->tx_fifo_errors += readl(priv->base + TX_FIFO_ERRORS_REG);
stats->tx_heartbeat_errors += readl(priv->base + TX_HB_ERRORS_REG);
stats->tx_window_errors += readl(priv->base + TX_WINDOW_ERRORS_REG);
/* 清除硬件计数器 */
writel(0, priv->base + STATS_CLEAR_REG);
}
/*
 * Table of the statistics exported through ethtool. It is defined ahead of
 * the functions that index it, since C requires the declaration to precede
 * the first use.
 */
static const struct example_ethtool_stats example_ethtool_stats[] = {
	ETHTOOL_STAT(rx_packets, get_rx_packets),
	ETHTOOL_STAT(tx_packets, get_tx_packets),
	ETHTOOL_STAT(rx_bytes, get_rx_bytes),
	ETHTOOL_STAT(tx_bytes, get_tx_bytes),
	ETHTOOL_STAT(rx_errors, get_rx_errors),
	ETHTOOL_STAT(tx_errors, get_tx_errors),
	ETHTOOL_STAT(rx_dropped, get_rx_dropped),
	ETHTOOL_STAT(tx_dropped, get_tx_dropped),
	/* more statistics... */
};

/* number of exported statistics, derived from the table unless set elsewhere */
#ifndef EXAMPLE_NUM_STATS
#define EXAMPLE_NUM_STATS	ARRAY_SIZE(example_ethtool_stats)
#endif

/*
 * example_get_ethtool_stats - ethtool get_ethtool_stats callback
 * @dev:   network device
 * @stats: request descriptor (unused)
 * @data:  output array with room for EXAMPLE_NUM_STATS values
 *
 * Refreshes the counters from the hardware and then stores one value per
 * table entry, obtained through that entry's get_stat() accessor.
 */
static void example_get_ethtool_stats(struct net_device *dev,
				      struct ethtool_stats *stats, u64 *data)
{
	struct example_priv *priv = netdev_priv(dev);
	int i;

	example_update_stats(dev);

	for (i = 0; i < EXAMPLE_NUM_STATS; i++)
		data[i] = example_ethtool_stats[i].get_stat(priv);
}
/*
 * example_get_sset_count - ethtool get_sset_count callback
 * @dev:  network device (unused)
 * @sset: string set being queried
 *
 * Return: EXAMPLE_NUM_STATS for ETH_SS_STATS, -EOPNOTSUPP for every other
 * string set.
 */
static int example_get_sset_count(struct net_device *dev, int sset)
{
	if (sset == ETH_SS_STATS)
		return EXAMPLE_NUM_STATS;

	return -EOPNOTSUPP;
}
/*
 * example_get_strings - ethtool get_strings callback
 * @dev:       network device (unused)
 * @stringset: string set being queried
 * @data:      output buffer, ETH_GSTRING_LEN bytes per entry
 *
 * For ETH_SS_STATS the name of every exported statistic is copied into
 * consecutive ETH_GSTRING_LEN-sized slots; other string sets are ignored.
 *
 * NOTE(review): a fixed ETH_GSTRING_LEN bytes are copied from each name -
 * this presumes the name member is an array of at least that size; confirm
 * against the definition of struct example_ethtool_stats.
 */
static void example_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
	int idx;

	if (stringset != ETH_SS_STATS)
		return;

	for (idx = 0; idx < EXAMPLE_NUM_STATS; idx++)
		memcpy(data + idx * ETH_GSTRING_LEN,
		       example_ethtool_stats[idx].name, ETH_GSTRING_LEN);
}
5.4 ethtool支持实现
ethtool操作实现
c
/*
 * ethtool callback table of the example driver, grouped by topic.
 * Designated initializers make the order irrelevant to the compiler.
 */
static const struct ethtool_ops example_ethtool_ops = {
	/* identification and link */
	.get_drvinfo		= example_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_msglevel		= example_get_msglevel,
	.set_msglevel		= example_set_msglevel,

	/* register and EEPROM access */
	.get_regs_len		= example_get_regs_len,
	.get_regs		= example_get_regs,
	.get_eeprom_len		= example_get_eeprom_len,
	.get_eeprom		= example_get_eeprom,
	.set_eeprom		= example_set_eeprom,

	/* wake-on-LAN and flow control */
	.get_wol		= example_get_wol,
	.set_wol		= example_set_wol,
	.get_pauseparam		= example_get_pauseparam,
	.set_pauseparam		= example_set_pauseparam,

	/* rings and interrupt coalescing */
	.get_ringparam		= example_get_ringparam,
	.set_ringparam		= example_set_ringparam,
	.get_coalesce		= example_get_coalesce,
	.set_coalesce		= example_set_coalesce,

	/* statistics */
	.get_sset_count		= example_get_sset_count,
	.get_strings		= example_get_strings,
	.get_ethtool_stats	= example_get_ethtool_stats,
};
/*
 * example_get_drvinfo - ethtool get_drvinfo callback
 * @dev:  network device
 * @info: structure to fill in
 *
 * Reports driver name and version, the name of the parent device as bus
 * information, the firmware version as "major.minor", and the sizes of the
 * statistics, EEPROM and register dumps. No self-test is offered.
 */
static void example_get_drvinfo(struct net_device *dev,
				struct ethtool_drvinfo *info)
{
	struct example_priv *priv = netdev_priv(dev);
	const char *bus = dev_name(dev->dev.parent);

	/* identification strings */
	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
	strscpy(info->version, DRV_VERSION, sizeof(info->version));
	strscpy(info->bus_info, bus, sizeof(info->bus_info));
	snprintf(info->fw_version, sizeof(info->fw_version), "%d.%d",
		 priv->fw_major, priv->fw_minor);

	/* dump sizes */
	info->n_stats = EXAMPLE_NUM_STATS;
	info->testinfo_len = 0;
	info->eedump_len = example_get_eeprom_len(dev);
	info->regdump_len = example_get_regs_len(dev);
}
/*
 * example_get_regs - ethtool get_regs callback
 * @dev:  network device
 * @regs: request descriptor; regs->len is the dump size in bytes
 * @p:    output buffer
 *
 * Copies regs->len / 4 consecutive 32-bit registers, starting at the base of
 * the register window, into @p.
 */
static void example_get_regs(struct net_device *dev,
			     struct ethtool_regs *regs, void *p)
{
	struct example_priv *priv = netdev_priv(dev);
	u32 *out = p;
	int word = 0;

	while (word < regs->len / 4) {
		out[word] = readl(priv->base + word * 4);
		word++;
	}
}
/*
 * example_get_ringparam - ethtool get_ringparam callback
 * @dev:  network device
 * @ring: structure to fill in
 *
 * Reports the compile-time ring size limits and the currently configured
 * ring sizes. Mini rings are not supported and are reported as zero.
 */
static void example_get_ringparam(struct net_device *dev,
				  struct ethtool_ringparam *ring)
{
	struct example_priv *priv = netdev_priv(dev);

	/* upper limits */
	ring->rx_max_pending = MAX_RX_RING_SIZE;
	ring->rx_jumbo_max_pending = MAX_RX_JUMBO_RING_SIZE;
	ring->tx_max_pending = MAX_TX_RING_SIZE;
	ring->rx_mini_max_pending = 0;

	/* current configuration */
	ring->rx_pending = priv->rx_ring_size;
	ring->rx_jumbo_pending = priv->rx_jumbo_ring_size;
	ring->tx_pending = priv->tx_ring_size;
	ring->rx_mini_pending = 0;
}
static int example_set_ringparam(struct net_device *dev,
struct ethtool_ringparam *ring)
{
struct example_priv *priv = netdev_priv(dev);
/* 验证参数 */
if (ring->rx_pending > MAX_RX_RING_SIZE ||
ring->rx_pending < MIN_RX_RING_SIZE ||
ring->tx_pending > MAX_TX_RING_SIZE ||
ring->tx_pending < MIN_TX_RING_SIZE)
return -EINVAL;
/* 如果设备正在运行,需要重新配置 */
if (netif_running(dev)) {
/* 停止设备 */
example_close(dev);
/* 更新配置 */
priv->rx_ring_size = ring->rx_pending;
priv->tx_ring_size = ring->tx_pending;
/* 重新启动设备 */
return example_open(dev);
}
priv->rx_ring_size = ring->rx_pending;
priv->tx_ring_size = ring->tx_pending;
return 0;
}
6. 调试与测试方法
6.1 procfs/sysfs接口调试
procfs接口实现
c
/*
 * seq_file show callback for the driver's procfs entry.
 *
 * @seq: seq_file whose ->private holds the struct net_device
 * @v:   unused
 *
 * Prints the device name, driver name/version, link state, speed and duplex,
 * the RX/TX counters from dev->stats, and a live read of three hardware
 * registers.
 *
 * The counters in struct net_device_stats are unsigned long, so they are
 * printed with %lu; %llu would mismatch the argument width on 32-bit builds.
 *
 * Return: always 0.
 */
static int example_proc_show(struct seq_file *seq, void *v)
{
	struct net_device *dev = seq->private;
	struct example_priv *priv = netdev_priv(dev);
	struct net_device_stats *stats = &dev->stats;

	seq_puts(seq, "Example Network Driver Statistics\n");
	seq_puts(seq, "=================================\n");
	seq_printf(seq, "Device: %s\n", dev->name);
	seq_printf(seq, "Driver: %s version %s\n", DRV_NAME, DRV_VERSION);
	seq_printf(seq, "Link: %s\n", netif_carrier_ok(dev) ? "up" : "down");
	seq_printf(seq, "Speed: %d Mbps\n", priv->link_speed);
	seq_printf(seq, "Duplex: %s\n", priv->link_duplex ? "full" : "half");

	seq_puts(seq, "\nRX Statistics:\n");
	seq_printf(seq, " Packets: %lu\n", stats->rx_packets);
	seq_printf(seq, " Bytes: %lu\n", stats->rx_bytes);
	seq_printf(seq, " Errors: %lu\n", stats->rx_errors);
	seq_printf(seq, " Dropped: %lu\n", stats->rx_dropped);
	seq_printf(seq, " Multicast: %lu\n", stats->multicast);

	seq_puts(seq, "\nTX Statistics:\n");
	seq_printf(seq, " Packets: %lu\n", stats->tx_packets);
	seq_printf(seq, " Bytes: %lu\n", stats->tx_bytes);
	seq_printf(seq, " Errors: %lu\n", stats->tx_errors);
	seq_printf(seq, " Dropped: %lu\n", stats->tx_dropped);
	seq_printf(seq, " Collisions: %lu\n", stats->collisions);

	/*
	 * Live register snapshot.
	 * NOTE(review): if INTR_STATUS_REG is clear-on-read, reading it from
	 * here would swallow pending interrupt bits - confirm against the
	 * hardware manual.
	 */
	seq_puts(seq, "\nHardware Registers:\n");
	seq_printf(seq, " Control: 0x%08x\n", readl(priv->base + NET_CTRL_REG));
	seq_printf(seq, " Status: 0x%08x\n", readl(priv->base + NET_STATUS_REG));
	seq_printf(seq, " Interrupt: 0x%08x\n", readl(priv->base + INTR_STATUS_REG));

	return 0;
}
/*
 * open() handler for the procfs entry.
 *
 * Fetches the private pointer stored with the proc entry and hands it to
 * single_open(), which makes it available to example_proc_show() as
 * seq->private.
 *
 * Return: the result of single_open().
 */
static int example_proc_open(struct inode *inode, struct file *file)
{
	void *entry_data = PDE_DATA(inode);

	return single_open(file, example_proc_show, entry_data);
}
/*
 * File operations for the procfs entry: a single-record seq_file.
 * Open and release are paired (example_proc_open -> single_open,
 * single_release); read and lseek use the generic seq_file helpers.
 */
static const struct proc_ops example_proc_ops = {
	.proc_open	= example_proc_open,
	.proc_release	= single_release,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
};
/*
 * Create the read-only procfs entry "driver/<ifname>" for @dev.
 *
 * The entry is created with a NULL parent and mode 0444, and @dev is attached
 * as its private data. The returned handle is stored in priv->proc_entry
 * without being checked, so it may be NULL if creation failed.
 */
static void example_create_proc(struct net_device *dev)
{
	char path[32];
	struct example_priv *ep;

	snprintf(path, sizeof(path), "driver/%s", dev->name);

	ep = netdev_priv(dev);
	ep->proc_entry = proc_create_data(path, 0444, NULL,
					  &example_proc_ops, dev);
}
/*
 * Remove the procfs entry created by example_create_proc().
 *
 * The entry is removed through the handle saved in priv->proc_entry rather
 * than by rebuilding "driver/<ifname>" from dev->name: the interface may have
 * been renamed since the entry was created, in which case a name-based lookup
 * would miss and leave behind an entry that still points at @dev.
 *
 * proc_remove() accepts NULL, so this is safe to call when creation failed
 * or when the entry has already been removed.
 */
static void example_remove_proc(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	proc_remove(priv->proc_entry);
	priv->proc_entry = NULL;
}
sysfs接口实现
c
/*
 * sysfs show handler for the "link_status" attribute.
 *
 * @dev:  struct device embedded in the net_device
 * @attr: attribute being read (unused)
 * @buf:  output buffer supplied by sysfs
 *
 * Writes "up\n" or "down\n" according to the carrier state.
 *
 * Return: number of bytes written to @buf.
 */
static ssize_t link_status_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);

	/* Only the carrier state is needed; no driver-private data is read. */
	return sprintf(buf, "%s\n", netif_carrier_ok(netdev) ? "up" : "down");
}
/*
 * sysfs show handler for the "link_speed" attribute.
 *
 * @dev:  struct device embedded in the net_device
 * @attr: attribute being read (unused)
 * @buf:  output buffer supplied by sysfs
 *
 * Writes priv->link_speed as a decimal number followed by a newline.
 *
 * Return: number of bytes written to @buf.
 */
static ssize_t link_speed_show(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct example_priv *ep = netdev_priv(to_net_dev(dev));

	return sprintf(buf, "%d\n", ep->link_speed);
}
/*
 * sysfs show handler for the "registers" attribute.
 *
 * @dev:  struct device embedded in the net_device
 * @attr: attribute being read (unused)
 * @buf:  output buffer supplied by sysfs
 *
 * Dumps the 32-bit registers at offsets 0x00..0xfc of the mapped window, one
 * "0x<offset>: 0x<value>" line per register (64 lines in total).
 *
 * Return: number of bytes written to @buf.
 */
static ssize_t registers_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct example_priv *ep = netdev_priv(to_net_dev(dev));
	char *cursor = buf;
	int offset = 0;

	while (offset < 0x100) {
		cursor += sprintf(cursor, "0x%04x: 0x%08x\n",
				  offset, readl(ep->base + offset));
		offset += 4;
	}

	return cursor - buf;
}
/*
 * Read-only device attributes. Each DEVICE_ATTR_RO(name) provides the
 * dev_attr_<name> object referenced in example_attrs[] and is backed by the
 * matching <name>_show() handler defined above.
 */
static DEVICE_ATTR_RO(link_status);
static DEVICE_ATTR_RO(link_speed);
static DEVICE_ATTR_RO(registers);
/* NULL-terminated list of the attributes exposed by example_attr_group. */
static struct attribute *example_attrs[] = {
&dev_attr_link_status.attr,
&dev_attr_link_speed.attr,
&dev_attr_registers.attr,
NULL
};
/*
 * Attribute group registered by example_sysfs_init(). The group is named
 * "example" and carries the attributes listed in example_attrs[].
 */
static const struct attribute_group example_attr_group = {
	.attrs	= example_attrs,
	.name	= "example",
};
/*
 * Register the driver's sysfs attribute group on the net_device's kobject.
 *
 * Return: the result of sysfs_create_group().
 */
static int example_sysfs_init(struct net_device *dev)
{
	struct kobject *kobj = &dev->dev.kobj;

	return sysfs_create_group(kobj, &example_attr_group);
}
/*
 * Remove the sysfs attribute group registered by example_sysfs_init().
 */
static void example_sysfs_exit(struct net_device *dev)
{
	struct kobject *kobj = &dev->dev.kobj;

	sysfs_remove_group(kobj, &example_attr_group);
}
6.2 内核网络跟踪点使用
跟踪点定义
c
/* 定义跟踪点 */
#include <linux/tracepoint.h>
/*
 * Tracepoint declarations for the RX and TX paths.
 *
 * Each DECLARE_TRACE() names a tracepoint and gives the prototype (TP_PROTO)
 * and argument names (TP_ARGS) of its hook. The *_entry points take the
 * device and the packet; the *_exit points additionally take the result
 * code of the operation.
 */
DECLARE_TRACE(network_rx_entry,
TP_PROTO(struct net_device *dev, struct sk_buff *skb),
TP_ARGS(dev, skb));
DECLARE_TRACE(network_rx_exit,
TP_PROTO(struct net_device *dev, struct sk_buff *skb, int ret),
TP_ARGS(dev, skb, ret));
DECLARE_TRACE(network_tx_entry,
TP_PROTO(struct net_device *dev, struct sk_buff *skb),
TP_ARGS(dev, skb));
DECLARE_TRACE(network_tx_exit,
TP_PROTO(struct net_device *dev, struct sk_buff *skb, int ret),
TP_ARGS(dev, skb, ret));
/* 在代码中使用跟踪点 */
static int example_rx(struct net_device *dev, int budget)
{
struct example_priv *priv = netdev_priv(dev);
int received = 0;
while (received < budget) {
struct sk_buff *skb;
int ret;
/* 跟踪接收开始 */
trace_network_rx_entry(dev, skb);
/* 处理接收 */
ret = example_process_rx_packet(dev, skb);
/* 跟踪接收结束 */
trace_network_rx_exit(dev, skb, ret);
if (ret < 0)
break;
received++;
}
return received;
}
/*
 * Instantiate the four tracepoints declared with DECLARE_TRACE() above.
 * NOTE(review): the single-argument DEFINE_TRACE(name) form depends on the
 * kernel version; newer kernels appear to require the prototype and argument
 * list as well - confirm against the target kernel's <linux/tracepoint.h>.
 */
DEFINE_TRACE(network_rx_entry);
DEFINE_TRACE(network_rx_exit);
DEFINE_TRACE(network_tx_entry);
DEFINE_TRACE(network_tx_exit);
使用ftrace调试
bash
#!/bin/bash
# ftrace debugging helper: enable the "network" event group, clear the trace
# buffer, generate some traffic with ping, dump the trace, then disable the
# events again. Needs write access to the tracing files under debugfs.

events_enable=/sys/kernel/debug/tracing/events/network/enable
trace_buffer=/sys/kernel/debug/tracing/trace

# Turn on every event in the "network" group.
echo "启用网络跟踪点"
echo 1 > "$events_enable"

# Writing to the trace file empties the buffer, so the dump below only
# contains events from this run.
echo "开始跟踪"
echo > "$trace_buffer"

# Generate traffic to be traced.
echo "运行测试"
ping -c 10 192.168.1.1

# Show what was recorded.
echo "查看跟踪结果"
cat "$trace_buffer"

# Turn the events back off.
echo "禁用跟踪点"
echo 0 > "$events_enable"
6.3 性能基准测试方法
吞吐量测试
c
/* 内核模块性能测试 */
#include <linux/module.h>
#include <linux/time.h>
#include <linux/netdevice.h>
/*
 * State for one performance-test run.
 * NOTE(review): the code that uses this structure is not visible here; the
 * per-field notes below are inferred from names and types only - confirm
 * against the users.
 */
struct perf_test {
/* kernel timer; presumably ends or samples the run */
struct timer_list timer;
/* network device under test */
struct net_device *dev;
/* packet counter, updated atomically */
atomic_t packets;
/* byte counter, updated atomically; NOTE(review): atomic_t is 32-bit wide */
atomic_t bytes;
/* start timestamp; unit not visible here - TODO confirm */
u64 start_time;
/* end timestamp; same unit as start_time presumably */
u64 end_time;
/* requested run length; unit not visible here - TODO confirm */
int duration;
};