一、内核空间的流表同步
在 Open vSwitch 的数据包转发过程中,当数据包在 Datapath 模块无法完全处理时,会通过 upcall 调用将数据包交给用户空间的 vswitchd 守护进程,由 vswitchd 守护进程生成相应的流表和行为,并发送回内核空间。在这个过程中,内核空间和用户空间之间采用 Netlink 进行通信,并且用户空间下发的流表并不会直接发送到 Datapath 模块,而是先发送给内核空间的流表缓存(我们称之为 Flow Table),再交给 Datapath 模块。所谓的 Open vSwitch 流表同步指的是:在内核空间中,将流表项从流表缓存中同步到 Datapath 模块的过程。
二、流表同步 ovs_flow_cmd_new()
函数 ovs_flow_cmd_new() 负责处理 OVS 内核模块中的流表创建和更新操作,包括解析 Netlink 消息、分配和更新流表项、填充回复消息等内容,存储在 ovs-main/datapath/datapath.c 文件中:
c
/* ovs_flow_cmd_new() - Generic Netlink handler for OVS_FLOW_CMD_NEW.
 *
 * Parses the Netlink request carried in @skb/@info into a freshly
 * allocated sw_flow (match key, mask, flow identifier and actions), then
 * either inserts that flow into the datapath's flow table or, if an
 * equivalent flow already exists, replaces the existing flow's actions.
 * On success an optional reply is multicast back to user space.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) {
	struct net *net = sock_net(skb->sk);
	struct nlattr **a = info->attrs;
	struct ovs_header *ovs_header = info->userhdr;
	struct sw_flow *flow = NULL, *new_flow;
	struct sw_flow_mask mask;
	struct sk_buff *reply;
	struct datapath *dp;
	struct sw_flow_actions *acts;
	struct sw_flow_match match;
	u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
	int error;
	/* A "probe" request asks us not to log parse failures. */
	bool log = !a[OVS_FLOW_ATTR_PROBE];

	/* Must have key and actions. */
	error = -EINVAL;
	if (!a[OVS_FLOW_ATTR_KEY]) {
		OVS_NLERR(log, "Flow key attr not present in new flow.");
		goto error;
	}
	if (!a[OVS_FLOW_ATTR_ACTIONS]) {
		OVS_NLERR(log, "Flow actions attr not present in new flow.");
		goto error;
	}

	/* Most of the time we need to allocate a new flow, do it before locking. */
	new_flow = ovs_flow_alloc();
	if (IS_ERR(new_flow)) {
		error = PTR_ERR(new_flow);
		goto error;
	}

	/* Extract key. */
	ovs_match_init(&match, &new_flow->key, false, &mask);
	error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
	a[OVS_FLOW_ATTR_MASK], log);
	if (error)
		goto err_kfree_flow;

	/* Extract flow identifier. */
	error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID],
	&new_flow->key, log);
	if (error)
		goto err_kfree_flow;

	/* unmasked key is needed to match when ufid is not used. */
	if (ovs_identifier_is_key(&new_flow->id))
		match.key = new_flow->id.unmasked_key;

	/* Apply the mask to the key in place before insertion/lookup. */
	ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask);

	/* Validate actions. */
	error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS],
	&new_flow->key, &acts, log);
	if (error) {
		OVS_NLERR(log, "Flow actions may not be safe on all matching packets.");
		goto err_kfree_flow;
	}

	/* Allocate the reply skb before taking ovs_lock, keeping the locked
	 * section short.  NOTE(review): reply may be NULL here, not only
	 * an ERR_PTR -- the 'false' argument suggests allocation is skipped
	 * when no notification is needed; confirm against
	 * ovs_flow_cmd_alloc_info().
	 */
	reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, ufid_flags);
	if (IS_ERR(reply)) {
		error = PTR_ERR(reply);
		goto err_kfree_acts;
	}

	ovs_lock();
	dp = get_dp(net, ovs_header->dp_ifindex);
	if (unlikely(!dp)) {
		error = -ENODEV;
		goto err_unlock_ovs;
	}

	/* Check if this is a duplicate flow: first by UFID when one was
	 * supplied, then by the masked key. */
	if (ovs_identifier_is_ufid(&new_flow->id))
		flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id);
	if (!flow)
		flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key);
	if (likely(!flow)) {
		/* No existing flow: publish the actions on the new flow. */
		rcu_assign_pointer(new_flow->sf_acts, acts);

		/* Put flow in bucket. */
		error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask);
		if (unlikely(error)) {
			/* new_flow->sf_acts now references acts; clear the
			 * local so the unwind path below does not free the
			 * actions a second time. */
			acts = NULL;
			goto err_unlock_ovs;
		}

		if (unlikely(reply)) {
			error = ovs_flow_cmd_fill_info(new_flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags);
			/* Reply skb was sized for this flow; filling it must not fail. */
			BUG_ON(error < 0);
		}
		ovs_unlock();
	} else {
		struct sw_flow_actions *old_acts;

		/* Bail out if we're not allowed to modify an existing flow.
		 * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL because Generic Netlink treats the latter as a dump request.
		 * We also accept NLM_F_EXCL in case that bug ever gets fixed.
		 */
		if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL))) {
			error = -EEXIST;
			goto err_unlock_ovs;
		}

		/* The flow identifier has to be the same for flow updates.
		 * Look for any overlapping flow.
		 */
		if (unlikely(!ovs_flow_cmp(flow, &match))) {
			if (ovs_identifier_is_key(&flow->id))
				flow = ovs_flow_tbl_lookup_exact(&dp->table, &match);
			else /* UFID matches but key is different */
				flow = NULL;
			if (!flow) {
				error = -ENOENT;
				goto err_unlock_ovs;
			}
		}

		/* Update actions: swap the new actions in under ovs_lock;
		 * the old ones are freed only after an RCU grace period
		 * (see ovs_nla_free_flow_actions_rcu() below). */
		old_acts = ovsl_dereference(flow->sf_acts);
		rcu_assign_pointer(flow->sf_acts, acts);

		if (unlikely(reply)) {
			error = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, info->snd_seq, 0, OVS_FLOW_CMD_NEW, ufid_flags);
			BUG_ON(error < 0);
		}
		ovs_unlock();

		ovs_nla_free_flow_actions_rcu(old_acts);
		/* The flow pre-allocated before locking was not needed. */
		ovs_flow_free(new_flow, false);
	}

	if (reply)
		ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
	return 0;

	/* Error unwinding, in reverse order of acquisition. */
err_unlock_ovs:
	ovs_unlock();
	kfree_skb(reply);
err_kfree_acts:
	ovs_nla_free_flow_actions(acts);
err_kfree_flow:
	ovs_flow_free(new_flow, false);
error:
	return error;
}
函数的第一个输入参数 struct sk_buff *skb 代表接收到的数据包(包含 Netlink 消息的 socket 缓冲区),第二个输入参数 struct genl_info *info 代表数据包相应的信息和属性(包含 Netlink 消息的元数据)。
函数首先使用 ovs_flow_alloc() 分配一个新的 sw_flow 结构体,该函数存储在 ovs-main/datapath/flow_table.c 文件中:
c
/* ovs_flow_alloc() - allocate and initialize an empty sw_flow.
 *
 * The flow itself comes zeroed from 'flow_cache'; a single statistics
 * node is allocated up front for CPU 0 (preferring node 0 when it is
 * online) and published via RCU.  Returns the new flow, or an ERR_PTR
 * encoding -ENOMEM on allocation failure.
 */
struct sw_flow *ovs_flow_alloc(void) {
	struct sw_flow_stats *initial_stats;
	struct sw_flow *new_flow;

	new_flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
	if (!new_flow)
		return ERR_PTR(-ENOMEM);

	/* No CPU has written statistics yet. */
	new_flow->stats_last_writer = -1;

	/* Initialize the default stat node. */
	initial_stats = kmem_cache_alloc_node(flow_stats_cache,
					      GFP_KERNEL | __GFP_ZERO,
					      node_online(0) ? 0 : NUMA_NO_NODE);
	if (!initial_stats) {
		kmem_cache_free(flow_cache, new_flow);
		return ERR_PTR(-ENOMEM);
	}

	spin_lock_init(&initial_stats->lock);
	RCU_INIT_POINTER(new_flow->stats[0], initial_stats);
	cpumask_set_cpu(0, &new_flow->cpu_used_mask);

	return new_flow;
}
相应的 sw_flow 结构体定义在 ovs-main/datapath/flow.h 头文件中:
c
/* One flow entry in the kernel flow table: the match key/mask, the flow
 * identifier, the actions to apply, and per-CPU statistics.  A flow is
 * reachable both through the key-based table and the UFID-based table.
 */
struct sw_flow {
	struct rcu_head rcu;	/* for RCU-deferred freeing of the flow */
	/* Hash-table linkage, one instance per lookup table (by key and by
	 * UFID).  NOTE(review): node[2] presumably lets the entry be linked
	 * into two table instances at once (e.g. during a resize) -- confirm
	 * against flow_table.c.
	 */
	struct {
		struct hlist_node node[2];
		u32 hash;
	} flow_table, ufid_table;
	int stats_last_writer; /* CPU id of the last writer on 'stats[0]'. */
	struct sw_flow_key key;	/* match key, masked before insertion (see ovs_flow_mask_key()) */
	struct sw_flow_id id;	/* flow identifier: UFID or unmasked key */
	struct cpumask cpu_used_mask;	/* CPUs with a 'stats' entry in use; CPU 0 is set at allocation */
	struct sw_flow_mask *mask;	/* mask applied to 'key' */
	struct sw_flow_actions __rcu *sf_acts;	/* actions, read under RCU, updated under ovs_lock */
	struct sw_flow_stats __rcu *stats[]; /* One for each CPU.
	* First one is allocated at flow creation time, the rest are allocated on demand while holding the 'stats[0].lock'. */
};
然后函数对 Netlink 消息进行解析,并将解析结果存储在这个 sw_flow 结构体中(相应函数细节此处不做展开):
- 函数 ovs_nla_get_match() 用于从 Netlink 消息中解析 flow 的匹配键 match 和掩码 mask
- 函数 ovs_nla_get_identifier() 用于从 Netlink 消息中解析 flow 的标识符 id
- 函数 ovs_flow_mask_key() 用于掩码化 flow 的匹配键 match
- 函数 ovs_nla_copy_actions() 用于从 Netlink 消息中解析 flow 的行为 acts
接下来函数分配一个 sk_buff 用于回复消息,并使用 ovs_flow_cmd_alloc_info() 填充相关信息。然后获取 Datapath 的指针并进行搜索,将 sw_flow 结构体中记录的流表项同步到 Datapath 模块(相应函数细节此处不做展开):
- 如果在 Datapath 中找不到对应的流表项,则将新创建的 flow 插入到 flow 表中
- 如果在 Datapath 中找得到对应的流表项,则先检查 Netlink 标志是否允许修改现有流表项,通过检查后再更新现有 flow 的动作,并释放之前预分配的新 flow
最后,如果有需要的话,则使用 ovs_flow_cmd_fill_info() 填充回复信息,并通过 ovs_notify() 将消息发送出去。此外还要进行错误处理和资源回收。
总结:
通过前面的分析可以看到,在 Open vSwitch 的内核空间中,流表解析和流表同步之间的过程衔接是非常紧密的(实现在同一个函数 ovs_flow_cmd_new() 中)。这个函数首先根据接收的 Netlink 消息将信息存入缓存,也就是上面提到的 sw_flow 结构体(存储在 Flow Table 中),然后将这个结构体的信息同步到 Datapath 内核模块。也就是说,Flow Table 和 Datapath 之间不涉及复杂的分层设计和通信机制,仅仅是在内核空间增加了一个额外的缓存,用于将从用户空间发送过来的数据进行整理,然后再同步到 Datapath 模块。
Tips :流表同步相关的整个过程都是在内核空间中进行的。
由于本人水平有限,以上内容如有不足之处欢迎大家指正(评论区/私信均可)。