赞
踩
本文主要分析macvlan代码实现。分为如下几部分
a. 分析命令行参数如何传递
b. 分析kernel端代码如何解析命令行参数,并创建macvlan虚接口
c. 分析将网卡up起来时需要设置哪些东西
d. 分析报文接收流程。几种模式下不同的操作
e. 分析报文发送流程。只有bridge模式有特殊操作,其他模式直接从父接口发送出去
下面是比较重要的几点注意事项
a. 如果没有指定mtu,则使用父接口的mtu,如果指定了,则不能大于父接口的mtu。
b. passthru 模式下,同一个父接口上不能再创建别的macvlan子接口;将该子接口up起来时,如果没有设置 MACVLAN_FLAG_NOPROMISC 标志,则也要使能父接口的混杂模式。
c. 在macvlan接口上使能混杂或者allmulticast时,会将组播filter(vlan->mc_filter)设置为全1,这样就可以接收所有组播/广播报文。否则filter中只设置广播地址和已添加到组播列表(dev->mc)中的组播地址,只能接收与之匹配的组播/广播报文。
d. 如果没设置mode,则默认为 VEPA。
e. 将macvlan子接口up起来后,会将macvlan子接口的mac地址添加到父接口的单播链表中,以便父接口可以接收到macvlan的报文。如果父接口支持单播过滤(IFF_UNICAST_FLT),则将macvlan子接口的mac地址添加到父接口网卡内部过滤表中(可通过bridge fdb list查看单播地址列表),如果父接口不支持单播过滤,则将父接口设置为混杂模式(通过ip -d link show查看promiscuity为1)。这样父接口才会接收目的mac为macvlan子接口的报文。
数据结构
image.png
通过下面命令添加一个macvlan设备
ip link add link ens8 dev macvlan1 type macvlan
命令行填充参数代码如下
- link = ens8
- ifindex = ll_name_to_index(link);
- addattr32(&req->n, sizeof(*req), IFLA_LINK, ifindex);
- req->i.ifi_index = 0; //这里仍然是0
-
- name = dev = macvlan1
- addattr_l(&req->n, sizeof(*req), IFLA_IFNAME, name, strlen(name) + 1);
-
- type = macvlan
- linkinfo = addattr_nest(&req.n, sizeof(req), IFLA_LINKINFO);
- addattr_l(&req.n, sizeof(req), IFLA_INFO_KIND, type, strlen(type));
- //macvlan私有参数
- struct link_util *lu = get_link_kind(type);
- iflatype = IFLA_INFO_DATA;
- data = addattr_nest(&req.n, sizeof(req), iflatype);
- lu->parse_opt(lu, argc, argv, &req.n) //macvlan_parse_opt
- addattr32(n, 1024, IFLA_MACVLAN_MODE, mode);
- addattr16(n, 1024, IFLA_MACVLAN_FLAGS, flags);
- addattr32(n, 1024, IFLA_MACVLAN_MACADDR_MODE, mac_mode);
- //如果 mac_mode 为 MACVLAN_MACADDR_ADD 或者 MACVLAN_MACADDR_DEL,则只添加/删除一个mac
- addattr_l(n, 1024, IFLA_MACVLAN_MACADDR, &mac, ETH_ALEN);
- //如果 mac_mode 为 MACVLAN_MACADDR_SET,则循环添加多个mac
- nmac = addattr_nest(n, 1024, IFLA_MACVLAN_MACADDR_DATA);
- while
- addattr_l(n, 1024, IFLA_MACVLAN_MACADDR, &mac, ETH_ALEN);
- addattr_nest_end(n, nmac);
- addattr_nest_end(&req.n, data);
- addattr_nest_end(&req.n, linkinfo);
- root@ubuntu:~# modprobe macvlan
- root@ubuntu:~# lsmod | grep macvlan
- macvlan 24576 0
驱动初始化代码如下,主要是注册macvlan_link_ops 到静态链表link_ops,添加macvlan设备时,会查找到此ops进行相关操作。
- static struct rtnl_link_ops macvlan_link_ops = {
- .kind = "macvlan",
- .setup = macvlan_setup,
- .newlink = macvlan_newlink,
- .dellink = macvlan_dellink,
- };
-
- module_init(macvlan_init_module);
- static int __init macvlan_init_module(void)
- macvlan_link_register(&macvlan_link_ops);
- /* common fields */
- ops->priv_size = sizeof(struct macvlan_dev);
- ops->validate = macvlan_validate;
- ops->maxtype = IFLA_MACVLAN_MAX;
- ops->policy = macvlan_policy;
- ops->changelink = macvlan_changelink;
- ops->get_size = macvlan_get_size;
- ops->fill_info = macvlan_fill_info;
-
- return rtnl_link_register(ops);
- __rtnl_link_register(ops);
- if (rtnl_link_ops_get(ops->kind))
- return -EEXIST;
- //static LIST_HEAD(link_ops); 为静态全局链表头
- list_add_tail(&ops->list, &link_ops);
- static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
- nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
- ifm = nlmsg_data(nlh);
- //ifm->ifi_index 在创建macvlan时为0
- if (ifm->ifi_index > 0)
- dev = __dev_get_by_index(net, ifm->ifi_index);
- else {
- if (ifname[0])
- dev = __dev_get_by_name(net, ifname); //此时还没有创建此dev,所以dev为空
- else
- dev = NULL;
- }
- //获取嵌套的 linkinfo
- nla_parse_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy);
- //获取 kind,此处为 macvlan
- nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind); //对于macvlan来说,macvlan_link_ops
- if (ops) {
- //获取 kind 私有数据
- if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
- err = nla_parse_nested(attr, ops->maxtype,
- linkinfo[IFLA_INFO_DATA],
- ops->policy);
- if (err < 0)
- return err;
- data = attr;
- }
- //调用 macvlan_validate 验证私有数据
- if (ops->validate) {
- err = ops->validate(tb, data);
- if (err < 0)
- return err;
- }
- }
- //获取 net,如果没有指定其他net namespace,则使用当前net
- dest_net = rtnl_link_get_net(net, tb);
- if (tb[IFLA_NET_NS_PID])
- net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
- else if (tb[IFLA_NET_NS_FD])
- net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
- else
- net = get_net(src_net);
- return net;
- dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
- //获取tx/rx队列个数,如果没配置,则默认为1
- if (tb[IFLA_NUM_TX_QUEUES])
- num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
- else if (ops->get_num_tx_queues)
- num_tx_queues = ops->get_num_tx_queues();
-
- if (tb[IFLA_NUM_RX_QUEUES])
- num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
- else if (ops->get_num_rx_queues)
- num_rx_queues = ops->get_num_rx_queues();
- //ops->priv_size = sizeof(struct macvlan_dev);
- dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues);
- alloc_size = sizeof(struct net_device);
- if (sizeof_priv) {
- /* ensure 32-byte alignment of private area */
- alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
- alloc_size += sizeof_priv;
- }
- /* ensure 32-byte alignment of whole construct */
- alloc_size += NETDEV_ALIGN - 1;
-
- p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
- dev = PTR_ALIGN(p, NETDEV_ALIGN);
- dev_mc_init(dev);
- dev_uc_init(dev);
- dev_net_set(dev, &init_net);
- dev->gso_max_size = GSO_MAX_SIZE;
- dev->gso_max_segs = GSO_MAX_SEGS;
- dev->gso_min_segs = 0;
- setup(dev); //macvlan_setup
- macvlan_common_setup(dev);
- ether_setup(dev);
- dev->header_ops = &eth_header_ops;
- dev->type = ARPHRD_ETHER;
- dev->hard_header_len = ETH_HLEN;
- dev->mtu = ETH_DATA_LEN;
- dev->addr_len = ETH_ALEN;
- dev->tx_queue_len = 1000; /* Ethernet wants good queues */
- dev->flags = IFF_BROADCAST|IFF_MULTICAST;
- dev->priv_flags |= IFF_TX_SKB_SHARING;
- memset(dev->broadcast, 0xFF, ETH_ALEN);
- dev->priv_flags &= ~IFF_TX_SKB_SHARING;
- netif_keep_dst(dev);
- dev->priv_flags |= IFF_UNICAST_FLT;
- dev->netdev_ops = &macvlan_netdev_ops;
- dev->destructor = free_netdev;
- dev->header_ops = &macvlan_hard_header_ops;
- dev->ethtool_ops = &macvlan_ethtool_ops;
- dev->tx_queue_len = 0;
- strcpy(dev->name, name);
- dev_net_set(dev, net);
- dev->rtnl_link_ops = ops;
- dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
- //下面的参数,如果指定了就赋值即可
- if (tb[IFLA_MTU])
- dev->mtu = nla_get_u32(tb[IFLA_MTU]);
- if (tb[IFLA_ADDRESS]) {
- memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
- nla_len(tb[IFLA_ADDRESS]));
- dev->addr_assign_type = NET_ADDR_SET;
- }
- if (tb[IFLA_BROADCAST])
- memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
- nla_len(tb[IFLA_BROADCAST]));
- if (tb[IFLA_TXQLEN])
- dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
- if (tb[IFLA_OPERSTATE])
- set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
- if (tb[IFLA_LINKMODE])
- dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
- if (tb[IFLA_GROUP])
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
- if (ops->newlink) {
- ops->newlink(net, dev, tb, data); //macvlan_newlink
- macvlan_common_newlink(src_net, dev, tb, data);
- struct macvlan_dev *vlan = netdev_priv(dev);
- struct macvlan_port *port;
- struct net_device *lowerdev;
- //IFLA_LINK 表示macvlan的父接口,如果没有指定肯定不能创建
- if (!tb[IFLA_LINK])
- return -EINVAL;
- //获取父接口,会把父接口赋给 macvlan->lowerdev 保存下来,以便指定此macvlan虚拟接口的父接口是哪个设备
- struct net_device *lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
- //如果基于macvlan接口创建另一个macvlan接口,则使用真实的父接口
- if (netif_is_macvlan(lowerdev)) //return dev->priv_flags & IFF_MACVLAN;
- lowerdev = macvlan_dev_real_dev(lowerdev);
- struct macvlan_dev *macvlan = netdev_priv(dev);
- return macvlan->lowerdev;
- //如果没有指定mtu,则使用父接口的mtu
- //如果指定了,则不能大于父接口的mtu
- if (!tb[IFLA_MTU])
- dev->mtu = lowerdev->mtu;
- else if (dev->mtu > lowerdev->mtu)
- return -EINVAL;
- //如果没有指定mac地址,则随机生成一个
- if (!tb[IFLA_ADDRESS])
- eth_hw_addr_random(dev);
- dev->addr_assign_type = NET_ADDR_RANDOM;
- eth_random_addr(dev->dev_addr);
- get_random_bytes(addr, ETH_ALEN);
- addr[0] &= 0xfe; /* clear multicast bit */
- addr[0] |= 0x02; /* set local assignment bit (IEEE802) */
- //如果父接口没有此标志 IFF_MACVLAN_PORT,说明是第一个给这个设备添加macvlan子接口
- if (!macvlan_port_exists(lowerdev)) { //#define macvlan_port_exists(dev) (dev->priv_flags & IFF_MACVLAN_PORT)
- macvlan_port_create(lowerdev);
- //如果不是 ether 类型或者是 loopback 类型,则返回错误
- if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
- return -EINVAL;
- struct macvlan_port *port = kzalloc(sizeof(*port), GFP_KERNEL);
- port->passthru = false;
- port->dev = dev;
- //初始化链表,用于存放所有的macvlan子接口
- INIT_LIST_HEAD(&port->vlans);
- //初始化 hash 链表,用于存放所有的 up 状态的子接口(调用了 __dev_open)
- for (i = 0; i < MACVLAN_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&port->vlan_hash[i]);
- //初始化 hash 链表,用于存放 source 模式下配置的源mac表项(见 macvlan_hash_add_source)
- for (i = 0; i < MACVLAN_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&port->vlan_source_hash[i]);
- //组播/广播报文会放在链表,由 macvlan_process_broadcast 发给所有的up状态的macvlan子接口
- skb_queue_head_init(&port->bc_queue);
- INIT_WORK(&port->bc_work, macvlan_process_broadcast);
-
- //注册收包函数,这样在 netif_receive_skb 收到报文会调用 macvlan_handle_frame
- err = netdev_rx_handler_register(dev, macvlan_handle_frame, port);
- rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
- rcu_assign_pointer(dev->rx_handler, rx_handler);
- if (err)
- kfree(port);
- else
- //设置标志位
- dev->priv_flags |= IFF_MACVLAN_PORT;
- port = macvlan_port_get_rtnl(lowerdev); //rtnl_dereference(dev->rx_handler_data);
- /* Only 1 macvlan device can be created in passthru mode */
- //passthru 模式下,不能再创建别的macvlan子接口
- if (port->passthru)
- return -EINVAL;
- vlan->lowerdev = lowerdev;
- vlan->dev = dev;
- vlan->port = port;
- vlan->set_features = MACVLAN_FEATURES;
- vlan->nest_level = dev_get_nest_level(lowerdev, netif_is_macvlan) + 1;
- vlan->mode = MACVLAN_MODE_VEPA;
- if (data && data[IFLA_MACVLAN_MODE])
- vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
- if (data && data[IFLA_MACVLAN_FLAGS])
- vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
-
- if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
- //port->count不为0,说明已经有其他模式的macvlan子接口了,这种情况也不允许
- if (port->count)
- return -EINVAL;
- //设置标志位
- port->passthru = true;
- //将父接口的mac地址,赋给 passthru 模式的macvlan子接口
- eth_hw_addr_inherit(dev, lowerdev);
- dst->addr_assign_type = src->addr_assign_type;
- ether_addr_copy(dst->dev_addr, src->dev_addr);
- }
-
- if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
- if (vlan->mode != MACVLAN_MODE_SOURCE)
- return -EINVAL;
- macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
- macvlan_changelink_sources(vlan, macmode, data);
- if (mode == MACVLAN_MACADDR_ADD) {
- macvlan_hash_add_source(vlan, addr);
- struct macvlan_port *port = vlan->port;
- struct macvlan_source_entry *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- ether_addr_copy(entry->addr, addr);
- entry->vlan = vlan;
- h = &port->vlan_source_hash[macvlan_eth_hash(addr)];
- hlist_add_head_rcu(&entry->hlist, h);
- vlan->macaddr_count++;
-
- //增加macvlan子接口个数的计数
- port->count += 1;
- //注册macvlan子接口
- register_netdevice(dev);
- //设置标志位
- dev->priv_flags |= IFF_MACVLAN;
- netdev_upper_dev_link(lowerdev, dev);
- __netdev_upper_dev_link(dev, upper_dev, false, NULL);
- //将macvlan子接口添加到父接口的 vlans 链表
- list_add_tail_rcu(&vlan->list, &port->vlans);
- netif_stacked_transfer_operstate(lowerdev, dev);
- rtnl_configure_link(dev, ifm);
- old_flags = dev->flags;
- if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
- err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
- if (err < 0)
- return err;
- }
-
- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
-
- #通过 ip 命令修改网卡配置时
- #ip link set dev xxx up
- req->i.ifi_change |= IFF_UP;
- req->i.ifi_flags |= IFF_UP;
-
- static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh)
- dev = __dev_get_by_index(net, ifm->ifi_index);
- do_setlink(skb, dev, ifm, tb, ifname, status);
- if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
-
- #通过socket修改网卡配置时
- static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
- if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) {
- dev_ioctl(net, cmd, argp);
- switch (cmd) {
- case SIOCSIFFLAGS:
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- dev_load(net, ifr.ifr_name);
- dev_ifsioc(net, &ifr, cmd);
- struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
- switch (cmd) {
- case SIOCSIFFLAGS: /* Set interface flags */
- dev_change_flags(dev, ifr->ifr_flags);
- //都会调用dev_change_flags进行相关操作
- int dev_change_flags(struct net_device *dev, unsigned int flags)
- __dev_change_flags(dev, flags);
- unsigned int old_flags = dev->flags;
- dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
- IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | IFF_AUTOMEDIA)) |
- (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | IFF_ALLMULTI));
- if ((old_flags ^ flags) & IFF_MULTICAST)
- dev_change_rx_flags(dev, IFF_MULTICAST);
- ops->ndo_change_rx_flags(dev, flags); //macvlan_change_rx_flags
- dev_set_rx_mode(dev);
-
- //新旧flag不一样时,如果旧flag是up,说明新flag是down,需要调用 __dev_close
- //如果旧flag不是up,说明新flag是up,需要调用 __dev_open
- if ((old_flags ^ flags) & IFF_UP)
- ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
- static int __dev_open(struct net_device *dev)
- const struct net_device_ops *ops = dev->netdev_ops;
- ops->ndo_open(dev); //对于macvlan设备来说,macvlan_open
- struct macvlan_dev *vlan = netdev_priv(dev);
- struct net_device *lowerdev = vlan->lowerdev;
- int err;
- //passthru 模式下,如果没有设置 MACVLAN_FLAG_NOPROMISC 标志,则也要使能父接口的混杂模式
- if (vlan->port->passthru) {
- if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
- err = dev_set_promiscuity(lowerdev, 1);
- if (err < 0)
- goto out;
- }
- goto hash_add;
- }
- //和硬件offload相关的,暂时不管
- if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD &&
- dev->rtnl_link_ops == &macvlan_link_ops) {
- vlan->fwd_priv =
- lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
-
- /* If we get a NULL pointer back, or if we get an error
- * then we should just fall through to the non accelerated path
- */
- if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
- vlan->fwd_priv = NULL;
- } else
- return 0;
- }
- if (macvlan_addr_busy(vlan->port, dev->dev_addr))
- if (ether_addr_equal_64bits(port->dev->dev_addr, addr))
- return 1;
- if (macvlan_hash_lookup(port, addr))
- return 1;
- return 0;
- goto out;
- //将macvlan子接口的mac地址添加到父接口的单播链表中,以便父接口可以接收到macvlan的报文
- err = dev_uc_add(lowerdev, dev->dev_addr);
- __hw_addr_add(&dev->uc, addr, dev->addr_len, NETDEV_HW_ADDR_T_UNICAST);
- __dev_set_rx_mode(dev);
- //如果dev不支持单播过滤功能,即IFF_UNICAST_FLT
- if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
- /* Unicast addresses changes may only happen under the rtnl,
- * therefore calling __dev_set_promiscuity here is safe.
- */
- //单播链表不为空,并且uc_promisc为false,则使能混杂模式。
- //对于macvlan设备来说,这里就是使能父接口的混杂模式
- if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
- __dev_set_promiscuity(dev, 1, false);
- dev->uc_promisc = true;
- } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
- __dev_set_promiscuity(dev, -1, false);
- dev->uc_promisc = false;
- }
- }
- if (dev->flags & IFF_ALLMULTI) {
- err = dev_set_allmulti(lowerdev, 1);
- if (err < 0)
- goto del_unicast;
- }
- hash_add:
- //macvlan设备open后,将设备添加到父接口的 vlan_hash hash链表中
- macvlan_hash_add(vlan);
- struct macvlan_port *port = vlan->port;
- const unsigned char *addr = vlan->dev->dev_addr;
- u32 idx = macvlan_eth_hash(addr);
- hlist_add_head_rcu(&vlan->hlist, &port->vlan_hash[idx]);
- return 0;
-
- if ((flags ^ dev->gflags) & IFF_PROMISC) {
- int inc = (flags & IFF_PROMISC) ? 1 : -1;
- unsigned int old_flags = dev->flags;
-
- dev->gflags ^= IFF_PROMISC;
-
- if (__dev_set_promiscuity(dev, inc, false) >= 0)
- dev->flags |= IFF_PROMISC;
- dev->promiscuity += inc;
- if (dev->flags != old_flags)
- dev_set_rx_mode(dev);
- }
- if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
- int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
- dev->gflags ^= IFF_ALLMULTI;
- __dev_set_allmulti(dev, inc, false);
- dev->flags |= IFF_ALLMULTI;
- dev->allmulti += inc;
- if (dev->flags ^ old_flags) {
- dev_change_rx_flags(dev, IFF_ALLMULTI);
- ops->ndo_change_rx_flags(dev, flags); //macvlan_change_rx_flags
- //如果macvlan使能了 ALLMULTI,则也要设置父接口使能 ALLMULTI
- struct macvlan_dev *vlan = netdev_priv(dev);
- //取出父接口
- struct net_device *lowerdev = vlan->lowerdev;
- if (dev->flags & IFF_UP) {
- if (change & IFF_ALLMULTI)
- dev_set_allmulti(lowerdev, dev->flags & IFF_ALLMULTI ? 1 : -1);
- __dev_set_allmulti(dev, inc, true);
- dev_set_rx_mode(dev);
- __dev_set_rx_mode(dev);
- ops->ndo_set_rx_mode(dev); //macvlan_set_mac_lists
- //如果使能了混杂模式或者allmulti,则设置 vlan->mc_filter 所有 bit 位为1,表示可以接收所有组播/广播报文
- if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
- bitmap_fill(vlan->mc_filter, MACVLAN_MC_FILTER_SZ);
- } else {
- struct netdev_hw_addr *ha;
- DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ);
- //如果没有使能混杂模式或者allmulti,则只设置设备的dev->mc组播列表地址和广播地址
- bitmap_zero(filter, MACVLAN_MC_FILTER_SZ);
- netdev_for_each_mc_addr(ha, dev) {
- __set_bit(mc_hash(vlan, ha->addr), filter);
- }
-
- __set_bit(mc_hash(vlan, dev->broadcast), filter);
-
- bitmap_copy(vlan->mc_filter, filter, MACVLAN_MC_FILTER_SZ);
- }
- //将macvlan子接口的组播列表和单播列表传递给父接口,以便父接口可以接收到这些报文
- dev_uc_sync(vlan->lowerdev, dev);
- dev_mc_sync(vlan->lowerdev, dev);
对于组播/广播报文:
bridge模式下的macvlan子接口发出去的报文,经过外部交换机反射回来后,会被主接口和其他模式为vepa的macvlan子接口收到。
vepa模式下的macvlan子接口发出去的报文,经过外部交换机反射回来后,会被主接口和其他模式为vepa和bridge的macvlan子接口收到。
private模式下的macvlan子接口发出去的报文,经过外部交换机反射回来后,只会被发送报文的macvlan子接口收到。
passthru模式下的macvlan子接口发出去的报文,经过外部交换机反射回来后,只会被发送报文的macvlan子接口收到。
- static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
- another_round:
- skb->skb_iif = skb->dev->ifindex;
- ...
- ...
- //对于macvlan来说,macvlan_handle_frame
- rx_handler = rcu_dereference(skb->dev->rx_handler);
- if (rx_handler) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
- }
- switch (rx_handler(&skb)) {
- //说明报文已经被rx_handler消费掉(已转交给目的设备或者已被释放),不需要再继续处理
- case RX_HANDLER_CONSUMED:
- ret = NET_RX_SUCCESS;
- goto out;
- //说明报文找到了目的地,需要重新走一遍协议栈
- case RX_HANDLER_ANOTHER:
- goto another_round;
- case RX_HANDLER_EXACT:
- deliver_exact = true;
- //表明rx_handler没有消费此报文(组播/广播时只是复制了一份放入队列),可以继续后续协议栈流程
- case RX_HANDLER_PASS:
- break;
- default:
- BUG();
- }
- }
-
- static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
- struct macvlan_port *port = macvlan_port_get_rcu(skb->dev);
- rcu_dereference(dev->rx_handler_data);
- //组播/广播处理流程
- if (is_multicast_ether_addr(eth->h_dest)) {
- skb = ip_check_defrag(skb, IP_DEFRAG_MACVLAN);
- if (!skb)
- return RX_HANDLER_CONSUMED;
- eth = eth_hdr(skb);
- macvlan_forward_source(skb, port, eth->h_source);
- //根据源mac到主接口的 vlan_hash 查找
- src = macvlan_hash_lookup(port, eth->h_source);
- if (src && src->mode != MACVLAN_MODE_VEPA &&
- src->mode != MACVLAN_MODE_BRIDGE) {
- /* forward to original port. */
- struct macvlan_dev *vlan = src;
- ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:netif_rx(skb);
- handle_res = RX_HANDLER_CONSUMED;
- goto out;
- }
-
- MACVLAN_SKB_CB(skb)->src = src;
- macvlan_broadcast_enqueue(port, skb);
- nskb = skb_clone(skb, GFP_ATOMIC);
- //将skb复制一份,存入队列
- if (skb_queue_len(&port->bc_queue) < MACVLAN_BC_QUEUE_LEN) {
- __skb_queue_tail(&port->bc_queue, nskb);
- //bc_work 为 macvlan_process_broadcast,从队列取出报文,根据macvlan mode进行转发
- schedule_work(&port->bc_work);
- return RX_HANDLER_PASS;
- }
- //单播处理流程
- macvlan_forward_source(skb, port, eth->h_source);
-
- if (port->passthru)
- //list_first_or_null_rcu 获取链表的第一个元素,对于 passthru 模式来说,也只有一个 macvlan 子接口
- vlan = list_first_or_null_rcu(&port->vlans, struct macvlan_dev, list);
- else
- //根据目的mac 到父接口的 hash 链表 vlan_hash 中查找是否有匹配的macvlan子接口的mac
- vlan = macvlan_hash_lookup(port, eth->h_dest);
- //如果没有找到合适的macvlan子接口,则返回 RX_HANDLER_PASS,表明此报文没有被macvlan处理,可以继续后面协议栈流程
- if (vlan == NULL)
- return RX_HANDLER_PASS;
-
- //目的地是此设备,但是设备是down的,则释放skb,并返回RX_HANDLER_CONSUMED,表明报文已经被释放
- dev = vlan->dev;
- if (unlikely(!(dev->flags & IFF_UP))) {
- kfree_skb(skb);
- return RX_HANDLER_CONSUMED;
- }
- len = skb->len + ETH_HLEN;
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (!skb) {
- ret = NET_RX_DROP;
- handle_res = RX_HANDLER_CONSUMED;
- goto out;
- }
- //将真正的目的设备 dev 赋给 skb->dev
- skb->dev = dev;
- //表明报文是发给本机的
- skb->pkt_type = PACKET_HOST;
-
- ret = NET_RX_SUCCESS;
- //说明报文找到了目的地,需要重新走一遍协议栈
- handle_res = RX_HANDLER_ANOTHER;
- out:
- //增加报文计数
- macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
- return handle_res;
-
- #组播/广播处理流程
- static void macvlan_process_broadcast(struct work_struct *w)
- struct macvlan_port *port = container_of(w, struct macvlan_port, bc_work);
- struct sk_buff *skb;
- struct sk_buff_head list;
-
- __skb_queue_head_init(&list);
- //将链表port->bc_queue上的报文全部删除,存入本地链表 list
- spin_lock_bh(&port->bc_queue.lock);
- skb_queue_splice_tail_init(&port->bc_queue, &list);
- spin_unlock_bh(&port->bc_queue.lock);
-
- //从本地链表list中循环取出报文进行转发
- while ((skb = __skb_dequeue(&list))) {
- const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;
- rcu_read_lock();
- //src为空,说明报文是从外部主机发过来的,此时下面列出的几种模式的macvlan子接口都可以收到此
- //报文(目的mac还得满足 vlan->mc_filter)
- if (!src)
- //外部来的报文,不会发送到模式为 source 的macvlan子接口
- /* frame comes from an external address */
- macvlan_broadcast(skb, port, NULL,
- MACVLAN_MODE_PRIVATE |
- MACVLAN_MODE_VEPA |
- MACVLAN_MODE_PASSTHRU|
- MACVLAN_MODE_BRIDGE);
- //如果src不为空,说明是本机发出去的报文,经过外面交换机处理后又收到了。
- //如果发送此报文的macvlan子接口模式为 MACVLAN_MODE_VEPA,则将
- //报文发送给模式为 MACVLAN_MODE_VEPA 和 MACVLAN_MODE_BRIDGE 的macvlan子接口
- else if (src->mode == MACVLAN_MODE_VEPA)
- /* flood to everyone except source */
- macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA | MACVLAN_MODE_BRIDGE);
- else
- /*
- * flood only to VEPA ports, bridge ports
- * already saw the frame on the way out.
- */
- //如果发送此报文的macvlan子接口模式不为 MACVLAN_MODE_VEPA,则将
- //报文发送给模式为 MACVLAN_MODE_VEPA 的 macvlan 子接口
- macvlan_broadcast(skb, port, src->dev, MACVLAN_MODE_VEPA);
-
- rcu_read_unlock();
- kfree_skb(skb);
- }
-
- static void macvlan_broadcast(struct sk_buff *skb, const struct macvlan_port *port,
- struct net_device *src, enum macvlan_mode mode)
- {
- const struct ethhdr *eth = eth_hdr(skb);
- const struct macvlan_dev *vlan;
- struct sk_buff *nskb;
- unsigned int i;
- int err;
- unsigned int hash;
-
- if (skb->protocol == htons(ETH_P_PAUSE))
- return;
-
- for (i = 0; i < MACVLAN_HASH_SIZE; i++) {
- hlist_for_each_entry_rcu(vlan, &port->vlan_hash[i], hlist) {
- if (vlan->dev == src || !(vlan->mode & mode))
- continue;
-
- hash = mc_hash(vlan, eth->h_dest);
- //目的mac得匹配vlan->mc_filter,对于广播来说,已经设置好了。
- //对于组播,需要该组播地址已添加到dev->mc组播列表,或者使能了混杂模式或者allmulticast。
- if (!test_bit(hash, vlan->mc_filter))
- continue;
-
- err = NET_RX_DROP;
- nskb = skb_clone(skb, GFP_ATOMIC);
- if (likely(nskb))
- err = macvlan_broadcast_one(nskb, vlan, eth, mode == MACVLAN_MODE_BRIDGE) ?: netif_rx_ni(nskb);
- macvlan_count_rx(vlan, skb->len + ETH_HLEN,
- err == NET_RX_SUCCESS, true);
- }
- }
- }
- static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, struct net_device *dev)
- {
- unsigned int len = skb->len;
- int ret;
- struct macvlan_dev *vlan = netdev_priv(dev);
-
- if (unlikely(netpoll_tx_running(dev)))
- return macvlan_netpoll_send_skb(vlan, skb);
- //和网卡offload相关的,暂时不看
- if (vlan->fwd_priv) {
- skb->dev = vlan->lowerdev;
- ret = dev_queue_xmit_accel(skb, vlan->fwd_priv);
- } else {//直接看这个流程
- ret = macvlan_queue_xmit(skb, dev);
- }
-
- if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
- struct vlan_pcpu_stats *pcpu_stats;
-
- pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
- u64_stats_update_begin(&pcpu_stats->syncp);
- pcpu_stats->tx_packets++;
- pcpu_stats->tx_bytes += len;
- u64_stats_update_end(&pcpu_stats->syncp);
- } else {
- this_cpu_inc(vlan->pcpu_stats->tx_dropped);
- }
- return ret;
- }
- 在bridge模式下,如果是广播/组播,则将报文发送给其他所
- 有macvlan子接口,然后从父接口发送出去。如果是单播,则
- 查找目的mac是否为其他macvlan子接口的地址,如果找到
- 了,则发送给这个子接口,不再通过父接口发送出去。如果
- 查找不到,说明是外部的报文,需要跳转到xmit_world,通过
- 父接口发送到外部。
- 其他模式下,直接从父接口发送出去。
- static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
- {
- const struct macvlan_dev *vlan = netdev_priv(dev);
- const struct macvlan_port *port = vlan->port;
- const struct macvlan_dev *dest;
- //macvlan 为 bridge 模式
- if (vlan->mode == MACVLAN_MODE_BRIDGE) {
- const struct ethhdr *eth = (void *)skb->data;
- //如果是目的mac为组播,则需要转发给同一个父接口上的其他所有macvlan子接口,不包括本身,也不包括父接口
- //最后还要跳转到xmit_world,通过父接口发送到外部
- /* send to other bridge ports directly */
- if (is_multicast_ether_addr(eth->h_dest)) {
- macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
- goto xmit_world;
- }
- //如果是目的mac为单播,则查找目的mac是否为其他macvlan子接口的地址,如果找到了,则发送给这个子接口,不再通过父接口发送出去。
- //如果查找不到,说明是外部的报文,需要跳转到xmit_world,通过父接口发送到外部
- dest = macvlan_hash_lookup(port, eth->h_dest);
- if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
- /* send to lowerdev first for its network taps */
- dev_forward_skb(vlan->lowerdev, skb);
-
- return NET_XMIT_SUCCESS;
- }
- }
- xmit_world:
- skb->dev = vlan->lowerdev;
- return dev_queue_xmit(skb);
- }
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。