linux-brain/net/bridge/br_if.c

777 lines
18 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Userspace interface
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
*/
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/netpoll.h>
#include <linux/ethtool.h>
#include <linux/if_arp.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rtnetlink.h>
#include <linux/if_ether.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 17:04:11 +09:00
#include <linux/slab.h>
#include <net/dsa.h>
#include <net/sock.h>
#include <linux/if_vlan.h>
#include <net/switchdev.h>
#include <net/net_namespace.h>
#include "br_private.h"
/*
* Determine initial path cost based on speed.
* using recommendations from 802.1d standard
*
* Since driver might sleep need to not be holding any locks.
*/
static int port_cost(struct net_device *dev)
{
struct ethtool_link_ksettings ecmd;
if (!__ethtool_get_link_ksettings(dev, &ecmd)) {
switch (ecmd.base.speed) {
case SPEED_10000:
return 2;
case SPEED_1000:
return 4;
case SPEED_100:
return 19;
case SPEED_10:
return 100;
}
}
/* Old silly heuristics based on name */
if (!strncmp(dev->name, "lec", 3))
return 7;
if (!strncmp(dev->name, "plip", 4))
return 2500;
return 100; /* assume old 10Mbps */
}
/* Check for port carrier transitions. */
void br_port_carrier_check(struct net_bridge_port *p, bool *notified)
{
struct net_device *dev = p->dev;
struct net_bridge *br = p->br;
if (!(p->flags & BR_ADMIN_COST) &&
netif_running(dev) && netif_oper_up(dev))
p->path_cost = port_cost(dev);
*notified = false;
if (!netif_running(br->dev))
return;
spin_lock_bh(&br->lock);
if (netif_running(dev) && netif_oper_up(dev)) {
if (p->state == BR_STATE_DISABLED) {
br_stp_enable_port(p);
*notified = true;
}
} else {
if (p->state != BR_STATE_DISABLED) {
br_stp_disable_port(p);
*notified = true;
}
}
spin_unlock_bh(&br->lock);
}
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
static void br_port_set_promisc(struct net_bridge_port *p)
{
int err = 0;
if (br_promisc_port(p))
return;
err = dev_set_promiscuity(p->dev, 1);
if (err)
return;
br_fdb_unsync_static(p->br, p);
p->flags |= BR_PROMISC;
}
static void br_port_clear_promisc(struct net_bridge_port *p)
{
int err;
/* Check if the port is already non-promisc or if it doesn't
* support UNICAST filtering. Without unicast filtering support
* we'll end up re-enabling promisc mode anyway, so just check for
* it here.
*/
if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT))
return;
/* Since we'll be clearing the promisc mode, program the port
* first so that we don't have interruption in traffic.
*/
err = br_fdb_sync_static(p->br, p);
if (err)
return;
dev_set_promiscuity(p->dev, -1);
p->flags &= ~BR_PROMISC;
}
/* When a port is added or removed or when certain port flags
* change, this function is called to automatically manage
* promiscuity setting of all the bridge ports. We are always called
* under RTNL so can skip using rcu primitives.
*/
void br_manage_promisc(struct net_bridge *br)
{
struct net_bridge_port *p;
bool set_all = false;
/* If vlan filtering is disabled or bridge interface is placed
* into promiscuous mode, place all ports in promiscuous mode.
*/
if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev))
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
set_all = true;
list_for_each_entry(p, &br->port_list, list) {
if (set_all) {
br_port_set_promisc(p);
} else {
/* If the number of auto-ports is <= 1, then all other
* ports will have their output configuration
* statically specified through fdbs. Since ingress
* on the auto-port becomes forwarding/egress to other
* ports and egress configuration is statically known,
* we can say that ingress configuration of the
* auto-port is also statically known.
* This lets us disable promiscuous mode and write
* this config to hw.
*/
if (br->auto_cnt == 0 ||
(br->auto_cnt == 1 && br_auto_port(p)))
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
br_port_clear_promisc(p);
else
br_port_set_promisc(p);
}
}
}
net: bridge: add support for backup port This patch adds a new port attribute - IFLA_BRPORT_BACKUP_PORT, which allows to set a backup port to be used for known unicast traffic if the port has gone carrier down. The backup pointer is rcu protected and set only under RTNL, a counter is maintained so when deleting a port we know how many other ports reference it as a backup and we remove it from all. Also the pointer is in the first cache line which is hot at the time of the check and thus in the common case we only add one more test. The backup port will be used only for the non-flooding case since it's a part of the bridge and the flooded packets will be forwarded to it anyway. To remove the forwarding just send a 0/non-existing backup port. This is used to avoid numerous scalability problems when using MLAG most notably if we have thousands of fdbs one would need to change all of them on port carrier going down which takes too long and causes a storm of fdb notifications (and again when the port comes back up). In a Multi-chassis Link Aggregation setup usually hosts are connected to two different switches which act as a single logical switch. Those switches usually have a control and backup link between them called peerlink which might be used for communication in case a host loses connectivity to one of them. We need a fast way to failover in case a host port goes down and currently none of the solutions (like bond) cannot fulfill the requirements because the participating ports are actually the "master" devices and must have the same peerlink as their backup interface and at the same time all of them must participate in the bridge device. As Roopa noted it's normal practice in routing called fast re-route where a precalculated backup path is used when the main one is down. Another use case of this is with EVPN, having a single vxlan device which is backup of every port. Due to the nature of master devices it's not currently possible to use one device as a backup for many and still have all of them participate in the bridge (which is master itself). More detailed information about MLAG is available at the link below. https://docs.cumulusnetworks.com/display/DOCS/Multi-Chassis+Link+Aggregation+-+MLAG Further explanation and a diagram by Roopa: Two switches acting in a MLAG pair are connected by the peerlink interface which is a bridge port. the config on one of the switches looks like the below. The other switch also has a similar config. eth0 is connected to one port on the server. And the server is connected to both switches. br0 -- team0---eth0 | -- switch-peerlink Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-23 17:16:59 +09:00
int nbp_backup_change(struct net_bridge_port *p,
struct net_device *backup_dev)
{
struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
struct net_bridge_port *backup_p = NULL;
ASSERT_RTNL();
if (backup_dev) {
if (!netif_is_bridge_port(backup_dev))
net: bridge: add support for backup port This patch adds a new port attribute - IFLA_BRPORT_BACKUP_PORT, which allows to set a backup port to be used for known unicast traffic if the port has gone carrier down. The backup pointer is rcu protected and set only under RTNL, a counter is maintained so when deleting a port we know how many other ports reference it as a backup and we remove it from all. Also the pointer is in the first cache line which is hot at the time of the check and thus in the common case we only add one more test. The backup port will be used only for the non-flooding case since it's a part of the bridge and the flooded packets will be forwarded to it anyway. To remove the forwarding just send a 0/non-existing backup port. This is used to avoid numerous scalability problems when using MLAG most notably if we have thousands of fdbs one would need to change all of them on port carrier going down which takes too long and causes a storm of fdb notifications (and again when the port comes back up). In a Multi-chassis Link Aggregation setup usually hosts are connected to two different switches which act as a single logical switch. Those switches usually have a control and backup link between them called peerlink which might be used for communication in case a host loses connectivity to one of them. We need a fast way to failover in case a host port goes down and currently none of the solutions (like bond) cannot fulfill the requirements because the participating ports are actually the "master" devices and must have the same peerlink as their backup interface and at the same time all of them must participate in the bridge device. As Roopa noted it's normal practice in routing called fast re-route where a precalculated backup path is used when the main one is down. Another use case of this is with EVPN, having a single vxlan device which is backup of every port. Due to the nature of master devices it's not currently possible to use one device as a backup for many and still have all of them participate in the bridge (which is master itself). More detailed information about MLAG is available at the link below. https://docs.cumulusnetworks.com/display/DOCS/Multi-Chassis+Link+Aggregation+-+MLAG Further explanation and a diagram by Roopa: Two switches acting in a MLAG pair are connected by the peerlink interface which is a bridge port. the config on one of the switches looks like the below. The other switch also has a similar config. eth0 is connected to one port on the server. And the server is connected to both switches. br0 -- team0---eth0 | -- switch-peerlink Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-23 17:16:59 +09:00
return -ENOENT;
backup_p = br_port_get_rtnl(backup_dev);
if (backup_p->br != p->br)
return -EINVAL;
}
if (p == backup_p)
return -EINVAL;
if (old_backup == backup_p)
return 0;
/* if the backup link is already set, clear it */
if (old_backup)
old_backup->backup_redirected_cnt--;
if (backup_p)
backup_p->backup_redirected_cnt++;
rcu_assign_pointer(p->backup_port, backup_p);
return 0;
}
static void nbp_backup_clear(struct net_bridge_port *p)
{
nbp_backup_change(p, NULL);
if (p->backup_redirected_cnt) {
struct net_bridge_port *cur_p;
list_for_each_entry(cur_p, &p->br->port_list, list) {
struct net_bridge_port *backup_p;
backup_p = rtnl_dereference(cur_p->backup_port);
if (backup_p == p)
nbp_backup_change(cur_p, NULL);
}
}
WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
}
static void nbp_update_port_count(struct net_bridge *br)
{
struct net_bridge_port *p;
u32 cnt = 0;
list_for_each_entry(p, &br->port_list, list) {
if (br_auto_port(p))
cnt++;
}
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
if (br->auto_cnt != cnt) {
br->auto_cnt = cnt;
br_manage_promisc(br);
}
}
static void nbp_delete_promisc(struct net_bridge_port *p)
{
/* If port is currently promiscuous, unset promiscuity.
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
* Otherwise, it is a static port so remove all addresses
* from it.
*/
dev_set_allmulti(p->dev, -1);
if (br_promisc_port(p))
dev_set_promiscuity(p->dev, -1);
else
br_fdb_unsync_static(p->br, p);
}
static void release_nbp(struct kobject *kobj)
{
struct net_bridge_port *p
= container_of(kobj, struct net_bridge_port, kobj);
kfree(p);
}
static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
struct net_bridge_port *p = kobj_to_brport(kobj);
net_ns_get_ownership(dev_net(p->dev), uid, gid);
}
static struct kobj_type brport_ktype = {
#ifdef CONFIG_SYSFS
.sysfs_ops = &brport_sysfs_ops,
#endif
.release = release_nbp,
.get_ownership = brport_get_ownership,
};
static void destroy_nbp(struct net_bridge_port *p)
{
struct net_device *dev = p->dev;
p->br = NULL;
p->dev = NULL;
dev_put(dev);
kobject_put(&p->kobj);
}
static void destroy_nbp_rcu(struct rcu_head *head)
{
struct net_bridge_port *p =
container_of(head, struct net_bridge_port, rcu);
destroy_nbp(p);
}
static unsigned get_max_headroom(struct net_bridge *br)
{
unsigned max_headroom = 0;
struct net_bridge_port *p;
list_for_each_entry(p, &br->port_list, list) {
unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
if (dev_headroom > max_headroom)
max_headroom = dev_headroom;
}
return max_headroom;
}
static void update_headroom(struct net_bridge *br, int new_hr)
{
struct net_bridge_port *p;
list_for_each_entry(p, &br->port_list, list)
netdev_set_rx_headroom(p->dev, new_hr);
br->dev->needed_headroom = new_hr;
}
/* Delete port(interface) from bridge is done in two steps.
* via RCU. First step, marks device as down. That deletes
* all the timers and stops new packets from flowing through.
*
* Final cleanup doesn't occur until after all CPU's finished
* processing packets.
*
* Protected from multiple admin operations by RTNL mutex
*/
static void del_nbp(struct net_bridge_port *p)
{
struct net_bridge *br = p->br;
struct net_device *dev = p->dev;
sysfs_remove_link(br->ifobj, p->dev->name);
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
nbp_delete_promisc(p);
spin_lock_bh(&br->lock);
br_stp_disable_port(p);
spin_unlock_bh(&br->lock);
br_ifinfo_notify(RTM_DELLINK, NULL, p);
list_del_rcu(&p->list);
if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
update_headroom(br, get_max_headroom(br));
netdev_reset_rx_headroom(dev);
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
switchdev_deferred_process();
net: bridge: add support for backup port This patch adds a new port attribute - IFLA_BRPORT_BACKUP_PORT, which allows to set a backup port to be used for known unicast traffic if the port has gone carrier down. The backup pointer is rcu protected and set only under RTNL, a counter is maintained so when deleting a port we know how many other ports reference it as a backup and we remove it from all. Also the pointer is in the first cache line which is hot at the time of the check and thus in the common case we only add one more test. The backup port will be used only for the non-flooding case since it's a part of the bridge and the flooded packets will be forwarded to it anyway. To remove the forwarding just send a 0/non-existing backup port. This is used to avoid numerous scalability problems when using MLAG most notably if we have thousands of fdbs one would need to change all of them on port carrier going down which takes too long and causes a storm of fdb notifications (and again when the port comes back up). In a Multi-chassis Link Aggregation setup usually hosts are connected to two different switches which act as a single logical switch. Those switches usually have a control and backup link between them called peerlink which might be used for communication in case a host loses connectivity to one of them. We need a fast way to failover in case a host port goes down and currently none of the solutions (like bond) cannot fulfill the requirements because the participating ports are actually the "master" devices and must have the same peerlink as their backup interface and at the same time all of them must participate in the bridge device. As Roopa noted it's normal practice in routing called fast re-route where a precalculated backup path is used when the main one is down. Another use case of this is with EVPN, having a single vxlan device which is backup of every port. Due to the nature of master devices it's not currently possible to use one device as a backup for many and still have all of them participate in the bridge (which is master itself). More detailed information about MLAG is available at the link below. https://docs.cumulusnetworks.com/display/DOCS/Multi-Chassis+Link+Aggregation+-+MLAG Further explanation and a diagram by Roopa: Two switches acting in a MLAG pair are connected by the peerlink interface which is a bridge port. the config on one of the switches looks like the below. The other switch also has a similar config. eth0 is connected to one port on the server. And the server is connected to both switches. br0 -- team0---eth0 | -- switch-peerlink Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2018-07-23 17:16:59 +09:00
nbp_backup_clear(p);
nbp_update_port_count(br);
netdev_upper_dev_unlink(dev, br->dev);
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
br_multicast_del_port(p);
kobject_uevent(&p->kobj, KOBJ_REMOVE);
kobject_del(&p->kobj);
br_netpoll_disable(p);
call_rcu(&p->rcu, destroy_nbp_rcu);
}
/* Delete bridge device */
void br_dev_delete(struct net_device *dev, struct list_head *head)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p, *n;
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
br_recalculate_neigh_suppress_enabled(br);
br_fdb_delete_by_port(br, NULL, 0, 1);
bridge: flush br's address entry in fdb when remove the bridge dev When the following commands are executed: brctl addbr br0 ifconfig br0 hw ether <addr> rmmod bridge The calltrace will occur: [ 563.312114] device eth1 left promiscuous mode [ 563.312188] br0: port 1(eth1) entered disabled state [ 563.468190] kmem_cache_destroy bridge_fdb_cache: Slab cache still has objects [ 563.468197] CPU: 6 PID: 6982 Comm: rmmod Tainted: G O 3.12.0-0.7-default+ #9 [ 563.468199] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 563.468200] 0000000000000880 ffff88010f111e98 ffffffff814d1c92 ffff88010f111eb8 [ 563.468204] ffffffff81148efd ffff88010f111eb8 0000000000000000 ffff88010f111ec8 [ 563.468206] ffffffffa062a270 ffff88010f111ed8 ffffffffa063ac76 ffff88010f111f78 [ 563.468209] Call Trace: [ 563.468218] [<ffffffff814d1c92>] dump_stack+0x6a/0x78 [ 563.468234] [<ffffffff81148efd>] kmem_cache_destroy+0xfd/0x100 [ 563.468242] [<ffffffffa062a270>] br_fdb_fini+0x10/0x20 [bridge] [ 563.468247] [<ffffffffa063ac76>] br_deinit+0x4e/0x50 [bridge] [ 563.468254] [<ffffffff810c7dc9>] SyS_delete_module+0x199/0x2b0 [ 563.468259] [<ffffffff814e0922>] system_call_fastpath+0x16/0x1b [ 570.377958] Bridge firewalling registered --------------------------- cut here ------------------------------- The reason is that when the bridge dev's address is changed, the br_fdb_change_mac_address() will add new address in fdb, but when the bridge was removed, the address entry in the fdb did not free, the bridge_fdb_cache still has objects when destroy the cache, Fix this by flushing the bridge address entry when removing the bridge. v2: according to the Toshiaki Makita and Vlad's suggestion, I only delete the vlan0 entry, it still have a leak here if the vlan id is other number, so I need to call fdb_delete_by_port(br, NULL, 1) to flush all entries whose dst is NULL for the bridge. Suggested-by: Toshiaki Makita <toshiaki.makita1@gmail.com> Suggested-by: Vlad Yasevich <vyasevich@gmail.com> Signed-off-by: Ding Tianhong <dingtianhong@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-12-07 23:12:05 +09:00
cancel_delayed_work_sync(&br->gc_work);
br_sysfs_delbr(br->dev);
unregister_netdevice_queue(br->dev, head);
}
/* find an available port number */
static int find_portno(struct net_bridge *br)
{
int index;
struct net_bridge_port *p;
unsigned long *inuse;
inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
if (!inuse)
return -ENOMEM;
set_bit(0, inuse); /* zero is reserved */
list_for_each_entry(p, &br->port_list, list) {
set_bit(p->port_no, inuse);
}
index = find_first_zero_bit(inuse, BR_MAX_PORTS);
bitmap_free(inuse);
return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}
/* called with RTNL but without bridge lock */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
struct net_bridge_port *p;
int index, err;
index = find_portno(br);
if (index < 0)
return ERR_PTR(index);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return ERR_PTR(-ENOMEM);
p->br = br;
dev_hold(dev);
p->dev = dev;
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
br_init_port(p);
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
err = br_multicast_add_port(p);
if (err) {
dev_put(dev);
kfree(p);
p = ERR_PTR(err);
}
return p;
}
int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int res;
dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
br_dev_setup);
if (!dev)
return -ENOMEM;
dev_net_set(dev, net);
dev->rtnl_link_ops = &br_link_ops;
res = register_netdev(dev);
if (res)
free_netdev(dev);
return res;
}
int br_del_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int ret = 0;
rtnl_lock();
dev = __dev_get_by_name(net, name);
if (dev == NULL)
ret = -ENXIO; /* Could not find device */
else if (!(dev->priv_flags & IFF_EBRIDGE)) {
/* Attempt to delete non bridge device! */
ret = -EPERM;
}
else if (dev->flags & IFF_UP) {
/* Not shutdown yet. */
ret = -EBUSY;
}
else
br_dev_delete(dev, NULL);
rtnl_unlock();
return ret;
}
/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
static int br_mtu_min(const struct net_bridge *br)
{
const struct net_bridge_port *p;
int ret_mtu = 0;
list_for_each_entry(p, &br->port_list, list)
if (!ret_mtu || ret_mtu > p->dev->mtu)
ret_mtu = p->dev->mtu;
return ret_mtu ? ret_mtu : ETH_DATA_LEN;
}
void br_mtu_auto_adjust(struct net_bridge *br)
{
ASSERT_RTNL();
/* if the bridge MTU was manually configured don't mess with it */
if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
return;
/* change to the minimum MTU and clear the flag which was set by
* the bridge ndo_change_mtu callback
*/
dev_set_mtu(br->dev, br_mtu_min(br));
br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
}
static void br_set_gso_limits(struct net_bridge *br)
{
unsigned int gso_max_size = GSO_MAX_SIZE;
u16 gso_max_segs = GSO_MAX_SEGS;
const struct net_bridge_port *p;
list_for_each_entry(p, &br->port_list, list) {
gso_max_size = min(gso_max_size, p->dev->gso_max_size);
gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs);
}
br->dev->gso_max_size = gso_max_size;
br->dev->gso_max_segs = gso_max_segs;
}
/*
* Recomputes features using slave's features
*/
netdev_features_t br_features_recompute(struct net_bridge *br,
netdev_features_t features)
{
struct net_bridge_port *p;
netdev_features_t mask;
if (list_empty(&br->port_list))
return features;
mask = features;
features &= ~NETIF_F_ONE_FOR_ALL;
list_for_each_entry(p, &br->port_list, list) {
features = netdev_increment_features(features,
p->dev->features, mask);
}
features = netdev_add_tso_features(features, mask);
return features;
}
/* called with RTNL */
int br_add_if(struct net_bridge *br, struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct net_bridge_port *p;
int err = 0;
unsigned br_hr, dev_hr;
bool changed_addr, fdb_synced = false;
net: bridge: reject DSA-enabled master netdevices as bridge members DSA-enabled master network devices with a switch tagging protocol should strip the protocol specific format before handing the frame over to higher layer. When adding such a DSA master network device as a bridge member, we go through the following code path when receiving a frame: __netif_receive_skb_core -> first ptype check against ptype_all is not returning any handler for this skb -> check and invoke rx_handler: -> deliver frame to the bridge layer: br_handle_frame DSA registers a ptype handler with the fake ETH_XDSA ethertype, which is called *after* the bridge-layer rx_handler has run. br_handle_frame() tries to parse the frame it received from the DSA master network device, and will not be able to match any of its conditions and jumps straight at the end of the end of br_handle_frame() and returns RX_HANDLER_CONSUMED there. Since we returned RX_HANDLER_CONSUMED, __netif_receive_skb_core() stops RX processing for this frame and returns NET_RX_SUCCESS, so we never get a chance to call our switch tag packet processing logic and deliver frames to the DSA slave network devices, and so we do not get any functional bridge members at all. Instead of cluttering the bridge receive path with DSA-specific checks, and rely on assumptions about how __netif_receive_skb_core() is processing frames, we simply deny adding the DSA master network device (conduit interface) as a bridge member, leaving only the slave DSA network devices to be bridge members, since those will work correctly in all circumstances. Signed-off-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-17 02:56:02 +09:00
/* Don't allow bridging non-ethernet like devices, or DSA-enabled
* master network devices since the bridge layer rx_handler prevents
* the DSA fake ethertype handler to be invoked, so we do not strip off
* the DSA switch tag protocol header and the bridge layer just return
* RX_HANDLER_CONSUMED, stopping RX processing for these frames.
*/
if ((dev->flags & IFF_LOOPBACK) ||
dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
net: bridge: reject DSA-enabled master netdevices as bridge members DSA-enabled master network devices with a switch tagging protocol should strip the protocol specific format before handing the frame over to higher layer. When adding such a DSA master network device as a bridge member, we go through the following code path when receiving a frame: __netif_receive_skb_core -> first ptype check against ptype_all is not returning any handler for this skb -> check and invoke rx_handler: -> deliver frame to the bridge layer: br_handle_frame DSA registers a ptype handler with the fake ETH_XDSA ethertype, which is called *after* the bridge-layer rx_handler has run. br_handle_frame() tries to parse the frame it received from the DSA master network device, and will not be able to match any of its conditions and jumps straight at the end of the end of br_handle_frame() and returns RX_HANDLER_CONSUMED there. Since we returned RX_HANDLER_CONSUMED, __netif_receive_skb_core() stops RX processing for this frame and returns NET_RX_SUCCESS, so we never get a chance to call our switch tag packet processing logic and deliver frames to the DSA slave network devices, and so we do not get any functional bridge members at all. Instead of cluttering the bridge receive path with DSA-specific checks, and rely on assumptions about how __netif_receive_skb_core() is processing frames, we simply deny adding the DSA master network device (conduit interface) as a bridge member, leaving only the slave DSA network devices to be bridge members, since those will work correctly in all circumstances. Signed-off-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2015-01-17 02:56:02 +09:00
!is_valid_ether_addr(dev->dev_addr) ||
netdev_uses_dsa(dev))
return -EINVAL;
/* No bridging of bridges */
if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) {
NL_SET_ERR_MSG(extack,
"Can not enslave a bridge to a bridge");
return -ELOOP;
}
/* Device has master upper dev */
if (netdev_master_upper_dev_get(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
if (dev->priv_flags & IFF_DONT_BRIDGE) {
NL_SET_ERR_MSG(extack,
"Device does not allow enslaving to a bridge");
return -EOPNOTSUPP;
}
p = new_nbp(br, dev);
if (IS_ERR(p))
return PTR_ERR(p);
call_netdevice_notifiers(NETDEV_JOIN, dev);
bridge: Automatically manage port promiscuous mode. There exist configurations where the administrator or another management entity has the foreknowledge of all the mac addresses of end systems that are being bridged together. In these environments, the administrator can statically configure known addresses in the bridge FDB and disable flooding and learning on ports. This makes it possible to turn off promiscuous mode on the interfaces connected to the bridge. Here is why disabling flooding and learning allows us to control promiscuity: Consider port X. All traffic coming into this port from outside the bridge (ingress) will be either forwarded through other ports of the bridge (egress) or dropped. Forwarding (egress) is defined by FDB entries and by flooding in the event that no FDB entry exists. In the event that flooding is disabled, only FDB entries define the egress. Once learning is disabled, only static FDB entries provided by a management entity define the egress. If we provide information from these static FDBs to the ingress port X, then we'll be able to accept all traffic that can be successfully forwarded and drop all the other traffic sooner without spending CPU cycles to process it. Another way to define the above is as following equations: ingress = egress + drop expanding egress ingress = static FDB + learned FDB + flooding + drop disabling flooding and learning we a left with ingress = static FDB + drop By adding addresses from the static FDB entries to the MAC address filter of an ingress port X, we fully define what the bridge can process without dropping and can thus turn off promiscuous mode, thus dropping packets sooner. There have been suggestions that we may want to allow learning and update the filters with learned addresses as well. This would require mac-level authentication similar to 802.1x to prevent attacks against the hw filters as they are limited resource. Additionally, if the user places the bridge device in promiscuous mode, all ports are placed in promiscuous mode regardless of the changes to flooding and learning. Since the above functionality depends on full static configuration, we have also require that vlan filtering be enabled to take advantage of this. The reason is that the bridge has to be able to receive and process VLAN-tagged frames and the there are only 2 ways to accomplish this right now: promiscuous mode or vlan filtering. Suggested-by: Michael S. Tsirkin <mst@redhat.com> Acked-by: Michael S. Tsirkin <mst@redhat.com> Signed-off-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-16 22:59:20 +09:00
err = dev_set_allmulti(dev, 1);
if (err) {
net: bridge: fix memleak in br_add_if() [ Upstream commit 519133debcc19f5c834e7e28480b60bdc234fe02 ] I got a memleak report: BUG: memory leak unreferenced object 0x607ee521a658 (size 240): comm "syz-executor.0", pid 955, jiffies 4294780569 (age 16.449s) hex dump (first 32 bytes, cpu 1): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d830ea5a>] br_multicast_add_port+0x1c2/0x300 net/bridge/br_multicast.c:1693 [<00000000274d9a71>] new_nbp net/bridge/br_if.c:435 [inline] [<00000000274d9a71>] br_add_if+0x670/0x1740 net/bridge/br_if.c:611 [<0000000012ce888e>] do_set_master net/core/rtnetlink.c:2513 [inline] [<0000000012ce888e>] do_set_master+0x1aa/0x210 net/core/rtnetlink.c:2487 [<0000000099d1cafc>] __rtnl_newlink+0x1095/0x13e0 net/core/rtnetlink.c:3457 [<00000000a01facc0>] rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3488 [<00000000acc9186c>] rtnetlink_rcv_msg+0x369/0xa10 net/core/rtnetlink.c:5550 [<00000000d4aabb9c>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504 [<00000000bc2e12a3>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<00000000bc2e12a3>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340 [<00000000e4dc2d0e>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929 [<000000000d22c8b3>] sock_sendmsg_nosec net/socket.c:654 [inline] [<000000000d22c8b3>] sock_sendmsg+0x139/0x170 net/socket.c:674 [<00000000e281417a>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350 [<00000000237aa2ab>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404 [<000000004f2dc381>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433 [<0000000005feca6c>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47 [<000000007304477d>] entry_SYSCALL_64_after_hwframe+0x44/0xae On error path of br_add_if(), p->mcast_stats allocated in new_nbp() need be freed, or it will be leaked. Fixes: 1080ab95e3c7 ("net: bridge: add support for IGMP/MLD stats and export them via netlink") Reported-by: Hulk Robot <hulkci@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com> Link: https://lore.kernel.org/r/20210809132023.978546-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
2021-08-09 22:20:23 +09:00
br_multicast_del_port(p);
kfree(p); /* kobject not yet init'd, manually free */
goto err1;
}
err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
goto err2;
err = br_sysfs_addif(p);
if (err)
goto err2;
netpoll: Remove gfp parameter from __netpoll_setup The gfp parameter was added in: commit 47be03a28cc6c80e3aa2b3e8ed6d960ff0c5c0af Author: Amerigo Wang <amwang@redhat.com> Date: Fri Aug 10 01:24:37 2012 +0000 netpoll: use GFP_ATOMIC in slave_enable_netpoll() and __netpoll_setup() slave_enable_netpoll() and __netpoll_setup() may be called with read_lock() held, so should use GFP_ATOMIC to allocate memory. Eric suggested to pass gfp flags to __netpoll_setup(). Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: "David S. Miller" <davem@davemloft.net> Reported-by: Dan Carpenter <dan.carpenter@oracle.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: Cong Wang <amwang@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> The reason for the gfp parameter was removed in: commit c4cdef9b7183159c23c7302aaf270d64c549f557 Author: dingtianhong <dingtianhong@huawei.com> Date: Tue Jul 23 15:25:27 2013 +0800 bonding: don't call slave_xxx_netpoll under spinlocks The slave_xxx_netpoll will call synchronize_rcu_bh(), so the function may schedule and sleep, it should't be called under spinlocks. bond_netpoll_setup() and bond_netpoll_cleanup() are always protected by rtnl lock, it is no need to take the read lock, as the slave list couldn't be changed outside rtnl lock. Signed-off-by: Ding Tianhong <dingtianhong@huawei.com> Cc: Jay Vosburgh <fubar@us.ibm.com> Cc: Andy Gospodarek <andy@greyhouse.net> Signed-off-by: David S. Miller <davem@davemloft.net> Nothing else that calls __netpoll_setup or ndo_netpoll_setup requires a gfp paramter, so remove the gfp parameter from both of these functions making the code clearer. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-03-28 07:36:38 +09:00
err = br_netpoll_enable(p);
if (err)
goto err3;
err = netdev_rx_handler_register(dev, br_handle_frame, p);
if (err)
goto err4;
dev->priv_flags |= IFF_BRIDGE_PORT;
err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
if (err)
goto err5;
bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-26 01:42:37 +09:00
err = nbp_switchdev_mark_set(p);
if (err)
goto err6;
dev_disable_lro(dev);
list_add_rcu(&p->list, &br->port_list);
nbp_update_port_count(br);
if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
/* When updating the port count we also update all ports'
* promiscuous mode.
* A port leaving promiscuous mode normally gets the bridge's
* fdb synced to the unicast filter (if supported), however,
* `br_port_clear_promisc` does not distinguish between
* non-promiscuous ports and *new* ports, so we need to
* sync explicitly here.
*/
fdb_synced = br_fdb_sync_static(br, p) == 0;
if (!fdb_synced)
netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
}
netdev_update_features(br->dev);
br_hr = br->dev->needed_headroom;
dev_hr = netdev_get_fwd_headroom(dev);
if (br_hr < dev_hr)
update_headroom(br, dev_hr);
else
netdev_set_rx_headroom(dev, br_hr);
bridge: Change local fdb entries whenever mac address of bridge device changes Vlan code may need fdb change when changing mac address of bridge device even if it is caused by the mac address changing of a bridge port. Example configuration: ip link set eth0 address 12:34:56:78:90:ab ip link set eth1 address aa:bb:cc:dd:ee:ff brctl addif br0 eth0 brctl addif br0 eth1 # br0 will have mac address 12:34:56:78:90:ab bridge vlan add dev br0 vid 10 self bridge vlan add dev eth0 vid 10 We will have fdb entry such that f->dst == NULL, f->vlan_id == 10 and f->addr == 12:34:56:78:90:ab at this time. Next, change the mac address of eth0 to greater value. ip link set eth0 address ee:ff:12:34:56:78 Then, mac address of br0 will be recalculated and set to aa:bb:cc:dd:ee:ff. However, an entry aa:bb:cc:dd:ee:ff will not be created and we will be not able to communicate using br0 on vlan 10. Address this issue by deleting and adding local entries whenever changing the mac address of the bridge device. If there already exists an entry that has the same address, for example, in case that br_fdb_changeaddr() has already inserted it, br_fdb_change_mac_address() will simply fail to insert it and no duplicated entry will be made, as it was. This approach also needs br_add_if() to call br_fdb_insert() before br_stp_recalculate_bridge_id() so that we don't create an entry whose dst == NULL in this function to preserve previous behavior. Note that this is a slight change in behavior where the bridge device can receive the traffic to the new address before calling br_stp_recalculate_bridge_id() in br_add_if(). However, it is not a problem because we have already the address on the new port and such a way to insert new one before recalculating bridge id is taken in br_device_event() as well. Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> Acked-by: Vlad Yasevich <vyasevic@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-02-07 16:48:21 +09:00
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
if (br->dev->addr_assign_type != NET_ADDR_SET) {
/* Ask for permission to use this MAC address now, even if we
* don't end up choosing it below.
*/
err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
if (err)
goto err7;
}
err = nbp_vlan_init(p, extack);
if (err) {
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-26 01:42:37 +09:00
goto err7;
}
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);
if (netif_running(dev) && netif_oper_up(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
spin_unlock_bh(&br->lock);
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
br_mtu_auto_adjust(br);
br_set_gso_limits(br);
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-26 01:42:37 +09:00
err7:
if (fdb_synced)
br_fdb_unsync_static(br, p);
list_del_rcu(&p->list);
br_fdb_delete_by_port(br, p, 0, 1);
nbp_update_port_count(br);
bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-08-26 01:42:37 +09:00
err6:
netdev_upper_dev_unlink(dev, br->dev);
err5:
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
err4:
br_netpoll_disable(p);
err3:
sysfs_remove_link(br->ifobj, p->dev->name);
err2:
net: bridge: fix memleak in br_add_if() [ Upstream commit 519133debcc19f5c834e7e28480b60bdc234fe02 ] I got a memleak report: BUG: memory leak unreferenced object 0x607ee521a658 (size 240): comm "syz-executor.0", pid 955, jiffies 4294780569 (age 16.449s) hex dump (first 32 bytes, cpu 1): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d830ea5a>] br_multicast_add_port+0x1c2/0x300 net/bridge/br_multicast.c:1693 [<00000000274d9a71>] new_nbp net/bridge/br_if.c:435 [inline] [<00000000274d9a71>] br_add_if+0x670/0x1740 net/bridge/br_if.c:611 [<0000000012ce888e>] do_set_master net/core/rtnetlink.c:2513 [inline] [<0000000012ce888e>] do_set_master+0x1aa/0x210 net/core/rtnetlink.c:2487 [<0000000099d1cafc>] __rtnl_newlink+0x1095/0x13e0 net/core/rtnetlink.c:3457 [<00000000a01facc0>] rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3488 [<00000000acc9186c>] rtnetlink_rcv_msg+0x369/0xa10 net/core/rtnetlink.c:5550 [<00000000d4aabb9c>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504 [<00000000bc2e12a3>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<00000000bc2e12a3>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340 [<00000000e4dc2d0e>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929 [<000000000d22c8b3>] sock_sendmsg_nosec net/socket.c:654 [inline] [<000000000d22c8b3>] sock_sendmsg+0x139/0x170 net/socket.c:674 [<00000000e281417a>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350 [<00000000237aa2ab>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404 [<000000004f2dc381>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433 [<0000000005feca6c>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47 [<000000007304477d>] entry_SYSCALL_64_after_hwframe+0x44/0xae On error path of br_add_if(), p->mcast_stats allocated in new_nbp() need be freed, or it will be leaked. Fixes: 1080ab95e3c7 ("net: bridge: add support for IGMP/MLD stats and export them via netlink") Reported-by: Hulk Robot <hulkci@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com> Link: https://lore.kernel.org/r/20210809132023.978546-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Sasha Levin <sashal@kernel.org>
2021-08-09 22:20:23 +09:00
br_multicast_del_port(p);
kobject_put(&p->kobj);
dev_set_allmulti(dev, -1);
err1:
dev_put(dev);
return err;
}
/* called with RTNL */
int br_del_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
bool changed_addr;
p = br_port_get_rtnl(dev);
if (!p || p->br != br)
return -EINVAL;
/* Since more than one interface can be attached to a bridge,
* there still maybe an alternate path for netconsole to use;
* therefore there is no reason for a NETDEV_RELEASE event.
*/
del_nbp(p);
br_mtu_auto_adjust(br);
br_set_gso_limits(br);
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);
spin_unlock_bh(&br->lock);
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
netdev_update_features(br->dev);
return 0;
}
void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
{
struct net_bridge *br = p->br;
if (mask & BR_AUTO_MASK)
nbp_update_port_count(br);
if (mask & BR_NEIGH_SUPPRESS)
br_recalculate_neigh_suppress_enabled(br);
}
bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
{
struct net_bridge_port *p;
p = br_port_get_rtnl_rcu(dev);
if (!p)
return false;
return p->flags & flag;
}
EXPORT_SYMBOL_GPL(br_port_flag_is_set);