From 472111920f1c5fbe103022a4b05bfb37128a2a29 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 22 Jul 2021 18:55:38 +0300 Subject: [PATCH 1/5] net: bridge: switchdev: allow the TX data plane forwarding to be offloaded Allow switchdevs to forward frames from the CPU in accordance with the bridge configuration in the same way as is done between bridge ports. This means that the bridge will only send a single skb towards one of the ports under the switchdev's control, and expects the driver to deliver the packet to all eligible ports in its domain. Primarily this improves the performance of multicast flows with multiple subscribers, as it allows the hardware to perform the frame replication. The basic flow between the driver and the bridge is as follows: - When joining a bridge port, the switchdev driver calls switchdev_bridge_port_offload() with tx_fwd_offload = true. - The bridge sends offloadable skbs to one of the ports under the switchdev's control using skb->offload_fwd_mark = true. - The switchdev driver checks the skb->offload_fwd_mark field and lets its FDB lookup select the destination port mask for this packet. v1->v2: - convert br_input_skb_cb::fwd_hwdoms to a plain unsigned long - introduce a static key "br_switchdev_fwd_offload_used" to minimize the impact of the newly introduced feature on all the setups which don't have hardware that can make use of it - introduce a check for nbp->flags & BR_FWD_OFFLOAD to optimize cache line access - reorder nbp_switchdev_frame_mark_accel() and br_handle_vlan() in __br_forward() - do not strip VLAN on egress if forwarding offload on VLAN-aware bridge is being used - propagate errors from .ndo_dfwd_add_station() if not EOPNOTSUPP v2->v3: - replace the solution based on .ndo_dfwd_add_station with a solution based on switchdev_bridge_port_offload - rename BR_FWD_OFFLOAD to BR_TX_FWD_OFFLOAD v3->v4: rebase v4->v5: - make sure the static key is decremented on bridge port unoffload - more function and variable renaming and comments for them: br_switchdev_fwd_offload_used to br_switchdev_tx_fwd_offload br_switchdev_accels_skb to br_switchdev_frame_uses_tx_fwd_offload nbp_switchdev_frame_mark_tx_fwd to nbp_switchdev_frame_mark_tx_fwd_to_hwdom nbp_switchdev_frame_mark_accel to nbp_switchdev_frame_mark_tx_fwd_offload fwd_accel to tx_fwd_offload Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- .../ethernet/freescale/dpaa2/dpaa2-switch.c | 2 +- .../marvell/prestera/prestera_switchdev.c | 2 +- .../mellanox/mlxsw/spectrum_switchdev.c | 2 +- .../microchip/sparx5/sparx5_switchdev.c | 2 +- drivers/net/ethernet/mscc/ocelot_net.c | 2 +- drivers/net/ethernet/rocker/rocker_ofdpa.c | 2 +- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- drivers/net/ethernet/ti/cpsw_new.c | 2 +- include/linux/if_bridge.h | 3 + net/bridge/br_forward.c | 9 +++ net/bridge/br_private.h | 31 +++++++++ net/bridge/br_switchdev.c | 68 +++++++++++++++++-- net/bridge/br_vlan.c | 10 ++- net/dsa/port.c | 2 +- 14 files changed, 125 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 3d021edb78e6..c233e8786e19 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -1936,7 +1936,7 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev, err = switchdev_bridge_port_offload(netdev, netdev, NULL, &dpaa2_switch_port_switchdev_nb, &dpaa2_switch_port_switchdev_blocking_nb, - extack); + false, extack); if (err) goto err_switchdev_offload; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 7fe1287228e5..be01ec8284e6 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -502,7 +502,7 @@ int prestera_bridge_port_join(struct net_device *br_dev, } err = switchdev_bridge_port_offload(br_port->dev, port->dev, NULL, - NULL, NULL, extack); + NULL, NULL, false, extack); if (err) goto err_switchdev_offload; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 0a53f1d8e7e1..f5d0d392efbf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -362,7 +362,7 @@ mlxsw_sp_bridge_port_create(struct mlxsw_sp_bridge_device *bridge_device, bridge_port->ref_count = 1; err = switchdev_bridge_port_offload(brport_dev, mlxsw_sp_port->dev, - NULL, NULL, NULL, extack); + NULL, NULL, NULL, false, extack); if (err) goto err_switchdev_offload; diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c index 807dc45cfae4..649ca609884a 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_switchdev.c @@ -113,7 +113,7 @@ static int sparx5_port_bridge_join(struct sparx5_port *port, set_bit(port->portno, sparx5->bridge_mask); err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL, - extack); + false, extack); if (err) goto err_switchdev_offload; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 3558ee8d9212..c52f175df389 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1200,7 +1200,7 @@ static int ocelot_netdevice_bridge_join(struct net_device *dev, err = switchdev_bridge_port_offload(brport_dev, dev, priv, &ocelot_netdevice_nb, &ocelot_switchdev_blocking_nb, - extack); + false, extack); if (err) goto err_switchdev_offload; diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 03df6a24d0ba..b82e169b7836 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2599,7 +2599,7 @@ static int ofdpa_port_bridge_join(struct ofdpa_port *ofdpa_port, return err; return switchdev_bridge_port_offload(dev, dev, NULL, NULL, NULL, - extack); + false, extack); } static int ofdpa_port_bridge_leave(struct ofdpa_port *ofdpa_port) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index b285606f963d..229e2f09d605 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2097,7 +2097,7 @@ static int am65_cpsw_netdevice_port_link(struct net_device *ndev, } err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL, - extack); + false, extack); if (err) return err; diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 31030f73840d..4448a91cce54 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1518,7 +1518,7 @@ static int cpsw_netdevice_port_link(struct net_device *ndev, } err = switchdev_bridge_port_offload(ndev, ndev, NULL, NULL, NULL, - extack); + false, extack); if (err) return err; diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index bbf680093823..f0b4ffbd8582 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -57,6 +57,7 @@ struct br_ip_list { #define BR_MRP_AWARE BIT(17) #define BR_MRP_LOST_CONT BIT(18) #define BR_MRP_LOST_IN_CONT BIT(19) +#define BR_TX_FWD_OFFLOAD BIT(20) #define BR_DEFAULT_AGEING_TIME (300 * HZ) @@ -182,6 +183,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, + bool tx_fwd_offload, struct netlink_ext_ack *extack); void switchdev_bridge_port_unoffload(struct net_device *brport_dev, const void *ctx, @@ -195,6 +197,7 @@ switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, + bool tx_fwd_offload, struct netlink_ext_ack *extack) { return -EINVAL; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index bfdbaf3015b9..bc14b1b384e9 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -48,6 +48,8 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb_set_network_header(skb, depth); } + skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); + dev_queue_xmit(skb); return 0; @@ -76,6 +78,11 @@ static void __br_forward(const struct net_bridge_port *to, struct net *net; int br_hook; + /* Mark the skb for forwarding offload early so that br_handle_vlan() + * can know whether to pop the VLAN header on egress or keep it. + */ + nbp_switchdev_frame_mark_tx_fwd_offload(to, skb); + vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) @@ -174,6 +181,8 @@ static struct net_bridge_port *maybe_deliver( if (!should_deliver(p, skb)) return prev; + nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb); + if (!prev) goto out; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2f32d330b648..86ca617fec7a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -552,12 +552,20 @@ struct br_input_skb_cb { #endif #ifdef CONFIG_NET_SWITCHDEV + /* Set if TX data plane offloading is used towards at least one + * hardware domain. + */ + u8 tx_fwd_offload:1; /* The switchdev hardware domain from which this packet was received. * If skb->offload_fwd_mark was set, then this packet was already * forwarded by hardware to the other ports in the source hardware * domain, otherwise it wasn't. */ int src_hwdom; + /* Bit mask of hardware domains towards this packet has already been + * transmitted using the TX data plane offload. + */ + unsigned long fwd_hwdoms; #endif }; @@ -1871,6 +1879,12 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; } /* br_switchdev.c */ #ifdef CONFIG_NET_SWITCHDEV +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); + +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb); +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb); void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb); bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, @@ -1891,6 +1905,23 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) skb->offload_fwd_mark = 0; } #else +static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) +{ + return false; +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 6bfff28ede23..96ce069d0c8c 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -8,6 +8,46 @@ #include "br_private.h" +static struct static_key_false br_switchdev_tx_fwd_offload; + +static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; + + return (p->flags & BR_TX_FWD_OFFLOAD) && + (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); +} + +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) +{ + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; + + return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; +} + +/* Mark the frame for TX forwarding offload if this egress port supports it */ +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; +} + +/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms + * that the skb has been already forwarded to, to avoid further cloning to + * other ports in the same hwdom by making nbp_switchdev_allowed_egress() + * return false. + */ +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); +} + void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { @@ -18,8 +58,10 @@ void nbp_switchdev_frame_mark(const struct net_bridge_port *p, bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { - return !skb->offload_fwd_mark || - BR_INPUT_SKB_CB(skb)->src_hwdom != p->hwdom; + struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); + + return !test_bit(p->hwdom, &cb->fwd_hwdoms) && + (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ @@ -164,8 +206,11 @@ static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) static int nbp_switchdev_add(struct net_bridge_port *p, struct netdev_phys_item_id ppid, + bool tx_fwd_offload, struct netlink_ext_ack *extack) { + int err; + if (p->offload_count) { /* Prevent unsupported configurations such as a bridge port * which is a bonding interface, and the member ports are from @@ -189,7 +234,16 @@ static int nbp_switchdev_add(struct net_bridge_port *p, p->ppid = ppid; p->offload_count = 1; - return nbp_switchdev_hwdom_set(p); + err = nbp_switchdev_hwdom_set(p); + if (err) + return err; + + if (tx_fwd_offload) { + p->flags |= BR_TX_FWD_OFFLOAD; + static_branch_inc(&br_switchdev_tx_fwd_offload); + } + + return 0; } static void nbp_switchdev_del(struct net_bridge_port *p) @@ -204,6 +258,11 @@ static void nbp_switchdev_del(struct net_bridge_port *p) if (p->hwdom) nbp_switchdev_hwdom_put(p); + + if (p->flags & BR_TX_FWD_OFFLOAD) { + p->flags &= ~BR_TX_FWD_OFFLOAD; + static_branch_dec(&br_switchdev_tx_fwd_offload); + } } static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, @@ -262,6 +321,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev, struct net_device *dev, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb, + bool tx_fwd_offload, struct netlink_ext_ack *extack) { struct netdev_phys_item_id ppid; @@ -278,7 +338,7 @@ int switchdev_bridge_port_offload(struct net_device *brport_dev, if (err) return err; - err = nbp_switchdev_add(p, ppid, extack); + err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack); if (err) return err; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 382ab992badf..325600361487 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -465,7 +465,15 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, u64_stats_update_end(&stats->syncp); } - if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) + /* If the skb will be sent using forwarding offload, the assumption is + * that the switchdev will inject the packet into hardware together + * with the bridge VLAN, so that it can be forwarded according to that + * VLAN. The switchdev should deal with popping the VLAN header in + * hardware on each egress port as appropriate. So only strip the VLAN + * header if forwarding offload is not being used. + */ + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && + !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && diff --git a/net/dsa/port.c b/net/dsa/port.c index d81c283b7358..f2704f101ccf 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -257,7 +257,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, err = switchdev_bridge_port_offload(brport_dev, dev, dp, &dsa_slave_switchdev_notifier, &dsa_slave_switchdev_blocking_notifier, - extack); + false, extack); if (err) goto out_rollback_unbridge; From 5b22d3669f2fa6e762c5302fc4b6051a92b81617 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 22 Jul 2021 18:55:39 +0300 Subject: [PATCH 2/5] net: dsa: track the number of switches in a tree In preparation of supporting data plane forwarding on behalf of a software bridge, some drivers might need to view bridges as virtual switches behind the CPU port in a cross-chip topology. Give them some help and let them know how many physical switches there are in the tree, so that they can count the virtual switches starting from that number on. Note that the first dsa_switch_ops method where this information is reliably available is .setup(). This is because of how DSA works: in a tree with 3 switches, each calling dsa_register_switch(), the first 2 will advance until dsa_tree_setup() -> dsa_tree_setup_routing_table() and exit with error code 0 because the topology is not complete. Since probing is parallel at this point, one switch does not know about the existence of the other. Then the third switch comes, and for it, dsa_tree_setup_routing_table() returns complete = true. This switch goes ahead and calls dsa_tree_setup_switches() for everybody else, calling their .setup() methods too. This acts as the synchronization point. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ net/dsa/dsa2.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/include/net/dsa.h b/include/net/dsa.h index 9e5593885357..929bcaec4d41 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -159,6 +159,9 @@ struct dsa_switch_tree { */ struct net_device **lags; unsigned int lags_len; + + /* Track the largest switch index within a tree */ + unsigned int last_switch; }; #define dsa_lags_foreach_id(_id, _dst) \ diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 185629f27f80..de5e93ba2a9d 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1265,6 +1265,9 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return -EEXIST; } + if (ds->dst->last_switch < ds->index) + ds->dst->last_switch = ds->index; + return 0; } From 123abc06e74f49d9b173a93cb2b797fb85f50ba3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 22 Jul 2021 18:55:40 +0300 Subject: [PATCH 3/5] net: dsa: add support for bridge TX forwarding offload For a DSA switch, to offload the forwarding process of a bridge device means to send the packets coming from the software bridge as data plane packets. This is contrary to everything that DSA has done so far, because the current taggers only know to send control packets (ones that target a specific destination port), whereas data plane packets are supposed to be forwarded according to the FDB lookup, much like packets ingressing on any regular ingress port. If the FDB lookup process returns multiple destination ports (flooding, multicast), then replication is also handled by the switch hardware - the bridge only sends a single packet and avoids the skb_clone(). DSA keeps for each bridge port a zero-based index (the number of the bridge). Multiple ports performing TX forwarding offload to the same bridge have the same dp->bridge_num value, and ports not offloading the TX data plane of a bridge have dp->bridge_num = -1. The tagger can check if the packet that is being transmitted on has skb->offload_fwd_mark = true or not. If it does, it can be sure that the packet belongs to the data plane of a bridge, further information about which can be obtained based on dp->bridge_dev and dp->bridge_num. It can then compose a DSA tag for injecting a data plane packet into that bridge number. For the switch driver side, we offer two new dsa_switch_ops methods, called .port_bridge_fwd_offload_{add,del}, which are modeled after .port_bridge_{join,leave}. These methods are provided in case the driver needs to configure the hardware to treat packets coming from that bridge software interface as data plane packets. The switchdev <-> bridge interaction happens during the netdev_master_upper_dev_link() call, so to switch drivers, the effect is that the .port_bridge_fwd_offload_add() method is called immediately after .port_bridge_join(). If the bridge number exceeds the number of bridges for which the switch driver can offload the TX data plane (and this includes the case where the driver can offload none), DSA falls back to simply returning tx_fwd_offload = false in the switchdev_bridge_port_offload() call. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 18 ++++++++++ net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 2 ++ net/dsa/port.c | 84 +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index 929bcaec4d41..f8eb2dc3fbef 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -162,6 +162,9 @@ struct dsa_switch_tree { /* Track the largest switch index within a tree */ unsigned int last_switch; + + /* Track the bridges with forwarding offload enabled */ + unsigned long fwd_offloading_bridges; }; #define dsa_lags_foreach_id(_id, _dst) \ @@ -262,6 +265,7 @@ struct dsa_port { bool vlan_filtering; u8 stp_state; struct net_device *bridge_dev; + int bridge_num; struct devlink_port devlink_port; bool devlink_port_setup; struct phylink *pl; @@ -413,6 +417,12 @@ struct dsa_switch { */ unsigned int num_lag_ids; + /* Drivers that support bridge forwarding offload should set this to + * the maximum number of bridges spanning the same switch tree that can + * be offloaded. + */ + unsigned int num_fwd_offloading_bridges; + size_t num_ports; }; @@ -696,6 +706,14 @@ struct dsa_switch_ops { struct net_device *bridge); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct net_device *bridge); + /* Called right after .port_bridge_join() */ + int (*port_bridge_tx_fwd_offload)(struct dsa_switch *ds, int port, + struct net_device *bridge, + int bridge_num); + /* Called right before .port_bridge_leave() */ + void (*port_bridge_tx_fwd_unoffload)(struct dsa_switch *ds, int port, + struct net_device *bridge, + int bridge_num); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); void (*port_fast_age)(struct dsa_switch *ds, int port); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index de5e93ba2a9d..c7fa85fb3086 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1044,6 +1044,7 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) dp->ds = ds; dp->index = index; + dp->bridge_num = -1; INIT_LIST_HEAD(&dp->list); list_add_tail(&dp->list, &dst->ports); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 78c70f5bdab5..b1d9aa4d313c 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -14,6 +14,8 @@ #include #include +#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG + enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, diff --git a/net/dsa/port.c b/net/dsa/port.c index f2704f101ccf..7b9bf45a76b6 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -230,6 +230,83 @@ static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp) */ } +static int dsa_tree_find_bridge_num(struct dsa_switch_tree *dst, + struct net_device *bridge_dev) +{ + struct dsa_port *dp; + + /* When preparing the offload for a port, it will have a valid + * dp->bridge_dev pointer but a not yet valid dp->bridge_num. + * However there might be other ports having the same dp->bridge_dev + * and a valid dp->bridge_num, so just ignore this port. + */ + list_for_each_entry(dp, &dst->ports, list) + if (dp->bridge_dev == bridge_dev && dp->bridge_num != -1) + return dp->bridge_num; + + return -1; +} + +static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst = dp->ds->dst; + int bridge_num = dp->bridge_num; + struct dsa_switch *ds = dp->ds; + + /* No bridge TX forwarding offload => do nothing */ + if (!ds->ops->port_bridge_tx_fwd_unoffload || dp->bridge_num == -1) + return; + + dp->bridge_num = -1; + + /* Check if the bridge is still in use, otherwise it is time + * to clean it up so we can reuse this bridge_num later. + */ + if (!dsa_tree_find_bridge_num(dst, bridge_dev)) + clear_bit(bridge_num, &dst->fwd_offloading_bridges); + + /* Notify the chips only once the offload has been deactivated, so + * that they can update their configuration accordingly. + */ + ds->ops->port_bridge_tx_fwd_unoffload(ds, dp->index, bridge_dev, + bridge_num); +} + +static bool dsa_port_bridge_tx_fwd_offload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst = dp->ds->dst; + struct dsa_switch *ds = dp->ds; + int bridge_num, err; + + if (!ds->ops->port_bridge_tx_fwd_offload) + return false; + + bridge_num = dsa_tree_find_bridge_num(dst, bridge_dev); + if (bridge_num < 0) { + /* First port that offloads TX forwarding for this bridge */ + bridge_num = find_first_zero_bit(&dst->fwd_offloading_bridges, + DSA_MAX_NUM_OFFLOADING_BRIDGES); + if (bridge_num >= ds->num_fwd_offloading_bridges) + return false; + + set_bit(bridge_num, &dst->fwd_offloading_bridges); + } + + dp->bridge_num = bridge_num; + + /* Notify the driver */ + err = ds->ops->port_bridge_tx_fwd_offload(ds, dp->index, bridge_dev, + bridge_num); + if (err) { + dsa_port_bridge_tx_fwd_unoffload(dp, bridge_dev); + return false; + } + + return true; +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack) { @@ -241,6 +318,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, }; struct net_device *dev = dp->slave; struct net_device *brport_dev; + bool tx_fwd_offload; int err; /* Here the interface is already bridged. Reflect the current @@ -254,10 +332,12 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, if (err) goto out_rollback; + tx_fwd_offload = dsa_port_bridge_tx_fwd_offload(dp, br); + err = switchdev_bridge_port_offload(brport_dev, dev, dp, &dsa_slave_switchdev_notifier, &dsa_slave_switchdev_blocking_notifier, - false, extack); + tx_fwd_offload, extack); if (err) goto out_rollback_unbridge; @@ -302,6 +382,8 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; + dsa_port_bridge_tx_fwd_unoffload(dp, br); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); From ce5df6894a5752676a015dfe342f25753971c02f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 22 Jul 2021 18:55:41 +0300 Subject: [PATCH 4/5] net: dsa: mv88e6xxx: map virtual bridges with forwarding offload in the PVT The mv88e6xxx switches have the ability to receive FORWARD (data plane) frames from the CPU port and route them according to the FDB. We can use this to offload the forwarding process of packets sent by the software bridge. Because DSA supports bridge domain isolation between user ports, just sending FORWARD frames is not enough, as they might leak the intended broadcast domain of the bridge on behalf of which the packets are sent. It should be noted that FORWARD frames are also (and typically) used to forward data plane packets on DSA links in cross-chip topologies. The FORWARD frame header contains the source port and switch ID, and switches receiving this frame header forward the packet according to their cross-chip port-based VLAN table (PVT). To address the bridging domain isolation in the context of offloading the forwarding on TX, the idea is that we can reuse the parts of the PVT that don't have any physical switch mapped to them, one entry for each software bridge. The switches will therefore think that behind their upstream port lie many switches, all in fact backed up by software bridges through tag_dsa.c, which constructs FORWARD packets with the right switch ID corresponding to each bridge. The mapping we use is absolutely trivial: DSA gives us a unique bridge number, and we add the number of the physical switches in the DSA switch tree to that, to obtain a unique virtual bridge device number to use in the PVT. Co-developed-by: Tobias Waldekranz Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 78 ++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index beb41572d04e..af764b8445b7 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1221,14 +1221,36 @@ static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port) bool found = false; u16 pvlan; - list_for_each_entry(dp, &dst->ports, list) { - if (dp->ds->index == dev && dp->index == port) { + /* dev is a physical switch */ + if (dev <= dst->last_switch) { + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->index == dev && dp->index == port) { + /* dp might be a DSA link or a user port, so it + * might or might not have a bridge_dev + * pointer. Use the "found" variable for both + * cases. + */ + br = dp->bridge_dev; + found = true; + break; + } + } + /* dev is a virtual bridge */ + } else { + list_for_each_entry(dp, &dst->ports, list) { + if (dp->bridge_num < 0) + continue; + + if (dp->bridge_num + 1 + dst->last_switch != dev) + continue; + + br = dp->bridge_dev; found = true; break; } } - /* Prevent frames from unknown switch or port */ + /* Prevent frames from unknown switch or virtual bridge */ if (!found) return 0; @@ -1236,7 +1258,6 @@ static u16 mv88e6xxx_port_vlan(struct mv88e6xxx_chip *chip, int dev, int port) if (dp->type == DSA_PORT_TYPE_CPU || dp->type == DSA_PORT_TYPE_DSA) return mv88e6xxx_port_mask(chip); - br = dp->bridge_dev; pvlan = 0; /* Frames from user ports can egress any local DSA links and CPU ports, @@ -2422,6 +2443,44 @@ static void mv88e6xxx_crosschip_bridge_leave(struct dsa_switch *ds, mv88e6xxx_reg_unlock(chip); } +/* Treat the software bridge as a virtual single-port switch behind the + * CPU and map in the PVT. First dst->last_switch elements are taken by + * physical switches, so start from beyond that range. + */ +static int mv88e6xxx_map_virtual_bridge_to_pvt(struct dsa_switch *ds, + int bridge_num) +{ + u8 dev = bridge_num + ds->dst->last_switch + 1; + struct mv88e6xxx_chip *chip = ds->priv; + int err; + + mv88e6xxx_reg_lock(chip); + err = mv88e6xxx_pvt_map(chip, dev, 0); + mv88e6xxx_reg_unlock(chip); + + return err; +} + +static int mv88e6xxx_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + return mv88e6xxx_map_virtual_bridge_to_pvt(ds, bridge_num); +} + +static void mv88e6xxx_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + int err; + + err = mv88e6xxx_map_virtual_bridge_to_pvt(ds, bridge_num); + if (err) { + dev_err(ds->dev, "failed to remap cross-chip Port VLAN: %pe\n", + ERR_PTR(err)); + } +} + static int mv88e6xxx_software_reset(struct mv88e6xxx_chip *chip) { if (chip->info->ops->reset) @@ -3025,6 +3084,15 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) chip->ds = ds; ds->slave_mii_bus = mv88e6xxx_default_mdio_bus(chip); + /* Since virtual bridges are mapped in the PVT, the number we support + * depends on the physical switch topology. We need to let DSA figure + * that out and therefore we cannot set this at dsa_register_switch() + * time. + */ + if (mv88e6xxx_has_pvt(chip)) + ds->num_fwd_offloading_bridges = MV88E6XXX_MAX_PVT_SWITCHES - + ds->dst->last_switch - 1; + mv88e6xxx_reg_lock(chip); if (chip->info->ops->setup_errata) { @@ -6128,6 +6196,8 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .crosschip_lag_change = mv88e6xxx_crosschip_lag_change, .crosschip_lag_join = mv88e6xxx_crosschip_lag_join, .crosschip_lag_leave = mv88e6xxx_crosschip_lag_leave, + .port_bridge_tx_fwd_offload = mv88e6xxx_bridge_tx_fwd_offload, + .port_bridge_tx_fwd_unoffload = mv88e6xxx_bridge_tx_fwd_unoffload, }; static int mv88e6xxx_register_switch(struct mv88e6xxx_chip *chip) From d82f8ab0d874b1f2ca45cb9b3b65aaa2638760f6 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 22 Jul 2021 18:55:42 +0300 Subject: [PATCH 5/5] net: dsa: tag_dsa: offload the bridge forwarding process Allow the DSA tagger to generate FORWARD frames for offloaded skbs sent from a bridge that we offload, allowing the switch to handle any frame replication that may be required. This also means that source address learning takes place on packets sent from the CPU, meaning that return traffic no longer needs to be flooded as unknown unicast. Signed-off-by: Tobias Waldekranz Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/dsa/tag_dsa.c | 52 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 8 deletions(-) diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index a822355afc90..0f258218c8cf 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -126,7 +126,42 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); + u8 tag_dev, tag_port; + enum dsa_cmd cmd; u8 *dsa_header; + u16 pvid = 0; + int err; + + if (skb->offload_fwd_mark) { + struct dsa_switch_tree *dst = dp->ds->dst; + struct net_device *br = dp->bridge_dev; + + cmd = DSA_CMD_FORWARD; + + /* When offloading forwarding for a bridge, inject FORWARD + * packets on behalf of a virtual switch device with an index + * past the physical switches. + */ + tag_dev = dst->last_switch + 1 + dp->bridge_num; + tag_port = 0; + + /* If we are offloading forwarding for a VLAN-unaware bridge, + * inject packets to hardware using the bridge's pvid, since + * that's where the packets ingressed from. + */ + if (!br_vlan_enabled(br)) { + /* Safe because __dev_queue_xmit() runs under + * rcu_read_lock_bh() + */ + err = br_vlan_get_pvid_rcu(br, &pvid); + if (err) + return NULL; + } + } else { + cmd = DSA_CMD_FROM_CPU; + tag_dev = dp->ds->index; + tag_port = dp->index; + } if (skb->protocol == htons(ETH_P_8021Q)) { if (extra) { @@ -134,10 +169,10 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, memmove(skb->data, skb->data + extra, 2 * ETH_ALEN); } - /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. */ + /* Construct tagged DSA tag from 802.1Q tag. */ dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index; - dsa_header[1] = dp->index << 3; + dsa_header[0] = (cmd << 6) | 0x20 | tag_dev; + dsa_header[1] = tag_port << 3; /* Move CFI field from byte 2 to byte 1. */ if (dsa_header[2] & 0x10) { @@ -148,12 +183,13 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, skb_push(skb, DSA_HLEN + extra); memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN); - /* Construct untagged FROM_CPU DSA tag. */ + /* Construct untagged DSA tag. */ dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index; - dsa_header[1] = dp->index << 3; - dsa_header[2] = 0x00; - dsa_header[3] = 0x00; + + dsa_header[0] = (cmd << 6) | tag_dev; + dsa_header[1] = tag_port << 3; + dsa_header[2] = pvid >> 8; + dsa_header[3] = pvid & 0xff; } return skb;