From 8fb37be120469a4bd729be833d32010d4bba4e41 Mon Sep 17 00:00:00 2001 From: Brian King Date: Mon, 15 Jul 2019 16:41:50 -0500 Subject: [PATCH 01/52] bnx2x: Prevent load reordering in tx completion processing [ Upstream commit ea811b795df24644a8eb760b493c43fba4450677 ] This patch fixes an issue seen on Power systems with bnx2x which results in the skb is NULL WARN_ON in bnx2x_free_tx_pkt firing due to the skb pointer getting loaded in bnx2x_free_tx_pkt prior to the hw_cons load in bnx2x_tx_int. Adding a read memory barrier resolves the issue. Signed-off-by: Brian King Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index e3ce29951c5e..3edb81a4f075 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -286,6 +286,9 @@ int bnx2x_tx_int(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata) hw_cons = le16_to_cpu(*txdata->tx_cons_sb); sw_cons = txdata->tx_pkt_cons; + /* Ensure subsequent loads occur after hw_cons */ + smp_rmb(); + while (sw_cons != hw_cons) { u16 pkt_cons; From d7cdac6dc4180d89e6289192c3334fa59acd3980 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Mon, 15 Jul 2019 14:10:17 +0900 Subject: [PATCH 02/52] caif-hsi: fix possible deadlock in cfhsi_exit_module() [ Upstream commit fdd258d49e88a9e0b49ef04a506a796f1c768a8e ] cfhsi_exit_module() calls unregister_netdev() under rtnl_lock(). but unregister_netdev() internally calls rtnl_lock(). So deadlock would occur. Fixes: c41254006377 ("caif-hsi: Add rtnl support") Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/caif/caif_hsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c index 433a14b9f731..253a1bbe37e8 100644 --- a/drivers/net/caif/caif_hsi.c +++ b/drivers/net/caif/caif_hsi.c @@ -1455,7 +1455,7 @@ static void __exit cfhsi_exit_module(void) rtnl_lock(); list_for_each_safe(list_node, n, &cfhsi_list) { cfhsi = list_entry(list_node, struct cfhsi, list); - unregister_netdev(cfhsi->ndev); + unregister_netdevice(cfhsi->ndev); } rtnl_unlock(); } From 9770fe1b202f423e69e415a695c46c6a1a2a9a02 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 19 Jul 2019 17:33:51 +0000 Subject: [PATCH 03/52] hv_netvsc: Fix extra rcu_read_unlock in netvsc_recv_callback() [ Upstream commit be4363bdf0ce9530f15aa0a03d1060304d116b15 ] There is an extra rcu_read_unlock left in netvsc_recv_callback(), after a previous patch that removes RCU from this function. This patch removes the extra RCU unlock. Fixes: 345ac08990b8 ("hv_netvsc: pass netvsc_device to receive callback") Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/hyperv/netvsc_drv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index cf6b9b1771f1..cc60ef9634db 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -847,7 +847,6 @@ int netvsc_recv_callback(struct net_device *net, csum_info, vlan, data, len); if (unlikely(!skb)) { ++net_device_ctx->eth_stats.rx_no_memory; - rcu_read_unlock(); return NVSP_STAT_FAIL; } From aee5dd00341b71fe3cec964a9661c578bf46ee52 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Jun 2019 01:27:01 -0700 Subject: [PATCH 04/52] igmp: fix memory leak in igmpv3_del_delrec() [ Upstream commit e5b1c6c6277d5a283290a8c033c72544746f9b5b ] im->tomb and/or im->sources might not be NULL, but we currently overwrite their values blindly. Using swap() will make sure the following call to kfree_pmc(pmc) will properly free the psf structures. Tested with the C repro provided by syzbot, which basically does : socket(PF_INET, SOCK_DGRAM, IPPROTO_IP) = 3 setsockopt(3, SOL_IP, IP_ADD_MEMBERSHIP, "\340\0\0\2\177\0\0\1\0\0\0\0", 12) = 0 ioctl(3, SIOCSIFFLAGS, {ifr_name="lo", ifr_flags=0}) = 0 setsockopt(3, SOL_IP, IP_MSFILTER, "\340\0\0\2\177\0\0\1\1\0\0\0\1\0\0\0\377\377\377\377", 20) = 0 ioctl(3, SIOCSIFFLAGS, {ifr_name="lo", ifr_flags=IFF_UP}) = 0 exit_group(0) = ? BUG: memory leak unreferenced object 0xffff88811450f140 (size 64): comm "softirq", pid 0, jiffies 4294942448 (age 32.070s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 ff ff ff ff 00 00 00 00 ................ 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ backtrace: [<00000000c7bad083>] kmemleak_alloc_recursive include/linux/kmemleak.h:43 [inline] [<00000000c7bad083>] slab_post_alloc_hook mm/slab.h:439 [inline] [<00000000c7bad083>] slab_alloc mm/slab.c:3326 [inline] [<00000000c7bad083>] kmem_cache_alloc_trace+0x13d/0x280 mm/slab.c:3553 [<000000009acc4151>] kmalloc include/linux/slab.h:547 [inline] [<000000009acc4151>] kzalloc include/linux/slab.h:742 [inline] [<000000009acc4151>] ip_mc_add1_src net/ipv4/igmp.c:1976 [inline] [<000000009acc4151>] ip_mc_add_src+0x36b/0x400 net/ipv4/igmp.c:2100 [<000000004ac14566>] ip_mc_msfilter+0x22d/0x310 net/ipv4/igmp.c:2484 [<0000000052d8f995>] do_ip_setsockopt.isra.0+0x1795/0x1930 net/ipv4/ip_sockglue.c:959 [<000000004ee1e21f>] ip_setsockopt+0x3b/0xb0 net/ipv4/ip_sockglue.c:1248 [<0000000066cdfe74>] udp_setsockopt+0x4e/0x90 net/ipv4/udp.c:2618 [<000000009383a786>] sock_common_setsockopt+0x38/0x50 net/core/sock.c:3126 [<00000000d8ac0c94>] __sys_setsockopt+0x98/0x120 net/socket.c:2072 [<000000001b1e9666>] __do_sys_setsockopt net/socket.c:2083 [inline] [<000000001b1e9666>] __se_sys_setsockopt net/socket.c:2080 [inline] [<000000001b1e9666>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2080 [<00000000420d395e>] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:301 [<000000007fd83a4b>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 24803f38a5c0 ("igmp: do not remove igmp souce list info when set link down") Signed-off-by: Eric Dumazet Cc: Hangbin Liu Reported-by: syzbot+6ca1abd0db68b5173a4f@syzkaller.appspotmail.com Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/igmp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d187ee8156a1..b2240b7f225d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1218,12 +1218,8 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im) if (pmc) { im->interface = pmc->interface; if (im->sfmode == MCAST_INCLUDE) { - im->tomb = pmc->tomb; - pmc->tomb = NULL; - - im->sources = pmc->sources; - pmc->sources = NULL; - + swap(im->tomb, pmc->tomb); + swap(im->sources, pmc->sources); for (psf = im->sources; psf; psf = psf->sf_next) psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; } else { From 47ce442783d76f3a779b5a9c429fb7cb263c6018 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 1 Jul 2019 19:01:55 +0200 Subject: [PATCH 05/52] ipv4: don't set IPv6 only flags to IPv4 addresses [ Upstream commit 2e60546368165c2449564d71f6005dda9205b5fb ] Avoid the situation where an IPV6 only flag is applied to an IPv4 address: # ip addr add 192.0.2.1/24 dev dummy0 nodad home mngtmpaddr noprefixroute # ip -4 addr show dev dummy0 2: dummy0: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 inet 192.0.2.1/24 scope global noprefixroute dummy0 valid_lft forever preferred_lft forever Or worse, by sending a malicious netlink command: # ip -4 addr show dev dummy0 2: dummy0: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 inet 192.0.2.1/24 scope global nodad optimistic dadfailed home tentative mngtmpaddr noprefixroute stable-privacy dummy0 valid_lft forever preferred_lft forever Signed-off-by: Matteo Croce Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/devinet.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ea4bd8a52422..d23746143cd2 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -66,6 +66,11 @@ #include #include +#define IPV6ONLY_FLAGS \ + (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ + IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ + IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) + static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, @@ -462,6 +467,9 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; + /* Don't set IPv6 only flags to IPv4 addresses */ + ifa->ifa_flags &= ~IPV6ONLY_FLAGS; + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; ifap = &ifa1->ifa_next) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && From 0bd84505f16f43e588d91ec8e0834faa5d42e8b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 17 Jul 2019 15:08:43 -0700 Subject: [PATCH 06/52] ipv6: rt6_check should return NULL if 'from' is NULL [ Upstream commit 49d05fe2c9d1b4a27761c9807fec39b8155bef9e ] Paul reported that l2tp sessions were broken after the commit referenced in the Fixes tag. Prior to this commit rt6_check returned NULL if the rt6_info 'from' was NULL - ie., the dst_entry was disconnected from a FIB entry. Restore that behavior. Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Reported-by: Paul Donohue Tested-by: Paul Donohue Signed-off-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 24f7b2cf504b..81220077d62f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2214,7 +2214,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, { u32 rt_cookie = 0; - if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) || + if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || rt_cookie != cookie) return NULL; From c0f4a6447977b165354940db4c0f2e14a54654dc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 17 Jul 2019 23:39:33 +0300 Subject: [PATCH 07/52] ipv6: Unlink sibling route in case of failure [ Upstream commit 54851aa90cf27041d64b12f65ac72e9f97bd90fd ] When a route needs to be appended to an existing multipath route, fib6_add_rt2node() first appends it to the siblings list and increments the number of sibling routes on each sibling. Later, the function notifies the route via call_fib6_entry_notifiers(). In case the notification is vetoed, the route is not unlinked from the siblings list, which can result in a use-after-free. Fix this by unlinking the route from the siblings list before returning an error. Audited the rest of the call sites from which the FIB notification chain is called and could not find more problems. Fixes: 2233000cba40 ("net/ipv6: Move call_fib6_entry_notifiers up for route adds") Signed-off-by: Ido Schimmel Reported-by: Alexander Petrovskiy Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index a6c0479c1d55..bbb5ffb3397d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1081,8 +1081,24 @@ add: err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt, extack); - if (err) + if (err) { + struct fib6_info *sibling, *next_sibling; + + /* If the route has siblings, then it first + * needs to be unlinked from them. + */ + if (!rt->fib6_nsiblings) + return err; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->fib6_siblings, + fib6_siblings) + sibling->fib6_nsiblings--; + rt->fib6_nsiblings = 0; + list_del_init(&rt->fib6_siblings); + rt6_multipath_rebalance(next_sibling); return err; + } rcu_assign_pointer(rt->fib6_next, iter); atomic_inc(&rt->fib6_ref); From 5832ef4afd90ef5dda9960b56fa3b8092530e0fb Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Wed, 17 Jul 2019 14:58:53 -0700 Subject: [PATCH 08/52] net: bcmgenet: use promisc for unsupported filters [ Upstream commit 35cbef9863640f06107144687bd13151bc2e8ce3 ] Currently we silently ignore filters if we cannot meet the filter requirements. This will lead to the MAC dropping packets that are expected to pass. A better solution would be to set the NIC to promisc mode when the required filters cannot be met. Also correct the number of MDF filters supported. It should be 17, not 16. Signed-off-by: Justin Chen Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/broadcom/genet/bcmgenet.c | 57 +++++++++---------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index 2d6f090bf644..fd587bed32eb 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -3086,39 +3086,42 @@ static void bcmgenet_timeout(struct net_device *dev) netif_tx_wake_all_queues(dev); } -#define MAX_MC_COUNT 16 +#define MAX_MDF_FILTER 17 static inline void bcmgenet_set_mdf_addr(struct bcmgenet_priv *priv, unsigned char *addr, - int *i, - int *mc) + int *i) { - u32 reg; - bcmgenet_umac_writel(priv, addr[0] << 8 | addr[1], UMAC_MDF_ADDR + (*i * 4)); bcmgenet_umac_writel(priv, addr[2] << 24 | addr[3] << 16 | addr[4] << 8 | addr[5], UMAC_MDF_ADDR + ((*i + 1) * 4)); - reg = bcmgenet_umac_readl(priv, UMAC_MDF_CTRL); - reg |= (1 << (MAX_MC_COUNT - *mc)); - bcmgenet_umac_writel(priv, reg, UMAC_MDF_CTRL); *i += 2; - (*mc)++; } static void bcmgenet_set_rx_mode(struct net_device *dev) { struct bcmgenet_priv *priv = netdev_priv(dev); struct netdev_hw_addr *ha; - int i, mc; + int i, nfilter; u32 reg; netif_dbg(priv, hw, dev, "%s: %08X\n", __func__, dev->flags); - /* Promiscuous mode */ + /* Number of filters needed */ + nfilter = netdev_uc_count(dev) + netdev_mc_count(dev) + 2; + + /* + * Turn on promicuous mode for three scenarios + * 1. IFF_PROMISC flag is set + * 2. IFF_ALLMULTI flag is set + * 3. The number of filters needed exceeds the number filters + * supported by the hardware. + */ reg = bcmgenet_umac_readl(priv, UMAC_CMD); - if (dev->flags & IFF_PROMISC) { + if ((dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) || + (nfilter > MAX_MDF_FILTER)) { reg |= CMD_PROMISC; bcmgenet_umac_writel(priv, reg, UMAC_CMD); bcmgenet_umac_writel(priv, 0, UMAC_MDF_CTRL); @@ -3128,32 +3131,24 @@ static void bcmgenet_set_rx_mode(struct net_device *dev) bcmgenet_umac_writel(priv, reg, UMAC_CMD); } - /* UniMac doesn't support ALLMULTI */ - if (dev->flags & IFF_ALLMULTI) { - netdev_warn(dev, "ALLMULTI is not supported\n"); - return; - } - /* update MDF filter */ i = 0; - mc = 0; /* Broadcast */ - bcmgenet_set_mdf_addr(priv, dev->broadcast, &i, &mc); + bcmgenet_set_mdf_addr(priv, dev->broadcast, &i); /* my own address.*/ - bcmgenet_set_mdf_addr(priv, dev->dev_addr, &i, &mc); - /* Unicast list*/ - if (netdev_uc_count(dev) > (MAX_MC_COUNT - mc)) - return; + bcmgenet_set_mdf_addr(priv, dev->dev_addr, &i); + + /* Unicast */ + netdev_for_each_uc_addr(ha, dev) + bcmgenet_set_mdf_addr(priv, ha->addr, &i); - if (!netdev_uc_empty(dev)) - netdev_for_each_uc_addr(ha, dev) - bcmgenet_set_mdf_addr(priv, ha->addr, &i, &mc); /* Multicast */ - if (netdev_mc_empty(dev) || netdev_mc_count(dev) >= (MAX_MC_COUNT - mc)) - return; - netdev_for_each_mc_addr(ha, dev) - bcmgenet_set_mdf_addr(priv, ha->addr, &i, &mc); + bcmgenet_set_mdf_addr(priv, ha->addr, &i); + + /* Enable filters */ + reg = GENMASK(MAX_MDF_FILTER - 1, MAX_MDF_FILTER - nfilter); + bcmgenet_umac_writel(priv, reg, UMAC_MDF_CTRL); } /* Set the hardware MAC address. */ From 6ab30a4cc5c621248b397a5fa60f6be456940969 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 27 Jun 2019 21:17:39 +0300 Subject: [PATCH 09/52] net: dsa: mv88e6xxx: wait after reset deactivation [ Upstream commit 7b75e49de424ceb53d13e60f35d0a73765626fda ] Add a 1ms delay after reset deactivation. Otherwise the chip returns bogus ID value. This is observed with 88E6390 (Peridot) chip. Signed-off-by: Baruch Siach Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/dsa/mv88e6xxx/chip.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 411cfb806459..703e6bdaf0e1 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -4816,6 +4816,8 @@ static int mv88e6xxx_probe(struct mdio_device *mdiodev) err = PTR_ERR(chip->reset); goto out; } + if (chip->reset) + usleep_range(1000, 2000); err = mv88e6xxx_detect(chip); if (err) From 832d0ea751a8c511d719c3ede60db05981ebc7d4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Jun 2019 20:40:45 +0200 Subject: [PATCH 10/52] net: make skb_dst_force return true when dst is refcounted [ Upstream commit b60a77386b1d4868f72f6353d35dabe5fbe981f2 ] netfilter did not expect that skb_dst_force() can cause skb to lose its dst entry. I got a bug report with a skb->dst NULL dereference in netfilter output path. The backtrace contains nf_reinject(), so the dst might have been cleared when skb got queued to userspace. Other users were fixed via if (skb_dst(skb)) { skb_dst_force(skb); if (!skb_dst(skb)) goto handle_err; } But I think its preferable to make the 'dst might be cleared' part of the function explicit. In netfilter case, skb with a null dst is expected when queueing in prerouting hook, so drop skb for the other hooks. v2: v1 of this patch returned true in case skb had no dst entry. Eric said: Say if we have two skb_dst_force() calls for some reason on the same skb, only the first one will return false. This now returns false even when skb had no dst, as per Erics suggestion, so callers might need to check skb_dst() first before skb_dst_force(). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/dst.h | 5 ++++- net/netfilter/nf_queue.c | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 6cf0870414c7..ffc8ee0ea5e5 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -313,8 +313,9 @@ static inline bool dst_hold_safe(struct dst_entry *dst) * @skb: buffer * * If dst is not yet refcounted and not destroyed, grab a ref on it. + * Returns true if dst is refcounted. */ -static inline void skb_dst_force(struct sk_buff *skb) +static inline bool skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); @@ -325,6 +326,8 @@ static inline void skb_dst_force(struct sk_buff *skb) skb->_skb_refdst = (unsigned long)dst; } + + return skb->_skb_refdst != 0UL; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 7569ba00e732..a96a8c16baf9 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -174,6 +174,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, goto err; } + if (!skb_dst_force(skb) && state->hook != NF_INET_PRE_ROUTING) { + status = -ENETDOWN; + goto err; + } + *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, @@ -182,7 +187,6 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, }; nf_queue_entry_get_refs(entry); - skb_dst_force(skb); switch (entry->state.pf) { case AF_INET: From 257441a072010d880a79048cd3158cde32ff8258 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 14 Jul 2019 23:36:11 +0200 Subject: [PATCH 11/52] net: neigh: fix multiple neigh timer scheduling [ Upstream commit 071c37983d99da07797294ea78e9da1a6e287144 ] Neigh timer can be scheduled multiple times from userspace adding multiple neigh entries and forcing the neigh timer scheduling passing NTF_USE in the netlink requests. This will result in a refcount leak and in the following dump stack: [ 32.465295] NEIGH: BUG, double timer add, state is 8 [ 32.465308] CPU: 0 PID: 416 Comm: double_timer_ad Not tainted 5.2.0+ #65 [ 32.465311] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-2.fc30 04/01/2014 [ 32.465313] Call Trace: [ 32.465318] dump_stack+0x7c/0xc0 [ 32.465323] __neigh_event_send+0x20c/0x880 [ 32.465326] ? ___neigh_create+0x846/0xfb0 [ 32.465329] ? neigh_lookup+0x2a9/0x410 [ 32.465332] ? neightbl_fill_info.constprop.0+0x800/0x800 [ 32.465334] neigh_add+0x4f8/0x5e0 [ 32.465337] ? neigh_xmit+0x620/0x620 [ 32.465341] ? find_held_lock+0x85/0xa0 [ 32.465345] rtnetlink_rcv_msg+0x204/0x570 [ 32.465348] ? rtnl_dellink+0x450/0x450 [ 32.465351] ? mark_held_locks+0x90/0x90 [ 32.465354] ? match_held_lock+0x1b/0x230 [ 32.465357] netlink_rcv_skb+0xc4/0x1d0 [ 32.465360] ? rtnl_dellink+0x450/0x450 [ 32.465363] ? netlink_ack+0x420/0x420 [ 32.465366] ? netlink_deliver_tap+0x115/0x560 [ 32.465369] ? __alloc_skb+0xc9/0x2f0 [ 32.465372] netlink_unicast+0x270/0x330 [ 32.465375] ? netlink_attachskb+0x2f0/0x2f0 [ 32.465378] netlink_sendmsg+0x34f/0x5a0 [ 32.465381] ? netlink_unicast+0x330/0x330 [ 32.465385] ? move_addr_to_kernel.part.0+0x20/0x20 [ 32.465388] ? netlink_unicast+0x330/0x330 [ 32.465391] sock_sendmsg+0x91/0xa0 [ 32.465394] ___sys_sendmsg+0x407/0x480 [ 32.465397] ? copy_msghdr_from_user+0x200/0x200 [ 32.465401] ? _raw_spin_unlock_irqrestore+0x37/0x40 [ 32.465404] ? lockdep_hardirqs_on+0x17d/0x250 [ 32.465407] ? __wake_up_common_lock+0xcb/0x110 [ 32.465410] ? __wake_up_common+0x230/0x230 [ 32.465413] ? netlink_bind+0x3e1/0x490 [ 32.465416] ? netlink_setsockopt+0x540/0x540 [ 32.465420] ? __fget_light+0x9c/0xf0 [ 32.465423] ? sockfd_lookup_light+0x8c/0xb0 [ 32.465426] __sys_sendmsg+0xa5/0x110 [ 32.465429] ? __ia32_sys_shutdown+0x30/0x30 [ 32.465432] ? __fd_install+0xe1/0x2c0 [ 32.465435] ? lockdep_hardirqs_off+0xb5/0x100 [ 32.465438] ? mark_held_locks+0x24/0x90 [ 32.465441] ? do_syscall_64+0xf/0x270 [ 32.465444] do_syscall_64+0x63/0x270 [ 32.465448] entry_SYSCALL_64_after_hwframe+0x49/0xbe Fix the issue unscheduling neigh_timer if selected entry is in 'IN_TIMER' receiving a netlink request with NTF_USE flag set Reported-by: Marek Majkowski Fixes: 0c5c2d308906 ("neigh: Allow for user space users of the neighbour table") Signed-off-by: Lorenzo Bianconi Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/neighbour.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index cd9e991f21d7..c52d6e6b341c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1021,6 +1021,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) atomic_set(&neigh->probes, NEIGH_VAR(neigh->parms, UCAST_PROBES)); + neigh_del_timer(neigh); neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), @@ -1037,6 +1038,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) } } else if (neigh->nud_state & NUD_STALE) { neigh_dbg(2, "neigh %p is delayed\n", neigh); + neigh_del_timer(neigh); neigh->nud_state = NUD_DELAY; neigh->updated = jiffies; neigh_add_timer(neigh, jiffies + From c60bce64615db987ab3f5cedff88cc7f4aa5ff53 Mon Sep 17 00:00:00 2001 From: John Hurley Date: Thu, 27 Jun 2019 14:37:30 +0100 Subject: [PATCH 12/52] net: openvswitch: fix csum updates for MPLS actions [ Upstream commit 0e3183cd2a64843a95b62f8bd4a83605a4cf0615 ] Skbs may have their checksum value populated by HW. If this is a checksum calculated over the entire packet then the CHECKSUM_COMPLETE field is marked. Changes to the data pointer on the skb throughout the network stack still try to maintain this complete csum value if it is required through functions such as skb_postpush_rcsum. The MPLS actions in Open vSwitch modify a CHECKSUM_COMPLETE value when changes are made to packet data without a push or a pull. This occurs when the ethertype of the MAC header is changed or when MPLS lse fields are modified. The modification is carried out using the csum_partial function to get the csum of a buffer and add it into the larger checksum. The buffer is an inversion of the data to be removed followed by the new data. Because the csum is calculated over 16 bits and these values align with 16 bits, the effect is the removal of the old value from the CHECKSUM_COMPLETE and addition of the new value. However, the csum fed into the function and the outcome of the calculation are also inverted. This would only make sense if it was the new value rather than the old that was inverted in the input buffer. Fix the issue by removing the bit inverts in the csum_partial calculation. The bug was verified and the fix tested by comparing the folded value of the updated CHECKSUM_COMPLETE value with the folded value of a full software checksum calculation (reset skb->csum to 0 and run skb_checksum_complete(skb)). Prior to the fix the outcomes differed but after they produce the same result. Fixes: 25cd9ba0abc0 ("openvswitch: Add basic MPLS support to kernel") Fixes: bc7cc5999fd3 ("openvswitch: update checksum in {push,pop}_mpls") Signed-off-by: John Hurley Reviewed-by: Jakub Kicinski Reviewed-by: Simon Horman Acked-by: Pravin B Shelar Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/openvswitch/actions.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 85ae53d8fd09..8211e8e97c96 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -175,8 +175,7 @@ static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, if (skb->ip_summed == CHECKSUM_COMPLETE) { __be16 diff[] = { ~(hdr->h_proto), ethertype }; - skb->csum = ~csum_partial((char *)diff, sizeof(diff), - ~skb->csum); + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } hdr->h_proto = ethertype; @@ -268,8 +267,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~(stack->label_stack_entry), lse }; - skb->csum = ~csum_partial((char *)diff, sizeof(diff), - ~skb->csum); + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); } stack->label_stack_entry = lse; From 201d7d62a82a01f6085adc08e9fa2c093f717d25 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 21 Jul 2019 18:50:08 +0200 Subject: [PATCH 13/52] net: phy: sfp: hwmon: Fix scaling of RX power [ Upstream commit 0cea0e1148fe134a4a3aaf0b1496f09241fb943a ] The RX power read from the SFP uses units of 0.1uW. This must be scaled to units of uW for HWMON. This requires a divide by 10, not the current 100. With this change in place, sensors(1) and ethtool -m agree: sff2-isa-0000 Adapter: ISA adapter in0: +3.23 V temp1: +33.1 C power1: 270.00 uW power2: 200.00 uW curr1: +0.01 A Laser output power : 0.2743 mW / -5.62 dBm Receiver signal average optical power : 0.2014 mW / -6.96 dBm Reported-by: chris.healy@zii.aero Signed-off-by: Andrew Lunn Fixes: 1323061a018a ("net: phy: sfp: Add HWMON support for module sensors") Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/sfp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 418522aa2f71..998d08ae7431 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -514,7 +514,7 @@ static int sfp_hwmon_read_sensor(struct sfp *sfp, int reg, long *value) static void sfp_hwmon_to_rx_power(long *value) { - *value = DIV_ROUND_CLOSEST(*value, 100); + *value = DIV_ROUND_CLOSEST(*value, 10); } static void sfp_hwmon_calibrate(struct sfp *sfp, unsigned int slope, int offset, From f47f68cc9d33572c3bee66eb86aa9db5dc612ddd Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 8 Jul 2019 14:26:28 +0200 Subject: [PATCH 14/52] net: stmmac: Re-work the queue selection for TSO packets [ Upstream commit 4993e5b37e8bcb55ac90f76eb6d2432647273747 ] Ben Hutchings says: "This is the wrong place to change the queue mapping. stmmac_xmit() is called with a specific TX queue locked, and accessing a different TX queue results in a data race for all of that queue's state. I think this commit should be reverted upstream and in all stable branches. Instead, the driver should implement the ndo_select_queue operation and override the queue mapping there." Fixes: c5acdbee22a1 ("net: stmmac: Send TSO packets always from Queue 0") Suggested-by: Ben Hutchings Signed-off-by: Jose Abreu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 5c18874614ba..0101ebaecf02 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3036,17 +3036,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev) /* Manage oversized TCP frames for GMAC4 device */ if (skb_is_gso(skb) && priv->tso) { - if (skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { - /* - * There is no way to determine the number of TSO - * capable Queues. Let's use always the Queue 0 - * because if TSO is supported then at least this - * one will be capable. - */ - skb_set_queue_mapping(skb, 0); - + if (skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) return stmmac_tso_xmit(skb, dev); - } } if (unlikely(stmmac_tx_avail(priv, queue) < nfrags + 1)) { @@ -3855,6 +3846,23 @@ static int stmmac_setup_tc(struct net_device *ndev, enum tc_setup_type type, } } +static u16 stmmac_select_queue(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev, + select_queue_fallback_t fallback) +{ + if (skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { + /* + * There is no way to determine the number of TSO + * capable Queues. Let's use always the Queue 0 + * because if TSO is supported then at least this + * one will be capable. + */ + return 0; + } + + return fallback(dev, skb, NULL) % dev->real_num_tx_queues; +} + static int stmmac_set_mac_address(struct net_device *ndev, void *addr) { struct stmmac_priv *priv = netdev_priv(ndev); @@ -4097,6 +4105,7 @@ static const struct net_device_ops stmmac_netdev_ops = { .ndo_tx_timeout = stmmac_tx_timeout, .ndo_do_ioctl = stmmac_ioctl, .ndo_setup_tc = stmmac_setup_tc, + .ndo_select_queue = stmmac_select_queue, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = stmmac_poll_controller, #endif From 97739e5c9e73b5cde1622c79b60fda12298779cd Mon Sep 17 00:00:00 2001 From: Yang Wei Date: Mon, 8 Jul 2019 22:57:39 +0800 Subject: [PATCH 15/52] nfc: fix potential illegal memory access [ Upstream commit dd006fc434e107ef90f7de0db9907cbc1c521645 ] The frags_q is not properly initialized, it may result in illegal memory access when conn_info is NULL. The "goto free_exit" should be replaced by "goto exit". Signed-off-by: Yang Wei Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/nfc/nci/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 908f25e3773e..5405d073804c 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -119,7 +119,7 @@ static int nci_queue_tx_data_frags(struct nci_dev *ndev, conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) { rc = -EPROTO; - goto free_exit; + goto exit; } __skb_queue_head_init(&frags_q); From 3e4e6b71ece09266e58242029813eb772dc7f535 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 13 Jul 2019 13:45:47 +0200 Subject: [PATCH 16/52] r8169: fix issue with confused RX unit after PHY power-down on RTL8411b [ Upstream commit fe4e8db0392a6c2e795eb89ef5fcd86522e66248 ] On RTL8411b the RX unit gets confused if the PHY is powered-down. This was reported in [0] and confirmed by Realtek. Realtek provided a sequence to fix the RX unit after PHY wakeup. The issue itself seems to have been there longer, the Fixes tag refers to where the fix applies properly. [0] https://bugzilla.redhat.com/show_bug.cgi?id=1692075 Fixes: a99790bf5c7f ("r8169: Reinstate ASPM Support") Tested-by: Ionut Radu Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/realtek/r8169.c | 137 +++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 7a50b911b180..a6992c4c7313 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -5202,6 +5202,143 @@ static void rtl_hw_start_8411_2(struct rtl8169_private *tp) /* disable aspm and clock request before access ephy */ rtl_hw_aspm_clkreq_enable(tp, false); rtl_ephy_init(tp, e_info_8411_2, ARRAY_SIZE(e_info_8411_2)); + + /* The following Realtek-provided magic fixes an issue with the RX unit + * getting confused after the PHY having been powered-down. + */ + r8168_mac_ocp_write(tp, 0xFC28, 0x0000); + r8168_mac_ocp_write(tp, 0xFC2A, 0x0000); + r8168_mac_ocp_write(tp, 0xFC2C, 0x0000); + r8168_mac_ocp_write(tp, 0xFC2E, 0x0000); + r8168_mac_ocp_write(tp, 0xFC30, 0x0000); + r8168_mac_ocp_write(tp, 0xFC32, 0x0000); + r8168_mac_ocp_write(tp, 0xFC34, 0x0000); + r8168_mac_ocp_write(tp, 0xFC36, 0x0000); + mdelay(3); + r8168_mac_ocp_write(tp, 0xFC26, 0x0000); + + r8168_mac_ocp_write(tp, 0xF800, 0xE008); + r8168_mac_ocp_write(tp, 0xF802, 0xE00A); + r8168_mac_ocp_write(tp, 0xF804, 0xE00C); + r8168_mac_ocp_write(tp, 0xF806, 0xE00E); + r8168_mac_ocp_write(tp, 0xF808, 0xE027); + r8168_mac_ocp_write(tp, 0xF80A, 0xE04F); + r8168_mac_ocp_write(tp, 0xF80C, 0xE05E); + r8168_mac_ocp_write(tp, 0xF80E, 0xE065); + r8168_mac_ocp_write(tp, 0xF810, 0xC602); + r8168_mac_ocp_write(tp, 0xF812, 0xBE00); + r8168_mac_ocp_write(tp, 0xF814, 0x0000); + r8168_mac_ocp_write(tp, 0xF816, 0xC502); + r8168_mac_ocp_write(tp, 0xF818, 0xBD00); + r8168_mac_ocp_write(tp, 0xF81A, 0x074C); + r8168_mac_ocp_write(tp, 0xF81C, 0xC302); + r8168_mac_ocp_write(tp, 0xF81E, 0xBB00); + r8168_mac_ocp_write(tp, 0xF820, 0x080A); + r8168_mac_ocp_write(tp, 0xF822, 0x6420); + r8168_mac_ocp_write(tp, 0xF824, 0x48C2); + r8168_mac_ocp_write(tp, 0xF826, 0x8C20); + r8168_mac_ocp_write(tp, 0xF828, 0xC516); + r8168_mac_ocp_write(tp, 0xF82A, 0x64A4); + r8168_mac_ocp_write(tp, 0xF82C, 0x49C0); + r8168_mac_ocp_write(tp, 0xF82E, 0xF009); + r8168_mac_ocp_write(tp, 0xF830, 0x74A2); + r8168_mac_ocp_write(tp, 0xF832, 0x8CA5); + r8168_mac_ocp_write(tp, 0xF834, 0x74A0); + r8168_mac_ocp_write(tp, 0xF836, 0xC50E); + r8168_mac_ocp_write(tp, 0xF838, 0x9CA2); + r8168_mac_ocp_write(tp, 0xF83A, 0x1C11); + r8168_mac_ocp_write(tp, 0xF83C, 0x9CA0); + r8168_mac_ocp_write(tp, 0xF83E, 0xE006); + r8168_mac_ocp_write(tp, 0xF840, 0x74F8); + r8168_mac_ocp_write(tp, 0xF842, 0x48C4); + r8168_mac_ocp_write(tp, 0xF844, 0x8CF8); + r8168_mac_ocp_write(tp, 0xF846, 0xC404); + r8168_mac_ocp_write(tp, 0xF848, 0xBC00); + r8168_mac_ocp_write(tp, 0xF84A, 0xC403); + r8168_mac_ocp_write(tp, 0xF84C, 0xBC00); + r8168_mac_ocp_write(tp, 0xF84E, 0x0BF2); + r8168_mac_ocp_write(tp, 0xF850, 0x0C0A); + r8168_mac_ocp_write(tp, 0xF852, 0xE434); + r8168_mac_ocp_write(tp, 0xF854, 0xD3C0); + r8168_mac_ocp_write(tp, 0xF856, 0x49D9); + r8168_mac_ocp_write(tp, 0xF858, 0xF01F); + r8168_mac_ocp_write(tp, 0xF85A, 0xC526); + r8168_mac_ocp_write(tp, 0xF85C, 0x64A5); + r8168_mac_ocp_write(tp, 0xF85E, 0x1400); + r8168_mac_ocp_write(tp, 0xF860, 0xF007); + r8168_mac_ocp_write(tp, 0xF862, 0x0C01); + r8168_mac_ocp_write(tp, 0xF864, 0x8CA5); + r8168_mac_ocp_write(tp, 0xF866, 0x1C15); + r8168_mac_ocp_write(tp, 0xF868, 0xC51B); + r8168_mac_ocp_write(tp, 0xF86A, 0x9CA0); + r8168_mac_ocp_write(tp, 0xF86C, 0xE013); + r8168_mac_ocp_write(tp, 0xF86E, 0xC519); + r8168_mac_ocp_write(tp, 0xF870, 0x74A0); + r8168_mac_ocp_write(tp, 0xF872, 0x48C4); + r8168_mac_ocp_write(tp, 0xF874, 0x8CA0); + r8168_mac_ocp_write(tp, 0xF876, 0xC516); + r8168_mac_ocp_write(tp, 0xF878, 0x74A4); + r8168_mac_ocp_write(tp, 0xF87A, 0x48C8); + r8168_mac_ocp_write(tp, 0xF87C, 0x48CA); + r8168_mac_ocp_write(tp, 0xF87E, 0x9CA4); + r8168_mac_ocp_write(tp, 0xF880, 0xC512); + r8168_mac_ocp_write(tp, 0xF882, 0x1B00); + r8168_mac_ocp_write(tp, 0xF884, 0x9BA0); + r8168_mac_ocp_write(tp, 0xF886, 0x1B1C); + r8168_mac_ocp_write(tp, 0xF888, 0x483F); + r8168_mac_ocp_write(tp, 0xF88A, 0x9BA2); + r8168_mac_ocp_write(tp, 0xF88C, 0x1B04); + r8168_mac_ocp_write(tp, 0xF88E, 0xC508); + r8168_mac_ocp_write(tp, 0xF890, 0x9BA0); + r8168_mac_ocp_write(tp, 0xF892, 0xC505); + r8168_mac_ocp_write(tp, 0xF894, 0xBD00); + r8168_mac_ocp_write(tp, 0xF896, 0xC502); + r8168_mac_ocp_write(tp, 0xF898, 0xBD00); + r8168_mac_ocp_write(tp, 0xF89A, 0x0300); + r8168_mac_ocp_write(tp, 0xF89C, 0x051E); + r8168_mac_ocp_write(tp, 0xF89E, 0xE434); + r8168_mac_ocp_write(tp, 0xF8A0, 0xE018); + r8168_mac_ocp_write(tp, 0xF8A2, 0xE092); + r8168_mac_ocp_write(tp, 0xF8A4, 0xDE20); + r8168_mac_ocp_write(tp, 0xF8A6, 0xD3C0); + r8168_mac_ocp_write(tp, 0xF8A8, 0xC50F); + r8168_mac_ocp_write(tp, 0xF8AA, 0x76A4); + r8168_mac_ocp_write(tp, 0xF8AC, 0x49E3); + r8168_mac_ocp_write(tp, 0xF8AE, 0xF007); + r8168_mac_ocp_write(tp, 0xF8B0, 0x49C0); + r8168_mac_ocp_write(tp, 0xF8B2, 0xF103); + r8168_mac_ocp_write(tp, 0xF8B4, 0xC607); + r8168_mac_ocp_write(tp, 0xF8B6, 0xBE00); + r8168_mac_ocp_write(tp, 0xF8B8, 0xC606); + r8168_mac_ocp_write(tp, 0xF8BA, 0xBE00); + r8168_mac_ocp_write(tp, 0xF8BC, 0xC602); + r8168_mac_ocp_write(tp, 0xF8BE, 0xBE00); + r8168_mac_ocp_write(tp, 0xF8C0, 0x0C4C); + r8168_mac_ocp_write(tp, 0xF8C2, 0x0C28); + r8168_mac_ocp_write(tp, 0xF8C4, 0x0C2C); + r8168_mac_ocp_write(tp, 0xF8C6, 0xDC00); + r8168_mac_ocp_write(tp, 0xF8C8, 0xC707); + r8168_mac_ocp_write(tp, 0xF8CA, 0x1D00); + r8168_mac_ocp_write(tp, 0xF8CC, 0x8DE2); + r8168_mac_ocp_write(tp, 0xF8CE, 0x48C1); + r8168_mac_ocp_write(tp, 0xF8D0, 0xC502); + r8168_mac_ocp_write(tp, 0xF8D2, 0xBD00); + r8168_mac_ocp_write(tp, 0xF8D4, 0x00AA); + r8168_mac_ocp_write(tp, 0xF8D6, 0xE0C0); + r8168_mac_ocp_write(tp, 0xF8D8, 0xC502); + r8168_mac_ocp_write(tp, 0xF8DA, 0xBD00); + r8168_mac_ocp_write(tp, 0xF8DC, 0x0132); + + r8168_mac_ocp_write(tp, 0xFC26, 0x8000); + + r8168_mac_ocp_write(tp, 0xFC2A, 0x0743); + r8168_mac_ocp_write(tp, 0xFC2C, 0x0801); + r8168_mac_ocp_write(tp, 0xFC2E, 0x0BE9); + r8168_mac_ocp_write(tp, 0xFC30, 0x02FD); + r8168_mac_ocp_write(tp, 0xFC32, 0x0C25); + r8168_mac_ocp_write(tp, 0xFC34, 0x00A9); + r8168_mac_ocp_write(tp, 0xFC36, 0x012D); + rtl_hw_aspm_clkreq_enable(tp, true); } From bfa7913575b7b3ccbdeddb81dc0da13dc5fb22de Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jul 2019 15:59:12 +0100 Subject: [PATCH 17/52] rxrpc: Fix send on a connected, but unbound socket [ Upstream commit e835ada07091f40dcfb1bc735082bd0a7c005e59 ] If sendmsg() or sendmmsg() is called on a connected socket that hasn't had bind() called on it, then an oops will occur when the kernel tries to connect the call because no local endpoint has been allocated. Fix this by implicitly binding the socket if it is in the RXRPC_CLIENT_UNBOUND state, just like it does for the RXRPC_UNBOUND state. Further, the state should be transitioned to RXRPC_CLIENT_BOUND after this to prevent further attempts to bind it. This can be tested with: #include #include #include #include #include #include static const unsigned char inet6_addr[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0xac, 0x14, 0x14, 0xaa }; int main(void) { struct sockaddr_rxrpc srx; struct cmsghdr *cm; struct msghdr msg; unsigned char control[16]; int fd; memset(&srx, 0, sizeof(srx)); srx.srx_family = 0x21; srx.srx_service = 0; srx.transport_type = AF_INET; srx.transport_len = 0x1c; srx.transport.sin6.sin6_family = AF_INET6; srx.transport.sin6.sin6_port = htons(0x4e22); srx.transport.sin6.sin6_flowinfo = htons(0x4e22); srx.transport.sin6.sin6_scope_id = htons(0xaa3b); memcpy(&srx.transport.sin6.sin6_addr, inet6_addr, 16); cm = (struct cmsghdr *)control; cm->cmsg_len = CMSG_LEN(sizeof(unsigned long)); cm->cmsg_level = SOL_RXRPC; cm->cmsg_type = RXRPC_USER_CALL_ID; *(unsigned long *)CMSG_DATA(cm) = 0; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_iov = NULL; msg.msg_iovlen = 0; msg.msg_control = control; msg.msg_controllen = cm->cmsg_len; msg.msg_flags = 0; fd = socket(AF_RXRPC, SOCK_DGRAM, AF_INET); connect(fd, (struct sockaddr *)&srx, sizeof(srx)); sendmsg(fd, &msg, 0); return 0; } Leading to the following oops: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page ... RIP: 0010:rxrpc_connect_call+0x42/0xa01 ... Call Trace: ? mark_held_locks+0x47/0x59 ? __local_bh_enable_ip+0xb6/0xba rxrpc_new_client_call+0x3b1/0x762 ? rxrpc_do_sendmsg+0x3c0/0x92e rxrpc_do_sendmsg+0x3c0/0x92e rxrpc_sendmsg+0x16b/0x1b5 sock_sendmsg+0x2d/0x39 ___sys_sendmsg+0x1a4/0x22a ? release_sock+0x19/0x9e ? reacquire_held_locks+0x136/0x160 ? release_sock+0x19/0x9e ? find_held_lock+0x2b/0x6e ? __lock_acquire+0x268/0xf73 ? rxrpc_connect+0xdd/0xe4 ? __local_bh_enable_ip+0xb6/0xba __sys_sendmsg+0x5e/0x94 do_syscall_64+0x7d/0x1bf entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 2341e0775747 ("rxrpc: Simplify connect() implementation and simplify sendmsg() op") Reported-by: syzbot+7966f2a0b2c7da8939b4@syzkaller.appspotmail.com Signed-off-by: David Howells Reviewed-by: Marc Dionne Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/rxrpc/af_rxrpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 3c39b8805d01..d76e5e58905d 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -552,6 +552,7 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) switch (rx->sk.sk_state) { case RXRPC_UNBOUND: + case RXRPC_CLIENT_UNBOUND: rx->srx.srx_family = AF_RXRPC; rx->srx.srx_service = 0; rx->srx.transport_type = SOCK_DGRAM; @@ -576,10 +577,9 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) } rx->local = local; - rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; + rx->sk.sk_state = RXRPC_CLIENT_BOUND; /* Fall through */ - case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: if (!m->msg_name && test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) { From d9ee5afd916500f5dc63bcf6c8f0c94a97772090 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Thu, 27 Jun 2019 19:48:10 -0300 Subject: [PATCH 18/52] sctp: fix error handling on stream scheduler initialization [ Upstream commit 4d1415811e492d9a8238f8a92dd0d51612c788e9 ] It allocates the extended area for outbound streams only on sendmsg calls, if they are not yet allocated. When using the priority stream scheduler, this initialization may imply into a subsequent allocation, which may fail. In this case, it was aborting the stream scheduler initialization but leaving the ->ext pointer (allocated) in there, thus in a partially initialized state. On a subsequent call to sendmsg, it would notice the ->ext pointer in there, and trip on uninitialized stuff when trying to schedule the data chunk. The fix is undo the ->ext initialization if the stream scheduler initialization fails and avoid the partially initialized state. Although syzkaller bisected this to commit 4ff40b86262b ("sctp: set chunk transport correctly when it's a new asoc"), this bug was actually introduced on the commit I marked below. Reported-by: syzbot+c1a380d42b190ad1e559@syzkaller.appspotmail.com Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Tested-by: Xin Long Signed-off-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/stream.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 3b47457862cc..0da57938a6c5 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -253,13 +253,20 @@ out: int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) { struct sctp_stream_out_ext *soute; + int ret; soute = kzalloc(sizeof(*soute), GFP_KERNEL); if (!soute) return -ENOMEM; SCTP_SO(stream, sid)->ext = soute; - return sctp_sched_init_sid(stream, sid, GFP_KERNEL); + ret = sctp_sched_init_sid(stream, sid, GFP_KERNEL); + if (ret) { + kfree(SCTP_SO(stream, sid)->ext); + SCTP_SO(stream, sid)->ext = NULL; + } + + return ret; } void sctp_stream_free(struct sctp_stream *stream) From b640ade07295fbc5329779acdecf14384b545bf8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 23 Jul 2019 17:15:25 +0200 Subject: [PATCH 19/52] sky2: Disable MSI on ASUS P6T [ Upstream commit a261e3797506bd561700be643fe1a85bf81e9661 ] The onboard sky2 NIC on ASUS P6T WS PRO doesn't work after PM resume due to the infamous IRQ problem. Disabling MSI works around it, so let's add it to the blacklist. Unfortunately the BIOS on the machine doesn't fill the standard DMI_SYS_* entry, so we pick up DMI_BOARD_* entries instead. BugLink: https://bugzilla.suse.com/show_bug.cgi?id=1142496 Reported-and-tested-by: Marcus Seyfarth Signed-off-by: Takashi Iwai Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/sky2.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 1485f66cf7b0..4ade864c8d53 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -4947,6 +4947,13 @@ static const struct dmi_system_id msi_blacklist[] = { DMI_MATCH(DMI_PRODUCT_NAME, "P-79"), }, }, + { + .ident = "ASUS P6T", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P6T"), + }, + }, {} }; From 6323c238bb4374d1477348cfbd5854f2bebe9a21 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Jul 2019 11:52:33 -0700 Subject: [PATCH 20/52] tcp: be more careful in tcp_fragment() [ Upstream commit b617158dc096709d8600c53b6052144d12b89fab ] Some applications set tiny SO_SNDBUF values and expect TCP to just work. Recent patches to address CVE-2019-11478 broke them in case of losses, since retransmits might be prevented. We should allow these flows to make progress. This patch allows the first and last skb in retransmit queue to be split even if memory limits are hit. It also adds the some room due to the fact that tcp_sendmsg() and tcp_sendpage() might overshoot sk_wmem_queued by about one full TSO skb (64KB size). Note this allowance was already present in stable backports for kernels < 4.15 Note for < 4.15 backports : tcp_rtx_queue_tail() will probably look like : static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { struct sk_buff *skb = tcp_send_head(sk); return skb ? tcp_write_queue_prev(sk, skb) : tcp_write_queue_tail(sk); } Fixes: f070ef2ac667 ("tcp: tcp_fragment() should apply sane memory limits") Signed-off-by: Eric Dumazet Reported-by: Andrew Prout Tested-by: Andrew Prout Tested-by: Jonathan Lemon Tested-by: Michal Kubecek Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Christoph Paasch Cc: Jonathan Looney Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_output.c | 13 +++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index e75661f92daa..15156cd6e797 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1646,6 +1646,11 @@ static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) return skb_rb_first(&sk->tcp_rtx_queue); } +static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) +{ + return skb_rb_last(&sk->tcp_rtx_queue); +} + static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 221d9b72423b..88c7e821fd11 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1289,6 +1289,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; int nsize, old_factor; + long limit; int nlen; u8 flags; @@ -1299,8 +1300,16 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (nsize < 0) nsize = 0; - if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf && - tcp_queue != TCP_FRAG_IN_WRITE_QUEUE)) { + /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb. + * We need some allowance to not penalize applications setting small + * SO_SNDBUF values. + * Also allow first and last skb in retransmit queue to be split. + */ + limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE); + if (unlikely((sk->sk_wmem_queued >> 1) > limit && + tcp_queue != TCP_FRAG_IN_WRITE_QUEUE && + skb != tcp_rtx_queue_head(sk) && + skb != tcp_rtx_queue_tail(sk))) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); return -ENOMEM; } From c60f57dfe995172c2f01e59266e3ffa3419c6cd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 18 Jul 2019 19:28:14 -0700 Subject: [PATCH 21/52] tcp: fix tcp_set_congestion_control() use from bpf hook [ Upstream commit 8d650cdedaabb33e85e9b7c517c0c71fcecc1de9 ] Neal reported incorrect use of ns_capable() from bpf hook. bpf_setsockopt(...TCP_CONGESTION...) -> tcp_set_congestion_control() -> ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) -> ns_capable_common() -> current_cred() -> rcu_dereference_protected(current->cred, 1) Accessing 'current' in bpf context makes no sense, since packets are processed from softirq context. As Neal stated : The capability check in tcp_set_congestion_control() was written assuming a system call context, and then was reused from a BPF call site. The fix is to add a new parameter to tcp_set_congestion_control(), so that the ns_capable() call is only performed under the right context. Fixes: 91b5b21c7c16 ("bpf: Add support for changing congestion control") Signed-off-by: Eric Dumazet Cc: Lawrence Brakmo Reported-by: Neal Cardwell Acked-by: Neal Cardwell Acked-by: Lawrence Brakmo Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tcp.h | 3 ++- net/core/filter.c | 2 +- net/ipv4/tcp.c | 4 +++- net/ipv4/tcp_cong.c | 6 +++--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 15156cd6e797..abcf53a6db04 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1054,7 +1054,8 @@ void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit); +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); diff --git a/net/core/filter.c b/net/core/filter.c index 34ec9324737b..c996380f2959 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3991,7 +3991,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; ret = tcp_set_congestion_control(sk, name, false, - reinit); + reinit, true); } else { struct tcp_sock *tp = tcp_sk(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 364e6fdaa38f..cc25c8cbe15a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2729,7 +2729,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, true); + err = tcp_set_congestion_control(sk, name, true, true, + ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); release_sock(sk); return err; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index bc6c02f16243..48f79db446a0 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -332,7 +332,8 @@ out: * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ -int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit) +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, + bool reinit, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; @@ -368,8 +369,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo } else { err = -EBUSY; } - } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { + } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; } else if (!try_module_get(ca->owner)) { err = -EBUSY; From 1b200acde418f4d6d87279d3f6f976ebf188f272 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sat, 6 Jul 2019 16:13:07 -0700 Subject: [PATCH 22/52] tcp: Reset bytes_acked and bytes_received when disconnecting [ Upstream commit e858faf556d4e14c750ba1e8852783c6f9520a0e ] If an app is playing tricks to reuse a socket via tcp_disconnect(), bytes_acked/received needs to be reset to 0. Otherwise tcp_info will report the sum of the current and the old connection.. Cc: Eric Dumazet Fixes: 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info") Fixes: bdd1f9edacb5 ("tcp: add tcpi_bytes_received to tcp_info") Signed-off-by: Christoph Paasch Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cc25c8cbe15a..b7ef367fe6a1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2594,6 +2594,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->bytes_sent = 0; + tp->bytes_acked = 0; + tp->bytes_received = 0; tp->bytes_retrans = 0; tp->dsack_dups = 0; tp->reord_seen = 0; From a2aa162a63319a003982a39e2cffc110bf76c591 Mon Sep 17 00:00:00 2001 From: Peter Kosyh Date: Fri, 19 Jul 2019 11:11:47 +0300 Subject: [PATCH 23/52] vrf: make sure skb->data contains ip header to make routing [ Upstream commit 107e47cc80ec37cb332bd41b22b1c7779e22e018 ] vrf_process_v4_outbound() and vrf_process_v6_outbound() do routing using ip/ipv6 addresses, but don't make sure the header is available in skb->data[] (skb_headlen() is less then header size). Case: 1) igb driver from intel. 2) Packet size is greater then 255. 3) MPLS forwards to VRF device. So, patch adds pskb_may_pull() calls in vrf_process_v4/v6_outbound() functions. Signed-off-by: Peter Kosyh Reviewed-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/vrf.c | 58 ++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 449fc52f9a89..9f895083bc0a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -169,23 +169,29 @@ static int vrf_ip6_local_out(struct net *net, struct sock *sk, static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, struct net_device *dev) { - const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct ipv6hdr *iph; struct net *net = dev_net(skb->dev); - struct flowi6 fl6 = { - /* needed to match OIF rule */ - .flowi6_oif = dev->ifindex, - .flowi6_iif = LOOPBACK_IFINDEX, - .daddr = iph->daddr, - .saddr = iph->saddr, - .flowlabel = ip6_flowinfo(iph), - .flowi6_mark = skb->mark, - .flowi6_proto = iph->nexthdr, - .flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF, - }; + struct flowi6 fl6; int ret = NET_XMIT_DROP; struct dst_entry *dst; struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) + goto err; + + iph = ipv6_hdr(skb); + + memset(&fl6, 0, sizeof(fl6)); + /* needed to match OIF rule */ + fl6.flowi6_oif = dev->ifindex; + fl6.flowi6_iif = LOOPBACK_IFINDEX; + fl6.daddr = iph->daddr; + fl6.saddr = iph->saddr; + fl6.flowlabel = ip6_flowinfo(iph); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = iph->nexthdr; + fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + dst = ip6_route_output(net, NULL, &fl6); if (dst == dst_null) goto err; @@ -241,21 +247,27 @@ static int vrf_ip_local_out(struct net *net, struct sock *sk, static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, struct net_device *vrf_dev) { - struct iphdr *ip4h = ip_hdr(skb); + struct iphdr *ip4h; int ret = NET_XMIT_DROP; - struct flowi4 fl4 = { - /* needed to match OIF rule */ - .flowi4_oif = vrf_dev->ifindex, - .flowi4_iif = LOOPBACK_IFINDEX, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; + struct flowi4 fl4; struct net *net = dev_net(vrf_dev); struct rtable *rt; + if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) + goto err; + + ip4h = ip_hdr(skb); + + memset(&fl4, 0, sizeof(fl4)); + /* needed to match OIF rule */ + fl4.flowi4_oif = vrf_dev->ifindex; + fl4.flowi4_iif = LOOPBACK_IFINDEX; + fl4.flowi4_tos = RT_TOS(ip4h->tos); + fl4.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF; + fl4.flowi4_proto = ip4h->protocol; + fl4.daddr = ip4h->daddr; + fl4.saddr = ip4h->saddr; + rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto err; From a8ba53da071e30c3276ad5868400ac01b912a8e6 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 7 Jul 2019 16:57:06 +0300 Subject: [PATCH 24/52] net/mlx5e: IPoIB, Add error path in mlx5_rdma_setup_rn [ Upstream commit ef1ce7d7b67b46661091c7ccc0396186b7a247ef ] Check return value from mlx5e_attach_netdev, add error path on failure. Fixes: 48935bbb7ae8 ("net/mlx5e: IPoIB, Add netdevice profile skeleton") Signed-off-by: Aya Levin Reviewed-by: Feras Daoud Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index 5b7fe8264144..db6aafcced0d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -662,7 +662,9 @@ struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, profile->init(mdev, netdev, profile, ipriv); - mlx5e_attach_netdev(epriv); + err = mlx5e_attach_netdev(epriv); + if (err) + goto detach; netif_carrier_off(netdev); /* set rdma_netdev func pointers */ @@ -678,6 +680,11 @@ struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, return netdev; +detach: + profile->cleanup(epriv); + if (ipriv->sub_interface) + return NULL; + mlx5e_destroy_mdev_resources(mdev); destroy_ht: mlx5i_pkey_qpn_ht_cleanup(netdev); destroy_wq: From 21252f49cddf9878e2eb4af65df64b399dd94496 Mon Sep 17 00:00:00 2001 From: Andreas Steinmetz Date: Sun, 30 Jun 2019 22:46:42 +0200 Subject: [PATCH 25/52] macsec: fix use-after-free of skb during RX [ Upstream commit 095c02da80a41cf6d311c504d8955d6d1c2add10 ] Fix use-after-free of skb when rx_handler returns RX_HANDLER_PASS. Signed-off-by: Andreas Steinmetz Acked-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/macsec.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 7de88b33d5b9..7a2dae926194 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -1103,10 +1103,9 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) } skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) { - *pskb = NULL; + *pskb = skb; + if (!skb) return RX_HANDLER_CONSUMED; - } pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); if (!pulled_sci) { From 0c5cb5a12623b242166b9d7636daebea13a604af Mon Sep 17 00:00:00 2001 From: Andreas Steinmetz Date: Sun, 30 Jun 2019 22:46:45 +0200 Subject: [PATCH 26/52] macsec: fix checksumming after decryption [ Upstream commit 7d8b16b9facb0dd81d1469808dd9a575fa1d525a ] Fix checksumming after decryption. Signed-off-by: Andreas Steinmetz Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/macsec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 7a2dae926194..2c971357e66c 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -869,6 +869,7 @@ static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) { + skb->ip_summed = CHECKSUM_NONE; memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); skb_pull(skb, hdr_len); pskb_trim_unique(skb, skb->len - icv_len); From dc59a2abd33e23ba5eb153924a773e65d72ca65d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 27 Jun 2019 14:30:58 -0700 Subject: [PATCH 27/52] netrom: fix a memory leak in nr_rx_frame() [ Upstream commit c8c8218ec5af5d2598381883acbefbf604e56b5e ] When the skb is associated with a new sock, just assigning it to skb->sk is not sufficient, we have to set its destructor to free the sock properly too. Reported-by: syzbot+d6636a36d3c34bd88938@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netrom/af_netrom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 71ffd1a6dc7c..f11b6747e18c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -872,7 +872,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) unsigned short frametype, flags, window, timeout; int ret; - skb->sk = NULL; /* Initially we don't know who it's for */ + skb_orphan(skb); /* * skb->data points to the netrom frame start @@ -971,6 +971,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) window = skb->data[20]; skb->sk = make; + skb->destructor = sock_efree; make->sk_state = TCP_ESTABLISHED; /* Fill in his circuit details */ From 69cd584546159c82e869cef15675cfe191a649e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 22 Jul 2019 20:41:22 -0700 Subject: [PATCH 28/52] netrom: hold sock when setting skb->destructor [ Upstream commit 4638faac032756f7eab5524be7be56bee77e426b ] sock_efree() releases the sock refcnt, if we don't hold this refcnt when setting skb->destructor to it, the refcnt would not be balanced. This leads to several bug reports from syzbot. I have checked other users of sock_efree(), all of them hold the sock refcnt. Fixes: c8c8218ec5af ("netrom: fix a memory leak in nr_rx_frame()") Reported-and-tested-by: Reported-and-tested-by: Reported-and-tested-by: Reported-and-tested-by: Cc: Ralf Baechle Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/netrom/af_netrom.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index f11b6747e18c..43910e50752c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -970,6 +970,7 @@ int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) window = skb->data[20]; + sock_hold(make); skb->sk = make; skb->destructor = sock_efree; make->sk_state = TCP_ESTABLISHED; From d9571a9f5ec19820571de3c2244d16f5b53b8bf0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 16 Jul 2019 13:57:30 -0700 Subject: [PATCH 29/52] net_sched: unset TCQ_F_CAN_BYPASS when adding filters [ Upstream commit 3f05e6886a595c9a29a309c52f45326be917823c ] For qdisc's that support TC filters and set TCQ_F_CAN_BYPASS, notably fq_codel, it makes no sense to let packets bypass the TC filters we setup in any scenario, otherwise our packets steering policy could not be enforced. This can be reproduced easily with the following script: ip li add dev dummy0 type dummy ifconfig dummy0 up tc qd add dev dummy0 root fq_codel tc filter add dev dummy0 parent 8001: protocol arp basic action mirred egress redirect dev lo tc filter add dev dummy0 parent 8001: protocol ip basic action mirred egress redirect dev lo ping -I dummy0 192.168.112.1 Without this patch, packets are sent directly to dummy0 without hitting any of the filters. With this patch, packets are redirected to loopback as expected. This fix is not perfect, it only unsets the flag but does not set it back because we have to save the information somewhere in the qdisc if we really want that. Note, both fq_codel and sfq clear this flag in their ->bind_tcf() but this is clearly not sufficient when we don't use any class ID. Fixes: 23624935e0c4 ("net_sched: TCQ_F_CAN_BYPASS generalization") Cc: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/cls_api.c | 1 + net/sched/sch_fq_codel.c | 2 -- net/sched/sch_sfq.c | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 2167c6ca55e3..3374903d5f96 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1325,6 +1325,7 @@ replay: tcf_chain_tp_insert(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false); + q->flags &= ~TCQ_F_CAN_BYPASS; } else { if (tp_created) tcf_proto_destroy(tp, NULL); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 6c0a9d5dbf94..137692cb8b4f 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -600,8 +600,6 @@ static unsigned long fq_codel_find(struct Qdisc *sch, u32 classid) static unsigned long fq_codel_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { - /* we cannot bypass queue discipline anymore */ - sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 2f2678197760..650f21463853 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -828,8 +828,6 @@ static unsigned long sfq_find(struct Qdisc *sch, u32 classid) static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { - /* we cannot bypass queue discipline anymore */ - sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } From fde351aeff4afe74c430395be01dd83c070ab85c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 28 Jun 2019 16:11:39 -0700 Subject: [PATCH 30/52] net/tls: make sure offload also gets the keys wiped [ Upstream commit acd3e96d53a24d219f720ed4012b62723ae05da1 ] Commit 86029d10af18 ("tls: zero the crypto information from tls_context before freeing") added memzero_explicit() calls to clear the key material before freeing struct tls_context, but it missed tls_device.c has its own way of freeing this structure. Replace the missing free. Fixes: 86029d10af18 ("tls: zero the crypto information from tls_context before freeing") Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/tls.h | 1 + net/tls/tls_device.c | 2 +- net/tls/tls_main.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index 954110575891..98f5ad0319a2 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -234,6 +234,7 @@ struct tls_offload_context_rx { (ALIGN(sizeof(struct tls_offload_context_rx), sizeof(void *)) + \ TLS_DRIVER_STATE_SIZE) +void tls_ctx_free(struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); int tls_sk_query(struct sock *sk, int optname, char __user *optval, int __user *optlen); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index ead29c2aefa7..0a613e0ef3bf 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -61,7 +61,7 @@ static void tls_device_free_ctx(struct tls_context *ctx) if (ctx->rx_conf == TLS_HW) kfree(tls_offload_ctx_rx(ctx)); - kfree(ctx); + tls_ctx_free(ctx); } static void tls_device_gc_task(struct work_struct *work) diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 25b3fb585777..4c0ac79f82d4 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -241,7 +241,7 @@ static void tls_write_space(struct sock *sk) ctx->sk_write_space(sk); } -static void tls_ctx_free(struct tls_context *ctx) +void tls_ctx_free(struct tls_context *ctx) { if (!ctx) return; From bc9a2f36a7d6a3cfdfc0001cdaa6f44d73347ec9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jun 2019 16:31:39 +0800 Subject: [PATCH 31/52] sctp: not bind the socket in sctp_connect [ Upstream commit 9b6c08878e23adb7cc84bdca94d8a944b03f099e ] Now when sctp_connect() is called with a wrong sa_family, it binds to a port but doesn't set bp->port, then sctp_get_af_specific will return NULL and sctp_connect() returns -EINVAL. Then if sctp_bind() is called to bind to another port, the last port it has bound will leak due to bp->port is NULL by then. sctp_connect() doesn't need to bind ports, as later __sctp_connect will do it if bp->port is NULL. So remove it from sctp_connect(). While at it, remove the unnecessary sockaddr.sa_family len check as it's already done in sctp_inet_connect. Fixes: 644fbdeacf1d ("sctp: fix the issue that flags are ignored when using kernel_connect") Reported-by: syzbot+079bf326b38072f849d9@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/socket.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 8c00a7ef1bcd..9f5b4e547b63 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4507,34 +4507,18 @@ out_nounlock: static int sctp_connect(struct sock *sk, struct sockaddr *addr, int addr_len, int flags) { - struct inet_sock *inet = inet_sk(sk); struct sctp_af *af; - int err = 0; + int err = -EINVAL; lock_sock(sk); pr_debug("%s: sk:%p, sockaddr:%p, addr_len:%d\n", __func__, sk, addr, addr_len); - /* We may need to bind the socket. */ - if (!inet->inet_num) { - if (sk->sk_prot->get_port(sk, 0)) { - release_sock(sk); - return -EAGAIN; - } - inet->inet_sport = htons(inet->inet_num); - } - /* Validate addr_len before calling common connect/connectx routine. */ af = sctp_get_af_specific(addr->sa_family); - if (!af || addr_len < af->sockaddr_len) { - err = -EINVAL; - } else { - /* Pass correct addr len to common routine (so it knows there - * is only one address being passed. - */ + if (af && addr_len >= af->sockaddr_len) err = __sctp_connect(sk, addr, af->sockaddr_len, flags, NULL); - } release_sock(sk); return err; From caf4488fc06ecafa0f359ff20d6e4569977a9f9e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 2 Jul 2019 15:00:18 +0300 Subject: [PATCH 32/52] net: bridge: mcast: fix stale nsrcs pointer in igmp3/mld2 report handling [ Upstream commit e57f61858b7cf478ed6fa23ed4b3876b1c9625c4 ] We take a pointer to grec prior to calling pskb_may_pull and use it afterwards to get nsrcs so record nsrcs before the pull when handling igmp3 and we get a pointer to nsrcs and call pskb_may_pull when handling mld2 which again could lead to reading 2 bytes out-of-bounds. ================================================================== BUG: KASAN: use-after-free in br_multicast_rcv+0x480c/0x4ad0 [bridge] Read of size 2 at addr ffff8880421302b4 by task ksoftirqd/1/16 CPU: 1 PID: 16 Comm: ksoftirqd/1 Tainted: G OE 5.2.0-rc6+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Call Trace: dump_stack+0x71/0xab print_address_description+0x6a/0x280 ? br_multicast_rcv+0x480c/0x4ad0 [bridge] __kasan_report+0x152/0x1aa ? br_multicast_rcv+0x480c/0x4ad0 [bridge] ? br_multicast_rcv+0x480c/0x4ad0 [bridge] kasan_report+0xe/0x20 br_multicast_rcv+0x480c/0x4ad0 [bridge] ? br_multicast_disable_port+0x150/0x150 [bridge] ? ktime_get_with_offset+0xb4/0x150 ? __kasan_kmalloc.constprop.6+0xa6/0xf0 ? __netif_receive_skb+0x1b0/0x1b0 ? br_fdb_update+0x10e/0x6e0 [bridge] ? br_handle_frame_finish+0x3c6/0x11d0 [bridge] br_handle_frame_finish+0x3c6/0x11d0 [bridge] ? br_pass_frame_up+0x3a0/0x3a0 [bridge] ? virtnet_probe+0x1c80/0x1c80 [virtio_net] br_handle_frame+0x731/0xd90 [bridge] ? select_idle_sibling+0x25/0x7d0 ? br_handle_frame_finish+0x11d0/0x11d0 [bridge] __netif_receive_skb_core+0xced/0x2d70 ? virtqueue_get_buf_ctx+0x230/0x1130 [virtio_ring] ? do_xdp_generic+0x20/0x20 ? virtqueue_napi_complete+0x39/0x70 [virtio_net] ? virtnet_poll+0x94d/0xc78 [virtio_net] ? receive_buf+0x5120/0x5120 [virtio_net] ? __netif_receive_skb_one_core+0x97/0x1d0 __netif_receive_skb_one_core+0x97/0x1d0 ? __netif_receive_skb_core+0x2d70/0x2d70 ? _raw_write_trylock+0x100/0x100 ? __queue_work+0x41e/0xbe0 process_backlog+0x19c/0x650 ? _raw_read_lock_irq+0x40/0x40 net_rx_action+0x71e/0xbc0 ? __switch_to_asm+0x40/0x70 ? napi_complete_done+0x360/0x360 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 ? __schedule+0x85e/0x14d0 __do_softirq+0x1db/0x5f9 ? takeover_tasklets+0x5f0/0x5f0 run_ksoftirqd+0x26/0x40 smpboot_thread_fn+0x443/0x680 ? sort_range+0x20/0x20 ? schedule+0x94/0x210 ? __kthread_parkme+0x78/0xf0 ? sort_range+0x20/0x20 kthread+0x2ae/0x3a0 ? kthread_create_worker_on_cpu+0xc0/0xc0 ret_from_fork+0x35/0x40 The buggy address belongs to the page: page:ffffea0001084c00 refcount:0 mapcount:-128 mapping:0000000000000000 index:0x0 flags: 0xffffc000000000() raw: 00ffffc000000000 ffffea0000cfca08 ffffea0001098608 0000000000000000 raw: 0000000000000000 0000000000000003 00000000ffffff7f 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888042130180: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888042130200: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff > ffff888042130280: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff888042130300: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff888042130380: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Disabling lock debugging due to kernel taint Fixes: bc8c20acaea1 ("bridge: multicast: treat igmpv3 report with INCLUDE and no sources as a leave") Reported-by: Martin Weinelt Signed-off-by: Nikolay Aleksandrov Tested-by: Martin Weinelt Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_multicast.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 75901c4641b1..2bb52be929ea 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1147,6 +1147,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int type; int err = 0; __be32 group; + u16 nsrcs; ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); @@ -1160,8 +1161,9 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, grec = (void *)(skb->data + len - sizeof(*grec)); group = grec->grec_mca; type = grec->grec_type; + nsrcs = ntohs(grec->grec_nsrcs); - len += ntohs(grec->grec_nsrcs) * 4; + len += nsrcs * 4; if (!pskb_may_pull(skb, len)) return -EINVAL; @@ -1182,7 +1184,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, src = eth_hdr(skb)->h_source; if ((type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE) && - ntohs(grec->grec_nsrcs) == 0) { + nsrcs == 0) { br_ip4_multicast_leave_group(br, port, group, vid, src); } else { err = br_ip4_multicast_add_group(br, port, group, vid, @@ -1217,23 +1219,26 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, len = skb_transport_offset(skb) + sizeof(*icmp6h); for (i = 0; i < num; i++) { - __be16 *nsrcs, _nsrcs; + __be16 *_nsrcs, __nsrcs; + u16 nsrcs; - nsrcs = skb_header_pointer(skb, - len + offsetof(struct mld2_grec, - grec_nsrcs), - sizeof(_nsrcs), &_nsrcs); - if (!nsrcs) + _nsrcs = skb_header_pointer(skb, + len + offsetof(struct mld2_grec, + grec_nsrcs), + sizeof(__nsrcs), &__nsrcs); + if (!_nsrcs) return -EINVAL; + nsrcs = ntohs(*_nsrcs); + if (!pskb_may_pull(skb, len + sizeof(*grec) + - sizeof(struct in6_addr) * ntohs(*nsrcs))) + sizeof(struct in6_addr) * nsrcs)) return -EINVAL; grec = (struct mld2_grec *)(skb->data + len); len += sizeof(*grec) + - sizeof(struct in6_addr) * ntohs(*nsrcs); + sizeof(struct in6_addr) * nsrcs; /* We treat these as MLDv1 reports for now. */ switch (grec->grec_type) { @@ -1252,7 +1257,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, src = eth_hdr(skb)->h_source; if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && - ntohs(*nsrcs) == 0) { + nsrcs == 0) { br_ip6_multicast_leave_group(br, port, &grec->grec_mca, vid, src); } else { From 41a8df71809ec0052ce4c0176f2f363c4f783969 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 2 Jul 2019 15:00:19 +0300 Subject: [PATCH 33/52] net: bridge: mcast: fix stale ipv6 hdr pointer when handling v6 query [ Upstream commit 3b26a5d03d35d8f732d75951218983c0f7f68dff ] We get a pointer to the ipv6 hdr in br_ip6_multicast_query but we may call pskb_may_pull afterwards and end up using a stale pointer. So use the header directly, it's just 1 place where it's needed. Fixes: 08b202b67264 ("bridge br_multicast: IPv6 MLD support.") Signed-off-by: Nikolay Aleksandrov Tested-by: Martin Weinelt Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_multicast.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 2bb52be929ea..fb54d32321ec 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1510,7 +1510,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct mld_msg *mld; struct net_bridge_mdb_entry *mp; struct mld2_query *mld2q; @@ -1554,7 +1553,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); - saddr.u.ip6 = ip6h->saddr; + saddr.u.ip6 = ipv6_hdr(skb)->saddr; br_multicast_query_received(br, port, &br->ip6_other_query, &saddr, max_delay); From 78701843ecc429db5fc632c51c61b3cc986a2b84 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 2 Jul 2019 15:00:20 +0300 Subject: [PATCH 34/52] net: bridge: don't cache ether dest pointer on input [ Upstream commit 3d26eb8ad1e9b906433903ce05f775cf038e747f ] We would cache ether dst pointer on input in br_handle_frame_finish but after the neigh suppress code that could lead to a stale pointer since both ipv4 and ipv6 suppress code do pskb_may_pull. This means we have to always reload it after the suppress code so there's no point in having it cached just retrieve it directly. Fixes: 057658cb33fbf ("bridge: suppress arp pkts on BR_NEIGH_SUPPRESS ports") Fixes: ed842faeb2bd ("bridge: suppress nd pkts on BR_NEIGH_SUPPRESS ports") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_input.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fed0ff446abb..2532c1a19645 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -79,7 +79,6 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; - const unsigned char *dest; struct net_bridge *br; u16 vid = 0; @@ -97,10 +96,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); local_rcv = !!(br->dev->flags & IFF_PROMISC); - dest = eth_hdr(skb)->h_dest; - if (is_multicast_ether_addr(dest)) { + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) { /* by definition the broadcast is also a multicast address */ - if (is_broadcast_ether_addr(dest)) { + if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) { pkt_type = BR_PKT_BROADCAST; local_rcv = true; } else { @@ -150,7 +148,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } break; case BR_PKT_UNICAST: - dst = br_fdb_find_rcu(br, dest, vid); + dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); default: break; } From b72fb8dec1836773d226caeb4a2c5bc57158005b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 2 Jul 2019 15:00:21 +0300 Subject: [PATCH 35/52] net: bridge: stp: don't cache eth dest pointer before skb pull [ Upstream commit 2446a68ae6a8cee6d480e2f5b52f5007c7c41312 ] Don't cache eth dest pointer before calling pskb_may_pull. Fixes: cf0f02d04a83 ("[BRIDGE]: use llc for receiving STP packets") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_stp_bpdu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 1b75d6bf12bd..37ddcea3fc96 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -147,7 +147,6 @@ void br_send_tcn_bpdu(struct net_bridge_port *p) void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, struct net_device *dev) { - const unsigned char *dest = eth_hdr(skb)->h_dest; struct net_bridge_port *p; struct net_bridge *br; const unsigned char *buf; @@ -176,7 +175,7 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb, if (p->state == BR_STATE_DISABLED) goto out; - if (!ether_addr_equal(dest, br->group_addr)) + if (!ether_addr_equal(eth_hdr(skb)->h_dest, br->group_addr)) goto out; if (p->flags & BR_BPDU_GUARD) { From 95ee55cab118135c7dd3741e3bc53c404407f9a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Thu, 6 Dec 2018 11:18:40 -0500 Subject: [PATCH 36/52] dma-buf: balance refcount inbalance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5e383a9798990c69fc759a4930de224bb497e62c upstream. The debugfs take reference on fence without dropping them. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Daniel Vetter Cc: Sumit Semwal Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-sig@lists.linaro.org Cc: Stéphane Marchesin Cc: stable@vger.kernel.org Reviewed-by: Christian König Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20181206161840.6578-1-jglisse@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/dma-buf/dma-buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 13884474d158..69842145c223 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1069,6 +1069,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), dma_fence_is_signaled(fence) ? "" : "un"); + dma_fence_put(fence); } rcu_read_unlock(); From c947cf3e95839e9f449d8194fd15979e5ebc5f16 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 4 Jun 2019 13:53:23 +0100 Subject: [PATCH 37/52] dma-buf: Discard old fence_excl on retrying get_fences_rcu for realloc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f5b07b04e5f090a85d1e96938520f2b2b58e4a8e upstream. If we have to drop the seqcount & rcu lock to perform a krealloc, we have to restart the loop. In doing so, be careful not to lose track of the already acquired exclusive fence. Fixes: fedf54132d24 ("dma-buf: Restart reservation_object_get_fences_rcu() after writes") Signed-off-by: Chris Wilson Cc: Daniel Vetter Cc: Maarten Lankhorst Cc: Christian König Cc: Alex Deucher Cc: Sumit Semwal Cc: stable@vger.kernel.org #v4.10 Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20190604125323.21396-1-chris@chris-wilson.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/dma-buf/reservation.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma-buf/reservation.c b/drivers/dma-buf/reservation.c index 6c95f61a32e7..49ab09468ba1 100644 --- a/drivers/dma-buf/reservation.c +++ b/drivers/dma-buf/reservation.c @@ -416,6 +416,10 @@ int reservation_object_get_fences_rcu(struct reservation_object *obj, GFP_NOWAIT | __GFP_NOWARN); if (!nshared) { rcu_read_unlock(); + + dma_fence_put(fence_excl); + fence_excl = NULL; + nshared = krealloc(shared, sz, GFP_KERNEL); if (nshared) { shared = nshared; From dd5994ab1f00aa2cf67336d03212e239176e5389 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Mon, 8 Jul 2019 14:19:04 +0530 Subject: [PATCH 38/52] gpio: davinci: silence error prints in case of EPROBE_DEFER commit 541e4095f388c196685685633c950cb9b97f8039 upstream. Silence error prints in case of EPROBE_DEFER. This avoids multiple/duplicate defer prints during boot. Cc: Signed-off-by: Keerthy Signed-off-by: Bartosz Golaszewski Signed-off-by: Greg Kroah-Hartman --- drivers/gpio/gpio-davinci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpio/gpio-davinci.c b/drivers/gpio/gpio-davinci.c index a5ece8ea79bc..abb332d15a13 100644 --- a/drivers/gpio/gpio-davinci.c +++ b/drivers/gpio/gpio-davinci.c @@ -222,8 +222,9 @@ static int davinci_gpio_probe(struct platform_device *pdev) for (i = 0; i < nirq; i++) { chips->irqs[i] = platform_get_irq(pdev, i); if (chips->irqs[i] < 0) { - dev_info(dev, "IRQ not populated, err = %d\n", - chips->irqs[i]); + if (chips->irqs[i] != -EPROBE_DEFER) + dev_info(dev, "IRQ not populated, err = %d\n", + chips->irqs[i]); return chips->irqs[i]; } } From 0e6ef184315d43ec26ae6c3acaadd88466621e5d Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Tue, 4 Jun 2019 18:33:11 +0200 Subject: [PATCH 39/52] MIPS: lb60: Fix pin mappings commit 1323c3b72a987de57141cabc44bf9cd83656bc70 upstream. The pin mappings introduced in commit 636f8ba67fb6 ("MIPS: JZ4740: Qi LB60: Add pinctrl configuration for several drivers") are completely wrong. The pinctrl driver name is incorrect, and the function and group fields are swapped. Fixes: 636f8ba67fb6 ("MIPS: JZ4740: Qi LB60: Add pinctrl configuration for several drivers") Cc: Signed-off-by: Paul Cercueil Reviewed-by: Linus Walleij Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: od@zcrc.me Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/mips/jz4740/board-qi_lb60.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/mips/jz4740/board-qi_lb60.c b/arch/mips/jz4740/board-qi_lb60.c index 705593d40d12..05c60fa4fa06 100644 --- a/arch/mips/jz4740/board-qi_lb60.c +++ b/arch/mips/jz4740/board-qi_lb60.c @@ -471,27 +471,27 @@ static unsigned long pin_cfg_bias_disable[] = { static struct pinctrl_map pin_map[] __initdata = { /* NAND pin configuration */ PIN_MAP_MUX_GROUP_DEFAULT("jz4740-nand", - "10010000.jz4740-pinctrl", "nand", "nand-cs1"), + "10010000.pin-controller", "nand-cs1", "nand"), /* fbdev pin configuration */ PIN_MAP_MUX_GROUP("jz4740-fb", PINCTRL_STATE_DEFAULT, - "10010000.jz4740-pinctrl", "lcd", "lcd-8bit"), + "10010000.pin-controller", "lcd-8bit", "lcd"), PIN_MAP_MUX_GROUP("jz4740-fb", PINCTRL_STATE_SLEEP, - "10010000.jz4740-pinctrl", "lcd", "lcd-no-pins"), + "10010000.pin-controller", "lcd-no-pins", "lcd"), /* MMC pin configuration */ PIN_MAP_MUX_GROUP_DEFAULT("jz4740-mmc.0", - "10010000.jz4740-pinctrl", "mmc", "mmc-1bit"), + "10010000.pin-controller", "mmc-1bit", "mmc"), PIN_MAP_MUX_GROUP_DEFAULT("jz4740-mmc.0", - "10010000.jz4740-pinctrl", "mmc", "mmc-4bit"), + "10010000.pin-controller", "mmc-4bit", "mmc"), PIN_MAP_CONFIGS_PIN_DEFAULT("jz4740-mmc.0", - "10010000.jz4740-pinctrl", "PD0", pin_cfg_bias_disable), + "10010000.pin-controller", "PD0", pin_cfg_bias_disable), PIN_MAP_CONFIGS_PIN_DEFAULT("jz4740-mmc.0", - "10010000.jz4740-pinctrl", "PD2", pin_cfg_bias_disable), + "10010000.pin-controller", "PD2", pin_cfg_bias_disable), /* PWM pin configuration */ PIN_MAP_MUX_GROUP_DEFAULT("jz4740-pwm", - "10010000.jz4740-pinctrl", "pwm4", "pwm4"), + "10010000.pin-controller", "pwm4", "pwm4"), }; From 75100ec5f079c5c3c65b28a6b8d360acfe00b983 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Mon, 1 Jul 2019 14:07:55 +0300 Subject: [PATCH 40/52] perf/core: Fix exclusive events' grouping commit 8a58ddae23796c733c5dfbd717538d89d036c5bd upstream. So far, we tried to disallow grouping exclusive events for the fear of complications they would cause with moving between contexts. Specifically, moving a software group to a hardware context would violate the exclusivity rules if both groups contain matching exclusive events. This attempt was, however, unsuccessful: the check that we have in the perf_event_open() syscall is both wrong (looks at wrong PMU) and insufficient (group leader may still be exclusive), as can be illustrated by running: $ perf record -e '{intel_pt//,cycles}' uname $ perf record -e '{cycles,intel_pt//}' uname ultimately successfully. Furthermore, we are completely free to trigger the exclusivity violation by: perf -e '{cycles,intel_pt//}' -e '{intel_pt//,instructions}' even though the helpful perf record will not allow that, the ABI will. The warning later in the perf_event_open() path will also not trigger, because it's also wrong. Fix all this by validating the original group before moving, getting rid of broken safeguards and placing a useful one to perf_install_in_context(). Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: mathieu.poirier@linaro.org Cc: will.deacon@arm.com Fixes: bed5b25ad9c8a ("perf: Add a pmu capability for "exclusive" events") Link: https://lkml.kernel.org/r/20190701110755.24646-1-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 34 ++++++++++++++++++++++------------ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 42fc852bf512..b22bc81f3669 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1030,6 +1030,11 @@ static inline int in_software_context(struct perf_event *event) return event->ctx->pmu->task_ctx_nr == perf_sw_context; } +static inline int is_exclusive_pmu(struct pmu *pmu) +{ + return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE; +} + extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); diff --git a/kernel/events/core.c b/kernel/events/core.c index 3b61ff40bfe2..6fb9c60eba00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2541,6 +2541,9 @@ unlock: return ret; } +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx); + /* * Attach a performance event to a context. * @@ -2555,6 +2558,8 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); + WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); + if (event->cpu != -1) event->cpu = cpu; @@ -4341,7 +4346,7 @@ static int exclusive_event_init(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return 0; /* @@ -4372,7 +4377,7 @@ static void exclusive_event_destroy(struct perf_event *event) { struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + if (!is_exclusive_pmu(pmu)) return; /* see comment in exclusive_event_init() */ @@ -4392,14 +4397,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) return false; } -/* Called under the same ctx::mutex as perf_install_in_context() */ static bool exclusive_event_installable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *iter_event; struct pmu *pmu = event->pmu; - if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + lockdep_assert_held(&ctx->mutex); + + if (!is_exclusive_pmu(pmu)) return true; list_for_each_entry(iter_event, &ctx->event_list, event_entry) { @@ -10613,11 +10619,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } - if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { - err = -EBUSY; - goto err_context; - } - /* * Look up the group leader (we will attach this event to it): */ @@ -10705,6 +10706,18 @@ SYSCALL_DEFINE5(perf_event_open, move_group = 0; } } + + /* + * Failure to create exclusive events returns -EBUSY. + */ + err = -EBUSY; + if (!exclusive_event_installable(group_leader, ctx)) + goto err_locked; + + for_each_sibling_event(sibling, group_leader) { + if (!exclusive_event_installable(sibling, ctx)) + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } @@ -10741,9 +10754,6 @@ SYSCALL_DEFINE5(perf_event_open, * because we need to serialize with concurrent event creation. */ if (!exclusive_event_installable(event, ctx)) { - /* exclusive and group stuff are assumed mutually exclusive */ - WARN_ON_ONCE(move_group); - err = -EBUSY; goto err_locked; } From 4a5cc64d8a8a0f937e6d857faf571966c903341d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 13 Jul 2019 11:21:25 +0200 Subject: [PATCH 41/52] perf/core: Fix race between close() and fork() commit 1cf8dfe8a661f0462925df943140e9f6d1ea5233 upstream. Syzcaller reported the following Use-after-Free bug: close() clone() copy_process() perf_event_init_task() perf_event_init_context() mutex_lock(parent_ctx->mutex) inherit_task_group() inherit_group() inherit_event() mutex_lock(event->child_mutex) // expose event on child list list_add_tail() mutex_unlock(event->child_mutex) mutex_unlock(parent_ctx->mutex) ... goto bad_fork_* bad_fork_cleanup_perf: perf_event_free_task() perf_release() perf_event_release_kernel() list_for_each_entry() mutex_lock(ctx->mutex) mutex_lock(event->child_mutex) // event is from the failing inherit // on the other CPU perf_remove_from_context() list_move() mutex_unlock(event->child_mutex) mutex_unlock(ctx->mutex) mutex_lock(ctx->mutex) list_for_each_entry_safe() // event already stolen mutex_unlock(ctx->mutex) delayed_free_task() free_task() list_for_each_entry_safe() list_del() free_event() _free_event() // and so event->hw.target // is the already freed failed clone() if (event->hw.target) put_task_struct(event->hw.target) // WHOOPSIE, already quite dead Which puts the lie to the the comment on perf_event_free_task(): 'unexposed, unused context' not so much. Which is a 'fun' confluence of fail; copy_process() doing an unconditional free_task() and not respecting refcounts, and perf having creative locking. In particular: 82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp") seems to have overlooked this 'fun' parade. Solve it by using the fact that detached events still have a reference count on their (previous) context. With this perf_event_free_task() can detect when events have escaped and wait for their destruction. Debugged-by: Alexander Shishkin Reported-by: syzbot+a24c397a29ad22d86c98@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 82d94856fa22 ("perf/core: Fix lock inversion between perf,trace,cpuhp") Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6fb9c60eba00..e8979c72514b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4452,12 +4452,20 @@ static void _free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - if (event->ctx) - put_ctx(event->ctx); - + /* + * Must be after ->destroy(), due to uprobe_perf_close() using + * hw.target. + */ if (event->hw.target) put_task_struct(event->hw.target); + /* + * perf_event_free_task() relies on put_ctx() being 'last', in particular + * all task references must be cleaned up. + */ + if (event->ctx) + put_ctx(event->ctx); + exclusive_event_destroy(event); module_put(event->pmu->module); @@ -4637,8 +4645,17 @@ again: mutex_unlock(&event->child_mutex); list_for_each_entry_safe(child, tmp, &free_list, child_list) { + void *var = &child->ctx->refcount; + list_del(&child->child_list); free_event(child); + + /* + * Wake any perf_event_free_task() waiting for this event to be + * freed. + */ + smp_mb(); /* pairs with wait_var_event() */ + wake_up_var(var); } no_ctx: @@ -11220,11 +11237,11 @@ static void perf_free_event(struct perf_event *event, } /* - * Free an unexposed, unused context as created by inheritance by - * perf_event_init_task below, used by fork() in case of fail. + * Free a context as created by inheritance by perf_event_init_task() below, + * used by fork() in case of fail. * - * Not all locks are strictly required, but take them anyway to be nice and - * help out with the lockdep assertions. + * Even though the task has never lived, the context and events have been + * exposed through the child_list, so we must take care tearing it all down. */ void perf_event_free_task(struct task_struct *task) { @@ -11254,7 +11271,23 @@ void perf_event_free_task(struct task_struct *task) perf_free_event(event, ctx); mutex_unlock(&ctx->mutex); - put_ctx(ctx); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } } From 29171e82348cb0ce787120d42f0b94e8fba2bcae Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 9 Jun 2019 21:41:41 -0400 Subject: [PATCH 42/52] ext4: don't allow any modifications to an immutable file commit 2e53840362771c73eb0a5ff71611507e64e8eecd upstream. Don't allow any modifications to a file that's marked immutable, which means that we have to flush all the writable pages to make the readonly and we have to check the setattr/setflags parameters more closely. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ioctl.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 53d57cdf3c4d..abb6fcff0a1d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -268,6 +268,29 @@ static int uuid_is_zero(__u8 u[16]) } #endif +/* + * If immutable is set and we are not clearing it, we're not allowed to change + * anything else in the inode. Don't error out if we're only trying to set + * immutable on an immutable file. + */ +static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int oldflags = ei->i_flags; + + if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) + return 0; + + if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) + return -EPERM; + if (ext4_has_feature_project(inode->i_sb) && + __kprojid_val(ei->i_projid) != new_projid) + return -EPERM; + + return 0; +} + static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { @@ -321,6 +344,20 @@ static int ext4_ioctl_setflags(struct inode *inode, goto flags_out; } + /* + * Wait for all pending directio and then flush all the dirty pages + * for this file. The flush marks all the pages readonly, so any + * subsequent attempt to write to the file (particularly mmap pages) + * will come through the filesystem and fail. + */ + if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && + (flags & EXT4_IMMUTABLE_FL)) { + inode_dio_wait(inode); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto flags_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -750,7 +787,11 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return err; inode_lock(inode); - err = ext4_ioctl_setflags(inode, flags); + err = ext4_ioctl_check_immutable(inode, + from_kprojid(&init_user_ns, ei->i_projid), + flags); + if (!err) + err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); mnt_drop_write_file(filp); return err; @@ -1120,6 +1161,9 @@ resizefs_out: goto out; flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); + err = ext4_ioctl_check_immutable(inode, fa.fsx_projid, flags); + if (err) + goto out; err = ext4_ioctl_setflags(inode, flags); if (err) goto out; From c9ea4620a37f544e46cead4e276b4f273141f08a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Jun 2019 22:04:33 -0400 Subject: [PATCH 43/52] ext4: enforce the immutable flag on open files commit 02b016ca7f99229ae6227e7b2fc950c4e140d74a upstream. According to the chattr man page, "a file with the 'i' attribute cannot be modified..." Historically, this was only enforced when the file was opened, per the rest of the description, "... and the file can not be opened in write mode". There is general agreement that we should standardize all file systems to prevent modifications even for files that were opened at the time the immutable flag is set. Eventually, a change to enforce this at the VFS layer should be landing in mainline. Until then, enforce this at the ext4 level to prevent xfstests generic/553 from failing. Signed-off-by: Theodore Ts'o Cc: "Darrick J. Wong" Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 4 ++++ fs/ext4/inode.c | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2c5baa5e8291..f4a24a46245e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -165,6 +165,10 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 05dc5a4ba481..db318b5ec812 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5491,6 +5491,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; + + if (unlikely(IS_APPEND(inode) && + (ia_valid & (ATTR_MODE | ATTR_UID | + ATTR_GID | ATTR_TIMES_SET)))) + return -EPERM; + error = setattr_prepare(dentry, attr); if (error) return error; @@ -6190,6 +6198,9 @@ int ext4_page_mkwrite(struct vm_fault *vmf) get_block_t *get_block; int retries = 0; + if (unlikely(IS_IMMUTABLE(inode))) + return VM_FAULT_SIGBUS; + sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); From 4becd6c11e9a0be4ab8af7b28e9bbc01288da014 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 20 Jun 2019 17:05:37 -0400 Subject: [PATCH 44/52] mm: add filemap_fdatawait_range_keep_errors() commit aa0bfcd939c30617385ffa28682c062d78050eba upstream. In the spirit of filemap_fdatawait_range() and filemap_fdatawait_keep_errors(), introduce filemap_fdatawait_range_keep_errors() which both takes a range upon which to wait and does not clear errors from the address space. Signed-off-by: Ross Zwisler Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/fs.h | 2 ++ mm/filemap.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index d4e1b43a53c3..92420009b9bc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2651,6 +2651,8 @@ extern int filemap_flush(struct address_space *); extern int filemap_fdatawait_keep_errors(struct address_space *mapping); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); +extern int filemap_fdatawait_range_keep_errors(struct address_space *mapping, + loff_t start_byte, loff_t end_byte); static inline int filemap_fdatawait(struct address_space *mapping) { diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..287f3fa02e5e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -561,6 +561,28 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, } EXPORT_SYMBOL(filemap_fdatawait_range); +/** + * filemap_fdatawait_range_keep_errors - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space in the + * given range and wait for all of them. Unlike filemap_fdatawait_range(), + * this function does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), + * fsfreeze(8) + */ +int filemap_fdatawait_range_keep_errors(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return filemap_check_and_keep_errors(mapping); +} +EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors); + /** * file_fdatawait_range - wait for writeback to complete * @file: file pointing to address space structure to wait for From af3812b65c3785f7d052b4a1f5dd7c6f54a77e24 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 20 Jun 2019 17:24:56 -0400 Subject: [PATCH 45/52] jbd2: introduce jbd2_inode dirty range scoping commit 6ba0e7dc64a5adcda2fbe65adc466891795d639e upstream. Currently both journal_submit_inode_data_buffers() and journal_finish_inode_data_buffers() operate on the entire address space of each of the inodes associated with a given journal entry. The consequence of this is that if we have an inode where we are constantly appending dirty pages we can end up waiting for an indefinite amount of time in journal_finish_inode_data_buffers() while we wait for all the pages under writeback to be written out. The easiest way to cause this type of workload is do just dd from /dev/zero to a file until it fills the entire filesystem. This can cause journal_finish_inode_data_buffers() to wait for the duration of the entire dd operation. We can improve this situation by scoping each of the inode dirty ranges associated with a given transaction. We do this via the jbd2_inode structure so that the scoping is contained within jbd2 and so that it follows the lifetime and locking rules for that structure. This allows us to limit the writeback & wait in journal_submit_inode_data_buffers() and journal_finish_inode_data_buffers() respectively to the dirty range for a given struct jdb2_inode, keeping us from waiting forever if the inode in question is still being appended to. Signed-off-by: Ross Zwisler Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/jbd2/commit.c | 23 ++++++++++++++------ fs/jbd2/journal.c | 4 ++++ fs/jbd2/transaction.c | 49 ++++++++++++++++++++++++------------------- include/linux/jbd2.h | 22 +++++++++++++++++++ 4 files changed, 71 insertions(+), 27 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 65ea0355a4f6..24f86ffe11d7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -187,14 +187,15 @@ static int journal_wait_on_commit_record(journal_t *journal, * use writepages() because with dealyed allocation we may be doing * block allocation in writepages(). */ -static int journal_submit_inode_data_buffers(struct address_space *mapping) +static int journal_submit_inode_data_buffers(struct address_space *mapping, + loff_t dirty_start, loff_t dirty_end) { int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = mapping->nrpages * 2, - .range_start = 0, - .range_end = i_size_read(mapping->host), + .range_start = dirty_start, + .range_end = dirty_end, }; ret = generic_writepages(mapping, &wbc); @@ -218,6 +219,9 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + loff_t dirty_start = jinode->i_dirty_start; + loff_t dirty_end = jinode->i_dirty_end; + if (!(jinode->i_flags & JI_WRITE_DATA)) continue; mapping = jinode->i_vfs_inode->i_mapping; @@ -230,7 +234,8 @@ static int journal_submit_data_buffers(journal_t *journal, * only allocated blocks here. */ trace_jbd2_submit_inode_data(jinode->i_vfs_inode); - err = journal_submit_inode_data_buffers(mapping); + err = journal_submit_inode_data_buffers(mapping, dirty_start, + dirty_end); if (!ret) ret = err; spin_lock(&journal->j_list_lock); @@ -257,12 +262,16 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + loff_t dirty_start = jinode->i_dirty_start; + loff_t dirty_end = jinode->i_dirty_end; + if (!(jinode->i_flags & JI_WAIT_DATA)) continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawait_keep_errors( - jinode->i_vfs_inode->i_mapping); + err = filemap_fdatawait_range_keep_errors( + jinode->i_vfs_inode->i_mapping, dirty_start, + dirty_end); if (!ret) ret = err; spin_lock(&journal->j_list_lock); @@ -282,6 +291,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, &jinode->i_transaction->t_inode_list); } else { jinode->i_transaction = NULL; + jinode->i_dirty_start = 0; + jinode->i_dirty_end = 0; } } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e9cf88f0bc29..df390a69c49a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,6 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); EXPORT_SYMBOL(jbd2_journal_inode_add_write); EXPORT_SYMBOL(jbd2_journal_inode_add_wait); +EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); +EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); @@ -2588,6 +2590,8 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) jinode->i_next_transaction = NULL; jinode->i_vfs_inode = inode; jinode->i_flags = 0; + jinode->i_dirty_start = 0; + jinode->i_dirty_end = 0; INIT_LIST_HEAD(&jinode->i_list); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e20a6703531f..911ff18249b7 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2500,7 +2500,7 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) * File inode in the inode list of the handle's transaction */ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, - unsigned long flags) + unsigned long flags, loff_t start_byte, loff_t end_byte) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -2512,26 +2512,17 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); - /* - * First check whether inode isn't already on the transaction's - * lists without taking the lock. Note that this check is safe - * without the lock as we cannot race with somebody removing inode - * from the transaction. The reason is that we remove inode from the - * transaction only in journal_release_jbd_inode() and when we commit - * the transaction. We are guarded from the first case by holding - * a reference to the inode. We are safe against the second case - * because if jinode->i_transaction == transaction, commit code - * cannot touch the transaction because we hold reference to it, - * and if jinode->i_next_transaction == transaction, commit code - * will only file the inode where we want it. - */ - if ((jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) && - (jinode->i_flags & flags) == flags) - return 0; - spin_lock(&journal->j_list_lock); jinode->i_flags |= flags; + + if (jinode->i_dirty_end) { + jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); + jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); + } else { + jinode->i_dirty_start = start_byte; + jinode->i_dirty_end = end_byte; + } + /* Is inode already attached where we need it? */ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) @@ -2566,12 +2557,28 @@ done: int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) { return jbd2_journal_file_inode(handle, jinode, - JI_WRITE_DATA | JI_WAIT_DATA); + JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); } int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) { - return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA); + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, + LLONG_MAX); +} + +int jbd2_journal_inode_ranged_write(handle_t *handle, + struct jbd2_inode *jinode, loff_t start_byte, loff_t length) +{ + return jbd2_journal_file_inode(handle, jinode, + JI_WRITE_DATA | JI_WAIT_DATA, start_byte, + start_byte + length - 1); +} + +int jbd2_journal_inode_ranged_wait(handle_t *handle, struct jbd2_inode *jinode, + loff_t start_byte, loff_t length) +{ + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, + start_byte, start_byte + length - 1); } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 583b82b5a1e9..1cf1b9b8e975 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -454,6 +454,22 @@ struct jbd2_inode { * @i_flags: Flags of inode [j_list_lock] */ unsigned long i_flags; + + /** + * @i_dirty_start: + * + * Offset in bytes where the dirty range for this inode starts. + * [j_list_lock] + */ + loff_t i_dirty_start; + + /** + * @i_dirty_end: + * + * Inclusive offset in bytes where the dirty range for this inode + * ends. [j_list_lock] + */ + loff_t i_dirty_end; }; struct jbd2_revoke_table_s; @@ -1399,6 +1415,12 @@ extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_inode_ranged_write(handle_t *handle, + struct jbd2_inode *inode, loff_t start_byte, + loff_t length); +extern int jbd2_journal_inode_ranged_wait(handle_t *handle, + struct jbd2_inode *inode, loff_t start_byte, + loff_t length); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); From caa4e08253eb3d7cb2e00a20daac240b6ef82197 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 20 Jun 2019 17:26:26 -0400 Subject: [PATCH 46/52] ext4: use jbd2_inode dirty range scoping commit 73131fbb003b3691cfcf9656f234b00da497fcd6 upstream. Use the newly introduced jbd2_inode dirty range scoping to prevent us from waiting forever when trying to complete a journal transaction. Signed-off-by: Ross Zwisler Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.h | 12 ++++++------ fs/ext4/inode.c | 13 ++++++++++--- fs/ext4/move_extent.c | 3 ++- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index df908ef79cce..402dc366117e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -361,20 +361,20 @@ static inline int ext4_journal_force_commit(journal_t *journal) } static inline int ext4_jbd2_inode_add_write(handle_t *handle, - struct inode *inode) + struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) - return jbd2_journal_inode_add_write(handle, - EXT4_I(inode)->jinode); + return jbd2_journal_inode_ranged_write(handle, + EXT4_I(inode)->jinode, start_byte, length); return 0; } static inline int ext4_jbd2_inode_add_wait(handle_t *handle, - struct inode *inode) + struct inode *inode, loff_t start_byte, loff_t length) { if (ext4_handle_valid(handle)) - return jbd2_journal_inode_add_wait(handle, - EXT4_I(inode)->jinode); + return jbd2_journal_inode_ranged_wait(handle, + EXT4_I(inode)->jinode, start_byte, length); return 0; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index db318b5ec812..e65559bf7728 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -729,10 +729,16 @@ out_sem: !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { + loff_t start_byte = + (loff_t)map->m_lblk << inode->i_blkbits; + loff_t length = (loff_t)map->m_len << inode->i_blkbits; + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) - ret = ext4_jbd2_inode_add_wait(handle, inode); + ret = ext4_jbd2_inode_add_wait(handle, inode, + start_byte, length); else - ret = ext4_jbd2_inode_add_write(handle, inode); + ret = ext4_jbd2_inode_add_write(handle, inode, + start_byte, length); if (ret) return ret; } @@ -4058,7 +4064,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) - err = ext4_jbd2_inode_add_write(handle, inode); + err = ext4_jbd2_inode_add_write(handle, inode, from, + length); } unlock: diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2f5be02fc6f6..287631bb09e7 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -390,7 +390,8 @@ data_copy: /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ - *err = ext4_jbd2_inode_add_write(handle, orig_inode); + *err = ext4_jbd2_inode_add_write(handle, orig_inode, + (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); unlock_pages: unlock_page(pagep[0]); From 3a17ca864baffc0c6f6e8aad525aa4365775a193 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 20 Jun 2019 21:19:02 -0400 Subject: [PATCH 47/52] ext4: allow directory holes commit 4e19d6b65fb4fc42e352ce9883649e049da14743 upstream. The largedir feature was intended to allow ext4 directories to have unmapped directory blocks (e.g., directory holes). And so the released e2fsprogs no longer enforces this for largedir file systems; however, the corresponding change to the kernel-side code was not made. This commit fixes this oversight. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Greg Kroah-Hartman --- fs/ext4/dir.c | 19 +++++++++---------- fs/ext4/namei.c | 45 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f93f9881ec18..46d5c40f2835 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -108,7 +108,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; - int dir_has_error = 0; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); if (ext4_encrypted_inode(inode)) { @@ -144,8 +143,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - offset = ctx->pos & (sb->s_blocksize - 1); - while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; @@ -154,9 +151,18 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) goto errout; } cond_resched(); + offset = ctx->pos & (sb->s_blocksize - 1); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); + if (err == 0) { + /* m_len should never be zero but let's avoid + * an infinite loop if it somehow is */ + if (map.m_len == 0) + map.m_len = 1; + ctx->pos += map.m_len * sb->s_blocksize; + continue; + } if (err > 0) { pgoff_t index = map.m_pblk >> (PAGE_SHIFT - inode->i_blkbits); @@ -175,13 +181,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } if (!bh) { - if (!dir_has_error) { - EXT4_ERROR_FILE(file, 0, - "directory contains a " - "hole at offset %llu", - (unsigned long long) ctx->pos); - dir_has_error = 1; - } /* corrupt size? Maybe no more blocks to read */ if (ctx->pos > inode->i_blocks << 9) break; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 4c5aa5df6573..61dc1b0e4465 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -81,8 +81,18 @@ static struct buffer_head *ext4_append(handle_t *handle, static int ext4_dx_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent); +/* + * Hints to ext4_read_dirblock regarding whether we expect a directory + * block being read to be an index block, or a block containing + * directory entries (and if the latter, whether it was found via a + * logical block in an htree index block). This is used to control + * what sort of sanity checkinig ext4_read_dirblock() will do on the + * directory block read from the storage device. EITHER will means + * the caller doesn't know what kind of directory block will be read, + * so no specific verification will be done. + */ typedef enum { - EITHER, INDEX, DIRENT + EITHER, INDEX, DIRENT, DIRENT_HTREE } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ @@ -108,11 +118,14 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, return bh; } - if (!bh) { + if (!bh && (type == INDEX || type == DIRENT_HTREE)) { ext4_error_inode(inode, func, line, block, - "Directory hole found"); + "Directory hole found for htree %s block", + (type == INDEX) ? "index" : "leaf"); return ERR_PTR(-EFSCORRUPTED); } + if (!bh) + return NULL; dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ if (is_dx(inode)) { @@ -979,7 +992,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); - bh = ext4_read_dirblock(dir, block, DIRENT); + bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) return PTR_ERR(bh); @@ -1509,7 +1522,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, return (struct buffer_head *) frame; do { block = dx_get_block(frame->at); - bh = ext4_read_dirblock(dir, block, DIRENT); + bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); if (IS_ERR(bh)) goto errout; @@ -2079,6 +2092,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); + if (bh == NULL) { + bh = ext4_bread(handle, dir, block, + EXT4_GET_BLOCKS_CREATE); + goto add_to_new_block; + } if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; @@ -2099,6 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, brelse(bh); } bh = ext4_append(handle, dir, &block); +add_to_new_block: if (IS_ERR(bh)) { retval = PTR_ERR(bh); bh = NULL; @@ -2143,7 +2162,7 @@ again: return PTR_ERR(frame); entries = frame->entries; at = frame->at; - bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); + bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT_HTREE); if (IS_ERR(bh)) { err = PTR_ERR(bh); bh = NULL; @@ -2691,7 +2710,10 @@ bool ext4_empty_dir(struct inode *inode) EXT4_ERROR_INODE(inode, "invalid size"); return true; } - bh = ext4_read_dirblock(inode, 0, EITHER); + /* The first directory block must not be a hole, + * so treat it as DIRENT_HTREE + */ + bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) return true; @@ -2713,6 +2735,10 @@ bool ext4_empty_dir(struct inode *inode) brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); bh = ext4_read_dirblock(inode, lblock, EITHER); + if (bh == NULL) { + offset += sb->s_blocksize; + continue; + } if (IS_ERR(bh)) return true; de = (struct ext4_dir_entry_2 *) bh->b_data; @@ -3256,7 +3282,10 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { - bh = ext4_read_dirblock(inode, 0, EITHER); + /* The first directory block must not be a hole, so + * treat it as DIRENT_HTREE + */ + bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE); if (IS_ERR(bh)) { *retval = PTR_ERR(bh); return NULL; From 967bc679c596ebf41afbb5c9ffc4216354428b6f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 19 Jul 2019 18:41:10 +0200 Subject: [PATCH 48/52] KVM: nVMX: do not use dangling shadow VMCS after guest reset commit 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 upstream. If a KVM guest is reset while running a nested guest, free_nested will disable the shadow VMCS execution control in the vmcs01. However, on the next KVM_RUN vmx_vcpu_run would nevertheless try to sync the VMCS12 to the shadow VMCS which has since been freed. This causes a vmptrld of a NULL pointer on my machime, but Jan reports the host to hang altogether. Let's see how much this trivial patch fixes. Reported-by: Jan Kiszka Cc: Liran Alon Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 73d6d585dd66..880bc36a0d5d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8457,6 +8457,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx) { vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); + vmx->nested.sync_shadow_vmcs = false; } static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) @@ -8468,7 +8469,6 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) /* copy to memory all shadowed fields in case they were modified */ copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; vmx_disable_shadow_vmcs(vmx); } vmx->nested.posted_intr_nv = -1; @@ -8668,6 +8668,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + if (WARN_ON(!shadow_vmcs)) + return; + preempt_disable(); vmcs_load(shadow_vmcs); @@ -8706,6 +8709,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) u64 field_value = 0; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; + if (WARN_ON(!shadow_vmcs)) + return; + vmcs_load(shadow_vmcs); for (q = 0; q < ARRAY_SIZE(fields); q++) { From 7560e33369ed30ab68180dcd6b8ef342bbb3c29a Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 21 Jul 2019 13:52:18 +0200 Subject: [PATCH 49/52] KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested commit cf64527bb33f6cec2ed50f89182fc4688d0056b6 upstream. Letting this pend may cause nested_get_vmcs12_pages to run against an invalid state, corrupting the effective vmcs of L1. This was triggerable in QEMU after a guest corruption in L2, followed by a L1 reset. Signed-off-by: Jan Kiszka Reviewed-by: Liran Alon Cc: stable@vger.kernel.org Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM") Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 880bc36a0d5d..4cf16378dffe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8490,6 +8490,8 @@ static void free_nested(struct vcpu_vmx *vmx) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; + kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, &vmx->vcpu); + hrtimer_cancel(&vmx->nested.preemption_timer); vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; From c1d98b766ebe9a6b94d41c76f2ada07604c18c95 Mon Sep 17 00:00:00 2001 From: Kuo-Hsin Yang Date: Thu, 11 Jul 2019 20:52:04 -0700 Subject: [PATCH 50/52] mm: vmscan: scan anonymous pages on file refaults commit 2c012a4ad1a2cd3fb5a0f9307b9d219f84eda1fa upstream. When file refaults are detected and there are many inactive file pages, the system never reclaim anonymous pages, the file pages are dropped aggressively when there are still a lot of cold anonymous pages and system thrashes. This issue impacts the performance of applications with large executable, e.g. chrome. With this patch, when file refault is detected, inactive_list_is_low() always returns true for file pages in get_scan_count() to enable scanning anonymous pages. The problem can be reproduced by the following test program. ---8<--- void fallocate_file(const char *filename, off_t size) { struct stat st; int fd; if (!stat(filename, &st) && st.st_size >= size) return; fd = open(filename, O_WRONLY | O_CREAT, 0600); if (fd < 0) { perror("create file"); exit(1); } if (posix_fallocate(fd, 0, size)) { perror("fallocate"); exit(1); } close(fd); } long *alloc_anon(long size) { long *start = malloc(size); memset(start, 1, size); return start; } long access_file(const char *filename, long size, long rounds) { int fd, i; volatile char *start1, *end1, *start2; const int page_size = getpagesize(); long sum = 0; fd = open(filename, O_RDONLY); if (fd == -1) { perror("open"); exit(1); } /* * Some applications, e.g. chrome, use a lot of executable file * pages, map some of the pages with PROT_EXEC flag to simulate * the behavior. */ start1 = mmap(NULL, size / 2, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0); if (start1 == MAP_FAILED) { perror("mmap"); exit(1); } end1 = start1 + size / 2; start2 = mmap(NULL, size / 2, PROT_READ, MAP_SHARED, fd, size / 2); if (start2 == MAP_FAILED) { perror("mmap"); exit(1); } for (i = 0; i < rounds; ++i) { struct timeval before, after; volatile char *ptr1 = start1, *ptr2 = start2; gettimeofday(&before, NULL); for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size) sum += *ptr1 + *ptr2; gettimeofday(&after, NULL); printf("File access time, round %d: %f (sec) ", i, (after.tv_sec - before.tv_sec) + (after.tv_usec - before.tv_usec) / 1000000.0); } return sum; } int main(int argc, char *argv[]) { const long MB = 1024 * 1024; long anon_mb, file_mb, file_rounds; const char filename[] = "large"; long *ret1; long ret2; if (argc != 4) { printf("usage: thrash ANON_MB FILE_MB FILE_ROUNDS "); exit(0); } anon_mb = atoi(argv[1]); file_mb = atoi(argv[2]); file_rounds = atoi(argv[3]); fallocate_file(filename, file_mb * MB); printf("Allocate %ld MB anonymous pages ", anon_mb); ret1 = alloc_anon(anon_mb * MB); printf("Access %ld MB file pages ", file_mb); ret2 = access_file(filename, file_mb * MB, file_rounds); printf("Print result to prevent optimization: %ld ", *ret1 + ret2); return 0; } ---8<--- Running the test program on 2GB RAM VM with kernel 5.2.0-rc5, the program fills ram with 2048 MB memory, access a 200 MB file for 10 times. Without this patch, the file cache is dropped aggresively and every access to the file is from disk. $ ./thrash 2048 200 10 Allocate 2048 MB anonymous pages Access 200 MB file pages File access time, round 0: 2.489316 (sec) File access time, round 1: 2.581277 (sec) File access time, round 2: 2.487624 (sec) File access time, round 3: 2.449100 (sec) File access time, round 4: 2.420423 (sec) File access time, round 5: 2.343411 (sec) File access time, round 6: 2.454833 (sec) File access time, round 7: 2.483398 (sec) File access time, round 8: 2.572701 (sec) File access time, round 9: 2.493014 (sec) With this patch, these file pages can be cached. $ ./thrash 2048 200 10 Allocate 2048 MB anonymous pages Access 200 MB file pages File access time, round 0: 2.475189 (sec) File access time, round 1: 2.440777 (sec) File access time, round 2: 2.411671 (sec) File access time, round 3: 1.955267 (sec) File access time, round 4: 0.029924 (sec) File access time, round 5: 0.000808 (sec) File access time, round 6: 0.000771 (sec) File access time, round 7: 0.000746 (sec) File access time, round 8: 0.000738 (sec) File access time, round 9: 0.000747 (sec) Checked the swap out stats during the test [1], 19006 pages swapped out with this patch, 3418 pages swapped out without this patch. There are more swap out, but I think it's within reasonable range when file backed data set doesn't fit into the memory. $ ./thrash 2000 100 2100 5 1 # ANON_MB FILE_EXEC FILE_NOEXEC ROUNDS PROCESSES Allocate 2000 MB anonymous pages active_anon: 1613644, inactive_anon: 348656, active_file: 892, inactive_file: 1384 (kB) pswpout: 7972443, pgpgin: 478615246 Access 100 MB executable file pages Access 2100 MB regular file pages File access time, round 0: 12.165, (sec) active_anon: 1433788, inactive_anon: 478116, active_file: 17896, inactive_file: 24328 (kB) File access time, round 1: 11.493, (sec) active_anon: 1430576, inactive_anon: 477144, active_file: 25440, inactive_file: 26172 (kB) File access time, round 2: 11.455, (sec) active_anon: 1427436, inactive_anon: 476060, active_file: 21112, inactive_file: 28808 (kB) File access time, round 3: 11.454, (sec) active_anon: 1420444, inactive_anon: 473632, active_file: 23216, inactive_file: 35036 (kB) File access time, round 4: 11.479, (sec) active_anon: 1413964, inactive_anon: 471460, active_file: 31728, inactive_file: 32224 (kB) pswpout: 7991449 (+ 19006), pgpgin: 489924366 (+ 11309120) With 4 processes accessing non-overlapping parts of a large file, 30316 pages swapped out with this patch, 5152 pages swapped out without this patch. The swapout number is small comparing to pgpgin. [1]: https://github.com/vovo/testing/blob/master/mem_thrash.c Link: http://lkml.kernel.org/r/20190701081038.GA83398@google.com Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty") Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty") Signed-off-by: Kuo-Hsin Yang Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Sonny Rao Cc: Mel Gorman Cc: Rik van Riel Cc: Vladimir Davydov Cc: Minchan Kim Cc: [4.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds [backported to 4.14.y, 4.19.y, 5.1.y: adjust context] Signed-off-by: Kuo-Hsin Yang Signed-off-by: Greg Kroah-Hartman --- mm/vmscan.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index e42f44cf7b43..576379e87421 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2190,7 +2190,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool actual_reclaim) + struct scan_control *sc, bool trace) { enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -2216,7 +2216,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * rid of the stale workingset quickly. */ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE); - if (file && actual_reclaim && lruvec->refaults != refaults) { + if (file && lruvec->refaults != refaults) { inactive_ratio = 0; } else { gb = (inactive + active) >> (30 - PAGE_SHIFT); @@ -2226,7 +2226,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive_ratio = 1; } - if (actual_reclaim) + if (trace) trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, From 60e9babfda94023ea39a09841804d44551fb98fd Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sun, 21 Jul 2019 17:44:12 +0300 Subject: [PATCH 51/52] net: sched: verify that q!=NULL before setting q->flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 503d81d428bd598430f7f9d02021634e1a8139a0 upstream. In function int tc_new_tfilter() q pointer can be NULL when adding filter on a shared block. With recent change that resets TCQ_F_CAN_BYPASS after filter creation, following NULL pointer dereference happens in case parent block is shared: [ 212.925060] BUG: kernel NULL pointer dereference, address: 0000000000000010 [ 212.925445] #PF: supervisor write access in kernel mode [ 212.925709] #PF: error_code(0x0002) - not-present page [ 212.925965] PGD 8000000827923067 P4D 8000000827923067 PUD 827924067 PMD 0 [ 212.926302] Oops: 0002 [#1] SMP KASAN PTI [ 212.926539] CPU: 18 PID: 2617 Comm: tc Tainted: G B 5.2.0+ #512 [ 212.926938] Hardware name: Supermicro SYS-2028TP-DECR/X10DRT-P, BIOS 2.0b 03/30/2017 [ 212.927364] RIP: 0010:tc_new_tfilter+0x698/0xd40 [ 212.927633] Code: 74 0d 48 85 c0 74 08 48 89 ef e8 03 aa 62 00 48 8b 84 24 a0 00 00 00 48 8d 78 10 48 89 44 24 18 e8 4d 0c 6b ff 48 8b 44 24 18 <83> 60 10 f b 48 85 ed 0f 85 3d fe ff ff e9 4f fe ff ff e8 81 26 f8 [ 212.928607] RSP: 0018:ffff88884fd5f5d8 EFLAGS: 00010296 [ 212.928905] RAX: 0000000000000000 RBX: 0000000000000000 RCX: dffffc0000000000 [ 212.929201] RDX: 0000000000000007 RSI: 0000000000000004 RDI: 0000000000000297 [ 212.929402] RBP: ffff88886bedd600 R08: ffffffffb91d4b51 R09: fffffbfff7616e4d [ 212.929609] R10: fffffbfff7616e4c R11: ffffffffbb0b7263 R12: ffff88886bc61040 [ 212.929803] R13: ffff88884fd5f950 R14: ffffc900039c5000 R15: ffff88835e927680 [ 212.929999] FS: 00007fe7c50b6480(0000) GS:ffff88886f980000(0000) knlGS:0000000000000000 [ 212.930235] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 212.930394] CR2: 0000000000000010 CR3: 000000085bd04002 CR4: 00000000001606e0 [ 212.930588] Call Trace: [ 212.930682] ? tc_del_tfilter+0xa40/0xa40 [ 212.930811] ? __lock_acquire+0x5b5/0x2460 [ 212.930948] ? find_held_lock+0x85/0xa0 [ 212.931081] ? tc_del_tfilter+0xa40/0xa40 [ 212.931201] rtnetlink_rcv_msg+0x4ab/0x5f0 [ 212.931332] ? rtnl_dellink+0x490/0x490 [ 212.931454] ? lockdep_hardirqs_on+0x260/0x260 [ 212.931589] ? netlink_deliver_tap+0xab/0x5a0 [ 212.931717] ? match_held_lock+0x1b/0x240 [ 212.931844] netlink_rcv_skb+0xd0/0x200 [ 212.931958] ? rtnl_dellink+0x490/0x490 [ 212.932079] ? netlink_ack+0x440/0x440 [ 212.932205] ? netlink_deliver_tap+0x161/0x5a0 [ 212.932335] ? lock_downgrade+0x360/0x360 [ 212.932457] ? lock_acquire+0xe5/0x210 [ 212.932579] netlink_unicast+0x296/0x350 [ 212.932705] ? netlink_attachskb+0x390/0x390 [ 212.932834] ? _copy_from_iter_full+0xe0/0x3a0 [ 212.932976] netlink_sendmsg+0x394/0x600 [ 212.937998] ? netlink_unicast+0x350/0x350 [ 212.943033] ? move_addr_to_kernel.part.0+0x90/0x90 [ 212.948115] ? netlink_unicast+0x350/0x350 [ 212.953185] sock_sendmsg+0x96/0xa0 [ 212.958099] ___sys_sendmsg+0x482/0x520 [ 212.962881] ? match_held_lock+0x1b/0x240 [ 212.967618] ? copy_msghdr_from_user+0x250/0x250 [ 212.972337] ? lock_downgrade+0x360/0x360 [ 212.976973] ? rwlock_bug.part.0+0x60/0x60 [ 212.981548] ? __mod_node_page_state+0x1f/0xa0 [ 212.986060] ? match_held_lock+0x1b/0x240 [ 212.990567] ? find_held_lock+0x85/0xa0 [ 212.994989] ? do_user_addr_fault+0x349/0x5b0 [ 212.999387] ? lock_downgrade+0x360/0x360 [ 213.003713] ? find_held_lock+0x85/0xa0 [ 213.007972] ? __fget_light+0xa1/0xf0 [ 213.012143] ? sockfd_lookup_light+0x91/0xb0 [ 213.016165] __sys_sendmsg+0xba/0x130 [ 213.020040] ? __sys_sendmsg_sock+0xb0/0xb0 [ 213.023870] ? handle_mm_fault+0x337/0x470 [ 213.027592] ? page_fault+0x8/0x30 [ 213.031316] ? lockdep_hardirqs_off+0xbe/0x100 [ 213.034999] ? mark_held_locks+0x24/0x90 [ 213.038671] ? do_syscall_64+0x1e/0xe0 [ 213.042297] do_syscall_64+0x74/0xe0 [ 213.045828] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 213.049354] RIP: 0033:0x7fe7c527c7b8 [ 213.052792] Code: 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 65 8f 0c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f 0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 89 54 [ 213.060269] RSP: 002b:00007ffc3f7908a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 213.064144] RAX: ffffffffffffffda RBX: 000000005d34716f RCX: 00007fe7c527c7b8 [ 213.068094] RDX: 0000000000000000 RSI: 00007ffc3f790910 RDI: 0000000000000003 [ 213.072109] RBP: 0000000000000000 R08: 0000000000000001 R09: 00007fe7c5340cc0 [ 213.076113] R10: 0000000000404ec2 R11: 0000000000000246 R12: 0000000000000080 [ 213.080146] R13: 0000000000480640 R14: 0000000000000080 R15: 0000000000000000 [ 213.084147] Modules linked in: act_gact cls_flower sch_ingress nfsv3 nfs_acl nfs lockd grace fscache bridge stp llc sunrpc intel_rapl_msr intel_rapl_common [<1;69;32Msb_edac rdma_ucm rdma_cm x86_pkg_temp_thermal iw_cm intel_powerclamp ib_cm coretemp kvm_intel kvm irqbypass mlx5_ib ib_uverbs ib_core crct10dif_pclmul crc32_pc lmul crc32c_intel ghash_clmulni_intel mlx5_core intel_cstate intel_uncore iTCO_wdt igb iTCO_vendor_support mlxfw mei_me ptp ses intel_rapl_perf mei pcspkr ipmi _ssif i2c_i801 joydev enclosure pps_core lpc_ich ioatdma wmi dca ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_pad ast i2c_algo_bit drm_vram_helpe r ttm drm_kms_helper drm mpt3sas raid_class scsi_transport_sas [ 213.112326] CR2: 0000000000000010 [ 213.117429] ---[ end trace adb58eb0a4ee6283 ]--- Verify that q pointer is not NULL before setting the 'flags' field. Fixes: 3f05e6886a59 ("net_sched: unset TCQ_F_CAN_BYPASS when adding filters") Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/sched/cls_api.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 3374903d5f96..4159bcb479c6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1325,7 +1325,9 @@ replay: tcf_chain_tp_insert(chain, &chain_info, tp); tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false); - q->flags &= ~TCQ_F_CAN_BYPASS; + /* q pointer is NULL for shared blocks */ + if (q) + q->flags &= ~TCQ_F_CAN_BYPASS; } else { if (tp_created) tcf_proto_destroy(tp, NULL); From 64f4694072aa4ac23eb9ad2feeb0a178d2a054da Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 28 Jul 2019 08:29:30 +0200 Subject: [PATCH 52/52] Linux 4.19.62 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b16485c580d7..a4463d880ae2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 19 -SUBLEVEL = 61 +SUBLEVEL = 62 EXTRAVERSION = NAME = "People's Front"