From 4141168704e40509b376d0c32822eccc29e00627 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 16 Sep 2020 10:00:58 +0100 Subject: [PATCH 01/72] drm/i915: Break up error capture compression loops with cond_resched() commit 7d5553147613b50149238ac1385c60e5c7cacb34 upstream. As the error capture will compress user buffers as directed to by the user, it can take an arbitrary amount of time and space. Break up the compression loops with a call to cond_resched(), that will allow other processes to schedule (avoiding the soft lockups) and also serve as a warning should we try to make this loop atomic in the future. Testcase: igt/gem_exec_capture/many-* Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: stable@vger.kernel.org Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20200916090059.3189-2-chris@chris-wilson.co.uk (cherry picked from commit 293f43c80c0027ff9299036c24218ac705ce584e) Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index a262a64f5625..ba24ac698e8b 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -268,6 +268,8 @@ static int compress_page(struct compress *c, if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) return -EIO; + + cond_resched(); } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? */ @@ -347,6 +349,7 @@ static int compress_page(struct compress *c, if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) memcpy(ptr, src, PAGE_SIZE); dst->pages[dst->page_count++] = ptr; + cond_resched(); return 0; } From 604ac10d9d1e9a85787d53acbba541f420b9cc9e Mon Sep 17 00:00:00 2001 From: Hoang Huu Le Date: Thu, 27 Aug 2020 09:56:51 +0700 Subject: [PATCH 02/72] tipc: fix use-after-free in tipc_bcast_get_mode commit fdeba99b1e58ecd18c2940c453e19e4ef20ff591 upstream. Syzbot has reported those issues as: ================================================================== BUG: KASAN: use-after-free in tipc_bcast_get_mode+0x3ab/0x400 net/tipc/bcast.c:759 Read of size 1 at addr ffff88805e6b3571 by task kworker/0:6/3850 CPU: 0 PID: 3850 Comm: kworker/0:6 Not tainted 5.8.0-rc7-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events tipc_net_finalize_work Thread 1's call trace: [...] kfree+0x103/0x2c0 mm/slab.c:3757 <- bcbase releasing tipc_bcast_stop+0x1b0/0x2f0 net/tipc/bcast.c:721 tipc_exit_net+0x24/0x270 net/tipc/core.c:112 [...] Thread 2's call trace: [...] tipc_bcast_get_mode+0x3ab/0x400 net/tipc/bcast.c:759 <- bcbase has already been freed by Thread 1 tipc_node_broadcast+0x9e/0xcc0 net/tipc/node.c:1744 tipc_nametbl_publish+0x60b/0x970 net/tipc/name_table.c:752 tipc_net_finalize net/tipc/net.c:141 [inline] tipc_net_finalize+0x1fa/0x310 net/tipc/net.c:131 tipc_net_finalize_work+0x55/0x80 net/tipc/net.c:150 [...] ================================================================== BUG: KASAN: use-after-free in tipc_named_reinit+0xef/0x290 net/tipc/name_distr.c:344 Read of size 8 at addr ffff888052ab2000 by task kworker/0:13/30628 CPU: 0 PID: 30628 Comm: kworker/0:13 Not tainted 5.8.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events tipc_net_finalize_work Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1f0/0x31e lib/dump_stack.c:118 print_address_description+0x66/0x5a0 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report+0x132/0x1d0 mm/kasan/report.c:530 tipc_named_reinit+0xef/0x290 net/tipc/name_distr.c:344 tipc_net_finalize+0x85/0xe0 net/tipc/net.c:138 tipc_net_finalize_work+0x50/0x70 net/tipc/net.c:150 process_one_work+0x789/0xfc0 kernel/workqueue.c:2269 worker_thread+0xaa4/0x1460 kernel/workqueue.c:2415 kthread+0x37e/0x3a0 drivers/block/aoe/aoecmd.c:1234 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293 [...] Freed by task 14058: save_stack mm/kasan/common.c:48 [inline] set_track mm/kasan/common.c:56 [inline] kasan_set_free_info mm/kasan/common.c:316 [inline] __kasan_slab_free+0x114/0x170 mm/kasan/common.c:455 __cache_free mm/slab.c:3426 [inline] kfree+0x10a/0x220 mm/slab.c:3757 tipc_exit_net+0x29/0x50 net/tipc/core.c:113 ops_exit_list net/core/net_namespace.c:186 [inline] cleanup_net+0x708/0xba0 net/core/net_namespace.c:603 process_one_work+0x789/0xfc0 kernel/workqueue.c:2269 worker_thread+0xaa4/0x1460 kernel/workqueue.c:2415 kthread+0x37e/0x3a0 drivers/block/aoe/aoecmd.c:1234 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:293 Fix it by calling flush_scheduled_work() to make sure the tipc_net_finalize_work() stopped before releasing bcbase object. Reported-by: syzbot+6ea1f7a8df64596ef4d7@syzkaller.appspotmail.com Reported-by: syzbot+e9cc557752ab126c1b99@syzkaller.appspotmail.com Acked-by: Jon Maloy Signed-off-by: Hoang Huu Le Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/tipc/core.c b/net/tipc/core.c index 49bb9fc0aa06..ce0f067d0faf 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -93,6 +93,11 @@ out_sk_rht: static void __net_exit tipc_exit_net(struct net *net) { tipc_net_stop(net); + + /* Make sure the tipc_net_finalize_work stopped + * before releasing the resources. + */ + flush_scheduled_work(); tipc_bcast_stop(net); tipc_nametbl_stop(net); tipc_sk_rht_destroy(net); From caf8f9c19a93ec501d0ffca5e6767a969961baea Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 1 Nov 2020 17:07:44 -0800 Subject: [PATCH 03/72] ptrace: fix task_join_group_stop() for the case when current is traced commit 7b3c36fc4c231ca532120bbc0df67a12f09c1d96 upstream. This testcase #include #include #include #include #include #include #include void *tf(void *arg) { return NULL; } int main(void) { int pid = fork(); if (!pid) { kill(getpid(), SIGSTOP); pthread_t th; pthread_create(&th, NULL, tf, NULL); return 0; } waitpid(pid, NULL, WSTOPPED); ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACECLONE); waitpid(pid, NULL, 0); ptrace(PTRACE_CONT, pid, 0,0); waitpid(pid, NULL, 0); int status; int thread = waitpid(-1, &status, 0); assert(thread > 0 && thread != pid); assert(status == 0x80137f); return 0; } fails and triggers WARN_ON_ONCE(!signr) in do_jobctl_trap(). This is because task_join_group_stop() has 2 problems when current is traced: 1. We can't rely on the "JOBCTL_STOP_PENDING" check, a stopped tracee can be woken up by debugger and it can clone another thread which should join the group-stop. We need to check group_stop_count || SIGNAL_STOP_STOPPED. 2. If SIGNAL_STOP_STOPPED is already set, we should not increment sig->group_stop_count and add JOBCTL_STOP_CONSUME. The new thread should stop without another do_notify_parent_cldstop() report. To clarify, the problem is very old and we should blame ptrace_init_task(). But now that we have task_join_group_stop() it makes more sense to fix this helper to avoid the code duplication. Reported-by: syzbot+3485e3773f7da290eecc@syzkaller.appspotmail.com Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Cc: Jens Axboe Cc: Christian Brauner Cc: "Eric W . Biederman" Cc: Zhiqiang Liu Cc: Tejun Heo Cc: Link: https://lkml.kernel.org/r/20201019134237.GA18810@redhat.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/signal.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index 6a5692118139..a02a25acf205 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -385,16 +385,17 @@ static bool task_participate_group_stop(struct task_struct *task) void task_join_group_stop(struct task_struct *task) { + unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; + struct signal_struct *sig = current->signal; + + if (sig->group_stop_count) { + sig->group_stop_count++; + mask |= JOBCTL_STOP_CONSUME; + } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) + return; + /* Have the new thread join an on-going signal group stop */ - unsigned long jobctl = current->jobctl; - if (jobctl & JOBCTL_STOP_PENDING) { - struct signal_struct *sig = current->signal; - unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; - unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; - if (task_set_jobctl_pending(task, signr | gstop)) { - sig->group_stop_count++; - } - } + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } /* From d67543112dc5f39b41f9bd309e47da25be2bb14c Mon Sep 17 00:00:00 2001 From: Mark Deneen Date: Fri, 30 Oct 2020 15:58:14 +0000 Subject: [PATCH 04/72] cadence: force nonlinear buffers to be cloned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 403dc16796f5516acf23d94a1cd9eba564d03210 ] In my test setup, I had a SAMA5D27 device configured with ip forwarding, and second device with usb ethernet (r8152) sending ICMP packets.  If the packet was larger than about 220 bytes, the SAMA5 device would "oops" with the following trace: kernel BUG at net/core/skbuff.c:1863! Internal error: Oops - BUG: 0 [#1] ARM Modules linked in: xt_MASQUERADE ppp_async ppp_generic slhc iptable_nat xt_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 can_raw can bridge stp llc ipt_REJECT nf_reject_ipv4 sd_mod cdc_ether usbnet usb_storage r8152 scsi_mod mii o ption usb_wwan usbserial micrel macb at91_sama5d2_adc phylink gpio_sama5d2_piobu m_can_platform m_can industrialio_triggered_buffer kfifo_buf of_mdio can_dev fixed_phy sdhci_of_at91 sdhci_pltfm libphy sdhci mmc_core ohci_at91 ehci_atmel o hci_hcd iio_rescale industrialio sch_fq_codel spidev prox2_hal(O) CPU: 0 PID: 0 Comm: swapper Tainted: G           O      5.9.1-prox2+ #1 Hardware name: Atmel SAMA5 PC is at skb_put+0x3c/0x50 LR is at macb_start_xmit+0x134/0xad0 [macb] pc : []    lr : []    psr: 20070113 sp : c0d01a60  ip : c07232c0  fp : c4250000 r10: c0d03cc8  r9 : 00000000  r8 : c0d038c0 r7 : 00000000  r6 : 00000008  r5 : c59b66c0  r4 : 0000002a r3 : 8f659eff  r2 : c59e9eea  r1 : 00000001  r0 : c59b66c0 Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none Control: 10c53c7d  Table: 2640c059  DAC: 00000051 Process swapper (pid: 0, stack limit = 0x75002d81) [] (skb_put) from [] (macb_start_xmit+0x134/0xad0 [macb]) [] (macb_start_xmit [macb]) from [] (dev_hard_start_xmit+0x90/0x11c) [] (dev_hard_start_xmit) from [] (sch_direct_xmit+0x124/0x260) [] (sch_direct_xmit) from [] (__dev_queue_xmit+0x4b0/0x6d0) [] (__dev_queue_xmit) from [] (ip_finish_output2+0x350/0x580) [] (ip_finish_output2) from [] (ip_output+0xb4/0x13c) [] (ip_output) from [] (ip_forward+0x474/0x500) [] (ip_forward) from [] (ip_sublist_rcv_finish+0x3c/0x50) [] (ip_sublist_rcv_finish) from [] (ip_sublist_rcv+0x11c/0x188) [] (ip_sublist_rcv) from [] (ip_list_rcv+0xf8/0x124) [] (ip_list_rcv) from [] (__netif_receive_skb_list_core+0x1a0/0x20c) [] (__netif_receive_skb_list_core) from [] (netif_receive_skb_list_internal+0x194/0x230) [] (netif_receive_skb_list_internal) from [] (gro_normal_list.part.0+0x14/0x28) [] (gro_normal_list.part.0) from [] (napi_complete_done+0x16c/0x210) [] (napi_complete_done) from [] (r8152_poll+0x684/0x708 [r8152]) [] (r8152_poll [r8152]) from [] (net_rx_action+0x100/0x328) [] (net_rx_action) from [] (__do_softirq+0xec/0x274) [] (__do_softirq) from [] (irq_exit+0xcc/0xd0) [] (irq_exit) from [] (__handle_domain_irq+0x58/0xa4) [] (__handle_domain_irq) from [] (__irq_svc+0x6c/0x90) Exception stack(0xc0d01ef0 to 0xc0d01f38) 1ee0:                                     00000000 0000003d 0c31f383 c0d0fa00 1f00: c0d2eb80 00000000 c0d2e630 4dad8c49 4da967b0 0000003d 0000003d 00000000 1f20: fffffff5 c0d01f40 c04e0f88 c04e0f8c 30070013 ffffffff [] (__irq_svc) from [] (cpuidle_enter_state+0x7c/0x378) [] (cpuidle_enter_state) from [] (cpuidle_enter+0x28/0x38) [] (cpuidle_enter) from [] (do_idle+0x194/0x214) [] (do_idle) from [] (cpu_startup_entry+0xc/0x14) [] (cpu_startup_entry) from [] (start_kernel+0x46c/0x4a0) Code: e580c054 8a000002 e1a00002 e8bd8070 (e7f001f2) ---[ end trace 146c8a334115490c ]--- The solution was to force nonlinear buffers to be cloned.  This was previously reported by Klaus Doth (https://www.spinics.net/lists/netdev/msg556937.html) but never formally submitted as a patch. This is the third revision, hopefully the formatting is correct this time! Suggested-by: Klaus Doth Fixes: 653e92a9175e ("net: macb: add support for padding and fcs computation") Signed-off-by: Mark Deneen Link: https://lore.kernel.org/r/20201030155814.622831-1-mdeneen@saucontech.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cadence/macb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index acae87f548a1..0374a1ba1010 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -1704,7 +1704,8 @@ static inline int macb_clear_csum(struct sk_buff *skb) static int macb_pad_and_fcs(struct sk_buff **skb, struct net_device *ndev) { - bool cloned = skb_cloned(*skb) || skb_header_cloned(*skb); + bool cloned = skb_cloned(*skb) || skb_header_cloned(*skb) || + skb_is_nonlinear(*skb); int padlen = ETH_ZLEN - (*skb)->len; int headroom = skb_headroom(*skb); int tailroom = skb_tailroom(*skb); From fe478a589404287513bf4cbb074df80805db9f6f Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 2 Nov 2020 23:06:51 +0530 Subject: [PATCH 05/72] chelsio/chtls: fix memory leaks caused by a race [ Upstream commit 8080b462b6aa856ae05ea010441a702599e579f2 ] race between user context and softirq causing memleak, consider the call sequence scenario chtls_setkey() //user context chtls_peer_close() chtls_abort_req_rss() chtls_setkey() //user context work request skb queued in chtls_setkey() won't be freed because resources are already cleaned for this connection, fix it by not queuing work request while socket is closing. v1->v2: - fix W=1 warning. v2->v3: - separate it out from another memleak fix. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Vinay Kumar Yadav Link: https://lore.kernel.org/r/20201102173650.24754-1-vinay.yadav@chelsio.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/chelsio/chtls/chtls_hw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/crypto/chelsio/chtls/chtls_hw.c b/drivers/crypto/chelsio/chtls/chtls_hw.c index 64d24823c65a..2294fb06bef3 100644 --- a/drivers/crypto/chelsio/chtls/chtls_hw.c +++ b/drivers/crypto/chelsio/chtls/chtls_hw.c @@ -368,6 +368,9 @@ int chtls_setkey(struct chtls_sock *csk, u32 keylen, u32 optname) if (ret) goto out_notcb; + if (unlikely(csk_flag(sk, CSK_ABORT_SHUTDOWN))) + goto out_notcb; + set_wr_txq(skb, CPL_PRIORITY_DATA, csk->tlshws.txqid); csk->wr_credits -= DIV_ROUND_UP(len, 16); csk->wr_unacked += DIV_ROUND_UP(len, 16); From 75b0b05ace856333c2a94c2173c92973dd027297 Mon Sep 17 00:00:00 2001 From: Vinay Kumar Yadav Date: Mon, 2 Nov 2020 23:09:10 +0530 Subject: [PATCH 06/72] chelsio/chtls: fix always leaking ctrl_skb [ Upstream commit dbfe394dad33f99cf8458be50483ec40a5d29c34 ] Correct skb refcount in alloc_ctrl_skb(), causing skb memleak when chtls_send_abort() called with NULL skb. it was always leaking the skb, correct it by incrementing skb refs by one. Fixes: cc35c88ae4db ("crypto : chtls - CPL handler definition") Signed-off-by: Vinay Kumar Yadav Link: https://lore.kernel.org/r/20201102173909.24826-1-vinay.yadav@chelsio.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/chelsio/chtls/chtls_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/chelsio/chtls/chtls_cm.c b/drivers/crypto/chelsio/chtls/chtls_cm.c index f7334c42ebd9..35116cdb5e38 100644 --- a/drivers/crypto/chelsio/chtls/chtls_cm.c +++ b/drivers/crypto/chelsio/chtls/chtls_cm.c @@ -175,7 +175,7 @@ static struct sk_buff *alloc_ctrl_skb(struct sk_buff *skb, int len) { if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) { __skb_trim(skb, 0); - refcount_add(2, &skb->users); + refcount_inc(&skb->users); } else { skb = alloc_skb(len, GFP_KERNEL | __GFP_NOFAIL); } From 63a22c7cb9d9a422e959bd7a01d4af07cfa00298 Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Thu, 29 Oct 2020 10:10:56 +0200 Subject: [PATCH 07/72] gianfar: Replace skb_realloc_headroom with skb_cow_head for PTP [ Upstream commit d145c9031325fed963a887851d9fa42516efd52b ] When PTP timestamping is enabled on Tx, the controller inserts the Tx timestamp at the beginning of the frame buffer, between SFD and the L2 frame header. This means that the skb provided by the stack is required to have enough headroom otherwise a new skb needs to be created by the driver to accommodate the timestamp inserted by h/w. Up until now the driver was relying on skb_realloc_headroom() to create new skbs to accommodate PTP frames. Turns out that this method is not reliable in this context at least, as skb_realloc_headroom() for PTP frames can cause random crashes, mostly in subsequent skb_*() calls, when multiple concurrent TCP streams are run at the same time with the PTP flow on the same device (as seen in James' report). I also noticed that when the system is loaded by sending multiple TCP streams, the driver receives cloned skbs in large numbers. skb_cow_head() instead proves to be stable in this scenario, and not only handles cloned skbs too but it's also more efficient and widely used in other drivers. The commit introducing skb_realloc_headroom in the driver goes back to 2009, commit 93c1285c5d92 ("gianfar: reallocate skb when headroom is not enough for fcb"). For practical purposes I'm referencing a newer commit (from 2012) that brings the code to its current structure (and fixes the PTP case). Fixes: 9c4886e5e63b ("gianfar: Fix invalid TX frames returned on error queue when time stamping") Reported-by: James Jurack Suggested-by: Jakub Kicinski Signed-off-by: Claudiu Manoil Link: https://lore.kernel.org/r/20201029081057.8506-1-claudiu.manoil@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/gianfar.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 8243501c3757..2e7988e1c0ab 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2370,20 +2370,12 @@ static netdev_tx_t gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) fcb_len = GMAC_FCB_LEN + GMAC_TXPAL_LEN; /* make space for additional header when fcb is needed */ - if (fcb_len && unlikely(skb_headroom(skb) < fcb_len)) { - struct sk_buff *skb_new; - - skb_new = skb_realloc_headroom(skb, fcb_len); - if (!skb_new) { + if (fcb_len) { + if (unlikely(skb_cow_head(skb, fcb_len))) { dev->stats.tx_errors++; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } - - if (skb->sk) - skb_set_owner_w(skb_new, skb->sk); - dev_consume_skb_any(skb); - skb = skb_new; } /* total number of fragments in the SKB */ From 3a40f713276b2a03609749c6ef2e9efe4acdfa1a Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Tue, 20 Oct 2020 20:36:05 +0300 Subject: [PATCH 08/72] gianfar: Account for Tx PTP timestamp in the skb headroom [ Upstream commit d6a076d68c6b5d6a5800f3990a513facb7016dea ] When PTP timestamping is enabled on Tx, the controller inserts the Tx timestamp at the beginning of the frame buffer, between SFD and the L2 frame header. This means that the skb provided by the stack is required to have enough headroom otherwise a new skb needs to be created by the driver to accommodate the timestamp inserted by h/w. Up until now the driver was relying on the second option, using skb_realloc_headroom() to create a new skb to accommodate PTP frames. Turns out that this method is not reliable, as reallocation of skbs for PTP frames along with the required overhead (skb_set_owner_w, consume_skb) is causing random crashes in subsequent skb_*() calls, when multiple concurrent TCP streams are run at the same time on the same device (as seen in James' report). Note that these crashes don't occur with a single TCP stream, nor with multiple concurrent UDP streams, but only when multiple TCP streams are run concurrently with the PTP packet flow (doing skb reallocation). This patch enforces the first method, by requesting enough headroom from the stack to accommodate PTP frames, and so avoiding skb_realloc_headroom() & co, and the crashes no longer occur. There's no reason not to set needed_headroom to a large enough value to accommodate PTP frames, so in this regard this patch is a fix. Reported-by: James Jurack Fixes: bee9e58c9e98 ("gianfar:don't add FCB length to hard_header_len") Signed-off-by: Claudiu Manoil Link: https://lore.kernel.org/r/20201020173605.1173-1-claudiu.manoil@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/gianfar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 2e7988e1c0ab..8db0924ec681 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -1388,7 +1388,7 @@ static int gfar_probe(struct platform_device *ofdev) if (dev->features & NETIF_F_IP_CSUM || priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER) - dev->needed_headroom = GMAC_FCB_LEN; + dev->needed_headroom = GMAC_FCB_LEN + GMAC_TXPAL_LEN; /* Initializing some of the rx/tx queue level parameters */ for (i = 0; i < priv->num_tx_queues; i++) { From 1a9118379cc7b66bfd2ac030da4af3a712a9e9e2 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Mon, 2 Nov 2020 12:01:08 +0100 Subject: [PATCH 09/72] net: usb: qmi_wwan: add Telit LE910Cx 0x1230 composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5fd8477ed8ca77e64b93d44a6dae4aa70c191396 ] Add support for Telit LE910Cx 0x1230 composition: 0x1230: tty, adb, rmnet, audio, tty, tty, tty, tty Signed-off-by: Daniele Palmas Acked-by: Bjørn Mork Link: https://lore.kernel.org/r/20201102110108.17244-1-dnlplm@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index d2612b69257e..6e0b3dc14aa4 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1268,6 +1268,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x1bc7, 0x1101, 3)}, /* Telit ME910 dual modem */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1230, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1260, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1261, 2)}, /* Telit LE910Cx */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1900, 1)}, /* Telit LN940 series */ From 64752f5cfda61aa7ca12d23ca1ecc7d36e996f93 Mon Sep 17 00:00:00 2001 From: Petr Malat Date: Fri, 30 Oct 2020 14:26:33 +0100 Subject: [PATCH 10/72] sctp: Fix COMM_LOST/CANT_STR_ASSOC err reporting on big-endian platforms [ Upstream commit b6df8c81412190fbd5eaa3cec7f642142d9c16cd ] Commit 978aa0474115 ("sctp: fix some type cast warnings introduced since very beginning")' broke err reading from sctp_arg, because it reads the value as 32-bit integer, although the value is stored as 16-bit integer. Later this value is passed to the userspace in 16-bit variable, thus the user always gets 0 on big-endian platforms. Fix it by reading the __u16 field of sctp_arg union, as reading err field would produce a sparse warning. Fixes: 978aa0474115 ("sctp: fix some type cast warnings introduced since very beginning") Signed-off-by: Petr Malat Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/20201030132633.7045-1-oss@malat.biz Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/sctp/sm_sideeffect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 567517e44811..bb4f1d0da153 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1615,12 +1615,12 @@ static int sctp_cmd_interpreter(enum sctp_event event_type, break; case SCTP_CMD_INIT_FAILED: - sctp_cmd_init_failed(commands, asoc, cmd->obj.u32); + sctp_cmd_init_failed(commands, asoc, cmd->obj.u16); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, - subtype, chunk, cmd->obj.u32); + subtype, chunk, cmd->obj.u16); break; case SCTP_CMD_INIT_COUNTER_INC: From 4ff840604a45673bb02dfaaa5f30cd9bc40011e5 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 31 Oct 2020 11:10:53 +0800 Subject: [PATCH 11/72] sfp: Fix error handing in sfp_probe() [ Upstream commit 9621618130bf7e83635367c13b9a6ee53935bb37 ] gpiod_to_irq() never return 0, but returns negative in case of error, check it and set gpio_irq to 0. Fixes: 73970055450e ("sfp: add SFP module support") Signed-off-by: YueHaibing Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20201031031053.25264-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/sfp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/sfp.c b/drivers/net/phy/sfp.c index 998d08ae7431..47d518e6d5d4 100644 --- a/drivers/net/phy/sfp.c +++ b/drivers/net/phy/sfp.c @@ -1886,7 +1886,8 @@ static int sfp_probe(struct platform_device *pdev) continue; irq = gpiod_to_irq(sfp->gpio[i]); - if (!irq) { + if (irq < 0) { + irq = 0; poll = true; continue; } From 8a78b4c0d6292d32d76b4268b5a33ae089a5d791 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:28 +0000 Subject: [PATCH 12/72] blktrace: fix debugfs use after free commit bad8e64fb19d3a0de5e564d9a7271c31bd684369 upstream. On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory") merged on v4.12 Omar fixed the original blktrace code for request-based drivers (multiqueue). This however left in place a possible crash, if you happen to abuse blktrace while racing to remove / add a device. We used to use asynchronous removal of the request_queue, and with that the issue was easier to reproduce. Now that we have reverted to synchronous removal of the request_queue, the issue is still possible to reproduce, its however just a bit more difficult. We essentially run two instances of break-blktrace which add/remove a loop device, and setup a blktrace and just never tear the blktrace down. We do this twice in parallel. This is easily reproduced with the script run_0004.sh from break-blktrace [0]. We can end up with two types of panics each reflecting where we race, one a failed blktrace setup: [ 252.426751] debugfs: Directory 'loop0' with parent 'block' already present! [ 252.432265] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 252.436592] #PF: supervisor write access in kernel mode [ 252.439822] #PF: error_code(0x0002) - not-present page [ 252.442967] PGD 0 P4D 0 [ 252.444656] Oops: 0002 [#1] SMP NOPTI [ 252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 [ 252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 252.456343] RIP: 0010:down_write+0x15/0x40 [ 252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 00 00 48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89 45 08 5d [ 252.463638] RSP: 0018:ffffa626415abcc8 EFLAGS: 00010246 [ 252.464950] RAX: 0000000000000000 RBX: ffff958c25f0f5c0 RCX: ffffff8100000000 [ 252.466727] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 [ 252.468482] RBP: 00000000000000a0 R08: 0000000000000000 R09: 0000000000000001 [ 252.470014] R10: 0000000000000000 R11: ffff958d1f9227ff R12: 0000000000000000 [ 252.471473] R13: ffff958c25ea5380 R14: ffffffff8cce15f1 R15: 00000000000000a0 [ 252.473346] FS: 00007f2e69dee540(0000) GS:ffff958c2fc80000(0000) knlGS:0000000000000000 [ 252.475225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 252.476267] CR2: 00000000000000a0 CR3: 0000000427d10004 CR4: 0000000000360ee0 [ 252.477526] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 252.478776] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 252.479866] Call Trace: [ 252.480322] simple_recursive_removal+0x4e/0x2e0 [ 252.481078] ? debugfs_remove+0x60/0x60 [ 252.481725] ? relay_destroy_buf+0x77/0xb0 [ 252.482662] debugfs_remove+0x40/0x60 [ 252.483518] blk_remove_buf_file_callback+0x5/0x10 [ 252.484328] relay_close_buf+0x2e/0x60 [ 252.484930] relay_open+0x1ce/0x2c0 [ 252.485520] do_blk_trace_setup+0x14f/0x2b0 [ 252.486187] __blk_trace_setup+0x54/0xb0 [ 252.486803] blk_trace_ioctl+0x90/0x140 [ 252.487423] ? do_sys_openat2+0x1ab/0x2d0 [ 252.488053] blkdev_ioctl+0x4d/0x260 [ 252.488636] block_ioctl+0x39/0x40 [ 252.489139] ksys_ioctl+0x87/0xc0 [ 252.489675] __x64_sys_ioctl+0x16/0x20 [ 252.490380] do_syscall_64+0x52/0x180 [ 252.491032] entry_SYSCALL_64_after_hwframe+0x44/0xa9 And the other on the device removal: [ 128.528940] debugfs: Directory 'loop0' with parent 'block' already present! [ 128.615325] BUG: kernel NULL pointer dereference, address: 00000000000000a0 [ 128.619537] #PF: supervisor write access in kernel mode [ 128.622700] #PF: error_code(0x0002) - not-present page [ 128.625842] PGD 0 P4D 0 [ 128.627585] Oops: 0002 [#1] SMP NOPTI [ 128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: G E 5.7.0-rc2-next-20200420+ #164 [ 128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 [ 128.640471] RIP: 0010:down_write+0x15/0x40 [ 128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00 00 00 48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89 45 08 5d [ 128.650180] RSP: 0018:ffffa9c3c05ebd78 EFLAGS: 00010246 [ 128.651820] RAX: 0000000000000000 RBX: ffff8ae9a6370240 RCX: ffffff8100000000 [ 128.653942] RDX: 0000000000000001 RSI: ffffff8100000000 RDI: 00000000000000a0 [ 128.655720] RBP: 00000000000000a0 R08: 0000000000000002 R09: ffff8ae9afd2d3d0 [ 128.657400] R10: 0000000000000056 R11: 0000000000000000 R12: 0000000000000000 [ 128.659099] R13: 0000000000000000 R14: 0000000000000003 R15: 00000000000000a0 [ 128.660500] FS: 00007febfd995540(0000) GS:ffff8ae9afd00000(0000) knlGS:0000000000000000 [ 128.662204] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 128.663426] CR2: 00000000000000a0 CR3: 0000000420042003 CR4: 0000000000360ee0 [ 128.664776] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 128.666022] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 128.667282] Call Trace: [ 128.667801] simple_recursive_removal+0x4e/0x2e0 [ 128.668663] ? debugfs_remove+0x60/0x60 [ 128.669368] debugfs_remove+0x40/0x60 [ 128.669985] blk_trace_free+0xd/0x50 [ 128.670593] __blk_trace_remove+0x27/0x40 [ 128.671274] blk_trace_shutdown+0x30/0x40 [ 128.671935] blk_release_queue+0x95/0xf0 [ 128.672589] kobject_put+0xa5/0x1b0 [ 128.673188] disk_release+0xa2/0xc0 [ 128.673786] device_release+0x28/0x80 [ 128.674376] kobject_put+0xa5/0x1b0 [ 128.674915] loop_remove+0x39/0x50 [loop] [ 128.675511] loop_control_ioctl+0x113/0x130 [loop] [ 128.676199] ksys_ioctl+0x87/0xc0 [ 128.676708] __x64_sys_ioctl+0x16/0x20 [ 128.677274] do_syscall_64+0x52/0x180 [ 128.677823] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The common theme here is: debugfs: Directory 'loop0' with parent 'block' already present This crash happens because of how blktrace uses the debugfs directory where it places its files. Upon init we always create the same directory which would be needed by blktrace but we only do this for make_request drivers (multiqueue) block drivers. When you race a removal of these devices with a blktrace setup you end up in a situation where the make_request recursive debugfs removal will sweep away the blktrace files and then later blktrace will also try to remove individual dentries which are already NULL. The inverse is also possible and hence the two types of use after frees. We don't create the block debugfs directory on init for these types of block devices: * request-based block driver block devices * every possible partition * scsi-generic And so, this race should in theory only be possible with make_request drivers. We can fix the UAF by simply re-using the debugfs directory for make_request drivers (multiqueue) and only creating the ephemeral directory for the other type of block devices. The new clarifications on relying on the q->blk_trace_mutex *and* also checking for q->blk_trace *prior* to processing a blktrace ensures the debugfs directories are only created if no possible directory name clashes are possible. This goes tested with: o nvme partitions o ISCSI with tgt, and blktracing against scsi-generic with: o block o tape o cdrom o media changer o blktests This patch is part of the work which disputes the severity of CVE-2019-19770 which shows this issue is not a core debugfs issue, but a misuse of debugfs within blktace. Fixes: 6ac93117ab00 ("blktrace: use existing disk debugfs directory") Reported-by: syzbot+603294af2d01acfdd6da@syzkaller.appspotmail.com Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Cc: Bart Van Assche Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Nicolai Stange Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: "Martin K. Petersen" Cc: "James E.J. Bottomley" Cc: yu kuai Signed-off-by: Jens Axboe [bwh: Backported to 4.19: open-code queue_is_mq()] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- kernel/trace/blktrace.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1442f6152abc..645048bb1e86 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -521,10 +521,18 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; - ret = -ENOENT; - - dir = debugfs_lookup(buts->name, blk_debugfs_root); - if (!dir) +#ifdef CONFIG_BLK_DEBUG_FS + /* + * When tracing whole make_request drivers (multiqueue) block devices, + * reuse the existing debugfs directory created by the block layer on + * init. For request-based block devices, all partitions block devices, + * and scsi-generic block devices we create a temporary new debugfs + * directory that will be removed once the trace ends. + */ + if (q->mq_ops && bdev && bdev == bdev->bd_contains) + dir = q->debugfs_dir; + else +#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); if (!dir) goto err; @@ -583,8 +591,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, ret = 0; err: - if (dir && !bt->dir) - dput(dir); if (ret) blk_trace_free(bt); return ret; From 7f2b9e8d42bd957332dc2af0ba6a77684fe32b1c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 25 Jan 2019 13:09:15 +0800 Subject: [PATCH 13/72] btrfs: extent_io: Kill the forward declaration of flush_write_bio commit bb58eb9e167d087cc518f7a71c3c00f1671958da upstream. There is no need to forward declare flush_write_bio(), as it only depends on submit_one_bio(). Both of them are pretty small, just move them to kill the forward declaration. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: David Sterba [bwh: Cherry-picked for 4.19 to ease backporting later fixes] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 66 +++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 82d597b16152..128da03721d0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -138,7 +138,38 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static void flush_write_bio(struct extent_page_data *epd); +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + blk_status_t ret = 0; + struct bio_vec *bvec = bio_last_bvec_all(bio); + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + + start = page_offset(page) + bvec->bv_offset; + + bio->bi_private = NULL; + + if (tree->ops) + ret = tree->ops->submit_bio_hook(tree->private_data, bio, + mirror_num, bio_flags, start); + else + btrfsic_submit_bio(bio); + + return blk_status_to_errno(ret); +} + +static void flush_write_bio(struct extent_page_data *epd) +{ + if (epd->bio) { + int ret; + + ret = submit_one_bio(epd->bio, 0, 0); + BUG_ON(ret < 0); /* -ENOMEM */ + epd->bio = NULL; + } +} int __init extent_io_init(void) { @@ -2710,28 +2741,6 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) return bio; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) -{ - blk_status_t ret = 0; - struct bio_vec *bvec = bio_last_bvec_all(bio); - struct page *page = bvec->bv_page; - struct extent_io_tree *tree = bio->bi_private; - u64 start; - - start = page_offset(page) + bvec->bv_offset; - - bio->bi_private = NULL; - - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags, start); - else - btrfsic_submit_bio(bio); - - return blk_status_to_errno(ret); -} - /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @tree: tree so we can call our merge_bio hook @@ -4033,17 +4042,6 @@ retry: return ret; } -static void flush_write_bio(struct extent_page_data *epd) -{ - if (epd->bio) { - int ret; - - ret = submit_one_bio(epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ - epd->bio = NULL; - } -} - int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; From 63ece3bb01489e3942bc15e4aa976afcb3cd864f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 14:27:41 +0800 Subject: [PATCH 14/72] btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up commit f4340622e02261fae599e3da936ff4808b418173 upstream. We have a BUG_ON() in flush_write_bio() to handle the return value of submit_one_bio(). Move the BUG_ON() one level up to all its callers. This patch will introduce temporary variable, @flush_ret to keep code change minimal in this patch. That variable will be cleaned up when enhancing the error handling later. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba [bwh: Cherry-picked for 4.19 to ease backporting later fixes] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 55 +++++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 128da03721d0..e6dab32e37f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,15 +160,28 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, return blk_status_to_errno(ret); } -static void flush_write_bio(struct extent_page_data *epd) +/* + * Submit bio from extent page data via submit_one_bio + * + * Return 0 if everything is OK. + * Return <0 for error. + */ +static int __must_check flush_write_bio(struct extent_page_data *epd) { - if (epd->bio) { - int ret; + int ret = 0; + if (epd->bio) { ret = submit_one_bio(epd->bio, 0, 0); - BUG_ON(ret < 0); /* -ENOMEM */ + /* + * Clean up of epd->bio is handled by its endio function. + * And endio is either triggered by successful bio execution + * or the error handler of submit bio hook. + * So at this point, no matter what happened, we don't need + * to clean up epd->bio. + */ epd->bio = NULL; } + return ret; } int __init extent_io_init(void) @@ -3538,7 +3551,8 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!btrfs_try_tree_write_lock(eb)) { flush = 1; - flush_write_bio(epd); + ret = flush_write_bio(epd); + BUG_ON(ret < 0); btrfs_tree_lock(eb); } @@ -3547,7 +3561,8 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!epd->sync_io) return 0; if (!flush) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + BUG_ON(ret < 0); flush = 1; } while (1) { @@ -3588,7 +3603,8 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!trylock_page(p)) { if (!flush) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + BUG_ON(ret < 0); flush = 1; } lock_page(p); @@ -3779,6 +3795,7 @@ int btree_write_cache_pages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; int ret = 0; + int flush_ret; int done = 0; int nr_to_write_done = 0; struct pagevec pvec; @@ -3878,7 +3895,8 @@ retry: index = 0; goto retry; } - flush_write_bio(&epd); + flush_ret = flush_write_bio(&epd); + BUG_ON(flush_ret < 0); return ret; } @@ -3975,7 +3993,8 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - flush_write_bio(epd); + ret = flush_write_bio(epd); + BUG_ON(ret < 0); lock_page(page); } @@ -3985,8 +4004,10 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) - flush_write_bio(epd); + if (PageWriteback(page)) { + ret = flush_write_bio(epd); + BUG_ON(ret < 0); + } wait_on_page_writeback(page); } @@ -4045,6 +4066,7 @@ retry: int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; + int flush_ret; struct extent_page_data epd = { .bio = NULL, .tree = &BTRFS_I(page->mapping->host)->io_tree, @@ -4054,7 +4076,8 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) ret = __extent_writepage(page, wbc, &epd); - flush_write_bio(&epd); + flush_ret = flush_write_bio(&epd); + BUG_ON(flush_ret < 0); return ret; } @@ -4062,6 +4085,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode) { int ret = 0; + int flush_ret; struct address_space *mapping = inode->i_mapping; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *page; @@ -4096,7 +4120,8 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, start += PAGE_SIZE; } - flush_write_bio(&epd); + flush_ret = flush_write_bio(&epd); + BUG_ON(flush_ret < 0); return ret; } @@ -4104,6 +4129,7 @@ int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret = 0; + int flush_ret; struct extent_page_data epd = { .bio = NULL, .tree = &BTRFS_I(mapping->host)->io_tree, @@ -4112,7 +4138,8 @@ int extent_writepages(struct address_space *mapping, }; ret = extent_write_cache_pages(mapping, wbc, &epd); - flush_write_bio(&epd); + flush_ret = flush_write_bio(&epd); + BUG_ON(flush_ret < 0); return ret; } From aa38097b44153d7470ce8956ec4de8f1372390c6 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 12 Oct 2020 23:18:11 +0100 Subject: [PATCH 15/72] Revert "btrfs: flush write bio if we loop in extent_write_cache_pages" This reverts commit 860473714cbe7fbedcf92bfe3eb6d69fae8c74ff. That has an incorrect upstream commit reference, and was modified in a way that conflicts with some older fixes. We can cleanly cherry-pick the upstream commit *after* those fixes. Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e6dab32e37f4..fa8bccca8f8c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4045,14 +4045,6 @@ retry: */ scanned = 1; index = 0; - - /* - * If we're looping we could run into a page that is locked by a - * writer and that writer could be waiting on writeback for a - * page in our current bio, and thus deadlock, so flush the - * write bio here. - */ - flush_write_bio(epd); goto retry; } From 169ae603400293e0182469f3adfd135d04a38470 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2020 15:33:02 -0500 Subject: [PATCH 16/72] btrfs: flush write bio if we loop in extent_write_cache_pages commit 42ffb0bf584ae5b6b38f72259af1e0ee417ac77f upstream. There exists a deadlock with range_cyclic that has existed forever. If we loop around with a bio already built we could deadlock with a writer who has the page locked that we're attempting to write but is waiting on a page in our bio to be written out. The task traces are as follows PID: 1329874 TASK: ffff889ebcdf3800 CPU: 33 COMMAND: "kworker/u113:5" #0 [ffffc900297bb658] __schedule at ffffffff81a4c33f #1 [ffffc900297bb6e0] schedule at ffffffff81a4c6e3 #2 [ffffc900297bb6f8] io_schedule at ffffffff81a4ca42 #3 [ffffc900297bb708] __lock_page at ffffffff811f145b #4 [ffffc900297bb798] __process_pages_contig at ffffffff814bc502 #5 [ffffc900297bb8c8] lock_delalloc_pages at ffffffff814bc684 #6 [ffffc900297bb900] find_lock_delalloc_range at ffffffff814be9ff #7 [ffffc900297bb9a0] writepage_delalloc at ffffffff814bebd0 #8 [ffffc900297bba18] __extent_writepage at ffffffff814bfbf2 #9 [ffffc900297bba98] extent_write_cache_pages at ffffffff814bffbd PID: 2167901 TASK: ffff889dc6a59c00 CPU: 14 COMMAND: "aio-dio-invalid" #0 [ffffc9003b50bb18] __schedule at ffffffff81a4c33f #1 [ffffc9003b50bba0] schedule at ffffffff81a4c6e3 #2 [ffffc9003b50bbb8] io_schedule at ffffffff81a4ca42 #3 [ffffc9003b50bbc8] wait_on_page_bit at ffffffff811f24d6 #4 [ffffc9003b50bc60] prepare_pages at ffffffff814b05a7 #5 [ffffc9003b50bcd8] btrfs_buffered_write at ffffffff814b1359 #6 [ffffc9003b50bdb0] btrfs_file_write_iter at ffffffff814b5933 #7 [ffffc9003b50be38] new_sync_write at ffffffff8128f6a8 #8 [ffffc9003b50bec8] vfs_write at ffffffff81292b9d #9 [ffffc9003b50bf00] ksys_pwrite64 at ffffffff81293032 I used drgn to find the respective pages we were stuck on page_entry.page 0xffffea00fbfc7500 index 8148 bit 15 pid 2167901 page_entry.page 0xffffea00f9bb7400 index 7680 bit 0 pid 1329874 As you can see the kworker is waiting for bit 0 (PG_locked) on index 7680, and aio-dio-invalid is waiting for bit 15 (PG_writeback) on index 8148. aio-dio-invalid has 7680, and the kworker epd looks like the following crash> struct extent_page_data ffffc900297bbbb0 struct extent_page_data { bio = 0xffff889f747ed830, tree = 0xffff889eed6ba448, extent_locked = 0, sync_io = 0 } Probably worth mentioning as well that it waits for writeback of the page to complete while holding a lock on it (at prepare_pages()). Using drgn I walked the bio pages looking for page 0xffffea00fbfc7500 which is the one we're waiting for writeback on bio = Object(prog, 'struct bio', address=0xffff889f747ed830) for i in range(0, bio.bi_vcnt.value_()): bv = bio.bi_io_vec[i] if bv.bv_page.value_() == 0xffffea00fbfc7500: print("FOUND IT") which validated what I suspected. The fix for this is simple, flush the epd before we loop back around to the beginning of the file during writeout. Fixes: b293f02e1423 ("Btrfs: Add writepages support") CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fa8bccca8f8c..af41f407dee5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4045,7 +4045,16 @@ retry: */ scanned = 1; index = 0; - goto retry; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + ret = flush_write_bio(epd); + if (!ret) + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) From eeda803b77d6cc80c9f9dffcfc86cddc5a52c0fe Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 14:27:42 +0800 Subject: [PATCH 17/72] btrfs: extent_io: Handle errors better in extent_write_full_page() commit 3065976b045f77a910809fa7699f99a1e7c0dbbb upstream. Since now flush_write_bio() could return error, kill the BUG_ON() first. Then don't call flush_write_bio() unconditionally, instead we check the return value from __extent_writepage() first. If __extent_writepage() fails, we do cleanup, and return error without submitting the possible corrupted or half-baked bio. If __extent_writepage() successes, then we call flush_write_bio() and return the result. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index af41f407dee5..0aa7fbafa105 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,6 +160,16 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num, return blk_status_to_errno(ret); } +/* Cleanup unsubmitted bios */ +static void end_write_bio(struct extent_page_data *epd, int ret) +{ + if (epd->bio) { + epd->bio->bi_status = errno_to_blk_status(ret); + bio_endio(epd->bio); + epd->bio = NULL; + } +} + /* * Submit bio from extent page data via submit_one_bio * @@ -3461,6 +3471,9 @@ done: * records are inserted to lock ranges in the tree, and as dirty areas * are found, they are marked writeback. Then the lock bits are removed * and the end_io handler clears the writeback ranges + * + * Return 0 if everything goes well. + * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) @@ -3528,6 +3541,7 @@ done: end_extent_writepage(page, ret, start, page_end); } unlock_page(page); + ASSERT(ret <= 0); return ret; done_unlocked: @@ -4067,7 +4081,6 @@ retry: int extent_write_full_page(struct page *page, struct writeback_control *wbc) { int ret; - int flush_ret; struct extent_page_data epd = { .bio = NULL, .tree = &BTRFS_I(page->mapping->host)->io_tree, @@ -4076,9 +4089,14 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) }; ret = __extent_writepage(page, wbc, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } - flush_ret = flush_write_bio(&epd); - BUG_ON(flush_ret < 0); + ret = flush_write_bio(&epd); + ASSERT(ret <= 0); return ret; } From 22bb77c13fbeeef2e0d0a8506c4efdde41598d32 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 14:27:43 +0800 Subject: [PATCH 18/72] btrfs: extent_io: Handle errors better in btree_write_cache_pages() commit 2b952eea813b1f7e7d4b9782271acd91625b9bb9 upstream. In btree_write_cache_pages(), we can only get @ret <= 0. Add an ASSERT() for it just in case. Then instead of submitting the write bio even we got some error, check the return value first. If we have already hit some error, just clean up the corrupted or half-baked bio, and return error. If there is no error so far, then call flush_write_bio() and return the result. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0aa7fbafa105..822515856ab0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3809,7 +3809,6 @@ int btree_write_cache_pages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; int ret = 0; - int flush_ret; int done = 0; int nr_to_write_done = 0; struct pagevec pvec; @@ -3909,8 +3908,12 @@ retry: index = 0; goto retry; } - flush_ret = flush_write_bio(&epd); - BUG_ON(flush_ret < 0); + ASSERT(ret <= 0); + if (ret < 0) { + end_write_bio(&epd, ret); + return ret; + } + ret = flush_write_bio(&epd); return ret; } From 2311ea7ea04fbc15f51760d3b91f6a8381b06c9a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 14:27:46 +0800 Subject: [PATCH 19/72] btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io() commit 2e3c25136adfb293d517e17f761d3b8a43a8fc22 upstream. This function needs some extra checks on locked pages and eb. For error handling we need to unlock locked pages and the eb. There is a rare >0 return value branch, where all pages get locked while write bio is not flushed. Thankfully it's handled by the only caller, btree_write_cache_pages(), as later write_one_eb() call will trigger submit_one_bio(). So there shouldn't be any problem. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 822515856ab0..944ac072081d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3554,19 +3554,27 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } +/* + * Lock eb pages and flush the bio if we can't the locks + * + * Return 0 if nothing went wrong + * Return >0 is same as 0, except bio is not submitted + * Return <0 if something went wrong, no page is locked + */ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, struct btrfs_fs_info *fs_info, struct extent_page_data *epd) { - int i, num_pages; + int i, num_pages, failed_page_nr; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - flush = 1; ret = flush_write_bio(epd); - BUG_ON(ret < 0); + if (ret < 0) + return ret; + flush = 1; btrfs_tree_lock(eb); } @@ -3576,7 +3584,8 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, return 0; if (!flush) { ret = flush_write_bio(epd); - BUG_ON(ret < 0); + if (ret < 0) + return ret; flush = 1; } while (1) { @@ -3618,7 +3627,10 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!trylock_page(p)) { if (!flush) { ret = flush_write_bio(epd); - BUG_ON(ret < 0); + if (ret < 0) { + failed_page_nr = i; + goto err_unlock; + } flush = 1; } lock_page(p); @@ -3626,6 +3638,11 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, } return ret; +err_unlock: + /* Unlock already locked pages */ + for (i = 0; i < failed_page_nr; i++) + unlock_page(eb->pages[i]); + return ret; } static void end_extent_buffer_writeback(struct extent_buffer *eb) From 432bdb0d931b3a6e8c8d34ffa29a218479b5f9be Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 11 Sep 2019 17:42:00 +0100 Subject: [PATCH 20/72] Btrfs: fix unwritten extent buffers and hangs on future writeback attempts commit 18dfa7117a3f379862dcd3f67cadd678013bb9dd upstream. The lock_extent_buffer_io() returns 1 to the caller to tell it everything went fine and the callers needs to start writeback for the extent buffer (submit a bio, etc), 0 to tell the caller everything went fine but it does not need to start writeback for the extent buffer, and a negative value if some error happened. When it's about to return 1 it tries to lock all pages, and if a try lock on a page fails, and we didn't flush any existing bio in our "epd", it calls flush_write_bio(epd) and overwrites the return value of 1 to 0 or an error. The page might have been locked elsewhere, not with the goal of starting writeback of the extent buffer, and even by some code other than btrfs, like page migration for example, so it does not mean the writeback of the extent buffer was already started by some other task, so returning a 0 tells the caller (btree_write_cache_pages()) to not start writeback for the extent buffer. Note that epd might currently have either no bio, so flush_write_bio() returns 0 (success) or it might have a bio for another extent buffer with a lower index (logical address). Since we return 0 with the EXTENT_BUFFER_WRITEBACK bit set on the extent buffer and writeback is never started for the extent buffer, future attempts to writeback the extent buffer will hang forever waiting on that bit to be cleared, since it can only be cleared after writeback completes. Such hang is reported with a trace like the following: [49887.347053] INFO: task btrfs-transacti:1752 blocked for more than 122 seconds. [49887.347059] Not tainted 5.2.13-gentoo #2 [49887.347060] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [49887.347062] btrfs-transacti D 0 1752 2 0x80004000 [49887.347064] Call Trace: [49887.347069] ? __schedule+0x265/0x830 [49887.347071] ? bit_wait+0x50/0x50 [49887.347072] ? bit_wait+0x50/0x50 [49887.347074] schedule+0x24/0x90 [49887.347075] io_schedule+0x3c/0x60 [49887.347077] bit_wait_io+0x8/0x50 [49887.347079] __wait_on_bit+0x6c/0x80 [49887.347081] ? __lock_release.isra.29+0x155/0x2d0 [49887.347083] out_of_line_wait_on_bit+0x7b/0x80 [49887.347084] ? var_wake_function+0x20/0x20 [49887.347087] lock_extent_buffer_for_io+0x28c/0x390 [49887.347089] btree_write_cache_pages+0x18e/0x340 [49887.347091] do_writepages+0x29/0xb0 [49887.347093] ? kmem_cache_free+0x132/0x160 [49887.347095] ? convert_extent_bit+0x544/0x680 [49887.347097] filemap_fdatawrite_range+0x70/0x90 [49887.347099] btrfs_write_marked_extents+0x53/0x120 [49887.347100] btrfs_write_and_wait_transaction.isra.4+0x38/0xa0 [49887.347102] btrfs_commit_transaction+0x6bb/0x990 [49887.347103] ? start_transaction+0x33e/0x500 [49887.347105] transaction_kthread+0x139/0x15c So fix this by not overwriting the return value (ret) with the result from flush_write_bio(). We also need to clear the EXTENT_BUFFER_WRITEBACK bit in case flush_write_bio() returns an error, otherwise it will hang any future attempts to writeback the extent buffer, and undo all work done before (set back EXTENT_BUFFER_DIRTY, etc). This is a regression introduced in the 5.2 kernel. Fixes: 2e3c25136adfb ("btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io()") Fixes: f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up") Reported-by: Zdenek Sojka Link: https://lore.kernel.org/linux-btrfs/GpO.2yos.3WGDOLpx6t%7D.1TUDYM@seznam.cz/T/#u Reported-by: Stefan Priebe - Profihost AG Link: https://lore.kernel.org/linux-btrfs/5c4688ac-10a7-fb07-70e8-c5d31a3fbb38@profihost.ag/T/#t Reported-by: Drazen Kacar Link: https://lore.kernel.org/linux-btrfs/DB8PR03MB562876ECE2319B3E579590F799C80@DB8PR03MB5628.eurprd03.prod.outlook.com/ Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204377 Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 944ac072081d..88ada4a19213 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3554,6 +3554,13 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) TASK_UNINTERRUPTIBLE); } +static void end_extent_buffer_writeback(struct extent_buffer *eb) +{ + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); + smp_mb__after_atomic(); + wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); +} + /* * Lock eb pages and flush the bio if we can't the locks * @@ -3626,8 +3633,11 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, if (!trylock_page(p)) { if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) { + int err; + + err = flush_write_bio(epd); + if (err < 0) { + ret = err; failed_page_nr = i; goto err_unlock; } @@ -3642,16 +3652,23 @@ err_unlock: /* Unlock already locked pages */ for (i = 0; i < failed_page_nr; i++) unlock_page(eb->pages[i]); + /* + * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. + * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can + * be made and undo everything done before. + */ + btrfs_tree_lock(eb); + spin_lock(&eb->refs_lock); + set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); + end_extent_buffer_writeback(eb); + spin_unlock(&eb->refs_lock); + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, + fs_info->dirty_metadata_batch); + btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_tree_unlock(eb); return ret; } -static void end_extent_buffer_writeback(struct extent_buffer *eb) -{ - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); - smp_mb__after_atomic(); - wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); -} - static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; From 1527c0e0229d2dd1c8ae1e73b1579bd8d5866b5b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 12 Feb 2020 14:12:44 +0800 Subject: [PATCH 21/72] btrfs: Don't submit any btree write bio if the fs has errors commit b3ff8f1d380e65dddd772542aa9bff6c86bf715a upstream. [BUG] There is a fuzzed image which could cause KASAN report at unmount time. BUG: KASAN: use-after-free in btrfs_queue_work+0x2c1/0x390 Read of size 8 at addr ffff888067cf6848 by task umount/1922 CPU: 0 PID: 1922 Comm: umount Tainted: G W 5.0.21 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x5b/0x8b print_address_description+0x70/0x280 kasan_report+0x13a/0x19b btrfs_queue_work+0x2c1/0x390 btrfs_wq_submit_bio+0x1cd/0x240 btree_submit_bio_hook+0x18c/0x2a0 submit_one_bio+0x1be/0x320 flush_write_bio.isra.41+0x2c/0x70 btree_write_cache_pages+0x3bb/0x7f0 do_writepages+0x5c/0x130 __writeback_single_inode+0xa3/0x9a0 writeback_single_inode+0x23d/0x390 write_inode_now+0x1b5/0x280 iput+0x2ef/0x600 close_ctree+0x341/0x750 generic_shutdown_super+0x126/0x370 kill_anon_super+0x31/0x50 btrfs_kill_super+0x36/0x2b0 deactivate_locked_super+0x80/0xc0 deactivate_super+0x13c/0x150 cleanup_mnt+0x9a/0x130 task_work_run+0x11a/0x1b0 exit_to_usermode_loop+0x107/0x130 do_syscall_64+0x1e5/0x280 entry_SYSCALL_64_after_hwframe+0x44/0xa9 [CAUSE] The fuzzed image has a completely screwd up extent tree: leaf 29421568 gen 8 total ptrs 6 free space 3587 owner EXTENT_TREE refs 2 lock (w:0 r:0 bw:0 br:0 sw:0 sr:0) lock_owner 0 current 5938 item 0 key (12587008 168 4096) itemoff 3942 itemsize 53 extent refs 1 gen 9 flags 1 ref#0: extent data backref root 5 objectid 259 offset 0 count 1 item 1 key (12591104 168 8192) itemoff 3889 itemsize 53 extent refs 1 gen 9 flags 1 ref#0: extent data backref root 5 objectid 271 offset 0 count 1 item 2 key (12599296 168 4096) itemoff 3836 itemsize 53 extent refs 1 gen 9 flags 1 ref#0: extent data backref root 5 objectid 259 offset 4096 count 1 item 3 key (29360128 169 0) itemoff 3803 itemsize 33 extent refs 1 gen 9 flags 2 ref#0: tree block backref root 5 item 4 key (29368320 169 1) itemoff 3770 itemsize 33 extent refs 1 gen 9 flags 2 ref#0: tree block backref root 5 item 5 key (29372416 169 0) itemoff 3737 itemsize 33 extent refs 1 gen 9 flags 2 ref#0: tree block backref root 5 Note that leaf 29421568 doesn't have its backref in the extent tree. Thus extent allocator can re-allocate leaf 29421568 for other trees. In short, the bug is caused by: - Existing tree block gets allocated to log tree This got its generation bumped. - Log tree balance cleaned dirty bit of offending tree block It will not be written back to disk, thus no WRITTEN flag. - Original owner of the tree block gets COWed Since the tree block has higher transid, no WRITTEN flag, it's reused, and not traced by transaction::dirty_pages. - Transaction aborted Tree blocks get cleaned according to transaction::dirty_pages. But the offending tree block is not recorded at all. - Filesystem unmount All pages are assumed to be are clean, destroying all workqueue, then call iput(btree_inode). But offending tree block is still dirty, which triggers writeback, and causes use-after-free bug. The detailed sequence looks like this: - Initial status eb: 29421568, header=WRITTEN bflags_dirty=0, page_dirty=0, gen=8, not traced by any dirty extent_iot_tree. - New tree block is allocated Since there is no backref for 29421568, it's re-allocated as new tree block. Keep in mind that tree block 29421568 is still referred by extent tree. - Tree block 29421568 is filled for log tree eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 << (gen bumped) traced by btrfs_root::dirty_log_pages - Some log tree operations Since the fs is using node size 4096, the log tree can easily go a level higher. - Log tree needs balance Tree block 29421568 gets all its content pushed to right, thus now it is empty, and we don't need it. btrfs_clean_tree_block() from __push_leaf_right() get called. eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9 traced by btrfs_root::dirty_log_pages - Log tree write back btree_write_cache_pages() goes through dirty pages ranges, but since page of tree block 29421568 gets cleaned already, it's not written back to disk. Thus it doesn't have WRITTEN bit set. But ranges in dirty_log_pages are cleared. eb: 29421568, header=0 bflags_dirty=0, page_dirty=0, gen=9 not traced by any dirty extent_iot_tree. - Extent tree update when committing transaction Since tree block 29421568 has transid equal to running trans, and has no WRITTEN bit, should_cow_block() will use it directly without adding it to btrfs_transaction::dirty_pages. eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 not traced by any dirty extent_iot_tree. At this stage, we're doomed. We have a dirty eb not tracked by any extent io tree. - Transaction gets aborted due to corrupted extent tree Btrfs cleans up dirty pages according to transaction::dirty_pages and btrfs_root::dirty_log_pages. But since tree block 29421568 is not tracked by neither of them, it's still dirty. eb: 29421568, header=0 bflags_dirty=1, page_dirty=1, gen=9 not traced by any dirty extent_iot_tree. - Filesystem unmount Since all cleanup is assumed to be done, all workqueus are destroyed. Then iput(btree_inode) is called, expecting no dirty pages. But tree 29421568 is still dirty, thus triggering writeback. Since all workqueues are already freed, we cause use-after-free. This shows us that, log tree blocks + bad extent tree can cause wild dirty pages. [FIX] To fix the problem, don't submit any btree write bio if the filesytem has any error. This is the last safe net, just in case other cleanup haven't caught catch it. Link: https://github.com/bobfuzzer/CVE/tree/master/CVE-2019-19377 CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba [bwh: Backported to 4.19: fs_info variable already exists in btree_write_cache_pages()] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 88ada4a19213..301111922a1a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3947,7 +3947,39 @@ retry: end_write_bio(&epd, ret); return ret; } - ret = flush_write_bio(&epd); + /* + * If something went wrong, don't allow any metadata write bio to be + * submitted. + * + * This would prevent use-after-free if we had dirty pages not + * cleaned up, which can still happen by fuzzed images. + * + * - Bad extent tree + * Allowing existing tree block to be allocated for other trees. + * + * - Log tree operations + * Exiting tree blocks get allocated to log tree, bumps its + * generation, then get cleaned in tree re-balance. + * Such tree block will not be written back, since it's clean, + * thus no WRITTEN flag set. + * And after log writes back, this tree block is not traced by + * any dirty extent_io_tree. + * + * - Offending tree block gets re-dirtied from its original owner + * Since it has bumped generation, no WRITTEN flag, it can be + * reused without COWing. This tree block will not be traced + * by btrfs_transaction::dirty_pages. + * + * Now such dirty tree block will not be cleaned by any dirty + * extent io tree. Thus we don't want to submit such wild eb + * if the fs already has error. + */ + if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = flush_write_bio(&epd); + } else { + ret = -EUCLEAN; + end_write_bio(&epd, ret); + } return ret; } From 5d123c6335e82db8de1e2fe1c15563e003245438 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 13:16:42 +0800 Subject: [PATCH 22/72] btrfs: Move btrfs_check_chunk_valid() to tree-check.[ch] and export it commit 82fc28fbedbb59642f05215db3b0ef4eb91aa31d upstream. By function, chunk item verification is more suitable to be done inside tree-checker. So move btrfs_check_chunk_valid() to tree-checker.c and export it. And since it's now moved to tree-checker, also add a better comment for what this function is doing. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba [bwh: Cherry-picked for 4.19 to ease backporting later fixes] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 97 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.h | 4 ++ fs/btrfs/volumes.c | 94 +-------------------------------------- 3 files changed, 102 insertions(+), 93 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index d98ec885b72a..9f81f4e3f263 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -448,6 +448,103 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, return 0; } +/* + * The common chunk check which could also work on super block sys chunk array. + * + * Return -EIO if anything is corrupted. + * Return 0 if everything is OK. + */ +int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) +{ + u64 length; + u64 stripe_len; + u16 num_stripes; + u16 sub_stripes; + u64 type; + u64 features; + bool mixed = false; + + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + + if (!num_stripes) { + btrfs_err(fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + btrfs_err(fs_info, "invalid chunk logical %llu", logical); + return -EIO; + } + if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { + btrfs_err(fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } + if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { + btrfs_err(fs_info, "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { + btrfs_err(fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + type) { + btrfs_err(fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } + + if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { + btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); + return -EIO; + } + + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && + (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { + btrfs_err(fs_info, + "system chunk with data or metadata type: 0x%llx", type); + return -EIO; + } + + features = btrfs_super_incompat_flags(fs_info->super_copy); + if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) + mixed = true; + + if (!mixed) { + if ((type & BTRFS_BLOCK_GROUP_METADATA) && + (type & BTRFS_BLOCK_GROUP_DATA)) { + btrfs_err(fs_info, + "mixed chunk type in non-mixed mode: 0x%llx", type); + return -EIO; + } + } + + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { + btrfs_err(fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + /* * Common point to switch the item-specific validation. */ diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index ff043275b784..4df45e8a6659 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,4 +25,8 @@ int btrfs_check_leaf_relaxed(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf); int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node); +int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical); + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0879e3dc39c8..3050a93ae6f5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -28,6 +28,7 @@ #include "math.h" #include "dev-replace.h" #include "sysfs.h" +#include "tree-checker.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -6370,99 +6371,6 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -/* Return -EIO if any error, otherwise return 0. */ -static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 logical) -{ - u64 length; - u64 stripe_len; - u16 num_stripes; - u16 sub_stripes; - u64 type; - u64 features; - bool mixed = false; - - length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); - type = btrfs_chunk_type(leaf, chunk); - - if (!num_stripes) { - btrfs_err(fs_info, "invalid chunk num_stripes: %u", - num_stripes); - return -EIO; - } - if (!IS_ALIGNED(logical, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk logical %llu", logical); - return -EIO; - } - if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { - btrfs_err(fs_info, "invalid chunk sectorsize %u", - btrfs_chunk_sector_size(leaf, chunk)); - return -EIO; - } - if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk length %llu", length); - return -EIO; - } - if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { - btrfs_err(fs_info, "invalid chunk stripe length: %llu", - stripe_len); - return -EIO; - } - if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - type) { - btrfs_err(fs_info, "unrecognized chunk type: %llu", - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | - BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)); - return -EIO; - } - - if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { - btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); - return -EIO; - } - - if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && - (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(fs_info, - "system chunk with data or metadata type: 0x%llx", type); - return -EIO; - } - - features = btrfs_super_incompat_flags(fs_info->super_copy); - if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) - mixed = true; - - if (!mixed) { - if ((type & BTRFS_BLOCK_GROUP_METADATA) && - (type & BTRFS_BLOCK_GROUP_DATA)) { - btrfs_err(fs_info, - "mixed chunk type in non-mixed mode: 0x%llx", type); - return -EIO; - } - } - - if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != 1)) { - btrfs_err(fs_info, - "invalid num_stripes:sub_stripes %u:%u for profile %llu", - num_stripes, sub_stripes, - type & BTRFS_BLOCK_GROUP_PROFILE_MASK); - return -EIO; - } - - return 0; -} - static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, bool error) { From c24ed5ea52165bf44f405aca10d9af0ad593ae86 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 13:36:06 +0800 Subject: [PATCH 23/72] btrfs: tree-checker: Make chunk item checker messages more readable commit f114024376bceb1c0f61a7bad4a72a0f978767af upstream. Old error message would be something like: BTRFS error (device dm-3): invalid chunk num_stipres: 0 New error message would be: Btrfs critical (device dm-3): corrupt superblock syschunk array: chunk_start=2097152, invalid chunk num_stripes: 0 Or Btrfs critical (device dm-3): corrupt leaf: root=3 block=8388608 slot=3 chunk_start=2097152, invalid chunk num_stripes: 0 And for certain error message, also output expected value. The error message levels are changed from error to critical. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba [bwh: Cherry-picked for 4.19 to ease backporting later fixes] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 81 ++++++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 9f81f4e3f263..3f7fb0ddce8a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -448,6 +448,51 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info, return 0; } +__printf(5, 6) +__cold +static void chunk_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf, + const struct btrfs_chunk *chunk, u64 logical, + const char *fmt, ...) +{ + bool is_sb; + struct va_format vaf; + va_list args; + int i; + int slot = -1; + + /* Only superblock eb is able to have such small offset */ + is_sb = (leaf->start == BTRFS_SUPER_INFO_OFFSET); + + if (!is_sb) { + /* + * Get the slot number by iterating through all slots, this + * would provide better readability. + */ + for (i = 0; i < btrfs_header_nritems(leaf); i++) { + if (btrfs_item_ptr_offset(leaf, i) == + (unsigned long)chunk) { + slot = i; + break; + } + } + } + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (is_sb) + btrfs_crit(fs_info, + "corrupt superblock syschunk array: chunk_start=%llu, %pV", + logical, &vaf); + else + btrfs_crit(fs_info, + "corrupt leaf: root=%llu block=%llu slot=%d chunk_start=%llu, %pV", + BTRFS_CHUNK_TREE_OBJECTID, leaf->start, slot, + logical, &vaf); + va_end(args); +} + /* * The common chunk check which could also work on super block sys chunk array. * @@ -473,31 +518,38 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, type = btrfs_chunk_type(leaf, chunk); if (!num_stripes) { - btrfs_err(fs_info, "invalid chunk num_stripes: %u", - num_stripes); + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk num_stripes, have %u", num_stripes); return -EIO; } if (!IS_ALIGNED(logical, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk logical %llu", logical); + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk logical, have %llu should aligned to %u", + logical, fs_info->sectorsize); return -EIO; } if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { - btrfs_err(fs_info, "invalid chunk sectorsize %u", - btrfs_chunk_sector_size(leaf, chunk)); + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk sectorsize, have %u expect %u", + btrfs_chunk_sector_size(leaf, chunk), + fs_info->sectorsize); return -EIO; } if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { - btrfs_err(fs_info, "invalid chunk length %llu", length); + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk length, have %llu", length); return -EIO; } if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { - btrfs_err(fs_info, "invalid chunk stripe length: %llu", + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk stripe length: %llu", stripe_len); return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & type) { - btrfs_err(fs_info, "unrecognized chunk type: %llu", + chunk_err(fs_info, leaf, chunk, logical, + "unrecognized chunk type: 0x%llx", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); @@ -505,14 +557,17 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, } if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { - btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type); + chunk_err(fs_info, leaf, chunk, logical, + "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", + type, BTRFS_BLOCK_GROUP_TYPE_MASK); return -EIO; } if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && (type & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA))) { - btrfs_err(fs_info, - "system chunk with data or metadata type: 0x%llx", type); + chunk_err(fs_info, leaf, chunk, logical, + "system chunk with data or metadata type: 0x%llx", + type); return -EIO; } @@ -523,7 +578,7 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, if (!mixed) { if ((type & BTRFS_BLOCK_GROUP_METADATA) && (type & BTRFS_BLOCK_GROUP_DATA)) { - btrfs_err(fs_info, + chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); return -EIO; } @@ -535,7 +590,7 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && num_stripes != 1)) { - btrfs_err(fs_info, + chunk_err(fs_info, leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); From 782aa87c8f1d4472fc45697fdddbe8d47b56ebff Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 13:39:14 +0800 Subject: [PATCH 24/72] btrfs: tree-checker: Make btrfs_check_chunk_valid() return EUCLEAN instead of EIO commit bf871c3b43b1dcc3f2a076ff39a8f1ce7959d958 upstream. To follow the standard behavior of tree-checker. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba [bwh: Cherry-picked for 4.19 to ease backporting later fixes] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 3f7fb0ddce8a..7b59a3dfb5a8 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -496,7 +496,7 @@ static void chunk_err(const struct btrfs_fs_info *fs_info, /* * The common chunk check which could also work on super block sys chunk array. * - * Return -EIO if anything is corrupted. + * Return -EUCLEAN if anything is corrupted. * Return 0 if everything is OK. */ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, @@ -520,31 +520,31 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, if (!num_stripes) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); - return -EIO; + return -EUCLEAN; } if (!IS_ALIGNED(logical, fs_info->sectorsize)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", logical, fs_info->sectorsize); - return -EIO; + return -EUCLEAN; } if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk sectorsize, have %u expect %u", btrfs_chunk_sector_size(leaf, chunk), fs_info->sectorsize); - return -EIO; + return -EUCLEAN; } if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk length, have %llu", length); - return -EIO; + return -EUCLEAN; } if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { chunk_err(fs_info, leaf, chunk, logical, "invalid chunk stripe length: %llu", stripe_len); - return -EIO; + return -EUCLEAN; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & type) { @@ -553,14 +553,14 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); - return -EIO; + return -EUCLEAN; } if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", type, BTRFS_BLOCK_GROUP_TYPE_MASK); - return -EIO; + return -EUCLEAN; } if ((type & BTRFS_BLOCK_GROUP_SYSTEM) && @@ -568,7 +568,7 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, chunk_err(fs_info, leaf, chunk, logical, "system chunk with data or metadata type: 0x%llx", type); - return -EIO; + return -EUCLEAN; } features = btrfs_super_incompat_flags(fs_info->super_copy); @@ -580,7 +580,7 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, (type & BTRFS_BLOCK_GROUP_DATA)) { chunk_err(fs_info, leaf, chunk, logical, "mixed chunk type in non-mixed mode: 0x%llx", type); - return -EIO; + return -EUCLEAN; } } @@ -594,7 +594,7 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, type & BTRFS_BLOCK_GROUP_PROFILE_MASK); - return -EIO; + return -EUCLEAN; } return 0; From 3c06c86cd08fdabe2ed20cbc3ea5010dcd7d4b29 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 20 Mar 2019 13:42:33 +0800 Subject: [PATCH 25/72] btrfs: tree-checker: Check chunk item at tree block read time commit 075cb3c78fe7976c9f29ca1fa23f9728634ecefc upstream. Since we have btrfs_check_chunk_valid() in tree-checker, let's do chunk item verification in tree-checker too. Since the tree-checker is run at endio time, if one chunk leaf fails chunk verification, we can still retry the other copy, making btrfs more robust to fuzzed image as we may still get a good chunk item. Also since we have done chunk verification in tree block read time, skip the btrfs_check_chunk_valid() call in read_one_chunk() if we're reading chunk items from leaf. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 6 ++++++ fs/btrfs/volumes.c | 12 +++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b59a3dfb5a8..e7099300d3e0 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -608,6 +608,7 @@ static int check_leaf_item(struct btrfs_fs_info *fs_info, struct btrfs_key *key, int slot) { int ret = 0; + struct btrfs_chunk *chunk; switch (key->type) { case BTRFS_EXTENT_DATA_KEY: @@ -624,6 +625,11 @@ static int check_leaf_item(struct btrfs_fs_info *fs_info, case BTRFS_BLOCK_GROUP_ITEM_KEY: ret = check_block_group_item(fs_info, leaf, key, slot); break; + case BTRFS_CHUNK_ITEM_KEY: + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, + key->offset); + break; } return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3050a93ae6f5..8e9d855dceba 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6401,9 +6401,15 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, length = btrfs_chunk_length(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); - if (ret) - return ret; + /* + * Only need to verify chunk item if we're reading from sys chunk array, + * as chunk item in tree block is already verified by tree-checker. + */ + if (leaf->start == BTRFS_SUPER_INFO_OFFSET) { + ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); + if (ret) + return ret; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); From e23e5d2594307d1088e1e1b3621d785add2a57b8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Mar 2019 14:20:03 +0800 Subject: [PATCH 26/72] btrfs: tree-checker: Verify dev item commit ab4ba2e133463c702b37242560d7fabedd2dc750 upstream. [BUG] For fuzzed image whose DEV_ITEM has invalid total_bytes as 0, then kernel will just panic: BUG: unable to handle kernel NULL pointer dereference at 0000000000000098 #PF error: [normal kernel read fault] PGD 800000022b2bd067 P4D 800000022b2bd067 PUD 22b2bc067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 0 PID: 1106 Comm: mount Not tainted 5.0.0-rc8+ #9 RIP: 0010:btrfs_verify_dev_extents+0x2a5/0x5a0 Call Trace: open_ctree+0x160d/0x2149 btrfs_mount_root+0x5b2/0x680 [CAUSE] If device extent verification finds a deivce with 0 total_bytes, then it assumes it's a seed dummy, then search for seed devices. But in this case, there is no seed device at all, causing NULL pointer. [FIX] Since this is caused by fuzzed image, let's go the tree-check way, just add a new verification for device item. Reported-by: Yoon Jungyeon Link: https://bugzilla.kernel.org/show_bug.cgi?id=202691 Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 74 +++++++++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.c | 9 ----- fs/btrfs/volumes.h | 9 +++++ 3 files changed, 83 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e7099300d3e0..b5e7caeb28a2 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -600,6 +600,77 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return 0; } +__printf(4, 5) +__cold +static void dev_item_err(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(fs_info, + "corrupt %s: root=%llu block=%llu slot=%d devid=%llu %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, + key.objectid, &vaf); + va_end(args); +} + +static int check_dev_item(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_dev_item *ditem; + u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); + + if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { + dev_item_err(fs_info, leaf, slot, + "invalid objectid: has=%llu expect=%llu", + key->objectid, BTRFS_DEV_ITEMS_OBJECTID); + return -EUCLEAN; + } + if (key->offset > max_devid) { + dev_item_err(fs_info, leaf, slot, + "invalid devid: has=%llu expect=[0, %llu]", + key->offset, max_devid); + return -EUCLEAN; + } + ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); + if (btrfs_device_id(leaf, ditem) != key->offset) { + dev_item_err(fs_info, leaf, slot, + "devid mismatch: key has=%llu item has=%llu", + key->offset, btrfs_device_id(leaf, ditem)); + return -EUCLEAN; + } + + /* + * For device total_bytes, we don't have reliable way to check it, as + * it can be 0 for device removal. Device size check can only be done + * by dev extents check. + */ + if (btrfs_device_bytes_used(leaf, ditem) > + btrfs_device_total_bytes(leaf, ditem)) { + dev_item_err(fs_info, leaf, slot, + "invalid bytes used: have %llu expect [0, %llu]", + btrfs_device_bytes_used(leaf, ditem), + btrfs_device_total_bytes(leaf, ditem)); + return -EUCLEAN; + } + /* + * Remaining members like io_align/type/gen/dev_group aren't really + * utilized. Skip them to make later usage of them easier. + */ + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -630,6 +701,9 @@ static int check_leaf_item(struct btrfs_fs_info *fs_info, ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, key->offset); break; + case BTRFS_DEV_ITEM_KEY: + ret = check_dev_item(fs_info, leaf, key, slot); + break; } return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8e9d855dceba..05daa2b816c3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4606,15 +4606,6 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - - sizeof(struct btrfs_chunk)) \ - / sizeof(struct btrfs_stripe) + 1) - -#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ - - 2 * sizeof(struct btrfs_disk_key) \ - - 2 * sizeof(struct btrfs_chunk)) \ - / sizeof(struct btrfs_stripe) + 1) - static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 start, u64 type) { diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8e8bf3246de1..65cd023b097c 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -257,6 +257,15 @@ struct btrfs_fs_devices { #define BTRFS_BIO_INLINE_CSUM_SIZE 64 +#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ + - sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + +#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \ + - 2 * sizeof(struct btrfs_disk_key) \ + - 2 * sizeof(struct btrfs_chunk)) \ + / sizeof(struct btrfs_stripe) + 1) + /* * we need the mirror number and stripe index to be passed around * the call chain while we are processing end_io (especially errors). From b5b2a94d846a599a0b843895d3dbd90246d8d1fc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 28 Aug 2019 10:33:13 +0800 Subject: [PATCH 27/72] btrfs: tree-checker: Fix wrong check on max devid commit 8bb177d18f114358a57d8ae7e206861b48b8b4de upstream. [BUG] The following script will cause false alert on devid check. #!/bin/bash dev1=/dev/test/test dev2=/dev/test/scratch1 mnt=/mnt/btrfs umount $dev1 &> /dev/null umount $dev2 &> /dev/null umount $mnt &> /dev/null mkfs.btrfs -f $dev1 mount $dev1 $mnt _fail() { echo "!!! FAILED !!!" exit 1 } for ((i = 0; i < 4096; i++)); do btrfs dev add -f $dev2 $mnt || _fail btrfs dev del $dev1 $mnt || _fail dev_tmp=$dev1 dev1=$dev2 dev2=$dev_tmp done [CAUSE] Tree-checker uses BTRFS_MAX_DEVS() and BTRFS_MAX_DEVS_SYS_CHUNK() as upper limit for devid. But we can have devid holes just like above script. So the check for devid is incorrect and could cause false alert. [FIX] Just remove the whole devid check. We don't have any hard requirement for devid assignment. Furthermore, even devid could get corrupted by a bitflip, we still have dev extents verification at mount time, so corrupted data won't sneak in. This fixes fstests btrfs/194. Reported-by: Anand Jain Fixes: ab4ba2e13346 ("btrfs: tree-checker: Verify dev item") CC: stable@vger.kernel.org # 5.2+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba [bwh: Backported to 4.19: adjust context] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index b5e7caeb28a2..a0222e6a11c5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -629,7 +629,6 @@ static int check_dev_item(struct btrfs_fs_info *fs_info, struct btrfs_key *key, int slot) { struct btrfs_dev_item *ditem; - u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK); if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) { dev_item_err(fs_info, leaf, slot, @@ -637,12 +636,6 @@ static int check_dev_item(struct btrfs_fs_info *fs_info, key->objectid, BTRFS_DEV_ITEMS_OBJECTID); return -EUCLEAN; } - if (key->offset > max_devid) { - dev_item_err(fs_info, leaf, slot, - "invalid devid: has=%llu expect=[0, %llu]", - key->offset, max_devid); - return -EUCLEAN; - } ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item); if (btrfs_device_id(leaf, ditem) != key->offset) { dev_item_err(fs_info, leaf, slot, From bedd9974c106fe96b518da667d49b0bfe6353590 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 13 Mar 2019 12:17:50 +0800 Subject: [PATCH 28/72] btrfs: tree-checker: Enhance chunk checker to validate chunk profile commit 80e46cf22ba0bcb57b39c7c3b52961ab3a0fd5f2 upstream. Btrfs-progs already have a comprehensive type checker, to ensure there is only 0 (SINGLE profile) or 1 (DUP/RAID0/1/5/6/10) bit set for chunk profile bits. Do the same work for kernel. Reported-by: Yoon Jungyeon Link: https://bugzilla.kernel.org/show_bug.cgi?id=202765 Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a0222e6a11c5..ab1553781e9c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -556,6 +556,13 @@ int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return -EUCLEAN; } + if (!is_power_of_2(type & BTRFS_BLOCK_GROUP_PROFILE_MASK) && + (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) { + chunk_err(fs_info, leaf, chunk, logical, + "invalid chunk profile flag: 0x%llx, expect 0 or 1 bit set", + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EUCLEAN; + } if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) { chunk_err(fs_info, leaf, chunk, logical, "missing chunk type flag, have 0x%llx one bit must be set in 0x%llx", From 3384e8d72567dc6827d53c66071da61f57c517f7 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 13 Mar 2019 14:31:35 +0800 Subject: [PATCH 29/72] btrfs: tree-checker: Verify inode item commit 496245cac57e26d8b738d85c7a29cf9a47610f3f upstream. There is a report in kernel bugzilla about mismatch file type in dir item and inode item. This inspires us to check inode mode in inode item. This patch will check the following members: - inode key objectid Should be ROOT_DIR_DIR or [256, (u64)-256] or FREE_INO. - inode key offset Should be 0 - inode item generation - inode item transid No newer than sb generation + 1. The +1 is for log tree. - inode item mode No unknown bits. No invalid S_IF* bit. NOTE: S_IFMT check is not enough, need to check every know type. - inode item nlink Dir should have no more link than 1. - inode item flags Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ctree.h | 15 +++++++ fs/btrfs/tree-checker.c | 94 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 554727d82d43..4d1c12faada8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1459,6 +1459,21 @@ do { \ #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +#define BTRFS_INODE_FLAG_MASK \ + (BTRFS_INODE_NODATASUM | \ + BTRFS_INODE_NODATACOW | \ + BTRFS_INODE_READONLY | \ + BTRFS_INODE_NOCOMPRESS | \ + BTRFS_INODE_PREALLOC | \ + BTRFS_INODE_SYNC | \ + BTRFS_INODE_IMMUTABLE | \ + BTRFS_INODE_APPEND | \ + BTRFS_INODE_NODUMP | \ + BTRFS_INODE_NOATIME | \ + BTRFS_INODE_DIRSYNC | \ + BTRFS_INODE_COMPRESS | \ + BTRFS_INODE_ROOT_ITEM_INIT) + struct btrfs_map_token { const struct extent_buffer *eb; char *kaddr; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ab1553781e9c..e77c25bc6860 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -671,6 +671,97 @@ static int check_dev_item(struct btrfs_fs_info *fs_info, return 0; } +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(fs_info, eb, slot, fmt, ...) \ + dir_item_err(fs_info, eb, slot, fmt, __VA_ARGS__) + +static int check_inode_item(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_inode_item *iitem; + u64 super_gen = btrfs_super_generation(fs_info->super_copy); + u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); + u32 mode; + + if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID) { + generic_err(fs_info, leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + return -EUCLEAN; + } + if (key->offset != 0) { + inode_item_err(fs_info, leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + return -EUCLEAN; + } + iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); + + /* Here we use super block generation + 1 to handle log tree */ + if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, + "invalid inode generation: has %llu expect (0, %llu]", + btrfs_inode_generation(leaf, iitem), + super_gen + 1); + return -EUCLEAN; + } + /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ + if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { + inode_item_err(fs_info, leaf, slot, + "invalid inode generation: has %llu expect [0, %llu]", + btrfs_inode_transid(leaf, iitem), super_gen + 1); + return -EUCLEAN; + } + + /* + * For size and nbytes it's better not to be too strict, as for dir + * item its size/nbytes can easily get wrong, but doesn't affect + * anything in the fs. So here we skip the check. + */ + mode = btrfs_inode_mode(leaf, iitem); + if (mode & ~valid_mask) { + inode_item_err(fs_info, leaf, slot, + "unknown mode bit detected: 0x%x", + mode & ~valid_mask); + return -EUCLEAN; + } + + /* + * S_IFMT is not bit mapped so we can't completely rely on is_power_of_2, + * but is_power_of_2() can save us from checking FIFO/CHR/DIR/REG. + * Only needs to check BLK, LNK and SOCKS + */ + if (!is_power_of_2(mode & S_IFMT)) { + if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { + inode_item_err(fs_info, leaf, slot, + "invalid mode: has 0%o expect valid S_IF* bit(s)", + mode & S_IFMT); + return -EUCLEAN; + } + } + if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { + inode_item_err(fs_info, leaf, slot, + "invalid nlink: has %u expect no more than 1 for dir", + btrfs_inode_nlink(leaf, iitem)); + return -EUCLEAN; + } + if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { + inode_item_err(fs_info, leaf, slot, + "unknown flags detected: 0x%llx", + btrfs_inode_flags(leaf, iitem) & + ~BTRFS_INODE_FLAG_MASK); + return -EUCLEAN; + } + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -704,6 +795,9 @@ static int check_leaf_item(struct btrfs_fs_info *fs_info, case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(fs_info, leaf, key, slot); break; + case BTRFS_INODE_ITEM_KEY: + ret = check_inode_item(fs_info, leaf, key, slot); + break; } return ret; } From cdf69f3b132c95aa255383b2d7907da5ba63221a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 25 Aug 2020 21:42:51 +0800 Subject: [PATCH 30/72] btrfs: tree-checker: fix the error message for transid error commit f96d6960abbc52e26ad124e69e6815283d3e1674 upstream. The error message for inode transid is the same as for inode generation, which makes us unable to detect the real problem. Reported-by: Tyler Richmond Fixes: 496245cac57e ("btrfs: tree-checker: Verify inode item") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Marcos Paulo de Souza Signed-off-by: Qu Wenruo Signed-off-by: David Sterba [bwh: Backported to 4.19: adjust context] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e77c25bc6860..9023e6b46396 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -715,7 +715,7 @@ static int check_inode_item(struct btrfs_fs_info *fs_info, /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { inode_item_err(fs_info, leaf, slot, - "invalid inode generation: has %llu expect [0, %llu]", + "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; } From 11af858054baadff13eb92eb347443f93768fde7 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 2 Nov 2020 13:32:42 -0500 Subject: [PATCH 31/72] Fonts: Replace discarded const qualifier commit 9522750c66c689b739e151fcdf895420dc81efc0 upstream. Commit 6735b4632def ("Fonts: Support FONT_EXTRA_WORDS macros for built-in fonts") introduced the following error when building rpc_defconfig (only this build appears to be affected): `acorndata_8x8' referenced in section `.text' of arch/arm/boot/compressed/ll_char_wr.o: defined in discarded section `.data' of arch/arm/boot/compressed/font.o `acorndata_8x8' referenced in section `.data.rel.ro' of arch/arm/boot/compressed/font.o: defined in discarded section `.data' of arch/arm/boot/compressed/font.o make[3]: *** [/scratch/linux/arch/arm/boot/compressed/Makefile:191: arch/arm/boot/compressed/vmlinux] Error 1 make[2]: *** [/scratch/linux/arch/arm/boot/Makefile:61: arch/arm/boot/compressed/vmlinux] Error 2 make[1]: *** [/scratch/linux/arch/arm/Makefile:317: zImage] Error 2 The .data section is discarded at link time. Reinstating acorndata_8x8 as const ensures it is still available after linking. Do the same for the other 12 built-in fonts as well, for consistency purposes. Cc: Cc: Russell King Reviewed-by: Greg Kroah-Hartman Fixes: 6735b4632def ("Fonts: Support FONT_EXTRA_WORDS macros for built-in fonts") Signed-off-by: Lee Jones Co-developed-by: Peilin Ye Signed-off-by: Peilin Ye Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20201102183242.2031659-1-yepeilin.cs@gmail.com Signed-off-by: Greg Kroah-Hartman --- lib/fonts/font_10x18.c | 2 +- lib/fonts/font_6x10.c | 2 +- lib/fonts/font_6x11.c | 2 +- lib/fonts/font_7x14.c | 2 +- lib/fonts/font_8x16.c | 2 +- lib/fonts/font_8x8.c | 2 +- lib/fonts/font_acorn_8x8.c | 2 +- lib/fonts/font_mini_4x6.c | 2 +- lib/fonts/font_pearl_8x8.c | 2 +- lib/fonts/font_sun12x22.c | 2 +- lib/fonts/font_sun8x16.c | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/fonts/font_10x18.c b/lib/fonts/font_10x18.c index 0e2deac97da0..e02f9df24d1e 100644 --- a/lib/fonts/font_10x18.c +++ b/lib/fonts/font_10x18.c @@ -8,7 +8,7 @@ #define FONTDATAMAX 9216 -static struct font_data fontdata_10x18 = { +static const struct font_data fontdata_10x18 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, 0x00, /* 0000000000 */ diff --git a/lib/fonts/font_6x10.c b/lib/fonts/font_6x10.c index 87da8acd07db..6e3c4b7691c8 100644 --- a/lib/fonts/font_6x10.c +++ b/lib/fonts/font_6x10.c @@ -3,7 +3,7 @@ #define FONTDATAMAX 2560 -static struct font_data fontdata_6x10 = { +static const struct font_data fontdata_6x10 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ diff --git a/lib/fonts/font_6x11.c b/lib/fonts/font_6x11.c index 5e975dfa10a5..2d22a24e816f 100644 --- a/lib/fonts/font_6x11.c +++ b/lib/fonts/font_6x11.c @@ -9,7 +9,7 @@ #define FONTDATAMAX (11*256) -static struct font_data fontdata_6x11 = { +static const struct font_data fontdata_6x11 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ diff --git a/lib/fonts/font_7x14.c b/lib/fonts/font_7x14.c index 86d298f38505..9cc7ae2e03f7 100644 --- a/lib/fonts/font_7x14.c +++ b/lib/fonts/font_7x14.c @@ -8,7 +8,7 @@ #define FONTDATAMAX 3584 -static struct font_data fontdata_7x14 = { +static const struct font_data fontdata_7x14 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 0000000 */ diff --git a/lib/fonts/font_8x16.c b/lib/fonts/font_8x16.c index 37cedd36ca5e..bab25dc59e8d 100644 --- a/lib/fonts/font_8x16.c +++ b/lib/fonts/font_8x16.c @@ -10,7 +10,7 @@ #define FONTDATAMAX 4096 -static struct font_data fontdata_8x16 = { +static const struct font_data fontdata_8x16 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ diff --git a/lib/fonts/font_8x8.c b/lib/fonts/font_8x8.c index 8ab695538395..109d0572368f 100644 --- a/lib/fonts/font_8x8.c +++ b/lib/fonts/font_8x8.c @@ -9,7 +9,7 @@ #define FONTDATAMAX 2048 -static struct font_data fontdata_8x8 = { +static const struct font_data fontdata_8x8 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ diff --git a/lib/fonts/font_acorn_8x8.c b/lib/fonts/font_acorn_8x8.c index 069b3e80c434..fb395f0d4031 100644 --- a/lib/fonts/font_acorn_8x8.c +++ b/lib/fonts/font_acorn_8x8.c @@ -5,7 +5,7 @@ #define FONTDATAMAX 2048 -static struct font_data acorndata_8x8 = { +static const struct font_data acorndata_8x8 = { { 0, 0, FONTDATAMAX, 0 }, { /* 00 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ^@ */ /* 01 */ 0x7e, 0x81, 0xa5, 0x81, 0xbd, 0x99, 0x81, 0x7e, /* ^A */ diff --git a/lib/fonts/font_mini_4x6.c b/lib/fonts/font_mini_4x6.c index 1449876c6a27..592774a90917 100644 --- a/lib/fonts/font_mini_4x6.c +++ b/lib/fonts/font_mini_4x6.c @@ -43,7 +43,7 @@ __END__; #define FONTDATAMAX 1536 -static struct font_data fontdata_mini_4x6 = { +static const struct font_data fontdata_mini_4x6 = { { 0, 0, FONTDATAMAX, 0 }, { /*{*/ /* Char 0: ' ' */ diff --git a/lib/fonts/font_pearl_8x8.c b/lib/fonts/font_pearl_8x8.c index 32d65551e7ed..a6f95ebce950 100644 --- a/lib/fonts/font_pearl_8x8.c +++ b/lib/fonts/font_pearl_8x8.c @@ -14,7 +14,7 @@ #define FONTDATAMAX 2048 -static struct font_data fontdata_pearl8x8 = { +static const struct font_data fontdata_pearl8x8 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, /* 00000000 */ diff --git a/lib/fonts/font_sun12x22.c b/lib/fonts/font_sun12x22.c index 641a6b4dca42..a5b65bd49604 100644 --- a/lib/fonts/font_sun12x22.c +++ b/lib/fonts/font_sun12x22.c @@ -3,7 +3,7 @@ #define FONTDATAMAX 11264 -static struct font_data fontdata_sun12x22 = { +static const struct font_data fontdata_sun12x22 = { { 0, 0, FONTDATAMAX, 0 }, { /* 0 0x00 '^@' */ 0x00, 0x00, /* 000000000000 */ diff --git a/lib/fonts/font_sun8x16.c b/lib/fonts/font_sun8x16.c index 193fe6d988e0..e577e76a6a7c 100644 --- a/lib/fonts/font_sun8x16.c +++ b/lib/fonts/font_sun8x16.c @@ -3,7 +3,7 @@ #define FONTDATAMAX 4096 -static struct font_data fontdata_sun8x16 = { +static const struct font_data fontdata_sun8x16 = { { 0, 0, FONTDATAMAX, 0 }, { /* */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* */ 0x00,0x00,0x7e,0x81,0xa5,0x81,0x81,0xbd,0x99,0x81,0x81,0x7e,0x00,0x00,0x00,0x00, From 0b443d1aff6d6f3706e48c93d9fe1212a569af4b Mon Sep 17 00:00:00 2001 From: Keith Winstein Date: Sun, 25 Oct 2020 22:05:47 -0700 Subject: [PATCH 32/72] ALSA: usb-audio: Add implicit feedback quirk for Zoom UAC-2 commit f15cfca818d756dd1c9492530091dfd583359db3 upstream. The Zoom UAC-2 USB audio interface provides an async playback endpoint ("1 OUT (ASYNC)") and capture endpoint ("2 IN (ASYNC)"), both with 2-channel S32_LE in 44.1, 48, 88.2, 96, 176.4, or 192 kilosamples/s. The device provides explicit feedback to adjust the host's playback rate, but the feedback appears unstable and biased relative to the device's capture rate. "alsaloop -t 1000" experiences playback underruns and tries to resample the captured audio to match the varying playback rate. Forcing the kernel to use implicit feedback appears to produce more stable results. This causes the host to transmit one playback sample for each capture sample received. (Zoom North America has been notified of this change.) Signed-off-by: Keith Winstein Tested-by: Keith Winstein Cc: BugLink: https://lore.kernel.org/r/20201027071841.GA164525@trolley.csail.mit.edu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/pcm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 7743e7bc6bf2..038ce38986e2 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -350,6 +350,10 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs, ep = 0x81; ifnum = 2; goto add_sync_ep_from_ifnum; + case USB_ID(0x1686, 0xf029): /* Zoom UAC-2 */ + ep = 0x82; + ifnum = 2; + goto add_sync_ep_from_ifnum; case USB_ID(0x1397, 0x0001): /* Behringer UFX1604 */ case USB_ID(0x1397, 0x0002): /* Behringer UFX1204 */ ep = 0x81; From 69a7cae8656039d4420e8613448387d0ebf494f0 Mon Sep 17 00:00:00 2001 From: Artem Lapkin Date: Tue, 3 Nov 2020 18:08:09 +0800 Subject: [PATCH 33/72] ALSA: usb-audio: add usb vendor id as DSD-capable for Khadas devices commit 07815a2b3501adeaae6384a25b9c4a9c81dae59f upstream. Khadas audio devices ( USB_ID_VENDOR 0x3353 ) have DSD-capable implementations from XMOS need add new usb vendor id for recognition Signed-off-by: Artem Lapkin Cc: Link: https://lore.kernel.org/r/20201103103311.5435-1-art@khadas.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index e6dea1c7112b..3effe2e86197 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1466,6 +1466,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case 0x278b: /* Rotel? */ case 0x292b: /* Gustard/Ess based devices */ case 0x2ab6: /* T+A devices */ + case 0x3353: /* Khadas devices */ case 0x3842: /* EVGA */ case 0xc502: /* HiBy devices */ if (fp->dsd_raw) From 9a83c225d4b118e65957abbc75bb844f88e5162b Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Wed, 4 Nov 2020 22:27:17 +1030 Subject: [PATCH 34/72] ALSA: usb-audio: Add implicit feedback quirk for Qu-16 commit 0938ecae432e7ac8b01080c35dd81d50a1e43033 upstream. This patch fixes audio distortion on playback for the Allen&Heath Qu-16. Signed-off-by: Geoffrey D. Bennett Cc: Link: https://lore.kernel.org/r/20201104115717.GA19046@b4.vu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 038ce38986e2..5a591bace2bd 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -334,6 +334,7 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs, switch (subs->stream->chip->usb_id) { case USB_ID(0x0763, 0x2030): /* M-Audio Fast Track C400 */ case USB_ID(0x0763, 0x2031): /* M-Audio Fast Track C600 */ + case USB_ID(0x22f0, 0x0006): /* Allen&Heath Qu-16 */ ep = 0x81; ifnum = 3; goto add_sync_ep_from_ifnum; From df0486a4faee7277932136f2526721f07443ec95 Mon Sep 17 00:00:00 2001 From: "Geoffrey D. Bennett" Date: Wed, 4 Nov 2020 22:37:05 +1030 Subject: [PATCH 35/72] ALSA: usb-audio: Add implicit feedback quirk for MODX commit 26201ddc1373c99b2a67c5774da2f0eecd749b93 upstream. This patch fixes audio distortion on playback for the Yamaha MODX. Signed-off-by: Geoffrey D. Bennett Tested-by: Frank Slotta Cc: Link: https://lore.kernel.org/r/20201104120705.GA19126@b4.vu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/pcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index 5a591bace2bd..e4d2fcc89c30 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -344,6 +344,7 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs, ifnum = 2; goto add_sync_ep_from_ifnum; case USB_ID(0x2466, 0x8003): /* Fractal Audio Axe-Fx II */ + case USB_ID(0x0499, 0x172a): /* Yamaha MODX */ ep = 0x86; ifnum = 2; goto add_sync_ep_from_ifnum; From d4fea27ffcb01bbf8ca8ae54f60479945394b83e Mon Sep 17 00:00:00 2001 From: Shijie Luo Date: Sun, 1 Nov 2020 17:07:40 -0800 Subject: [PATCH 36/72] mm: mempolicy: fix potential pte_unmap_unlock pte error commit 3f08842098e842c51e3b97d0dcdebf810b32558e upstream. When flags in queue_pages_pte_range don't have MPOL_MF_MOVE or MPOL_MF_MOVE_ALL bits, code breaks and passing origin pte - 1 to pte_unmap_unlock seems like not a good idea. queue_pages_pte_range can run in MPOL_MF_MOVE_ALL mode which doesn't migrate misplaced pages but returns with EIO when encountering such a page. Since commit a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified") and early break on the first pte in the range results in pte_unmap_unlock on an underflow pte. This can lead to lockups later on when somebody tries to lock the pte resp. page_table_lock again.. Fixes: a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified") Signed-off-by: Shijie Luo Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Miaohe Lin Cc: Feilong Lin Cc: Shijie Luo Cc: Link: https://lkml.kernel.org/r/20201019074853.50856-1-luoshijie1@huawei.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mempolicy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68c46da82aac..3cd27c1c729f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -496,7 +496,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long flags = qp->flags; int ret; bool has_unmovable = false; - pte_t *pte; + pte_t *pte, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); @@ -510,7 +510,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; - pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; @@ -542,7 +542,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, } else break; } - pte_unmap_unlock(pte - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); cond_resched(); if (has_unmovable) From 1471e96263b6e68929e7afed28f80ab3b6bd1066 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 1 Nov 2020 17:07:47 -0800 Subject: [PATCH 37/72] lib/crc32test: remove extra local_irq_disable/enable commit aa4e460f0976351fddd2f5ac6e08b74320c277a1 upstream. Commit 4d004099a668 ("lockdep: Fix lockdep recursion") uncovered the following issue in lib/crc32test reported on s390: BUG: using __this_cpu_read() in preemptible [00000000] code: swapper/0/1 caller is lockdep_hardirqs_on_prepare+0x48/0x270 CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.9.0-next-20201015-15164-g03d992bd2de6 #19 Hardware name: IBM 3906 M04 704 (LPAR) Call Trace: lockdep_hardirqs_on_prepare+0x48/0x270 trace_hardirqs_on+0x9c/0x1b8 crc32_test.isra.0+0x170/0x1c0 crc32test_init+0x1c/0x40 do_one_initcall+0x40/0x130 do_initcalls+0x126/0x150 kernel_init_freeable+0x1f6/0x230 kernel_init+0x22/0x150 ret_from_fork+0x24/0x2c no locks held by swapper/0/1. Remove extra local_irq_disable/local_irq_enable helpers calls. Fixes: 5fb7f87408f1 ("lib: add module support to crc32 tests") Signed-off-by: Vasily Gorbik Signed-off-by: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/patch.git-4369da00c06e.your-ad-here.call-01602859837-ext-1679@work.hours Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- lib/crc32test.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/crc32test.c b/lib/crc32test.c index 97d6a57cefcc..61ddce2cff77 100644 --- a/lib/crc32test.c +++ b/lib/crc32test.c @@ -683,7 +683,6 @@ static int __init crc32c_test(void) /* reduce OS noise */ local_irq_save(flags); - local_irq_disable(); nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { @@ -694,7 +693,6 @@ static int __init crc32c_test(void) nsec = ktime_get_ns() - nsec; local_irq_restore(flags); - local_irq_enable(); pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); @@ -768,7 +766,6 @@ static int __init crc32_test(void) /* reduce OS noise */ local_irq_save(flags); - local_irq_disable(); nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { @@ -783,7 +780,6 @@ static int __init crc32_test(void) nsec = ktime_get_ns() - nsec; local_irq_restore(flags); - local_irq_enable(); pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); From 68e8b8ed787a8acafa3e987d52f6d415c5949ec6 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 1 Nov 2020 17:07:53 -0800 Subject: [PATCH 38/72] kthread_worker: prevent queuing delayed work from timer_fn when it is being canceled commit 6993d0fdbee0eb38bfac350aa016f65ad11ed3b1 upstream. There is a small race window when a delayed work is being canceled and the work still might be queued from the timer_fn: CPU0 CPU1 kthread_cancel_delayed_work_sync() __kthread_cancel_work_sync() __kthread_cancel_work() work->canceling++; kthread_delayed_work_timer_fn() kthread_insert_work(); BUG: kthread_insert_work() should not get called when work->canceling is set. Signed-off-by: Zqiang Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Acked-by: Tejun Heo Cc: Link: https://lkml.kernel.org/r/20201014083030.16895-1-qiang.zhang@windriver.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/kthread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index b786eda90bb5..2eed853ab9cc 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -863,7 +863,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); - kthread_insert_work(worker, work, &worker->work_list); + if (!work->canceling) + kthread_insert_work(worker, work, &worker->work_list); spin_unlock(&worker->lock); } From 361d82eb7c301e7473459cb7f1e0d7733cf5e7ec Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 1 Nov 2020 17:08:00 -0800 Subject: [PATCH 39/72] mm: always have io_remap_pfn_range() set pgprot_decrypted() commit f8f6ae5d077a9bdaf5cbf2ac960a5d1a04b47482 upstream. The purpose of io_remap_pfn_range() is to map IO memory, such as a memory mapped IO exposed through a PCI BAR. IO devices do not understand encryption, so this memory must always be decrypted. Automatically call pgprot_decrypted() as part of the generic implementation. This fixes a bug where enabling AMD SME causes subsystems, such as RDMA, using io_remap_pfn_range() to expose BAR pages to user space to fail. The CPU will encrypt access to those BAR pages instead of passing unencrypted IO directly to the device. Places not mapping IO should use remap_pfn_range(). Fixes: aca20d546214 ("x86/mm: Add support to make use of Secure Memory Encryption") Signed-off-by: Jason Gunthorpe Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Tom Lendacky Cc: Thomas Gleixner Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brijesh Singh Cc: Jonathan Corbet Cc: Dmitry Vyukov Cc: "Dave Young" Cc: Alexander Potapenko Cc: Konrad Rzeszutek Wilk Cc: Andy Lutomirski Cc: Larry Woodman Cc: Matt Fleming Cc: Ingo Molnar Cc: "Michael S. Tsirkin" Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Rik van Riel Cc: Toshimitsu Kani Cc: Link: https://lkml.kernel.org/r/0-v1-025d64bdf6c4+e-amd_sme_fix_jgg@nvidia.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/pgtable.h | 4 ---- include/linux/mm.h | 9 +++++++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 15fd0277ffa6..5901b0005929 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1115,10 +1115,6 @@ static inline bool arch_has_pfn_modify_check(void) #endif /* !__ASSEMBLY__ */ -#ifndef io_remap_pfn_range -#define io_remap_pfn_range remap_pfn_range -#endif - #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 diff --git a/include/linux/mm.h b/include/linux/mm.h index 83828c118b6b..f6ecf41aea83 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2561,6 +2561,15 @@ static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +#ifndef io_remap_pfn_range +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); +} +#endif + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) From fe0af0efa7b1ecb8aebb5e8f44d00db07f564c6c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 26 Oct 2020 10:52:29 -0400 Subject: [PATCH 40/72] gfs2: Wake up when sd_glock_disposal becomes zero commit da7d554f7c62d0c17c1ac3cc2586473c2d99f0bd upstream. Commit fc0e38dae645 ("GFS2: Fix glock deallocation race") fixed a sd_glock_disposal accounting bug by adding a missing atomic_dec statement, but it failed to wake up sd_glock_wait when that decrement causes sd_glock_disposal to reach zero. As a consequence, gfs2_gl_hash_clear can now run into a 10-minute timeout instead of being woken up. Add the missing wakeup. Fixes: fc0e38dae645 ("GFS2: Fix glock deallocation race") Cc: stable@vger.kernel.org # v2.6.39+ Signed-off-by: Alexander Aring Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/glock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ccdd8c821abd..c20d71d86812 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -870,7 +870,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, out_free: kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); - atomic_dec(&sdp->sd_glock_disposal); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); out: return ret; From b410d07e965a039dc67073889dab9ff01cee8402 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Nov 2020 15:31:27 -0500 Subject: [PATCH 41/72] ring-buffer: Fix recursion protection transitions between interrupt context commit b02414c8f045ab3b9afc816c3735bc98c5c3d262 upstream. The recursion protection of the ring buffer depends on preempt_count() to be correct. But it is possible that the ring buffer gets called after an interrupt comes in but before it updates the preempt_count(). This will trigger a false positive in the recursion code. Use the same trick from the ftrace function callback recursion code which uses a "transition" bit that gets set, to allow for a single recursion for to handle transitions between contexts. Cc: stable@vger.kernel.org Fixes: 567cd4da54ff4 ("ring-buffer: User context bit recursion checking") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ring_buffer.c | 58 ++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index eef05eb3b284..87ce9736043d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -444,14 +444,16 @@ struct rb_event_info { /* * Used for which event context the event is in. - * NMI = 0 - * IRQ = 1 - * SOFTIRQ = 2 - * NORMAL = 3 + * TRANSITION = 0 + * NMI = 1 + * IRQ = 2 + * SOFTIRQ = 3 + * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { + RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, @@ -2620,10 +2622,10 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. + * bit 1 = NMI context + * bit 2 = IRQ context + * bit 3 = SoftIRQ context + * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. A SoftIRQ never preempts an IRQ @@ -2646,6 +2648,30 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSTION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline int @@ -2661,8 +2687,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) - return 1; + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { + /* + * It is possible that this was called by transitioning + * between interrupt context, and preempt_count() has not + * been updated yet. In this case, use the TRANSITION bit. + */ + bit = RB_CTX_TRANSITION; + if (val & (1 << (bit + cpu_buffer->nest))) + return 1; + } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; @@ -2677,8 +2711,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->current_context - (1 << cpu_buffer->nest); } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested From ee2b95c08515633da67c048beb164966224ffe9e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Oct 2020 17:31:45 -0400 Subject: [PATCH 42/72] ftrace: Fix recursion check for NMI test commit ee11b93f95eabdf8198edd4668bf9102e7248270 upstream. The code that checks recursion will work to only do the recursion check once if there's nested checks. The top one will do the check, the other nested checks will see recursion was already checked and return zero for its "bit". On the return side, nothing will be done if the "bit" is zero. The problem is that zero is returned for the "good" bit when in NMI context. This will set the bit for NMIs making it look like *all* NMI tracing is recursing, and prevent tracing of anything in NMI context! The simple fix is to return "bit + 1" and subtract that bit on the end to get the real bit. Cc: stable@vger.kernel.org Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ee0c6a313ed1..cb75f93fd254 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -595,7 +595,7 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) current->trace_recursion = val; barrier(); - return bit; + return bit + 1; } static __always_inline void trace_clear_recursion(int bit) @@ -605,6 +605,7 @@ static __always_inline void trace_clear_recursion(int bit) if (!bit) return; + bit--; bit = 1 << bit; val &= ~bit; From 2de780dfbe1ad05c3f15f8d6f2d22dae9cf95267 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Oct 2020 19:35:08 -0400 Subject: [PATCH 43/72] ftrace: Handle tracing when switching between context commit 726b3d3f141fba6f841d715fc4d8a4a84f02c02a upstream. When an interrupt or NMI comes in and switches the context, there's a delay from when the preempt_count() shows the update. As the preempt_count() is used to detect recursion having each context have its own bit get set when tracing starts, and if that bit is already set, it is considered a recursion and the function exits. But if this happens in that section where context has changed but preempt_count() has not been updated, this will be incorrectly flagged as a recursion. To handle this case, create another bit call TRANSITION and test it if the current context bit is already set. Flag the call as a recursion if the TRANSITION bit is already set, and if not, set it and continue. The TRANSITION bit will be cleared normally on the return of the function that set it, or if the current context bit is clear, set it and clear the TRANSITION bit to allow for another transition between the current context and an even higher one. Cc: stable@vger.kernel.org Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.h | 23 +++++++++++++++++++++-- kernel/trace/trace_selftest.c | 9 +++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cb75f93fd254..01475411fd1b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -534,6 +534,12 @@ enum { TRACE_GRAPH_DEPTH_START_BIT, TRACE_GRAPH_DEPTH_END_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -588,8 +594,21 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) return 0; bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) - return -1; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); val |= 1 << bit; current->trace_recursion = val; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 11e9daa4a568..330ba2b081ed 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -492,8 +492,13 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_rec_probe); ret = -1; - if (trace_selftest_recursion_cnt != 1) { - pr_cont("*callback not called once (%d)* ", + /* + * Recursion allows for transitions between context, + * and may call the callback twice. + */ + if (trace_selftest_recursion_cnt != 1 && + trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called once (or twice) (%d)* ", trace_selftest_recursion_cnt); goto out; } From 7e4eeff7da1df5602e9af340d01b2ce1b4662d3e Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Fri, 30 Oct 2020 00:19:05 +0800 Subject: [PATCH 44/72] tracing: Fix out of bounds write in get_trace_buf commit c1acb4ac1a892cf08d27efcb964ad281728b0545 upstream. The nesting count of trace_printk allows for 4 levels of nesting. The nesting counter starts at zero and is incremented before being used to retrieve the current context's buffer. But the index to the buffer uses the nesting counter after it was incremented, and not its original number, which in needs to do. Link: https://lkml.kernel.org/r/20201029161905.4269-1-hqjagain@gmail.com Cc: stable@vger.kernel.org Fixes: 3d9622c12c887 ("tracing: Add barrier to trace_printk() buffer nesting modification") Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6bf617ff0369..c3cc6aaa6f79 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2819,7 +2819,7 @@ static char *get_trace_buf(void) /* Interrupts must see nesting incremented before we use the buffer */ barrier(); - return &buffer->buffer[buffer->nesting][0]; + return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) From c096a3d44ead54b53a3b65d9d210022099bdc7ef Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Nov 2020 16:12:44 +0100 Subject: [PATCH 45/72] futex: Handle transient "ownerless" rtmutex state correctly commit 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 upstream. Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner(). This is one possible chain of events leading to this: Task Prio Operation T1 120 lock(F) T2 120 lock(F) -> blocks (top waiter) T3 50 (RT) lock(F) -> boosts T1 and blocks (new top waiter) XX timeout/ -> wakes T2 signal T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set) T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter and the lower priority T2 cannot steal the lock. -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON() The comment states that this is invalid and rt_mutex_real_owner() must return a non NULL owner when the trylock failed, but in case of a queued and woken up waiter rt_mutex_real_owner() == NULL is a valid transient state. The higher priority waiter has simply not yet managed to take over the rtmutex. The BUG_ON() is therefore wrong and this is just another retry condition in fixup_pi_state_owner(). Drop the locks, so that T3 can make progress, and then try the fixup again. Gratian provided a great analysis, traces and a reproducer. The analysis is to the point, but it confused the hell out of that tglx dude who had to page in all the futex horrors again. Condensed version is above. [ tglx: Wrote comment and changelog ] Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Reported-by: Gratian Crisan Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index eabb9180ffa8..52f641c00a65 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2417,10 +2417,22 @@ retry: } /* - * Since we just failed the trylock; there must be an owner. + * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. */ newowner = rt_mutex_owner(&pi_state->pi_mutex); - BUG_ON(!newowner); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_err; + } } else { WARN_ON_ONCE(argowner != current); if (oldowner == current) { From dffce0f0e7f1c2c421ef488642764d81ff62da0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20P=C3=A9ron?= Date: Sat, 3 Oct 2020 12:03:32 +0200 Subject: [PATCH 46/72] ARM: dts: sun4i-a10: fix cpu_alert temperature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit dea252fa41cd8ce332d148444e4799235a8a03ec ] When running dtbs_check thermal_zone warn about the temperature declared. thermal-zones: cpu-thermal:trips:cpu-alert0:temperature:0:0: 850000 is greater than the maximum of 200000 It's indeed wrong the real value is 85°C and not 850°C. Signed-off-by: Clément Péron Signed-off-by: Maxime Ripard Link: https://lore.kernel.org/r/20201003100332.431178-1-peron.clem@gmail.com Signed-off-by: Sasha Levin --- arch/arm/boot/dts/sun4i-a10.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index 5d46bb0139fa..707ad5074878 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -143,7 +143,7 @@ trips { cpu_alert0: cpu-alert0 { /* milliCelsius */ - temperature = <850000>; + temperature = <85000>; hysteresis = <2000>; type = "passive"; }; From f81794c3789dcacf5dc5abe7da557f33e0c60ecf Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 14 Oct 2020 17:24:28 +0800 Subject: [PATCH 47/72] x86/kexec: Use up-to-dated screen_info copy to fill boot params [ Upstream commit afc18069a2cb7ead5f86623a5f3d4ad6e21f940d ] kexec_file_load() currently reuses the old boot_params.screen_info, but if drivers have change the hardware state, boot_param.screen_info could contain invalid info. For example, the video type might be no longer VGA, or the frame buffer address might be changed. If the kexec kernel keeps using the old screen_info, kexec'ed kernel may attempt to write to an invalid framebuffer memory region. There are two screen_info instances globally available, boot_params.screen_info and screen_info. Later one is a copy, and is updated by drivers. So let kexec_file_load use the updated copy. [ mingo: Tidied up the changelog. ] Signed-off-by: Kairui Song Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20201014092429.1415040-2-kasong@redhat.com Signed-off-by: Sasha Levin --- arch/x86/kernel/kexec-bzimage64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 9490a2845f14..273687986a26 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -211,8 +211,7 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; /* Copying screen_info will do? */ - memcpy(¶ms->screen_info, &boot_params.screen_info, - sizeof(struct screen_info)); + memcpy(¶ms->screen_info, &screen_info, sizeof(struct screen_info)); /* Fill in memsize later */ params->screen_info.ext_mem_k = 0; From b8f922aa95aa07759623e2adab65bdba8811e808 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Wed, 21 Oct 2020 11:53:59 +0200 Subject: [PATCH 48/72] of: Fix reserved-memory overlap detection [ Upstream commit ca05f33316559a04867295dd49f85aeedbfd6bfd ] The reserved-memory overlap detection code fails to detect overlaps if either of the regions starts at address 0x0. The code explicitly checks for and ignores such regions, apparently in order to ignore dynamically allocated regions which have an address of 0x0 at this point. These dynamically allocated regions also have a size of 0x0 at this point, so fix this by removing the check and sorting the dynamically allocated regions ahead of any static regions at address 0x0. For example, there are two overlaps in this case but they are not currently reported: foo@0 { reg = <0x0 0x2000>; }; bar@0 { reg = <0x0 0x1000>; }; baz@1000 { reg = <0x1000 0x1000>; }; quux { size = <0x1000>; }; but they are after this patch: OF: reserved mem: OVERLAP DETECTED! bar@0 (0x00000000--0x00001000) overlaps with foo@0 (0x00000000--0x00002000) OF: reserved mem: OVERLAP DETECTED! foo@0 (0x00000000--0x00002000) overlaps with baz@1000 (0x00001000--0x00002000) Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/ded6fd6b47b58741aabdcc6967f73eca6a3f311e.1603273666.git-series.vincent.whitchurch@axis.com Signed-off-by: Rob Herring Signed-off-by: Sasha Levin --- drivers/of/of_reserved_mem.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 895c83e0c7b6..19f95552da4d 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -218,6 +218,16 @@ static int __init __rmem_cmp(const void *a, const void *b) if (ra->base > rb->base) return 1; + /* + * Put the dynamic allocations (address == 0, size == 0) before static + * allocations at address 0x0 so that overlap detection works + * correctly. + */ + if (ra->size < rb->size) + return -1; + if (ra->size > rb->size) + return 1; + return 0; } @@ -235,8 +245,7 @@ static void __init __rmem_check_for_overlap(void) this = &reserved_mem[i]; next = &reserved_mem[i + 1]; - if (!(this->base && next->base)) - continue; + if (this->base + this->size > next->base) { phys_addr_t this_end, next_end; From 6dba51828dfd31e326e6642727ede24c054a86fe Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 22 Oct 2020 16:58:41 -0400 Subject: [PATCH 49/72] blk-cgroup: Fix memleak on error path [ Upstream commit 52abfcbd57eefdd54737fc8c2dc79d8f46d4a3e5 ] If new_blkg allocation raced with blk_policy change and blkg_lookup_check fails, new_blkg is leaked. Acked-by: Tejun Heo Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index a06547fe6f6b..51fc803c999d 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -882,6 +882,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg = blkg_lookup_check(pos, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); + blkg_free(new_blkg); goto fail_unlock; } From 239aed5d2ecbfd381ca642f0a6bcb272c98408df Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 22 Oct 2020 16:58:42 -0400 Subject: [PATCH 50/72] blk-cgroup: Pre-allocate tree node on blkg_conf_prep [ Upstream commit f255c19b3ab46d3cad3b1b2e1036f4c926cb1d0c ] Similarly to commit 457e490f2b741 ("blkcg: allocate struct blkcg_gq outside request queue spinlock"), blkg_create can also trigger occasional -ENOMEM failures at the radix insertion because any allocation inside blkg_create has to be non-blocking, making it more likely to fail. This causes trouble for userspace tools trying to configure io weights who need to deal with this condition. This patch reduces the occurrence of -ENOMEMs on this path by preloading the radix tree element on a GFP_KERNEL context, such that we guarantee the later non-blocking insertion won't fail. A similar solution exists in blkcg_init_queue for the same situation. Acked-by: Tejun Heo Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- block/blk-cgroup.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 51fc803c999d..85bd46e0a745 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -876,6 +876,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto fail; } + if (radix_tree_preload(GFP_KERNEL)) { + blkg_free(new_blkg); + ret = -ENOMEM; + goto fail; + } + rcu_read_lock(); spin_lock_irq(q->queue_lock); @@ -883,7 +889,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); blkg_free(new_blkg); - goto fail_unlock; + goto fail_preloaded; } if (blkg) { @@ -892,10 +898,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg = blkg_create(pos, q, new_blkg); if (unlikely(IS_ERR(blkg))) { ret = PTR_ERR(blkg); - goto fail_unlock; + goto fail_preloaded; } } + radix_tree_preload_end(); + if (pos == blkcg) goto success; } @@ -905,6 +913,8 @@ success: ctx->body = body; return 0; +fail_preloaded: + radix_tree_preload_end(); fail_unlock: spin_unlock_irq(q->queue_lock); rcu_read_unlock(); From 6280db912e03089b15a3e2c2882d573d3b09bd0c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 10 Oct 2020 11:25:39 +0800 Subject: [PATCH 51/72] scsi: core: Don't start concurrent async scan on same host [ Upstream commit 831e3405c2a344018a18fcc2665acc5a38c3a707 ] The current scanning mechanism is supposed to fall back to a synchronous host scan if an asynchronous scan is in progress. However, this rule isn't strictly respected, scsi_prep_async_scan() doesn't hold scan_mutex when checking shost->async_scan. When scsi_scan_host() is called concurrently, two async scans on same host can be started and a hang in do_scan_async() is observed. Fixes this issue by checking & setting shost->async_scan atomically with shost->scan_mutex. Link: https://lore.kernel.org/r/20201010032539.426615-1-ming.lei@redhat.com Cc: Christoph Hellwig Cc: Ewan D. Milne Cc: Hannes Reinecke Cc: Bart Van Assche Reviewed-by: Lee Duncan Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_scan.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 9a7e3a3bd5ce..009a5b2aa3d0 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -1722,15 +1722,16 @@ static void scsi_sysfs_add_devices(struct Scsi_Host *shost) */ static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost) { - struct async_scan_data *data; + struct async_scan_data *data = NULL; unsigned long flags; if (strncmp(scsi_scan_type, "sync", 4) == 0) return NULL; + mutex_lock(&shost->scan_mutex); if (shost->async_scan) { shost_printk(KERN_DEBUG, shost, "%s called twice\n", __func__); - return NULL; + goto err; } data = kmalloc(sizeof(*data), GFP_KERNEL); @@ -1741,7 +1742,6 @@ static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost) goto err; init_completion(&data->prev_finished); - mutex_lock(&shost->scan_mutex); spin_lock_irqsave(shost->host_lock, flags); shost->async_scan = 1; spin_unlock_irqrestore(shost->host_lock, flags); @@ -1756,6 +1756,7 @@ static struct async_scan_data *scsi_prep_async_scan(struct Scsi_Host *shost) return data; err: + mutex_unlock(&shost->scan_mutex); kfree(data); return NULL; } From 1aa82721dd74c9f08337ea0a22689eb2a94634ae Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Fri, 23 Oct 2020 16:37:57 +0200 Subject: [PATCH 52/72] vsock: use ns_capable_noaudit() on socket create [ Upstream commit af545bb5ee53f5261db631db2ac4cde54038bdaf ] During __vsock_create() CAP_NET_ADMIN is used to determine if the vsock_sock->trusted should be set to true. This value is used later for determing if a remote connection should be allowed to connect to a restricted VM. Unfortunately, if the caller doesn't have CAP_NET_ADMIN, an audit message such as an selinux denial is generated even if the caller does not want a trusted socket. Logging errors on success is confusing. To avoid this, switch the capable(CAP_NET_ADMIN) check to the noaudit version. Reported-by: Roman Kiryanov https://android-review.googlesource.com/c/device/generic/goldfish/+/1468545/ Signed-off-by: Jeff Vander Stoep Reviewed-by: James Morris Link: https://lore.kernel.org/r/20201023143757.377574-1-jeffv@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/vmw_vsock/af_vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index c88dc8ee3144..02374459c417 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -629,7 +629,7 @@ struct sock *__vsock_create(struct net *net, vsk->owner = get_cred(psk->owner); vsk->connect_timeout = psk->connect_timeout; } else { - vsk->trusted = capable(CAP_NET_ADMIN); + vsk->trusted = ns_capable_noaudit(&init_user_ns, CAP_NET_ADMIN); vsk->owner = get_current_cred(); vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; } From 6eecfcbcde431904e5837d285e9e99b5a5eac02c Mon Sep 17 00:00:00 2001 From: Hoegeun Kwon Date: Tue, 27 Oct 2020 13:14:42 +0900 Subject: [PATCH 53/72] drm/vc4: drv: Add error handding for bind [ Upstream commit 9ce0af3e9573fb84c4c807183d13ea2a68271e4b ] There is a problem that if vc4_drm bind fails, a memory leak occurs on the drm_property_create side. Add error handding for drm_mode_config. Signed-off-by: Hoegeun Kwon Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20201027041442.30352-2-hoegeun.kwon@samsung.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/vc4/vc4_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c index 04270a14fcaa..868dd1ef3b69 100644 --- a/drivers/gpu/drm/vc4/vc4_drv.c +++ b/drivers/gpu/drm/vc4/vc4_drv.c @@ -312,6 +312,7 @@ unbind_all: component_unbind_all(dev, drm); gem_destroy: vc4_gem_destroy(drm); + drm_mode_config_cleanup(drm); vc4_bo_cache_destroy(drm); dev_put: drm_dev_put(drm); From a52cdf61125b2189a8e1d85d1e61d654f7fe5d4d Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 27 Oct 2020 21:49:01 +0800 Subject: [PATCH 54/72] ACPI: NFIT: Fix comparison to '-ENXIO' [ Upstream commit 85f971b65a692b68181438e099b946cc06ed499b ] Initial value of rc is '-ENXIO', and we should use the initial value to check it. Signed-off-by: Zhang Qilong Reviewed-by: Pankaj Gupta Reviewed-by: Vishal Verma [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/nfit/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index dd4c7289610e..cb88f3b43a94 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1535,7 +1535,7 @@ static ssize_t format1_show(struct device *dev, le16_to_cpu(nfit_dcr->dcr->code)); break; } - if (rc != ENXIO) + if (rc != -ENXIO) break; } mutex_unlock(&acpi_desc->init_mutex); From 6612b754ac0c85ca8b1181b5d3ea4461a8c1bbcb Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 8 Nov 2020 16:38:06 +0100 Subject: [PATCH 55/72] vt: Disable KD_FONT_OP_COPY commit 3c4e0dff2095c579b142d5a0693257f1c58b4804 upstream. It's buggy: On Fri, Nov 06, 2020 at 10:30:08PM +0800, Minh Yuan wrote: > We recently discovered a slab-out-of-bounds read in fbcon in the latest > kernel ( v5.10-rc2 for now ). The root cause of this vulnerability is that > "fbcon_do_set_font" did not handle "vc->vc_font.data" and > "vc->vc_font.height" correctly, and the patch > for VT_RESIZEX can't handle this > issue. > > Specifically, we use KD_FONT_OP_SET to set a small font.data for tty6, and > use KD_FONT_OP_SET again to set a large font.height for tty1. After that, > we use KD_FONT_OP_COPY to assign tty6's vc_font.data to tty1's vc_font.data > in "fbcon_do_set_font", while tty1 retains the original larger > height. Obviously, this will cause an out-of-bounds read, because we can > access a smaller vc_font.data with a larger vc_font.height. Further there was only one user ever. - Android's loadfont, busybox and console-tools only ever use OP_GET and OP_SET - fbset documentation only mentions the kernel cmdline font: option, not anything else. - systemd used OP_COPY before release 232 published in Nov 2016 Now unfortunately the crucial report seems to have gone down with gmane, and the commit message doesn't say much. But the pull request hints at OP_COPY being broken https://github.com/systemd/systemd/pull/3651 So in other words, this never worked, and the only project which foolishly every tried to use it, realized that rather quickly too. Instead of trying to fix security issues here on dead code by adding missing checks, fix the entire thing by removing the functionality. Note that systemd code using the OP_COPY function ignored the return value, so it doesn't matter what we're doing here really - just in case a lone server somewhere happens to be extremely unlucky and running an affected old version of systemd. The relevant code from font_copy_to_all_vcs() in systemd was: /* copy font from active VT, where the font was uploaded to */ cfo.op = KD_FONT_OP_COPY; cfo.height = vcs.v_active-1; /* tty1 == index 0 */ (void) ioctl(vcfd, KDFONTOP, &cfo); Note this just disables the ioctl, garbage collecting the now unused callbacks is left for -next. v2: Tetsuo found the old mail, which allowed me to find it on another archive. Add the link too. Acked-by: Peilin Ye Reported-by: Minh Yuan References: https://lists.freedesktop.org/archives/systemd-devel/2016-June/036935.html References: https://github.com/systemd/systemd/pull/3651 Cc: Greg KH Cc: Peilin Ye Cc: Tetsuo Handa Signed-off-by: Daniel Vetter Link: https://lore.kernel.org/r/20201108153806.3140315-1-daniel.vetter@ffwll.ch Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/vt.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 758f522f331e..13ea0579f104 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -4574,27 +4574,6 @@ static int con_font_default(struct vc_data *vc, struct console_font_op *op) return rc; } -static int con_font_copy(struct vc_data *vc, struct console_font_op *op) -{ - int con = op->height; - int rc; - - - console_lock(); - if (vc->vc_mode != KD_TEXT) - rc = -EINVAL; - else if (!vc->vc_sw->con_font_copy) - rc = -ENOSYS; - else if (con < 0 || !vc_cons_allocated(con)) - rc = -ENOTTY; - else if (con == vc->vc_num) /* nothing to do */ - rc = 0; - else - rc = vc->vc_sw->con_font_copy(vc, con); - console_unlock(); - return rc; -} - int con_font_op(struct vc_data *vc, struct console_font_op *op) { switch (op->op) { @@ -4605,7 +4584,8 @@ int con_font_op(struct vc_data *vc, struct console_font_op *op) case KD_FONT_OP_SET_DEFAULT: return con_font_default(vc, op); case KD_FONT_OP_COPY: - return con_font_copy(vc, op); + /* was buggy and never really used */ + return -EINVAL; } return -ENOSYS; } From b177d2d915cea2d0a590f0034a20299dd1ee3ef2 Mon Sep 17 00:00:00 2001 From: Eddy Wu Date: Sat, 7 Nov 2020 14:47:22 +0800 Subject: [PATCH 56/72] fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent commit b4e00444cab4c3f3fec876dc0cccc8cbb0d1a948 upstream. current->group_leader->exit_signal may change during copy_process() if current->real_parent exits. Move the assignment inside tasklist_lock to avoid the race. Signed-off-by: Eddy Wu Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 3ed29bf8eb29..f2c92c100194 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1965,14 +1965,9 @@ static __latent_entropy struct task_struct *copy_process( /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = (clone_flags & CSIGNAL); p->group_leader = p; p->tgid = p->pid; } @@ -2017,9 +2012,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = (clone_flags & CSIGNAL); } klp_copy_process(p); From 18f5757d7fd3ae455c2fcfd60efd5242ea9836f6 Mon Sep 17 00:00:00 2001 From: Claire Chang Date: Mon, 2 Nov 2020 20:07:49 +0800 Subject: [PATCH 57/72] serial: 8250_mtk: Fix uart_get_baud_rate warning commit 912ab37c798770f21b182d656937072b58553378 upstream. Mediatek 8250 port supports speed higher than uartclk / 16. If the baud rates in both the new and the old termios setting are higher than uartclk / 16, the WARN_ON in uart_get_baud_rate() will be triggered. Passing NULL as the old termios so uart_get_baud_rate() will use uartclk / 16 - 1 as the new baud rate which will be replaced by the original baud rate later by tty_termios_encode_baud_rate() in mtk8250_set_termios(). Fixes: 551e553f0d4a ("serial: 8250_mtk: Fix high-speed baud rates clamping") Signed-off-by: Claire Chang Link: https://lore.kernel.org/r/20201102120749.374458-1-tientzu@chromium.org Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_mtk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c index 159169a48deb..cc4f0c5e5ddf 100644 --- a/drivers/tty/serial/8250/8250_mtk.c +++ b/drivers/tty/serial/8250/8250_mtk.c @@ -47,7 +47,7 @@ mtk8250_set_termios(struct uart_port *port, struct ktermios *termios, */ baud = tty_termios_baud_rate(termios); - serial8250_do_set_termios(port, termios, old); + serial8250_do_set_termios(port, termios, NULL); tty_termios_encode_baud_rate(termios, baud, baud); From 288609c2bae2b08a21d1c794053c7e761620a946 Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Tue, 3 Nov 2020 16:49:42 +0800 Subject: [PATCH 58/72] serial: txx9: add missing platform_driver_unregister() on error in serial_txx9_init commit 0c5fc92622ed5531ff324b20f014e9e3092f0187 upstream. Add the missing platform_driver_unregister() before return from serial_txx9_init in the error handling case when failed to register serial_txx9_pci_driver with macro ENABLE_SERIAL_TXX9_PCI defined. Fixes: ab4382d27412 ("tty: move drivers/serial/ to drivers/tty/serial/") Signed-off-by: Qinglang Miao Link: https://lore.kernel.org/r/20201103084942.109076-1-miaoqinglang@huawei.com Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/serial_txx9.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/tty/serial/serial_txx9.c b/drivers/tty/serial/serial_txx9.c index 1b4008d022bf..2f7ef64536a0 100644 --- a/drivers/tty/serial/serial_txx9.c +++ b/drivers/tty/serial/serial_txx9.c @@ -1284,6 +1284,9 @@ static int __init serial_txx9_init(void) #ifdef ENABLE_SERIAL_TXX9_PCI ret = pci_register_driver(&serial_txx9_pci_driver); + if (ret) { + platform_driver_unregister(&serial_txx9_plat_driver); + } #endif if (ret == 0) goto out; From 202dae99e649d49355133d2bb9150244fd7695e5 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 26 Oct 2020 09:25:48 +0100 Subject: [PATCH 59/72] USB: serial: cyberjack: fix write-URB completion race commit 985616f0457d9f555fff417d0da56174f70cc14f upstream. The write-URB busy flag was being cleared before the completion handler was done with the URB, something which could lead to corrupt transfers due to a racing write request if the URB is resubmitted. Fixes: 507ca9bc0476 ("[PATCH] USB: add ability for usb-serial drivers to determine if their write urb is currently being used.") Cc: stable # 2.6.13 Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/cyberjack.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c index ebd76ab07b72..36dd688b5795 100644 --- a/drivers/usb/serial/cyberjack.c +++ b/drivers/usb/serial/cyberjack.c @@ -357,11 +357,12 @@ static void cyberjack_write_bulk_callback(struct urb *urb) struct device *dev = &port->dev; int status = urb->status; unsigned long flags; + bool resubmitted = false; - set_bit(0, &port->write_urbs_free); if (status) { dev_dbg(dev, "%s - nonzero write bulk status received: %d\n", __func__, status); + set_bit(0, &port->write_urbs_free); return; } @@ -394,6 +395,8 @@ static void cyberjack_write_bulk_callback(struct urb *urb) goto exit; } + resubmitted = true; + dev_dbg(dev, "%s - priv->wrsent=%d\n", __func__, priv->wrsent); dev_dbg(dev, "%s - priv->wrfilled=%d\n", __func__, priv->wrfilled); @@ -410,6 +413,8 @@ static void cyberjack_write_bulk_callback(struct urb *urb) exit: spin_unlock_irqrestore(&priv->lock, flags); + if (!resubmitted) + set_bit(0, &port->write_urbs_free); usb_serial_port_softint(port); } From 3411d382f62132df82fb697c79ef3c4a7b9c741b Mon Sep 17 00:00:00 2001 From: Ziyi Cao Date: Tue, 20 Oct 2020 00:08:06 +0800 Subject: [PATCH 60/72] USB: serial: option: add Quectel EC200T module support commit a46b973bced1ba57420752bf38426acd9f6cbfa6 upstream. Add usb product id of the Quectel EC200T module. Signed-off-by: Ziyi Cao Link: https://lore.kernel.org/r/17f8a2a3-ce0f-4be7-8544-8fdf286907d0@www.fastmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index c773db129bf9..57f57ddd7af5 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -250,6 +250,7 @@ static void option_instat_callback(struct urb *urb); #define QUECTEL_PRODUCT_EP06 0x0306 #define QUECTEL_PRODUCT_EM12 0x0512 #define QUECTEL_PRODUCT_RM500Q 0x0800 +#define QUECTEL_PRODUCT_EC200T 0x6026 #define CMOTECH_VENDOR_ID 0x16d8 #define CMOTECH_PRODUCT_6001 0x6001 @@ -1117,6 +1118,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_RM500Q, 0xff, 0xff, 0x10), .driver_info = ZLP }, + { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC200T, 0xff, 0, 0) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, From 6ea688db659c49334256d2dc69dc45f1e93f3594 Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Sat, 31 Oct 2020 23:54:58 +0100 Subject: [PATCH 61/72] USB: serial: option: add LE910Cx compositions 0x1203, 0x1230, 0x1231 commit 489979b4aab490b6b917c11dc02d81b4b742784a upstream. Add following Telit LE910Cx compositions: 0x1203: rndis, tty, adb, tty, tty, tty, tty 0x1230: tty, adb, rmnet, audio, tty, tty, tty, tty 0x1231: rndis, tty, adb, audio, tty, tty, tty, tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20201031225458.10512-1-dnlplm@gmail.com [ johan: add comments after entries ] Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 57f57ddd7af5..0311c333a9b3 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1203,6 +1203,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1203, 0xff), /* Telit LE910Cx (RNDIS) */ + .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920), @@ -1217,6 +1219,10 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1213, 0xff) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE920A4_1214), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1230, 0xff), /* Telit LE910Cx (rmnet) */ + .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1231, 0xff), /* Telit LE910Cx (RNDIS) */ + .driver_info = NCTRL(2) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1260), .driver_info = NCTRL(0) | RSVD(1) | RSVD(2) }, { USB_DEVICE(TELIT_VENDOR_ID, 0x1261), From 59187e8bba015683ac00a1e10895a18443249cee Mon Sep 17 00:00:00 2001 From: Daniele Palmas Date: Tue, 3 Nov 2020 13:44:25 +0100 Subject: [PATCH 62/72] USB: serial: option: add Telit FN980 composition 0x1055 commit db0362eeb22992502764e825c79b922d7467e0eb upstream. Add the following Telit FN980 composition: 0x1055: tty, adb, tty, tty, tty, tty Signed-off-by: Daniele Palmas Link: https://lore.kernel.org/r/20201103124425.12940-1-dnlplm@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 0311c333a9b3..f28344f03141 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1191,6 +1191,8 @@ static const struct usb_device_id option_ids[] = { .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1054, 0xff), /* Telit FT980-KS */ .driver_info = NCTRL(2) | RSVD(3) }, + { USB_DEVICE_INTERFACE_CLASS(TELIT_VENDOR_ID, 0x1055, 0xff), /* Telit FN980 (PCIe) */ + .driver_info = NCTRL(0) | RSVD(1) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910), .driver_info = NCTRL(0) | RSVD(1) | RSVD(3) }, { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM), From 327a8935379f2e9b73b1c0f7345df321d6b42c88 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 2 Nov 2020 09:58:21 -0500 Subject: [PATCH 63/72] USB: Add NO_LPM quirk for Kingston flash drive commit afaa2e745a246c5ab95103a65b1ed00101e1bc63 upstream. In Bugzilla #208257, Julien Humbert reports that a 32-GB Kingston flash drive spontaneously disconnects and reconnects, over and over. Testing revealed that disabling Link Power Management for the drive fixed the problem. This patch adds a quirk entry for that drive to turn off LPM permanently. CC: Hans de Goede CC: Reported-and-tested-by: Julien Humbert Signed-off-by: Alan Stern Link: https://lore.kernel.org/r/20201102145821.GA1478741@rowland.harvard.edu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 4ee810531098..5ad14cdd9762 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -378,6 +378,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0926, 0x3333), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, + /* Kingston DataTraveler 3.0 */ + { USB_DEVICE(0x0951, 0x1666), .driver_info = USB_QUIRK_NO_LPM }, + /* X-Rite/Gretag-Macbeth Eye-One Pro display colorimeter */ { USB_DEVICE(0x0971, 0x2000), .driver_info = USB_QUIRK_NO_SET_INTF }, From 83a282f990ba94103eb49be1ad69a8e4b2de4fbd Mon Sep 17 00:00:00 2001 From: Macpaul Lin Date: Fri, 6 Nov 2020 13:54:29 +0800 Subject: [PATCH 64/72] usb: mtu3: fix panic in mtu3_gadget_stop() commit 20914919ad31849ee2b9cfe0428f4a20335c9e2a upstream. This patch fixes a possible issue when mtu3_gadget_stop() already assigned NULL to mtu->gadget_driver during mtu_gadget_disconnect(). [] notifier_call_chain+0xa4/0x128 [] __atomic_notifier_call_chain+0x84/0x138 [] notify_die+0xb0/0x120 [] die+0x1f8/0x5d0 [] __do_kernel_fault+0x19c/0x280 [] do_bad_area+0x44/0x140 [] do_translation_fault+0x4c/0x90 [] do_mem_abort+0xb8/0x258 [] el1_da+0x24/0x3c [] mtu3_gadget_disconnect+0xac/0x128 [] mtu3_irq+0x34c/0xc18 [] __handle_irq_event_percpu+0x2ac/0xcd0 [] handle_irq_event_percpu+0x80/0x138 [] handle_irq_event+0xac/0x148 [] handle_fasteoi_irq+0x234/0x568 [] generic_handle_irq+0x48/0x68 [] __handle_domain_irq+0x264/0x1740 [] gic_handle_irq+0x14c/0x250 [] el1_irq+0xec/0x194 [] dma_pool_alloc+0x6e4/0xae0 [] cmdq_mbox_pool_alloc_impl+0xb0/0x238 [] cmdq_pkt_alloc_buf+0x2dc/0x7c0 [] cmdq_pkt_add_cmd_buffer+0x178/0x270 [] cmdq_pkt_perf_begin+0x108/0x148 [] cmdq_pkt_create+0x178/0x1f0 [] mtk_crtc_config_default_path+0x328/0x7a0 [] mtk_drm_idlemgr_kick+0xa6c/0x1460 [] mtk_drm_crtc_atomic_begin+0x1a4/0x1a68 [] drm_atomic_helper_commit_planes+0x154/0x878 [] mtk_atomic_complete.isra.16+0xe80/0x19c8 [] mtk_atomic_commit+0x258/0x898 [] drm_atomic_commit+0xcc/0x108 [] drm_mode_atomic_ioctl+0x1c20/0x2580 [] drm_ioctl_kernel+0x118/0x1b0 [] drm_ioctl+0x5c0/0x920 [] do_vfs_ioctl+0x188/0x1820 [] SyS_ioctl+0x8c/0xa0 Fixes: df2069acb005 ("usb: Add MediaTek USB3 DRD driver") Signed-off-by: Macpaul Lin Acked-by: Chunfeng Yun Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1604642069-20961-1-git-send-email-macpaul.lin@mediatek.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/mtu3/mtu3_gadget.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/mtu3/mtu3_gadget.c b/drivers/usb/mtu3/mtu3_gadget.c index bbcd3332471d..3828ed6d38a2 100644 --- a/drivers/usb/mtu3/mtu3_gadget.c +++ b/drivers/usb/mtu3/mtu3_gadget.c @@ -573,6 +573,7 @@ static int mtu3_gadget_stop(struct usb_gadget *g) spin_unlock_irqrestore(&mtu->lock, flags); + synchronize_irq(mtu->irq); return 0; } From a7443bebdecaa474900ee5eec5da69de63686c24 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 27 Oct 2020 15:01:17 -0700 Subject: [PATCH 65/72] ARC: stack unwinding: avoid indefinite looping commit 328d2168ca524d501fc4b133d6be076142bd305c upstream. Currently stack unwinder is a while(1) loop which relies on the dwarf unwinder to signal termination, which in turn relies on dwarf info to do so. This in theory could cause an infinite loop if the dwarf info was somehow messed up or the register contents were etc. This fix thus detects the excessive looping and breaks the loop. | Mem: 26184K used, 1009136K free, 0K shrd, 0K buff, 14416K cached | CPU: 0.0% usr 72.8% sys 0.0% nic 27.1% idle 0.0% io 0.0% irq 0.0% sirq | Load average: 4.33 2.60 1.11 2/74 139 | PID PPID USER STAT VSZ %VSZ CPU %CPU COMMAND | 133 2 root SWN 0 0.0 3 22.9 [rcu_torture_rea] | 132 2 root SWN 0 0.0 0 22.0 [rcu_torture_rea] | 131 2 root SWN 0 0.0 3 21.5 [rcu_torture_rea] | 126 2 root RW 0 0.0 2 5.4 [rcu_torture_wri] | 129 2 root SWN 0 0.0 0 0.2 [rcu_torture_fak] | 137 2 root SW 0 0.0 0 0.2 [rcu_torture_cbf] | 127 2 root SWN 0 0.0 0 0.1 [rcu_torture_fak] | 138 115 root R 1464 0.1 2 0.1 top | 130 2 root SWN 0 0.0 0 0.1 [rcu_torture_fak] | 128 2 root SWN 0 0.0 0 0.1 [rcu_torture_fak] | 115 1 root S 1472 0.1 1 0.0 -/bin/sh | 104 1 root S 1464 0.1 0 0.0 inetd | 1 0 root S 1456 0.1 2 0.0 init | 78 1 root S 1456 0.1 0 0.0 syslogd -O /var/log/messages | 134 2 root SW 0 0.0 2 0.0 [rcu_torture_sta] | 10 2 root IW 0 0.0 1 0.0 [rcu_preempt] | 88 2 root IW 0 0.0 1 0.0 [kworker/1:1-eve] | 66 2 root IW 0 0.0 2 0.0 [kworker/2:2-eve] | 39 2 root IW 0 0.0 2 0.0 [kworker/2:1-eve] | unwinder looping too long, aborting ! Cc: Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/stacktrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arc/kernel/stacktrace.c b/arch/arc/kernel/stacktrace.c index bf40e06f3fb8..0fed32b95923 100644 --- a/arch/arc/kernel/stacktrace.c +++ b/arch/arc/kernel/stacktrace.c @@ -115,7 +115,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, int (*consumer_fn) (unsigned int, void *), void *arg) { #ifdef CONFIG_ARC_DW2_UNWIND - int ret = 0; + int ret = 0, cnt = 0; unsigned int address; struct unwind_frame_info frame_info; @@ -135,6 +135,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs, break; frame_info.regs.r63 = frame_info.regs.r31; + + if (cnt++ > 128) { + printk("unwinder looping too long, aborting !\n"); + return 0; + } } return address; /* return the last address it saw */ From 2590411220774522fe00c598e089cc11baebe26e Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 19 Oct 2020 19:19:57 -0700 Subject: [PATCH 66/72] Revert "ARC: entry: fix potential EFA clobber when TIF_SYSCALL_TRACE" This reverts commit 00fdec98d9881bf5173af09aebd353ab3b9ac729. (but only from 5.2 and prior kernels) The original commit was a preventive fix based on code-review and was auto-picked for stable back-port (for better or worse). It was OK for v5.3+ kernels, but turned up needing an implicit change 68e5c6f073bcf70 "(ARC: entry: EV_Trap expects r10 (vs. r9) to have exception cause)" merged in v5.3 which itself was not backported. So to summarize the stable backport of this patch for v5.2 and prior kernels is busted and it won't boot. The obvious solution is backport 68e5c6f073bcf70 but that is a pain as it doesn't revert cleanly and each of affected kernels (so far v4.19, v4.14, v4.9, v4.4) needs a slightly different massaged varaint. So the easier fix is to simply revert the backport from 5.2 and prior. The issue was not a big deal as it would cause strace to sporadically not work correctly. Waldemar Brodkorb first reported this when running ARC uClibc regressions on latest stable kernels (with offending backport). Once he bisected it, the analysis was trivial, so thx to him for this. Reported-by: Waldemar Brodkorb Bisected-by: Waldemar Brodkorb Cc: stable # 5.2 and prior Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/entry.S | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 705a68208423..85d9ea4a0acc 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -156,6 +156,7 @@ END(EV_Extension) tracesys: ; save EFA in case tracer wants the PC of traced task ; using ERET won't work since next-PC has already committed + lr r12, [efa] GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11 st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address @@ -198,9 +199,15 @@ tracesys_exit: ; Breakpoint TRAP ; --------------------------------------------- trap_with_param: - mov r0, r12 ; EFA in case ptracer/gdb wants stop_pc + + ; stop_pc info by gdb needs this info + lr r0, [efa] mov r1, sp + ; Now that we have read EFA, it is safe to do "fake" rtie + ; and get out of CPU exception mode + FAKE_RET_FROM_EXCPN + ; Save callee regs in case gdb wants to have a look ; SP will grow up by size of CALLEE Reg-File ; NOTE: clobbers r12 @@ -227,10 +234,6 @@ ENTRY(EV_Trap) EXCEPTION_PROLOGUE - lr r12, [efa] - - FAKE_RET_FROM_EXCPN - ;============ TRAP 1 :breakpoints ; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR) bmsk.f 0, r9, 7 @@ -238,6 +241,9 @@ ENTRY(EV_Trap) ;============ TRAP (no param): syscall top level + ; First return from Exception to pure K mode (Exception/IRQs renabled) + FAKE_RET_FROM_EXCPN + ; If syscall tracing ongoing, invoke pre-post-hooks GET_CURR_THR_INFO_FLAGS r10 btst r10, TIF_SYSCALL_TRACE From 292e5700f4db841593141e9a3b1210e94bc8f74f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 22 Oct 2020 17:38:22 +0200 Subject: [PATCH 67/72] PM: runtime: Resume the device earlier in __device_release_driver() commit 9226c504e364158a17a68ff1fe9d67d266922f50 upstream. Since the device is resumed from runtime-suspend in __device_release_driver() anyway, it is better to do that before looking for busy managed device links from it to consumers, because if there are any, device_links_unbind_consumers() will be called and it will cause the consumer devices' drivers to unbind, so the consumer devices will be runtime-resumed. In turn, resuming each consumer device will cause the supplier to be resumed and when the runtime PM references from the given consumer to it are dropped, it may be suspended. Then, the runtime-resume of the next consumer will cause the supplier to resume again and so on. Update the code accordingly. Signed-off-by: Rafael J. Wysocki Fixes: 9ed9895370ae ("driver core: Functional dependencies tracking support") Cc: All applicable # All applicable Tested-by: Xiang Chen Reviewed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/base/dd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index b3c569412f4e..4ba9231a6be8 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -931,6 +931,8 @@ static void __device_release_driver(struct device *dev, struct device *parent) drv = dev->driver; if (drv) { + pm_runtime_get_sync(dev); + while (device_links_busy(dev)) { device_unlock(dev); if (parent && dev->bus->need_parent_lock) @@ -946,11 +948,12 @@ static void __device_release_driver(struct device *dev, struct device *parent) * have released the driver successfully while this one * was waiting, so check for that. */ - if (dev->driver != drv) + if (dev->driver != drv) { + pm_runtime_put(dev); return; + } } - pm_runtime_get_sync(dev); pm_runtime_clean_up_links(dev); driver_sysfs_remove(dev); From 29a975bcc107d68e379a55048813ddf3e7b120b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kiyin=28=E5=B0=B9=E4=BA=AE=29?= Date: Wed, 4 Nov 2020 08:23:22 +0300 Subject: [PATCH 68/72] perf/core: Fix a memory leak in perf_event_parse_addr_filter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7bdb157cdebbf95a1cd94ed2e01b338714075d00 upstream. As shown through runtime testing, the "filename" allocation is not always freed in perf_event_parse_addr_filter(). There are three possible ways that this could happen: - It could be allocated twice on subsequent iterations through the loop, - or leaked on the success path, - or on the failure path. Clean up the code flow to make it obvious that 'filename' is always freed in the reallocation path and in the two return paths as well. We rely on the fact that kfree(NULL) is NOP and filename is initialized with NULL. This fixes the leak. No other side effects expected. [ Dan Carpenter: cleaned up the code flow & added a changelog. ] [ Ingo Molnar: updated the changelog some more. ] Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Signed-off-by: "kiyin(尹亮)" Signed-off-by: Dan Carpenter Signed-off-by: Ingo Molnar Cc: "Srivatsa S. Bhat" Cc: Anthony Liguori -- kernel/events/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index a35d742b0ba8..8b94eb6437c1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9047,6 +9047,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; + kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; @@ -9093,16 +9094,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ ret = -EOPNOTSUPP; if (!event->ctx->task) - goto fail_free_name; + goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) - goto fail_free_name; - - kfree(filename); - filename = NULL; + goto fail; ret = -EINVAL; if (!filter->path.dentry || @@ -9122,13 +9120,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_ACTION) goto fail; + kfree(filename); kfree(orig); return 0; -fail_free_name: - kfree(filename); fail: + kfree(filename); free_filters_list(filters); kfree(orig); From 3ab5872df9211447be82d0b01a2ec5731aa20621 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 7 Nov 2020 16:31:24 -0800 Subject: [PATCH 69/72] tools: perf: Fix build error in v4.19.y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit perf may fail to build in v4.19.y with the following error. util/evsel.c: In function ‘perf_evsel__exit’: util/util.h:25:28: error: passing argument 1 of ‘free’ discards ‘const’ qualifier from pointer target type This is observed (at least) with gcc v6.5.0. The underlying problem is the following statement. zfree(&evsel->pmu_name); evsel->pmu_name is decared 'const *'. zfree in turn is defined as #define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) and thus passes the const * to free(). The problem is not seen in the upstream kernel since zfree() has been rewritten there. The problem has been introduced into v4.19.y with the backport of upstream commit d4953f7ef1a2 (perf parse-events: Fix 3 use after frees found with clang ASAN). One possible fix of this problem would be to not declare pmu_name as const. This patch chooses to typecast the parameter of zfree() to void *, following the guidance from the upstream kernel which does the same since commit 7f7c536f23e6a ("tools lib: Adopt zalloc()/zfree() from tools/perf") Fixes: a0100a363098 ("perf parse-events: Fix 3 use after frees found with clang ASAN") Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- tools/perf/util/util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index dc58254a2b69..8c01b2cfdb1a 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -22,7 +22,7 @@ static inline void *zalloc(size_t size) return calloc(1, size); } -#define zfree(ptr) ({ free(*ptr); *ptr = NULL; }) +#define zfree(ptr) ({ free((void *)*ptr); *ptr = NULL; }) struct dirent; struct nsinfo; From 8069a08d35b72e16486a6a81eb06f964b2e1432b Mon Sep 17 00:00:00 2001 From: Xiaofei Shen Date: Fri, 29 Mar 2019 11:04:58 +0530 Subject: [PATCH 70/72] net: dsa: read mac address from DT for slave device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a2c7023f7075ca9b80f944d3f20f60e6574538e2 upstream. Before creating a slave netdevice, get the mac address from DTS and apply in case it is valid. Signed-off-by: Xiaofei Shen Signed-off-by: Vinod Koul Signed-off-by: David S. Miller Cc: Pali Rohár Signed-off-by: Greg Kroah-Hartman --- include/net/dsa.h | 1 + net/dsa/dsa2.c | 1 + net/dsa/slave.c | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/dsa.h b/include/net/dsa.h index 461e8a7661b7..d28df7adf948 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -196,6 +196,7 @@ struct dsa_port { unsigned int index; const char *name; const struct dsa_port *cpu_dp; + const char *mac; struct device_node *dn; unsigned int ageing_time; u8 stp_state; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index b036c55f5cb1..7c10bc4dacd3 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -261,6 +261,7 @@ static int dsa_port_setup(struct dsa_port *dp) int err = 0; memset(&dp->devlink_port, 0, sizeof(dp->devlink_port)); + dp->mac = of_get_mac_address(dp->dn); if (dp->type != DSA_PORT_TYPE_UNUSED) err = devlink_port_register(ds->devlink, &dp->devlink_port, diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 8ee28b6016d8..d03c67e761df 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1313,7 +1313,10 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->features = master->vlan_features | NETIF_F_HW_TC; slave_dev->hw_features |= NETIF_F_HW_TC; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; - eth_hw_addr_inherit(slave_dev, master); + if (port->mac && is_valid_ether_addr(port->mac)) + ether_addr_copy(slave_dev->dev_addr, port->mac); + else + eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; From 779d3e303977edbe43c4f0017d1553b4306d8d9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 7 Sep 2020 13:27:17 +0200 Subject: [PATCH 71/72] arm64: dts: marvell: espressobin: Add ethernet switch aliases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b64d814257b027e29a474bcd660f6372490138c7 upstream. Espressobin boards have 3 ethernet ports and some of them got assigned more then one MAC address. MAC addresses are stored in U-Boot environment. Since commit a2c7023f7075c ("net: dsa: read mac address from DT for slave device") kernel can use MAC addresses from DT for particular DSA port. Currently Espressobin DTS file contains alias just for ethernet0. This patch defines additional ethernet aliases in Espressobin DTS files, so bootloader can fill correct MAC address for DSA switch ports if more MAC addresses were specified. DT alias ethernet1 is used for wan port, DT aliases ethernet2 and ethernet3 are used for lan ports for both Espressobin revisions (V5 and V7). Fixes: 5253cb8c00a6f ("arm64: dts: marvell: espressobin: add ethernet alias") Cc: # a2c7023f7075c: dsa: read mac address Signed-off-by: Pali Rohár Reviewed-by: Andrew Lunn Reviewed-by: Andre Heider Signed-off-by: Gregory CLEMENT [pali: Backported Espressobin rev V5 changes to 5.4 and 4.19 versions] Signed-off-by: Greg Kroah-Hartman --- .../boot/dts/marvell/armada-3720-espressobin.dts | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts index 6cbdd66921aa..1a3e6e3b04eb 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-espressobin.dts @@ -21,6 +21,10 @@ aliases { ethernet0 = ð0; + /* for dsa slave device */ + ethernet1 = &switch0port1; + ethernet2 = &switch0port2; + ethernet3 = &switch0port3; serial0 = &uart0; serial1 = &uart1; }; @@ -136,25 +140,25 @@ #address-cells = <1>; #size-cells = <0>; - port@0 { + switch0port0: port@0 { reg = <0>; label = "cpu"; ethernet = <ð0>; }; - port@1 { + switch0port1: port@1 { reg = <1>; label = "wan"; phy-handle = <&switch0phy0>; }; - port@2 { + switch0port2: port@2 { reg = <2>; label = "lan0"; phy-handle = <&switch0phy1>; }; - port@3 { + switch0port3: port@3 { reg = <3>; label = "lan1"; phy-handle = <&switch0phy2>; From 53fff24aaf01dcb09cbfabbe060f42db8e61ab01 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 10 Nov 2020 12:36:02 +0100 Subject: [PATCH 72/72] Linux 4.19.156 Tested-by: Jon Hunter Tested-by: Pavel Machek (CIP) Tested-by: Guenter Roeck Tested-by: Shuah Khan Tested-by: Linux Kernel Functional Testing Link: https://lore.kernel.org/r/20201109125019.906191744@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9fc16d34e1bb..82891b34e19e 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 19 -SUBLEVEL = 155 +SUBLEVEL = 156 EXTRAVERSION = NAME = "People's Front"