From 8ecd8cff92a802193419a4b3821d1d27bf92f1f0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 19 Jul 2017 13:06:41 +0300 Subject: [PATCH 01/76] libata: array underflow in ata_find_dev() commit 59a5e266c3f5c1567508888dd61a45b86daed0fa upstream. My static checker complains that "devno" can be negative, meaning that we read before the start of the loop. I've looked at the code, and I think the warning is right. This come from /proc so it's root only or it would be quite a quite a serious bug. The call tree looks like this: proc_scsi_write() <- gets id and channel from simple_strtoul() -> scsi_add_single_device() <- calls shost->transportt->user_scan() -> ata_scsi_user_scan() -> ata_find_dev() Signed-off-by: Dan Carpenter Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-scsi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e417e1a1d02c..5b2aee83d776 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2832,10 +2832,12 @@ static unsigned int atapi_xlat(struct ata_queued_cmd *qc) static struct ata_device *ata_find_dev(struct ata_port *ap, int devno) { if (!sata_pmp_attached(ap)) { - if (likely(devno < ata_link_max_devices(&ap->link))) + if (likely(devno >= 0 && + devno < ata_link_max_devices(&ap->link))) return &ap->link.device[devno]; } else { - if (likely(devno < ap->nr_pmp_links)) + if (likely(devno >= 0 && + devno < ap->nr_pmp_links)) return &ap->pmp_link[devno].device[0]; } From c59eec4dad4a95f6da1b8ea688e361416869e42d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 18:41:52 -0400 Subject: [PATCH 02/76] workqueue: restore WQ_UNBOUND/max_active==1 to be ordered commit 5c0338c68706be53b3dc472e4308961c36e4ece1 upstream. The combination of WQ_UNBOUND and max_active == 1 used to imply ordered execution. After NUMA affinity 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues"), this is no longer true due to per-node worker pools. While the right way to create an ordered workqueue is alloc_ordered_workqueue(), the documentation has been misleading for a long time and people do use WQ_UNBOUND and max_active == 1 for ordered workqueues which can lead to subtle bugs which are very difficult to trigger. It's unlikely that we'd see noticeable performance impact by enforcing ordering on WQ_UNBOUND / max_active == 1 workqueues. Let's automatically set __WQ_ORDERED for those workqueues. Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Reported-by: Alexei Potashnik Fixes: 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues") Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2c2f971f3e75..137332379ed4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3834,6 +3834,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. + */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; From 458c8be4e2086680fde568f1d0eb26d2d530c481 Mon Sep 17 00:00:00 2001 From: "Sergei A. Trusov" Date: Wed, 2 Aug 2017 20:23:48 +1000 Subject: [PATCH 03/76] ALSA: hda - Fix speaker output from VAIO VPCL14M1R commit 3f3c371421e601fa93b6cb7fb52da9ad59ec90b4 upstream. Sony VAIO VPCL14M1R needs the quirk to make the speaker working properly. Tested-by: Dmitriy Signed-off-by: Sergei A. Trusov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 46a34039ecdc..5cab24f52825 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2233,6 +2233,7 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x8691, "ASUS ROG Ranger VIII", ALC882_FIXUP_GPIO3), SND_PCI_QUIRK(0x104d, 0x9047, "Sony Vaio TT", ALC889_FIXUP_VAIO_TT), SND_PCI_QUIRK(0x104d, 0x905a, "Sony Vaio Z", ALC882_FIXUP_NO_PRIMARY_HP), + SND_PCI_QUIRK(0x104d, 0x9060, "Sony Vaio VPCL14M1R", ALC882_FIXUP_NO_PRIMARY_HP), SND_PCI_QUIRK(0x104d, 0x9043, "Sony Vaio VGC-LN51JGB", ALC882_FIXUP_NO_PRIMARY_HP), SND_PCI_QUIRK(0x104d, 0x9044, "Sony VAIO AiO", ALC882_FIXUP_NO_PRIMARY_HP), From 7de8d0c06e566c8c2625577fc9edea2ddd0ca16f Mon Sep 17 00:00:00 2001 From: Banajit Goswami Date: Fri, 14 Jul 2017 23:15:05 -0700 Subject: [PATCH 04/76] ASoC: do not close shared backend dailink commit b1cd2e34c69a2f3988786af451b6e17967c293a0 upstream. Multiple frontend dailinks may be connected to a backend dailink at the same time. When one of frontend dailinks is closed, the associated backend dailink should not be closed if it is connected to other active frontend dailinks. Change ensures that backend dailink is closed only after all connected frontend dailinks are closed. Signed-off-by: Gopikrishnaiah Anandan Signed-off-by: Banajit Goswami Signed-off-by: Patrick Lai Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-pcm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c index a1e605bbc465..977066ba1769 100644 --- a/sound/soc/soc-pcm.c +++ b/sound/soc/soc-pcm.c @@ -181,6 +181,10 @@ int dpcm_dapm_stream_event(struct snd_soc_pcm_runtime *fe, int dir, dev_dbg(be->dev, "ASoC: BE %s event %d dir %d\n", be->dai_link->name, event, dir); + if ((event == SND_SOC_DAPM_STREAM_STOP) && + (be->dpcm[dir].users >= 1)) + continue; + snd_soc_dapm_stream_event(be, dir, event); } From bbccdb1e442be9b705564479d8b0c55b9b202dad Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 1 Aug 2017 05:20:03 -0700 Subject: [PATCH 05/76] KVM: async_pf: make rcu irq exit if not triggered from idle task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 337c017ccdf2653d0040099433fc1a2b1beb5926 upstream. WARNING: CPU: 5 PID: 1242 at kernel/rcu/tree_plugin.h:323 rcu_note_context_switch+0x207/0x6b0 CPU: 5 PID: 1242 Comm: unity-settings- Not tainted 4.13.0-rc2+ #1 RIP: 0010:rcu_note_context_switch+0x207/0x6b0 Call Trace: __schedule+0xda/0xba0 ? kvm_async_pf_task_wait+0x1b2/0x270 schedule+0x40/0x90 kvm_async_pf_task_wait+0x1cc/0x270 ? prepare_to_swait+0x22/0x70 do_async_page_fault+0x77/0xb0 ? do_async_page_fault+0x77/0xb0 async_page_fault+0x28/0x30 RIP: 0010:__d_lookup_rcu+0x90/0x1e0 I encounter this when trying to stress the async page fault in L1 guest w/ L2 guests running. Commit 9b132fbe5419 (Add rcu user eqs exception hooks for async page fault) adds rcu_irq_enter/exit() to kvm_async_pf_task_wait() to exit cpu idle eqs when needed, to protect the code that needs use rcu. However, we need to call the pair even if the function calls schedule(), as seen from the above backtrace. This patch fixes it by informing the RCU subsystem exit/enter the irq towards/away from idle for both n.halted and !n.halted. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Paul E. McKenney Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index cec49ecf5f31..32187f8a49b4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -151,6 +151,8 @@ void kvm_async_pf_task_wait(u32 token) if (hlist_unhashed(&n.link)) break; + rcu_irq_exit(); + if (!n.halted) { local_irq_enable(); schedule(); @@ -159,11 +161,11 @@ void kvm_async_pf_task_wait(u32 token) /* * We cannot reschedule. So halt. */ - rcu_irq_exit(); native_safe_halt(); local_irq_disable(); - rcu_irq_enter(); } + + rcu_irq_enter(); } if (!n.halted) finish_wait(&n.wq, &wait); From 12f60018f63b899b89299da435ca2de3bd47f2f2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Oct 2016 09:51:14 -0500 Subject: [PATCH 06/76] mm/page_alloc: Remove kernel address exposure in free_reserved_area() commit adb1fe9ae2ee6ef6bc10f3d5a588020e7664dfa7 upstream. Linus suggested we try to remove some of the low-hanging fruit related to kernel address exposure in dmesg. The only leaks I see on my local system are: Freeing SMP alternatives memory: 32K (ffffffff9e309000 - ffffffff9e311000) Freeing initrd memory: 10588K (ffffa0b736b42000 - ffffa0b737599000) Freeing unused kernel memory: 3592K (ffffffff9df87000 - ffffffff9e309000) Freeing unused kernel memory: 1352K (ffffa0b7288ae000 - ffffa0b728a00000) Freeing unused kernel memory: 632K (ffffa0b728d62000 - ffffa0b728e00000) Linus says: "I suspect we should just remove [the addresses in the 'Freeing' messages]. I'm sure they are useful in theory, but I suspect they were more useful back when the whole "free init memory" was originally done. These days, if we have a use-after-free, I suspect the init-mem situation is the easiest situation by far. Compared to all the dynamic allocations which are much more likely to show it anyway. So having debug output for that case is likely not all that productive." With this patch the freeing messages now look like this: Freeing SMP alternatives memory: 32K Freeing initrd memory: 10588K Freeing unused kernel memory: 3592K Freeing unused kernel memory: 1352K Freeing unused kernel memory: 632K Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6836ff90c45b71d38e5d4405aec56fa9e5d1d4b2.1477405374.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar Cc: Kees Cook Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd17a6bdf131..8b80115e4e9e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5847,8 +5847,8 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) } if (pages && s) - pr_info("Freeing %s memory: %ldK (%p - %p)\n", - s, pages << (PAGE_SHIFT - 10), start, end); + pr_info("Freeing %s memory: %ldK\n", + s, pages << (PAGE_SHIFT - 10)); return pages; } From bad9f6142ce1141b7e4969d898cf0e2d61e90d5b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 5 Aug 2017 17:43:24 -0400 Subject: [PATCH 07/76] ext4: fix SEEK_HOLE/SEEK_DATA for blocksize < pagesize commit fcf5ea10992fbac3c7473a1db33d56a139333cd1 upstream. ext4_find_unwritten_pgoff() does not properly handle a situation when starting index is in the middle of a page and blocksize < pagesize. The following command shows the bug on filesystem with 1k blocksize: xfs_io -f -c "falloc 0 4k" \ -c "pwrite 1k 1k" \ -c "pwrite 3k 1k" \ -c "seek -a -r 0" foo In this example, neither lseek(fd, 1024, SEEK_HOLE) nor lseek(fd, 2048, SEEK_DATA) will return the correct result. Fix the problem by neglecting buffers in a page before starting offset. Reported-by: Andreas Gruenbacher Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8772bfc3415b..45ef9975caec 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -500,6 +500,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, lastoff = page_offset(page); bh = head = page_buffers(page); do { + if (lastoff + bh->b_size <= startoff) + goto next; if (buffer_uptodate(bh) || buffer_unwritten(bh)) { if (whence == SEEK_DATA) @@ -514,6 +516,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unlock_page(page); goto out; } +next: lastoff += bh->b_size; bh = bh->b_this_page; } while (bh != head); From 31cd127ca69a1aedd24b9aa7ac806e6071c84655 Mon Sep 17 00:00:00 2001 From: Jerry Lee Date: Sun, 6 Aug 2017 01:18:31 -0400 Subject: [PATCH 08/76] ext4: fix overflow caused by missing cast in ext4_resize_fs() commit aec51758ce10a9c847a62a48a168f8c804c6e053 upstream. On a 32-bit platform, the value of n_blcoks_count may be wrong during the file system is resized to size larger than 2^32 blocks. This may caused the superblock being corrupted with zero blocks count. Fixes: 1c6bd7173d66 Signed-off-by: Jerry Lee Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 34038e3598d5..74516efd874c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1926,7 +1926,8 @@ retry: n_desc_blocks = o_desc_blocks + le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); - n_blocks_count = n_group * EXT4_BLOCKS_PER_GROUP(sb); + n_blocks_count = (ext4_fsblk_t)n_group * + EXT4_BLOCKS_PER_GROUP(sb); n_group--; /* set to last group number */ } From 3c2bf2bd88292521ececca2ff7576f6f6bb57ab6 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Wed, 12 Jul 2017 13:23:11 +0200 Subject: [PATCH 09/76] ARM: dts: armada-38x: Fix irq type for pca955 commit 8d4514173211586c6238629b1ef1e071927735f5 upstream. As written in the datasheet the PCA955 can only handle low level irq and not edge irq. Without this fix the interrupt is not usable for pca955: the gpio-pca953x driver already set the irq type as low level which is incompatible with edge type, then the kernel prevents using the interrupt: "irq: type mismatch, failed to map hwirq-18 for /soc/internal-regs/gpio@18100!" Fixes: 928413bd859c ("ARM: mvebu: Add Armada 388 General Purpose Development Board support") Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/armada-388-gp.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/armada-388-gp.dts b/arch/arm/boot/dts/armada-388-gp.dts index cd316021d6ce..6c1b45c1af66 100644 --- a/arch/arm/boot/dts/armada-388-gp.dts +++ b/arch/arm/boot/dts/armada-388-gp.dts @@ -89,7 +89,7 @@ pinctrl-names = "default"; pinctrl-0 = <&pca0_pins>; interrupt-parent = <&gpio0>; - interrupts = <18 IRQ_TYPE_EDGE_FALLING>; + interrupts = <18 IRQ_TYPE_LEVEL_LOW>; gpio-controller; #gpio-cells = <2>; interrupt-controller; @@ -101,7 +101,7 @@ compatible = "nxp,pca9555"; pinctrl-names = "default"; interrupt-parent = <&gpio0>; - interrupts = <18 IRQ_TYPE_EDGE_FALLING>; + interrupts = <18 IRQ_TYPE_LEVEL_LOW>; gpio-controller; #gpio-cells = <2>; interrupt-controller; From fa95dfc750fb2741737773f323e32d4c7d58f53e Mon Sep 17 00:00:00 2001 From: Prabhakar Lad Date: Thu, 20 Jul 2017 08:02:09 -0400 Subject: [PATCH 10/76] media: platform: davinci: return -EINVAL for VPFE_CMD_S_CCDC_RAW_PARAMS ioctl commit da05d52d2f0f6bd61094a0cd045fed94bf7d673a upstream. this patch makes sure VPFE_CMD_S_CCDC_RAW_PARAMS ioctl no longer works for vpfe_capture driver with a minimal patch suitable for backporting. - This ioctl was never in public api and was only defined in kernel header. - The function set_params constantly mixes up pointers and phys_addr_t numbers. - This is part of a 'VPFE_CMD_S_CCDC_RAW_PARAMS' ioctl command that is described as an 'experimental ioctl that will change in future kernels'. - The code to allocate the table never gets called after we copy_from_user the user input over the kernel settings, and then compare them for inequality. - We then go on to use an address provided by user space as both the __user pointer for input and pass it through phys_to_virt to come up with a kernel pointer to copy the data to. This looks like a trivially exploitable root hole. Due to these reasons we make sure this ioctl now returns -EINVAL and backport this patch as far as possible. Fixes: 5f15fbb68fd7 ("V4L/DVB (12251): v4l: dm644x ccdc module for vpfe capture driver") Signed-off-by: Lad, Prabhakar Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/davinci/vpfe_capture.c | 22 ++----------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/media/platform/davinci/vpfe_capture.c b/drivers/media/platform/davinci/vpfe_capture.c index 7767e072d623..1f656a3a84b9 100644 --- a/drivers/media/platform/davinci/vpfe_capture.c +++ b/drivers/media/platform/davinci/vpfe_capture.c @@ -1709,27 +1709,9 @@ static long vpfe_param_handler(struct file *file, void *priv, switch (cmd) { case VPFE_CMD_S_CCDC_RAW_PARAMS: + ret = -EINVAL; v4l2_warn(&vpfe_dev->v4l2_dev, - "VPFE_CMD_S_CCDC_RAW_PARAMS: experimental ioctl\n"); - if (ccdc_dev->hw_ops.set_params) { - ret = ccdc_dev->hw_ops.set_params(param); - if (ret) { - v4l2_dbg(1, debug, &vpfe_dev->v4l2_dev, - "Error setting parameters in CCDC\n"); - goto unlock_out; - } - ret = vpfe_get_ccdc_image_format(vpfe_dev, - &vpfe_dev->fmt); - if (ret < 0) { - v4l2_dbg(1, debug, &vpfe_dev->v4l2_dev, - "Invalid image format at CCDC\n"); - goto unlock_out; - } - } else { - ret = -EINVAL; - v4l2_dbg(1, debug, &vpfe_dev->v4l2_dev, - "VPFE_CMD_S_CCDC_RAW_PARAMS not supported\n"); - } + "VPFE_CMD_S_CCDC_RAW_PARAMS not supported\n"); break; default: ret = -ENOTTY; From 2bf7791c6a5ec21228383f5a1fb5c443c3336622 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Mon, 27 Mar 2017 16:12:43 -0700 Subject: [PATCH 11/76] target: Avoid mappedlun symlink creation during lun shutdown commit 49cb77e297dc611a1b795cfeb79452b3002bd331 upstream. This patch closes a race between se_lun deletion during configfs unlink in target_fabric_port_unlink() -> core_dev_del_lun() -> core_tpg_remove_lun(), when transport_clear_lun_ref() blocks waiting for percpu_ref RCU grace period to finish, but a new NodeACL mappedlun is added before the RCU grace period has completed. This can happen in target_fabric_mappedlun_link() because it only checks for se_lun->lun_se_dev, which is not cleared until after transport_clear_lun_ref() percpu_ref RCU grace period finishes. This bug originally manifested as NULL pointer dereference OOPsen in target_stat_scsi_att_intr_port_show_attr_dev() on v4.1.y code, because it dereferences lun->lun_se_dev without a explicit NULL pointer check. In post v4.1 code with target-core RCU conversion, the code in target_stat_scsi_att_intr_port_show_attr_dev() no longer uses se_lun->lun_se_dev, but the same race still exists. To address the bug, go ahead and set se_lun>lun_shutdown as early as possible in core_tpg_remove_lun(), and ensure new NodeACL mappedlun creation in target_fabric_mappedlun_link() fails during se_lun shutdown. Reported-by: James Shen Cc: James Shen Tested-by: James Shen Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/target_core_fabric_configfs.c | 5 +++++ drivers/target/target_core_tpg.c | 4 ++++ include/target/target_core_base.h | 1 + 3 files changed, 10 insertions(+) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index f916d18ccb48..b070ddf1dc37 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -92,6 +92,11 @@ static int target_fabric_mappedlun_link( pr_err("Source se_lun->lun_se_dev does not exist\n"); return -EINVAL; } + if (lun->lun_shutdown) { + pr_err("Unable to create mappedlun symlink because" + " lun->lun_shutdown=true\n"); + return -EINVAL; + } se_tpg = lun->lun_tpg; nacl_ci = &lun_acl_ci->ci_parent->ci_group->cg_item; diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 899c33b3c734..f69f4902dc07 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -673,6 +673,8 @@ void core_tpg_remove_lun( */ struct se_device *dev = rcu_dereference_raw(lun->lun_se_dev); + lun->lun_shutdown = true; + core_clear_lun_from_tpg(lun, tpg); /* * Wait for any active I/O references to percpu se_lun->lun_ref to @@ -694,6 +696,8 @@ void core_tpg_remove_lun( } if (!(dev->se_hba->hba_flags & HBA_FLAGS_INTERNAL_USE)) hlist_del_rcu(&lun->link); + + lun->lun_shutdown = false; mutex_unlock(&tpg->tpg_lun_mutex); percpu_ref_exit(&lun->lun_ref); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index ed66414b91f0..1adf8739980c 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -714,6 +714,7 @@ struct se_lun { #define SE_LUN_LINK_MAGIC 0xffff7771 u32 lun_link_magic; u32 lun_access; + bool lun_shutdown; u32 lun_index; /* RELATIVE TARGET PORT IDENTIFER */ From 119903dc36d3bbd602b96c562bca263a08dede91 Mon Sep 17 00:00:00 2001 From: Jiang Yi Date: Tue, 16 May 2017 17:57:55 +0800 Subject: [PATCH 12/76] iscsi-target: Always wait for kthread_should_stop() before kthread exit commit 5e0cf5e6c43b9e19fc0284f69e5cd2b4a47523b0 upstream. There are three timing problems in the kthread usages of iscsi_target_mod: - np_thread of struct iscsi_np - rx_thread and tx_thread of struct iscsi_conn In iscsit_close_connection(), it calls send_sig(SIGINT, conn->tx_thread, 1); kthread_stop(conn->tx_thread); In conn->tx_thread, which is iscsi_target_tx_thread(), when it receive SIGINT the kthread will exit without checking the return value of kthread_should_stop(). So if iscsi_target_tx_thread() exit right between send_sig(SIGINT...) and kthread_stop(...), the kthread_stop() will try to stop an already stopped kthread. This is invalid according to the documentation of kthread_stop(). (Fix -ECONNRESET logout handling in iscsi_target_tx_thread and early iscsi_target_rx_thread failure case - nab) Signed-off-by: Jiang Yi Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target.c | 28 ++++++++++++++++++----- drivers/target/iscsi/iscsi_target_erl0.c | 6 ++++- drivers/target/iscsi/iscsi_target_erl0.h | 2 +- drivers/target/iscsi/iscsi_target_login.c | 4 ++++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index a180c000e246..7b42d12081e2 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3965,6 +3965,8 @@ int iscsi_target_tx_thread(void *arg) { int ret = 0; struct iscsi_conn *conn = arg; + bool conn_freed = false; + /* * Allow ourselves to be interrupted by SIGINT so that a * connection recovery / failure event can be triggered externally. @@ -3990,12 +3992,14 @@ get_immediate: goto transport_err; ret = iscsit_handle_response_queue(conn); - if (ret == 1) + if (ret == 1) { goto get_immediate; - else if (ret == -ECONNRESET) + } else if (ret == -ECONNRESET) { + conn_freed = true; goto out; - else if (ret < 0) + } else if (ret < 0) { goto transport_err; + } } transport_err: @@ -4005,8 +4009,13 @@ transport_err: * responsible for cleaning up the early connection failure. */ if (conn->conn_state != TARG_CONN_STATE_IN_LOGIN) - iscsit_take_action_for_connection_exit(conn); + iscsit_take_action_for_connection_exit(conn, &conn_freed); out: + if (!conn_freed) { + while (!kthread_should_stop()) { + msleep(100); + } + } return 0; } @@ -4105,6 +4114,7 @@ int iscsi_target_rx_thread(void *arg) u32 checksum = 0, digest = 0; struct iscsi_conn *conn = arg; struct kvec iov; + bool conn_freed = false; /* * Allow ourselves to be interrupted by SIGINT so that a * connection recovery / failure event can be triggered externally. @@ -4116,7 +4126,7 @@ int iscsi_target_rx_thread(void *arg) */ rc = wait_for_completion_interruptible(&conn->rx_login_comp); if (rc < 0 || iscsi_target_check_conn_state(conn)) - return 0; + goto out; if (conn->conn_transport->transport_type == ISCSI_INFINIBAND) { struct completion comp; @@ -4201,7 +4211,13 @@ int iscsi_target_rx_thread(void *arg) transport_err: if (!signal_pending(current)) atomic_set(&conn->transport_failed, 1); - iscsit_take_action_for_connection_exit(conn); + iscsit_take_action_for_connection_exit(conn, &conn_freed); +out: + if (!conn_freed) { + while (!kthread_should_stop()) { + msleep(100); + } + } return 0; } diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index 210f6e4830e3..6c88fb021444 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -930,8 +930,10 @@ static void iscsit_handle_connection_cleanup(struct iscsi_conn *conn) } } -void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn) +void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn, bool *conn_freed) { + *conn_freed = false; + spin_lock_bh(&conn->state_lock); if (atomic_read(&conn->connection_exit)) { spin_unlock_bh(&conn->state_lock); @@ -942,6 +944,7 @@ void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn) if (conn->conn_state == TARG_CONN_STATE_IN_LOGOUT) { spin_unlock_bh(&conn->state_lock); iscsit_close_connection(conn); + *conn_freed = true; return; } @@ -955,4 +958,5 @@ void iscsit_take_action_for_connection_exit(struct iscsi_conn *conn) spin_unlock_bh(&conn->state_lock); iscsit_handle_connection_cleanup(conn); + *conn_freed = true; } diff --git a/drivers/target/iscsi/iscsi_target_erl0.h b/drivers/target/iscsi/iscsi_target_erl0.h index a9e2f9497fb2..fbc1d84a63c3 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.h +++ b/drivers/target/iscsi/iscsi_target_erl0.h @@ -9,6 +9,6 @@ extern int iscsit_stop_time2retain_timer(struct iscsi_session *); extern void iscsit_connection_reinstatement_rcfr(struct iscsi_conn *); extern void iscsit_cause_connection_reinstatement(struct iscsi_conn *, int); extern void iscsit_fall_back_to_erl0(struct iscsi_session *); -extern void iscsit_take_action_for_connection_exit(struct iscsi_conn *); +extern void iscsit_take_action_for_connection_exit(struct iscsi_conn *, bool *); #endif /*** ISCSI_TARGET_ERL0_H ***/ diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 4a137b0ae3dc..b19edffa7d98 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1436,5 +1436,9 @@ int iscsi_target_login_thread(void *arg) break; } + while (!kthread_should_stop()) { + msleep(100); + } + return 0; } From cf4ab9d57cbea15d4f06d76f03abe1e124f54866 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 27 Feb 2016 18:15:46 -0800 Subject: [PATCH 13/76] iscsi-target: Fix early sk_data_ready LOGIN_FLAGS_READY race commit 8f0dfb3d8b1120c61f6e2cc3729290db10772b2d upstream. There is a iscsi-target/tcp login race in LOGIN_FLAGS_READY state assignment that can result in frequent errors during iscsi discovery: "iSCSI Login negotiation failed." To address this bug, move the initial LOGIN_FLAGS_READY assignment ahead of iscsi_target_do_login() when handling the initial iscsi_target_start_negotiation() request PDU during connection login. As iscsi_target_do_login_rx() work_struct callback is clearing LOGIN_FLAGS_READ_ACTIVE after subsequent calls to iscsi_target_do_login(), the early sk_data_ready ahead of the first iscsi_target_do_login() expects LOGIN_FLAGS_READY to also be set for the initial login request PDU. As reported by Maged, this was first obsered using an MSFT initiator running across multiple VMWare host virtual machines with iscsi-target/tcp. Reported-by: Maged Mokhtar Tested-by: Maged Mokhtar Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target_nego.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index 549a2bbbf4df..11edf6de16e7 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -1248,16 +1248,16 @@ int iscsi_target_start_negotiation( { int ret; - ret = iscsi_target_do_login(conn, login); - if (!ret) { - if (conn->sock) { - struct sock *sk = conn->sock->sk; + if (conn->sock) { + struct sock *sk = conn->sock->sk; - write_lock_bh(&sk->sk_callback_lock); - set_bit(LOGIN_FLAGS_READY, &conn->login_flags); - write_unlock_bh(&sk->sk_callback_lock); - } - } else if (ret < 0) { + write_lock_bh(&sk->sk_callback_lock); + set_bit(LOGIN_FLAGS_READY, &conn->login_flags); + write_unlock_bh(&sk->sk_callback_lock); + } + + ret = iscsi_target_do_login(conn, login); + if (ret < 0) { cancel_delayed_work_sync(&conn->login_work); cancel_delayed_work_sync(&conn->login_cleanup_work); iscsi_target_restore_sock_callbacks(conn); From bf54cf1ede97f9236d231dc21107c8a2febb624a Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 24 May 2017 21:47:09 -0700 Subject: [PATCH 14/76] iscsi-target: Fix initial login PDU asynchronous socket close OOPs commit 25cdda95fda78d22d44157da15aa7ea34be3c804 upstream. This patch fixes a OOPs originally introduced by: commit bb048357dad6d604520c91586334c9c230366a14 Author: Nicholas Bellinger Date: Thu Sep 5 14:54:04 2013 -0700 iscsi-target: Add sk->sk_state_change to cleanup after TCP failure which would trigger a NULL pointer dereference when a TCP connection was closed asynchronously via iscsi_target_sk_state_change(), but only when the initial PDU processing in iscsi_target_do_login() from iscsi_np process context was blocked waiting for backend I/O to complete. To address this issue, this patch makes the following changes. First, it introduces some common helper functions used for checking socket closing state, checking login_flags, and atomically checking socket closing state + setting login_flags. Second, it introduces a LOGIN_FLAGS_INITIAL_PDU bit to know when a TCP connection has dropped via iscsi_target_sk_state_change(), but the initial PDU processing within iscsi_target_do_login() in iscsi_np context is still running. For this case, it sets LOGIN_FLAGS_CLOSED, but doesn't invoke schedule_delayed_work(). The original NULL pointer dereference case reported by MNC is now handled by iscsi_target_do_login() doing a iscsi_target_sk_check_close() before transitioning to FFP to determine when the socket has already closed, or iscsi_target_start_negotiation() if the login needs to exchange more PDUs (eg: iscsi_target_do_login returned 0) but the socket has closed. For both of these cases, the cleanup up of remaining connection resources will occur in iscsi_target_start_negotiation() from iscsi_np process context once the failure is detected. Finally, to handle to case where iscsi_target_sk_state_change() is called after the initial PDU procesing is complete, it now invokes conn->login_work -> iscsi_target_do_login_rx() to perform cleanup once existing iscsi_target_sk_check_close() checks detect connection failure. For this case, the cleanup of remaining connection resources will occur in iscsi_target_do_login_rx() from delayed workqueue process context once the failure is detected. Reported-by: Mike Christie Reviewed-by: Mike Christie Tested-by: Mike Christie Cc: Mike Christie Reported-by: Hannes Reinecke Cc: Hannes Reinecke Cc: Sagi Grimberg Cc: Varun Prakash Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target_nego.c | 206 +++++++++++++++-------- include/target/iscsi/iscsi_target_core.h | 1 + 2 files changed, 139 insertions(+), 68 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_nego.c b/drivers/target/iscsi/iscsi_target_nego.c index 11edf6de16e7..58c629aec73c 100644 --- a/drivers/target/iscsi/iscsi_target_nego.c +++ b/drivers/target/iscsi/iscsi_target_nego.c @@ -489,14 +489,60 @@ static void iscsi_target_restore_sock_callbacks(struct iscsi_conn *conn) static int iscsi_target_do_login(struct iscsi_conn *, struct iscsi_login *); -static bool iscsi_target_sk_state_check(struct sock *sk) +static bool __iscsi_target_sk_check_close(struct sock *sk) { if (sk->sk_state == TCP_CLOSE_WAIT || sk->sk_state == TCP_CLOSE) { - pr_debug("iscsi_target_sk_state_check: TCP_CLOSE_WAIT|TCP_CLOSE," + pr_debug("__iscsi_target_sk_check_close: TCP_CLOSE_WAIT|TCP_CLOSE," "returning FALSE\n"); - return false; + return true; } - return true; + return false; +} + +static bool iscsi_target_sk_check_close(struct iscsi_conn *conn) +{ + bool state = false; + + if (conn->sock) { + struct sock *sk = conn->sock->sk; + + read_lock_bh(&sk->sk_callback_lock); + state = (__iscsi_target_sk_check_close(sk) || + test_bit(LOGIN_FLAGS_CLOSED, &conn->login_flags)); + read_unlock_bh(&sk->sk_callback_lock); + } + return state; +} + +static bool iscsi_target_sk_check_flag(struct iscsi_conn *conn, unsigned int flag) +{ + bool state = false; + + if (conn->sock) { + struct sock *sk = conn->sock->sk; + + read_lock_bh(&sk->sk_callback_lock); + state = test_bit(flag, &conn->login_flags); + read_unlock_bh(&sk->sk_callback_lock); + } + return state; +} + +static bool iscsi_target_sk_check_and_clear(struct iscsi_conn *conn, unsigned int flag) +{ + bool state = false; + + if (conn->sock) { + struct sock *sk = conn->sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + state = (__iscsi_target_sk_check_close(sk) || + test_bit(LOGIN_FLAGS_CLOSED, &conn->login_flags)); + if (!state) + clear_bit(flag, &conn->login_flags); + write_unlock_bh(&sk->sk_callback_lock); + } + return state; } static void iscsi_target_login_drop(struct iscsi_conn *conn, struct iscsi_login *login) @@ -536,6 +582,20 @@ static void iscsi_target_do_login_rx(struct work_struct *work) pr_debug("entering iscsi_target_do_login_rx, conn: %p, %s:%d\n", conn, current->comm, current->pid); + /* + * If iscsi_target_do_login_rx() has been invoked by ->sk_data_ready() + * before initial PDU processing in iscsi_target_start_negotiation() + * has completed, go ahead and retry until it's cleared. + * + * Otherwise if the TCP connection drops while this is occuring, + * iscsi_target_start_negotiation() will detect the failure, call + * cancel_delayed_work_sync(&conn->login_work), and cleanup the + * remaining iscsi connection resources from iscsi_np process context. + */ + if (iscsi_target_sk_check_flag(conn, LOGIN_FLAGS_INITIAL_PDU)) { + schedule_delayed_work(&conn->login_work, msecs_to_jiffies(10)); + return; + } spin_lock(&tpg->tpg_state_lock); state = (tpg->tpg_state == TPG_STATE_ACTIVE); @@ -543,26 +603,12 @@ static void iscsi_target_do_login_rx(struct work_struct *work) if (!state) { pr_debug("iscsi_target_do_login_rx: tpg_state != TPG_STATE_ACTIVE\n"); - iscsi_target_restore_sock_callbacks(conn); - iscsi_target_login_drop(conn, login); - iscsit_deaccess_np(np, tpg, tpg_np); - return; + goto err; } - if (conn->sock) { - struct sock *sk = conn->sock->sk; - - read_lock_bh(&sk->sk_callback_lock); - state = iscsi_target_sk_state_check(sk); - read_unlock_bh(&sk->sk_callback_lock); - - if (!state) { - pr_debug("iscsi_target_do_login_rx, TCP state CLOSE\n"); - iscsi_target_restore_sock_callbacks(conn); - iscsi_target_login_drop(conn, login); - iscsit_deaccess_np(np, tpg, tpg_np); - return; - } + if (iscsi_target_sk_check_close(conn)) { + pr_debug("iscsi_target_do_login_rx, TCP state CLOSE\n"); + goto err; } conn->login_kworker = current; @@ -580,34 +626,29 @@ static void iscsi_target_do_login_rx(struct work_struct *work) flush_signals(current); conn->login_kworker = NULL; - if (rc < 0) { - iscsi_target_restore_sock_callbacks(conn); - iscsi_target_login_drop(conn, login); - iscsit_deaccess_np(np, tpg, tpg_np); - return; - } + if (rc < 0) + goto err; pr_debug("iscsi_target_do_login_rx after rx_login_io, %p, %s:%d\n", conn, current->comm, current->pid); rc = iscsi_target_do_login(conn, login); if (rc < 0) { - iscsi_target_restore_sock_callbacks(conn); - iscsi_target_login_drop(conn, login); - iscsit_deaccess_np(np, tpg, tpg_np); + goto err; } else if (!rc) { - if (conn->sock) { - struct sock *sk = conn->sock->sk; - - write_lock_bh(&sk->sk_callback_lock); - clear_bit(LOGIN_FLAGS_READ_ACTIVE, &conn->login_flags); - write_unlock_bh(&sk->sk_callback_lock); - } + if (iscsi_target_sk_check_and_clear(conn, LOGIN_FLAGS_READ_ACTIVE)) + goto err; } else if (rc == 1) { iscsi_target_nego_release(conn); iscsi_post_login_handler(np, conn, zero_tsih); iscsit_deaccess_np(np, tpg, tpg_np); } + return; + +err: + iscsi_target_restore_sock_callbacks(conn); + iscsi_target_login_drop(conn, login); + iscsit_deaccess_np(np, tpg, tpg_np); } static void iscsi_target_do_cleanup(struct work_struct *work) @@ -655,31 +696,54 @@ static void iscsi_target_sk_state_change(struct sock *sk) orig_state_change(sk); return; } + state = __iscsi_target_sk_check_close(sk); + pr_debug("__iscsi_target_sk_close_change: state: %d\n", state); + if (test_bit(LOGIN_FLAGS_READ_ACTIVE, &conn->login_flags)) { pr_debug("Got LOGIN_FLAGS_READ_ACTIVE=1 sk_state_change" " conn: %p\n", conn); + if (state) + set_bit(LOGIN_FLAGS_CLOSED, &conn->login_flags); write_unlock_bh(&sk->sk_callback_lock); orig_state_change(sk); return; } - if (test_and_set_bit(LOGIN_FLAGS_CLOSED, &conn->login_flags)) { + if (test_bit(LOGIN_FLAGS_CLOSED, &conn->login_flags)) { pr_debug("Got LOGIN_FLAGS_CLOSED=1 sk_state_change conn: %p\n", conn); write_unlock_bh(&sk->sk_callback_lock); orig_state_change(sk); return; } - - state = iscsi_target_sk_state_check(sk); - write_unlock_bh(&sk->sk_callback_lock); - - pr_debug("iscsi_target_sk_state_change: state: %d\n", state); - - if (!state) { + /* + * If the TCP connection has dropped, go ahead and set LOGIN_FLAGS_CLOSED, + * but only queue conn->login_work -> iscsi_target_do_login_rx() + * processing if LOGIN_FLAGS_INITIAL_PDU has already been cleared. + * + * When iscsi_target_do_login_rx() runs, iscsi_target_sk_check_close() + * will detect the dropped TCP connection from delayed workqueue context. + * + * If LOGIN_FLAGS_INITIAL_PDU is still set, which means the initial + * iscsi_target_start_negotiation() is running, iscsi_target_do_login() + * via iscsi_target_sk_check_close() or iscsi_target_start_negotiation() + * via iscsi_target_sk_check_and_clear() is responsible for detecting the + * dropped TCP connection in iscsi_np process context, and cleaning up + * the remaining iscsi connection resources. + */ + if (state) { pr_debug("iscsi_target_sk_state_change got failed state\n"); - schedule_delayed_work(&conn->login_cleanup_work, 0); + set_bit(LOGIN_FLAGS_CLOSED, &conn->login_flags); + state = test_bit(LOGIN_FLAGS_INITIAL_PDU, &conn->login_flags); + write_unlock_bh(&sk->sk_callback_lock); + + orig_state_change(sk); + + if (!state) + schedule_delayed_work(&conn->login_work, 0); return; } + write_unlock_bh(&sk->sk_callback_lock); + orig_state_change(sk); } @@ -944,6 +1008,15 @@ static int iscsi_target_do_login(struct iscsi_conn *conn, struct iscsi_login *lo if (iscsi_target_handle_csg_one(conn, login) < 0) return -1; if (login_rsp->flags & ISCSI_FLAG_LOGIN_TRANSIT) { + /* + * Check to make sure the TCP connection has not + * dropped asynchronously while session reinstatement + * was occuring in this kthread context, before + * transitioning to full feature phase operation. + */ + if (iscsi_target_sk_check_close(conn)) + return -1; + login->tsih = conn->sess->tsih; login->login_complete = 1; iscsi_target_restore_sock_callbacks(conn); @@ -970,21 +1043,6 @@ static int iscsi_target_do_login(struct iscsi_conn *conn, struct iscsi_login *lo break; } - if (conn->sock) { - struct sock *sk = conn->sock->sk; - bool state; - - read_lock_bh(&sk->sk_callback_lock); - state = iscsi_target_sk_state_check(sk); - read_unlock_bh(&sk->sk_callback_lock); - - if (!state) { - pr_debug("iscsi_target_do_login() failed state for" - " conn: %p\n", conn); - return -1; - } - } - return 0; } @@ -1251,13 +1309,25 @@ int iscsi_target_start_negotiation( if (conn->sock) { struct sock *sk = conn->sock->sk; - write_lock_bh(&sk->sk_callback_lock); - set_bit(LOGIN_FLAGS_READY, &conn->login_flags); - write_unlock_bh(&sk->sk_callback_lock); - } + write_lock_bh(&sk->sk_callback_lock); + set_bit(LOGIN_FLAGS_READY, &conn->login_flags); + set_bit(LOGIN_FLAGS_INITIAL_PDU, &conn->login_flags); + write_unlock_bh(&sk->sk_callback_lock); + } + /* + * If iscsi_target_do_login returns zero to signal more PDU + * exchanges are required to complete the login, go ahead and + * clear LOGIN_FLAGS_INITIAL_PDU but only if the TCP connection + * is still active. + * + * Otherwise if TCP connection dropped asynchronously, go ahead + * and perform connection cleanup now. + */ + ret = iscsi_target_do_login(conn, login); + if (!ret && iscsi_target_sk_check_and_clear(conn, LOGIN_FLAGS_INITIAL_PDU)) + ret = -1; - ret = iscsi_target_do_login(conn, login); - if (ret < 0) { + if (ret < 0) { cancel_delayed_work_sync(&conn->login_work); cancel_delayed_work_sync(&conn->login_cleanup_work); iscsi_target_restore_sock_callbacks(conn); diff --git a/include/target/iscsi/iscsi_target_core.h b/include/target/iscsi/iscsi_target_core.h index e0efe3fcf739..fdda45f26f75 100644 --- a/include/target/iscsi/iscsi_target_core.h +++ b/include/target/iscsi/iscsi_target_core.h @@ -562,6 +562,7 @@ struct iscsi_conn { #define LOGIN_FLAGS_READ_ACTIVE 1 #define LOGIN_FLAGS_CLOSED 2 #define LOGIN_FLAGS_READY 4 +#define LOGIN_FLAGS_INITIAL_PDU 8 unsigned long login_flags; struct delayed_work login_work; struct delayed_work login_cleanup_work; From ae059833646f75f5fa3955e369b32c9cf7eabafc Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 3 Jun 2017 05:35:47 -0700 Subject: [PATCH 15/76] iscsi-target: Fix delayed logout processing greater than SECONDS_FOR_LOGOUT_COMP commit 105fa2f44e504c830697b0c794822112d79808dc upstream. This patch fixes a BUG() in iscsit_close_session() that could be triggered when iscsit_logout_post_handler() execution from within tx thread context was not run for more than SECONDS_FOR_LOGOUT_COMP (15 seconds), and the TCP connection didn't already close before then forcing tx thread context to automatically exit. This would manifest itself during explicit logout as: [33206.974254] 1 connection(s) still exist for iSCSI session to iqn.1993-08.org.debian:01:3f5523242179 [33206.980184] INFO: NMI handler (kgdb_nmi_handler) took too long to run: 2100.772 msecs [33209.078643] ------------[ cut here ]------------ [33209.078646] kernel BUG at drivers/target/iscsi/iscsi_target.c:4346! Normally when explicit logout attempt fails, the tx thread context exits and iscsit_close_connection() from rx thread context does the extra cleanup once it detects conn->conn_logout_remove has not been cleared by the logout type specific post handlers. To address this special case, if the logout post handler in tx thread context detects conn->tx_thread_active has already been cleared, simply return and exit in order for existing iscsit_close_connection() logic from rx thread context do failed logout cleanup. Reported-by: Bart Van Assche Tested-by: Bart Van Assche Cc: Mike Christie Cc: Hannes Reinecke Cc: Sagi Grimberg Tested-by: Gary Guo Tested-by: Chu Yuan Lin Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/target/iscsi/iscsi_target.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 7b42d12081e2..31d5d9c0e10b 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -4591,8 +4591,11 @@ static void iscsit_logout_post_handler_closesession( * always sleep waiting for RX/TX thread shutdown to complete * within iscsit_close_connection(). */ - if (conn->conn_transport->transport_type == ISCSI_TCP) + if (conn->conn_transport->transport_type == ISCSI_TCP) { sleep = cmpxchg(&conn->tx_thread_active, true, false); + if (!sleep) + return; + } atomic_set(&conn->conn_logout_remove, 0); complete(&conn->conn_logout_comp); @@ -4608,8 +4611,11 @@ static void iscsit_logout_post_handler_samecid( { int sleep = 1; - if (conn->conn_transport->transport_type == ISCSI_TCP) + if (conn->conn_transport->transport_type == ISCSI_TCP) { sleep = cmpxchg(&conn->tx_thread_active, true, false); + if (!sleep) + return; + } atomic_set(&conn->conn_logout_remove, 0); complete(&conn->conn_logout_comp); From 9745cbec9c1ff34feaea602f02534a1f30e79e2a Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 29 Jun 2017 22:21:31 -0700 Subject: [PATCH 16/76] iser-target: Avoid isert_conn->cm_id dereference in isert_login_recv_done commit fce50a2fa4e9c6e103915c351b6d4a98661341d6 upstream. This patch fixes a NULL pointer dereference in isert_login_recv_done() of isert_conn->cm_id due to isert_cma_handler() -> isert_connect_error() resetting isert_conn->cm_id = NULL during a failed login attempt. As per Sagi, we will always see the completion of all recv wrs posted on the qp (given that we assigned a ->done handler), this is a FLUSH error completion, we just don't get to verify that because we deref NULL before. The issue here, was the assumption that dereferencing the connection cm_id is always safe, which is not true since: commit 4a579da2586bd3b79b025947ea24ede2bbfede62 Author: Sagi Grimberg Date: Sun Mar 29 15:52:04 2015 +0300 iser-target: Fix possible deadlock in RDMA_CM connection error As I see it, we have a direct reference to the isert_device from isert_conn which is the one-liner fix that we actually need like we do in isert_rdma_read_done() and isert_rdma_write_done(). Reported-by: Andrea Righi Tested-by: Andrea Righi Reviewed-by: Sagi Grimberg Signed-off-by: Nicholas Bellinger Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/isert/ib_isert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index b0edb66a291b..0b7f5a701c60 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -1581,7 +1581,7 @@ isert_rcv_completion(struct iser_rx_desc *desc, struct isert_conn *isert_conn, u32 xfer_len) { - struct ib_device *ib_dev = isert_conn->cm_id->device; + struct ib_device *ib_dev = isert_conn->device->ib_device; struct iscsi_hdr *hdr; u64 rx_dma; int rx_buflen; From f1181047ff29d4d4d364435040bd347eb54483ca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 2 Aug 2017 13:31:52 -0700 Subject: [PATCH 17/76] mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries commit 3ea277194daaeaa84ce75180ec7c7a2075027a68 upstream. Stable note for 4.4: The upstream patch patches madvise(MADV_FREE) but 4.4 does not have support for that feature. The changelog is left as-is but the hunk related to madvise is omitted from the backport. Nadav Amit identified a theoritical race between page reclaim and mprotect due to TLB flushes being batched outside of the PTL being held. He described the race as follows: CPU0 CPU1 ---- ---- user accesses memory using RW PTE [PTE now cached in TLB] try_to_unmap_one() ==> ptep_get_and_clear() ==> set_tlb_ubc_flush_pending() mprotect(addr, PROT_READ) ==> change_pte_range() ==> [ PTE non-present - no flush ] user writes using cached RW PTE ... try_to_unmap_flush() The same type of race exists for reads when protecting for PROT_NONE and also exists for operations that can leave an old TLB entry behind such as munmap, mremap and madvise. For some operations like mprotect, it's not necessarily a data integrity issue but it is a correctness issue as there is a window where an mprotect that limits access still allows access. For munmap, it's potentially a data integrity issue although the race is massive as an munmap, mmap and return to userspace must all complete between the window when reclaim drops the PTL and flushes the TLB. However, it's theoritically possible so handle this issue by flushing the mm if reclaim is potentially currently batching TLB flushes. Other instances where a flush is required for a present pte should be ok as either the page lock is held preventing parallel reclaim or a page reference count is elevated preventing a parallel free leading to corruption. In the case of page_mkclean there isn't an obvious path that userspace could take advantage of without using the operations that are guarded by this patch. Other users such as gup as a race with reclaim looks just at PTEs. huge page variants should be ok as they don't race with reclaim. mincore only looks at PTEs. userfault also should be ok as if a parallel reclaim takes place, it will either fault the page back in or read some of the data before the flush occurs triggering a fault. Note that a variant of this patch was acked by Andy Lutomirski but this was for the x86 parts on top of his PCID work which didn't make the 4.13 merge window as expected. His ack is dropped from this version and there will be a follow-on patch on top of PCID that will include his ack. [akpm@linux-foundation.org: tweak comments] [akpm@linux-foundation.org: fix spello] Link: http://lkml.kernel.org/r/20170717155523.emckq2esjro6hf3z@suse.de Reported-by: Nadav Amit Signed-off-by: Mel Gorman Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm_types.h | 4 ++++ mm/internal.h | 5 ++++- mm/memory.c | 1 + mm/mprotect.c | 1 + mm/mremap.c | 1 + mm/rmap.c | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2ccccbfcd532..36f4695aa604 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -503,6 +503,10 @@ struct mm_struct { * PROT_NONE or PROT_NUMA mapped page. */ bool tlb_flush_pending; +#endif +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + /* See flush_tlb_batched_pending() */ + bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_X86_INTEL_MPX diff --git a/mm/internal.h b/mm/internal.h index 6979b2bd3227..f63f4393d633 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -453,6 +453,7 @@ struct tlbflush_unmap_batch; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); void try_to_unmap_flush_dirty(void); +void flush_tlb_batched_pending(struct mm_struct *mm); #else static inline void try_to_unmap_flush(void) { @@ -460,6 +461,8 @@ static inline void try_to_unmap_flush(void) static inline void try_to_unmap_flush_dirty(void) { } - +static inline void flush_tlb_batched_pending(struct mm_struct *mm) +{ +} #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory.c b/mm/memory.c index e6fa13484447..9ac55172aa7b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1127,6 +1127,7 @@ again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8eaab00..c0b4b2a49462 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -72,6 +72,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!pte) return 0; + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { oldpte = *pte; diff --git a/mm/mremap.c b/mm/mremap.c index c25bc6268e46..fe7b7f65f4f4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -135,6 +135,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, diff --git a/mm/rmap.c b/mm/rmap.c index b577fbb98d4b..ede183c32f45 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -648,6 +648,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); tlb_ubc->flush_required = true; + /* + * Ensure compiler does not re-order the setting of tlb_flush_batched + * before the PTE is cleared. + */ + barrier(); + mm->tlb_flush_batched = true; + /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() @@ -675,6 +682,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) return should_defer; } + +/* + * Reclaim unmaps pages under the PTL but do not flush the TLB prior to + * releasing the PTL if TLB flushes are batched. It's possible for a parallel + * operation such as mprotect or munmap to race between reclaim unmapping + * the page and flushing the page. If this race occurs, it potentially allows + * access to data via a stale TLB entry. Tracking all mm's that have TLB + * batching in flight would be expensive during reclaim so instead track + * whether TLB batching occurred in the past and if so then do a flush here + * if required. This will cost one additional flush per reclaim cycle paid + * by the first operation at risk such as mprotect and mumap. + * + * This must be called under the PTL so that an access to tlb_flush_batched + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise + * via the PTL. + */ +void flush_tlb_batched_pending(struct mm_struct *mm) +{ + if (mm->tlb_flush_batched) { + flush_tlb_mm(mm); + + /* + * Do not allow the compiler to re-order the clearing of + * tlb_flush_batched before the tlb is flushed. + */ + barrier(); + mm->tlb_flush_batched = false; + } +} #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, struct page *page, bool writable) From c7c6f63dbc676788f0cb5a804dff57a0639af446 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Fri, 7 Jul 2017 18:49:18 -0300 Subject: [PATCH 18/76] media: lirc: LIRC_GET_REC_RESOLUTION should return microseconds commit 9f5039ba440e499d85c29b1ddbc3cbc9dc90e44b upstream. Since commit e8f4818895b3 ("[media] lirc: advertise LIRC_CAN_GET_REC_RESOLUTION and improve") lircd uses the ioctl LIRC_GET_REC_RESOLUTION to determine the shortest pulse or space that the hardware can detect. This breaks decoding in lirc because lircd expects the answer in microseconds, but nanoseconds is returned. Reported-by: Derek Tested-by: Derek Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/rc/ir-lirc-codec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/rc/ir-lirc-codec.c b/drivers/media/rc/ir-lirc-codec.c index a32659fcd266..efc21b1da211 100644 --- a/drivers/media/rc/ir-lirc-codec.c +++ b/drivers/media/rc/ir-lirc-codec.c @@ -254,7 +254,7 @@ static long ir_lirc_ioctl(struct file *filep, unsigned int cmd, return 0; case LIRC_GET_REC_RESOLUTION: - val = dev->rx_resolution; + val = dev->rx_resolution / 1000; break; case LIRC_SET_WIDEBAND_RECEIVER: From 76517ed2a7fe059840c1984165984e69cb488568 Mon Sep 17 00:00:00 2001 From: Jin Qian Date: Mon, 15 May 2017 10:45:08 -0700 Subject: [PATCH 19/76] f2fs: sanity check checkpoint segno and blkoff commit 15d3042a937c13f5d9244241c7a9c8416ff6e82a upstream. Make sure segno and blkoff read from raw image are valid. Cc: stable@vger.kernel.org Signed-off-by: Jin Qian [Jaegeuk Kim: adjust minor coding style] Signed-off-by: Jaegeuk Kim [AmitP: Found in Android Security bulletin for Aug'17, fixes CVE-2017-10663] Signed-off-by: Amit Pundir Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/super.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 86e1cb899957..4f666368aa85 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1078,6 +1078,8 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1089,6 +1091,20 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From e0f39019aa3fabfe0c0775f2a1f9bd66aceb55d0 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 8 Aug 2017 09:26:41 -0700 Subject: [PATCH 20/76] drm: rcar-du: fix backport bug In the backport of commit 4f7b0d263833 ("drm: rcar-du: Simplify and fix probe error handling"), which is commit 8255d26322a3 in this tree, the error handling path was incorrect. This patch fixes it up. Reported-by: Ben Hutchings Cc: Laurent Pinchart Cc: thongsyho Cc: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_drv.c b/drivers/gpu/drm/rcar-du/rcar_du_drv.c index bf4674aa6405..bb9cd35d7fdf 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_drv.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_drv.c @@ -296,7 +296,7 @@ static int rcar_du_probe(struct platform_device *pdev) mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); rcdu->mmio = devm_ioremap_resource(&pdev->dev, mem); if (IS_ERR(rcdu->mmio)) - ret = PTR_ERR(rcdu->mmio); + return PTR_ERR(rcdu->mmio); /* DRM/KMS objects */ ddev = drm_dev_alloc(&rcar_du_driver, &pdev->dev); From deae9956cfb42f3d480c71f416f934d3ce84c033 Mon Sep 17 00:00:00 2001 From: Steven Toth Date: Tue, 6 Jun 2017 09:30:27 -0300 Subject: [PATCH 21/76] saa7164: fix double fetch PCIe access condition commit 6fb05e0dd32e566facb96ea61a48c7488daa5ac3 upstream. Avoid a double fetch by reusing the values from the prior transfer. Originally reported via https://bugzilla.kernel.org/show_bug.cgi?id=195559 Thanks to Pengfei Wang for reporting. Signed-off-by: Steven Toth Reported-by: Pengfei Wang Signed-off-by: Mauro Carvalho Chehab Cc: Eduardo Valentin Signed-off-by: Greg Kroah-Hartman --- drivers/media/pci/saa7164/saa7164-bus.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/media/pci/saa7164/saa7164-bus.c b/drivers/media/pci/saa7164/saa7164-bus.c index a18fe5d47238..b4857cd7069e 100644 --- a/drivers/media/pci/saa7164/saa7164-bus.c +++ b/drivers/media/pci/saa7164/saa7164-bus.c @@ -393,11 +393,11 @@ int saa7164_bus_get(struct saa7164_dev *dev, struct tmComResInfo* msg, msg_tmp.size = le16_to_cpu((__force __le16)msg_tmp.size); msg_tmp.command = le32_to_cpu((__force __le32)msg_tmp.command); msg_tmp.controlselector = le16_to_cpu((__force __le16)msg_tmp.controlselector); + memcpy(msg, &msg_tmp, sizeof(*msg)); /* No need to update the read positions, because this was a peek */ /* If the caller specifically want to peek, return */ if (peekonly) { - memcpy(msg, &msg_tmp, sizeof(*msg)); goto peekout; } @@ -442,21 +442,15 @@ int saa7164_bus_get(struct saa7164_dev *dev, struct tmComResInfo* msg, space_rem = bus->m_dwSizeGetRing - curr_grp; if (space_rem < sizeof(*msg)) { - /* msg wraps around the ring */ - memcpy_fromio(msg, bus->m_pdwGetRing + curr_grp, space_rem); - memcpy_fromio((u8 *)msg + space_rem, bus->m_pdwGetRing, - sizeof(*msg) - space_rem); if (buf) memcpy_fromio(buf, bus->m_pdwGetRing + sizeof(*msg) - space_rem, buf_size); } else if (space_rem == sizeof(*msg)) { - memcpy_fromio(msg, bus->m_pdwGetRing + curr_grp, sizeof(*msg)); if (buf) memcpy_fromio(buf, bus->m_pdwGetRing, buf_size); } else { /* Additional data wraps around the ring */ - memcpy_fromio(msg, bus->m_pdwGetRing + curr_grp, sizeof(*msg)); if (buf) { memcpy_fromio(buf, bus->m_pdwGetRing + curr_grp + sizeof(*msg), space_rem - sizeof(*msg)); @@ -469,15 +463,10 @@ int saa7164_bus_get(struct saa7164_dev *dev, struct tmComResInfo* msg, } else { /* No wrapping */ - memcpy_fromio(msg, bus->m_pdwGetRing + curr_grp, sizeof(*msg)); if (buf) memcpy_fromio(buf, bus->m_pdwGetRing + curr_grp + sizeof(*msg), buf_size); } - /* Convert from little endian to CPU */ - msg->size = le16_to_cpu((__force __le16)msg->size); - msg->command = le32_to_cpu((__force __le32)msg->command); - msg->controlselector = le16_to_cpu((__force __le16)msg->controlselector); /* Update the read positions, adjusting the ring */ saa7164_writel(bus->m_dwGetReadPos, new_grp); From c10e874bc3a504b3ced84616b52886388f09e0b7 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 17 Jul 2017 12:35:58 +0200 Subject: [PATCH 22/76] ipv4: ipv6: initialize treq->txhash in cookie_v[46]_check() [ Upstream commit 18bcf2907df935981266532e1e0d052aff2e6fae ] KMSAN reported use of uninitialized memory in skb_set_hash_from_sk(), which originated from the TCP request socket created in cookie_v6_check(): ================================================================== BUG: KMSAN: use of uninitialized memory in tcp_transmit_skb+0xf77/0x3ec0 CPU: 1 PID: 2949 Comm: syz-execprog Not tainted 4.11.0-rc5+ #2931 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 TCP: request_sock_TCPv6: Possible SYN flooding on port 20028. Sending cookies. Check SNMP counters. Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x172/0x1c0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:927 __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:469 skb_set_hash_from_sk ./include/net/sock.h:2011 tcp_transmit_skb+0xf77/0x3ec0 net/ipv4/tcp_output.c:983 tcp_send_ack+0x75b/0x830 net/ipv4/tcp_output.c:3493 tcp_delack_timer_handler+0x9a6/0xb90 net/ipv4/tcp_timer.c:284 tcp_delack_timer+0x1b0/0x310 net/ipv4/tcp_timer.c:309 call_timer_fn+0x240/0x520 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 __run_timers+0xc13/0xf10 kernel/time/timer.c:1601 run_timer_softirq+0x36/0xa0 kernel/time/timer.c:1614 __do_softirq+0x485/0x942 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 irq_exit+0x1fa/0x230 kernel/softirq.c:405 exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:657 smp_apic_timer_interrupt+0x5a/0x80 arch/x86/kernel/apic/apic.c:966 apic_timer_interrupt+0x86/0x90 arch/x86/entry/entry_64.S:489 RIP: 0010:native_restore_fl ./arch/x86/include/asm/irqflags.h:36 RIP: 0010:arch_local_irq_restore ./arch/x86/include/asm/irqflags.h:77 RIP: 0010:__msan_poison_alloca+0xed/0x120 mm/kmsan/kmsan_instr.c:440 RSP: 0018:ffff880024917cd8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10 RAX: 0000000000000246 RBX: ffff8800224c0000 RCX: 0000000000000005 RDX: 0000000000000004 RSI: ffff880000000000 RDI: ffffea0000b6d770 RBP: ffff880024917d58 R08: 0000000000000dd8 R09: 0000000000000004 R10: 0000160000000000 R11: 0000000000000000 R12: ffffffff85abf810 R13: ffff880024917dd8 R14: 0000000000000010 R15: ffffffff81cabde4 poll_select_copy_remaining+0xac/0x6b0 fs/select.c:293 SYSC_select+0x4b4/0x4e0 fs/select.c:653 SyS_select+0x76/0xa0 fs/select.c:634 entry_SYSCALL_64_fastpath+0x13/0x94 arch/x86/entry/entry_64.S:204 RIP: 0033:0x4597e7 RSP: 002b:000000c420037ee0 EFLAGS: 00000246 ORIG_RAX: 0000000000000017 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004597e7 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 000000c420037ef0 R08: 000000c420037ee0 R09: 0000000000000059 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000042dc20 R13: 00000000000000f3 R14: 0000000000000030 R15: 0000000000000003 chained origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_save_stack mm/kmsan/kmsan.c:317 kmsan_internal_chain_origin+0x12a/0x1f0 mm/kmsan/kmsan.c:547 __msan_store_shadow_origin_4+0xac/0x110 mm/kmsan/kmsan_instr.c:259 tcp_create_openreq_child+0x709/0x1ae0 net/ipv4/tcp_minisocks.c:472 tcp_v6_syn_recv_sock+0x7eb/0x2a30 net/ipv6/tcp_ipv6.c:1103 tcp_get_cookie_sock+0x136/0x5f0 net/ipv4/syncookies.c:212 cookie_v6_check+0x17a9/0x1b50 net/ipv6/syncookies.c:245 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989 tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298 tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:198 kmsan_kmalloc+0x7f/0xe0 mm/kmsan/kmsan.c:337 kmem_cache_alloc+0x1c2/0x1e0 mm/slub.c:2766 reqsk_alloc ./include/net/request_sock.h:87 inet_reqsk_alloc+0xa4/0x5b0 net/ipv4/tcp_input.c:6200 cookie_v6_check+0x4f4/0x1b50 net/ipv6/syncookies.c:169 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989 tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298 tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 ================================================================== Similar error is reported for cookie_v4_check(). Fixes: 58d607d3e52f ("tcp: provide skb->hash to synack packets") Signed-off-by: Alexander Potapenko Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/syncookies.c | 1 + net/ipv6/syncookies.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4cbe9f0a4281..731b91409625 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -337,6 +337,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; + treq->txhash = net_tx_rndhash(); req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index eaf7ac496d50..aee87282d352 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -210,6 +210,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->snt_synack.v64 = 0; treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; + treq->txhash = net_tx_rndhash(); /* * We need to lookup the dst_entry to get the correct window size. From e9b2f4617a4e631eae1717a6264cbb3e405f377f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 19 Jul 2017 13:33:24 -0700 Subject: [PATCH 23/76] net: Zero terminate ifr_name in dev_ifname(). [ Upstream commit 63679112c536289826fec61c917621de95ba2ade ] The ifr.ifr_name is passed around and assumed to be NULL terminated. Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev_ioctl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index b94b1d293506..151e047ce072 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -28,6 +28,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); if (error) From f09db7559c1faf6f7c2128a92050b978a480681c Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 19 Jul 2017 22:28:55 +0200 Subject: [PATCH 24/76] ipv6: avoid overflow of offset in ip6_find_1stfragopt [ Upstream commit 6399f1fae4ec29fab5ec76070435555e256ca3a6 ] In some cases, offset can overflow and can cause an infinite loop in ip6_find_1stfragopt(). Make it unsigned int to prevent the overflow, and cap it at IPV6_MAXPLEN, since packets larger than that should be invalid. This problem has been here since before the beginning of git history. Signed-off-by: Sabrina Dubroca Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/output_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 8b56c5240429..f9f02581c4ca 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { - u16 offset = sizeof(struct ipv6hdr); + unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; @@ -86,6 +86,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; + unsigned int len; switch (**nexthdr) { @@ -111,7 +112,10 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); - offset += ipv6_optlen(exthdr); + len = ipv6_optlen(exthdr); + if (len + offset >= IPV6_MAXPLEN) + return -EINVAL; + offset += len; *nexthdr = &exthdr->nexthdr; } From 31afa8b5edf9fc81c7df7aa903b7ef7b110bccab Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 19 Jul 2017 15:41:33 -0700 Subject: [PATCH 25/76] ipv4: initialize fib_trie prior to register_netdev_notifier call. [ Upstream commit 8799a221f5944a7d74516ecf46d58c28ec1d1f75 ] Net stack initialization currently initializes fib-trie after the first call to netdevice_notifier() call. In fact fib_trie initialization needs to happen before first rtnl_register(). It does not cause any problem since there are no devices UP at this moment, but trying to bring 'lo' UP at initialization would make this assumption wrong and exposes the issue. Fixes following crash Call Trace: ? alternate_node_alloc+0x76/0xa0 fib_table_insert+0x1b7/0x4b0 fib_magic.isra.17+0xea/0x120 fib_add_ifaddr+0x7b/0x190 fib_netdev_event+0xc0/0x130 register_netdevice_notifier+0x1c1/0x1d0 ip_fib_init+0x72/0x85 ip_rt_init+0x187/0x1e9 ip_init+0xe/0x1a inet_init+0x171/0x26c ? ipv4_offload_init+0x66/0x66 do_one_initcall+0x43/0x160 kernel_init_freeable+0x191/0x219 ? rest_init+0x80/0x80 kernel_init+0xe/0x150 ret_from_fork+0x22/0x30 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: ffff9b1500017c28 CR2: 0000000000000014 Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle multiple namespaces.") Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization") Signed-off-by: Mahesh Bandewar Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/fib_frontend.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 66dcb529fd9c..0cb240c749bf 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1319,13 +1319,14 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + fib_trie_init(); register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_trie_init(); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); } From d0594690a9ee8f74fcda37aa085087acf40e594f Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 20 Jul 2017 11:27:57 -0700 Subject: [PATCH 26/76] rtnetlink: allocate more memory for dev_set_mac_address() [ Upstream commit 153711f9421be5dbc973dc57a4109dc9d54c89b1 ] virtnet_set_mac_address() interprets mac address as struct sockaddr, but upper layer only allocates dev->addr_len which is ETH_ALEN + sizeof(sa_family_t) in this case. We lack a unified definition for mac address, so just fix the upper layer, this also allows drivers to interpret it to struct sockaddr freely. Reported-by: David Ahern Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/rtnetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2ec5324a7ff7..5b3d611d8b5f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1742,7 +1742,8 @@ static int do_setlink(const struct sk_buff *skb, struct sockaddr *sa; int len; - len = sizeof(sa_family_t) + dev->addr_len; + len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, + sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; From 6d1e34ee278f1a9cc7a24cc5025d8e93ab01965c Mon Sep 17 00:00:00 2001 From: Thomas Jarosch Date: Sat, 22 Jul 2017 17:14:34 +0200 Subject: [PATCH 27/76] mcs7780: Fix initialization when CONFIG_VMAP_STACK is enabled [ Upstream commit 9476d393667968b4a02afbe9d35a3558482b943e ] DMA transfers are not allowed to buffers that are on the stack. Therefore allocate a buffer to store the result of usb_control_message(). Fixes these bugreports: https://bugzilla.kernel.org/show_bug.cgi?id=195217 https://bugzilla.redhat.com/show_bug.cgi?id=1421387 https://bugzilla.redhat.com/show_bug.cgi?id=1427398 Shortened kernel backtrace from 4.11.9-200.fc25.x86_64: kernel: ------------[ cut here ]------------ kernel: WARNING: CPU: 3 PID: 2957 at drivers/usb/core/hcd.c:1587 kernel: transfer buffer not dma capable kernel: Call Trace: kernel: dump_stack+0x63/0x86 kernel: __warn+0xcb/0xf0 kernel: warn_slowpath_fmt+0x5a/0x80 kernel: usb_hcd_map_urb_for_dma+0x37f/0x570 kernel: ? try_to_del_timer_sync+0x53/0x80 kernel: usb_hcd_submit_urb+0x34e/0xb90 kernel: ? schedule_timeout+0x17e/0x300 kernel: ? del_timer_sync+0x50/0x50 kernel: ? __slab_free+0xa9/0x300 kernel: usb_submit_urb+0x2f4/0x560 kernel: ? urb_destroy+0x24/0x30 kernel: usb_start_wait_urb+0x6e/0x170 kernel: usb_control_msg+0xdc/0x120 kernel: mcs_get_reg+0x36/0x40 [mcs7780] kernel: mcs_net_open+0xb5/0x5c0 [mcs7780] ... Regression goes back to 4.9, so it's a good candidate for -stable. Though it's the decision of the maintainer. Thanks to Dan Williams for adding the "transfer buffer not dma capable" warning in the first place. It instantly pointed me in the right direction. Patch has been tested with transferring data from a Polar watch. Signed-off-by: Thomas Jarosch Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/irda/mcs7780.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/net/irda/mcs7780.c b/drivers/net/irda/mcs7780.c index bca6a1e72d1d..e1bb802d4a4d 100644 --- a/drivers/net/irda/mcs7780.c +++ b/drivers/net/irda/mcs7780.c @@ -141,9 +141,19 @@ static int mcs_set_reg(struct mcs_cb *mcs, __u16 reg, __u16 val) static int mcs_get_reg(struct mcs_cb *mcs, __u16 reg, __u16 * val) { struct usb_device *dev = mcs->usbdev; - int ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, - MCS_RD_RTYPE, 0, reg, val, 2, - msecs_to_jiffies(MCS_CTRL_TIMEOUT)); + void *dmabuf; + int ret; + + dmabuf = kmalloc(sizeof(__u16), GFP_KERNEL); + if (!dmabuf) + return -ENOMEM; + + ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, + MCS_RD_RTYPE, 0, reg, dmabuf, 2, + msecs_to_jiffies(MCS_CTRL_TIMEOUT)); + + memcpy(val, dmabuf, sizeof(__u16)); + kfree(dmabuf); return ret; } From 23f787ceebf8139cf886d270c3b780581355ed5b Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 23 Jul 2017 17:52:23 +0800 Subject: [PATCH 28/76] openvswitch: fix potential out of bound access in parse_ct [ Upstream commit 69ec932e364b1ba9c3a2085fe96b76c8a3f71e7c ] Before the 'type' is validated, we shouldn't use it to fetch the ovs_ct_attr_lens's minlen and maxlen, else, out of bound access may happen. Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") Signed-off-by: Liping Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/openvswitch/conntrack.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index ad58d2a6284e..6a2507f24b0f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -577,8 +577,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, nla_for_each_nested(a, attr, rem) { int type = nla_type(a); - int maxlen = ovs_ct_attr_lens[type].maxlen; - int minlen = ovs_ct_attr_lens[type].minlen; + int maxlen; + int minlen; if (type > OVS_CT_ATTR_MAX) { OVS_NLERR(log, @@ -586,6 +586,9 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, type, OVS_CT_ATTR_MAX); return -EINVAL; } + + maxlen = ovs_ct_attr_lens[type].maxlen; + minlen = ovs_ct_attr_lens[type].minlen; if (nla_len(a) < minlen || nla_len(a) > maxlen) { OVS_NLERR(log, "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", From 499338964af84436c0306e061c7b0212a181fccb Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 24 Jul 2017 10:07:32 -0700 Subject: [PATCH 29/76] packet: fix use-after-free in prb_retire_rx_blk_timer_expired() [ Upstream commit c800aaf8d869f2b9b47b10c5c312fe19f0a94042 ] There are multiple reports showing we have a use-after-free in the timer prb_retire_rx_blk_timer_expired(), where we use struct tpacket_kbdq_core::pkbdq, a pg_vec, after it gets freed by free_pg_vec(). The interesting part is it is not freed via packet_release() but via packet_setsockopt(), which means we are not closing the socket. Looking into the big and fat function packet_set_ring(), this could happen if we satisfy the following conditions: 1. closing == 0, not on packet_release() path 2. req->tp_block_nr == 0, we don't allocate a new pg_vec 3. rx_ring->pg_vec is already set as V3, which means we already called packet_set_ring() wtih req->tp_block_nr > 0 previously 4. req->tp_frame_nr == 0, pass sanity check 5. po->mapped == 0, never called mmap() In this scenario we are clearing the old rx_ring->pg_vec, so we need to free this pg_vec, but we don't stop the timer on this path because of closing==0. The timer has to be stopped as long as we need to free pg_vec, therefore the check on closing!=0 is wrong, we should check pg_vec!=NULL instead. Thanks to liujian for testing different fixes. Reported-by: alexander.levin@verizon.com Reported-by: Dave Jones Reported-by: liujian (CE) Tested-by: liujian (CE) Cc: Ding Tianhong Cc: Willem de Bruijn Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index f8d6a0ca9c03..061771ca2582 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4225,7 +4225,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, register_prot_hook(sk); } spin_unlock(&po->bind_lock); - if (closing && (po->tp_version > TPACKET_V2)) { + if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); From d1ed1f8ae4871b22ab472dba22d7df96511227ee Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Mon, 24 Jul 2017 23:14:28 +0200 Subject: [PATCH 30/76] ipv6: Don't increase IPSTATS_MIB_FRAGFAILS twice in ip6_fragment() [ Upstream commit afce615aaabfbaad02550e75c0bec106dafa1adf ] RFC 2465 defines ipv6IfStatsOutFragFails as: "The number of IPv6 datagrams that have been discarded because they needed to be fragmented at this output interface but could not be." The existing implementation, instead, would increase the counter twice in case we fail to allocate room for single fragments: once for the fragment, once for the datagram. This didn't look intentional though. In one of the two affected affected failure paths, the double increase was simply a result of a new 'goto fail' statement, introduced to avoid a skb leak. The other path appears to be affected since at least 2.6.12-rc2. Reported-by: Sabrina Dubroca Fixes: 1d325d217c7f ("ipv6: ip6_fragment: fix headroom tests and skb leak") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 150b4923fb72..a7a5790e28e4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -647,8 +647,6 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, *prevhdr = NEXTHDR_FRAGMENT; tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); if (!tmp_hdr) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } @@ -767,8 +765,6 @@ slow_path: frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + hroom + troom, GFP_ATOMIC); if (!frag) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); err = -ENOMEM; goto fail; } From 91c5aa7ea0737d374601687fec02ddd25aefd6e8 Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Tue, 25 Jul 2017 14:35:03 +0200 Subject: [PATCH 31/76] net: ethernet: nb8800: Handle all 4 RGMII modes identically [ Upstream commit 4813497b537c6208c90d6cbecac5072d347de900 ] Before commit bf8f6952a233 ("Add blurb about RGMII") it was unclear whose responsibility it was to insert the required clock skew, and in hindsight, some PHY drivers got it wrong. The solution forward is to introduce a new property, explicitly requiring skew from the node to which it is attached. In the interim, this driver will handle all 4 RGMII modes identically (no skew). Fixes: 52dfc8301248 ("net: ethernet: add driver for Aurora VLSI NB8800 Ethernet controller") Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/aurora/nb8800.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/aurora/nb8800.c b/drivers/net/ethernet/aurora/nb8800.c index ecc4a334c507..0a54e7dac0ab 100644 --- a/drivers/net/ethernet/aurora/nb8800.c +++ b/drivers/net/ethernet/aurora/nb8800.c @@ -608,7 +608,7 @@ static void nb8800_mac_config(struct net_device *dev) mac_mode |= HALF_DUPLEX; if (gigabit) { - if (priv->phy_mode == PHY_INTERFACE_MODE_RGMII) + if (phy_interface_is_rgmii(dev->phydev)) mac_mode |= RGMII_MODE; mac_mode |= GMAC_MODE; @@ -1295,11 +1295,10 @@ static int nb8800_tangox_init(struct net_device *dev) break; case PHY_INTERFACE_MODE_RGMII: - pad_mode = PAD_MODE_RGMII; - break; - + case PHY_INTERFACE_MODE_RGMII_ID: + case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_TXID: - pad_mode = PAD_MODE_RGMII | PAD_MODE_GTX_CLK_DELAY; + pad_mode = PAD_MODE_RGMII; break; default: From c3278ed37d3e34178e578914dd45b2a0df79b5b8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:19:09 +0800 Subject: [PATCH 32/76] dccp: fix a memleak that dccp_ipv6 doesn't put reqsk properly [ Upstream commit 0c2232b0a71db0ac1d22f751aa1ac0cadb950fd2 ] In dccp_v6_conn_request, after reqsk gets alloced and hashed into ehash table, reqsk's refcnt is set 3. one is for req->rsk_timer, one is for hlist, and the other one is for current using. The problem is when dccp_v6_conn_request returns and finishes using reqsk, it doesn't put reqsk. This will cause reqsk refcnt leaks and reqsk obj never gets freed. Jianlin found this issue when running dccp_memleak.c in a loop, the system memory would run out. dccp_memleak.c: int s1 = socket(PF_INET6, 6, IPPROTO_IP); bind(s1, &sa1, 0x20); listen(s1, 0x9); int s2 = socket(PF_INET6, 6, IPPROTO_IP); connect(s2, &sa1, 0x20); close(s1); close(s2); This patch is to put the reqsk before dccp_v6_conn_request returns, just as what tcp_conn_request does. Reported-by: Jianlin Shi Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/ipv6.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 3470ad1843bb..09a9ab65f4e1 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -376,6 +376,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: From adcc87858e0915554b2484b8a4657bb49dbe0843 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:19:46 +0800 Subject: [PATCH 33/76] dccp: fix a memleak that dccp_ipv4 doesn't put reqsk properly [ Upstream commit b7953d3c0e30a5fc944f6b7bd0bcceb0794bcd85 ] The patch "dccp: fix a memleak that dccp_ipv6 doesn't put reqsk properly" fixed reqsk refcnt leak for dccp_ipv6. The same issue exists on dccp_ipv4. This patch is to fix it for dccp_ipv4. Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/ipv4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6467bf392e1b..e217f17997a4 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -635,6 +635,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + reqsk_put(req); return 0; drop_and_free: From dd4edbcbb760ba024dc25017937592bd56994801 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 14:20:15 +0800 Subject: [PATCH 34/76] dccp: fix a memleak for dccp_feat_init err process [ Upstream commit e90ce2fc27cad7e7b1e72b9e66201a7a4c124c2b ] In dccp_feat_init, when ccid_get_builtin_ccids failsto alloc memory for rx.val, it should free tx.val before returning an error. Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/feat.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 1704948e6a12..f227f002c73d 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1471,9 +1471,12 @@ int dccp_feat_init(struct sock *sk) * singleton values (which always leads to failure). * These settings can still (later) be overridden via sockopts. */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len) || - ccid_get_builtin_ccids(&rx.val, &rx.len)) + if (ccid_get_builtin_ccids(&tx.val, &tx.len)) return -ENOBUFS; + if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { + kfree(tx.val); + return -ENOBUFS; + } if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) From 2bac20a4ae9ce90e187ce15cca51c242ee5b2ca3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 14 Jul 2017 18:32:45 +0200 Subject: [PATCH 35/76] sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}() [ Upstream commit b1f5bfc27a19f214006b9b4db7b9126df2dfdf5a ] If the length field of the iterator (|pos.p| or |err|) is past the end of the chunk, we shouldn't access it. This bug has been detected by KMSAN. For the following pair of system calls: socket(PF_INET6, SOCK_STREAM, 0x84 /* IPPROTO_??? */) = 3 sendto(3, "A", 1, MSG_OOB, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = 1 the tool has reported a use of uninitialized memory: ================================================================== BUG: KMSAN: use of uninitialized memory in sctp_rcv+0x17b8/0x43b0 CPU: 1 PID: 2940 Comm: probe Not tainted 4.11.0-rc5+ #2926 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x172/0x1c0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:927 __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:469 __sctp_rcv_init_lookup net/sctp/input.c:1074 __sctp_rcv_lookup_harder net/sctp/input.c:1233 __sctp_rcv_lookup net/sctp/input.c:1255 sctp_rcv+0x17b8/0x43b0 net/sctp/input.c:170 sctp6_rcv+0x32/0x70 net/sctp/ipv6.c:984 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 do_softirq kernel/softirq.c:328 __local_bh_enable_ip+0x25b/0x290 kernel/softirq.c:181 local_bh_enable+0x37/0x40 ./include/linux/bottom_half.h:31 rcu_read_unlock_bh ./include/linux/rcupdate.h:931 ip6_finish_output2+0x19b2/0x1cf0 net/ipv6/ip6_output.c:124 ip6_finish_output+0x764/0x970 net/ipv6/ip6_output.c:149 NF_HOOK_COND ./include/linux/netfilter.h:246 ip6_output+0x456/0x520 net/ipv6/ip6_output.c:163 dst_output ./include/net/dst.h:486 NF_HOOK ./include/linux/netfilter.h:257 ip6_xmit+0x1841/0x1c00 net/ipv6/ip6_output.c:261 sctp_v6_xmit+0x3b7/0x470 net/sctp/ipv6.c:225 sctp_packet_transmit+0x38cb/0x3a20 net/sctp/output.c:632 sctp_outq_flush+0xeb3/0x46e0 net/sctp/outqueue.c:885 sctp_outq_uncork+0xb2/0xd0 net/sctp/outqueue.c:750 sctp_side_effects net/sctp/sm_sideeffect.c:1773 sctp_do_sm+0x6962/0x6ec0 net/sctp/sm_sideeffect.c:1147 sctp_primitive_ASSOCIATE+0x12c/0x160 net/sctp/primitive.c:88 sctp_sendmsg+0x43e5/0x4f90 net/sctp/socket.c:1954 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 do_syscall_64+0xe6/0x130 arch/x86/entry/common.c:285 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246 RIP: 0033:0x401133 RSP: 002b:00007fff6d99cd38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000401133 RDX: 0000000000000001 RSI: 0000000000494088 RDI: 0000000000000003 RBP: 00007fff6d99cd90 R08: 00007fff6d99cd50 R09: 000000000000001c R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004063d0 R14: 0000000000406460 R15: 0000000000000000 origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:198 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:211 slab_alloc_node mm/slub.c:2743 __kmalloc_node_track_caller+0x200/0x360 mm/slub.c:4351 __kmalloc_reserve net/core/skbuff.c:138 __alloc_skb+0x26b/0x840 net/core/skbuff.c:231 alloc_skb ./include/linux/skbuff.h:933 sctp_packet_transmit+0x31e/0x3a20 net/sctp/output.c:570 sctp_outq_flush+0xeb3/0x46e0 net/sctp/outqueue.c:885 sctp_outq_uncork+0xb2/0xd0 net/sctp/outqueue.c:750 sctp_side_effects net/sctp/sm_sideeffect.c:1773 sctp_do_sm+0x6962/0x6ec0 net/sctp/sm_sideeffect.c:1147 sctp_primitive_ASSOCIATE+0x12c/0x160 net/sctp/primitive.c:88 sctp_sendmsg+0x43e5/0x4f90 net/sctp/socket.c:1954 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 sock_sendmsg net/socket.c:643 SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 do_syscall_64+0xe6/0x130 arch/x86/entry/common.c:285 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246 ================================================================== Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/sctp/sctp.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index ce13cf20f625..48e14dbd2d23 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -444,6 +444,8 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) #define _sctp_walk_params(pos, chunk, end, member)\ for (pos.v = chunk->member;\ + (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <\ + (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\ pos.v += WORD_ROUND(ntohs(pos.p->length))) @@ -454,6 +456,8 @@ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(sctp_chunkhdr_t));\ + ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <\ + (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ err = (sctp_errhdr_t *)((void *)err + WORD_ROUND(ntohs(err->length)))) From de6669607a4554d0a4af2cb2d9d08738d3d043a8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 26 Jul 2017 16:24:59 +0800 Subject: [PATCH 36/76] sctp: fix the check for _sctp_walk_params and _sctp_walk_errors [ Upstream commit 6b84202c946cd3da3a8daa92c682510e9ed80321 ] Commit b1f5bfc27a19 ("sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}()") tried to fix the issue that it may overstep the chunk end for _sctp_walk_{params, errors} with 'chunk_end > offset(length) + sizeof(length)'. But it introduced a side effect: When processing INIT, it verifies the chunks with 'param.v == chunk_end' after iterating all params by sctp_walk_params(). With the check 'chunk_end > offset(length) + sizeof(length)', it would return when the last param is not yet accessed. Because the last param usually is fwdtsn supported param whose size is 4 and 'chunk_end == offset(length) + sizeof(length)' This is a badly issue even causing sctp couldn't process 4-shakes. Client would always get abort when connecting to server, due to the failure of INIT chunk verification on server. The patch is to use 'chunk_end <= offset(length) + sizeof(length)' instead of 'chunk_end < offset(length) + sizeof(length)' for both _sctp_walk_params and _sctp_walk_errors. Fixes: b1f5bfc27a19 ("sctp: don't dereference ptr before leaving _sctp_walk_{params, errors}()") Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/sctp/sctp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 48e14dbd2d23..d33b17ba51d2 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -444,7 +444,7 @@ _sctp_walk_params((pos), (chunk), ntohs((chunk)->chunk_hdr.length), member) #define _sctp_walk_params(pos, chunk, end, member)\ for (pos.v = chunk->member;\ - (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <\ + (pos.v + offsetof(struct sctp_paramhdr, length) + sizeof(pos.p->length) <=\ (void *)chunk + end) &&\ pos.v <= (void *)chunk + end - ntohs(pos.p->length) &&\ ntohs(pos.p->length) >= sizeof(sctp_paramhdr_t);\ @@ -456,7 +456,7 @@ _sctp_walk_errors((err), (chunk_hdr), ntohs((chunk_hdr)->length)) #define _sctp_walk_errors(err, chunk_hdr, end)\ for (err = (sctp_errhdr_t *)((void *)chunk_hdr + \ sizeof(sctp_chunkhdr_t));\ - ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <\ + ((void *)err + offsetof(sctp_errhdr_t, length) + sizeof(err->length) <=\ (void *)chunk_hdr + end) &&\ (void *)err <= (void *)chunk_hdr + end - ntohs(err->length) &&\ ntohs(err->length) >= sizeof(sctp_errhdr_t); \ From dc4132791a50b7e556d0efd6bc85508751bedaa6 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 25 Jun 2017 18:45:32 +0300 Subject: [PATCH 37/76] net/mlx5: Fix command bad flow on command entry allocation failure [ Upstream commit 219c81f7d1d5a89656cb3b53d3b4e11e93608d80 ] When driver fail to allocate an entry to send command to FW, it must notify the calling function and release the memory allocated for this command. Fixes: e126ba97dba9e ('mlx5: Add driver for Mellanox Connect-IB adapters') Signed-off-by: Moshe Shemesh Cc: kernel-team@fb.com Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index cc199063612a..6c66d2979795 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -630,6 +630,10 @@ static void dump_command(struct mlx5_core_dev *dev, pr_debug("\n"); } +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg); +static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev, + struct mlx5_cmd_msg *msg); + static void cmd_work_handler(struct work_struct *work) { struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work); @@ -638,16 +642,27 @@ static void cmd_work_handler(struct work_struct *work) struct mlx5_cmd_layout *lay; struct semaphore *sem; unsigned long flags; + int alloc_ret; sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem; down(sem); if (!ent->page_queue) { - ent->idx = alloc_ent(cmd); - if (ent->idx < 0) { + alloc_ret = alloc_ent(cmd); + if (alloc_ret < 0) { + if (ent->callback) { + ent->callback(-EAGAIN, ent->context); + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + free_cmd(ent); + } else { + ent->ret = -EAGAIN; + complete(&ent->done); + } mlx5_core_err(dev, "failed to allocate command entry\n"); up(sem); return; } + ent->idx = alloc_ret; } else { ent->idx = cmd->max_reg_cmds; spin_lock_irqsave(&cmd->alloc_lock, flags); From a8f1b40bb2da3819cda93a037124b34f56f5d5fa Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 28 Jul 2017 11:58:36 -0700 Subject: [PATCH 38/76] net: phy: Correctly process PHY_HALTED in phy_stop_machine() [ Upstream commit 7ad813f208533cebfcc32d3d7474dc1677d1b09a ] Marc reported that he was not getting the PHY library adjust_link() callback function to run when calling phy_stop() + phy_disconnect() which does not indeed happen because we set the state machine to PHY_HALTED but we don't get to run it to process this state past that point. Fix this with a synchronous call to phy_state_machine() in order to have the state machine actually act on PHY_HALTED, set the PHY device's link down, turn the network device's carrier off and finally call the adjust_link() function. Reported-by: Marc Gonzalez Fixes: a390d1f379cf ("phylib: convert state_queue work to delayed_work") Signed-off-by: Florian Fainelli Signed-off-by: Marc Gonzalez Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 851c0e121807..29825838d1a7 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -541,6 +541,9 @@ void phy_stop_machine(struct phy_device *phydev) if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) phydev->state = PHY_UP; mutex_unlock(&phydev->lock); + + /* Now we can run the state machine synchronously */ + phy_state_machine(&phydev->state_queue.work); } /** From 2933fb22de6dc58ec1b8f0b1c8569d3699e5d7ea Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 17 Feb 2017 16:07:33 -0800 Subject: [PATCH 39/76] net: phy: Fix PHY unbind crash commit 7b9a88a390dacb37b051a7b09b9a08f546edf5eb upstream. The PHY library does not deal very well with bind and unbind events. The first thing we would see is that we were not properly canceling the PHY state machine workqueue, so we would be crashing while dereferencing phydev->drv since there is no driver attached anymore. Suggested-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 1d1e5f7723ab..8179727d3423 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -1368,6 +1368,8 @@ static int phy_remove(struct device *dev) { struct phy_device *phydev = to_phy_device(dev); + cancel_delayed_work_sync(&phydev->state_queue); + mutex_lock(&phydev->lock); phydev->state = PHY_DOWN; mutex_unlock(&phydev->lock); From 7c37101cd650d12bf124398eb26ba7eefb235029 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 21 Jun 2017 10:21:22 +0100 Subject: [PATCH 40/76] xen-netback: correctly schedule rate-limited queues [ Upstream commit dfa523ae9f2542bee4cddaea37b3be3e157f6e6b ] Add a flag to indicate if a queue is rate-limited. Test the flag in NAPI poll handler and avoid rescheduling the queue if true, otherwise we risk locking up the host. The rescheduling will be done in the timer callback function. Reported-by: Jean-Louis Dupond Signed-off-by: Wei Liu Tested-by: Jean-Louis Dupond Reviewed-by: Paul Durrant Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/xen-netback/common.h | 1 + drivers/net/xen-netback/interface.c | 6 +++++- drivers/net/xen-netback/netback.c | 6 +++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 0333ab0fd926..34173b5e886f 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -201,6 +201,7 @@ struct xenvif_queue { /* Per-queue data for xenvif */ unsigned long remaining_credit; struct timer_list credit_timeout; u64 credit_window_start; + bool rate_limited; /* Statistics */ struct xenvif_stats stats; diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index e7bd63eb2876..60b26f32d31d 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -105,7 +105,11 @@ static int xenvif_poll(struct napi_struct *napi, int budget) if (work_done < budget) { napi_complete(napi); - xenvif_napi_schedule_or_enable_events(queue); + /* If the queue is rate-limited, it shall be + * rescheduled in the timer callback. + */ + if (likely(!queue->rate_limited)) + xenvif_napi_schedule_or_enable_events(queue); } return work_done; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 1049c34e7d43..72ee1c305cc4 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -687,6 +687,7 @@ static void tx_add_credit(struct xenvif_queue *queue) max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */ queue->remaining_credit = min(max_credit, max_burst); + queue->rate_limited = false; } void xenvif_tx_credit_callback(unsigned long data) @@ -1184,8 +1185,10 @@ static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size) msecs_to_jiffies(queue->credit_usec / 1000); /* Timer could already be pending in rare cases. */ - if (timer_pending(&queue->credit_timeout)) + if (timer_pending(&queue->credit_timeout)) { + queue->rate_limited = true; return true; + } /* Passed the point where we can replenish credit? */ if (time_after_eq64(now, next_credit)) { @@ -1200,6 +1203,7 @@ static bool tx_credit_exceeded(struct xenvif_queue *queue, unsigned size) mod_timer(&queue->credit_timeout, next_credit); queue->credit_window_start = next_credit; + queue->rate_limited = true; return true; } From cada8caa265574b4567534cf533096c2cddaf835 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Tue, 11 Jul 2017 12:00:54 -0600 Subject: [PATCH 41/76] sparc64: Measure receiver forward progress to avoid send mondo timeout [ Upstream commit 9d53caec84c7c5700e7c1ed744ea584fff55f9ac ] A large sun4v SPARC system may have moments of intensive xcall activities, usually caused by unmapping many pages on many CPUs concurrently. This can flood receivers with CPU mondo interrupts for an extended period, causing some unlucky senders to hit send-mondo timeout. This problem gets worse as cpu count increases because sometimes mappings must be invalidated on all CPUs, and sometimes all CPUs may gang up on a single CPU. But a busy system is not a broken system. In the above scenario, as long as the receiver is making forward progress processing mondo interrupts, the sender should continue to retry. This patch implements the receiver's forward progress meter by introducing a per cpu counter 'cpu_mondo_counter[cpu]' where 'cpu' is in the range of 0..NR_CPUS. The receiver increments its counter as soon as it receives a mondo and the sender tracks the receiver's counter. If the receiver has stopped making forward progress when the retry limit is reached, the sender declares send-mondo-timeout and panic; otherwise, the receiver is allowed to keep making forward progress. In addition, it's been observed that PCIe hotplug events generate Correctable Errors that are handled by hypervisor and then OS. Hypervisor 'borrows' a guest cpu strand briefly to provide the service. If the cpu strand is simultaneously the only cpu targeted by a mondo, it may not be available for the mondo in 20msec, causing SUN4V mondo timeout. It appears that 1 second is the agreed wait time between hypervisor and guest OS, this patch makes the adjustment. Orabug: 25476541 Orabug: 26417466 Signed-off-by: Jane Chu Reviewed-by: Steve Sistare Reviewed-by: Anthony Yznaga Reviewed-by: Rob Gardner Reviewed-by: Thomas Tai Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/trap_block.h | 1 + arch/sparc/kernel/smp_64.c | 191 +++++++++++++++++----------- arch/sparc/kernel/sun4v_ivec.S | 15 +++ arch/sparc/kernel/traps_64.c | 1 + 4 files changed, 135 insertions(+), 73 deletions(-) diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h index ec9c04de3664..ff05992dae7a 100644 --- a/arch/sparc/include/asm/trap_block.h +++ b/arch/sparc/include/asm/trap_block.h @@ -54,6 +54,7 @@ extern struct trap_per_cpu trap_block[NR_CPUS]; void init_cur_cpu_trap(struct thread_info *); void setup_tba(void); extern int ncpus_probed; +extern u64 cpu_mondo_counter[NR_CPUS]; unsigned long real_hard_smp_processor_id(void); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 95a9fa0d2195..4511caa3b7e9 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -617,22 +617,48 @@ retry: } } -/* Multi-cpu list version. */ +#define CPU_MONDO_COUNTER(cpuid) (cpu_mondo_counter[cpuid]) +#define MONDO_USEC_WAIT_MIN 2 +#define MONDO_USEC_WAIT_MAX 100 +#define MONDO_RETRY_LIMIT 500000 + +/* Multi-cpu list version. + * + * Deliver xcalls to 'cnt' number of cpus in 'cpu_list'. + * Sometimes not all cpus receive the mondo, requiring us to re-send + * the mondo until all cpus have received, or cpus are truly stuck + * unable to receive mondo, and we timeout. + * Occasionally a target cpu strand is borrowed briefly by hypervisor to + * perform guest service, such as PCIe error handling. Consider the + * service time, 1 second overall wait is reasonable for 1 cpu. + * Here two in-between mondo check wait time are defined: 2 usec for + * single cpu quick turn around and up to 100usec for large cpu count. + * Deliver mondo to large number of cpus could take longer, we adjusts + * the retry count as long as target cpus are making forward progress. + */ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) { - int retries, this_cpu, prev_sent, i, saw_cpu_error; + int this_cpu, tot_cpus, prev_sent, i, rem; + int usec_wait, retries, tot_retries; + u16 first_cpu = 0xffff; + unsigned long xc_rcvd = 0; unsigned long status; + int ecpuerror_id = 0; + int enocpu_id = 0; u16 *cpu_list; + u16 cpu; this_cpu = smp_processor_id(); - cpu_list = __va(tb->cpu_list_pa); - - saw_cpu_error = 0; - retries = 0; + usec_wait = cnt * MONDO_USEC_WAIT_MIN; + if (usec_wait > MONDO_USEC_WAIT_MAX) + usec_wait = MONDO_USEC_WAIT_MAX; + retries = tot_retries = 0; + tot_cpus = cnt; prev_sent = 0; + do { - int forward_progress, n_sent; + int n_sent, mondo_delivered, target_cpu_busy; status = sun4v_cpu_mondo_send(cnt, tb->cpu_list_pa, @@ -640,94 +666,113 @@ static void hypervisor_xcall_deliver(struct trap_per_cpu *tb, int cnt) /* HV_EOK means all cpus received the xcall, we're done. */ if (likely(status == HV_EOK)) - break; + goto xcall_done; + + /* If not these non-fatal errors, panic */ + if (unlikely((status != HV_EWOULDBLOCK) && + (status != HV_ECPUERROR) && + (status != HV_ENOCPU))) + goto fatal_errors; /* First, see if we made any forward progress. + * + * Go through the cpu_list, count the target cpus that have + * received our mondo (n_sent), and those that did not (rem). + * Re-pack cpu_list with the cpus remain to be retried in the + * front - this simplifies tracking the truly stalled cpus. * * The hypervisor indicates successful sends by setting * cpu list entries to the value 0xffff. + * + * EWOULDBLOCK means some target cpus did not receive the + * mondo and retry usually helps. + * + * ECPUERROR means at least one target cpu is in error state, + * it's usually safe to skip the faulty cpu and retry. + * + * ENOCPU means one of the target cpu doesn't belong to the + * domain, perhaps offlined which is unexpected, but not + * fatal and it's okay to skip the offlined cpu. */ + rem = 0; n_sent = 0; for (i = 0; i < cnt; i++) { - if (likely(cpu_list[i] == 0xffff)) + cpu = cpu_list[i]; + if (likely(cpu == 0xffff)) { n_sent++; + } else if ((status == HV_ECPUERROR) && + (sun4v_cpu_state(cpu) == HV_CPU_STATE_ERROR)) { + ecpuerror_id = cpu + 1; + } else if (status == HV_ENOCPU && !cpu_online(cpu)) { + enocpu_id = cpu + 1; + } else { + cpu_list[rem++] = cpu; + } } - forward_progress = 0; - if (n_sent > prev_sent) - forward_progress = 1; + /* No cpu remained, we're done. */ + if (rem == 0) + break; + /* Otherwise, update the cpu count for retry. */ + cnt = rem; + + /* Record the overall number of mondos received by the + * first of the remaining cpus. + */ + if (first_cpu != cpu_list[0]) { + first_cpu = cpu_list[0]; + xc_rcvd = CPU_MONDO_COUNTER(first_cpu); + } + + /* Was any mondo delivered successfully? */ + mondo_delivered = (n_sent > prev_sent); prev_sent = n_sent; - /* If we get a HV_ECPUERROR, then one or more of the cpus - * in the list are in error state. Use the cpu_state() - * hypervisor call to find out which cpus are in error state. + /* or, was any target cpu busy processing other mondos? */ + target_cpu_busy = (xc_rcvd < CPU_MONDO_COUNTER(first_cpu)); + xc_rcvd = CPU_MONDO_COUNTER(first_cpu); + + /* Retry count is for no progress. If we're making progress, + * reset the retry count. */ - if (unlikely(status == HV_ECPUERROR)) { - for (i = 0; i < cnt; i++) { - long err; - u16 cpu; - - cpu = cpu_list[i]; - if (cpu == 0xffff) - continue; - - err = sun4v_cpu_state(cpu); - if (err == HV_CPU_STATE_ERROR) { - saw_cpu_error = (cpu + 1); - cpu_list[i] = 0xffff; - } - } - } else if (unlikely(status != HV_EWOULDBLOCK)) - goto fatal_mondo_error; - - /* Don't bother rewriting the CPU list, just leave the - * 0xffff and non-0xffff entries in there and the - * hypervisor will do the right thing. - * - * Only advance timeout state if we didn't make any - * forward progress. - */ - if (unlikely(!forward_progress)) { - if (unlikely(++retries > 10000)) - goto fatal_mondo_timeout; - - /* Delay a little bit to let other cpus catch up - * on their cpu mondo queue work. - */ - udelay(2 * cnt); + if (likely(mondo_delivered || target_cpu_busy)) { + tot_retries += retries; + retries = 0; + } else if (unlikely(retries > MONDO_RETRY_LIMIT)) { + goto fatal_mondo_timeout; } + + /* Delay a little bit to let other cpus catch up on + * their cpu mondo queue work. + */ + if (!mondo_delivered) + udelay(usec_wait); + + retries++; } while (1); - if (unlikely(saw_cpu_error)) - goto fatal_mondo_cpu_error; - +xcall_done: + if (unlikely(ecpuerror_id > 0)) { + pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) was in error state\n", + this_cpu, ecpuerror_id - 1); + } else if (unlikely(enocpu_id > 0)) { + pr_crit("CPU[%d]: SUN4V mondo cpu error, target cpu(%d) does not belong to the domain\n", + this_cpu, enocpu_id - 1); + } return; -fatal_mondo_cpu_error: - printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus " - "(including %d) were in error state\n", - this_cpu, saw_cpu_error - 1); - return; +fatal_errors: + /* fatal errors include bad alignment, etc */ + pr_crit("CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) mondo_block_pa(%lx)\n", + this_cpu, tot_cpus, tb->cpu_list_pa, tb->cpu_mondo_block_pa); + panic("Unexpected SUN4V mondo error %lu\n", status); fatal_mondo_timeout: - printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward " - " progress after %d retries.\n", - this_cpu, retries); - goto dump_cpu_list_and_out; - -fatal_mondo_error: - printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n", - this_cpu, status); - printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) " - "mondo_block_pa(%lx)\n", - this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa); - -dump_cpu_list_and_out: - printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu); - for (i = 0; i < cnt; i++) - printk("%u ", cpu_list[i]); - printk("]\n"); + /* some cpus being non-responsive to the cpu mondo */ + pr_crit("CPU[%d]: SUN4V mondo timeout, cpu(%d) made no forward progress after %d retries. Total target cpus(%d).\n", + this_cpu, first_cpu, (tot_retries + retries), tot_cpus); + panic("SUN4V mondo timeout panic\n"); } static void (*xcall_deliver_impl)(struct trap_per_cpu *, int); diff --git a/arch/sparc/kernel/sun4v_ivec.S b/arch/sparc/kernel/sun4v_ivec.S index 559bc5e9c199..34631995859a 100644 --- a/arch/sparc/kernel/sun4v_ivec.S +++ b/arch/sparc/kernel/sun4v_ivec.S @@ -26,6 +26,21 @@ sun4v_cpu_mondo: ldxa [%g0] ASI_SCRATCHPAD, %g4 sub %g4, TRAP_PER_CPU_FAULT_INFO, %g4 + /* Get smp_processor_id() into %g3 */ + sethi %hi(trap_block), %g5 + or %g5, %lo(trap_block), %g5 + sub %g4, %g5, %g3 + srlx %g3, TRAP_BLOCK_SZ_SHIFT, %g3 + + /* Increment cpu_mondo_counter[smp_processor_id()] */ + sethi %hi(cpu_mondo_counter), %g5 + or %g5, %lo(cpu_mondo_counter), %g5 + sllx %g3, 3, %g3 + add %g5, %g3, %g5 + ldx [%g5], %g3 + add %g3, 1, %g3 + stx %g3, [%g5] + /* Get CPU mondo queue base phys address into %g7. */ ldx [%g4 + TRAP_PER_CPU_CPU_MONDO_PA], %g7 diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index cc97a43268ee..d883c5951e8b 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2659,6 +2659,7 @@ void do_getpsr(struct pt_regs *regs) } } +u64 cpu_mondo_counter[NR_CPUS] = {0}; struct trap_per_cpu trap_block[NR_CPUS]; EXPORT_SYMBOL(trap_block); From 50231cef2d6fbff22eab442ee4f5cfc1685887b9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Jan 2017 15:35:25 +0100 Subject: [PATCH 42/76] wext: handle NULL extra data in iwe_stream_add_point better commit 93be2b74279c15c2844684b1a027fdc71dd5d9bf upstream. gcc-7 complains that wl3501_cs passes NULL into a function that then uses the argument as the input for memcpy: drivers/net/wireless/wl3501_cs.c: In function 'wl3501_get_scan': include/net/iw_handler.h:559:3: error: argument 2 null where non-null expected [-Werror=nonnull] memcpy(stream + point_len, extra, iwe->u.data.length); This works fine here because iwe->u.data.length is guaranteed to be 0 and the memcpy doesn't actually have an effect. Making the length check explicit avoids the warning and should have no other effect here. Also check the pointer itself, since otherwise we get warnings elsewhere in the code. Signed-off-by: Arnd Bergmann Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- include/net/iw_handler.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/iw_handler.h b/include/net/iw_handler.h index e0f4109e64c6..c2aa73e5e6bb 100644 --- a/include/net/iw_handler.h +++ b/include/net/iw_handler.h @@ -556,7 +556,8 @@ iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, memcpy(stream + lcp_len, ((char *) &iwe->u) + IW_EV_POINT_OFF, IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); - memcpy(stream + point_len, extra, iwe->u.data.length); + if (iwe->u.data.length && extra) + memcpy(stream + point_len, extra, iwe->u.data.length); stream += event_len; } return stream; From 41433e314b778c1ba6d1f648d975af0788c930b6 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 5 Jan 2017 00:29:32 +0300 Subject: [PATCH 43/76] sh_eth: R8A7740 supports packet shecksumming [ Upstream commit 0f1f9cbc04dbb3cc310f70a11cba0cf1f2109d9c ] The R8A7740 GEther controller supports the packet checksum offloading but the 'hw_crc' (bad name, I'll fix it) flag isn't set in the R8A7740 data, thus CSMR isn't cleared... Fixes: 73a0d907301e ("net: sh_eth: add support R8A7740") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/renesas/sh_eth.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 4296066a7ad3..479af106aaeb 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -819,6 +819,7 @@ static struct sh_eth_cpu_data r8a7740_data = { .rpadir_value = 2 << 16, .no_trimd = 1, .no_ade = 1, + .hw_crc = 1, .tsu = 1, .select_mii = 1, .shift_rd0 = 1, From 93585e813588b09ef46893599d4833d5797c7af2 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Thu, 5 Jan 2017 14:48:07 -0600 Subject: [PATCH 44/76] net: phy: dp83867: fix irq generation [ Upstream commit 5ca7d1ca77dc23934504b95a96d2660d345f83c2 ] For proper IRQ generation by DP83867 phy the INT/PWDN pin has to be programmed as an interrupt output instead of a Powerdown input in Configuration Register 3 (CFG3), Address 0x001E, bit 7 INT_OE = 1. The current driver doesn't do this and as result IRQs will not be generated by DP83867 phy even if they are properly configured in DT. Hence, fix IRQ generation by properly configuring CFG3.INT_OE bit and ensure that Link Status Change (LINK_STATUS_CHNG_INT) and Auto-Negotiation Complete (AUTONEG_COMP_INT) interrupt are enabled. After this the DP83867 driver will work properly in interrupt enabled mode. Signed-off-by: Grygorii Strashko Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/dp83867.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index 32f10662f4ac..7242dd4b3238 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -29,6 +29,7 @@ #define MII_DP83867_MICR 0x12 #define MII_DP83867_ISR 0x13 #define DP83867_CTRL 0x1f +#define DP83867_CFG3 0x1e /* Extended Registers */ #define DP83867_RGMIICTL 0x0032 @@ -89,6 +90,8 @@ static int dp83867_config_intr(struct phy_device *phydev) micr_status |= (MII_DP83867_MICR_AN_ERR_INT_EN | MII_DP83867_MICR_SPEED_CHNG_INT_EN | + MII_DP83867_MICR_AUTONEG_COMP_INT_EN | + MII_DP83867_MICR_LINK_STS_CHNG_INT_EN | MII_DP83867_MICR_DUP_MODE_CHNG_INT_EN | MII_DP83867_MICR_SLEEP_MODE_CHNG_INT_EN); @@ -184,6 +187,13 @@ static int dp83867_config_init(struct phy_device *phydev) DP83867_DEVADDR, phydev->addr, delay); } + /* Enable Interrupt output INT_OE in CFG3 register */ + if (phy_interrupt_is_valid(phydev)) { + val = phy_read(phydev, DP83867_CFG3); + val |= BIT(7); + phy_write(phydev, DP83867_CFG3, val); + } + return 0; } From 032422cca86dcc9c831bd161a33759679275934c Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Fri, 6 Jan 2017 16:18:53 -0500 Subject: [PATCH 45/76] tg3: Fix race condition in tg3_get_stats64(). [ Upstream commit f5992b72ebe0dde488fa8f706b887194020c66fc ] The driver's ndo_get_stats64() method is not always called under RTNL. So it can race with driver close or ethtool reconfigurations. Fix the race condition by taking tp->lock spinlock in tg3_free_consistent() when freeing the tp->hw_stats memory block. tg3_get_stats64() is already taking tp->lock. Reported-by: Wang Yufen Signed-off-by: Michael Chan Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/tg3.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 21e5b9ed1ead..3613469dc5c6 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -8722,11 +8722,14 @@ static void tg3_free_consistent(struct tg3 *tp) tg3_mem_rx_release(tp); tg3_mem_tx_release(tp); + /* Protect tg3_get_stats64() from reading freed tp->hw_stats. */ + tg3_full_lock(tp, 0); if (tp->hw_stats) { dma_free_coherent(&tp->pdev->dev, sizeof(struct tg3_hw_stats), tp->hw_stats, tp->stats_mapping); tp->hw_stats = NULL; } + tg3_full_unlock(tp); } /* From db01878ca5dd13ad51ff776309a6b75fd5867764 Mon Sep 17 00:00:00 2001 From: Nicholas Mc Guire Date: Sat, 7 Jan 2017 10:38:31 +0100 Subject: [PATCH 46/76] x86/boot: Add missing declaration of string functions [ Upstream commit fac69d0efad08fc15e4dbfc116830782acc0dc9a ] Add the missing declarations of basic string functions to string.h to allow a clean build. Fixes: 5be865661516 ("String-handling functions for the new x86 setup code.") Signed-off-by: Nicholas Mc Guire Link: http://lkml.kernel.org/r/1483781911-21399-1-git-send-email-hofrat@osadl.org Signed-off-by: Thomas Gleixner Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/string.c | 1 + arch/x86/boot/string.h | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 318b8465d302..06ceddb3a22e 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -14,6 +14,7 @@ #include #include "ctype.h" +#include "string.h" int memcmp(const void *s1, const void *s2, size_t len) { diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h index 725e820602b1..113588ddb43f 100644 --- a/arch/x86/boot/string.h +++ b/arch/x86/boot/string.h @@ -18,4 +18,13 @@ int memcmp(const void *s1, const void *s2, size_t len); #define memset(d,c,l) __builtin_memset(d,c,l) #define memcmp __builtin_memcmp +extern int strcmp(const char *str1, const char *str2); +extern int strncmp(const char *cs, const char *ct, size_t count); +extern size_t strlen(const char *s); +extern char *strstr(const char *s1, const char *s2); +extern size_t strnlen(const char *s, size_t maxlen); +extern unsigned int atou(const char *s); +extern unsigned long long simple_strtoull(const char *cp, char **endp, + unsigned int base); + #endif /* BOOT_STRING_H */ From a9873711065956556e1ce0ec36706639bd8de10a Mon Sep 17 00:00:00 2001 From: Zefir Kurtisi Date: Fri, 6 Jan 2017 12:14:48 +0100 Subject: [PATCH 47/76] phy state machine: failsafe leave invalid RUNNING state [ Upstream commit 811a919135b980bac8009d042acdccf10dc1ef5e ] While in RUNNING state, phy_state_machine() checks for link changes by comparing phydev->link before and after calling phy_read_status(). This works as long as it is guaranteed that phydev->link is never changed outside the phy_state_machine(). If in some setups this happens, it causes the state machine to miss a link loss and remain RUNNING despite phydev->link being 0. This has been observed running a dsa setup with a process continuously polling the link states over ethtool each second (SNMPD RFC-1213 agent). Disconnecting the link on a phy followed by a ETHTOOL_GSET causes dsa_slave_get_settings() / dsa_slave_get_link_ksettings() to call phy_read_status() and with that modify the link status - and with that bricking the phy state machine. This patch adds a fail-safe check while in RUNNING, which causes to move to CHANGELINK when the link is gone and we are still RUNNING. Signed-off-by: Zefir Kurtisi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 29825838d1a7..49d9f0a789fe 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -921,6 +921,15 @@ void phy_state_machine(struct work_struct *work) if (old_link != phydev->link) phydev->state = PHY_CHANGELINK; } + /* + * Failsafe: check that nobody set phydev->link=0 between two + * poll cycles, otherwise we won't leave RUNNING state as long + * as link remains down. + */ + if (!phydev->link && phydev->state == PHY_RUNNING) { + phydev->state = PHY_CHANGELINK; + dev_err(&phydev->dev, "no link in PHY_RUNNING\n"); + } break; case PHY_CHANGELINK: err = phy_read_status(phydev); From 1e43b2d064741a6f4d4c55c4f9146c1e92e1d06a Mon Sep 17 00:00:00 2001 From: "Milan P. Gandhi" Date: Sat, 24 Dec 2016 22:02:46 +0530 Subject: [PATCH 48/76] scsi: qla2xxx: Get mutex lock before checking optrom_state [ Upstream commit c7702b8c22712a06080e10f1d2dee1a133ec8809 ] There is a race condition with qla2xxx optrom functions where one thread might modify optrom buffer, optrom_state while other thread is still reading from it. In couple of crashes, it was found that we had successfully passed the following 'if' check where we confirm optrom_state to be QLA_SREADING. But by the time we acquired mutex lock to proceed with memory_read_from_buffer function, some other thread/process had already modified that option rom buffer and optrom_state from QLA_SREADING to QLA_SWAITING. Then we got ha->optrom_buffer 0x0 and crashed the system: if (ha->optrom_state != QLA_SREADING) return 0; mutex_lock(&ha->optrom_mutex); rval = memory_read_from_buffer(buf, count, &off, ha->optrom_buffer, ha->optrom_region_size); mutex_unlock(&ha->optrom_mutex); With current optrom function we get following crash due to a race condition: [ 1479.466679] BUG: unable to handle kernel NULL pointer dereference at (null) [ 1479.466707] IP: [] memcpy+0x6/0x110 [...] [ 1479.473673] Call Trace: [ 1479.474296] [] ? memory_read_from_buffer+0x3c/0x60 [ 1479.474941] [] qla2x00_sysfs_read_optrom+0x9c/0xc0 [qla2xxx] [ 1479.475571] [] read+0xdb/0x1f0 [ 1479.476206] [] vfs_read+0x9e/0x170 [ 1479.476839] [] SyS_read+0x7f/0xe0 [ 1479.477466] [] system_call_fastpath+0x16/0x1b Below patch modifies qla2x00_sysfs_read_optrom, qla2x00_sysfs_write_optrom functions to get the mutex_lock before checking ha->optrom_state to avoid similar crashes. The patch was applied and tested and same crashes were no longer observed again. Tested-by: Milan P. Gandhi Signed-off-by: Milan P. Gandhi Reviewed-by: Laurence Oberman Acked-by: Himanshu Madhani Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_attr.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 6b942d9e5b74..1ed85dfc008d 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -329,12 +329,15 @@ qla2x00_sysfs_read_optrom(struct file *filp, struct kobject *kobj, struct qla_hw_data *ha = vha->hw; ssize_t rval = 0; - if (ha->optrom_state != QLA_SREADING) - return 0; - mutex_lock(&ha->optrom_mutex); + + if (ha->optrom_state != QLA_SREADING) + goto out; + rval = memory_read_from_buffer(buf, count, &off, ha->optrom_buffer, ha->optrom_region_size); + +out: mutex_unlock(&ha->optrom_mutex); return rval; @@ -349,14 +352,19 @@ qla2x00_sysfs_write_optrom(struct file *filp, struct kobject *kobj, struct device, kobj))); struct qla_hw_data *ha = vha->hw; - if (ha->optrom_state != QLA_SWRITING) + mutex_lock(&ha->optrom_mutex); + + if (ha->optrom_state != QLA_SWRITING) { + mutex_unlock(&ha->optrom_mutex); return -EINVAL; - if (off > ha->optrom_region_size) + } + if (off > ha->optrom_region_size) { + mutex_unlock(&ha->optrom_mutex); return -ERANGE; + } if (off + count > ha->optrom_region_size) count = ha->optrom_region_size - off; - mutex_lock(&ha->optrom_mutex); memcpy(&ha->optrom_buffer[off], buf, count); mutex_unlock(&ha->optrom_mutex); From c9e4ee445752ecd97c839ffa7e08146a614576dd Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Mon, 28 Nov 2016 08:52:20 +0100 Subject: [PATCH 49/76] drm/virtio: fix framebuffer sparse warning [ Upstream commit 71d3f6ef7f5af38dea2975ec5715c88bae92e92d ] virtio uses normal ram as backing storage for the framebuffer, so we should assign the address to new screen_buffer (added by commit 17a7b0b4d9749f80d365d7baff5dec2f54b0e992) instead of screen_base. Reported-by: Michael S. Tsirkin Signed-off-by: Gerd Hoffmann Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/virtio/virtgpu_fb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/virtio/virtgpu_fb.c b/drivers/gpu/drm/virtio/virtgpu_fb.c index 6a81e084593b..2b59d80a09b8 100644 --- a/drivers/gpu/drm/virtio/virtgpu_fb.c +++ b/drivers/gpu/drm/virtio/virtgpu_fb.c @@ -338,7 +338,7 @@ static int virtio_gpufb_create(struct drm_fb_helper *helper, info->fbops = &virtio_gpufb_ops; info->pixmap.flags = FB_PIXMAP_SYSTEM; - info->screen_base = obj->vmap; + info->screen_buffer = obj->vmap; info->screen_size = obj->gem_base.size; drm_fb_helper_fill_fix(info, fb->pitches[0], fb->depth); drm_fb_helper_fill_var(info, &vfbdev->helper, From 874f22658c5479a04427103ac5d59ecfb2ba890b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 9 Jan 2017 11:44:12 -0800 Subject: [PATCH 50/76] virtio_blk: fix panic in initialization error path [ Upstream commit 6bf6b0aa3da84a3d9126919a94c49c0fb7ee2fb3 ] If blk_mq_init_queue() returns an error, it gets assigned to vblk->disk->queue. Then, when we call put_disk(), we end up calling blk_put_queue() with the ERR_PTR, causing a bad dereference. Fix it by only assigning to vblk->disk->queue on success. Signed-off-by: Omar Sandoval Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/block/virtio_blk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6ca35495a5be..1e5cd39d0cc2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -641,11 +641,12 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_put_disk; - q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); + q = blk_mq_init_queue(&vblk->tag_set); if (IS_ERR(q)) { err = -ENOMEM; goto out_free_tags; } + vblk->disk->queue = q; q->queuedata = vblk; From 5205f5216c6aee6ff6fa947ef7cb0fcb3473d4e3 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Wed, 23 Nov 2016 13:02:32 +0100 Subject: [PATCH 51/76] ARM: 8632/1: ftrace: fix syscall name matching [ Upstream commit 270c8cf1cacc69cb8d99dea812f06067a45e4609 ] ARM has a few system calls (most notably mmap) for which the names of the functions which are referenced in the syscall table do not match the names of the syscall tracepoints. As a consequence of this, these tracepoints are not made available. Implement arch_syscall_match_sym_name to fix this and allow tracing even these system calls. Signed-off-by: Rabin Vincent Signed-off-by: Russell King Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/ftrace.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index bfe2a2f5a644..22b73112b75f 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -54,6 +54,24 @@ static inline void *return_address(unsigned int level) #define ftrace_return_address(n) return_address(n) +#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME + +static inline bool arch_syscall_match_sym_name(const char *sym, + const char *name) +{ + if (!strcmp(sym, "sys_mmap2")) + sym = "sys_mmap_pgoff"; + else if (!strcmp(sym, "sys_statfs64_wrapper")) + sym = "sys_statfs64"; + else if (!strcmp(sym, "sys_fstatfs64_wrapper")) + sym = "sys_fstatfs64"; + else if (!strcmp(sym, "sys_arm_fadvise64_64")) + sym = "sys_fadvise64_64"; + + /* Ignore case since sym may start with "SyS" instead of "sys" */ + return !strcasecmp(sym, name); +} + #endif /* ifndef __ASSEMBLY__ */ #endif /* _ASM_ARM_FTRACE */ From 9c83b97bdeabfea1f9c51c1f505ee14f13e4c628 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:27 -0800 Subject: [PATCH 52/76] mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER [ Upstream commit bb1107f7c6052c863692a41f78c000db792334bf ] Andrey Konovalov has reported the following warning triggered by the syzkaller fuzzer. WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __alloc_pages_slowpath mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781 alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072 alloc_pages include/linux/gfp.h:469 kmalloc_order+0x1f/0x70 mm/slab_common.c:1015 kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026 kmalloc_large include/linux/slab.h:422 __kmalloc+0x210/0x2d0 mm/slub.c:3723 kmalloc include/linux/slab.h:495 ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x170/0x4e0 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 The issue is caused by a lack of size check for the request size in ep_write_iter which should be fixed. It, however, points to another problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the resulting page allocator request might be MAX_ORDER which is too large (see __alloc_pages_slowpath). The same applies to the SLOB allocator which allows even larger sizes. Make sure that they are capped properly and never request more than MAX_ORDER order. Link: http://lkml.kernel.org/r/20161220130659.16461-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Andrey Konovalov Acked-by: Christoph Lameter Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2037a861e367..8a2a9ffaf5de 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -203,7 +203,7 @@ size_t ksize(const void *); * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -216,7 +216,7 @@ size_t ksize(const void *); * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX 30 +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif From 623f4fcd947ed4d74b9a79df7d2477f113a321b5 Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 10 Jan 2017 16:57:45 -0800 Subject: [PATCH 53/76] lib/Kconfig.debug: fix frv build failure [ Upstream commit da0510c47519fe0999cffe316e1d370e29f952be ] The build of frv allmodconfig was failing with the errors like: /tmp/cc0JSPc3.s: Assembler messages: /tmp/cc0JSPc3.s:1839: Error: symbol `.LSLT0' is already defined /tmp/cc0JSPc3.s:1842: Error: symbol `.LASLTP0' is already defined /tmp/cc0JSPc3.s:1969: Error: symbol `.LELTP0' is already defined /tmp/cc0JSPc3.s:1970: Error: symbol `.LELT0' is already defined Commit 866ced950bcd ("kbuild: Support split debug info v4") introduced splitting the debug info and keeping that in a separate file. Somehow, the frv-linux gcc did not like that and I am guessing that instead of splitting it started copying. The first report about this is at: https://lists.01.org/pipermail/kbuild-all/2015-July/010527.html. I will try and see if this can work with frv and if still fails I will open a bug report with gcc. But meanwhile this is the easiest option to solve build failure of frv. Fixes: 866ced950bcd ("kbuild: Support split debug info v4") Link: http://lkml.kernel.org/r/1482062348-5352-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Reported-by: Fengguang Wu Cc: Andi Kleen Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8c15b29d5adc..b53b375e14bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -145,7 +145,7 @@ config DEBUG_INFO_REDUCED config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" - depends on DEBUG_INFO + depends on DEBUG_INFO && !FRV help Generate debug info into separate .dwo files. This significantly reduces the build directory size for builds with DEBUG_INFO, From bbe660db23e41647366039c1860cee0891fe9903 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 10 Jan 2017 16:57:54 -0800 Subject: [PATCH 54/76] signal: protect SIGNAL_UNKILLABLE from unintentional clearing. [ Upstream commit 2d39b3cd34e6d323720d4c61bd714f5ae202c022 ] Since commit 00cd5c37afd5 ("ptrace: permit ptracing of /sbin/init") we can now trace init processes. init is initially protected with SIGNAL_UNKILLABLE which will prevent fatal signals such as SIGSTOP, but there are a number of paths during tracing where SIGNAL_UNKILLABLE can be implicitly cleared. This can result in init becoming stoppable/killable after tracing. For example, running: while true; do kill -STOP 1; done & strace -p 1 and then stopping strace and the kill loop will result in init being left in state TASK_STOPPED. Sending SIGCONT to init will resume it, but init will now respond to future SIGSTOP signals rather than ignoring them. Make sure that when setting SIGNAL_STOP_CONTINUED/SIGNAL_STOP_STOPPED that we don't clear SIGNAL_UNKILLABLE. Link: http://lkml.kernel.org/r/20170104122017.25047-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Cc: Alexander Viro Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/sched.h | 10 ++++++++++ kernel/signal.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 352213b360d7..eff7c1fad26f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -801,6 +801,16 @@ struct signal_struct { #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { diff --git a/kernel/signal.c b/kernel/signal.c index b92a047ddc82..5d50ea899b6d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task) * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; + signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; @@ -845,7 +845,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ - signal->flags = why | SIGNAL_STOP_CONTINUED; + signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } From 78c04996b5888526c7aa3c9debe9dc56519ab81e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 10 Jan 2017 16:58:00 -0800 Subject: [PATCH 55/76] mm: don't dereference struct page fields of invalid pages [ Upstream commit f073bdc51771f5a5c7a8d1191bfc3ae371d44de7 ] The VM_BUG_ON() check in move_freepages() checks whether the node id of a page matches the node id of its zone. However, it does this before having checked whether the struct page pointer refers to a valid struct page to begin with. This is guaranteed in most cases, but may not be the case if CONFIG_HOLES_IN_ZONE=y. So reorder the VM_BUG_ON() with the pfn_valid_within() check. Link: http://lkml.kernel.org/r/1481706707-6211-2-git-send-email-ard.biesheuvel@linaro.org Signed-off-by: Ard Biesheuvel Acked-by: Will Deacon Cc: Catalin Marinas Cc: Hanjun Guo Cc: Yisheng Xie Cc: Robert Richter Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b80115e4e9e..f9d648fce8cd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1527,14 +1527,14 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + if (!PageBuddy(page)) { page++; continue; From 12b8f014a3944eea3bcc7f4ebbf6318c5385f270 Mon Sep 17 00:00:00 2001 From: zheng li Date: Mon, 12 Dec 2016 09:56:05 +0800 Subject: [PATCH 56/76] ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output [ Upstream commit 0a28cfd51e17f4f0a056bcf66bfbe492c3b99f38 ] There is an inconsistent conditional judgement in __ip_append_data and ip_finish_output functions, the variable length in __ip_append_data just include the length of application's payload and udp header, don't include the length of ip header, but in ip_finish_output use (skb->len > ip_skb_dst_mtu(skb)) as judgement, and skb->len include the length of ip header. That causes some particular application's udp payload whose length is between (MTU - IP Header) and MTU were fragmented by ip_fragment even though the rst->dev support UFO feature. Add the length of ip header to length in __ip_append_data to keep consistent conditional judgement as ip_finish_output for ip fragment. Signed-off-by: Zheng Li Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2b7283303650..f5c62d0a7453 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,7 +922,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if (((length > mtu) || (skb && skb_is_gso(skb))) && + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { From 0c787041504edd1739a5f8bd04bdd36b8fcb093c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Mon, 19 Jun 2017 13:03:43 +0200 Subject: [PATCH 57/76] net: account for current skb length when deciding about UFO [ Upstream commit a5cb659bbc1c8644efa0c3138a757a1e432a4880 ] Our customer encountered stuck NFS writes for blocks starting at specific offsets w.r.t. page boundary caused by networking stack sending packets via UFO enabled device with wrong checksum. The problem can be reproduced by composing a long UDP datagram from multiple parts using MSG_MORE flag: sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 3000, 0, ...); Assume this packet is to be routed via a device with MTU 1500 and NETIF_F_UFO enabled. When second sendto() gets into __ip_append_data(), this condition is tested (among others) to decide whether to call ip_ufo_append_data(): ((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb)) At the moment, we already have skb with 1028 bytes of data which is not marked for GSO so that the test is false (fragheaderlen is usually 20). Thus we append second 1000 bytes to this skb without invoking UFO. Third sendto(), however, has sufficient length to trigger the UFO path so that we end up with non-UFO skb followed by a UFO one. Later on, udp_send_skb() uses udp_csum() to calculate the checksum but that assumes all fragments have correct checksum in skb->csum which is not true for UFO fragments. When checking against MTU, we need to add skb->len to length of new segment if we already have a partially filled skb and fragheaderlen only if there isn't one. In the IPv6 case, skb can only be null if this is the first segment so that we have to use headersize (length of the first IPv6 header) rather than fragheaderlen (length of IPv6 header of further fragments) for skb == NULL. Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Fixes: e4c5e13aa45c ("ipv6: Should use consistent conditional judgement for ip6 fragment between __ip6_append_data and ip6_finish_output") Signed-off-by: Michal Kubecek Acked-by: Vlad Yasevich Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 3 ++- net/ipv6/ip6_output.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5c62d0a7453..5d58a6703a43 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,7 +922,8 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && + if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || + (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a7a5790e28e4..0de3245ea42f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1357,7 +1357,7 @@ emsgsize: */ cork->length += length; - if ((((length + fragheaderlen) > mtu) || + if ((((length + (skb ? skb->len : headersize)) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && From 34a08ae493f1970d5ce80dd3812b8dba4e5cbe22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: [PATCH 58/76] workqueue: implicit ordered attribute should be overridable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 upstream. 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes explicit and implicit ordered setting and overrides from attribute changes if implict. Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") Cc: Holger Hoffstätte Signed-off-by: Greg Kroah-Hartman --- include/linux/workqueue.h | 4 +++- kernel/workqueue.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0197358f1e81..262d5c95dfc8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -311,6 +311,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_ORDERED_EXPLICIT = 1 << 18, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -408,7 +409,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ + __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 137332379ed4..23231237f2e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3647,8 +3647,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); @@ -4032,13 +4036,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5164,7 +5169,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); From 2ab639c7e2d1f02779c8947c90d8d595f5843e52 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 11 Aug 2017 09:22:51 -0700 Subject: [PATCH 59/76] Linux 4.4.81 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index dddd55adde24..d049e53a6960 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 80 +SUBLEVEL = 81 EXTRAVERSION = NAME = Blurry Fish Butt From 025bb7f7e92c2dc3320eefb5676e1813e3c8fa1d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 1 Aug 2017 13:22:32 -0700 Subject: [PATCH 60/76] tcp: avoid setting cwnd to invalid ssthresh after cwnd reduction states [ Upstream commit ed254971edea92c3ac5c67c6a05247a92aa6075e ] If the sender switches the congestion control during ECN-triggered cwnd-reduction state (CA_CWR), upon exiting recovery cwnd is set to the ssthresh value calculated by the previous congestion control. If the previous congestion control is BBR that always keep ssthresh to TCP_INIFINITE_SSTHRESH, cwnd ends up being infinite. The safe step is to avoid assigning invalid ssthresh value when recovery ends. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8f13b2eaabf8..f0dabd125c43 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2503,8 +2503,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_time_stamp; } From 4e0675f44b891b10108bacf898c6dac07e99492d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Aug 2017 23:10:46 -0700 Subject: [PATCH 61/76] net: fix keepalive code vs TCP_FASTOPEN_CONNECT [ Upstream commit 2dda640040876cd8ae646408b69eea40c24f9ae9 ] syzkaller was able to trigger a divide by 0 in TCP stack [1] Issue here is that keepalive timer needs to be updated to not attempt to send a probe if the connection setup was deferred using TCP_FASTOPEN_CONNECT socket option added in linux-4.11 [1] divide error: 0000 [#1] SMP CPU: 18 PID: 0 Comm: swapper/18 Not tainted task: ffff986f62f4b040 ti: ffff986f62fa2000 task.ti: ffff986f62fa2000 RIP: 0010:[] [] __tcp_select_window+0x8d/0x160 Call Trace: [] tcp_transmit_skb+0x11/0x20 [] tcp_xmit_probe_skb+0xc1/0xe0 [] tcp_write_wakeup+0x68/0x160 [] tcp_keepalive_timer+0x17b/0x230 [] call_timer_fn+0x39/0xf0 [] run_timer_softirq+0x1d7/0x280 [] __do_softirq+0xcb/0x257 [] irq_exit+0x9c/0xb0 [] smp_apic_timer_interrupt+0x6a/0x80 [] apic_timer_interrupt+0x7f/0x90 [] ? cpuidle_enter_state+0x13a/0x3b0 [] ? cpuidle_enter_state+0x11d/0x3b0 Tested: Following packetdrill no longer crashes the kernel `echo 0 >/proc/sys/net/ipv4/tcp_timestamps` // Cache warmup: send a Fast Open cookie request 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress) +0 > S 0:0(0) +.01 < S. 123:123(0) ack 1 win 14600 +0 > . 1:1(0) ack 1 +0 close(3) = 0 +0 > F. 1:1(0) ack 1 +0 < F. 1:1(0) ack 2 win 92 +0 > . 2:2(0) ack 2 +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 setsockopt(4, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0 +.01 connect(4, ..., ...) = 0 +0 setsockopt(4, SOL_TCP, TCP_KEEPIDLE, [5], 4) = 0 +10 close(4) = 0 `echo 1 >/proc/sys/net/ipv4/tcp_timestamps` Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Wei Wang Cc: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ebb34d0c5e80..1ec12a4f327e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -606,7 +606,8 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); From d0da2877d421d7270ca876adc64060ab29a2fde5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Aug 2017 14:20:54 +0200 Subject: [PATCH 62/76] bpf, s390: fix jit branch offset related to ldimm64 [ Upstream commit b0a0c2566f28e71e5e32121992ac8060cec75510 ] While testing some other work that required JIT modifications, I run into test_bpf causing a hang when JIT enabled on s390. The problematic test case was the one from ddc665a4bb4b (bpf, arm64: fix jit branch offset related to ldimm64), and turns out that we do have a similar issue on s390 as well. In bpf_jit_prog() we update next instruction address after returning from bpf_jit_insn() with an insn_count. bpf_jit_insn() returns either -1 in case of error (e.g. unsupported insn), 1 or 2. The latter is only the case for ldimm64 due to spanning 2 insns, however, next address is only set to i + 1 not taking actual insn_count into account, thus fix is to use insn_count instead of 1. bpf_jit_enable in mode 2 provides also disasm on s390: Before fix: 000003ff800349b6: a7f40003 brc 15,3ff800349bc ; target 000003ff800349ba: 0000 unknown 000003ff800349bc: e3b0f0700024 stg %r11,112(%r15) 000003ff800349c2: e3e0f0880024 stg %r14,136(%r15) 000003ff800349c8: 0db0 basr %r11,%r0 000003ff800349ca: c0ef00000000 llilf %r14,0 000003ff800349d0: e320b0360004 lg %r2,54(%r11) 000003ff800349d6: e330b03e0004 lg %r3,62(%r11) 000003ff800349dc: ec23ffeda065 clgrj %r2,%r3,10,3ff800349b6 ; jmp 000003ff800349e2: e3e0b0460004 lg %r14,70(%r11) 000003ff800349e8: e3e0b04e0004 lg %r14,78(%r11) 000003ff800349ee: b904002e lgr %r2,%r14 000003ff800349f2: e3b0f0700004 lg %r11,112(%r15) 000003ff800349f8: e3e0f0880004 lg %r14,136(%r15) 000003ff800349fe: 07fe bcr 15,%r14 After fix: 000003ff80ef3db4: a7f40003 brc 15,3ff80ef3dba 000003ff80ef3db8: 0000 unknown 000003ff80ef3dba: e3b0f0700024 stg %r11,112(%r15) 000003ff80ef3dc0: e3e0f0880024 stg %r14,136(%r15) 000003ff80ef3dc6: 0db0 basr %r11,%r0 000003ff80ef3dc8: c0ef00000000 llilf %r14,0 000003ff80ef3dce: e320b0360004 lg %r2,54(%r11) 000003ff80ef3dd4: e330b03e0004 lg %r3,62(%r11) 000003ff80ef3dda: ec230006a065 clgrj %r2,%r3,10,3ff80ef3de6 ; jmp 000003ff80ef3de0: e3e0b0460004 lg %r14,70(%r11) 000003ff80ef3de6: e3e0b04e0004 lg %r14,78(%r11) ; target 000003ff80ef3dec: b904002e lgr %r2,%r14 000003ff80ef3df0: e3b0f0700004 lg %r11,112(%r15) 000003ff80ef3df6: e3e0f0880004 lg %r14,136(%r15) 000003ff80ef3dfc: 07fe bcr 15,%r14 test_bpf.ko suite runs fine after the fix. Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend") Signed-off-by: Daniel Borkmann Tested-by: Michael Holzheu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/s390/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0e2919dd8df3..1395eeb6005f 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1250,7 +1250,8 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) insn_count = bpf_jit_insn(jit, fp, i); if (insn_count < 0) return -1; - jit->addrs[i + 1] = jit->prg; /* Next instruction address */ + /* Next instruction address */ + jit->addrs[i + insn_count] = jit->prg; } bpf_jit_epilogue(jit); From 40fc2b4451a283ab9c46e82a6f43d978e47ce41f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 9 Aug 2017 18:15:19 +0800 Subject: [PATCH 63/76] net: sched: set xt_tgchk_param par.nft_compat as 0 in ipt_init_target [ Upstream commit 96d9703050a0036a3360ec98bb41e107c90664fe ] Commit 55917a21d0cc ("netfilter: x_tables: add context to know if extension runs from nft_compat") introduced a member nft_compat to xt_tgchk_param structure. But it didn't set it's value for ipt_init_target. With unexpected value in par.nft_compat, it may return unexpected result in some target's checkentry. This patch is to set all it's fields as 0 and only initialize the non-zero fields in ipt_init_target. v1->v2: As Wang Cong's suggestion, fix it by setting all it's fields as 0 and only initializing the non-zero fields. Fixes: 55917a21d0cc ("netfilter: x_tables: add context to know if extension runs from nft_compat") Suggested-by: Cong Wang Signed-off-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_ipt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index d05869646515..0915d448ba23 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -42,8 +42,8 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int return PTR_ERR(target); t->u.kernel.target = target; + memset(&par, 0, sizeof(par)); par.table = table; - par.entryinfo = NULL; par.target = target; par.targinfo = t->data; par.hook_mask = hook; From 8607d550847f4bab5e51c078865cfabd88ffdabb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2017 01:41:58 -0700 Subject: [PATCH 64/76] tcp: fastopen: tcp_connect() must refresh the route [ Upstream commit 8ba60924710cde564a3905588b6219741d6356d0 ] With new TCP_FASTOPEN_CONNECT socket option, there is a possibility to call tcp_connect() while socket sk_dst_cache is either NULL or invalid. +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(4, ..., ...) = 0 << sk->sk_dst_cache becomes obsolete, or even set to NULL >> +1 sendto(4, ..., 1000, MSG_FASTOPEN, ..., ...) = 1000 We need to refresh the route otherwise bad things can happen, especially when syzkaller is running on the host :/ Fixes: 19f6d3f3c8422 ("net/tcp-fastopen: Add new API support") Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Cc: Wei Wang Cc: Yuchung Cheng Acked-by: Wei Wang Acked-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3fdcdc730f71..850d1b5bfd81 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3256,6 +3256,9 @@ int tcp_connect(struct sock *sk) struct sk_buff *buff; int err; + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk); if (unlikely(tp->repair)) { From 37d5c6e8d38d674b1c25741fdf033f7f00b5ed5f Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Aug 2017 14:22:55 -0400 Subject: [PATCH 65/76] net: avoid skb_warn_bad_offload false positives on UFO [ Upstream commit 8d63bee643f1fb53e472f0e135cae4eb99d62d19 ] skb_warn_bad_offload triggers a warning when an skb enters the GSO stack at __skb_gso_segment that does not have CHECKSUM_PARTIAL checksum offload set. Commit b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") observed that SKB_GSO_DODGY producers can trigger the check and that passing those packets through the GSO handlers will fix it up. But, the software UFO handler will set ip_summed to CHECKSUM_NONE. When __skb_gso_segment is called from the receive path, this triggers the warning again. Make UFO set CHECKSUM_UNNECESSARY instead of CHECKSUM_NONE. On Tx these two are equivalent. On Rx, this better matches the skb state (checksum computed), as CHECKSUM_NONE here means no checksum computed. See also this thread for context: http://patchwork.ozlabs.org/patch/799015/ Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 4b0853194a03..24d243084aab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2551,7 +2551,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6396f1c80ae9..6dfc3daf7c21 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -231,7 +231,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 01582966ffa0..2e3c12eeca07 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* Check if there is enough headroom to insert fragment header. */ tnl_hlen = skb_tnl_header_len(skb); From 63364a508d24944abb0975bd823cb11367c56283 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Aug 2017 12:41:58 -0400 Subject: [PATCH 66/76] packet: fix tp_reserve race in packet_set_ring [ Upstream commit c27927e372f0785f3303e8fad94b85945e2c97b7 ] Updates to tp_reserve can race with reads of the field in packet_set_ring. Avoid this by holding the socket lock during updates in setsockopt PACKET_RESERVE. This bug was discovered by syzkaller. Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt") Reported-by: Andrey Konovalov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 061771ca2582..148ec130d99d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3622,14 +3622,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv if (optlen != sizeof(val)) return -EINVAL; - if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) - return -EBUSY; if (copy_from_user(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; - po->tp_reserve = val; - return 0; + lock_sock(sk); + if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { + ret = -EBUSY; + } else { + po->tp_reserve = val; + ret = 0; + } + release_sock(sk); + return ret; } case PACKET_LOSS: { From 54fc0c32307d6805304858be92994b76a0b5b0d6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 11 Aug 2017 09:14:09 -0700 Subject: [PATCH 67/76] revert "net: account for current skb length when deciding about UFO" This reverts commit ef09c9ff343122a0b245416066992d096416ff19 which is commit a5cb659bbc1c8644efa0c3138a757a1e432a4880 upstream as it causes merge issues with later patches that are much more important... Cc: Michal Kubecek Cc: Vlad Yasevich Cc: David S. Miller Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 3 +-- net/ipv6/ip6_output.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 5d58a6703a43..f5c62d0a7453 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,8 +922,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + (skb ? skb->len : fragheaderlen)) > mtu) || - (skb && skb_is_gso(skb))) && + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0de3245ea42f..a7a5790e28e4 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1357,7 +1357,7 @@ emsgsize: */ cork->length += length; - if ((((length + (skb ? skb->len : headersize)) > mtu) || + if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && From 98c1ad1edfe88f51123aeee0857fa9de5962e328 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 11 Aug 2017 09:19:02 -0700 Subject: [PATCH 68/76] revert "ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output" This reverts commit f102bb7164c9020e12662998f0fd99c3be72d4f6 which is commit 0a28cfd51e17f4f0a056bcf66bfbe492c3b99f38 upstream as there is another patch that needs to be applied instead of this one. Cc: Zheng Li Cc: David S. Miller Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f5c62d0a7453..2b7283303650 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,7 +922,7 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) && + if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { From 938990d2433cdecd225e1ab54a442b3ffdce1f87 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 10 Aug 2017 12:29:19 -0400 Subject: [PATCH 69/76] udp: consistently apply ufo or fragmentation [ Upstream commit 85f1bd9a7b5a79d5baa8bf44af19658f7bf77bfa ] When iteratively building a UDP datagram with MSG_MORE and that datagram exceeds MTU, consistently choose UFO or fragmentation. Once skb_is_gso, always apply ufo. Conversely, once a datagram is split across multiple skbs, do not consider ufo. Sendpage already maintains the first invariant, only add the second. IPv6 does not have a sendpage implementation to modify. A gso skb must have a partial checksum, do not follow sk_no_check_tx in udp_send_skb. Found by syzkaller. Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Reported-by: Andrey Konovalov Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 7 +++++-- net/ipv4/udp.c | 2 +- net/ipv6/ip6_output.c | 7 ++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 2b7283303650..f3403a3ce290 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -922,10 +922,12 @@ static int __ip_append_data(struct sock *sk, csummode = CHECKSUM_PARTIAL; cork->length += length; - if (((length > mtu) || (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + ((length > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && - (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) { + (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx)) { err = ip_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, transhdrlen, maxfraglen, flags); @@ -1241,6 +1243,7 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, return -EINVAL; if ((size + skb->len > mtu) && + (skb_queue_len(&sk->sk_write_queue) == 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO)) { if (skb->ip_summed != CHECKSUM_PARTIAL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e9513e397c4f..301e60829c7e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -819,7 +819,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); - else if (sk->sk_no_check_tx) { /* UDP csum disabled */ + else if (sk->sk_no_check_tx && !skb_is_gso(skb)) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a7a5790e28e4..b725efc0d4ea 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1357,11 +1357,12 @@ emsgsize: */ cork->length += length; - if ((((length + fragheaderlen) > mtu) || - (skb && skb_is_gso(skb))) && + if ((skb && skb_is_gso(skb)) || + (((length + fragheaderlen) > mtu) && + (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && - (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { + (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, transhdrlen, mtu, flags, fl6); From 6fe71ca3cb3c910e71cbf4ce1a9c35dd010eb815 Mon Sep 17 00:00:00 2001 From: Rob Gardner Date: Mon, 17 Jul 2017 09:22:27 -0600 Subject: [PATCH 70/76] sparc64: Prevent perf from running during super critical sections commit fc290a114fc6034b0f6a5a46e2fb7d54976cf87a upstream. This fixes another cause of random segfaults and bus errors that may occur while running perf with the callgraph option. Critical sections beginning with spin_lock_irqsave() raise the interrupt level to PIL_NORMAL_MAX (14) and intentionally do not block performance counter interrupts, which arrive at PIL_NMI (15). But some sections of code are "super critical" with respect to perf because the perf_callchain_user() path accesses user space and may cause TLB activity as well as faults as it unwinds the user stack. One particular critical section occurs in switch_mm: spin_lock_irqsave(&mm->context.lock, flags); ... load_secondary_context(mm); tsb_context_switch(mm); ... spin_unlock_irqrestore(&mm->context.lock, flags); If a perf interrupt arrives in between load_secondary_context() and tsb_context_switch(), then perf_callchain_user() could execute with the context ID of one process, but with an active TSB for a different process. When the user stack is accessed, it is very likely to incur a TLB miss, since the h/w context ID has been changed. The TLB will then be reloaded with a translation from the TSB for one process, but using a context ID for another process. This exposes memory from one process to another, and since it is a mapping for stack memory, this usually causes the new process to crash quickly. This super critical section needs more protection than is provided by spin_lock_irqsave() since perf interrupts must not be allowed in. Since __tsb_context_switch already goes through the trouble of disabling interrupts completely, we fix this by moving the secondary context load down into this better protected region. Orabug: 25577560 Signed-off-by: Dave Aldridge Signed-off-by: Rob Gardner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/sparc/include/asm/mmu_context_64.h | 14 +++++++++----- arch/sparc/kernel/tsb.S | 12 ++++++++++++ arch/sparc/power/hibernate.c | 3 +-- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index 349dd23e2876..0cdeb2b483a0 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -25,9 +25,11 @@ void destroy_context(struct mm_struct *mm); void __tsb_context_switch(unsigned long pgd_pa, struct tsb_config *tsb_base, struct tsb_config *tsb_huge, - unsigned long tsb_descr_pa); + unsigned long tsb_descr_pa, + unsigned long secondary_ctx); -static inline void tsb_context_switch(struct mm_struct *mm) +static inline void tsb_context_switch_ctx(struct mm_struct *mm, + unsigned long ctx) { __tsb_context_switch(__pa(mm->pgd), &mm->context.tsb_block[0], @@ -38,9 +40,12 @@ static inline void tsb_context_switch(struct mm_struct *mm) #else NULL #endif - , __pa(&mm->context.tsb_descr[0])); + , __pa(&mm->context.tsb_descr[0]), + ctx); } +#define tsb_context_switch(X) tsb_context_switch_ctx(X, 0) + void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long mm_rss); @@ -110,8 +115,7 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str * cpu0 to update it's TSB because at that point the cpu_vm_mask * only had cpu1 set in it. */ - load_secondary_context(mm); - tsb_context_switch(mm); + tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context)); /* Any time a processor runs a context on an address space * for the first time, we must flush that context out of the diff --git a/arch/sparc/kernel/tsb.S b/arch/sparc/kernel/tsb.S index 395ec1800530..7d961f6e3907 100644 --- a/arch/sparc/kernel/tsb.S +++ b/arch/sparc/kernel/tsb.S @@ -375,6 +375,7 @@ tsb_flush: * %o1: TSB base config pointer * %o2: TSB huge config pointer, or NULL if none * %o3: Hypervisor TSB descriptor physical address + * %o4: Secondary context to load, if non-zero * * We have to run this whole thing with interrupts * disabled so that the current cpu doesn't change @@ -387,6 +388,17 @@ __tsb_context_switch: rdpr %pstate, %g1 wrpr %g1, PSTATE_IE, %pstate + brz,pn %o4, 1f + mov SECONDARY_CONTEXT, %o5 + +661: stxa %o4, [%o5] ASI_DMMU + .section .sun4v_1insn_patch, "ax" + .word 661b + stxa %o4, [%o5] ASI_MMU + .previous + flush %g6 + +1: TRAP_LOAD_TRAP_BLOCK(%g2, %g3) stx %o0, [%g2 + TRAP_PER_CPU_PGD_PADDR] diff --git a/arch/sparc/power/hibernate.c b/arch/sparc/power/hibernate.c index 17bd2e167e07..df707a8ad311 100644 --- a/arch/sparc/power/hibernate.c +++ b/arch/sparc/power/hibernate.c @@ -35,6 +35,5 @@ void restore_processor_state(void) { struct mm_struct *mm = current->active_mm; - load_secondary_context(mm); - tsb_context_switch(mm); + tsb_context_switch_ctx(mm, CTX_HWBITS(mm->context)); } From 7e86f2d55f66e0026aa70ea268021df6bf294c5b Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 5 Jul 2017 09:57:00 +0100 Subject: [PATCH 71/76] KVM: arm/arm64: Handle hva aging while destroying the vm commit 7e5a672289c9754d07e1c3b33649786d3d70f5e4 upstream. The mmu_notifier_release() callback of KVM triggers cleaning up the stage2 page table on kvm-arm. However there could be other notifier callbacks in parallel with the mmu_notifier_release(), which could cause the call backs ending up in an empty stage2 page table. Make sure we check it for all the notifier callbacks. Fixes: commit 293f29363 ("kvm-arm: Unmap shadow pagetables properly") Reported-by: Alex Graf Reviewed-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 1f1ff7e7b9cf..ba079e279b58 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1629,12 +1629,16 @@ static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) { + if (!kvm->arch.pgd) + return 0; trace_kvm_age_hva(start, end); return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL); } int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) { + if (!kvm->arch.pgd) + return 0; trace_kvm_test_age_hva(hva); return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL); } From d45aabadbcb967d3b01451732f65da9ff7315450 Mon Sep 17 00:00:00 2001 From: Matthew Dawson Date: Fri, 11 Mar 2016 13:08:07 -0800 Subject: [PATCH 72/76] mm/mempool: avoid KASAN marking mempool poison checks as use-after-free commit 7640131032db9118a78af715ac77ba2debeeb17c upstream. When removing an element from the mempool, mark it as unpoisoned in KASAN before verifying its contents for SLUB/SLAB debugging. Otherwise KASAN will flag the reads checking the element use-after-free writes as use-after-free reads. Signed-off-by: Matthew Dawson Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Andrii Bordunov Signed-off-by: Greg Kroah-Hartman --- mm/mempool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempool.c b/mm/mempool.c index 004d42b1dfaf..7924f4f58a6d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool) void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - check_element(pool, element); kasan_unpoison_element(pool, element); + check_element(pool, element); return element; } From 96cdeaa3af8f310c52489fc9342c1b2d32aa7678 Mon Sep 17 00:00:00 2001 From: zheng li Date: Mon, 12 Dec 2016 09:56:05 +0800 Subject: [PATCH 73/76] ipv4: Should use consistent conditional judgement for ip fragment in __ip_append_data and ip_finish_output commit 0a28cfd51e17f4f0a056bcf66bfbe492c3b99f38 upstream. There is an inconsistent conditional judgement in __ip_append_data and ip_finish_output functions, the variable length in __ip_append_data just include the length of application's payload and udp header, don't include the length of ip header, but in ip_finish_output use (skb->len > ip_skb_dst_mtu(skb)) as judgement, and skb->len include the length of ip header. That causes some particular application's udp payload whose length is between (MTU - IP Header) and MTU were fragmented by ip_fragment even though the rst->dev support UFO feature. Add the length of ip header to length in __ip_append_data to keep consistent conditional judgement as ip_finish_output for ip fragment. Signed-off-by: Zheng Li Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f3403a3ce290..0efa401c39f4 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -923,7 +923,7 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if ((skb && skb_is_gso(skb)) || - ((length > mtu) && + (((length + fragheaderlen) > mtu) && (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && From fab61468402b5189bb843cff96066693d03a716b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= Date: Mon, 19 Jun 2017 13:03:43 +0200 Subject: [PATCH 74/76] net: account for current skb length when deciding about UFO commit a5cb659bbc1c8644efa0c3138a757a1e432a4880 upstream. Our customer encountered stuck NFS writes for blocks starting at specific offsets w.r.t. page boundary caused by networking stack sending packets via UFO enabled device with wrong checksum. The problem can be reproduced by composing a long UDP datagram from multiple parts using MSG_MORE flag: sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 1000, MSG_MORE, ...); sendto(sd, buff, 3000, 0, ...); Assume this packet is to be routed via a device with MTU 1500 and NETIF_F_UFO enabled. When second sendto() gets into __ip_append_data(), this condition is tested (among others) to decide whether to call ip_ufo_append_data(): ((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb)) At the moment, we already have skb with 1028 bytes of data which is not marked for GSO so that the test is false (fragheaderlen is usually 20). Thus we append second 1000 bytes to this skb without invoking UFO. Third sendto(), however, has sufficient length to trigger the UFO path so that we end up with non-UFO skb followed by a UFO one. Later on, udp_send_skb() uses udp_csum() to calculate the checksum but that assumes all fragments have correct checksum in skb->csum which is not true for UFO fragments. When checking against MTU, we need to add skb->len to length of new segment if we already have a partially filled skb and fragheaderlen only if there isn't one. In the IPv6 case, skb can only be null if this is the first segment so that we have to use headersize (length of the first IPv6 header) rather than fragheaderlen (length of IPv6 header of further fragments) for skb == NULL. Fixes: e89e9cf539a2 ("[IPv4/IPv6]: UFO Scatter-gather approach") Fixes: e4c5e13aa45c ("ipv6: Should use consistent conditional judgement for ip6 fragment between __ip6_append_data and ip6_finish_output") Signed-off-by: Michal Kubecek Acked-by: Vlad Yasevich Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0efa401c39f4..09c73dd541c5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -923,7 +923,7 @@ static int __ip_append_data(struct sock *sk, cork->length += length; if ((skb && skb_is_gso(skb)) || - (((length + fragheaderlen) > mtu) && + (((length + (skb ? skb->len : fragheaderlen)) > mtu) && (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b725efc0d4ea..e22339fad10b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1358,7 +1358,7 @@ emsgsize: cork->length += length; if ((skb && skb_is_gso(skb)) || - (((length + fragheaderlen) > mtu) && + (((length + (skb ? skb->len : headersize)) > mtu) && (skb_queue_len(queue) <= 1) && (sk->sk_protocol == IPPROTO_UDP) && (rt->dst.dev->features & NETIF_F_UFO) && From 4e2e415f4cc11da3d01d6b9634eae09688e852c5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 12 Aug 2017 19:29:34 -0700 Subject: [PATCH 75/76] Linux 4.4.82 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d049e53a6960..52f2dd8dcebd 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 81 +SUBLEVEL = 82 EXTRAVERSION = NAME = Blurry Fish Butt From 8db423d76122953a279cc02e45a43769c82164e7 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 21 Mar 2016 11:00:17 +0000 Subject: [PATCH 76/76] kvm: arm64: Disable compiler instrumentation for hypervisor code With the recent rewrite of the arm64 KVM hypervisor code in C, enabling certain options like KASAN would allow the compiler to generate memory accesses or function calls to addresses not mapped at EL2. This patch disables the compiler instrumentation on the arm64 hypervisor code for gcov-based profiling (GCOV_KERNEL), undefined behaviour sanity checker (UBSAN) and kernel address sanitizer (KASAN). Signed-off-by: Catalin Marinas Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: # 4.5+ Signed-off-by: Christoffer Dall (cherry picked from commit a6cdf1c08cbfe0818a3d8042844d75bf74fd82bd) Signed-off-by: Alex Shi Conflicts: arch/arm64/kvm/hyp/Makefile --- arch/arm64/kvm/hyp/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 826032bc3945..acde4782621a 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -12,3 +12,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o + +GCOV_PROFILE := n +KASAN_SANITIZE := n +UBSAN_SANITIZE := n