From a6c5729b0ae1164326c8899a67a40cbe4325e82e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 31 Jan 2020 09:28:33 +0100 Subject: [PATCH 01/80] openrisc: configs: Cleanup CONFIG_CROSS_COMPILE CONFIG_CROSS_COMPILE is gone since commit f1089c92da79 ("kbuild: remove CONFIG_CROSS_COMPILE support"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Stafford Horne --- Documentation/openrisc/openrisc_port.rst | 4 ++-- arch/openrisc/configs/or1ksim_defconfig | 1 - arch/openrisc/configs/simple_smp_defconfig | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Documentation/openrisc/openrisc_port.rst b/Documentation/openrisc/openrisc_port.rst index a18747a8d191..4b2c437942a0 100644 --- a/Documentation/openrisc/openrisc_port.rst +++ b/Documentation/openrisc/openrisc_port.rst @@ -37,8 +37,8 @@ or Stafford's toolchain build and release scripts. Build the Linux kernel as usual:: - make ARCH=openrisc defconfig - make ARCH=openrisc + make ARCH=openrisc CROSS_COMPILE="or1k-linux-" defconfig + make ARCH=openrisc CROSS_COMPILE="or1k-linux-" 3) Running on FPGA (optional) diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig index d8ff4f8ffb88..75f2da324d0e 100644 --- a/arch/openrisc/configs/or1ksim_defconfig +++ b/arch/openrisc/configs/or1ksim_defconfig @@ -1,4 +1,3 @@ -CONFIG_CROSS_COMPILE="or1k-linux-" CONFIG_NO_HZ=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig index 64278992df9c..ff49d868e040 100644 --- a/arch/openrisc/configs/simple_smp_defconfig +++ b/arch/openrisc/configs/simple_smp_defconfig @@ -1,4 +1,3 @@ -CONFIG_CROSS_COMPILE="or1k-linux-" CONFIG_LOCALVERSION="-simple-smp" CONFIG_NO_HZ=y CONFIG_LOG_BUF_SHIFT=14 From fc74d716600545304a066bfd9d54cbd07e531701 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Dec 2019 16:42:58 +0100 Subject: [PATCH 02/80] openrisc: use mmgrab Mmgrab was introduced in 
commit f1f1007644ff ("mm: add new mmgrab() helper") and most of the kernel was updated to use it. Update a remaining file. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) @@ expression e; @@ - atomic_inc(&e->mm_count); + mmgrab(e); Signed-off-by: Julia Lawall Signed-off-by: Stafford Horne [shorne: Added missing sched/mm.h include] --- arch/openrisc/kernel/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 7d518ee8bddc..bd1e660bbc89 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -113,7 +114,7 @@ asmlinkage __init void secondary_start_kernel(void) * All kernel threads share the same mm context; grab a * reference and switch to it. */ - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); From 0d4e1bb1062481c43c01dd20fd4c40544441739f Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 25 Feb 2020 19:04:17 +0900 Subject: [PATCH 03/80] openrisc: Convert copy_thread to copy_thread_tls This is required for clone3 which passes the TLS value through a struct rather than a register. 
Signed-off-by: Stafford Horne Acked-by: Christian Brauner --- arch/openrisc/Kconfig | 1 + arch/openrisc/kernel/process.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 1928e061ff96..5debdbe6fc35 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -14,6 +14,7 @@ config OPENRISC select HANDLE_DOMAIN_IRQ select GPIOLIB select HAVE_ARCH_TRACEHOOK + select HAVE_COPY_THREAD_TLS select SPARSE_IRQ select GENERIC_IRQ_CHIP select GENERIC_IRQ_PROBE diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index b06f84f6676f..5caa47f7de4f 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -117,12 +117,13 @@ void release_thread(struct task_struct *dead_task) extern asmlinkage void ret_from_fork(void); /* - * copy_thread + * copy_thread_tls * @clone_flags: flags * @usp: user stack pointer or fn for kernel thread * @arg: arg to fn for kernel thread; always NULL for userspace thread * @p: the newly created task * @regs: CPU context to copy for userspace thread; always NULL for kthread + * @tls: the Thread Local Storage pointer for the new process * * At the top of a newly initialized kernel stack are two stacked pt_reg * structures. The first (topmost) is the userspace context of the thread. @@ -148,8 +149,8 @@ extern asmlinkage void ret_from_fork(void); */ int -copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p) +copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *userregs; struct pt_regs *kregs; @@ -179,16 +180,10 @@ copy_thread(unsigned long clone_flags, unsigned long usp, userregs->sp = usp; /* - * For CLONE_SETTLS set "tp" (r10) to the TLS pointer passed to sys_clone. 
- * - * The kernel entry is: - * int clone (long flags, void *child_stack, int *parent_tid, - * int *child_tid, struct void *tls) - * - * This makes the source r7 in the kernel registers. + * For CLONE_SETTLS set "tp" (r10) to the TLS pointer. */ if (clone_flags & CLONE_SETTLS) - userregs->gpr[10] = userregs->gpr[7]; + userregs->gpr[10] = tls; userregs->gpr[11] = 0; /* Result from fork() */ From 07e83dfbe16cd882afc1d221f4ef3a85b9818f9a Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 25 Feb 2020 19:04:45 +0900 Subject: [PATCH 04/80] openrisc: Enable the clone3 syscall Enable the clone3 syscall for OpenRISC. We use the generic version. This was tested with the clone3 test from selftests. Note, for all tests to pass it required enabling CONFIG_NAMESPACES which is not enabled in the default OpenRISC kernel config. Signed-off-by: Stafford Horne Acked-by: Christian Brauner --- arch/openrisc/include/uapi/asm/unistd.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index 566f8c4f8047..fae34c60fa88 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -24,6 +24,7 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_TIME32_SYSCALLS #include From d95b56c77efdafec14d767bbf9730ff55907667d Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 25 Feb 2020 19:06:12 +0900 Subject: [PATCH 05/80] openrisc: Cleanup copy_thread_tls docs and comments Previously copy_thread_tls was copy_thread and before that something else. Remove the documentation about the regs parameter that didn't exist in either version. Next, fix comment wrapping and details about how TLS pointer gets to the copy_thread_tls function. 
Signed-off-by: Stafford Horne Acked-by: Christian Brauner --- arch/openrisc/kernel/process.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 5caa47f7de4f..6bcdca424e11 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -122,7 +122,6 @@ extern asmlinkage void ret_from_fork(void); * @usp: user stack pointer or fn for kernel thread * @arg: arg to fn for kernel thread; always NULL for userspace thread * @p: the newly created task - * @regs: CPU context to copy for userspace thread; always NULL for kthread * @tls: the Thread Local Storage pointer for the new process * * At the top of a newly initialized kernel stack are two stacked pt_reg From 7c8c5673f6cf1f35d5ffb1df3095526d28e6730a Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Tue, 18 Feb 2020 20:07:16 +0000 Subject: [PATCH 06/80] pcmcia: omap: remove useless cast for driver.name device_driver name is a const char pointer, so it is not useful to cast driver_name (which is already const char). Signed-off-by: Corentin Labbe Signed-off-by: Dominik Brodowski --- drivers/pcmcia/omap_cf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c index 0a04eb04f3a2..d3ef5534991e 100644 --- a/drivers/pcmcia/omap_cf.c +++ b/drivers/pcmcia/omap_cf.c @@ -329,7 +329,7 @@ static int __exit omap_cf_remove(struct platform_device *pdev) static struct platform_driver omap_cf_driver = { .driver = { - .name = (char *) driver_name, + .name = driver_name, }, .remove = __exit_p(omap_cf_remove), }; From 6d3fbe919b839bae9d8ed22b4ae5a17b05cb209e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2020 10:04:26 +0100 Subject: [PATCH 07/80] pcmcia: Use scnprintf() for avoiding potential buffer overflow Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. 
Fix it by replacing with scnprintf(). Signed-off-by: Takashi Iwai Signed-off-by: Dominik Brodowski --- drivers/pcmcia/rsrc_nonstatic.c | 6 +++--- drivers/pcmcia/yenta_socket.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index 9e6922c08ef6..3b05760e69d6 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -1076,7 +1076,7 @@ static ssize_t show_io_db(struct device *dev, for (p = data->io_db.next; p != &data->io_db; p = p->next) { if (ret > (PAGE_SIZE - 10)) continue; - ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1), + ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1), "0x%08lx - 0x%08lx\n", ((unsigned long) p->base), ((unsigned long) p->base + p->num - 1)); @@ -1133,7 +1133,7 @@ static ssize_t show_mem_db(struct device *dev, p = p->next) { if (ret > (PAGE_SIZE - 10)) continue; - ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1), + ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1), "0x%08lx - 0x%08lx\n", ((unsigned long) p->base), ((unsigned long) p->base + p->num - 1)); @@ -1142,7 +1142,7 @@ static ssize_t show_mem_db(struct device *dev, for (p = data->mem_db.next; p != &data->mem_db; p = p->next) { if (ret > (PAGE_SIZE - 10)) continue; - ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1), + ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1), "0x%08lx - 0x%08lx\n", ((unsigned long) p->base), ((unsigned long) p->base + p->num - 1)); diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 49b1c6a1bdbe..bf6529b0b5b0 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -180,12 +180,12 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri for (i = 0; i < 0x24; i += 4) { unsigned val; if (!(i & 15)) - offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); val = cb_readl(socket, 
i); - offset += snprintf(buf + offset, PAGE_SIZE - offset, " %08x", val); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %08x", val); } - offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:"); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:"); for (i = 0; i < 0x45; i++) { unsigned char val; if (!(i & 7)) { @@ -193,10 +193,10 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri memcpy(buf + offset, " -", 2); offset += 2; } else - offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); } val = exca_readb(socket, i); - offset += snprintf(buf + offset, PAGE_SIZE - offset, " %02x", val); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %02x", val); } buf[offset++] = '\n'; return offset; From 9737e2c5f0bc768b58416ec070bd96c91c52a153 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Wed, 18 Mar 2020 22:10:38 +0900 Subject: [PATCH 08/80] openrisc: Remove obsolete show_trace_task function The function show_trace_task() was removed during linux 2.5 development and replaced with show_stack(). This was never implemented for openrisc but must have got in via copying from another architecture. Just remove it. Signed-off-by: Stafford Horne --- arch/openrisc/kernel/traps.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 932a8ec2b520..c11aa2e17ce0 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -55,13 +55,6 @@ void show_stack(struct task_struct *task, unsigned long *esp) unwind_stack(NULL, esp, print_trace); } -void show_trace_task(struct task_struct *tsk) -{ - /* - * TODO: SysRq-T trace dump... - */ -} - void show_registers(struct pt_regs *regs) { int i; From 1e6709b352e7432289d6384213d5222f8d8f8fdf Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Fri, 20 Mar 2020 17:12:42 -0500 Subject: [PATCH 09/80] pcmcia: cs_internal.h: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Dominik Brodowski --- drivers/pcmcia/cs_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pcmcia/cs_internal.h b/drivers/pcmcia/cs_internal.h index 33c9b6ea7364..fb9b17fa0fb5 100644 --- a/drivers/pcmcia/cs_internal.h +++ b/drivers/pcmcia/cs_internal.h @@ -40,7 +40,7 @@ struct cis_cache_entry { unsigned int addr; unsigned int len; unsigned int attr; - unsigned char cache[0]; + unsigned char cache[]; }; struct pccard_resource_ops { From af741b0bad237bf85d388b70c62dfcb2769beef0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Fri, 20 Mar 2020 17:13:23 -0500 Subject: [PATCH 10/80] pcmcia: soc_common.h: Replace zero-length array with flexible-array member The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Dominik Brodowski --- drivers/pcmcia/soc_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pcmcia/soc_common.h b/drivers/pcmcia/soc_common.h index b7f993f1bbd0..222e81c79365 100644 --- a/drivers/pcmcia/soc_common.h +++ b/drivers/pcmcia/soc_common.h @@ -88,7 +88,7 @@ struct soc_pcmcia_socket { struct skt_dev_info { int nskt; - struct soc_pcmcia_socket skt[0]; + struct soc_pcmcia_socket skt[]; }; struct pcmcia_state { From b495dfed706c4c5873c0dab8930ad6eb1d276a6c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 30 Jan 2020 20:22:26 +0100 Subject: [PATCH 11/80] um: Cleanup CONFIG_IOSCHED_CFQ CONFIG_IOSCHED_CFQ is gone since commit f382fb0bcef4 ("block: remove legacy IO schedulers"). 
The IOSCHED_BFQ seems to replace IOSCHED_CFQ so select it in configs previously choosing the latter. Signed-off-by: Krzysztof Kozlowski Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/configs/i386_defconfig | 2 +- arch/um/configs/x86_64_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index 73e98bb57bf5..fb51bd206dbe 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -26,7 +26,7 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -CONFIG_IOSCHED_CFQ=m +CONFIG_IOSCHED_BFQ=m CONFIG_SSL=y CONFIG_NULL_CHAN=y CONFIG_PORT_CHAN=y diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 3281d7600225..477b87317424 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -24,7 +24,7 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -CONFIG_IOSCHED_CFQ=m +CONFIG_IOSCHED_BFQ=m CONFIG_SSL=y CONFIG_NULL_CHAN=y CONFIG_PORT_CHAN=y From 7d7c0568285d6f5630fb269766186afe09e58dc7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 12 Feb 2020 12:02:59 +0200 Subject: [PATCH 12/80] um: Don't use console_drivers directly console_drivers is kind of (semi-)private variable to the console code. Direct use of it make us stuck with it being exported here and there. Reduce use of console_drivers by replacing it with for_each_console(). 
Cc: Thomas Meyer Signed-off-by: Andy Shevchenko Signed-off-by: Richard Weinberger --- arch/um/kernel/kmsg_dump.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c index 98bdf69e4c2e..e4abac6c9727 100644 --- a/arch/um/kernel/kmsg_dump.c +++ b/arch/um/kernel/kmsg_dump.c @@ -9,20 +9,19 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { static char line[1024]; - + struct console *con; size_t len = 0; - bool con_available = false; /* only dump kmsg when no console is available */ if (!console_trylock()) return; - if (console_drivers != NULL) - con_available = true; + for_each_console(con) + break; console_unlock(); - if (con_available == true) + if (con) return; printf("kmsg_dump:\n"); From ba758cfce00a5977ccf019926f8379a96f5ac5f6 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 19 Feb 2020 21:44:42 +0800 Subject: [PATCH 13/80] um: Fix len of file in create_pid_file sizeof gives us the size of the pointer variable, not of the area it points to. So the number of bytes copied by umid_file_name() is 8. We should pass in the correct length of the file buffer. 
Signed-off-by: Wen Yang Signed-off-by: Richard Weinberger --- arch/um/os-Linux/umid.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 44def53a11cd..9e16078a4bf8 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -220,11 +220,12 @@ static void __init create_pid_file(void) char pid[sizeof("nnnnn\0")], *file; int fd, n; - file = malloc(strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")); + n = strlen(uml_dir) + UMID_LEN + sizeof("/pid\0"); + file = malloc(n); if (!file) return; - if (umid_file_name("pid", file, sizeof(file))) + if (umid_file_name("pid", file, n)) goto out; fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644); From c2ed957c3b288cd037d2c933222edaf1e20ede26 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Feb 2020 10:40:13 +0200 Subject: [PATCH 14/80] um: Don't overwrite ethtool driver version In-tree drivers don't need to manage internal version because they are aligned to the global Linux kernel version, which is reported by default with "ethtool -i". 
Signed-off-by: Leon Romanovsky Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/net_kern.c | 1 - arch/um/drivers/vector_kern.c | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 35ebeebfc1a8..af07733c2dc8 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -266,7 +266,6 @@ static void uml_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); - strlcpy(info->version, "42", sizeof(info->version)); } static const struct ethtool_ops uml_net_ethtool_ops = { diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 0ff86391f77d..9693dfca7651 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -46,7 +46,6 @@ #define DRIVER_NAME "uml-vector" -#define DRIVER_VERSION "01" struct vector_cmd_line_arg { struct list_head list; int unit; @@ -1378,7 +1377,6 @@ static void vector_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); - strlcpy(info->version, DRIVER_VERSION, sizeof(info->version)); } static int vector_net_load_bpf_flash(struct net_device *dev, From 73343392aae8bdd939ca58e855889d1b015b0ef4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Feb 2020 10:40:13 +0200 Subject: [PATCH 15/80] um: Delete never executed timer The "#ifdef undef" construction effectively disabled the timer. As a result, the timer did nothing, so delete it. 
Signed-off-by: Leon Romanovsky Acked-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/net_kern.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index af07733c2dc8..1802cf4ef5a5 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -274,17 +274,6 @@ static const struct ethtool_ops uml_net_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; -static void uml_net_user_timer_expire(struct timer_list *t) -{ -#ifdef undef - struct uml_net_private *lp = from_timer(lp, t, tl); - struct connection *conn = &lp->user; - - dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn); - do_connect(conn); -#endif -} - void uml_net_setup_etheraddr(struct net_device *dev, char *str) { unsigned char *addr = dev->dev_addr; @@ -455,7 +444,6 @@ static void eth_configure(int n, void *init, char *mac, .add_address = transport->user->add_address, .delete_address = transport->user->delete_address }); - timer_setup(&lp->tl, uml_net_user_timer_expire, 0); spin_lock_init(&lp->lock); memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac)); From 598f5630361397c542a0ba2bec0ac5c0e1723d5c Mon Sep 17 00:00:00 2001 From: David Gow Date: Thu, 5 Mar 2020 10:39:39 -0800 Subject: [PATCH 16/80] um: Fix overlapping ELF segments when statically linked When statically linked, the .text section in UML kernels is not page aligned, causing it to share a page with the executable headers. As .text and the executable headers have different permissions, this causes the kernel to wish to map the same page twice (once as headers with r-- permissions, once as .text with r-x permissions), causing a segfault, and a nasty message printed to the host kernel's dmesg: "Uhuuh, elf segment at 0000000060000000 requested but the memory is mapped already" By aligning the .text to a page boundary (as in the dynamically linked version in dyn.lds.S), there is no such overlap, and the kernel runs correctly. 
Signed-off-by: David Gow Reviewed-by: Brendan Higgins Signed-off-by: Richard Weinberger --- arch/um/kernel/uml.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 9f21443be2c9..3b6dab3d4501 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -19,10 +19,10 @@ SECTIONS __binary_start = START; . = START + SIZEOF_HEADERS; + . = ALIGN(PAGE_SIZE); _text = .; INIT_TEXT_SECTION(0) - . = ALIGN(PAGE_SIZE); .text : { From 6e682d53fc1ef73a169e2a5300326cb23abb32ee Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Mar 2020 20:45:06 -0400 Subject: [PATCH 17/80] um: ubd: Prevent buffer overrun on command completion On the hypervisor side, when completing commands and the pipe is full, we retry writing only the entries that failed, by offsetting io_req_buffer, but we don't reduce the number of bytes written, which can cause a buffer overrun of io_req_buffer, and write garbage to the pipe. Cc: Martyn Welch Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 247f95da057b..eca45ad2166c 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1607,7 +1607,9 @@ int io_thread(void *arg) written = 0; do { - res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n); + res = os_write_file(kernel_fd, + ((char *) io_req_buffer) + written, + n - written); if (res >= 0) { written += res; } From e355b2f55efc4633409ada14125c9e28ad2de012 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Mar 2020 20:45:07 -0400 Subject: [PATCH 18/80] um: ubd: Retry buffer read on any kind of error Should bulk_req_safe_read return an error, we want to retry the read, otherwise, even though no IO will be done, os_write_file might still end up writing garbage to the pipe. 
Cc: Martyn Welch Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index eca45ad2166c..eae8c83364f7 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1592,11 +1592,11 @@ int io_thread(void *arg) &io_remainder_size, UBD_REQ_BUFFER_SIZE ); - if (n < 0) { - if (n == -EAGAIN) { + if (n <= 0) { + if (n == -EAGAIN) ubd_read_poll(-1); - continue; - } + + continue; } for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { From 35f3401317a3b26aa01fde8facfd320f2628fdcc Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 17 Mar 2020 17:35:34 +0000 Subject: [PATCH 19/80] um: falloc.h needs to be directly included for older libc When building UML with glibc 2.17 installed, compilation of arch/um/os-Linux/file.c fails due to failure to find FALLOC_FL_PUNCH_HOLE and FALLOC_FL_KEEP_SIZE definitions. It appears that /usr/include/bits/fcntl-linux.h (indirectly included by /usr/include/fcntl.h) does not include falloc.h with an older glibc, whereas a more up-to-date version does. Adding the direct include to file.c resolves the issue and does not cause problems for more recent glibc. 
Fixes: 50109b5a03b4 ("um: Add support for DISCARD in the UBD Driver") Cc: Brendan Higgins Signed-off-by: Alan Maguire Reviewed-by: Brendan Higgins Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/os-Linux/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index fbda10535dab..5c819f89b8c2 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include From b58c4e96192ee7c47d5c67853b1557306cfa0e7f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 20 Mar 2020 15:07:35 +0200 Subject: [PATCH 20/80] hostfs: Use kasprintf() instead of fixed buffer formatting Improve readability and maintainability by replacing a hardcoded string allocation and formatting by the use of the kasprintf() helper. Signed-off-by: Andy Shevchenko Signed-off-by: Richard Weinberger --- fs/hostfs/hostfs_kern.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e6b8c49076bb..c070c0d8e3e9 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino) static char *follow_link(char *link) { - int len, n; char *name, *resolved, *end; + int n; name = __getname(); if (!name) { @@ -164,15 +164,13 @@ static char *follow_link(char *link) return name; *(end + 1) = '\0'; - len = strlen(link) + strlen(name) + 1; - resolved = kmalloc(len, GFP_KERNEL); + resolved = kasprintf(GFP_KERNEL, "%s%s", link, name); if (resolved == NULL) { n = -ENOMEM; goto out_free; } - sprintf(resolved, "%s%s", link, name); __putname(name); kfree(link); return resolved; @@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; - /* NULL is printed as by sprintf: avoid that. 
*/ + /* NULL is printed as '(null)' by printf(): avoid that. */ if (req_root == NULL) req_root = ""; err = -ENOMEM; sb->s_fs_info = host_root_path = - kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root); if (host_root_path == NULL) goto out; - sprintf(host_root_path, "%s/%s", root_ino, req_root); - root_inode = new_inode(sb); if (!root_inode) goto out; From f185063bff914e589b50f1b711fc42218c4790c6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Feb 2020 14:26:44 +0100 Subject: [PATCH 21/80] um: Move timer-internal.h to non-shared This file isn't really shared, it's only used on the kernel side, not on the user side. Remove the include from the user-side and move the file to a better place. While at it, rename it to time-internal.h, it's not really just timers but all kinds of things related to timekeeping. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- .../include/{shared/timer-internal.h => linux/time-internal.h} | 0 arch/um/kernel/process.c | 2 +- arch/um/kernel/skas/syscall.c | 2 +- arch/um/kernel/time.c | 2 +- arch/um/os-Linux/time.c | 1 - 5 files changed, 3 insertions(+), 4 deletions(-) rename arch/um/include/{shared/timer-internal.h => linux/time-internal.h} (100%) diff --git a/arch/um/include/shared/timer-internal.h b/arch/um/include/linux/time-internal.h similarity index 100% rename from arch/um/include/shared/timer-internal.h rename to arch/um/include/linux/time-internal.h diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 56a094182bf5..0274ebb70977 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include /* * This is a per-cpu array. 
A processor only modifies its entry and it only diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 40d90dddf3f1..2e82820f7d29 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include void handle_syscall(struct uml_pt_regs *r) { diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 94ea87bd231c..a849d391e909 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index 432f8e1f55c2..90f6de224c70 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -14,7 +14,6 @@ #include #include #include -#include static timer_t event_high_res_timer = 0; From 4b786e24ca80a492736b359b3d1a8d07612a78e5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Feb 2020 14:26:45 +0100 Subject: [PATCH 22/80] um: time-travel: Rewrite as an event scheduler Instead of tracking all the various timer configurations, modify the time-travel mode to have an event scheduler and use a timer event on the scheduler to handle the different timer configurations. This doesn't change the function right now, but it prepares the code for having different kinds of events in the future (i.e. interrupts coming from other devices that are part of co-simulation.) While at it, also move time_travel_sleep() to time.c to reduce the externally visible API surface. Also, we really should mark time-travel as incompatible with SMP, even if UML doesn't support SMP yet. Finally, I noticed a bug while developing this - if we move time forward due to consuming time while reading the clock, we might move across the next event and that would cause us to go backward in time when we then handle that event. 
Fix that by invoking the whole event machine in this case, but in order to simplify this, make reading the clock only cost something when interrupts are not disabled. Otherwise, we'd have to hook into the interrupt delivery machinery etc. and that's somewhat intrusive. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 1 + arch/um/include/linux/time-internal.h | 65 +++----- arch/um/kernel/process.c | 37 ----- arch/um/kernel/time.c | 221 ++++++++++++++++++++++++-- 4 files changed, 229 insertions(+), 95 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 0917f8443c28..817a4c838a06 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -191,6 +191,7 @@ config UML_TIME_TRAVEL_SUPPORT prompt "Support time-travel mode (e.g. for test execution)" # inf-cpu mode is incompatible with the benchmarking depends on !RAID6_PQ_BENCHMARK + depends on !SMP help Enable this option to support time travel inside the UML instance. diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index 2d2d13c9b46f..eb1f84616edb 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -6,6 +6,7 @@ #ifndef __TIMER_INTERNAL_H__ #define __TIMER_INTERNAL_H__ +#include #define TIMER_MULTIPLIER 256 #define TIMER_MIN_DELTA 500 @@ -16,61 +17,35 @@ enum time_travel_mode { TT_MODE_INFCPU, }; -enum time_travel_timer_mode { - TT_TMR_DISABLED, - TT_TMR_ONESHOT, - TT_TMR_PERIODIC, +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +struct time_travel_event { + unsigned long long time; + void (*fn)(struct time_travel_event *d); + struct list_head list; + bool pending, onstack; }; -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT extern enum time_travel_mode time_travel_mode; -extern unsigned long long time_travel_time; -extern enum time_travel_timer_mode time_travel_timer_mode; -extern unsigned long long time_travel_timer_expiry; -extern unsigned long long time_travel_timer_interval; -static inline void 
time_travel_set_time(unsigned long long ns) -{ - time_travel_time = ns; -} +void time_travel_sleep(unsigned long long duration); -static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode) +static inline void +time_travel_set_event_fn(struct time_travel_event *e, + void (*fn)(struct time_travel_event *d)) { - time_travel_timer_mode = mode; -} - -static inline void time_travel_set_timer_expiry(unsigned long long expiry) -{ - time_travel_timer_expiry = expiry; -} - -static inline void time_travel_set_timer_interval(unsigned long long interval) -{ - time_travel_timer_interval = interval; + e->fn = fn; } #else +struct time_travel_event { +}; + #define time_travel_mode TT_MODE_OFF -#define time_travel_time 0 -#define time_travel_timer_expiry 0 -#define time_travel_timer_interval 0 -static inline void time_travel_set_time(unsigned long long ns) +static inline void time_travel_sleep(unsigned long long duration) { } -static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode) -{ -} - -static inline void time_travel_set_timer_expiry(unsigned long long expiry) -{ -} - -static inline void time_travel_set_timer_interval(unsigned long long interval) -{ -} - -#define time_travel_timer_mode TT_TMR_DISABLED -#endif - -#endif +/* this is a macro so the event/function need not exist */ +#define time_travel_set_event_fn(e, fn) do {} while (0) +#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +#endif /* __TIMER_INTERNAL_H__ */ diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 0274ebb70977..cbe33af2a880 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -203,43 +203,6 @@ void initial_thread_cb(void (*proc)(void *), void *arg) kmalloc_ok = save_kmalloc_ok; } -static void time_travel_sleep(unsigned long long duration) -{ - unsigned long long next = time_travel_time + duration; - - if (time_travel_mode != TT_MODE_INFCPU) - os_timer_disable(); - - while (time_travel_timer_mode == TT_TMR_PERIODIC && - 
time_travel_timer_expiry < time_travel_time) - time_travel_set_timer_expiry(time_travel_timer_expiry + - time_travel_timer_interval); - - if (time_travel_timer_mode != TT_TMR_DISABLED && - time_travel_timer_expiry < next) { - if (time_travel_timer_mode == TT_TMR_ONESHOT) - time_travel_set_timer_mode(TT_TMR_DISABLED); - /* - * In basic mode, time_travel_time will be adjusted in - * the timer IRQ handler so it works even when the signal - * comes from the OS timer, see there. - */ - if (time_travel_mode != TT_MODE_BASIC) - time_travel_set_time(time_travel_timer_expiry); - - deliver_alarm(); - } else { - time_travel_set_time(next); - } - - if (time_travel_mode != TT_MODE_INFCPU) { - if (time_travel_timer_mode == TT_TMR_PERIODIC) - os_timer_set_interval(time_travel_timer_interval); - else if (time_travel_timer_mode == TT_TMR_ONESHOT) - os_timer_one_shot(time_travel_timer_expiry - next); - } -} - static void um_idle_sleep(void) { unsigned long long duration = UM_NSEC_PER_SEC; diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index a849d391e909..cdebe96308d7 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -4,6 +4,7 @@ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2012-2014 Cisco Systems * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2019 Intel Corporation */ #include @@ -23,16 +24,201 @@ #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT enum time_travel_mode time_travel_mode; -unsigned long long time_travel_time; -enum time_travel_timer_mode time_travel_timer_mode; -unsigned long long time_travel_timer_expiry; -unsigned long long time_travel_timer_interval; static bool time_travel_start_set; static unsigned long long time_travel_start; -#else +static unsigned long long time_travel_time; +static LIST_HEAD(time_travel_events); +static unsigned long long time_travel_timer_interval; +static unsigned long long time_travel_next_event; +static struct time_travel_event time_travel_timer_event; + 
+static void time_travel_set_time(unsigned long long ns) +{ + if (unlikely(ns < time_travel_time)) + panic("time-travel: time goes backwards %lld -> %lld\n", + time_travel_time, ns); + time_travel_time = ns; +} + +static struct time_travel_event *time_travel_first_event(void) +{ + return list_first_entry_or_null(&time_travel_events, + struct time_travel_event, + list); +} + +static void __time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + struct time_travel_event *tmp; + bool inserted = false; + + if (WARN(time_travel_mode == TT_MODE_BASIC && + e != &time_travel_timer_event, + "only timer events can be handled in basic mode")) + return; + + if (e->pending) + return; + + e->pending = true; + e->time = time; + + list_for_each_entry(tmp, &time_travel_events, list) { + /* + * Add the new entry before one with higher time, + * or if they're equal and both on stack, because + * in that case we need to unwind the stack in the + * right order, and the later event (timer sleep + * or such) must be dequeued first. 
+ */ + if ((tmp->time > e->time) || + (tmp->time == e->time && tmp->onstack && e->onstack)) { + list_add_tail(&e->list, &tmp->list); + inserted = true; + break; + } + } + + if (!inserted) + list_add_tail(&e->list, &time_travel_events); + + tmp = time_travel_first_event(); + time_travel_next_event = tmp->time; +} + +static void time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + if (WARN_ON(!e->fn)) + return; + + __time_travel_add_event(e, time); +} + +void time_travel_periodic_timer(struct time_travel_event *e) +{ + time_travel_add_event(&time_travel_timer_event, + time_travel_time + time_travel_timer_interval); + deliver_alarm(); +} + +static void time_travel_deliver_event(struct time_travel_event *e) +{ + /* this is basically just deliver_alarm(), handles IRQs itself */ + e->fn(e); +} + +static bool time_travel_del_event(struct time_travel_event *e) +{ + if (!e->pending) + return false; + list_del(&e->list); + e->pending = false; + return true; +} + +static void time_travel_update_time(unsigned long long next, bool retearly) +{ + struct time_travel_event ne = { + .onstack = true, + }; + struct time_travel_event *e; + bool finished = retearly; + + /* add it without a handler - we deal with that specifically below */ + __time_travel_add_event(&ne, next); + + do { + e = time_travel_first_event(); + + BUG_ON(!e); + time_travel_set_time(e->time); + + /* new events may have been inserted while we were waiting */ + if (e == time_travel_first_event()) { + BUG_ON(!time_travel_del_event(e)); + BUG_ON(time_travel_time != e->time); + + if (e == &ne) { + finished = true; + } else { + if (e->onstack) + panic("On-stack event dequeued outside of the stack! 
time=%lld, event time=%lld, event=%pS\n", + time_travel_time, e->time, e); + time_travel_deliver_event(e); + } + } + } while (!finished); + + time_travel_del_event(&ne); +} + +static void time_travel_oneshot_timer(struct time_travel_event *e) +{ + deliver_alarm(); +} + +void time_travel_sleep(unsigned long long duration) +{ + unsigned long long next = time_travel_time + duration; + + if (time_travel_mode == TT_MODE_BASIC) + os_timer_disable(); + + time_travel_update_time(next, true); + + if (time_travel_mode == TT_MODE_BASIC && + time_travel_timer_event.pending) { + if (time_travel_timer_event.fn == time_travel_periodic_timer) { + /* + * This is somewhat wrong - we should get the first + * one sooner like the os_timer_one_shot() below... + */ + os_timer_set_interval(time_travel_timer_interval); + } else { + os_timer_one_shot(time_travel_timer_event.time - next); + } + } +} + +static void time_travel_handle_real_alarm(void) +{ + time_travel_set_time(time_travel_next_event); + + time_travel_del_event(&time_travel_timer_event); + + if (time_travel_timer_event.fn == time_travel_periodic_timer) + time_travel_add_event(&time_travel_timer_event, + time_travel_time + + time_travel_timer_interval); +} + +static void time_travel_set_interval(unsigned long long interval) +{ + time_travel_timer_interval = interval; +} +#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ #define time_travel_start_set 0 #define time_travel_start 0 +#define time_travel_time 0 + +static inline void time_travel_update_time(unsigned long long ns, bool retearly) +{ +} + +static inline void time_travel_handle_real_alarm(void) +{ +} + +static void time_travel_set_interval(unsigned long long interval) +{ +} + +/* these are empty macros so the struct/fn need not exist */ +#define time_travel_add_event(e, time) do { } while (0) +#define time_travel_del_event(e) do { } while (0) #endif void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) @@ -48,7 +234,7 @@ void timer_handler(int sig, 
struct siginfo *unused_si, struct uml_pt_regs *regs) * never get any real signals from the OS. */ if (time_travel_mode == TT_MODE_BASIC) - time_travel_set_time(time_travel_timer_expiry); + time_travel_handle_real_alarm(); local_irq_save(flags); do_IRQ(TIMER_IRQ, regs); @@ -58,7 +244,7 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) static int itimer_shutdown(struct clock_event_device *evt) { if (time_travel_mode != TT_MODE_OFF) - time_travel_set_timer_mode(TT_TMR_DISABLED); + time_travel_del_event(&time_travel_timer_event); if (time_travel_mode != TT_MODE_INFCPU) os_timer_disable(); @@ -71,9 +257,12 @@ static int itimer_set_periodic(struct clock_event_device *evt) unsigned long long interval = NSEC_PER_SEC / HZ; if (time_travel_mode != TT_MODE_OFF) { - time_travel_set_timer_mode(TT_TMR_PERIODIC); - time_travel_set_timer_expiry(time_travel_time + interval); - time_travel_set_timer_interval(interval); + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_periodic_timer); + time_travel_set_interval(interval); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + interval); } if (time_travel_mode != TT_MODE_INFCPU) @@ -88,8 +277,11 @@ static int itimer_next_event(unsigned long delta, delta += 1; if (time_travel_mode != TT_MODE_OFF) { - time_travel_set_timer_mode(TT_TMR_ONESHOT); - time_travel_set_timer_expiry(time_travel_time + delta); + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_oneshot_timer); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + delta); } if (time_travel_mode != TT_MODE_INFCPU) @@ -144,7 +336,10 @@ static u64 timer_read(struct clocksource *cs) * exact requested sleep amount, e.g. python's socket server, * see https://bugs.python.org/issue37026. 
*/ - time_travel_set_time(time_travel_time + TIMER_MULTIPLIER); + if (!irqs_disabled()) + time_travel_update_time(time_travel_time + + TIMER_MULTIPLIER, + false); return time_travel_time / TIMER_MULTIPLIER; } From dd9ada5627245a3441ebde00736dd63d09acc222 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Feb 2020 14:26:46 +0100 Subject: [PATCH 23/80] um: virtio: Implement VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS Implement in-band notifications that are necessary for running vhost-user devices under externally synchronized time-travel mode (which is in a follow-up patch). This feature makes what usually should be eventfd notifications in-band messages. We'll prefer this feature, under the assumption that only a few (simulation) devices will ever support it, since it's not very efficient. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/vhost_user.h | 12 +++-- arch/um/drivers/virtio_uml.c | 85 ++++++++++++++++++++++++++++-------- 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h index 45ff5ea22fea..6c71b6005177 100644 --- a/arch/um/drivers/vhost_user.h +++ b/arch/um/drivers/vhost_user.h @@ -10,9 +10,10 @@ /* Feature bits */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 /* Protocol feature bits */ -#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 -#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 -#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 /* Vring state index masks */ #define VHOST_USER_VRING_INDEX_MASK 0xff #define VHOST_USER_VRING_POLL_MASK BIT(8) @@ -24,7 +25,8 @@ /* Supported protocol features */ #define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ - BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG)) + 
BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) enum vhost_user_request { VHOST_USER_GET_FEATURES = 1, @@ -52,12 +54,14 @@ enum vhost_user_request { VHOST_USER_SET_VRING_ENDIAN = 23, VHOST_USER_GET_CONFIG = 24, VHOST_USER_SET_CONFIG = 25, + VHOST_USER_VRING_KICK = 35, }; enum vhost_user_slave_request { VHOST_USER_SLAVE_IOTLB_MSG = 1, VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, + VHOST_USER_SLAVE_VRING_CALL = 4, }; struct vhost_user_header { diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 023ced2250ea..9b4c5b7c0f0d 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -53,6 +53,7 @@ struct virtio_uml_device { struct virtio_device vdev; struct platform_device *pdev; + spinlock_t sock_lock; int sock, req_fd; u64 features; u64 protocol_features; @@ -189,6 +190,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, int *fds, size_t num_fds) { size_t size = sizeof(msg->header) + msg->header.size; + unsigned long flags; bool request_ack; int rc; @@ -207,24 +209,28 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, if (request_ack) msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; + spin_lock_irqsave(&vu_dev->sock_lock, flags); rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); if (rc < 0) - return rc; + goto out; if (request_ack) { uint64_t status; rc = vhost_user_recv_u64(vu_dev, &status); if (rc) - return rc; + goto out; if (status) { vu_err(vu_dev, "slave reports error: %llu\n", status); - return -EIO; + rc = -EIO; + goto out; } } - return 0; +out: + spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + return rc; } static int vhost_user_send_no_payload(struct virtio_uml_device *vu_dev, @@ -324,6 +330,7 @@ static void vhost_user_reply(struct virtio_uml_device *vu_dev, static irqreturn_t vu_req_interrupt(int irq, void *data) { struct virtio_uml_device *vu_dev = data; + struct virtqueue *vq; 
int response = 1; struct { struct vhost_user_msg msg; @@ -343,6 +350,15 @@ static irqreturn_t vu_req_interrupt(int irq, void *data) virtio_config_changed(&vu_dev->vdev); response = 0; break; + case VHOST_USER_SLAVE_VRING_CALL: + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + if (vq->index == msg.msg.payload.vring_state.index) { + response = 0; + vring_interrupt(0 /* ignored */, vq); + break; + } + } + break; case VHOST_USER_SLAVE_IOTLB_MSG: /* not supported - VIRTIO_F_IOMMU_PLATFORM */ case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: @@ -684,6 +700,15 @@ static bool vu_notify(struct virtqueue *vq) const uint64_t n = 1; int rc; + if (info->kick_fd < 0) { + struct virtio_uml_device *vu_dev; + + vu_dev = to_virtio_uml_device(vq->vdev); + + return vhost_user_set_vring_state(vu_dev, VHOST_USER_VRING_KICK, + vq->index, 0) == 0; + } + do { rc = os_write_file(info->kick_fd, &n, sizeof(n)); } while (rc == -EINTR); @@ -749,10 +774,13 @@ static void vu_del_vq(struct virtqueue *vq) { struct virtio_uml_vq_info *info = vq->priv; - um_free_irq(VIRTIO_IRQ, vq); + if (info->call_fd >= 0) { + um_free_irq(VIRTIO_IRQ, vq); + os_close_file(info->call_fd); + } - os_close_file(info->call_fd); - os_close_file(info->kick_fd); + if (info->kick_fd >= 0) + os_close_file(info->kick_fd); vring_del_virtqueue(vq); kfree(info); @@ -782,6 +810,15 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, int call_fds[2]; int rc; + /* no call FD needed/desired in this case */ + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { + info->call_fd = -1; + return 0; + } + /* Use a pipe for call fd, since SIGIO is not supported for eventfd */ rc = os_pipe(call_fds, true, true); if (rc < 0) @@ -838,10 +875,15 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, vq->priv = info; num = virtqueue_get_vring_size(vq); - rc = os_eventfd(0, 0); - if (rc < 0) - goto 
error_kick; - info->kick_fd = rc; + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) { + info->kick_fd = -1; + } else { + rc = os_eventfd(0, 0); + if (rc < 0) + goto error_kick; + info->kick_fd = rc; + } rc = vu_setup_vq_call_fd(vu_dev, vq); if (rc) @@ -866,10 +908,13 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, return vq; error_setup: - um_free_irq(VIRTIO_IRQ, vq); - os_close_file(info->call_fd); + if (info->call_fd >= 0) { + um_free_irq(VIRTIO_IRQ, vq); + os_close_file(info->call_fd); + } error_call: - os_close_file(info->kick_fd); + if (info->kick_fd >= 0) + os_close_file(info->kick_fd); error_kick: vring_del_virtqueue(vq); error_create: @@ -908,10 +953,12 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, list_for_each_entry(vq, &vdev->vqs, list) { struct virtio_uml_vq_info *info = vq->priv; - rc = vhost_user_set_vring_kick(vu_dev, vq->index, - info->kick_fd); - if (rc) - goto error_setup; + if (info->kick_fd >= 0) { + rc = vhost_user_set_vring_kick(vu_dev, vq->index, + info->kick_fd); + if (rc) + goto error_setup; + } rc = vhost_user_set_vring_enable(vu_dev, vq->index, true); if (rc) @@ -1008,6 +1055,8 @@ static int virtio_uml_probe(struct platform_device *pdev) return rc; vu_dev->sock = rc; + spin_lock_init(&vu_dev->sock_lock); + rc = vhost_user_init(vu_dev); if (rc) goto error_init; From 88ce642492339f49a0b391af40e5798c08948e49 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Feb 2020 14:26:47 +0100 Subject: [PATCH 24/80] um: Implement time-travel=ext This implements synchronized time-travel mode which - using a special application on a unix socket - lets multiple machines take part in a time-travelling simulation together. The protocol for the unix domain socket is defined in the new file include/uapi/linux/um_timetravel.h. 
Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virtio_uml.c | 68 +++++- arch/um/include/linux/time-internal.h | 27 +++ arch/um/include/shared/os.h | 1 + arch/um/kernel/skas/syscall.c | 3 +- arch/um/kernel/time.c | 323 +++++++++++++++++++++++++- arch/um/os-Linux/file.c | 30 +++ include/uapi/linux/um_timetravel.h | 128 ++++++++++ 7 files changed, 563 insertions(+), 17 deletions(-) create mode 100644 include/uapi/linux/um_timetravel.h diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 9b4c5b7c0f0d..be54d368e73d 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,11 @@ struct virtio_uml_device { struct virtio_uml_vq_info { int kick_fd, call_fd; char name[32]; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + struct virtqueue *vq; + vq_callback_t *callback; + struct time_travel_event defer; +#endif }; extern unsigned long long physmem_size, highmem; @@ -118,10 +124,27 @@ static int vhost_user_recv_header(int fd, struct vhost_user_msg *msg) static int vhost_user_recv(struct virtio_uml_device *vu_dev, int fd, struct vhost_user_msg *msg, - size_t max_payload_size) + size_t max_payload_size, bool wait) { size_t size; - int rc = vhost_user_recv_header(fd, msg); + int rc; + + /* + * In virtio time-travel mode, we're handling all the vhost-user + * FDs by polling them whenever appropriate. However, we may get + * into a situation where we're sending out an interrupt message + * to a device (e.g. a net device) and need to handle a simulation + * time message while doing so, e.g. one that tells us to update + * our idea of how long we can run without scheduling. + * + * Thus, we need to not just read() from the given fd, but need + * to also handle messages for the simulation time - this function + * does that for us while waiting for the given fd to be readable. 
+ */ + if (wait) + time_travel_wait_readable(fd); + + rc = vhost_user_recv_header(fd, msg); if (rc == -ECONNRESET && vu_dev->registered) { struct virtio_uml_platform_data *pdata; @@ -143,7 +166,8 @@ static int vhost_user_recv_resp(struct virtio_uml_device *vu_dev, struct vhost_user_msg *msg, size_t max_payload_size) { - int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, max_payload_size); + int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, + max_payload_size, true); if (rc) return rc; @@ -173,7 +197,8 @@ static int vhost_user_recv_req(struct virtio_uml_device *vu_dev, struct vhost_user_msg *msg, size_t max_payload_size) { - int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, max_payload_size); + int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, + max_payload_size, false); if (rc) return rc; @@ -700,6 +725,8 @@ static bool vu_notify(struct virtqueue *vq) const uint64_t n = 1; int rc; + time_travel_propagate_time(); + if (info->kick_fd < 0) { struct virtio_uml_device *vu_dev; @@ -847,6 +874,23 @@ out: return rc; } +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static void vu_defer_irq_handle(struct time_travel_event *d) +{ + struct virtio_uml_vq_info *info; + + info = container_of(d, struct virtio_uml_vq_info, defer); + info->callback(info->vq); +} + +static void vu_defer_irq_callback(struct virtqueue *vq) +{ + struct virtio_uml_vq_info *info = vq->priv; + + time_travel_add_irq_event(&info->defer); +} +#endif + static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, unsigned index, vq_callback_t *callback, const char *name, bool ctx) @@ -866,6 +910,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, snprintf(info->name, sizeof(info->name), "%s.%d-%s", pdev->name, pdev->id, name); +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + /* + * When we get an interrupt, we must bounce it through the simulation + * calendar (the simtime device), except for the simtime device itself + * since that's part of the simulation control. 
+ */ + if (time_travel_mode == TT_MODE_EXTERNAL && callback) { + info->callback = callback; + callback = vu_defer_irq_callback; + time_travel_set_event_fn(&info->defer, vu_defer_irq_handle); + } +#endif + vq = vring_create_virtqueue(index, num, PAGE_SIZE, vdev, true, true, ctx, vu_notify, callback, info->name); if (!vq) { @@ -874,6 +931,9 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, } vq->priv = info; num = virtqueue_get_vring_size(vq); +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + info->vq = vq; +#endif if (vu_dev->protocol_features & BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) { diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index eb1f84616edb..e21655926f08 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -15,6 +15,7 @@ enum time_travel_mode { TT_MODE_OFF, TT_MODE_BASIC, TT_MODE_INFCPU, + TT_MODE_EXTERNAL, }; #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT @@ -35,6 +36,24 @@ time_travel_set_event_fn(struct time_travel_event *e, { e->fn = fn; } + +void __time_travel_propagate_time(void); + +static inline void time_travel_propagate_time(void) +{ + if (time_travel_mode == TT_MODE_EXTERNAL) + __time_travel_propagate_time(); +} + +void __time_travel_wait_readable(int fd); + +static inline void time_travel_wait_readable(int fd) +{ + if (time_travel_mode == TT_MODE_EXTERNAL) + __time_travel_wait_readable(fd); +} + +void time_travel_add_irq_event(struct time_travel_event *e); #else struct time_travel_event { }; @@ -47,5 +66,13 @@ static inline void time_travel_sleep(unsigned long long duration) /* this is a macro so the event/function need not exist */ #define time_travel_set_event_fn(e, fn) do {} while (0) + +static inline void time_travel_propagate_time(void) +{ +} + +static inline void time_travel_wait_readable(int fd) +{ +} #endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ #endif /* __TIMER_INTERNAL_H__ */ diff --git a/arch/um/include/shared/os.h 
b/arch/um/include/shared/os.h index 0f30204b6afa..f467d28fc0b4 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -181,6 +181,7 @@ extern int os_falloc_punch(int fd, unsigned long long offset, int count); extern int os_eventfd(unsigned int initval, int flags); extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, unsigned int fds_num); +int os_poll(unsigned int n, const int *fds); /* start_up.c */ extern void os_early_checks(void); diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 2e82820f7d29..0a12d5a09217 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -24,7 +24,8 @@ void handle_syscall(struct uml_pt_regs *r) * went to sleep, even if said userspace interacts with the kernel in * various ways. */ - if (time_travel_mode == TT_MODE_INFCPU) + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) schedule(); /* Initialize the syscall number and default return value. 
*/ diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index cdebe96308d7..15c4825b857e 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -20,10 +20,12 @@ #include #include #include +#include #include #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT enum time_travel_mode time_travel_mode; +EXPORT_SYMBOL_GPL(time_travel_mode); static bool time_travel_start_set; static unsigned long long time_travel_start; @@ -32,6 +34,12 @@ static LIST_HEAD(time_travel_events); static unsigned long long time_travel_timer_interval; static unsigned long long time_travel_next_event; static struct time_travel_event time_travel_timer_event; +static int time_travel_ext_fd = -1; +static unsigned int time_travel_ext_waiting; +static bool time_travel_ext_prev_request_valid; +static unsigned long long time_travel_ext_prev_request; +static bool time_travel_ext_free_until_valid; +static unsigned long long time_travel_ext_free_until; static void time_travel_set_time(unsigned long long ns) { @@ -41,6 +49,199 @@ static void time_travel_set_time(unsigned long long ns) time_travel_time = ns; } +enum time_travel_message_handling { + TTMH_IDLE, + TTMH_POLL, + TTMH_READ, +}; + +static void time_travel_handle_message(struct um_timetravel_msg *msg, + enum time_travel_message_handling mode) +{ + struct um_timetravel_msg resp = { + .op = UM_TIMETRAVEL_ACK, + }; + int ret; + + /* + * Poll outside the locked section (if we're not called to only read + * the response) so we can get interrupts for e.g. virtio while we're + * here, but then we need to lock to not get interrupted between the + * read of the message and write of the ACK. 
+ */ + if (mode != TTMH_READ) { + while (os_poll(1, &time_travel_ext_fd) != 0) { + if (mode == TTMH_IDLE) { + BUG_ON(!irqs_disabled()); + local_irq_enable(); + local_irq_disable(); + } + } + } + + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + + if (ret == 0) + panic("time-travel external link is broken\n"); + if (ret != sizeof(*msg)) + panic("invalid time-travel message - %d bytes\n", ret); + + switch (msg->op) { + default: + WARN_ONCE(1, "time-travel: unexpected message %lld\n", + (unsigned long long)msg->op); + break; + case UM_TIMETRAVEL_ACK: + return; + case UM_TIMETRAVEL_RUN: + time_travel_set_time(msg->time); + break; + case UM_TIMETRAVEL_FREE_UNTIL: + time_travel_ext_free_until_valid = true; + time_travel_ext_free_until = msg->time; + break; + } + + os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); +} + +static u64 time_travel_ext_req(u32 op, u64 time) +{ + static int seq; + int mseq = ++seq; + struct um_timetravel_msg msg = { + .op = op, + .time = time, + .seq = mseq, + }; + unsigned long flags; + + /* + * We need to save interrupts here and only restore when we + * got the ACK - otherwise we can get interrupted and send + * another request while we're still waiting for an ACK, but + * the peer doesn't know we got interrupted and will send + * the ACKs in the same order as the message, but we'd need + * to see them in the opposite order ... + * + * This wouldn't matter *too* much, but some ACKs carry the + * current time (for UM_TIMETRAVEL_GET) and getting another + * ACK without a time would confuse us a lot! + * + * The sequence number assignment that happens here lets us + * debug such message handling issues more easily. + */ + local_irq_save(flags); + os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + + while (msg.op != UM_TIMETRAVEL_ACK) + time_travel_handle_message(&msg, TTMH_READ); + + if (msg.seq != mseq) + panic("time-travel: ACK message has different seqno! 
op=%d, seq=%d != %d time=%lld\n", + msg.op, msg.seq, mseq, msg.time); + + if (op == UM_TIMETRAVEL_GET) + time_travel_set_time(msg.time); + local_irq_restore(flags); + + return msg.time; +} + +void __time_travel_wait_readable(int fd) +{ + int fds[2] = { fd, time_travel_ext_fd }; + int ret; + + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + while ((ret = os_poll(2, fds))) { + struct um_timetravel_msg msg; + + if (ret == 1) + time_travel_handle_message(&msg, TTMH_READ); + } +} +EXPORT_SYMBOL_GPL(__time_travel_wait_readable); + +static void time_travel_ext_update_request(unsigned long long time) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + /* asked for exactly this time previously */ + if (time_travel_ext_prev_request_valid && + time == time_travel_ext_prev_request) + return; + + time_travel_ext_prev_request = time; + time_travel_ext_prev_request_valid = true; + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); +} + +void __time_travel_propagate_time(void) +{ + time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time); +} +EXPORT_SYMBOL_GPL(__time_travel_propagate_time); + +/* returns true if we must do a wait to the simtime device */ +static bool time_travel_ext_request(unsigned long long time) +{ + /* + * If we received an external sync point ("free until") then we + * don't have to request/wait for anything until then, unless + * we're already waiting. 
+ */ + if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && + time < time_travel_ext_free_until) + return false; + + time_travel_ext_update_request(time); + return true; +} + +static void time_travel_ext_wait(bool idle) +{ + struct um_timetravel_msg msg = { + .op = UM_TIMETRAVEL_ACK, + }; + + time_travel_ext_prev_request_valid = false; + time_travel_ext_waiting++; + + time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); + + /* + * Here we are deep in the idle loop, so we have to break out of the + * kernel abstraction in a sense and implement this in terms of the + * UML system waiting on the VQ interrupt while sleeping, when we get + * the signal it'll call time_travel_ext_vq_notify_done() completing the + * call. + */ + while (msg.op != UM_TIMETRAVEL_RUN) + time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL); + + time_travel_ext_waiting--; + + /* we might request more stuff while polling - reset when we run */ + time_travel_ext_prev_request_valid = false; +} + +static void time_travel_ext_get_time(void) +{ + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); +} + +static void __time_travel_update_time(unsigned long long ns, bool idle) +{ + if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns)) + time_travel_ext_wait(idle); + else + time_travel_set_time(ns); +} + static struct time_travel_event *time_travel_first_event(void) { return list_first_entry_or_null(&time_travel_events, @@ -85,6 +286,7 @@ static void __time_travel_add_event(struct time_travel_event *e, list_add_tail(&e->list, &time_travel_events); tmp = time_travel_first_event(); + time_travel_ext_update_request(tmp->time); time_travel_next_event = tmp->time; } @@ -106,8 +308,21 @@ void time_travel_periodic_timer(struct time_travel_event *e) static void time_travel_deliver_event(struct time_travel_event *e) { - /* this is basically just deliver_alarm(), handles IRQs itself */ - e->fn(e); + if (e == &time_travel_timer_event) { + /* + * deliver_alarm() does the 
irq_enter/irq_exit + * by itself, so must handle it specially here + */ + e->fn(e); + } else { + unsigned long flags; + + local_irq_save(flags); + irq_enter(); + e->fn(e); + irq_exit(); + local_irq_restore(flags); + } } static bool time_travel_del_event(struct time_travel_event *e) @@ -119,13 +334,13 @@ static bool time_travel_del_event(struct time_travel_event *e) return true; } -static void time_travel_update_time(unsigned long long next, bool retearly) +static void time_travel_update_time(unsigned long long next, bool idle) { struct time_travel_event ne = { .onstack = true, }; struct time_travel_event *e; - bool finished = retearly; + bool finished = idle; /* add it without a handler - we deal with that specifically below */ __time_travel_add_event(&ne, next); @@ -134,7 +349,7 @@ static void time_travel_update_time(unsigned long long next, bool retearly) e = time_travel_first_event(); BUG_ON(!e); - time_travel_set_time(e->time); + __time_travel_update_time(e->time, idle); /* new events may have been inserted while we were waiting */ if (e == time_travel_first_event()) { @@ -150,11 +365,29 @@ static void time_travel_update_time(unsigned long long next, bool retearly) time_travel_deliver_event(e); } } - } while (!finished); + + e = time_travel_first_event(); + if (e) + time_travel_ext_update_request(e->time); + } while (ne.pending && !finished); time_travel_del_event(&ne); } +void time_travel_add_irq_event(struct time_travel_event *e) +{ + BUG_ON(time_travel_mode != TT_MODE_EXTERNAL); + + time_travel_ext_get_time(); + /* + * We could model interrupt latency here, for now just + * don't have any latency at all and request the exact + * same time (again) to run the interrupt... 
+ */ + time_travel_add_event(e, time_travel_time); +} +EXPORT_SYMBOL_GPL(time_travel_add_irq_event); + static void time_travel_oneshot_timer(struct time_travel_event *e) { deliver_alarm(); @@ -199,6 +432,42 @@ static void time_travel_set_interval(unsigned long long interval) { time_travel_timer_interval = interval; } + +static int time_travel_connect_external(const char *socket) +{ + const char *sep; + unsigned long long id = (unsigned long long)-1; + int rc; + + if ((sep = strchr(socket, ':'))) { + char buf[25] = {}; + if (sep - socket > sizeof(buf) - 1) + goto invalid_number; + + memcpy(buf, socket, sep - socket); + if (kstrtoull(buf, 0, &id)) { +invalid_number: + panic("time-travel: invalid external ID in string '%s'\n", + socket); + return -EINVAL; + } + + socket = sep + 1; + } + + rc = os_connect_socket(socket); + if (rc < 0) { + panic("time-travel: failed to connect to external socket %s\n", + socket); + return rc; + } + + time_travel_ext_fd = rc; + + time_travel_ext_req(UM_TIMETRAVEL_START, id); + + return 1; +} #else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ #define time_travel_start_set 0 #define time_travel_start 0 @@ -216,6 +485,9 @@ static void time_travel_set_interval(unsigned long long interval) { } +/* fail link if this actually gets used */ +extern u64 time_travel_ext_req(u32 op, u64 time); + /* these are empty macros so the struct/fn need not exist */ #define time_travel_add_event(e, time) do { } while (0) #define time_travel_del_event(e) do { } while (0) @@ -246,7 +518,8 @@ static int itimer_shutdown(struct clock_event_device *evt) if (time_travel_mode != TT_MODE_OFF) time_travel_del_event(&time_travel_timer_event); - if (time_travel_mode != TT_MODE_INFCPU) + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) os_timer_disable(); return 0; @@ -265,7 +538,8 @@ static int itimer_set_periodic(struct clock_event_device *evt) time_travel_time + interval); } - if (time_travel_mode != TT_MODE_INFCPU) + if (time_travel_mode != 
TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) os_timer_set_interval(interval); return 0; @@ -284,7 +558,8 @@ static int itimer_next_event(unsigned long delta, time_travel_time + delta); } - if (time_travel_mode != TT_MODE_INFCPU) + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) return os_timer_one_shot(delta); return 0; @@ -335,8 +610,14 @@ static u64 timer_read(struct clocksource *cs) * stuck in loops that expect time to move more than the * exact requested sleep amount, e.g. python's socket server, * see https://bugs.python.org/issue37026. + * + * However, don't do that when we're in interrupt or such as + * then we might recurse into our own processing, and get to + * even more waiting, and that's not good - it messes up the + * "what do I do next" and onstack event we use to know when + * to return from time_travel_update_time(). */ - if (!irqs_disabled()) + if (!irqs_disabled() && !in_interrupt() && !in_softirq()) time_travel_update_time(time_travel_time + TIMER_MULTIPLIER, false); @@ -383,6 +664,8 @@ void read_persistent_clock64(struct timespec64 *ts) if (time_travel_start_set) nsecs = time_travel_start + time_travel_time; + else if (time_travel_mode == TT_MODE_EXTERNAL) + nsecs = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); else nsecs = os_persistent_clock_emulation(); @@ -399,7 +682,8 @@ void __init time_init(void) #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT unsigned long calibrate_delay_is_known(void) { - if (time_travel_mode == TT_MODE_INFCPU) + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) return 1; return 0; } @@ -413,6 +697,13 @@ int setup_time_travel(char *str) return 1; } + if (strncmp(str, "=ext:", 5) == 0) { + time_travel_mode = TT_MODE_EXTERNAL; + timer_clockevent.name = "time-travel-timer-external"; + timer_clocksource.name = "time-travel-clock-external"; + return time_travel_connect_external(str + 5); + } + if (!*str) { time_travel_mode = TT_MODE_BASIC; 
timer_clockevent.name = "time-travel-timer"; @@ -437,7 +728,15 @@ __uml_help(setup_time_travel, "are no wall clock timers, and any CPU processing happens - as seen from the\n" "guest - instantly. This can be useful for accurate simulation regardless of\n" "debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n" -"easily lead to getting stuck (e.g. if anything in the system busy loops).\n"); +"easily lead to getting stuck (e.g. if anything in the system busy loops).\n" +"\n" +"time-travel=ext:[ID:]/path/to/socket\n" +"This enables time travel mode similar to =inf-cpu, except the system will\n" +"use the given socket to coordinate with a central scheduler, in order to\n" +"have more than one system simultaneously be on simulated time. The virtio\n" +"driver code in UML knows about this so you can also simulate networks and\n" +"devices using it, assuming the device has the right capabilities.\n" +"The optional ID is a 64-bit integer that's sent to the central scheduler.\n"); int setup_time_travel_start(char *str) { diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index 5c819f89b8c2..26ecbd64c409 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include static void copy_stat(struct uml_stat *dst, const struct stat64 *src) @@ -665,3 +667,31 @@ int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, return -errno; return err; } + +int os_poll(unsigned int n, const int *fds) +{ + /* currently need 2 FDs at most so avoid dynamic allocation */ + struct pollfd pollfds[2] = {}; + unsigned int i; + int ret; + + if (n > ARRAY_SIZE(pollfds)) + return -EINVAL; + + for (i = 0; i < n; i++) { + pollfds[i].fd = fds[i]; + pollfds[i].events = POLLIN; + } + + ret = poll(pollfds, n, -1); + if (ret < 0) + return -errno; + + /* Return the index of the available FD */ + for (i = 0; i < n; 
i++) { + if (pollfds[i].revents) + return i; + } + + return -EIO; +} diff --git a/include/uapi/linux/um_timetravel.h b/include/uapi/linux/um_timetravel.h new file mode 100644 index 000000000000..ca3238222b6d --- /dev/null +++ b/include/uapi/linux/um_timetravel.h @@ -0,0 +1,128 @@ +/* + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Copyright (C) 2019 Intel Corporation + */ +#ifndef _UAPI_LINUX_UM_TIMETRAVEL_H +#define _UAPI_LINUX_UM_TIMETRAVEL_H +#include + +/** + * struct um_timetravel_msg - UM time travel message + * + * This is the basic message type, going in both directions. + * + * This is the message passed between the host (user-mode Linux instance) + * and the calendar (the application on the other side of the socket) in + * order to implement common scheduling. + * + * Whenever UML has an event it will request runtime for it from the + * calendar, and then wait for its turn until it can run, etc. Note + * that it will only ever request the single next runtime, i.e. multiple + * REQUEST messages override each other. 
+ */ +struct um_timetravel_msg { + /** + * @op: operation value from &enum um_timetravel_ops + */ + __u32 op; + + /** + * @seq: sequence number for the message - shall be reflected in + * the ACK response, and should be checked while processing + * the response to see if it matches + */ + __u32 seq; + + /** + * @time: time in nanoseconds + */ + __u64 time; +}; + +/** + * enum um_timetravel_ops - Operation codes + */ +enum um_timetravel_ops { + /** + * @UM_TIMETRAVEL_ACK: response (ACK) to any previous message, + * this usually doesn't carry any data in the 'time' field + * unless otherwise specified below + */ + UM_TIMETRAVEL_ACK = 0, + + /** + * @UM_TIMETRAVEL_START: initialize the connection, the time + * field contains an (arbitrary) ID to possibly be able + * to distinguish the connections. + */ + UM_TIMETRAVEL_START = 1, + + /** + * @UM_TIMETRAVEL_REQUEST: request to run at the given time + * (host -> calendar) + */ + UM_TIMETRAVEL_REQUEST = 2, + + /** + * @UM_TIMETRAVEL_WAIT: Indicate waiting for the previously requested + * runtime, new requests may be made while waiting (e.g. due to + * interrupts); the time field is ignored. The calendar must process + * this message and later send a %UM_TIMETRAVEL_RUN message when + * the host can run again. + * (host -> calendar) + */ + UM_TIMETRAVEL_WAIT = 3, + + /** + * @UM_TIMETRAVEL_GET: return the current time from the calendar in the + * ACK message, the time in the request message is ignored + * (host -> calendar) + */ + UM_TIMETRAVEL_GET = 4, + + /** + * @UM_TIMETRAVEL_UPDATE: time update to the calendar, must be sent e.g. 
+ * before kicking an interrupt to another calendar + * (host -> calendar) + */ + UM_TIMETRAVEL_UPDATE = 5, + + /** + * @UM_TIMETRAVEL_RUN: run time request granted, current time is in + * the time field + * (calendar -> host) + */ + UM_TIMETRAVEL_RUN = 6, + + /** + * @UM_TIMETRAVEL_FREE_UNTIL: Enable free-running until the given time, + * this is a message from the calendar telling the host that it can + * freely do its own scheduling for anything before the indicated + * time. + * Note that if a calendar sends this message once, the host may + * assume that it will also do so in the future, if it implements + * wraparound semantics for the time field. + * (calendar -> host) + */ + UM_TIMETRAVEL_FREE_UNTIL = 7, + + /** + * @UM_TIMETRAVEL_GET_TOD: Return time of day, typically used once at + * boot by the virtual machines to get a synchronized time from + * the simulation. + */ + UM_TIMETRAVEL_GET_TOD = 8, +}; + +#endif /* _UAPI_LINUX_UM_TIMETRAVEL_H */ From 0bc8fb4dda2b461491ec567b2333d13897780d8c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Feb 2020 14:26:48 +0100 Subject: [PATCH 25/80] um: Implement ndelay/udelay in time-travel mode In external or inf-cpu time-travel mode, ndelay/udelay currently just waste CPU time since the simulation time doesn't advance. Implement them properly in this case. Note that the "if (time_travel_mode == ...)" parts compile out if CONFIG_UML_TIME_TRAVEL_SUPPORT isn't set, time_travel_mode is defined to TT_MODE_OFF in that case. 
Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/asm/Kbuild | 1 - arch/um/include/asm/delay.h | 30 +++++++++++++++++++++++++++ arch/um/include/linux/time-internal.h | 6 ++++++ arch/um/kernel/time.c | 6 ++++++ 4 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 arch/um/include/asm/delay.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index db7d9d4e30d8..8d435f8a6dec 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -3,7 +3,6 @@ generic-y += bpf_perf_event.h generic-y += bug.h generic-y += compat.h generic-y += current.h -generic-y += delay.h generic-y += device.h generic-y += emergency-restart.h generic-y += exec.h diff --git a/arch/um/include/asm/delay.h b/arch/um/include/asm/delay.h new file mode 100644 index 000000000000..56fc2b8f2dd0 --- /dev/null +++ b/arch/um/include/asm/delay.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_DELAY_H +#define __UM_DELAY_H +#include +#include + +static inline void um_ndelay(unsigned long nsecs) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + time_travel_ndelay(nsecs); + return; + } + ndelay(nsecs); +} +#undef ndelay +#define ndelay um_ndelay + +static inline void um_udelay(unsigned long usecs) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + time_travel_ndelay(1000 * usecs); + return; + } + udelay(usecs); +} +#undef udelay +#define udelay um_udelay +#endif /* __UM_DELAY_H */ diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index e21655926f08..f3b03d39a854 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -75,4 +75,10 @@ static inline void time_travel_wait_readable(int fd) { } #endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ + +/* + * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used, + * which is intentional since we 
really shouldn't link it in that case. + */ +void time_travel_ndelay(unsigned long nsec); #endif /* __TIMER_INTERNAL_H__ */ diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 15c4825b857e..25eaa6a0c658 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -374,6 +374,12 @@ static void time_travel_update_time(unsigned long long next, bool idle) time_travel_del_event(&ne); } +void time_travel_ndelay(unsigned long nsec) +{ + time_travel_update_time(time_travel_time + nsec, false); +} +EXPORT_SYMBOL(time_travel_ndelay); + void time_travel_add_irq_event(struct time_travel_event *e) { BUG_ON(time_travel_mode != TT_MODE_EXTERNAL); From 5bef0a153bf29150357ff60283315a933f05c994 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 13 Feb 2020 14:26:49 +0100 Subject: [PATCH 26/80] um: Implement cpu_relax() as ndelay(1) for time-travel In time-travel mode, cpu_relax() currently does actual CPU relax, but that doesn't affect the simulation. Ideally, we wouldn't run anything that uses it in simulation, but if we actually have virtio devices combined with the same simulation it's possible. Implement cpu_relax() as ndelay(1) in this case, using time_travel_ndelay(1) directly to catch errors if this is used erroneously in builds that don't set CONFIG_UML_TIME_TRAVEL_SUPPORT. While at it, convert it to an __always_inline and also add that to rep_nop() like the original does now. 
Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/x86/um/asm/processor.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 593d5f3902bd..478710384b34 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_PROCESSOR_H #define __UM_PROCESSOR_H +#include /* include faultinfo structure */ #include @@ -21,12 +22,19 @@ #include /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { __asm__ __volatile__("rep;nop": : :"memory"); } -#define cpu_relax() rep_nop() +static __always_inline void cpu_relax(void) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) + time_travel_ndelay(1); + else + rep_nop(); +} #define task_pt_regs(t) (&(t)->thread.regs) From 3363179385629c1804ea846f4e72608c2201a81e Mon Sep 17 00:00:00 2001 From: Brendan Higgins Date: Fri, 24 Jan 2020 14:14:01 -0800 Subject: [PATCH 27/80] um: Make CONFIG_STATIC_LINK actually static Currently, CONFIG_STATIC_LINK can be enabled with options which cannot be statically linked, namely UML_NET_VECTOR, UML_NET_VDE, and UML_NET_PCAP; this is because glibc tries to load NSS which does not support being statically linked. So make CONFIG_STATIC_LINK depend on !UML_NET_VECTOR && !UML_NET_VDE && !UML_NET_PCAP. 
Link: https://lore.kernel.org/lkml/f658f317-be54-ed75-8296-c373c2dcc697@cambridgegreys.com/#t Signed-off-by: Brendan Higgins Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 8 +++++++- arch/um/drivers/Kconfig | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 817a4c838a06..96ab7026b037 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -62,9 +62,12 @@ config NR_CPUS source "arch/$(HEADER_ARCH)/um/Kconfig" +config FORBID_STATIC_LINK + bool + config STATIC_LINK bool "Force a static link" - default n + depends on !FORBID_STATIC_LINK help This option gives you the ability to force a static link of UML. Normally, UML is linked as a shared binary. This is inconvenient for @@ -73,6 +76,9 @@ config STATIC_LINK Additionally, this option enables using higher memory spaces (up to 2.75G) for UML. + NOTE: This option is incompatible with some networking features which + depend on features that require being dynamically loaded (like NSS). + config LD_SCRIPT_STATIC bool default y diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 72d417055782..9160ead56e33 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -234,6 +234,7 @@ config UML_NET_DAEMON config UML_NET_VECTOR bool "Vector I/O high performance network devices" depends on UML_NET + select FORBID_STATIC_LINK help This User-Mode Linux network driver uses multi-message send and receive functions. 
The host running the UML guest must have @@ -245,6 +246,7 @@ config UML_NET_VDE bool "VDE transport (obsolete)" depends on UML_NET + select FORBID_STATIC_LINK help This User-Mode Linux network transport allows one or more running UMLs on a single host to communicate with each other and also @@ -292,6 +294,7 @@ config UML_NET_MCAST config UML_NET_PCAP bool "pcap transport (obsolete)" depends on UML_NET + select FORBID_STATIC_LINK help The pcap transport makes a pcap packet stream on the host look like an ethernet device inside UML. This is useful for making From 237ce2e681d20e92fe17fc23a348b4f0d595e641 Mon Sep 17 00:00:00 2001 From: Sjoerd Simons Date: Sun, 16 Feb 2020 22:36:24 +0100 Subject: [PATCH 28/80] um: vector: Avoid NULL ptr dereference if transport is unset When the transport option of a vec isn't set strncmp ends up being called on a NULL pointer. Better not do that. Signed-off-by: Sjoerd Simons Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_kern.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 9693dfca7651..8e8aac1d30e1 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -197,6 +197,9 @@ static int get_transport_options(struct arglist *def) long parsed; int result = 0; + if (transport == NULL) + return -EINVAL; + if (vector != NULL) { if (kstrtoul(vector, 10, &parsed) == 0) { if (parsed == 0) { From 4a7c46247f9c620c0390a15cb00b6ef9576b9c23 Mon Sep 17 00:00:00 2001 From: Alex Dewar Date: Fri, 27 Mar 2020 19:36:25 +0000 Subject: [PATCH 29/80] um: Remove some unnecessary NULL checks in vector_user.c kfree() already checks for null pointers, so additional checking is unnecessary.
Signed-off-by: Alex Dewar Signed-off-by: Richard Weinberger --- arch/um/drivers/vector_user.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index ddcd917be0af..aa28e9eecb7b 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -221,8 +221,7 @@ static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) return result; tap_cleanup: printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } @@ -266,8 +265,7 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) return result; hybrid_cleanup: printk(UM_KERN_ERR "user_init_hybrid: init failed"); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } @@ -344,10 +342,8 @@ static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id) unix_cleanup: if (fd >= 0) os_close_file(fd); - if (remote_addr != NULL) - kfree(remote_addr); - if (result != NULL) - kfree(result); + kfree(remote_addr); + kfree(result); return NULL; } @@ -382,8 +378,7 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) return result; raw_cleanup: printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } From a8c122f72d944f0e77b58c8b44a539a8f0e39017 Mon Sep 17 00:00:00 2001 From: Hu Haowen Date: Mon, 30 Mar 2020 10:00:24 +0800 Subject: [PATCH 30/80] pcmcia: remove some unused space characters There are a few space characters I found by chance. I think they are redundant, so I removed them. 
Signed-off-by: Hu Haowen Signed-off-by: Dominik Brodowski --- drivers/pcmcia/sa1100_simpad.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pcmcia/sa1100_simpad.c b/drivers/pcmcia/sa1100_simpad.c index e2e8729afd9d..784ada5b8c4f 100644 --- a/drivers/pcmcia/sa1100_simpad.c +++ b/drivers/pcmcia/sa1100_simpad.c @@ -14,7 +14,7 @@ #include #include #include "sa1100_generic.h" - + static int simpad_pcmcia_hw_init(struct soc_pcmcia_socket *skt) { @@ -66,7 +66,7 @@ simpad_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, simpad_clear_cs3_bit(VCC_3V_EN|VCC_5V_EN|EN0|EN1); break; - case 33: + case 33: simpad_clear_cs3_bit(VCC_3V_EN|EN1); simpad_set_cs3_bit(VCC_5V_EN|EN0); break; @@ -95,7 +95,7 @@ static void simpad_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt) simpad_set_cs3_bit(PCMCIA_RESET); } -static struct pcmcia_low_level simpad_pcmcia_ops = { +static struct pcmcia_low_level simpad_pcmcia_ops = { .owner = THIS_MODULE, .hw_init = simpad_pcmcia_hw_init, .hw_shutdown = simpad_pcmcia_hw_shutdown, From 304e024216a802a7dc8ba75d36de82fa136bbf3e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 28 Mar 2020 12:12:59 -0700 Subject: [PATCH 31/80] net_sched: add a temporary refcnt for struct tcindex_data Although we intentionally use an ordered workqueue for all tc filter works, the ordering is not guaranteed by RCU work, given that tcf_queue_work() is essentially a call_rcu(). This problem is demonstrated by Thomas: CPU 0: tcf_queue_work() tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); -> Migration to CPU 1 CPU 1: tcf_queue_work(&p->rwork, tcindex_destroy_work); so the 2nd work could be queued before the 1st one, which leads to a free-after-free. Enforcing this order in RCU work is hard as it requires to change RCU code too. Fortunately we can workaround this problem in tcindex filter by taking a temporary refcnt, we only refcnt it right before we begin to destroy it.
This simplifies the code a lot as a full refcnt requires much more changes in tcindex_set_parms(). Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Reviewed-by: Paul E. McKenney Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 44 +++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9904299424a1..065345832a69 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,12 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ +struct tcindex_data; + struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct tcindex_data *p; struct rcu_work rwork; }; @@ -49,6 +53,7 @@ struct tcindex_data { u32 hash; /* hash table size; 0 if undefined */ u32 alloc_hash; /* allocated size */ u32 fall_through; /* 0: only classify if explicit match */ + refcount_t refcnt; /* a temporary refcnt for perfect hash */ struct rcu_work rwork; }; @@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) return tcf_exts_has_actions(&r->exts) || r->res.classid; } +static void tcindex_data_get(struct tcindex_data *p) +{ + refcount_inc(&p->refcnt); +} + +static void tcindex_data_put(struct tcindex_data *p) +{ + if (refcount_dec_and_test(&p->refcnt)) { + kfree(p->perfect); + kfree(p->h); + kfree(p); + } +} + static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, u16 key) { @@ -141,6 +160,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) { tcf_exts_destroy(&r->exts); tcf_exts_put_net(&r->exts); + tcindex_data_put(r->p); } static void tcindex_destroy_rexts_work(struct work_struct 
*work) @@ -212,6 +232,8 @@ found: else __tcindex_destroy_fexts(f); } else { + tcindex_data_get(p); + if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); else @@ -228,9 +250,7 @@ static void tcindex_destroy_work(struct work_struct *work) struct tcindex_data, rwork); - kfree(p->perfect); - kfree(p->h); - kfree(p); + tcindex_data_put(p); } static inline int @@ -248,9 +268,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { }; static int tcindex_filter_result_init(struct tcindex_filter_result *r, + struct tcindex_data *p, struct net *net) { memset(r, 0, sizeof(*r)); + r->p = p; return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } @@ -290,6 +312,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) goto errout; + cp->perfect[i].p = cp; } return 0; @@ -334,6 +357,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->alloc_hash = p->alloc_hash; cp->fall_through = p->fall_through; cp->tp = tp; + refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ if (tb[TCA_TCINDEX_HASH]) cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -366,7 +390,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } cp->h = p->h; - err = tcindex_filter_result_init(&new_filter_result, net); + err = tcindex_filter_result_init(&new_filter_result, cp, net); if (err < 0) goto errout_alloc; if (old_r) @@ -434,7 +458,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, goto errout_alloc; f->key = handle; f->next = NULL; - err = tcindex_filter_result_init(&f->result, net); + err = tcindex_filter_result_init(&f->result, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -447,7 +471,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, net); 
+ err = tcindex_filter_result_init(old_r, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -571,6 +595,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, for (i = 0; i < p->hash; i++) { struct tcindex_filter_result *r = p->perfect + i; + /* tcf_queue_work() does not guarantee the ordering we + * want, so we have to take this refcnt temporarily to + * ensure 'p' is freed after all tcindex_filter_result + * here. Imperfect hash does not need this, because it + * uses linked lists rather than an array. + */ + tcindex_data_get(p); + tcf_unbind_filter(tp, &r->res); if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork, From 744fdc8233f6aa9582ce08a51ca06e59796a3196 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 30 Mar 2020 11:22:19 -0400 Subject: [PATCH 32/80] ipv6: don't auto-add link-local address to lag ports Bonding slave and team port devices should not have link-local addresses automatically added to them, as it can interfere with openvswitch being able to properly add tc ingress. 
Basic reproducer, courtesy of Marcelo: $ ip link add name bond0 type bond $ ip link set dev ens2f0np0 master bond0 $ ip link set dev ens2f1np2 master bond0 $ ip link set dev bond0 up $ ip a s 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: ens2f0np0: mtu 1500 qdisc mq master bond0 state UP group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff 5: ens2f1np2: mtu 1500 qdisc mq master bond0 state DOWN group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff 11: bond0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff inet6 fe80::20f:53ff:fe2f:ea40/64 scope link valid_lft forever preferred_lft forever (above trimmed to relevant entries, obviously) $ sysctl net.ipv6.conf.ens2f0np0.addr_gen_mode=0 net.ipv6.conf.ens2f0np0.addr_gen_mode = 0 $ sysctl net.ipv6.conf.ens2f1np2.addr_gen_mode=0 net.ipv6.conf.ens2f1np2.addr_gen_mode = 0 $ ip a l ens2f0np0 2: ens2f0np0: mtu 1500 qdisc mq master bond0 state UP group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff inet6 fe80::20f:53ff:fe2f:ea40/64 scope link tentative valid_lft forever preferred_lft forever $ ip a l ens2f1np2 5: ens2f1np2: mtu 1500 qdisc mq master bond0 state DOWN group default qlen 1000 link/ether 00:0f:53:2f:ea:40 brd ff:ff:ff:ff:ff:ff inet6 fe80::20f:53ff:fe2f:ea40/64 scope link tentative valid_lft forever preferred_lft forever Looks like addrconf_sysctl_addr_gen_mode() bypasses the original "is this a slave interface?" check added by commit c2edacf80e15, and results in an address getting added, while w/the proposed patch added, no address gets added. 
This simply adds the same gating check to another code path, and thus should prevent the same devices from erroneously obtaining an ipv6 link-local address. Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address generation mode") Reported-by: Moshe Levi CC: Stephen Hemminger CC: Marcelo Ricardo Leitner CC: netdev@vger.kernel.org Signed-off-by: Jarod Wilson Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a11fd4d67832..84a28b539c43 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3298,6 +3298,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (netif_is_l3_master(idev->dev)) return; + /* no link local addresses on devices flagged as slaves */ + if (idev->dev->flags & IFF_SLAVE) + return; + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); switch (idev->cnf.addr_gen_mode) { From 41aa8561ca3fc5748391f08cc5f3e561923da52c Mon Sep 17 00:00:00 2001 From: Herat Ramani Date: Wed, 1 Apr 2020 01:16:09 +0530 Subject: [PATCH 33/80] cxgb4: fix MPS index overwrite when setting MAC address cxgb4_update_mac_filt() earlier requests firmware to add a new MAC address into MPS TCAM. The MPS TCAM index returned by firmware is stored in pi->xact_addr_filt. However, the saved MPS TCAM index gets overwritten again with the return value of cxgb4_update_mac_filt(), which is wrong. When trying to update to another MAC address later, the wrong MPS TCAM index is sent to firmware, which causes firmware to return error, because it's not the same MPS TCAM index that firmware had sent earlier to driver. So, fix by removing the wrong overwrite being done after call to cxgb4_update_mac_filt(). Fixes: 3f8cfd0d95e6 ("cxgb4/cxgb4vf: Program hash region for {t4/t4vf}_change_mac()") Signed-off-by: Herat Ramani Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. 
Miller --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 75fde0d4d493..ad993d6ff586 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3132,7 +3132,6 @@ static int cxgb_set_mac_addr(struct net_device *dev, void *p) return ret; memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); - pi->xact_addr_filt = ret; return 0; } From cef8dac96bc108633f5090bb3a9988d734dc1ee0 Mon Sep 17 00:00:00 2001 From: Rahul Lakkireddy Date: Wed, 1 Apr 2020 01:17:01 +0530 Subject: [PATCH 34/80] cxgb4: free MQPRIO resources in shutdown path Perform missing MQPRIO resource cleanup in PCI shutdown path. Also, fix MQPRIO MSIX bitmap leak in resource cleanup. Fixes: b1396c2bd675 ("cxgb4: parse and configure TC-MQPRIO offload") Signed-off-by: Rahul Lakkireddy Signed-off-by: David S. Miller --- .../net/ethernet/chelsio/cxgb4/cxgb4_main.c | 4 ++++ .../ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c | 23 +++++++++++++++++++ .../ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h | 1 + 3 files changed, 28 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index ad993d6ff586..a70018f067aa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -6671,6 +6671,10 @@ static void shutdown_one(struct pci_dev *pdev) if (adapter->port[i]->reg_state == NETREG_REGISTERED) cxgb_close(adapter->port[i]); + rtnl_lock(); + cxgb4_mqprio_stop_offload(adapter); + rtnl_unlock(); + if (is_uld(adapter)) { detach_ulds(adapter); t4_uld_clean_up(adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c index ec3eb45ee3b4..e6af4906d674 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c +++ 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c @@ -301,6 +301,7 @@ static void cxgb4_mqprio_free_hw_resources(struct net_device *dev) cxgb4_clear_msix_aff(eorxq->msix->vec, eorxq->msix->aff_mask); free_irq(eorxq->msix->vec, &eorxq->rspq); + cxgb4_free_msix_idx_in_bmap(adap, eorxq->msix->idx); } free_rspq_fl(adap, &eorxq->rspq, &eorxq->fl); @@ -611,6 +612,28 @@ out: return ret; } +void cxgb4_mqprio_stop_offload(struct adapter *adap) +{ + struct cxgb4_tc_port_mqprio *tc_port_mqprio; + struct net_device *dev; + u8 i; + + if (!adap->tc_mqprio || !adap->tc_mqprio->port_mqprio) + return; + + for_each_port(adap, i) { + dev = adap->port[i]; + if (!dev) + continue; + + tc_port_mqprio = &adap->tc_mqprio->port_mqprio[i]; + if (!tc_port_mqprio->mqprio.qopt.num_tc) + continue; + + cxgb4_mqprio_disable_offload(dev); + } +} + int cxgb4_init_tc_mqprio(struct adapter *adap) { struct cxgb4_tc_port_mqprio *tc_port_mqprio, *port_mqprio; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h index c532f1ef8451..ff8794132b22 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h @@ -38,6 +38,7 @@ struct cxgb4_tc_mqprio { int cxgb4_setup_tc_mqprio(struct net_device *dev, struct tc_mqprio_qopt_offload *mqprio); +void cxgb4_mqprio_stop_offload(struct adapter *adap); int cxgb4_init_tc_mqprio(struct adapter *adap); void cxgb4_cleanup_tc_mqprio(struct adapter *adap); #endif /* __CXGB4_TC_MQPRIO_H__ */ From b1f4c209d84057b6d40b939b6e4404854271d797 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 1 Apr 2020 11:57:32 +0200 Subject: [PATCH 35/80] net: phy: at803x: fix clock sink configuration on ATH8030 and ATH8035 The masks in priv->clk_25m_reg and priv->clk_25m_mask are one-bits-set for the values that comprise the fields, not zero-bits-set. 
This patch fixes the clock frequency configuration for ATH8030 and ATH8035 Atheros PHYs by removing the erroneous "~". To reproduce this bug, configure the PHY with the device tree binding "qca,clk-out-frequency" and remove the machine specific PHY fixups. Fixes: 2f664823a47021 ("net: phy: at803x: add device tree binding") Signed-off-by: Oleksij Rempel Reported-by: Russell King Reviewed-by: Russell King Tested-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/at803x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 481cf48c9b9e..31f731e6df72 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -425,8 +425,8 @@ static int at803x_parse_dt(struct phy_device *phydev) */ if (at803x_match_phy_id(phydev, ATH8030_PHY_ID) || at803x_match_phy_id(phydev, ATH8035_PHY_ID)) { - priv->clk_25m_reg &= ~AT8035_CLK_OUT_MASK; - priv->clk_25m_mask &= ~AT8035_CLK_OUT_MASK; + priv->clk_25m_reg &= AT8035_CLK_OUT_MASK; + priv->clk_25m_mask &= AT8035_CLK_OUT_MASK; } } From b9258a2cece4ec1f020715fe3554bc2e360f6264 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Wed, 1 Apr 2020 12:06:39 +0200 Subject: [PATCH 36/80] slcan: Don't transmit uninitialized stack data in padding struct can_frame contains some padding which is not explicitly zeroed in slc_bump. This uninitialized data will then be transmitted if the stack initialization hardening feature is not enabled (CONFIG_INIT_STACK_ALL). This commit just zeroes the whole struct including the padding. Signed-off-by: Richard Palethorpe Fixes: a1044e36e457 ("can: add slcan driver for serial/USB-serial CAN adapters") Reviewed-by: Kees Cook Cc: linux-can@vger.kernel.org Cc: netdev@vger.kernel.org Cc: security@kernel.org Cc: wg@grandegger.com Cc: mkl@pengutronix.de Cc: davem@davemloft.net Acked-by: Marc Kleine-Budde Signed-off-by: David S. 
Miller --- drivers/net/can/slcan.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index 086dfb1b9d0b..91cdc0a2b1a7 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -148,7 +148,7 @@ static void slc_bump(struct slcan *sl) u32 tmpid; char *cmd = sl->rbuff; - cf.can_id = 0; + memset(&cf, 0, sizeof(cf)); switch (*cmd) { case 'r': @@ -187,8 +187,6 @@ static void slc_bump(struct slcan *sl) else return; - *(u64 *) (&cf.data) = 0; /* clear payload */ - /* RTR frames may have a dlc > 0 but they never have any data bytes */ if (!(cf.can_id & CAN_RTR_FLAG)) { for (i = 0; i < cf.can_dlc; i++) { From e6b45ee790ec61e58a810d20a820e19a5878ffe3 Mon Sep 17 00:00:00 2001 From: Hu Haowen Date: Wed, 1 Apr 2020 18:56:24 +0800 Subject: [PATCH 37/80] net/faraday: fix grammar in function ftgmac100_setup_clk() in ftgmac100.c "its not" is wrong. The words should be "it's not". Signed-off-by: Hu Haowen Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 835b7816e372..87236206366f 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1731,7 +1731,7 @@ static int ftgmac100_setup_clk(struct ftgmac100 *priv) if (rc) goto cleanup_clk; - /* RCLK is for RMII, typically used for NCSI. Optional because its not + /* RCLK is for RMII, typically used for NCSI. Optional because it's not * necessary if it's the AST2400 MAC, or the MAC is configured for * RGMII, or the controller is not an ASPEED-based controller. */ From cc213f8c47fbd4eeb6c3a4fa9b8c20d14f8bc8f4 Mon Sep 17 00:00:00 2001 From: Hu Haowen Date: Wed, 1 Apr 2020 20:40:50 +0800 Subject: [PATCH 38/80] bnx2x: correct a comment mistake in grammar It is not right in grammar to spell "Its not". The right one is "It's not". 
And this line is also over 80 characters. So I broke it into two lines as well in order to make that line not be more than 80 characters. Signed-off-by: Hu Haowen Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c index 9638d65d8261..517caedc0a87 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c @@ -6874,7 +6874,8 @@ int bnx2x_link_update(struct link_params *params, struct link_vars *vars) case PORT_HW_CFG_PHY_SELECTION_FIRST_PHY_PRIORITY: /* In this option, the first PHY makes sure to pass the * traffic through itself only. - * Its not clear how to reset the link on the second phy + * It's not clear how to reset the link on the second + * phy. */ active_external_phy = EXT_PHY1; break; From dd6a4998e64a7806b54c3eba1e5e7bf6c81ccf8c Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Wed, 1 Apr 2020 19:29:03 +0200 Subject: [PATCH 39/80] net: stmmac: Fix VLAN filtering when HW does not support it If we don't have any filters available we can't rely upon the return code of stmmac_add_hw_vlan_rx_fltr() / stmmac_del_hw_vlan_rx_fltr(). Add a check for this. Fixes: ed64639bc1e0 ("net: stmmac: Add support for VLAN Rx filtering") Signed-off-by: Jose Abreu Signed-off-by: David S. 
Miller --- .../net/ethernet/stmicro/stmmac/stmmac_main.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 2fb671e61ee8..e6898fd5223f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4566,9 +4566,13 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid return ret; } - ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); + if (priv->hw->num_vlan) { + ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); + if (ret) + return ret; + } - return ret; + return 0; } static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) @@ -4581,9 +4585,12 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi is_double = true; clear_bit(vid, priv->active_vlans); - ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); - if (ret) - return ret; + + if (priv->hw->num_vlan) { + ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); + if (ret) + return ret; + } return stmmac_vlan_update(priv, is_double); } From d61f4d61ff7f73bed1f4c613de59b4534c08a088 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 1 Apr 2020 11:58:04 -0600 Subject: [PATCH 40/80] dt-bindings: net: mvusb: Fix example errors The example for Marvell USB to MDIO Controller doesn't build: Error: Documentation/devicetree/bindings/net/marvell,mvusb.example.dts:18.9-14 syntax error FATAL ERROR: Unable to parse input tree This is due to label refs being used which can't be resolved. Fixes: 61e0150cb44b ("dt-bindings: net: add marvell usb to mdio bindings") Cc: Tobias Waldekranz Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Rob Herring Signed-off-by: David S. 
Miller --- .../bindings/net/marvell,mvusb.yaml | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml index 9458f6659be1..68573762294b 100644 --- a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml +++ b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml @@ -38,28 +38,27 @@ required: examples: - | /* USB host controller */ - &usb1 { - mvusb: mdio@1 { + usb { + #address-cells = <1>; + #size-cells = <0>; + + mdio@1 { compatible = "usb1286,1fa4"; reg = <1>; #address-cells = <1>; #size-cells = <0>; - }; - }; - /* MV88E6390X devboard */ - &mvusb { - switch@0 { - compatible = "marvell,mv88e6190"; - status = "ok"; - reg = <0x0>; + switch@0 { + compatible = "marvell,mv88e6190"; + reg = <0x0>; - ports { - /* Port definitions */ - }; + ports { + /* Port definitions */ + }; - mdio { - /* PHY definitions */ + mdio { + /* PHY definitions */ + }; }; }; }; From 2abb5792387eb188b12051337d5dcd2cba615cb0 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Wed, 1 Apr 2020 15:23:55 -0600 Subject: [PATCH 41/80] net: qualcomm: rmnet: Allow configuration updates to existing devices This allows the changelink operation to succeed if the mux_id was specified as an argument. Note that the mux_id must match the existing mux_id of the rmnet device or should be an unused mux_id. Fixes: 1dc49e9d164c ("net: rmnet: do not allow to change mux id if mux id is duplicated") Reported-and-tested-by: Alex Elder Signed-off-by: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. 
Miller --- .../ethernet/qualcomm/rmnet/rmnet_config.c | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 1305522f72d6..40efe60eff8d 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -282,7 +282,6 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], { struct rmnet_priv *priv = netdev_priv(dev); struct net_device *real_dev; - struct rmnet_endpoint *ep; struct rmnet_port *port; u16 mux_id; @@ -297,19 +296,27 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], if (data[IFLA_RMNET_MUX_ID]) { mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]); - if (rmnet_get_endpoint(port, mux_id)) { - NL_SET_ERR_MSG_MOD(extack, "MUX ID already exists"); - return -EINVAL; + + if (mux_id != priv->mux_id) { + struct rmnet_endpoint *ep; + + ep = rmnet_get_endpoint(port, priv->mux_id); + if (!ep) + return -ENODEV; + + if (rmnet_get_endpoint(port, mux_id)) { + NL_SET_ERR_MSG_MOD(extack, + "MUX ID already exists"); + return -EINVAL; + } + + hlist_del_init_rcu(&ep->hlnode); + hlist_add_head_rcu(&ep->hlnode, + &port->muxed_ep[mux_id]); + + ep->mux_id = mux_id; + priv->mux_id = mux_id; } - ep = rmnet_get_endpoint(port, priv->mux_id); - if (!ep) - return -ENODEV; - - hlist_del_init_rcu(&ep->hlnode); - hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]); - - ep->mux_id = mux_id; - priv->mux_id = mux_id; } if (data[IFLA_RMNET_FLAGS]) { From 86287543715ac2a6d92d561cc105d79306511457 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Apr 2020 00:27:36 +0100 Subject: [PATCH 42/80] net: atlantic: fix missing | operator when assigning rec->llc rec->llc is currently being assigned twice, once with the lower 8 bits from packed_record[8] and then re-assigned afterwards with data from packed_record[9]. 
This looks like a typo, I believe the second assignment should be using the |= operator rather than a direct assignment. Addresses-Coverity: ("Unused value") Fixes: b8f8a0b7b5cb ("net: atlantic: MACSec ingress offload HW bindings") Signed-off-by: Colin Ian King Acked-by: Igor Russkikh Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c index 97901c114bfa..fbe9d88b13c7 100644 --- a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c +++ b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c @@ -491,7 +491,7 @@ get_ingress_preclass_record(struct aq_hw_s *hw, rec->snap[1] = packed_record[8] & 0xFF; rec->llc = (packed_record[8] >> 8) & 0xFF; - rec->llc = packed_record[9] << 8; + rec->llc |= packed_record[9] << 8; rec->mac_sa[0] = packed_record[10]; rec->mac_sa[0] |= packed_record[11] << 16; From bf88dc327de8c311078da557788af5d88b74c8e5 Mon Sep 17 00:00:00 2001 From: kbuild test robot Date: Thu, 2 Apr 2020 09:25:48 +0800 Subject: [PATCH 43/80] net: dsa: dsa_bridge_mtu_normalization() can be static Fixes: f41071407c85 ("net: dsa: implement auto-normalization of MTU for bridge hardware datapath") Signed-off-by: kbuild test robot Signed-off-by: David S. 
Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5390ff541658..e94eb1aac602 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1338,7 +1338,7 @@ static void dsa_hw_port_list_free(struct list_head *hw_port_list) } /* Make the hardware datapath to/from @dev limited to a common MTU */ -void dsa_bridge_mtu_normalization(struct dsa_port *dp) +static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; From 21f6f946938693628721d68f13b8ea2afa789b4a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 2 Apr 2020 10:32:58 +0800 Subject: [PATCH 44/80] crypto/chcr: Add missing include file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/crypto/chelsio/chcr_ktls.c: In function ‘chcr_short_record_handler’: drivers/crypto/chelsio/chcr_ktls.c:1770:12: error: implicit declaration of function ‘kmap_atomic’; did you mean ‘in_atomic’? [-Werror=implicit-function-declaration] vaddr = kmap_atomic(skb_frag_page(f)); ^~~~~~~~~~~ Reported-by: Hulk Robot Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/crypto/chelsio/chcr_ktls.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c index 73658b71d4a3..cd1769ecdc1c 100644 --- a/drivers/crypto/chelsio/chcr_ktls.c +++ b/drivers/crypto/chelsio/chcr_ktls.c @@ -2,6 +2,7 @@ /* Copyright (C) 2020 Chelsio Communications. All rights reserved. 
*/ #ifdef CONFIG_CHELSIO_TLS_DEVICE +#include <linux/highmem.h> #include "chcr_ktls.h" #include "clip_tbl.h" From d16fa759253ff7a42b5257d0db9784caef2da9c0 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Apr 2020 11:18:59 +0100 Subject: [PATCH 45/80] net: ipv6: rpl_iptunnel: remove redundant assignments to variable err The variable err is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/ipv6/rpl_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index a49ddc6cd020..c3ececd7cfc1 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -210,7 +210,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; - int err = -EINVAL; + int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); From 0b4f33def7bbde1ce2fea05f116639270e7acdc7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Apr 2020 13:44:51 +0200 Subject: [PATCH 46/80] mptcp: fix tcp fallback crash Christoph Paasch reports following crash: general protection fault [..] CPU: 0 PID: 2874 Comm: syz-executor072 Not tainted 5.6.0-rc5 #62 RIP: 0010:__pv_queued_spin_lock_slowpath kernel/locking/qspinlock.c:471 [..] queued_spin_lock_slowpath arch/x86/include/asm/qspinlock.h:50 [inline] do_raw_spin_lock include/linux/spinlock.h:181 [inline] spin_lock_bh include/linux/spinlock.h:343 [inline] __mptcp_flush_join_list+0x44/0xb0 net/mptcp/protocol.c:278 mptcp_shutdown+0xb3/0x230 net/mptcp/protocol.c:1882 [..] Problem is that mptcp_shutdown() socket isn't an mptcp socket, it's a plain tcp_sk. Thus, trying to access mptcp_sk specific members accesses garbage. 
Root cause is that accept() returns a fallback (tcp) socket, not an mptcp one. There is code in getpeername to detect this and override the sockets stream_ops. But this will only run when accept() caller provided a sockaddr struct. "accept(fd, NULL, 0)" will therefore result in mptcp stream ops, but with sock->sk pointing at a tcp_sk. Update the existing fallback handling to detect this as well. Moreover, mptcp_shutdown did not have fallback handling, and mptcp_poll did it too late so add that there as well. Reported-by: Christoph Paasch Tested-by: Christoph Paasch Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 50 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1833bc1f4a43..4cf88e3d5121 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -57,10 +57,43 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) return msk->first && !sk_is_mptcp(msk->first); } +static struct socket *mptcp_is_tcpsk(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sock->sk != sk) + return NULL; + + if (unlikely(sk->sk_prot == &tcp_prot)) { + /* we are being invoked after mptcp_accept() has + * accepted a non-mp-capable flow: sk is a tcp_sk, + * not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. 
+ */ + sock->ops = &inet_stream_ops; + return sock; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { + sock->ops = &inet6_stream_ops; + return sock; +#endif + } + + return NULL; +} + static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) { + struct socket *sock; + sock_owned_by_me((const struct sock *)msk); + sock = mptcp_is_tcpsk((struct sock *)msk); + if (unlikely(sock)) + return sock; + if (likely(!__mptcp_needs_tcp_fallback(msk))) return NULL; @@ -84,6 +117,10 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) struct socket *ssock; int err; + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock; + ssock = __mptcp_nmpc_socket(msk); if (ssock) goto set_state; @@ -1752,7 +1789,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, msk = mptcp_sk(sk); lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); + ssock = __mptcp_tcp_fallback(msk); + if (!ssock) + ssock = __mptcp_nmpc_socket(msk); if (ssock) { mask = ssock->ops->poll(file, ssock, wait); release_sock(sk); @@ -1762,9 +1801,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, release_sock(sk); sock_poll_wait(file, sock, wait); lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock->ops->poll(file, ssock, NULL); if (test_bit(MPTCP_DATA_READY, &msk->flags)) mask = EPOLLIN | EPOLLRDNORM; @@ -1783,11 +1819,17 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; + struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + release_sock(sock->sk); + return inet_shutdown(ssock, how); + } if (how == SHUT_WR || how == SHUT_RDWR) inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); From 59832e246515ab6a4f5aa878073e6f415aa35166 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: 
Thu, 2 Apr 2020 13:44:52 +0200 Subject: [PATCH 47/80] mptcp: subflow: check parent mptcp socket on subflow state change This is needed at least until proper MPTCP-Level fin/reset signalling gets added: We wake parent when a subflow changes, but we should do this only when all subflows have closed, not just one. Schedule the mptcp worker and tell it to check eof state on all subflows. Only flag mptcp socket as closed and wake userspace processes blocking in poll if all subflows have closed. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 33 +++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 2 ++ net/mptcp/subflow.c | 3 +-- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 4cf88e3d5121..8cc9dd2cc828 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -327,6 +327,15 @@ void mptcp_data_acked(struct sock *sk) sock_hold(sk); } +void mptcp_subflow_eof(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && + schedule_work(&msk->work)) + sock_hold(sk); +} + static void mptcp_stop_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -1031,6 +1040,27 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void mptcp_check_for_eof(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int receivers = 0; + + mptcp_for_each_subflow(msk, subflow) + receivers += !subflow->rx_eof; + + if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + /* hopefully temporary hack: propagate shutdown status + * to msk, when all subflows agree on it + */ + sk->sk_shutdown |= RCV_SHUTDOWN; + + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + sk->sk_data_ready(sk); + } +} + static void 
mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1047,6 +1077,9 @@ static void mptcp_worker(struct work_struct *work) __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) + mptcp_check_for_eof(msk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f733c5425552..67448002a2d7 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 +#define MPTCP_WORK_EOF 3 static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { @@ -339,6 +340,7 @@ void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); +void mptcp_subflow_eof(struct sock *sk); int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(u32 token); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index b5180c81588e..50a8bea987c6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -994,8 +994,7 @@ static void subflow_state_change(struct sock *sk) if (!(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; - parent->sk_shutdown |= RCV_SHUTDOWN; - __subflow_state_change(parent); + mptcp_subflow_eof(parent); } } From de06f57392b60e4d92135fbbedad4aea7d1107e2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 2 Apr 2020 13:44:53 +0200 Subject: [PATCH 48/80] mptcp: re-check dsn before reading from subflow mptcp_subflow_data_available() is commonly called via ssk->sk_data_ready(), in this case the mptcp socket lock cannot be acquired. 
Therefore, while we can safely discard subflow data that was already received up to msk->ack_seq, we cannot be sure that 'subflow->data_avail' will still be valid at the time userspace wants to read the data -- a previous read on a different subflow might have carried this data already. In that (unlikely) event, msk->ack_seq will have been updated and will be ahead of the subflow dsn. We can check for this condition and skip/resync to the expected sequence number. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cc9dd2cc828..939a5045181a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -158,6 +158,27 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->offset = offset; } +/* both sockets must be locked */ +static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, + struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u64 dsn = mptcp_subflow_get_mapped_dsn(subflow); + + /* revalidate data sequence number. + * + * mptcp_subflow_data_available() is usually called + * without msk lock. Its unlikely (but possible) + * that msk->ack_seq has been advanced since the last + * call found in-sequence data. 
+ */ + if (likely(dsn == msk->ack_seq)) + return true; + + subflow->data_avail = 0; + return mptcp_subflow_data_available(ssk); +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -169,6 +190,11 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct tcp_sock *tp; bool done = false; + if (!mptcp_subflow_dsn_valid(msk, ssk)) { + *bytes = 0; + return false; + } + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); From 564cf2f3953678e08a97c43198badf77b042cf96 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 2 Apr 2020 13:44:54 +0200 Subject: [PATCH 49/80] mptcp: fix "fn parameter not described" warnings Obtained with: $ make W=1 net/mptcp/token.o net/mptcp/token.c:53: warning: Function parameter or member 'req' not described in 'mptcp_token_new_request' net/mptcp/token.c:98: warning: Function parameter or member 'sk' not described in 'mptcp_token_new_connect' net/mptcp/token.c:133: warning: Function parameter or member 'conn' not described in 'mptcp_token_new_accept' net/mptcp/token.c:178: warning: Function parameter or member 'token' not described in 'mptcp_token_destroy_request' net/mptcp/token.c:191: warning: Function parameter or member 'token' not described in 'mptcp_token_destroy' Fixes: 79c0949e9a09 (mptcp: Add key generation and token tree) Fixes: 58b09919626b (mptcp: create msk early) Signed-off-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- net/mptcp/token.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 129a5ad1bc35..33352dd99d4d 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -40,7 +40,7 @@ static int token_used __read_mostly; /** * mptcp_token_new_request - create new key/idsn/token for subflow_request - * @req - the request socket + * @req: the request socket * * This function is called when a new mptcp connection is coming in. * @@ -80,7 +80,7 @@ int mptcp_token_new_request(struct request_sock *req) /** * mptcp_token_new_connect - create new key/idsn/token for subflow - * @sk - the socket that will initiate a connection + * @sk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. @@ -125,6 +125,7 @@ int mptcp_token_new_connect(struct sock *sk) /** * mptcp_token_new_accept - insert token for later processing * @token: the token to insert to the tree + * @conn: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. @@ -169,7 +170,7 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token - token of mptcp connection to remove + * @token: token of mptcp connection to remove * * Remove not-yet-fully-established incoming connection identified * by @token. @@ -183,7 +184,7 @@ void mptcp_token_destroy_request(u32 token) /** * mptcp_token_destroy - remove mptcp connection/token - * @token - token of mptcp connection to remove + * @token: token of mptcp connection to remove * * Remove the connection identified by @token. 
*/ From a7d40cbb24900c0a9b2bbcff25b44e2dd14eeee1 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 2 Apr 2020 21:23:44 +0800 Subject: [PATCH 50/80] net: cavium: Fix build errors due to 'imply CAVIUM_PTP' If CAVIUM_PTP is m and THUNDER_NIC_VF is y, build fails: drivers/net/ethernet/cavium/thunder/nicvf_main.o: In function 'nicvf_remove': nicvf_main.c:(.text+0x1f0): undefined reference to 'cavium_ptp_put' drivers/net/ethernet/cavium/thunder/nicvf_main.o: In function `nicvf_probe': nicvf_main.c:(.text+0x557c): undefined reference to 'cavium_ptp_get' THUNDER_NIC_VF imply CAVIUM_PTP, which allow the config now, Use IS_REACHABLE() to avoid the vmlinux link error for this case. Reported-by: Hulk Robot Fixes: def2fbffe62c ("kconfig: allow symbols implied by y to become m") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/common/cavium_ptp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cavium/common/cavium_ptp.h b/drivers/net/ethernet/cavium/common/cavium_ptp.h index a04eccbc78e8..1e0ffe8f4152 100644 --- a/drivers/net/ethernet/cavium/common/cavium_ptp.h +++ b/drivers/net/ethernet/cavium/common/cavium_ptp.h @@ -24,7 +24,7 @@ struct cavium_ptp { struct ptp_clock *ptp_clock; }; -#if IS_ENABLED(CONFIG_CAVIUM_PTP) +#if IS_REACHABLE(CONFIG_CAVIUM_PTP) struct cavium_ptp *cavium_ptp_get(void); void cavium_ptp_put(struct cavium_ptp *ptp); From 21f64e72e7073199a6f8d7d8efe52cd814d7d665 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Thu, 2 Apr 2020 15:57:07 +0200 Subject: [PATCH 51/80] net: stmmac: xgmac: Fix VLAN register handling Commit 907a076881f1, forgot that we need to clear old values of XGMAC_VLAN_TAG register when we switch from VLAN perfect matching to HASH matching. Fix it. Fixes: 907a076881f1 ("net: stmmac: xgmac: fix incorrect XGMAC_VLAN_TAG register writting") Signed-off-by: Jose Abreu Signed-off-by: David S. 
Miller --- drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 0e4575f7bedb..ad4df9bddcf3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -577,8 +577,13 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash, value |= XGMAC_VLAN_EDVLP; value |= XGMAC_VLAN_ESVL; value |= XGMAC_VLAN_DOVLTC; + } else { + value &= ~XGMAC_VLAN_EDVLP; + value &= ~XGMAC_VLAN_ESVL; + value &= ~XGMAC_VLAN_DOVLTC; } + value &= ~XGMAC_VLAN_VID; writel(value, ioaddr + XGMAC_VLAN_TAG); } else if (perfect_match) { u32 value = readl(ioaddr + XGMAC_PACKET_FILTER); @@ -589,13 +594,19 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash, value = readl(ioaddr + XGMAC_VLAN_TAG); + value &= ~XGMAC_VLAN_VTHM; value |= XGMAC_VLAN_ETV; if (is_double) { value |= XGMAC_VLAN_EDVLP; value |= XGMAC_VLAN_ESVL; value |= XGMAC_VLAN_DOVLTC; + } else { + value &= ~XGMAC_VLAN_EDVLP; + value &= ~XGMAC_VLAN_ESVL; + value &= ~XGMAC_VLAN_DOVLTC; } + value &= ~XGMAC_VLAN_VID; writel(value | perfect_match, ioaddr + XGMAC_VLAN_TAG); } else { u32 value = readl(ioaddr + XGMAC_PACKET_FILTER); From c427bfec18f2190b8f4718785ee8ed2db4f84ee6 Mon Sep 17 00:00:00 2001 From: Vincent Bernat Date: Tue, 31 Mar 2020 15:20:10 +0200 Subject: [PATCH 52/80] net: core: enable SO_BINDTODEVICE for non-root users Currently, SO_BINDTODEVICE requires CAP_NET_RAW. This change allows a non-root user to bind a socket to an interface if it is not already bound. This is useful to allow an application to bind itself to a specific VRF for outgoing or incoming connections. Currently, an application wanting to manage connections through several VRF need to be privileged. 
Previously, IP_UNICAST_IF and IPV6_UNICAST_IF were added for Wine (76e21053b5bf3 and c4062dfc425e9) specifically for use by non-root processes. However, they are restricted to sendmsg() and not usable with TCP. Allowing SO_BINDTODEVICE would allow TCP clients to get the same privilege. As for TCP servers, outside the VRF use case, SO_BINDTODEVICE would only further restrict connections a server could accept. When an application is restricted to a VRF (with `ip vrf exec`), the socket is bound to an interface at creation and therefore, a non-privileged call to SO_BINDTODEVICE to escape the VRF fails. When an application bound a socket to SO_BINDTODEVICE and transmit it to a non-privileged process through a Unix socket, a tentative to change the bound device also fails. Before: >>> import socket >>> s=socket.socket(socket.AF_INET, socket.SOCK_STREAM) >>> s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"dummy0") Traceback (most recent call last): File "", line 1, in PermissionError: [Errno 1] Operation not permitted After: >>> import socket >>> s=socket.socket(socket.AF_INET, socket.SOCK_STREAM) >>> s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"dummy0") >>> s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"dummy0") Traceback (most recent call last): File "", line 1, in PermissionError: [Errno 1] Operation not permitted Signed-off-by: Vincent Bernat Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index da32d9b6d09f..ce1d8dce9b7a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) /* Sorry... 
*/ ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; From 64948427a63f49dd0ce403388d232f22cc1971a8 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 26 Mar 2020 04:27:24 +0800 Subject: [PATCH 53/80] net: openvswitch: use hlist_for_each_entry_rcu instead of hlist_for_each_entry The struct sw_flow is protected by RCU, when traversing them, use hlist_for_each_entry_rcu. Signed-off-by: Tonghao Zhang Tested-by: Greg Rose Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/openvswitch/flow_table.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index fd8a01ca7a2d..2398d7238300 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old, struct hlist_head *head = &old->buckets[i]; if (ufid) - hlist_for_each_entry(flow, head, - ufid_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + ufid_table.node[old_ver], + lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else - hlist_for_each_entry(flow, head, - flow_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + flow_table.node[old_ver], + lockdep_ovsl_is_held()) table_instance_insert(new, flow); } From 19e16d220f0adbf899a652dfb1fde2e3a95153e9 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 1 Apr 2020 14:46:20 +0800 Subject: [PATCH 54/80] neigh: support smaller retrans_time settting Currently, we limited the retrans_time to be greater than HZ/2. i.e. setting retrans_time less than 500ms will not work. This makes the user unable to achieve a more accurate control for bonding arp fast failover. Update the sanity check to HZ/100, which is 10ms, to let users have more ability on the retrans_time control. 
v3: sync the behavior with IPv6 and update all the timer handler v2: use HZ instead of hard code number Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- net/core/neighbour.c | 10 ++++++---- net/ipv6/addrconf.c | 7 ++++--- net/ipv6/ndisc.c | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5bf8d22a47ec..39d37d0ef575 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t) neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/2); + HZ/100); neigh_add_timer(neigh, next); immediate_probe = true; } else { @@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->nud_state = NUD_INCOMPLETE; atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, - jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 84a28b539c43..24e319dfb510 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1357,7 +1357,7 @@ retry: regen_advance = idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; /* recalculate max_desync_factor each time and update * 
idev->desync_factor if it's larger @@ -4121,7 +4121,8 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_probes--; addrconf_mod_dad_work(ifp, - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), + HZ/100)); spin_unlock(&ifp->lock); write_unlock_bh(&idev->lock); @@ -4527,7 +4528,7 @@ restart: !(ifp->flags&IFA_F_TENTATIVE)) { unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * ifp->idev->cnf.dad_transmits * - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; if (age >= ifp->prefered_lft - regen_advance) { struct inet6_ifaddr *ifpub = ifp->ifpub; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 6ffa153e5166..1ecd4e9b0bdf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1359,8 +1359,8 @@ skip_defrtr: if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; - if (rtime < HZ/10) - rtime = HZ/10; + if (rtime < HZ/100) + rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; From 2f5107c34df329854d9e0bcef8185db8c07b59e5 Mon Sep 17 00:00:00 2001 From: Petko Manolov Date: Thu, 2 Apr 2020 17:33:29 +0300 Subject: [PATCH 55/80] pegasus: Remove pegasus' own workqueue Remove pegasus' own workqueue and replace it with system_long_wq. Signed-off-by: Petko Manolov Signed-off-by: David S. 
Miller --- drivers/net/usb/pegasus.c | 38 ++++++-------------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 8783e2ab3ec0..0ef7e1f443e3 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -54,6 +54,7 @@ static const char driver_name[] = "pegasus"; #undef PEGASUS_WRITE_EEPROM #define BMSR_MEDIA (BMSR_10HALF | BMSR_10FULL | BMSR_100HALF | \ BMSR_100FULL | BMSR_ANEGCAPABLE) +#define CARRIER_CHECK_DELAY (2 * HZ) static bool loopback; static bool mii_mode; @@ -1089,17 +1090,12 @@ static inline void setup_pegasus_II(pegasus_t *pegasus) set_register(pegasus, Reg81, 2); } - -static int pegasus_count; -static struct workqueue_struct *pegasus_workqueue; -#define CARRIER_CHECK_DELAY (2 * HZ) - static void check_carrier(struct work_struct *work) { pegasus_t *pegasus = container_of(work, pegasus_t, carrier_check.work); set_carrier(pegasus->net); if (!(pegasus->flags & PEGASUS_UNPLUG)) { - queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check, + queue_delayed_work(system_long_wq, &pegasus->carrier_check, CARRIER_CHECK_DELAY); } } @@ -1120,18 +1116,6 @@ static int pegasus_blacklisted(struct usb_device *udev) return 0; } -/* we rely on probe() and remove() being serialized so we - * don't need extra locking on pegasus_count. 
- */ -static void pegasus_dec_workqueue(void) -{ - pegasus_count--; - if (pegasus_count == 0) { - destroy_workqueue(pegasus_workqueue); - pegasus_workqueue = NULL; - } -} - static int pegasus_probe(struct usb_interface *intf, const struct usb_device_id *id) { @@ -1144,14 +1128,6 @@ static int pegasus_probe(struct usb_interface *intf, if (pegasus_blacklisted(dev)) return -ENODEV; - if (pegasus_count == 0) { - pegasus_workqueue = alloc_workqueue("pegasus", WQ_MEM_RECLAIM, - 0); - if (!pegasus_workqueue) - return -ENOMEM; - } - pegasus_count++; - net = alloc_etherdev(sizeof(struct pegasus)); if (!net) goto out; @@ -1209,7 +1185,7 @@ static int pegasus_probe(struct usb_interface *intf, res = register_netdev(net); if (res) goto out3; - queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check, + queue_delayed_work(system_long_wq, &pegasus->carrier_check, CARRIER_CHECK_DELAY); dev_info(&intf->dev, "%s, %s, %pM\n", net->name, usb_dev_id[dev_index].name, net->dev_addr); @@ -1222,7 +1198,6 @@ out2: out1: free_netdev(net); out: - pegasus_dec_workqueue(); return res; } @@ -1237,7 +1212,7 @@ static void pegasus_disconnect(struct usb_interface *intf) } pegasus->flags |= PEGASUS_UNPLUG; - cancel_delayed_work(&pegasus->carrier_check); + cancel_delayed_work_sync(&pegasus->carrier_check); unregister_netdev(pegasus->net); unlink_all_urbs(pegasus); free_all_urbs(pegasus); @@ -1246,7 +1221,6 @@ static void pegasus_disconnect(struct usb_interface *intf) pegasus->rx_skb = NULL; } free_netdev(pegasus->net); - pegasus_dec_workqueue(); } static int pegasus_suspend(struct usb_interface *intf, pm_message_t message) @@ -1254,7 +1228,7 @@ static int pegasus_suspend(struct usb_interface *intf, pm_message_t message) struct pegasus *pegasus = usb_get_intfdata(intf); netif_device_detach(pegasus->net); - cancel_delayed_work(&pegasus->carrier_check); + cancel_delayed_work_sync(&pegasus->carrier_check); if (netif_running(pegasus->net)) { usb_kill_urb(pegasus->rx_urb); 
usb_kill_urb(pegasus->intr_urb); @@ -1276,7 +1250,7 @@ static int pegasus_resume(struct usb_interface *intf) pegasus->intr_urb->actual_length = 0; intr_callback(pegasus->intr_urb); } - queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check, + queue_delayed_work(system_long_wq, &pegasus->carrier_check, CARRIER_CHECK_DELAY); return 0; } From 468c2a100208461c1821315f6fd81b32b9d12561 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 2 Apr 2020 15:48:51 +0100 Subject: [PATCH 56/80] mlxsw: spectrum_trap: fix unintention integer overflow on left shift Shifting the integer value 1 is evaluated using 32-bit arithmetic and then used in an expression that expects a 64-bit value, so there is potentially an integer overflow. Fix this by using the BIT_ULL macro to perform the shift and avoid the overflow. Addresses-Coverity: ("Unintentional integer overflow") Fixes: 13f2e64b94ea ("mlxsw: spectrum_trap: Add devlink-trap policer support") Signed-off-by: Colin Ian King Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 9096ffd89e50..fbf714d027d8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -643,7 +643,7 @@ static int mlxsw_sp_trap_policer_bs(u64 burst, u8 *p_burst_size, { int bs = fls64(burst) - 1; - if (burst != (1 << bs)) { + if (burst != (BIT_ULL(bs))) { NL_SET_ERR_MSG_MOD(extack, "Policer burst size is not power of two"); return -EINVAL; } From 6a0033457f2380852a3a4138d4c79571a5b51be6 Mon Sep 17 00:00:00 2001 From: chenqiwu Date: Fri, 14 Feb 2020 17:16:25 +0800 Subject: [PATCH 57/80] drivers/ide: convert to list_for_each_entry_safe() Use list_for_each_entry_safe() instead of list_for_each_safe() to simplify the code. 
Signed-off-by: chenqiwu Signed-off-by: David S. Miller --- drivers/ide/ide-scan-pci.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c index acf874800ca4..383f0d813001 100644 --- a/drivers/ide/ide-scan-pci.c +++ b/drivers/ide/ide-scan-pci.c @@ -89,8 +89,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev) static int __init ide_scan_pcibus(void) { struct pci_dev *dev = NULL; - struct pci_driver *d; - struct list_head *l, *n; + struct pci_driver *d, *tmp; pre_init = 0; for_each_pci_dev(dev) @@ -101,9 +100,8 @@ static int __init ide_scan_pcibus(void) * are post init. */ - list_for_each_safe(l, n, &ide_pci_drivers) { - list_del(l); - d = list_entry(l, struct pci_driver, node); + list_for_each_entry_safe(d, tmp, &ide_pci_drivers, node) { + list_del(d->node); if (__pci_register_driver(d, d->driver.owner, d->driver.mod_name)) printk(KERN_ERR "%s: failed to register %s driver\n", From 255a69a94b8cbbcfef142137b5e7f99b5d168771 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2020 09:43:42 +0100 Subject: [PATCH 58/80] sparc32: use per-device dma_ops sparc32 is the last platform making dynamic decisions in get_arch_dma_ops based on the bus passed in. Instead set the iommu dma_ops at iommu probing and propagate them in of_propagate_archdata, falling back to the NULL ops for the direct mapping in the Leon or PCI case. Signed-off-by: Christoph Hellwig Signed-off-by: David S. 
Miller --- arch/sparc/include/asm/dma-mapping.h | 15 ++------------- arch/sparc/kernel/ioport.c | 3 --- arch/sparc/kernel/of_device_common.c | 1 + arch/sparc/mm/io-unit.c | 9 ++++----- arch/sparc/mm/iommu.c | 15 ++++++++------- arch/sparc/mm/mm_32.h | 3 --- arch/sparc/mm/srmmu.c | 4 +--- 7 files changed, 16 insertions(+), 34 deletions(-) diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index ed32845bd2d2..2f051343612e 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -2,23 +2,12 @@ #ifndef ___ASM_SPARC_DMA_MAPPING_H #define ___ASM_SPARC_DMA_MAPPING_H -#include - extern const struct dma_map_ops *dma_ops; -extern struct bus_type pci_bus_type; - static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { -#ifdef CONFIG_SPARC_LEON - if (sparc_cpu_model == sparc_leon) - return NULL; -#endif -#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI) - if (bus == &pci_bus_type) - return NULL; -#endif - return dma_ops; + /* sparc32 uses per-device dma_ops */ + return IS_ENABLED(CONFIG_SPARC64) ? 
dma_ops : NULL; } #endif diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index e59461d03b9a..d6874c9b639f 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -373,9 +373,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, dma_make_coherent(paddr, PAGE_ALIGN(size)); } -const struct dma_map_ops *dma_ops; -EXPORT_SYMBOL(dma_ops); - #ifdef CONFIG_PROC_FS static int sparc_io_proc_show(struct seq_file *m, void *v) diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c index b32cc5610712..e717a56efc5d 100644 --- a/arch/sparc/kernel/of_device_common.c +++ b/arch/sparc/kernel/of_device_common.c @@ -67,6 +67,7 @@ void of_propagate_archdata(struct platform_device *bus) op->dev.archdata.stc = bus_sd->stc; op->dev.archdata.host_controller = bus_sd->host_controller; op->dev.archdata.numa_node = bus_sd->numa_node; + op->dev.dma_ops = bus->dev.dma_ops; if (dp->child) of_propagate_archdata(op); diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c index 33a0facd9eb5..289276b99b01 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -38,6 +38,8 @@ #define IOPERM (IOUPTE_CACHE | IOUPTE_WRITE | IOUPTE_VALID) #define MKIOPTE(phys) __iopte((((phys)>>4) & IOUPTE_PAGE) | IOPERM) +static const struct dma_map_ops iounit_dma_ops; + static void __init iounit_iommu_init(struct platform_device *op) { struct iounit_struct *iounit; @@ -70,6 +72,8 @@ static void __init iounit_iommu_init(struct platform_device *op) xptend = iounit->page_table + (16 * PAGE_SIZE) / sizeof(iopte_t); for (; xpt < xptend; xpt++) sbus_writel(0, xpt); + + op->dev.dma_ops = &iounit_dma_ops; } static int __init iounit_init(void) @@ -288,8 +292,3 @@ static const struct dma_map_ops iounit_dma_ops = { .map_sg = iounit_map_sg, .unmap_sg = iounit_unmap_sg, }; - -void __init ld_mmu_iounit(void) -{ - dma_ops = &iounit_dma_ops; -} diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 
4d3c6991f0ae..b00dde13681b 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -54,6 +54,9 @@ static pgprot_t dvma_prot; /* Consistent mapping pte flags */ #define IOPERM (IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID) #define MKIOPTE(pfn, perm) (((((pfn)<<8) & IOPTE_PAGE) | (perm)) & ~IOPTE_WAZ) +static const struct dma_map_ops sbus_iommu_dma_gflush_ops; +static const struct dma_map_ops sbus_iommu_dma_pflush_ops; + static void __init sbus_iommu_init(struct platform_device *op) { struct iommu_struct *iommu; @@ -129,6 +132,11 @@ static void __init sbus_iommu_init(struct platform_device *op) (int)(IOMMU_NPTES*sizeof(iopte_t)), (int)IOMMU_NPTES); op->dev.archdata.iommu = iommu; + + if (flush_page_for_dma_global) + op->dev.dma_ops = &sbus_iommu_dma_gflush_ops; + else + op->dev.dma_ops = &sbus_iommu_dma_pflush_ops; } static int __init iommu_init(void) @@ -445,13 +453,6 @@ static const struct dma_map_ops sbus_iommu_dma_pflush_ops = { void __init ld_mmu_iommu(void) { - if (flush_page_for_dma_global) { - /* flush_page_for_dma flushes everything, no matter of what page is it */ - dma_ops = &sbus_iommu_dma_gflush_ops; - } else { - dma_ops = &sbus_iommu_dma_pflush_ops; - } - if (viking_mxcc_present || srmmu_modtype == HyperSparc) { dvma_prot = __pgprot(SRMMU_CACHE | SRMMU_ET_PTE | SRMMU_PRIV); ioperm_noc = IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID; diff --git a/arch/sparc/mm/mm_32.h b/arch/sparc/mm/mm_32.h index 0d0b06e952a5..ce750a99eea9 100644 --- a/arch/sparc/mm/mm_32.h +++ b/arch/sparc/mm/mm_32.h @@ -20,6 +20,3 @@ void __init srmmu_paging_init(void); /* iommu.c */ void ld_mmu_iommu(void); - -/* io-unit.c */ -void ld_mmu_iounit(void); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index f56c3c9a9793..b7c94de70cca 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -1865,9 +1865,7 @@ void __init load_mmu(void) &smp_cachetlb_ops; #endif - if (sparc_cpu_model == sun4d) - ld_mmu_iounit(); - else + if (sparc_cpu_model != sun4d) ld_mmu_iommu(); 
#ifdef CONFIG_SMP if (sparc_cpu_model == sun4d) From 3e1221acf6a8f8595b5ce354bab4327a69d54d18 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 3 Apr 2020 10:23:29 +0800 Subject: [PATCH 59/80] net: stmmac: dwmac1000: fix out-of-bounds mac address reg setting Commit 9463c4455900 ("net: stmmac: dwmac1000: Clear unused address entries") cleared the unused mac address entries, but introduced an out-of bounds mac address register programming bug -- After setting the secondary unicast mac addresses, the "reg" value has reached netdev_uc_count() + 1, thus we should only clear address entries if (addr < perfect_addr_number) Fixes: 9463c4455900 ("net: stmmac: dwmac1000: Clear unused address entries") Signed-off-by: Jisheng Zhang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 542784300620..efc6ec1b8027 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -207,7 +207,7 @@ static void dwmac1000_set_filter(struct mac_device_info *hw, reg++; } - while (reg <= perfect_addr_number) { + while (reg < perfect_addr_number) { writel(0, ioaddr + GMAC_ADDR_HIGH(reg)); writel(0, ioaddr + GMAC_ADDR_LOW(reg)); reg++; From a8eab6d35e22f4f21471f16147be79529cd6aaf7 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 2 Apr 2020 20:58:51 -0700 Subject: [PATCH 60/80] net_sched: fix a missing refcnt in tcindex_init() The initial refcnt of struct tcindex_data should be 1, it is clear that I forgot to set it to 1 in tcindex_init(). This leads to a dec-after-zero warning. Reported-by: syzbot+8325e509a1bf83ec741d@syzkaller.appspotmail.com Fixes: 304e024216a8 ("net_sched: add a temporary refcnt for struct tcindex_data") Cc: Jamal Hadi Salim Cc: Jiri Pirko Cc: Paul E. 
McKenney Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 065345832a69..61e95029c18f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -151,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp) p->mask = 0xffff; p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; + refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ rcu_assign_pointer(tp->root, p); return 0; From 6110dff776f7fa65c35850ef65b41d3b39e2fac2 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 3 Apr 2020 09:53:25 +0200 Subject: [PATCH 61/80] net: phy: micrel: kszphy_resume(): add delay after genphy_resume() before accessing PHY registers After the power-down bit is cleared, the chip internally triggers a global reset. According to the KSZ9031 documentation, we have to wait at least 1ms for the reset to finish. If the chip is accessed during reset, read will return 0xffff, while write will be ignored. Depending on the system performance and MDIO bus speed, we may or may not run in to this issue. This bug was discovered on an iMX6QP system with KSZ9031 PHY and attached PHY interrupt line. If IRQ was used, the link status update was lost. In polling mode, the link status update was always correct. The investigation showed, that during a read-modify-write access, the read returned 0xffff (while the chip was still in reset) and corresponding write hit the chip _after_ reset and triggered (due to the 0xffff) another reset in an undocumented bit (register 0x1f, bit 1), resulting in the next write being lost due to the new reset cycle. This patch fixes the issue by adding a 1...2 ms sleep after the genphy_resume(). Fixes: 836384d2501d ("net: phy: micrel: Add specific suspend") Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 2ec19e5540bf..05d20343b816 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -25,6 +25,7 @@ #include #include #include +#include /* Operation Mode Strap Override */ #define MII_KSZPHY_OMSO 0x16 @@ -952,6 +953,12 @@ static int kszphy_resume(struct phy_device *phydev) genphy_resume(phydev); + /* After switching from power-down to normal mode, an internal global + * reset is automatically generated. Wait a minimum of 1 ms before + * read/write access to the PHY registers. + */ + usleep_range(1000, 2000); + ret = kszphy_config_reset(phydev); if (ret) return ret; From c85adced953af8eb443852c12e8ea1142de91b7c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 3 Apr 2020 17:14:08 +0800 Subject: [PATCH 62/80] mptcp: add some missing pr_fmt defines Some of the mptcp logs didn't print out the format string: [ 185.651493] DSS [ 185.651494] data_fin=0 dsn64=0 use_map=0 ack64=1 use_ack=1 [ 185.651494] data_ack=13792750332298763796 [ 185.651495] MPTCP: msk=00000000c4b81cfc ssk=000000009743af53 data_avail=0 skb=0000000063dc595d [ 185.651495] MPTCP: msk=00000000c4b81cfc ssk=000000009743af53 status=0 [ 185.651495] MPTCP: msk ack_seq=9bbc894565aa2f9a subflow ack_seq=9bbc894565aa2f9a [ 185.651496] MPTCP: msk=00000000c4b81cfc ssk=000000009743af53 data_avail=1 skb=0000000012e809e1 So this patch added these missing pr_fmt defines. 
Then we can get the same format string "MPTCP" in all mptcp logs like this: [ 142.795829] MPTCP: DSS [ 142.795829] MPTCP: data_fin=0 dsn64=0 use_map=0 ack64=1 use_ack=1 [ 142.795829] MPTCP: data_ack=8089704603109242421 [ 142.795830] MPTCP: msk=00000000133a24e0 ssk=000000002e508c64 data_avail=0 skb=00000000d5f230df [ 142.795830] MPTCP: msk=00000000133a24e0 ssk=000000002e508c64 status=0 [ 142.795831] MPTCP: msk ack_seq=66790290f1199d9b subflow ack_seq=66790290f1199d9b [ 142.795831] MPTCP: msk=00000000133a24e0 ssk=000000002e508c64 data_avail=1 skb=00000000de5aca2e Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/mptcp/options.c | 2 ++ net/mptcp/pm.c | 2 ++ net/mptcp/pm_netlink.c | 2 ++ 3 files changed, 6 insertions(+) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bd220ee4aac9..faf57585b892 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 - 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 064639f72487..977d9c8b1453 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -3,6 +3,8 @@ * * Copyright (c) 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a0ce7f324499..86d61ab34c7c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -4,6 +4,8 @@ * Copyright (c) 2020, Red Hat, Inc. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include From 0452800f6db4ed0a42ffb15867c0acfd68829f6a Mon Sep 17 00:00:00 2001 From: Chuanhong Guo Date: Fri, 3 Apr 2020 19:28:24 +0800 Subject: [PATCH 63/80] net: dsa: mt7530: fix null pointer dereferencing in port5 setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 2nd gmac of mediatek soc ethernet may not be connected to a PHY and a phy-handle isn't always available. 
Unfortunately, mt7530 dsa driver assumes that the 2nd gmac is always connected to switch port 5 and setup mt7530 according to phy address of 2nd gmac node, causing null pointer dereferencing when phy-handle isn't defined in dts. This commit fix this setup code by checking return value of of_parse_phandle before using it. Fixes: 38f790a80560 ("net: dsa: mt7530: Add support for port 5") Signed-off-by: Chuanhong Guo Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Tested-by: René van Dorst Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index ef57552db260..2d0d91db0ddb 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1403,6 +1403,9 @@ mt7530_setup(struct dsa_switch *ds) continue; phy_node = of_parse_phandle(mac_np, "phy-handle", 0); + if (!phy_node) + continue; + if (phy_node->parent == priv->dev->of_node->parent) { ret = of_get_phy_mode(mac_np, &interface); if (ret && ret != -ENODEV) From 78a515f9faf3af43cf9c4f61254dd1c5b3cb2d0b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 4 Apr 2020 18:07:59 -0700 Subject: [PATCH 64/80] drivers/ide: Fix build regression. drivers/ide/ide-scan-pci.c: In function 'ide_scan_pcibus': >> drivers/ide/ide-scan-pci.c:104:13: error: incompatible type for argument 1 of 'list_del' 104 | list_del(d->node); | ~^~~~~~ | | | struct list_head In file included from include/linux/module.h:12, from drivers/ide/ide-scan-pci.c:12: include/linux/list.h:144:47: note: expected 'struct list_head *' but argument is of type 'struct list_head' 144 | static inline void list_del(struct list_head *entry) | ~~~~~~~~~~~~~~~~~~^~~~~ Fixes: 6a0033457f23 ("drivers/ide: convert to list_for_each_entry_safe()") Reported-by: kbuild test robot Signed-off-by: David S. 
Miller --- drivers/ide/ide-scan-pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c index 383f0d813001..b0411a1827a3 100644 --- a/drivers/ide/ide-scan-pci.c +++ b/drivers/ide/ide-scan-pci.c @@ -101,7 +101,7 @@ static int __init ide_scan_pcibus(void) */ list_for_each_entry_safe(d, tmp, &ide_pci_drivers, node) { - list_del(d->node); + list_del(&d->node); if (__pci_register_driver(d, d->driver.owner, d->driver.mod_name)) printk(KERN_ERR "%s: failed to register %s driver\n", From 997ba6573685451c37005a5b74a4baa3c16b5231 Mon Sep 17 00:00:00 2001 From: afzal mohammed Date: Wed, 4 Mar 2020 06:20:24 +0530 Subject: [PATCH 65/80] parisc: Replace setup_irq() by request_irq() request_irq() is preferred over setup_irq(). Invocations of setup_irq() occur after memory allocators are ready. Per tglx[1], setup_irq() existed in olden days when allocators were not ready by the time early interrupts were initialized. Hence replace setup_irq() by request_irq(). 
[1] https://lkml.kernel.org/r/alpine.DEB.2.20.1710191609480.1971@nanos Signed-off-by: afzal mohammed Signed-off-by: Helge Deller --- arch/parisc/kernel/irq.c | 22 ++++++---------------- drivers/parisc/eisa.c | 8 ++------ 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index e5fcfb70cc7c..e76c86619949 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -560,33 +560,23 @@ void do_cpu_irq_mask(struct pt_regs *regs) goto out; } -static struct irqaction timer_action = { - .handler = timer_interrupt, - .name = "timer", - .flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL, -}; - -#ifdef CONFIG_SMP -static struct irqaction ipi_action = { - .handler = ipi_interrupt, - .name = "IPI", - .flags = IRQF_PERCPU, -}; -#endif - static void claim_cpu_irqs(void) { + unsigned long flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL; int i; + for (i = CPU_IRQ_BASE; i <= CPU_IRQ_MAX; i++) { irq_set_chip_and_handler(i, &cpu_interrupt_type, handle_percpu_irq); } irq_set_handler(TIMER_IRQ, handle_percpu_irq); - setup_irq(TIMER_IRQ, &timer_action); + if (request_irq(TIMER_IRQ, timer_interrupt, flags, "timer", NULL)) + pr_err("Failed to register timer interrupt\n"); #ifdef CONFIG_SMP irq_set_handler(IPI_IRQ, handle_percpu_irq); - setup_irq(IPI_IRQ, &ipi_action); + if (request_irq(IPI_IRQ, ipi_interrupt, IRQF_PERCPU, "IPI", NULL)) + pr_err("Failed to register IPI interrupt\n"); #endif } diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c index 9d00a24277aa..f96e5eaee87e 100644 --- a/drivers/parisc/eisa.c +++ b/drivers/parisc/eisa.c @@ -243,11 +243,6 @@ static irqreturn_t dummy_irq2_handler(int _, void *dev) return IRQ_HANDLED; } -static struct irqaction irq2_action = { - .handler = dummy_irq2_handler, - .name = "cascade", -}; - static void init_eisa_pic(void) { unsigned long flags; @@ -335,7 +330,8 @@ static int __init eisa_probe(struct parisc_device *dev) } /* Reserve IRQ2 */ - setup_irq(2, &irq2_action); 
+ if (request_irq(2, dummy_irq2_handler, 0, "cascade", NULL)) + pr_err("Failed to request irq 2 (cascade)\n"); for (i = 0; i < 16; i++) { irq_set_chip_and_handler(i, &eisa_interrupt_type, handle_simple_irq); From 2772f0efd5bbd5413db3d22e363b779ca0fa5310 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 5 Apr 2020 22:08:07 +0200 Subject: [PATCH 66/80] parisc: Improve interrupt handling in arch_spin_lock_flags() Rewrite arch_spin_lock() and arch_spin_lock_flags() to not re-enable and disable the PSW_SM_I interrupt flag too often. Signed-off-by: Helge Deller --- arch/parisc/include/asm/spinlock.h | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 197d2247e4db..51582eacb7ec 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -10,25 +10,34 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) { volatile unsigned int *a = __ldcw_align(x); + smp_mb(); return *a == 0; } -#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) - -static inline void arch_spin_lock_flags(arch_spinlock_t *x, - unsigned long flags) +static inline void arch_spin_lock(arch_spinlock_t *x) { volatile unsigned int *a; a = __ldcw_align(x); while (__ldcw(a) == 0) while (*a == 0) - if (flags & PSW_SM_I) { - local_irq_enable(); - cpu_relax(); - local_irq_disable(); - } else - cpu_relax(); + cpu_relax(); +} + +static inline void arch_spin_lock_flags(arch_spinlock_t *x, + unsigned long flags) +{ + volatile unsigned int *a; + unsigned long flags_dis; + + a = __ldcw_align(x); + while (__ldcw(a) == 0) { + local_save_flags(flags_dis); + local_irq_restore(flags); + while (*a == 0) + cpu_relax(); + local_irq_restore(flags_dis); + } } #define arch_spin_lock_flags arch_spin_lock_flags From fbdc8f0f4891df7b5eb643ec0a509a4ac7dcfc2e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 5 Apr 2020 22:29:55 +0200 Subject: [PATCH 67/80] 
parisc: Rework arch_rw locking functions Clean up the arch read/write locking functions based on the arc implementation. This improves readability of those functions. Signed-off-by: Helge Deller --- arch/parisc/include/asm/spinlock.h | 145 ++++++++++------------- arch/parisc/include/asm/spinlock_types.h | 14 ++- 2 files changed, 72 insertions(+), 87 deletions(-) diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 51582eacb7ec..70fecb8dc4e2 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -67,116 +67,93 @@ static inline int arch_spin_trylock(arch_spinlock_t *x) /* * Read-write spinlocks, allowing multiple readers but only one writer. - * Linux rwlocks are unfair to writers; they can be starved for an indefinite - * time by readers. With care, they can also be taken in interrupt context. + * Unfair locking as Writers could be starved indefinitely by Reader(s) * - * In the PA-RISC implementation, we have a spinlock and a counter. - * Readers use the lock to serialise their access to the counter (which - * records how many readers currently hold the lock). - * Writers hold the spinlock, preventing any readers or other writers from - * grabbing the rwlock. + * The spinlock itself is contained in @counter and access to it is + * serialized with @lock_mutex. 
*/ -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to grab the same read lock */ -static __inline__ void arch_read_lock(arch_rwlock_t *rw) +/* 1 - lock taken successfully */ +static inline int arch_read_trylock(arch_rwlock_t *rw) { + int ret = 0; unsigned long flags; - local_irq_save(flags); - arch_spin_lock_flags(&rw->lock, flags); - rw->counter++; - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); -} -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to grab the same read lock */ -static __inline__ void arch_read_unlock(arch_rwlock_t *rw) -{ - unsigned long flags; local_irq_save(flags); - arch_spin_lock_flags(&rw->lock, flags); - rw->counter--; - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); -} + arch_spin_lock(&(rw->lock_mutex)); -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to grab the same read lock */ -static __inline__ int arch_read_trylock(arch_rwlock_t *rw) -{ - unsigned long flags; - retry: - local_irq_save(flags); - if (arch_spin_trylock(&rw->lock)) { - rw->counter++; - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); - return 1; + /* + * zero means writer holds the lock exclusively, deny Reader. 
+ * Otherwise grant lock to first/subseq reader + */ + if (rw->counter > 0) { + rw->counter--; + ret = 1; } + arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); - /* If write-locked, we fail to acquire the lock */ - if (rw->counter < 0) - return 0; - /* Wait until we have a realistic chance at the lock */ - while (arch_spin_is_locked(&rw->lock) && rw->counter >= 0) + return ret; +} + +/* 1 - lock taken successfully */ +static inline int arch_write_trylock(arch_rwlock_t *rw) +{ + int ret = 0; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&(rw->lock_mutex)); + + /* + * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__), + * deny writer. Otherwise if unlocked grant to writer + * Hence the claim that Linux rwlocks are unfair to writers. + * (can be starved for an indefinite time by readers). + */ + if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) { + rw->counter = 0; + ret = 1; + } + arch_spin_unlock(&(rw->lock_mutex)); + local_irq_restore(flags); + + return ret; +} + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + while (!arch_read_trylock(rw)) cpu_relax(); - - goto retry; } -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to read_trylock() this lock */ -static __inline__ void arch_write_lock(arch_rwlock_t *rw) +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + while (!arch_write_trylock(rw)) + cpu_relax(); +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) { unsigned long flags; -retry: + local_irq_save(flags); - arch_spin_lock_flags(&rw->lock, flags); - - if (rw->counter != 0) { - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); - - while (rw->counter != 0) - cpu_relax(); - - goto retry; - } - - rw->counter = -1; /* mark as write-locked */ - mb(); + arch_spin_lock(&(rw->lock_mutex)); + rw->counter++; + arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); } -static __inline__ void arch_write_unlock(arch_rwlock_t *rw) 
-{ - rw->counter = 0; - arch_spin_unlock(&rw->lock); -} - -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to read_trylock() this lock */ -static __inline__ int arch_write_trylock(arch_rwlock_t *rw) +static inline void arch_write_unlock(arch_rwlock_t *rw) { unsigned long flags; - int result = 0; local_irq_save(flags); - if (arch_spin_trylock(&rw->lock)) { - if (rw->counter == 0) { - rw->counter = -1; - result = 1; - } else { - /* Read-locked. Oh well. */ - arch_spin_unlock(&rw->lock); - } - } + arch_spin_lock(&(rw->lock_mutex)); + rw->counter = __ARCH_RW_LOCK_UNLOCKED__; + arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); - - return result; } #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h index 42979c5704dc..ca39ee350c3f 100644 --- a/arch/parisc/include/asm/spinlock_types.h +++ b/arch/parisc/include/asm/spinlock_types.h @@ -12,11 +12,19 @@ typedef struct { #endif } arch_spinlock_t; + +/* counter: + * Unlocked : 0x0100_0000 + * Read lock(s) : 0x00FF_FFFF to 0x01 (Multiple Readers decrement it) + * Write lock : 0x0, but only if prior value is "unlocked" 0x0100_0000 + */ typedef struct { - arch_spinlock_t lock; - volatile int counter; + arch_spinlock_t lock_mutex; + volatile unsigned int counter; } arch_rwlock_t; -#define __ARCH_RW_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED, 0 } +#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000 +#define __ARCH_RW_LOCK_UNLOCKED { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \ + .counter = __ARCH_RW_LOCK_UNLOCKED__ } #endif From 2a3778e70fcc7985d1fbce6e2fdaac5258544a60 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 18 Feb 2020 22:04:18 +0100 Subject: [PATCH 68/80] parisc: Refactor alternative code to accept multiple conditions Allow the alternative loop to accept multiple conditions when replacing existing code, e.g. 
ALTERNATIVE(ALT_COND_NO_SMP | ALT_COND_RUN_ON_QEMU, INSN_NOP) Signed-off-by: Helge Deller --- arch/parisc/kernel/alternative.c | 37 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c index 3c66d5c4d90d..fa28c4c9f972 100644 --- a/arch/parisc/kernel/alternative.c +++ b/arch/parisc/kernel/alternative.c @@ -25,6 +25,22 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *entry; int index = 0, applied = 0; int num_cpus = num_online_cpus(); + u32 cond_check; + + cond_check = ALT_COND_ALWAYS | + ((num_cpus == 1) ? ALT_COND_NO_SMP : 0) | + ((cache_info.dc_size == 0) ? ALT_COND_NO_DCACHE : 0) | + ((cache_info.ic_size == 0) ? ALT_COND_NO_ICACHE : 0) | + (running_on_qemu ? ALT_COND_RUN_ON_QEMU : 0) | + ((split_tlb == 0) ? ALT_COND_NO_SPLIT_TLB : 0) | + /* + * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit + * set (bit #61, big endian), we have to flush and sync every + * time IO-PDIR is changed in Ike/Astro. + */ + (((boot_cpu_data.cpu_type > pcxw_) && + ((boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC) == 0)) + ? 
ALT_COND_NO_IOC_FDC : 0); for (entry = start; entry < end; entry++, index++) { @@ -38,29 +54,14 @@ void __init_or_module apply_alternatives(struct alt_instr *start, WARN_ON(!cond); - if (cond != ALT_COND_ALWAYS && no_alternatives) + if ((cond & ALT_COND_ALWAYS) == 0 && no_alternatives) continue; pr_debug("Check %d: Cond 0x%x, Replace %02d instructions @ 0x%px with 0x%08x\n", index, cond, len, from, replacement); - if ((cond & ALT_COND_NO_SMP) && (num_cpus != 1)) - continue; - if ((cond & ALT_COND_NO_DCACHE) && (cache_info.dc_size != 0)) - continue; - if ((cond & ALT_COND_NO_ICACHE) && (cache_info.ic_size != 0)) - continue; - if ((cond & ALT_COND_RUN_ON_QEMU) && !running_on_qemu) - continue; - - /* - * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit - * set (bit #61, big endian), we have to flush and sync every - * time IO-PDIR is changed in Ike/Astro. - */ - if ((cond & ALT_COND_NO_IOC_FDC) && - ((boot_cpu_data.cpu_type <= pcxw_) || - (boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC))) + /* Bounce out if none of the conditions are true. */ + if ((cond & cond_check) == 0) continue; /* Want to replace pdtlb by a pdtlb,l instruction? */ From 106c90922e1e0cd5fcbb34be8ebbb3e8a8e71909 Mon Sep 17 00:00:00 2001 From: Firoz Khan Date: Wed, 2 Jan 2019 21:32:32 +0530 Subject: [PATCH 69/80] parisc: remove nargs from __SYSCALL The __SYSCALL macro's arguments are system call number, system call entry name and number of arguments for the system call. Argument- nargs in __SYSCALL(nr, entry, nargs) is neither calculated nor used anywhere. So it would be better to keep the implementation as __SYSCALL(nr, entry). This will unify the implementation with some other architectures too. 
Signed-off-by: Firoz Khan Signed-off-by: Helge Deller --- arch/parisc/kernel/syscall.S | 2 +- arch/parisc/kernel/syscalls/syscalltbl.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 97ac707c6bff..f05c9d5b6b9e 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -935,7 +935,7 @@ ENTRY(lws_table) END(lws_table) /* End of lws table */ -#define __SYSCALL(nr, entry, nargs) ASM_ULONG_INSN entry +#define __SYSCALL(nr, entry) ASM_ULONG_INSN entry .align 8 ENTRY(sys_call_table) .export sys_call_table,data diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh index 45b5bae26240..f7393a7b18aa 100644 --- a/arch/parisc/kernel/syscalls/syscalltbl.sh +++ b/arch/parisc/kernel/syscalls/syscalltbl.sh @@ -13,10 +13,10 @@ emit() { t_entry="$3" while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" t_nxt=$((t_nxt+1)) done - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" } grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( From bee348907d19d654e8524d3a946dcd25b693aa7e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 3 Apr 2020 16:13:21 +0100 Subject: [PATCH 70/80] tun: Don't put_page() for all negative return values from XDP program When an XDP program is installed, tun_build_skb() grabs a reference to the current page fragment page if the program returns XDP_REDIRECT or XDP_TX. However, since tun_xdp_act() passes through negative return values from the XDP program, it is possible to trigger the error path by mistake and accidentally drop a reference to the fragments page without taking one, leading to a spurious free. 
This is believed to be the cause of some KASAN use-after-free reports from syzbot [1], although without a reproducer it is not possible to confirm whether this patch fixes the problem. Ensure that we only drop a reference to the fragments page if the XDP transmit or redirect operations actually fail. [1] https://syzkaller.appspot.com/bug?id=e76a6af1be4acd727ff6bbca669833f98cbf5d95 Cc: "David S. Miller" Cc: Alexei Starovoitov Cc: Daniel Borkmann CC: Eric Dumazet Acked-by: Jason Wang Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic") Signed-off-by: Will Deacon Signed-off-by: David S. Miller --- drivers/net/tun.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 228fe449dc6d..07476c6510f2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1678,8 +1678,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); - if (err < 0) - goto err_xdp; + if (err < 0) { + if (act == XDP_REDIRECT || act == XDP_TX) + put_page(alloc_frag->page); + goto out; + } + if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) @@ -1693,8 +1697,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); -err_xdp: - put_page(alloc_frag->page); out: rcu_read_unlock(); local_bh_enable(); From a7f9a6f4cc5fb4d5c5a03f3f898b7cb86db05cbf Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 4 Apr 2020 11:22:57 -0400 Subject: [PATCH 71/80] ipv6: rpl: fix loop iteration This patch fix the loop iteration by not walking over the last iteration. The cmpri compressing value exempt the last segment. As the code shows the last iteration will be overwritten by cmpre value handling which is for the last segment. I think this doesn't end in any bufferoverflows because we work on worst case temporary buffer sizes but it ends in not best compression settings in some cases. 
Fixes: 8610c7c6e3bd ("net: ipv6: add support for rpl sr exthdr") Signed-off-by: Alexander Aring Signed-off-by: David S. Miller --- net/ipv6/rpl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index dc4f20e23bf7..d38b476fc7f2 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -48,7 +48,7 @@ void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = 0; outhdr->cmpre = 0; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr, ipv6_rpl_segdata_pos(inhdr, i), inhdr->cmpri); @@ -66,7 +66,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr int i; for (plen = 0; plen < sizeof(*daddr); plen++) { - for (i = 0; i <= n; i++) { + for (i = 0; i < n; i++) { if (daddr->s6_addr[plen] != inhdr->rpl_segaddr[i].s6_addr[plen]) return plen; @@ -114,7 +114,7 @@ void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = cmpri; outhdr->cmpre = cmpre; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i), &inhdr->rpl_segaddr[i], cmpri); From 536fab5bf5826404534a6c271f622ad2930d9119 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 4 Apr 2020 14:35:17 -0700 Subject: [PATCH 72/80] net: dsa: bcm_sf2: Do not register slave MDIO bus with OF We were registering our slave MDIO bus with OF and doing so with assigning the newly created slave_mii_bus of_node to the master MDIO bus controller node. This is a bad thing to do for a number of reasons: - we are completely lying about the slave MII bus is arranged and yet we still want to control which MDIO devices it probes. 
It was attempted before to play tricks with the bus_mask to perform that: https://www.spinics.net/lists/netdev/msg429420.html but the approach was rightfully rejected - the device_node reference counting is messed up and we are effectively doing a double probe on the devices we already probed using the master, this messes up all resources reference counts (such as clocks) The proper fix for this as indicated by David in his reply to the thread above is to use a platform data style registration so as to control exactly which devices we probe: https://www.spinics.net/lists/netdev/msg430083.html By using mdiobus_register(), our slave_mii_bus->phy_mask value is used as intended, and all the PHY addresses that must be redirected towards our slave MDIO bus is happening while other addresses get redirected towards the master MDIO bus. Fixes: 461cd1b03e32 ("net: dsa: bcm_sf2: Register our slave MDIO bus") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index affa5c6e135c..cc95adc5ab4b 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -480,7 +480,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) priv->slave_mii_bus->parent = ds->dev->parent; priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask; - err = of_mdiobus_register(priv->slave_mii_bus, dn); + err = mdiobus_register(priv->slave_mii_bus); if (err && dn) of_node_put(dn); From 95099c569a9fdbe186a27447dfa8a5a0562d4b7f Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 4 Apr 2020 23:52:25 +0200 Subject: [PATCH 73/80] r8169: change back SG and TSO to be disabled by default There has been a number of reports that using SG/TSO on different chip versions results in tx timeouts. However for a lot of people SG/TSO works fine. 
Therefore disable both features by default, but allow users to enable them. Use at own risk! Fixes: 93681cd7d94f ("r8169: enable HW csum and TSO") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 29 +++++++++++------------ 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 55cb5730beb6..bf5bf05970a2 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5441,9 +5441,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netif_napi_add(dev, &tp->napi, rtl8169_poll, NAPI_POLL_WEIGHT); - dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | - NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_CTAG_RX; + dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | NETIF_F_HIGHDMA; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@ -5460,26 +5459,26 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Disallow toggling */ dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX; + if (rtl_chip_supports_csum_v2(tp)) + dev->hw_features |= NETIF_F_IPV6_CSUM; + + dev->features |= dev->hw_features; + + /* There has been a number of reports that using SG/TSO results in + * tx timeouts. However for a lot of people SG/TSO works fine. + * Therefore disable both features by default, but allow users to + * enable them. Use at own risk! 
+ */ if (rtl_chip_supports_csum_v2(tp)) { - dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6; + dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6; dev->gso_max_size = RTL_GSO_MAX_SIZE_V2; dev->gso_max_segs = RTL_GSO_MAX_SEGS_V2; } else { + dev->hw_features |= NETIF_F_SG | NETIF_F_TSO; dev->gso_max_size = RTL_GSO_MAX_SIZE_V1; dev->gso_max_segs = RTL_GSO_MAX_SEGS_V1; } - /* RTL8168e-vl and one RTL8168c variant are known to have a - * HW issue with TSO. - */ - if (tp->mac_version == RTL_GIGA_MAC_VER_34 || - tp->mac_version == RTL_GIGA_MAC_VER_22) { - dev->vlan_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG); - dev->hw_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG); - } - - dev->features |= dev->hw_features; - dev->hw_features |= NETIF_F_RXALL; dev->hw_features |= NETIF_F_RXFCS; From 0be0ae144109a43266631b94adef5c806c56a4b1 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sun, 5 Apr 2020 09:50:21 +0300 Subject: [PATCH 74/80] mlxsw: spectrum_flower: Do not stop at FLOW_ACTION_PRIORITY The handler for FLOW_ACTION_PRIORITY ends by returning whatever the lower-level function that it calls returns. If there are more actions lined up after this action, those are never offloaded. Fix by only bailing out when the called function returns an error. Fixes: 463957e3fbab ("mlxsw: spectrum_flower: Offload FLOW_ACTION_PRIORITY") Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 2f76908cae73..69f77615c816 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -155,9 +155,12 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, proto, prio, extack); } case FLOW_ACTION_PRIORITY: - return mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei, - act->priority, - extack); + err = mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei, + act->priority, + extack); + if (err) + return err; + break; case FLOW_ACTION_MANGLE: { enum flow_action_mangle_base htype = act->mangle.htype; __be32 be_mask = (__force __be32) act->mangle.mask; From ccfc569347f870830e7c7cf854679a06cf9c45b5 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sun, 5 Apr 2020 09:50:22 +0300 Subject: [PATCH 75/80] mlxsw: spectrum_flower: Do not stop at FLOW_ACTION_VLAN_MANGLE The handler for FLOW_ACTION_VLAN_MANGLE ends by returning whatever the lower-level function that it calls returns. If there are more actions lined up after this action, those are never offloaded. Fix by only bailing out when the called function returns an error. Fixes: a150201a70da ("mlxsw: spectrum: Add support for vlan modify TC action") Signed-off-by: Petr Machata Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 69f77615c816..51117a5a6bbf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -150,9 +150,12 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, u8 prio = act->vlan.prio; u16 vid = act->vlan.vid; - return mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei, - act->id, vid, - proto, prio, extack); + err = mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei, + act->id, vid, + proto, prio, extack); + if (err) + return err; + break; } case FLOW_ACTION_PRIORITY: err = mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei, From 2dd6a24fff41fe8fbf307491ed637a18049b13ff Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 5 Apr 2020 13:06:02 +0100 Subject: [PATCH 76/80] wimax: remove some redundant assignments to variable result In function i2400m_bm_buf_alloc there is no need to use a variable 'result' to return -ENOMEM, just return the literal value. In the function i2400m_setup the variable 'result' is initialized with a value that is never read, it is a redundant assignment that can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/wimax/i2400m/driver.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index f66c0f8f6f4a..ecb3fccca603 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -740,9 +740,6 @@ EXPORT_SYMBOL_GPL(i2400m_error_recovery); static int i2400m_bm_buf_alloc(struct i2400m *i2400m) { - int result; - - result = -ENOMEM; i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL); if (i2400m->bm_cmd_buf == NULL) goto error_bm_cmd_kzalloc; @@ -754,7 +751,7 @@ int i2400m_bm_buf_alloc(struct i2400m *i2400m) error_bm_ack_buf_kzalloc: kfree(i2400m->bm_cmd_buf); error_bm_cmd_kzalloc: - return result; + return -ENOMEM; } @@ -843,7 +840,7 @@ EXPORT_SYMBOL_GPL(i2400m_reset); */ int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) { - int result = -ENODEV; + int result; struct device *dev = i2400m_dev(i2400m); struct wimax_dev *wimax_dev = &i2400m->wimax_dev; struct net_device *net_dev = i2400m->wimax_dev.net_dev; From 7df0a6a3e53dafa0e36127cb09306a19b03bd182 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 5 Apr 2020 14:49:14 +0100 Subject: [PATCH 77/80] qed: remove redundant assignment to variable 'rc' The variable 'rc' is being assigned a value that is never read and it is being updated later with a new value. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/qed/qed_l2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 1a5fc2ae351c..29810a1aa210 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -369,8 +369,8 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent = NULL; struct qed_sp_init_data init_data; u8 abs_vport_id = 0; - int rc = -EINVAL; u16 rx_mode = 0; + int rc; rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); if (rc) From afa3b592953bfaecfb4f2f335ec5f935cff56804 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sun, 5 Apr 2020 13:00:30 -0700 Subject: [PATCH 78/80] net: dsa: bcm_sf2: Ensure correct sub-node is parsed When the bcm_sf2 was converted into a proper platform device driver and used the new dsa_register_switch() interface, we would still be parsing the legacy DSA node that contained all the port information since the platform firmware has intentionally maintained backward and forward compatibility to client programs. Ensure that we do parse the correct node, which is "ports" per the revised DSA binding. Fixes: d9338023fb8e ("net: dsa: bcm_sf2: Make it a real platform device driver") Signed-off-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. 
Miller --- drivers/net/dsa/bcm_sf2.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index cc95adc5ab4b..c7ac63f41918 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -1079,6 +1079,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) const struct bcm_sf2_of_data *data; struct b53_platform_data *pdata; struct dsa_switch_ops *ops; + struct device_node *ports; struct bcm_sf2_priv *priv; struct b53_device *dev; struct dsa_switch *ds; @@ -1146,7 +1147,11 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) set_bit(0, priv->cfp.used); set_bit(0, priv->cfp.unique); - bcm_sf2_identify_ports(priv, dn->child); + ports = of_find_node_by_name(dn, "ports"); + if (ports) { + bcm_sf2_identify_ports(priv, ports); + of_node_put(ports); + } priv->irq0 = irq_of_parse_and_map(dn, 0); priv->irq1 = irq_of_parse_and_map(dn, 1); From db1f00fb8ff793889e83f2e37e0c7bbb6fc9934e Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Sun, 5 Apr 2020 18:59:24 -0700 Subject: [PATCH 79/80] skbuff.h: Improve the checksum related comments Fixed the punctuation and some typos. Improved some sentences with minor changes. No change of semantics or code. Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Signed-off-by: Dexuan Cui Signed-off-by: David S. Miller --- include/linux/skbuff.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 28b1a2b4459e..3a2ac7072dbb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -47,8 +47,8 @@ * A. IP checksum related features * * Drivers advertise checksum offload capabilities in the features of a device. 
- * From the stack's point of view these are capabilities offered by the driver, - * a driver typically only advertises features that it is capable of offloading + * From the stack's point of view these are capabilities offered by the driver. + * A driver typically only advertises features that it is capable of offloading * to its device. * * The checksum related features are: @@ -63,7 +63,7 @@ * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header - * is TCP or UDP. The IPv4 header may contain IP options + * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). @@ -79,13 +79,13 @@ * DEPRECATED (see below). * * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload. - * This flag is used only used to disable the RX checksum + * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * B. Checksumming of received packets by device. Indication of checksum - * verification is in set skb->ip_summed. Possible values are: + * verification is set in skb->ip_summed. Possible values are: * * CHECKSUM_NONE: * @@ -115,16 +115,16 @@ * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), - * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to + * GRE (checksum flag is set) and TCP, skb->csum_level would be set to * two. 
If the device were only able to verify the UDP checksum and not - * GRE, either because it doesn't support GRE checksum of because GRE + * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * CHECKSUM_COMPLETE: * * This is the most generic way. The device supplied checksum of the _whole_ - * packet as seen by netif_rx() and fills out in skb->csum. Meaning, the + * packet as seen by netif_rx() and fills in skb->csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: @@ -153,8 +153,8 @@ * from skb->csum_start up to the end, and to record/write the checksum at * offset skb->csum_start + skb->csum_offset. A driver may verify that the * csum_start and csum_offset values are valid values given the length and - * offset of the packet, however they should not attempt to validate that the - * checksum refers to a legitimate transport layer checksum-- it is the + * offset of the packet, but it should not attempt to validate that the + * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * @@ -178,18 +178,18 @@ * * CHECKSUM_UNNECESSARY: * - * This has the same meaning on as CHECKSUM_NONE for checksum offload on + * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * CHECKSUM_COMPLETE: * Not used in checksum output. If a driver observes a packet with this value - * set in skbuff, if should treat as CHECKSUM_NONE being set. + * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set. * * D. Non-IP checksum (CRC) offloads * * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. 
To perform this offload the stack - * will set set csum_start and csum_offset accordingly, set ip_summed to + * will set csum_start and csum_offset accordingly, set ip_summed to * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload @@ -200,10 +200,10 @@ * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of * offloading the FCOE CRC in a packet. To perform this offload the stack * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset - * accordingly. Note the there is no indication in the skbuff that the - * CHECKSUM_PARTIAL refers to an FCOE checksum, a driver that supports + * accordingly. Note that there is no indication in the skbuff that the + * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload - * is configured for a packet presumably by inspecting packet headers. + * is configured for a packet, presumably by inspecting packet headers. * * E. Checksumming on output with GSO. * @@ -211,9 +211,9 @@ * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded - * with GSO then ip_summed is CHECKSUM_PARTIAL, csum_start and csum_offset - * are set to refer to the outermost checksum being offload (two offloaded - * checksums are possible with UDP encapsulation). + * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and + * csum_offset are set to refer to the outermost checksum being offloaded + * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! 
*/ From aa81700cf2326e288c9ca1fe7b544039617f1fc2 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Mon, 6 Apr 2020 11:38:29 +0200 Subject: [PATCH 80/80] macsec: fix NULL dereference in macsec_upd_offload() macsec_upd_offload() gets the value of MACSEC_OFFLOAD_ATTR_TYPE without checking its presence in the request message, and this causes a NULL dereference. Fix it by rejecting any configuration that does not include this attribute. Reported-and-tested-by: syzbot+7022ab7c383875c17eff@syzkaller.appspotmail.com Fixes: dcb780fb2795 ("net: macsec: add nla support for changing the offloading selection") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- drivers/net/macsec.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index da82d7f16a09..0d580d81d910 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2594,6 +2594,9 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(dev); macsec = macsec_priv(dev); + if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) + return -EINVAL; + offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); if (macsec->offload == offload) return 0;