diff --git a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml index 9458f6659be1..68573762294b 100644 --- a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml +++ b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml @@ -38,28 +38,27 @@ required: examples: - | /* USB host controller */ - &usb1 { - mvusb: mdio@1 { + usb { + #address-cells = <1>; + #size-cells = <0>; + + mdio@1 { compatible = "usb1286,1fa4"; reg = <1>; #address-cells = <1>; #size-cells = <0>; - }; - }; - /* MV88E6390X devboard */ - &mvusb { - switch@0 { - compatible = "marvell,mv88e6190"; - status = "ok"; - reg = <0x0>; + switch@0 { + compatible = "marvell,mv88e6190"; + reg = <0x0>; - ports { - /* Port definitions */ - }; + ports { + /* Port definitions */ + }; - mdio { - /* PHY definitions */ + mdio { + /* PHY definitions */ + }; }; }; }; diff --git a/Documentation/openrisc/openrisc_port.rst b/Documentation/openrisc/openrisc_port.rst index a18747a8d191..4b2c437942a0 100644 --- a/Documentation/openrisc/openrisc_port.rst +++ b/Documentation/openrisc/openrisc_port.rst @@ -37,8 +37,8 @@ or Stafford's toolchain build and release scripts. Build the Linux kernel as usual:: - make ARCH=openrisc defconfig - make ARCH=openrisc + make ARCH=openrisc CROSS_COMPILE="or1k-linux-" defconfig + make ARCH=openrisc CROSS_COMPILE="or1k-linux-" 3) Running on FPGA (optional) diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 7e94fe37cb2f..8588996165ae 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -16,6 +16,7 @@ config OPENRISC select HANDLE_DOMAIN_IRQ select GPIOLIB select HAVE_ARCH_TRACEHOOK + select HAVE_COPY_THREAD_TLS select SPARSE_IRQ select GENERIC_IRQ_CHIP select GENERIC_IRQ_PROBE diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig index d8ff4f8ffb88..75f2da324d0e 100644 --- a/arch/openrisc/configs/or1ksim_defconfig +++ b/arch/openrisc/configs/or1ksim_defconfig @@ -1,4 +1,3 @@ -CONFIG_CROSS_COMPILE="or1k-linux-" CONFIG_NO_HZ=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_BLK_DEV_INITRD=y diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig index 64278992df9c..ff49d868e040 100644 --- a/arch/openrisc/configs/simple_smp_defconfig +++ b/arch/openrisc/configs/simple_smp_defconfig @@ -1,4 +1,3 @@ -CONFIG_CROSS_COMPILE="or1k-linux-" CONFIG_LOCALVERSION="-simple-smp" CONFIG_NO_HZ=y CONFIG_LOG_BUF_SHIFT=14 diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h index 566f8c4f8047..fae34c60fa88 100644 --- a/arch/openrisc/include/uapi/asm/unistd.h +++ b/arch/openrisc/include/uapi/asm/unistd.h @@ -24,6 +24,7 @@ #define __ARCH_WANT_SET_GET_RLIMIT #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_TIME32_SYSCALLS #include diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index b06f84f6676f..6bcdca424e11 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -117,12 +117,12 @@ void release_thread(struct task_struct *dead_task) extern asmlinkage void ret_from_fork(void); /* - * copy_thread + * copy_thread_tls * @clone_flags: flags * @usp: user stack pointer or fn for kernel thread * @arg: arg to fn for kernel thread; always NULL for userspace thread * @p: the newly created task - * @regs: CPU context to copy for userspace thread; always NULL for kthread + * @tls: the Thread Local Storage pointer for the new process * * At the top of a newly initialized kernel stack are two stacked pt_reg * structures. The first (topmost) is the userspace context of the thread. @@ -148,8 +148,8 @@ extern asmlinkage void ret_from_fork(void); */ int -copy_thread(unsigned long clone_flags, unsigned long usp, - unsigned long arg, struct task_struct *p) +copy_thread_tls(unsigned long clone_flags, unsigned long usp, + unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *userregs; struct pt_regs *kregs; @@ -179,16 +179,10 @@ copy_thread(unsigned long clone_flags, unsigned long usp, userregs->sp = usp; /* - * For CLONE_SETTLS set "tp" (r10) to the TLS pointer passed to sys_clone. - * - * The kernel entry is: - * int clone (long flags, void *child_stack, int *parent_tid, - * int *child_tid, struct void *tls) - * - * This makes the source r7 in the kernel registers. + * For CLONE_SETTLS set "tp" (r10) to the TLS pointer. */ if (clone_flags & CLONE_SETTLS) - userregs->gpr[10] = userregs->gpr[7]; + userregs->gpr[10] = tls; userregs->gpr[11] = 0; /* Result from fork() */ diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 7d518ee8bddc..bd1e660bbc89 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -113,7 +114,7 @@ asmlinkage __init void secondary_start_kernel(void) * All kernel threads share the same mm context; grab a * reference and switch to it. */ - atomic_inc(&mm->mm_count); + mmgrab(mm); current->active_mm = mm; cpumask_set_cpu(cpu, mm_cpumask(mm)); diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 932a8ec2b520..c11aa2e17ce0 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -55,13 +55,6 @@ void show_stack(struct task_struct *task, unsigned long *esp) unwind_stack(NULL, esp, print_trace); } -void show_trace_task(struct task_struct *tsk) -{ - /* - * TODO: SysRq-T trace dump... - */ -} - void show_registers(struct pt_regs *regs) { int i; diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index 197d2247e4db..70fecb8dc4e2 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -10,25 +10,34 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) { volatile unsigned int *a = __ldcw_align(x); + smp_mb(); return *a == 0; } -#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) - -static inline void arch_spin_lock_flags(arch_spinlock_t *x, - unsigned long flags) +static inline void arch_spin_lock(arch_spinlock_t *x) { volatile unsigned int *a; a = __ldcw_align(x); while (__ldcw(a) == 0) while (*a == 0) - if (flags & PSW_SM_I) { - local_irq_enable(); - cpu_relax(); - local_irq_disable(); - } else - cpu_relax(); + cpu_relax(); +} + +static inline void arch_spin_lock_flags(arch_spinlock_t *x, + unsigned long flags) +{ + volatile unsigned int *a; + unsigned long flags_dis; + + a = __ldcw_align(x); + while (__ldcw(a) == 0) { + local_save_flags(flags_dis); + local_irq_restore(flags); + while (*a == 0) + cpu_relax(); + local_irq_restore(flags_dis); + } } #define arch_spin_lock_flags arch_spin_lock_flags @@ -58,116 +67,93 @@ static inline int arch_spin_trylock(arch_spinlock_t *x) /* * Read-write spinlocks, allowing multiple readers but only one writer. - * Linux rwlocks are unfair to writers; they can be starved for an indefinite - * time by readers. With care, they can also be taken in interrupt context. + * Unfair locking as Writers could be starved indefinitely by Reader(s) * - * In the PA-RISC implementation, we have a spinlock and a counter. - * Readers use the lock to serialise their access to the counter (which - * records how many readers currently hold the lock). - * Writers hold the spinlock, preventing any readers or other writers from - * grabbing the rwlock. + * The spinlock itself is contained in @counter and access to it is + * serialized with @lock_mutex. */ -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to grab the same read lock */ -static __inline__ void arch_read_lock(arch_rwlock_t *rw) +/* 1 - lock taken successfully */ +static inline int arch_read_trylock(arch_rwlock_t *rw) { + int ret = 0; unsigned long flags; - local_irq_save(flags); - arch_spin_lock_flags(&rw->lock, flags); - rw->counter++; - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); -} -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to grab the same read lock */ -static __inline__ void arch_read_unlock(arch_rwlock_t *rw) -{ - unsigned long flags; local_irq_save(flags); - arch_spin_lock_flags(&rw->lock, flags); - rw->counter--; - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); -} + arch_spin_lock(&(rw->lock_mutex)); -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to grab the same read lock */ -static __inline__ int arch_read_trylock(arch_rwlock_t *rw) -{ - unsigned long flags; - retry: - local_irq_save(flags); - if (arch_spin_trylock(&rw->lock)) { - rw->counter++; - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); - return 1; + /* + * zero means writer holds the lock exclusively, deny Reader. + * Otherwise grant lock to first/subseq reader + */ + if (rw->counter > 0) { + rw->counter--; + ret = 1; } + arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); - /* If write-locked, we fail to acquire the lock */ - if (rw->counter < 0) - return 0; - /* Wait until we have a realistic chance at the lock */ - while (arch_spin_is_locked(&rw->lock) && rw->counter >= 0) + return ret; +} + +/* 1 - lock taken successfully */ +static inline int arch_write_trylock(arch_rwlock_t *rw) +{ + int ret = 0; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&(rw->lock_mutex)); + + /* + * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__), + * deny writer. Otherwise if unlocked grant to writer + * Hence the claim that Linux rwlocks are unfair to writers. + * (can be starved for an indefinite time by readers). + */ + if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) { + rw->counter = 0; + ret = 1; + } + arch_spin_unlock(&(rw->lock_mutex)); + local_irq_restore(flags); + + return ret; +} + +static inline void arch_read_lock(arch_rwlock_t *rw) +{ + while (!arch_read_trylock(rw)) cpu_relax(); - - goto retry; } -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to read_trylock() this lock */ -static __inline__ void arch_write_lock(arch_rwlock_t *rw) +static inline void arch_write_lock(arch_rwlock_t *rw) +{ + while (!arch_write_trylock(rw)) + cpu_relax(); +} + +static inline void arch_read_unlock(arch_rwlock_t *rw) { unsigned long flags; -retry: + local_irq_save(flags); - arch_spin_lock_flags(&rw->lock, flags); - - if (rw->counter != 0) { - arch_spin_unlock(&rw->lock); - local_irq_restore(flags); - - while (rw->counter != 0) - cpu_relax(); - - goto retry; - } - - rw->counter = -1; /* mark as write-locked */ - mb(); + arch_spin_lock(&(rw->lock_mutex)); + rw->counter++; + arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); } -static __inline__ void arch_write_unlock(arch_rwlock_t *rw) -{ - rw->counter = 0; - arch_spin_unlock(&rw->lock); -} - -/* Note that we have to ensure interrupts are disabled in case we're - * interrupted by some other code that wants to read_trylock() this lock */ -static __inline__ int arch_write_trylock(arch_rwlock_t *rw) +static inline void arch_write_unlock(arch_rwlock_t *rw) { unsigned long flags; - int result = 0; local_irq_save(flags); - if (arch_spin_trylock(&rw->lock)) { - if (rw->counter == 0) { - rw->counter = -1; - result = 1; - } else { - /* Read-locked. Oh well. */ - arch_spin_unlock(&rw->lock); - } - } + arch_spin_lock(&(rw->lock_mutex)); + rw->counter = __ARCH_RW_LOCK_UNLOCKED__; + arch_spin_unlock(&(rw->lock_mutex)); local_irq_restore(flags); - - return result; } #endif /* __ASM_SPINLOCK_H */ diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h index 42979c5704dc..ca39ee350c3f 100644 --- a/arch/parisc/include/asm/spinlock_types.h +++ b/arch/parisc/include/asm/spinlock_types.h @@ -12,11 +12,19 @@ typedef struct { #endif } arch_spinlock_t; + +/* counter: + * Unlocked : 0x0100_0000 + * Read lock(s) : 0x00FF_FFFF to 0x01 (Multiple Readers decrement it) + * Write lock : 0x0, but only if prior value is "unlocked" 0x0100_0000 + */ typedef struct { - arch_spinlock_t lock; - volatile int counter; + arch_spinlock_t lock_mutex; + volatile unsigned int counter; } arch_rwlock_t; -#define __ARCH_RW_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED, 0 } +#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000 +#define __ARCH_RW_LOCK_UNLOCKED { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \ + .counter = __ARCH_RW_LOCK_UNLOCKED__ } #endif diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c index 3c66d5c4d90d..fa28c4c9f972 100644 --- a/arch/parisc/kernel/alternative.c +++ b/arch/parisc/kernel/alternative.c @@ -25,6 +25,22 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *entry; int index = 0, applied = 0; int num_cpus = num_online_cpus(); + u32 cond_check; + + cond_check = ALT_COND_ALWAYS | + ((num_cpus == 1) ? ALT_COND_NO_SMP : 0) | + ((cache_info.dc_size == 0) ? ALT_COND_NO_DCACHE : 0) | + ((cache_info.ic_size == 0) ? ALT_COND_NO_ICACHE : 0) | + (running_on_qemu ? ALT_COND_RUN_ON_QEMU : 0) | + ((split_tlb == 0) ? ALT_COND_NO_SPLIT_TLB : 0) | + /* + * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit + * set (bit #61, big endian), we have to flush and sync every + * time IO-PDIR is changed in Ike/Astro. + */ + (((boot_cpu_data.cpu_type > pcxw_) && + ((boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC) == 0)) + ? ALT_COND_NO_IOC_FDC : 0); for (entry = start; entry < end; entry++, index++) { @@ -38,29 +54,14 @@ void __init_or_module apply_alternatives(struct alt_instr *start, WARN_ON(!cond); - if (cond != ALT_COND_ALWAYS && no_alternatives) + if ((cond & ALT_COND_ALWAYS) == 0 && no_alternatives) continue; pr_debug("Check %d: Cond 0x%x, Replace %02d instructions @ 0x%px with 0x%08x\n", index, cond, len, from, replacement); - if ((cond & ALT_COND_NO_SMP) && (num_cpus != 1)) - continue; - if ((cond & ALT_COND_NO_DCACHE) && (cache_info.dc_size != 0)) - continue; - if ((cond & ALT_COND_NO_ICACHE) && (cache_info.ic_size != 0)) - continue; - if ((cond & ALT_COND_RUN_ON_QEMU) && !running_on_qemu) - continue; - - /* - * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit - * set (bit #61, big endian), we have to flush and sync every - * time IO-PDIR is changed in Ike/Astro. - */ - if ((cond & ALT_COND_NO_IOC_FDC) && - ((boot_cpu_data.cpu_type <= pcxw_) || - (boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC))) + /* Bounce out if none of the conditions are true. */ + if ((cond & cond_check) == 0) continue; /* Want to replace pdtlb by a pdtlb,l instruction? */ diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index e5fcfb70cc7c..e76c86619949 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -560,33 +560,23 @@ void do_cpu_irq_mask(struct pt_regs *regs) goto out; } -static struct irqaction timer_action = { - .handler = timer_interrupt, - .name = "timer", - .flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL, -}; - -#ifdef CONFIG_SMP -static struct irqaction ipi_action = { - .handler = ipi_interrupt, - .name = "IPI", - .flags = IRQF_PERCPU, -}; -#endif - static void claim_cpu_irqs(void) { + unsigned long flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL; int i; + for (i = CPU_IRQ_BASE; i <= CPU_IRQ_MAX; i++) { irq_set_chip_and_handler(i, &cpu_interrupt_type, handle_percpu_irq); } irq_set_handler(TIMER_IRQ, handle_percpu_irq); - setup_irq(TIMER_IRQ, &timer_action); + if (request_irq(TIMER_IRQ, timer_interrupt, flags, "timer", NULL)) + pr_err("Failed to register timer interrupt\n"); #ifdef CONFIG_SMP irq_set_handler(IPI_IRQ, handle_percpu_irq); - setup_irq(IPI_IRQ, &ipi_action); + if (request_irq(IPI_IRQ, ipi_interrupt, IRQF_PERCPU, "IPI", NULL)) + pr_err("Failed to register IPI interrupt\n"); #endif } diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 97ac707c6bff..f05c9d5b6b9e 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -935,7 +935,7 @@ ENTRY(lws_table) END(lws_table) /* End of lws table */ -#define __SYSCALL(nr, entry, nargs) ASM_ULONG_INSN entry +#define __SYSCALL(nr, entry) ASM_ULONG_INSN entry .align 8 ENTRY(sys_call_table) .export sys_call_table,data diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh index 45b5bae26240..f7393a7b18aa 100644 --- a/arch/parisc/kernel/syscalls/syscalltbl.sh +++ b/arch/parisc/kernel/syscalls/syscalltbl.sh @@ -13,10 +13,10 @@ emit() { t_entry="$3" while [ $t_nxt -lt $t_nr ]; do - printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}" + printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}" t_nxt=$((t_nxt+1)) done - printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}" + printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}" } grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | ( diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index ed32845bd2d2..2f051343612e 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -2,23 +2,12 @@ #ifndef ___ASM_SPARC_DMA_MAPPING_H #define ___ASM_SPARC_DMA_MAPPING_H -#include - extern const struct dma_map_ops *dma_ops; -extern struct bus_type pci_bus_type; - static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { -#ifdef CONFIG_SPARC_LEON - if (sparc_cpu_model == sparc_leon) - return NULL; -#endif -#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI) - if (bus == &pci_bus_type) - return NULL; -#endif - return dma_ops; + /* sparc32 uses per-device dma_ops */ + return IS_ENABLED(CONFIG_SPARC64) ? dma_ops : NULL; } #endif diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index e59461d03b9a..d6874c9b639f 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -373,9 +373,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, dma_make_coherent(paddr, PAGE_ALIGN(size)); } -const struct dma_map_ops *dma_ops; -EXPORT_SYMBOL(dma_ops); - #ifdef CONFIG_PROC_FS static int sparc_io_proc_show(struct seq_file *m, void *v) diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c index b32cc5610712..e717a56efc5d 100644 --- a/arch/sparc/kernel/of_device_common.c +++ b/arch/sparc/kernel/of_device_common.c @@ -67,6 +67,7 @@ void of_propagate_archdata(struct platform_device *bus) op->dev.archdata.stc = bus_sd->stc; op->dev.archdata.host_controller = bus_sd->host_controller; op->dev.archdata.numa_node = bus_sd->numa_node; + op->dev.dma_ops = bus->dev.dma_ops; if (dp->child) of_propagate_archdata(op); diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c index 33a0facd9eb5..289276b99b01 100644 --- a/arch/sparc/mm/io-unit.c +++ b/arch/sparc/mm/io-unit.c @@ -38,6 +38,8 @@ #define IOPERM (IOUPTE_CACHE | IOUPTE_WRITE | IOUPTE_VALID) #define MKIOPTE(phys) __iopte((((phys)>>4) & IOUPTE_PAGE) | IOPERM) +static const struct dma_map_ops iounit_dma_ops; + static void __init iounit_iommu_init(struct platform_device *op) { struct iounit_struct *iounit; @@ -70,6 +72,8 @@ static void __init iounit_iommu_init(struct platform_device *op) xptend = iounit->page_table + (16 * PAGE_SIZE) / sizeof(iopte_t); for (; xpt < xptend; xpt++) sbus_writel(0, xpt); + + op->dev.dma_ops = &iounit_dma_ops; } static int __init iounit_init(void) @@ -288,8 +292,3 @@ static const struct dma_map_ops iounit_dma_ops = { .map_sg = iounit_map_sg, .unmap_sg = iounit_unmap_sg, }; - -void __init ld_mmu_iounit(void) -{ - dma_ops = &iounit_dma_ops; -} diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c index 4d3c6991f0ae..b00dde13681b 100644 --- a/arch/sparc/mm/iommu.c +++ b/arch/sparc/mm/iommu.c @@ -54,6 +54,9 @@ static pgprot_t dvma_prot; /* Consistent mapping pte flags */ #define IOPERM (IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID) #define MKIOPTE(pfn, perm) (((((pfn)<<8) & IOPTE_PAGE) | (perm)) & ~IOPTE_WAZ) +static const struct dma_map_ops sbus_iommu_dma_gflush_ops; +static const struct dma_map_ops sbus_iommu_dma_pflush_ops; + static void __init sbus_iommu_init(struct platform_device *op) { struct iommu_struct *iommu; @@ -129,6 +132,11 @@ static void __init sbus_iommu_init(struct platform_device *op) (int)(IOMMU_NPTES*sizeof(iopte_t)), (int)IOMMU_NPTES); op->dev.archdata.iommu = iommu; + + if (flush_page_for_dma_global) + op->dev.dma_ops = &sbus_iommu_dma_gflush_ops; + else + op->dev.dma_ops = &sbus_iommu_dma_pflush_ops; } static int __init iommu_init(void) @@ -445,13 +453,6 @@ static const struct dma_map_ops sbus_iommu_dma_pflush_ops = { void __init ld_mmu_iommu(void) { - if (flush_page_for_dma_global) { - /* flush_page_for_dma flushes everything, no matter of what page is it */ - dma_ops = &sbus_iommu_dma_gflush_ops; - } else { - dma_ops = &sbus_iommu_dma_pflush_ops; - } - if (viking_mxcc_present || srmmu_modtype == HyperSparc) { dvma_prot = __pgprot(SRMMU_CACHE | SRMMU_ET_PTE | SRMMU_PRIV); ioperm_noc = IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID; diff --git a/arch/sparc/mm/mm_32.h b/arch/sparc/mm/mm_32.h index 0d0b06e952a5..ce750a99eea9 100644 --- a/arch/sparc/mm/mm_32.h +++ b/arch/sparc/mm/mm_32.h @@ -20,6 +20,3 @@ void __init srmmu_paging_init(void); /* iommu.c */ void ld_mmu_iommu(void); - -/* io-unit.c */ -void ld_mmu_iounit(void); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index f56c3c9a9793..b7c94de70cca 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -1865,9 +1865,7 @@ void __init load_mmu(void) &smp_cachetlb_ops; #endif - if (sparc_cpu_model == sun4d) - ld_mmu_iounit(); - else + if (sparc_cpu_model != sun4d) ld_mmu_iommu(); #ifdef CONFIG_SMP if (sparc_cpu_model == sun4d) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 0917f8443c28..96ab7026b037 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -62,9 +62,12 @@ config NR_CPUS source "arch/$(HEADER_ARCH)/um/Kconfig" +config FORBID_STATIC_LINK + bool + config STATIC_LINK bool "Force a static link" - default n + depends on !FORBID_STATIC_LINK help This option gives you the ability to force a static link of UML. Normally, UML is linked as a shared binary. This is inconvenient for @@ -73,6 +76,9 @@ config STATIC_LINK Additionally, this option enables using higher memory spaces (up to 2.75G) for UML. + NOTE: This option is incompatible with some networking features which + depend on features that require being dynamically loaded (like NSS). + config LD_SCRIPT_STATIC bool default y @@ -191,6 +197,7 @@ config UML_TIME_TRAVEL_SUPPORT prompt "Support time-travel mode (e.g. for test execution)" # inf-cpu mode is incompatible with the benchmarking depends on !RAID6_PQ_BENCHMARK + depends on !SMP help Enable this option to support time travel inside the UML instance. diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index 73e98bb57bf5..fb51bd206dbe 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -26,7 +26,7 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -CONFIG_IOSCHED_CFQ=m +CONFIG_IOSCHED_BFQ=m CONFIG_SSL=y CONFIG_NULL_CHAN=y CONFIG_PORT_CHAN=y diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 3281d7600225..477b87317424 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -24,7 +24,7 @@ CONFIG_SLAB=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y # CONFIG_BLK_DEV_BSG is not set -CONFIG_IOSCHED_CFQ=m +CONFIG_IOSCHED_BFQ=m CONFIG_SSL=y CONFIG_NULL_CHAN=y CONFIG_PORT_CHAN=y diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 72d417055782..9160ead56e33 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -234,6 +234,7 @@ config UML_NET_DAEMON config UML_NET_VECTOR bool "Vector I/O high performance network devices" depends on UML_NET + select FORBID_STATIC_LINK help This User-Mode Linux network driver uses multi-message send and receive functions. The host running the UML guest must have @@ -245,6 +246,7 @@ config UML_NET_VECTOR config UML_NET_VDE bool "VDE transport (obsolete)" depends on UML_NET + select FORBID_STATIC_LINK help This User-Mode Linux network transport allows one or more running UMLs on a single host to communicate with each other and also @@ -292,6 +294,7 @@ config UML_NET_MCAST config UML_NET_PCAP bool "pcap transport (obsolete)" depends on UML_NET + select FORBID_STATIC_LINK help The pcap transport makes a pcap packet stream on the host look like an ethernet device inside UML. This is useful for making diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 35ebeebfc1a8..1802cf4ef5a5 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -266,7 +266,6 @@ static void uml_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); - strlcpy(info->version, "42", sizeof(info->version)); } static const struct ethtool_ops uml_net_ethtool_ops = { @@ -275,17 +274,6 @@ static const struct ethtool_ops uml_net_ethtool_ops = { .get_ts_info = ethtool_op_get_ts_info, }; -static void uml_net_user_timer_expire(struct timer_list *t) -{ -#ifdef undef - struct uml_net_private *lp = from_timer(lp, t, tl); - struct connection *conn = &lp->user; - - dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn); - do_connect(conn); -#endif -} - void uml_net_setup_etheraddr(struct net_device *dev, char *str) { unsigned char *addr = dev->dev_addr; @@ -456,7 +444,6 @@ static void eth_configure(int n, void *init, char *mac, .add_address = transport->user->add_address, .delete_address = transport->user->delete_address }); - timer_setup(&lp->tl, uml_net_user_timer_expire, 0); spin_lock_init(&lp->lock); memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac)); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 247f95da057b..eae8c83364f7 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1592,11 +1592,11 @@ int io_thread(void *arg) &io_remainder_size, UBD_REQ_BUFFER_SIZE ); - if (n < 0) { - if (n == -EAGAIN) { + if (n <= 0) { + if (n == -EAGAIN) ubd_read_poll(-1); - continue; - } + + continue; } for (count = 0; count < n/sizeof(struct io_thread_req *); count++) { @@ -1607,7 +1607,9 @@ int io_thread(void *arg) written = 0; do { - res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n); + res = os_write_file(kernel_fd, + ((char *) io_req_buffer) + written, + n - written); if (res >= 0) { written += res; } diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index e98304d0219e..8735c468230a 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -46,7 +46,6 @@ #define DRIVER_NAME "uml-vector" -#define DRIVER_VERSION "01" struct vector_cmd_line_arg { struct list_head list; int unit; @@ -198,6 +197,9 @@ static int get_transport_options(struct arglist *def) long parsed; int result = 0; + if (transport == NULL) + return -EINVAL; + if (vector != NULL) { if (kstrtoul(vector, 10, &parsed) == 0) { if (parsed == 0) { @@ -1378,7 +1380,6 @@ static void vector_net_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver)); - strlcpy(info->version, DRIVER_VERSION, sizeof(info->version)); } static int vector_net_load_bpf_flash(struct net_device *dev, diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c index ddcd917be0af..aa28e9eecb7b 100644 --- a/arch/um/drivers/vector_user.c +++ b/arch/um/drivers/vector_user.c @@ -221,8 +221,7 @@ static struct vector_fds *user_init_tap_fds(struct arglist *ifspec) return result; tap_cleanup: printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } @@ -266,8 +265,7 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec) return result; hybrid_cleanup: printk(UM_KERN_ERR "user_init_hybrid: init failed"); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } @@ -344,10 +342,8 @@ static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id) unix_cleanup: if (fd >= 0) os_close_file(fd); - if (remote_addr != NULL) - kfree(remote_addr); - if (result != NULL) - kfree(result); + kfree(remote_addr); + kfree(result); return NULL; } @@ -382,8 +378,7 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec) return result; raw_cleanup: printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err); - if (result != NULL) - kfree(result); + kfree(result); return NULL; } diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h index 45ff5ea22fea..6c71b6005177 100644 --- a/arch/um/drivers/vhost_user.h +++ b/arch/um/drivers/vhost_user.h @@ -10,9 +10,10 @@ /* Feature bits */ #define VHOST_USER_F_PROTOCOL_FEATURES 30 /* Protocol feature bits */ -#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 -#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 -#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14 /* Vring state index masks */ #define VHOST_USER_VRING_INDEX_MASK 0xff #define VHOST_USER_VRING_POLL_MASK BIT(8) @@ -24,7 +25,8 @@ /* Supported protocol features */ #define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \ - BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG)) + BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \ + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) enum vhost_user_request { VHOST_USER_GET_FEATURES = 1, @@ -52,12 +54,14 @@ enum vhost_user_request { VHOST_USER_SET_VRING_ENDIAN = 23, VHOST_USER_GET_CONFIG = 24, VHOST_USER_SET_CONFIG = 25, + VHOST_USER_VRING_KICK = 35, }; enum vhost_user_slave_request { VHOST_USER_SLAVE_IOTLB_MSG = 1, VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3, + VHOST_USER_SLAVE_VRING_CALL = 4, }; struct vhost_user_header { diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 023ced2250ea..be54d368e73d 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ struct virtio_uml_device { struct virtio_device vdev; struct platform_device *pdev; + spinlock_t sock_lock; int sock, req_fd; u64 features; u64 protocol_features; @@ -63,6 +65,11 @@ struct virtio_uml_device { struct virtio_uml_vq_info { int kick_fd, call_fd; char name[32]; +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + struct virtqueue *vq; + vq_callback_t *callback; + struct time_travel_event defer; +#endif }; extern unsigned long long physmem_size, highmem; @@ -117,10 +124,27 @@ static int vhost_user_recv_header(int fd, struct vhost_user_msg *msg) static int vhost_user_recv(struct virtio_uml_device *vu_dev, int fd, struct vhost_user_msg *msg, - size_t max_payload_size) + size_t max_payload_size, bool wait) { size_t size; - int rc = vhost_user_recv_header(fd, msg); + int rc; + + /* + * In virtio time-travel mode, we're handling all the vhost-user + * FDs by polling them whenever appropriate. However, we may get + * into a situation where we're sending out an interrupt message + * to a device (e.g. a net device) and need to handle a simulation + * time message while doing so, e.g. one that tells us to update + * our idea of how long we can run without scheduling. + * + * Thus, we need to not just read() from the given fd, but need + * to also handle messages for the simulation time - this function + * does that for us while waiting for the given fd to be readable. + */ + if (wait) + time_travel_wait_readable(fd); + + rc = vhost_user_recv_header(fd, msg); if (rc == -ECONNRESET && vu_dev->registered) { struct virtio_uml_platform_data *pdata; @@ -142,7 +166,8 @@ static int vhost_user_recv_resp(struct virtio_uml_device *vu_dev, struct vhost_user_msg *msg, size_t max_payload_size) { - int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, max_payload_size); + int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, + max_payload_size, true); if (rc) return rc; @@ -172,7 +197,8 @@ static int vhost_user_recv_req(struct virtio_uml_device *vu_dev, struct vhost_user_msg *msg, size_t max_payload_size) { - int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, max_payload_size); + int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, + max_payload_size, false); if (rc) return rc; @@ -189,6 +215,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, int *fds, size_t num_fds) { size_t size = sizeof(msg->header) + msg->header.size; + unsigned long flags; bool request_ack; int rc; @@ -207,24 +234,28 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev, if (request_ack) msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY; + spin_lock_irqsave(&vu_dev->sock_lock, flags); rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds); if (rc < 0) - return rc; + goto out; if (request_ack) { uint64_t status; rc = vhost_user_recv_u64(vu_dev, &status); if (rc) - return rc; + goto out; if (status) { vu_err(vu_dev, "slave reports error: %llu\n", status); - return -EIO; + rc = -EIO; + goto out; } } - return 0; +out: + spin_unlock_irqrestore(&vu_dev->sock_lock, flags); + return rc; } static int vhost_user_send_no_payload(struct virtio_uml_device *vu_dev, @@ -324,6 +355,7 @@ static void vhost_user_reply(struct virtio_uml_device *vu_dev, static irqreturn_t vu_req_interrupt(int irq, void *data) { struct virtio_uml_device *vu_dev = data; + struct virtqueue *vq; int response = 1; struct { struct vhost_user_msg msg; @@ -343,6 +375,15 @@ static irqreturn_t vu_req_interrupt(int irq, void *data) virtio_config_changed(&vu_dev->vdev); response = 0; break; + case VHOST_USER_SLAVE_VRING_CALL: + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + if (vq->index == msg.msg.payload.vring_state.index) { + response = 0; + vring_interrupt(0 /* ignored */, vq); + break; + } + } + break; case VHOST_USER_SLAVE_IOTLB_MSG: /* not supported - VIRTIO_F_IOMMU_PLATFORM */ case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG: @@ -684,6 +725,17 @@ static bool vu_notify(struct virtqueue *vq) const uint64_t n = 1; int rc; + time_travel_propagate_time(); + + if (info->kick_fd < 0) { + struct virtio_uml_device *vu_dev; + + vu_dev = to_virtio_uml_device(vq->vdev); + + return vhost_user_set_vring_state(vu_dev, VHOST_USER_VRING_KICK, + vq->index, 0) == 0; + } + do { rc = os_write_file(info->kick_fd, &n, sizeof(n)); } while (rc == -EINTR); @@ -749,10 +801,13 @@ static void vu_del_vq(struct virtqueue *vq) { struct virtio_uml_vq_info *info = vq->priv; - um_free_irq(VIRTIO_IRQ, vq); + if (info->call_fd >= 0) { + um_free_irq(VIRTIO_IRQ, vq); + os_close_file(info->call_fd); + } - os_close_file(info->call_fd); - os_close_file(info->kick_fd); + if (info->kick_fd >= 0) + os_close_file(info->kick_fd); vring_del_virtqueue(vq); kfree(info); @@ -782,6 +837,15 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev, int call_fds[2]; int rc; + /* no call FD needed/desired in this case */ + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) && + vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)) { + info->call_fd = -1; + return 0; + } + /* Use a pipe for call fd, since SIGIO is not supported for eventfd */ rc = os_pipe(call_fds, true, true); if (rc < 0) @@ -810,6 +874,23 @@ out: return rc; } +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +static void vu_defer_irq_handle(struct time_travel_event *d) +{ + struct virtio_uml_vq_info *info; + + info = container_of(d, struct virtio_uml_vq_info, defer); + info->callback(info->vq); +} + +static void vu_defer_irq_callback(struct virtqueue *vq) +{ + struct virtio_uml_vq_info *info = vq->priv; + + time_travel_add_irq_event(&info->defer); +} +#endif + static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, unsigned index, vq_callback_t *callback, const char *name, bool ctx) @@ -829,6 +910,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, snprintf(info->name, sizeof(info->name), "%s.%d-%s", pdev->name, pdev->id, name); +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + /* + * When we get an interrupt, we must bounce it through the simulation + * calendar (the simtime device), except for the simtime device itself + * since that's part of the simulation control. + */ + if (time_travel_mode == TT_MODE_EXTERNAL && callback) { + info->callback = callback; + callback = vu_defer_irq_callback; + time_travel_set_event_fn(&info->defer, vu_defer_irq_handle); + } +#endif + vq = vring_create_virtqueue(index, num, PAGE_SIZE, vdev, true, true, ctx, vu_notify, callback, info->name); if (!vq) { @@ -837,11 +931,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, } vq->priv = info; num = virtqueue_get_vring_size(vq); +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + info->vq = vq; +#endif - rc = os_eventfd(0, 0); - if (rc < 0) - goto error_kick; - info->kick_fd = rc; + if (vu_dev->protocol_features & + BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) { + info->kick_fd = -1; + } else { + rc = os_eventfd(0, 0); + if (rc < 0) + goto error_kick; + info->kick_fd = rc; + } rc = vu_setup_vq_call_fd(vu_dev, vq); if (rc) @@ -866,10 +968,13 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev, return vq; error_setup: - um_free_irq(VIRTIO_IRQ, vq); - os_close_file(info->call_fd); + if (info->call_fd >= 0) { + um_free_irq(VIRTIO_IRQ, vq); + os_close_file(info->call_fd); + } error_call: - os_close_file(info->kick_fd); + if (info->kick_fd >= 0) + os_close_file(info->kick_fd); error_kick: vring_del_virtqueue(vq); error_create: @@ -908,10 +1013,12 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs, list_for_each_entry(vq, &vdev->vqs, list) { struct virtio_uml_vq_info *info = vq->priv; - rc = vhost_user_set_vring_kick(vu_dev, vq->index, - info->kick_fd); - if (rc) - goto error_setup; + if (info->kick_fd >= 0) { + rc = vhost_user_set_vring_kick(vu_dev, vq->index, + info->kick_fd); + if (rc) + goto error_setup; + } rc = vhost_user_set_vring_enable(vu_dev, vq->index, true); if (rc) @@ -1008,6 +1115,8 @@ static int virtio_uml_probe(struct platform_device *pdev) return rc; vu_dev->sock = rc; + spin_lock_init(&vu_dev->sock_lock); + rc = vhost_user_init(vu_dev); if (rc) goto error_init; diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index db7d9d4e30d8..8d435f8a6dec 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -3,7 +3,6 @@ generic-y += bpf_perf_event.h generic-y += bug.h generic-y += compat.h generic-y += current.h -generic-y += delay.h generic-y += device.h generic-y += emergency-restart.h generic-y += exec.h diff --git a/arch/um/include/asm/delay.h b/arch/um/include/asm/delay.h new file mode 100644 index 000000000000..56fc2b8f2dd0 --- /dev/null +++ b/arch/um/include/asm/delay.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_DELAY_H +#define __UM_DELAY_H +#include +#include + +static inline void um_ndelay(unsigned long nsecs) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + time_travel_ndelay(nsecs); + return; + } + ndelay(nsecs); +} +#undef ndelay +#define ndelay um_ndelay + +static inline void um_udelay(unsigned long usecs) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) { + time_travel_ndelay(1000 * usecs); + return; + } + udelay(usecs); +} +#undef udelay +#define udelay um_udelay +#endif /* __UM_DELAY_H */ diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h new file mode 100644 index 000000000000..f3b03d39a854 --- /dev/null +++ b/arch/um/include/linux/time-internal.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2012 - 2014 Cisco Systems + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + */ + +#ifndef __TIMER_INTERNAL_H__ +#define __TIMER_INTERNAL_H__ +#include + +#define TIMER_MULTIPLIER 256 +#define TIMER_MIN_DELTA 500 + +enum time_travel_mode { + TT_MODE_OFF, + TT_MODE_BASIC, + TT_MODE_INFCPU, + TT_MODE_EXTERNAL, +}; + +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +struct time_travel_event { + unsigned long long time; + void (*fn)(struct time_travel_event *d); + struct list_head list; + bool pending, onstack; +}; + +extern enum time_travel_mode time_travel_mode; + +void time_travel_sleep(unsigned long long duration); + +static inline void +time_travel_set_event_fn(struct time_travel_event *e, + void (*fn)(struct time_travel_event *d)) +{ + e->fn = fn; +} + +void __time_travel_propagate_time(void); + +static inline void time_travel_propagate_time(void) +{ + if (time_travel_mode == TT_MODE_EXTERNAL) + __time_travel_propagate_time(); +} + +void __time_travel_wait_readable(int fd); + +static inline void time_travel_wait_readable(int fd) +{ + if (time_travel_mode == TT_MODE_EXTERNAL) + __time_travel_wait_readable(fd); +} + +void time_travel_add_irq_event(struct time_travel_event *e); +#else +struct time_travel_event { +}; + +#define time_travel_mode TT_MODE_OFF + +static inline void time_travel_sleep(unsigned long long duration) +{ +} + +/* this is a macro so the event/function need not exist */ +#define time_travel_set_event_fn(e, fn) do {} while (0) + +static inline void time_travel_propagate_time(void) +{ +} + +static inline void time_travel_wait_readable(int fd) +{ +} +#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ + +/* + * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used, + * which is intentional since we really shouldn't link it in that case. + */ +void time_travel_ndelay(unsigned long nsec); +#endif /* __TIMER_INTERNAL_H__ */ diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 0f30204b6afa..f467d28fc0b4 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -181,6 +181,7 @@ extern int os_falloc_punch(int fd, unsigned long long offset, int count); extern int os_eventfd(unsigned int initval, int flags); extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, unsigned int fds_num); +int os_poll(unsigned int n, const int *fds); /* start_up.c */ extern void os_early_checks(void); diff --git a/arch/um/include/shared/timer-internal.h b/arch/um/include/shared/timer-internal.h deleted file mode 100644 index 2d2d13c9b46f..000000000000 --- a/arch/um/include/shared/timer-internal.h +++ /dev/null @@ -1,76 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2012 - 2014 Cisco Systems - * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __TIMER_INTERNAL_H__ -#define __TIMER_INTERNAL_H__ - -#define TIMER_MULTIPLIER 256 -#define TIMER_MIN_DELTA 500 - -enum time_travel_mode { - TT_MODE_OFF, - TT_MODE_BASIC, - TT_MODE_INFCPU, -}; - -enum time_travel_timer_mode { - TT_TMR_DISABLED, - TT_TMR_ONESHOT, - TT_TMR_PERIODIC, -}; - -#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT -extern enum time_travel_mode time_travel_mode; -extern unsigned long long time_travel_time; -extern enum time_travel_timer_mode time_travel_timer_mode; -extern unsigned long long time_travel_timer_expiry; -extern unsigned long long time_travel_timer_interval; - -static inline void time_travel_set_time(unsigned long long ns) -{ - time_travel_time = ns; -} - -static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode) -{ - time_travel_timer_mode = mode; -} - -static inline void time_travel_set_timer_expiry(unsigned long long expiry) -{ - time_travel_timer_expiry = expiry; -} - -static inline void time_travel_set_timer_interval(unsigned long long interval) -{ - time_travel_timer_interval = interval; -} -#else -#define time_travel_mode TT_MODE_OFF -#define time_travel_time 0 -#define time_travel_timer_expiry 0 -#define time_travel_timer_interval 0 - -static inline void time_travel_set_time(unsigned long long ns) -{ -} - -static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode) -{ -} - -static inline void time_travel_set_timer_expiry(unsigned long long expiry) -{ -} - -static inline void time_travel_set_timer_interval(unsigned long long interval) -{ -} - -#define time_travel_timer_mode TT_TMR_DISABLED -#endif - -#endif diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c index 98bdf69e4c2e..e4abac6c9727 100644 --- a/arch/um/kernel/kmsg_dump.c +++ b/arch/um/kernel/kmsg_dump.c @@ -9,20 +9,19 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { static char line[1024]; - + struct console *con; size_t len = 0; - bool con_available = false; /* only dump kmsg when no console is available */ if (!console_trylock()) return; - if (console_drivers != NULL) - con_available = true; + for_each_console(con) + break; console_unlock(); - if (con_available == true) + if (con) return; printf("kmsg_dump:\n"); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 56a094182bf5..cbe33af2a880 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include /* * This is a per-cpu array. A processor only modifies its entry and it only @@ -203,43 +203,6 @@ void initial_thread_cb(void (*proc)(void *), void *arg) kmalloc_ok = save_kmalloc_ok; } -static void time_travel_sleep(unsigned long long duration) -{ - unsigned long long next = time_travel_time + duration; - - if (time_travel_mode != TT_MODE_INFCPU) - os_timer_disable(); - - while (time_travel_timer_mode == TT_TMR_PERIODIC && - time_travel_timer_expiry < time_travel_time) - time_travel_set_timer_expiry(time_travel_timer_expiry + - time_travel_timer_interval); - - if (time_travel_timer_mode != TT_TMR_DISABLED && - time_travel_timer_expiry < next) { - if (time_travel_timer_mode == TT_TMR_ONESHOT) - time_travel_set_timer_mode(TT_TMR_DISABLED); - /* - * In basic mode, time_travel_time will be adjusted in - * the timer IRQ handler so it works even when the signal - * comes from the OS timer, see there. - */ - if (time_travel_mode != TT_MODE_BASIC) - time_travel_set_time(time_travel_timer_expiry); - - deliver_alarm(); - } else { - time_travel_set_time(next); - } - - if (time_travel_mode != TT_MODE_INFCPU) { - if (time_travel_timer_mode == TT_TMR_PERIODIC) - os_timer_set_interval(time_travel_timer_interval); - else if (time_travel_timer_mode == TT_TMR_ONESHOT) - os_timer_one_shot(time_travel_timer_expiry - next); - } -} - static void um_idle_sleep(void) { unsigned long long duration = UM_NSEC_PER_SEC; diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index 40d90dddf3f1..0a12d5a09217 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include void handle_syscall(struct uml_pt_regs *r) { @@ -24,7 +24,8 @@ void handle_syscall(struct uml_pt_regs *r) * went to sleep, even if said userspace interacts with the kernel in * various ways. */ - if (time_travel_mode == TT_MODE_INFCPU) + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) schedule(); /* Initialize the syscall number and default return value. */ diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 94ea87bd231c..25eaa6a0c658 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -4,6 +4,7 @@ * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2012-2014 Cisco Systems * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright (C) 2019 Intel Corporation */ #include @@ -18,21 +19,484 @@ #include #include #include -#include +#include +#include #include #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT enum time_travel_mode time_travel_mode; -unsigned long long time_travel_time; -enum time_travel_timer_mode time_travel_timer_mode; -unsigned long long time_travel_timer_expiry; -unsigned long long time_travel_timer_interval; +EXPORT_SYMBOL_GPL(time_travel_mode); static bool time_travel_start_set; static unsigned long long time_travel_start; -#else +static unsigned long long time_travel_time; +static LIST_HEAD(time_travel_events); +static unsigned long long time_travel_timer_interval; +static unsigned long long time_travel_next_event; +static struct time_travel_event time_travel_timer_event; +static int time_travel_ext_fd = -1; +static unsigned int time_travel_ext_waiting; +static bool time_travel_ext_prev_request_valid; +static unsigned long long time_travel_ext_prev_request; +static bool time_travel_ext_free_until_valid; +static unsigned long long time_travel_ext_free_until; + +static void time_travel_set_time(unsigned long long ns) +{ + if (unlikely(ns < time_travel_time)) + panic("time-travel: time goes backwards %lld -> %lld\n", + time_travel_time, ns); + time_travel_time = ns; +} + +enum time_travel_message_handling { + TTMH_IDLE, + TTMH_POLL, + TTMH_READ, +}; + +static void time_travel_handle_message(struct um_timetravel_msg *msg, + enum time_travel_message_handling mode) +{ + struct um_timetravel_msg resp = { + .op = UM_TIMETRAVEL_ACK, + }; + int ret; + + /* + * Poll outside the locked section (if we're not called to only read + * the response) so we can get interrupts for e.g. virtio while we're + * here, but then we need to lock to not get interrupted between the + * read of the message and write of the ACK. + */ + if (mode != TTMH_READ) { + while (os_poll(1, &time_travel_ext_fd) != 0) { + if (mode == TTMH_IDLE) { + BUG_ON(!irqs_disabled()); + local_irq_enable(); + local_irq_disable(); + } + } + } + + ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); + + if (ret == 0) + panic("time-travel external link is broken\n"); + if (ret != sizeof(*msg)) + panic("invalid time-travel message - %d bytes\n", ret); + + switch (msg->op) { + default: + WARN_ONCE(1, "time-travel: unexpected message %lld\n", + (unsigned long long)msg->op); + break; + case UM_TIMETRAVEL_ACK: + return; + case UM_TIMETRAVEL_RUN: + time_travel_set_time(msg->time); + break; + case UM_TIMETRAVEL_FREE_UNTIL: + time_travel_ext_free_until_valid = true; + time_travel_ext_free_until = msg->time; + break; + } + + os_write_file(time_travel_ext_fd, &resp, sizeof(resp)); +} + +static u64 time_travel_ext_req(u32 op, u64 time) +{ + static int seq; + int mseq = ++seq; + struct um_timetravel_msg msg = { + .op = op, + .time = time, + .seq = mseq, + }; + unsigned long flags; + + /* + * We need to save interrupts here and only restore when we + * got the ACK - otherwise we can get interrupted and send + * another request while we're still waiting for an ACK, but + * the peer doesn't know we got interrupted and will send + * the ACKs in the same order as the message, but we'd need + * to see them in the opposite order ... + * + * This wouldn't matter *too* much, but some ACKs carry the + * current time (for UM_TIMETRAVEL_GET) and getting another + * ACK without a time would confuse us a lot! + * + * The sequence number assignment that happens here lets us + * debug such message handling issues more easily. + */ + local_irq_save(flags); + os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); + + while (msg.op != UM_TIMETRAVEL_ACK) + time_travel_handle_message(&msg, TTMH_READ); + + if (msg.seq != mseq) + panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n", + msg.op, msg.seq, mseq, msg.time); + + if (op == UM_TIMETRAVEL_GET) + time_travel_set_time(msg.time); + local_irq_restore(flags); + + return msg.time; +} + +void __time_travel_wait_readable(int fd) +{ + int fds[2] = { fd, time_travel_ext_fd }; + int ret; + + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + while ((ret = os_poll(2, fds))) { + struct um_timetravel_msg msg; + + if (ret == 1) + time_travel_handle_message(&msg, TTMH_READ); + } +} +EXPORT_SYMBOL_GPL(__time_travel_wait_readable); + +static void time_travel_ext_update_request(unsigned long long time) +{ + if (time_travel_mode != TT_MODE_EXTERNAL) + return; + + /* asked for exactly this time previously */ + if (time_travel_ext_prev_request_valid && + time == time_travel_ext_prev_request) + return; + + time_travel_ext_prev_request = time; + time_travel_ext_prev_request_valid = true; + time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time); +} + +void __time_travel_propagate_time(void) +{ + time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time); +} +EXPORT_SYMBOL_GPL(__time_travel_propagate_time); + +/* returns true if we must do a wait to the simtime device */ +static bool time_travel_ext_request(unsigned long long time) +{ + /* + * If we received an external sync point ("free until") then we + * don't have to request/wait for anything until then, unless + * we're already waiting. + */ + if (!time_travel_ext_waiting && time_travel_ext_free_until_valid && + time < time_travel_ext_free_until) + return false; + + time_travel_ext_update_request(time); + return true; +} + +static void time_travel_ext_wait(bool idle) +{ + struct um_timetravel_msg msg = { + .op = UM_TIMETRAVEL_ACK, + }; + + time_travel_ext_prev_request_valid = false; + time_travel_ext_waiting++; + + time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1); + + /* + * Here we are deep in the idle loop, so we have to break out of the + * kernel abstraction in a sense and implement this in terms of the + * UML system waiting on the VQ interrupt while sleeping, when we get + * the signal it'll call time_travel_ext_vq_notify_done() completing the + * call. + */ + while (msg.op != UM_TIMETRAVEL_RUN) + time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL); + + time_travel_ext_waiting--; + + /* we might request more stuff while polling - reset when we run */ + time_travel_ext_prev_request_valid = false; +} + +static void time_travel_ext_get_time(void) +{ + time_travel_ext_req(UM_TIMETRAVEL_GET, -1); +} + +static void __time_travel_update_time(unsigned long long ns, bool idle) +{ + if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns)) + time_travel_ext_wait(idle); + else + time_travel_set_time(ns); +} + +static struct time_travel_event *time_travel_first_event(void) +{ + return list_first_entry_or_null(&time_travel_events, + struct time_travel_event, + list); +} + +static void __time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + struct time_travel_event *tmp; + bool inserted = false; + + if (WARN(time_travel_mode == TT_MODE_BASIC && + e != &time_travel_timer_event, + "only timer events can be handled in basic mode")) + return; + + if (e->pending) + return; + + e->pending = true; + e->time = time; + + list_for_each_entry(tmp, &time_travel_events, list) { + /* + * Add the new entry before one with higher time, + * or if they're equal and both on stack, because + * in that case we need to unwind the stack in the + * right order, and the later event (timer sleep + * or such) must be dequeued first. + */ + if ((tmp->time > e->time) || + (tmp->time == e->time && tmp->onstack && e->onstack)) { + list_add_tail(&e->list, &tmp->list); + inserted = true; + break; + } + } + + if (!inserted) + list_add_tail(&e->list, &time_travel_events); + + tmp = time_travel_first_event(); + time_travel_ext_update_request(tmp->time); + time_travel_next_event = tmp->time; +} + +static void time_travel_add_event(struct time_travel_event *e, + unsigned long long time) +{ + if (WARN_ON(!e->fn)) + return; + + __time_travel_add_event(e, time); +} + +void time_travel_periodic_timer(struct time_travel_event *e) +{ + time_travel_add_event(&time_travel_timer_event, + time_travel_time + time_travel_timer_interval); + deliver_alarm(); +} + +static void time_travel_deliver_event(struct time_travel_event *e) +{ + if (e == &time_travel_timer_event) { + /* + * deliver_alarm() does the irq_enter/irq_exit + * by itself, so must handle it specially here + */ + e->fn(e); + } else { + unsigned long flags; + + local_irq_save(flags); + irq_enter(); + e->fn(e); + irq_exit(); + local_irq_restore(flags); + } +} + +static bool time_travel_del_event(struct time_travel_event *e) +{ + if (!e->pending) + return false; + list_del(&e->list); + e->pending = false; + return true; +} + +static void time_travel_update_time(unsigned long long next, bool idle) +{ + struct time_travel_event ne = { + .onstack = true, + }; + struct time_travel_event *e; + bool finished = idle; + + /* add it without a handler - we deal with that specifically below */ + __time_travel_add_event(&ne, next); + + do { + e = time_travel_first_event(); + + BUG_ON(!e); + __time_travel_update_time(e->time, idle); + + /* new events may have been inserted while we were waiting */ + if (e == time_travel_first_event()) { + BUG_ON(!time_travel_del_event(e)); + BUG_ON(time_travel_time != e->time); + + if (e == &ne) { + finished = true; + } else { + if (e->onstack) + panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n", + time_travel_time, e->time, e); + time_travel_deliver_event(e); + } + } + + e = time_travel_first_event(); + if (e) + time_travel_ext_update_request(e->time); + } while (ne.pending && !finished); + + time_travel_del_event(&ne); +} + +void time_travel_ndelay(unsigned long nsec) +{ + time_travel_update_time(time_travel_time + nsec, false); +} +EXPORT_SYMBOL(time_travel_ndelay); + +void time_travel_add_irq_event(struct time_travel_event *e) +{ + BUG_ON(time_travel_mode != TT_MODE_EXTERNAL); + + time_travel_ext_get_time(); + /* + * We could model interrupt latency here, for now just + * don't have any latency at all and request the exact + * same time (again) to run the interrupt... + */ + time_travel_add_event(e, time_travel_time); +} +EXPORT_SYMBOL_GPL(time_travel_add_irq_event); + +static void time_travel_oneshot_timer(struct time_travel_event *e) +{ + deliver_alarm(); +} + +void time_travel_sleep(unsigned long long duration) +{ + unsigned long long next = time_travel_time + duration; + + if (time_travel_mode == TT_MODE_BASIC) + os_timer_disable(); + + time_travel_update_time(next, true); + + if (time_travel_mode == TT_MODE_BASIC && + time_travel_timer_event.pending) { + if (time_travel_timer_event.fn == time_travel_periodic_timer) { + /* + * This is somewhat wrong - we should get the first + * one sooner like the os_timer_one_shot() below... + */ + os_timer_set_interval(time_travel_timer_interval); + } else { + os_timer_one_shot(time_travel_timer_event.time - next); + } + } +} + +static void time_travel_handle_real_alarm(void) +{ + time_travel_set_time(time_travel_next_event); + + time_travel_del_event(&time_travel_timer_event); + + if (time_travel_timer_event.fn == time_travel_periodic_timer) + time_travel_add_event(&time_travel_timer_event, + time_travel_time + + time_travel_timer_interval); +} + +static void time_travel_set_interval(unsigned long long interval) +{ + time_travel_timer_interval = interval; +} + +static int time_travel_connect_external(const char *socket) +{ + const char *sep; + unsigned long long id = (unsigned long long)-1; + int rc; + + if ((sep = strchr(socket, ':'))) { + char buf[25] = {}; + if (sep - socket > sizeof(buf) - 1) + goto invalid_number; + + memcpy(buf, socket, sep - socket); + if (kstrtoull(buf, 0, &id)) { +invalid_number: + panic("time-travel: invalid external ID in string '%s'\n", + socket); + return -EINVAL; + } + + socket = sep + 1; + } + + rc = os_connect_socket(socket); + if (rc < 0) { + panic("time-travel: failed to connect to external socket %s\n", + socket); + return rc; + } + + time_travel_ext_fd = rc; + + time_travel_ext_req(UM_TIMETRAVEL_START, id); + + return 1; +} +#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ #define time_travel_start_set 0 #define time_travel_start 0 +#define time_travel_time 0 + +static inline void time_travel_update_time(unsigned long long ns, bool retearly) +{ +} + +static inline void time_travel_handle_real_alarm(void) +{ +} + +static void time_travel_set_interval(unsigned long long interval) +{ +} + +/* fail link if this actually gets used */ +extern u64 time_travel_ext_req(u32 op, u64 time); + +/* these are empty macros so the struct/fn need not exist */ +#define time_travel_add_event(e, time) do { } while (0) +#define time_travel_del_event(e) do { } while (0) #endif void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) @@ -48,7 +512,7 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * never get any real signals from the OS. */ if (time_travel_mode == TT_MODE_BASIC) - time_travel_set_time(time_travel_timer_expiry); + time_travel_handle_real_alarm(); local_irq_save(flags); do_IRQ(TIMER_IRQ, regs); @@ -58,9 +522,10 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) static int itimer_shutdown(struct clock_event_device *evt) { if (time_travel_mode != TT_MODE_OFF) - time_travel_set_timer_mode(TT_TMR_DISABLED); + time_travel_del_event(&time_travel_timer_event); - if (time_travel_mode != TT_MODE_INFCPU) + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) os_timer_disable(); return 0; @@ -71,12 +536,16 @@ static int itimer_set_periodic(struct clock_event_device *evt) unsigned long long interval = NSEC_PER_SEC / HZ; if (time_travel_mode != TT_MODE_OFF) { - time_travel_set_timer_mode(TT_TMR_PERIODIC); - time_travel_set_timer_expiry(time_travel_time + interval); - time_travel_set_timer_interval(interval); + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_periodic_timer); + time_travel_set_interval(interval); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + interval); } - if (time_travel_mode != TT_MODE_INFCPU) + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) os_timer_set_interval(interval); return 0; @@ -88,11 +557,15 @@ static int itimer_next_event(unsigned long delta, delta += 1; if (time_travel_mode != TT_MODE_OFF) { - time_travel_set_timer_mode(TT_TMR_ONESHOT); - time_travel_set_timer_expiry(time_travel_time + delta); + time_travel_del_event(&time_travel_timer_event); + time_travel_set_event_fn(&time_travel_timer_event, + time_travel_oneshot_timer); + time_travel_add_event(&time_travel_timer_event, + time_travel_time + delta); } - if (time_travel_mode != TT_MODE_INFCPU) + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL) return os_timer_one_shot(delta); return 0; @@ -143,8 +616,17 @@ static u64 timer_read(struct clocksource *cs) * stuck in loops that expect time to move more than the * exact requested sleep amount, e.g. python's socket server, * see https://bugs.python.org/issue37026. + * + * However, don't do that when we're in interrupt or such as + * then we might recurse into our own processing, and get to + * even more waiting, and that's not good - it messes up the + * "what do I do next" and onstack event we use to know when + * to return from time_travel_update_time(). */ - time_travel_set_time(time_travel_time + TIMER_MULTIPLIER); + if (!irqs_disabled() && !in_interrupt() && !in_softirq()) + time_travel_update_time(time_travel_time + + TIMER_MULTIPLIER, + false); return time_travel_time / TIMER_MULTIPLIER; } @@ -188,6 +670,8 @@ void read_persistent_clock64(struct timespec64 *ts) if (time_travel_start_set) nsecs = time_travel_start + time_travel_time; + else if (time_travel_mode == TT_MODE_EXTERNAL) + nsecs = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1); else nsecs = os_persistent_clock_emulation(); @@ -204,7 +688,8 @@ void __init time_init(void) #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT unsigned long calibrate_delay_is_known(void) { - if (time_travel_mode == TT_MODE_INFCPU) + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) return 1; return 0; } @@ -218,6 +703,13 @@ int setup_time_travel(char *str) return 1; } + if (strncmp(str, "=ext:", 5) == 0) { + time_travel_mode = TT_MODE_EXTERNAL; + timer_clockevent.name = "time-travel-timer-external"; + timer_clocksource.name = "time-travel-clock-external"; + return time_travel_connect_external(str + 5); + } + if (!*str) { time_travel_mode = TT_MODE_BASIC; timer_clockevent.name = "time-travel-timer"; @@ -242,7 +734,15 @@ __uml_help(setup_time_travel, "are no wall clock timers, and any CPU processing happens - as seen from the\n" "guest - instantly. This can be useful for accurate simulation regardless of\n" "debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n" -"easily lead to getting stuck (e.g. if anything in the system busy loops).\n"); +"easily lead to getting stuck (e.g. if anything in the system busy loops).\n" +"\n" +"time-travel=ext:[ID:]/path/to/socket\n" +"This enables time travel mode similar to =inf-cpu, except the system will\n" +"use the given socket to coordinate with a central scheduler, in order to\n" +"have more than one system simultaneously be on simulated time. The virtio\n" +"driver code in UML knows about this so you can also simulate networks and\n" +"devices using it, assuming the device has the right capabilities.\n" +"The optional ID is a 64-bit integer that's sent to the central scheduler.\n"); int setup_time_travel_start(char *str) { diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 9f21443be2c9..3b6dab3d4501 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -19,10 +19,10 @@ SECTIONS __binary_start = START; . = START + SIZEOF_HEADERS; + . = ALIGN(PAGE_SIZE); _text = .; INIT_TEXT_SECTION(0) - . = ALIGN(PAGE_SIZE); .text : { diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index fbda10535dab..26ecbd64c409 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -5,9 +5,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -16,6 +18,7 @@ #include #include #include +#include #include static void copy_stat(struct uml_stat *dst, const struct stat64 *src) @@ -664,3 +667,31 @@ int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds, return -errno; return err; } + +int os_poll(unsigned int n, const int *fds) +{ + /* currently need 2 FDs at most so avoid dynamic allocation */ + struct pollfd pollfds[2] = {}; + unsigned int i; + int ret; + + if (n > ARRAY_SIZE(pollfds)) + return -EINVAL; + + for (i = 0; i < n; i++) { + pollfds[i].fd = fds[i]; + pollfds[i].events = POLLIN; + } + + ret = poll(pollfds, n, -1); + if (ret < 0) + return -errno; + + /* Return the index of the available FD */ + for (i = 0; i < n; i++) { + if (pollfds[i].revents) + return i; + } + + return -EIO; +} diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index 432f8e1f55c2..90f6de224c70 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -14,7 +14,6 @@ #include #include #include -#include static timer_t event_high_res_timer = 0; diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c index 44def53a11cd..9e16078a4bf8 100644 --- a/arch/um/os-Linux/umid.c +++ b/arch/um/os-Linux/umid.c @@ -220,11 +220,12 @@ static void __init create_pid_file(void) char pid[sizeof("nnnnn\0")], *file; int fd, n; - file = malloc(strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")); + n = strlen(uml_dir) + UMID_LEN + sizeof("/pid\0"); + file = malloc(n); if (!file) return; - if (umid_file_name("pid", file, sizeof(file))) + if (umid_file_name("pid", file, n)) goto out; fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644); diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 593d5f3902bd..478710384b34 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __UM_PROCESSOR_H #define __UM_PROCESSOR_H +#include /* include faultinfo structure */ #include @@ -21,12 +22,19 @@ #include /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { __asm__ __volatile__("rep;nop": : :"memory"); } -#define cpu_relax() rep_nop() +static __always_inline void cpu_relax(void) +{ + if (time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) + time_travel_ndelay(1); + else + rep_nop(); +} #define task_pt_regs(t) (&(t)->thread.regs) diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c index 73658b71d4a3..cd1769ecdc1c 100644 --- a/drivers/crypto/chelsio/chcr_ktls.c +++ b/drivers/crypto/chelsio/chcr_ktls.c @@ -2,6 +2,7 @@ /* Copyright (C) 2020 Chelsio Communications. All rights reserved. */ #ifdef CONFIG_CHELSIO_TLS_DEVICE +#include #include "chcr_ktls.h" #include "clip_tbl.h" diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c index acf874800ca4..b0411a1827a3 100644 --- a/drivers/ide/ide-scan-pci.c +++ b/drivers/ide/ide-scan-pci.c @@ -89,8 +89,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev) static int __init ide_scan_pcibus(void) { struct pci_dev *dev = NULL; - struct pci_driver *d; - struct list_head *l, *n; + struct pci_driver *d, *tmp; pre_init = 0; for_each_pci_dev(dev) @@ -101,9 +100,8 @@ static int __init ide_scan_pcibus(void) * are post init. */ - list_for_each_safe(l, n, &ide_pci_drivers) { - list_del(l); - d = list_entry(l, struct pci_driver, node); + list_for_each_entry_safe(d, tmp, &ide_pci_drivers, node) { + list_del(&d->node); if (__pci_register_driver(d, d->driver.owner, d->driver.mod_name)) printk(KERN_ERR "%s: failed to register %s driver\n", diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index 086dfb1b9d0b..91cdc0a2b1a7 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -148,7 +148,7 @@ static void slc_bump(struct slcan *sl) u32 tmpid; char *cmd = sl->rbuff; - cf.can_id = 0; + memset(&cf, 0, sizeof(cf)); switch (*cmd) { case 'r': @@ -187,8 +187,6 @@ static void slc_bump(struct slcan *sl) else return; - *(u64 *) (&cf.data) = 0; /* clear payload */ - /* RTR frames may have a dlc > 0 but they never have any data bytes */ if (!(cf.can_id & CAN_RTR_FLAG)) { for (i = 0; i < cf.can_dlc; i++) { diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index affa5c6e135c..c7ac63f41918 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -480,7 +480,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds) priv->slave_mii_bus->parent = ds->dev->parent; priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask; - err = of_mdiobus_register(priv->slave_mii_bus, dn); + err = mdiobus_register(priv->slave_mii_bus); if (err && dn) of_node_put(dn); @@ -1079,6 +1079,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) const struct bcm_sf2_of_data *data; struct b53_platform_data *pdata; struct dsa_switch_ops *ops; + struct device_node *ports; struct bcm_sf2_priv *priv; struct b53_device *dev; struct dsa_switch *ds; @@ -1146,7 +1147,11 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev) set_bit(0, priv->cfp.used); set_bit(0, priv->cfp.unique); - bcm_sf2_identify_ports(priv, dn->child); + ports = of_find_node_by_name(dn, "ports"); + if (ports) { + bcm_sf2_identify_ports(priv, ports); + of_node_put(ports); + } priv->irq0 = irq_of_parse_and_map(dn, 0); priv->irq1 = irq_of_parse_and_map(dn, 1); diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index ef57552db260..2d0d91db0ddb 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -1403,6 +1403,9 @@ mt7530_setup(struct dsa_switch *ds) continue; phy_node = of_parse_phandle(mac_np, "phy-handle", 0); + if (!phy_node) + continue; + if (phy_node->parent == priv->dev->of_node->parent) { ret = of_get_phy_mode(mac_np, &interface); if (ret && ret != -ENODEV) diff --git a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c index 97901c114bfa..fbe9d88b13c7 100644 --- a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c +++ b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c @@ -491,7 +491,7 @@ get_ingress_preclass_record(struct aq_hw_s *hw, rec->snap[1] = packed_record[8] & 0xFF; rec->llc = (packed_record[8] >> 8) & 0xFF; - rec->llc = packed_record[9] << 8; + rec->llc |= packed_record[9] << 8; rec->mac_sa[0] = packed_record[10]; rec->mac_sa[0] |= packed_record[11] << 16; diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c index 9638d65d8261..517caedc0a87 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c @@ -6874,7 +6874,8 @@ int bnx2x_link_update(struct link_params *params, struct link_vars *vars) case PORT_HW_CFG_PHY_SELECTION_FIRST_PHY_PRIORITY: /* In this option, the first PHY makes sure to pass the * traffic through itself only. - * Its not clear how to reset the link on the second phy + * It's not clear how to reset the link on the second + * phy. */ active_external_phy = EXT_PHY1; break; diff --git a/drivers/net/ethernet/cavium/common/cavium_ptp.h b/drivers/net/ethernet/cavium/common/cavium_ptp.h index a04eccbc78e8..1e0ffe8f4152 100644 --- a/drivers/net/ethernet/cavium/common/cavium_ptp.h +++ b/drivers/net/ethernet/cavium/common/cavium_ptp.h @@ -24,7 +24,7 @@ struct cavium_ptp { struct ptp_clock *ptp_clock; }; -#if IS_ENABLED(CONFIG_CAVIUM_PTP) +#if IS_REACHABLE(CONFIG_CAVIUM_PTP) struct cavium_ptp *cavium_ptp_get(void); void cavium_ptp_put(struct cavium_ptp *ptp); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index 75fde0d4d493..a70018f067aa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -3132,7 +3132,6 @@ static int cxgb_set_mac_addr(struct net_device *dev, void *p) return ret; memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); - pi->xact_addr_filt = ret; return 0; } @@ -6672,6 +6671,10 @@ static void shutdown_one(struct pci_dev *pdev) if (adapter->port[i]->reg_state == NETREG_REGISTERED) cxgb_close(adapter->port[i]); + rtnl_lock(); + cxgb4_mqprio_stop_offload(adapter); + rtnl_unlock(); + if (is_uld(adapter)) { detach_ulds(adapter); t4_uld_clean_up(adapter); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c index ec3eb45ee3b4..e6af4906d674 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c @@ -301,6 +301,7 @@ static void cxgb4_mqprio_free_hw_resources(struct net_device *dev) cxgb4_clear_msix_aff(eorxq->msix->vec, eorxq->msix->aff_mask); free_irq(eorxq->msix->vec, &eorxq->rspq); + cxgb4_free_msix_idx_in_bmap(adap, eorxq->msix->idx); } free_rspq_fl(adap, &eorxq->rspq, &eorxq->fl); @@ -611,6 +612,28 @@ out: return ret; } +void cxgb4_mqprio_stop_offload(struct adapter *adap) +{ + struct cxgb4_tc_port_mqprio *tc_port_mqprio; + struct net_device *dev; + u8 i; + + if (!adap->tc_mqprio || !adap->tc_mqprio->port_mqprio) + return; + + for_each_port(adap, i) { + dev = adap->port[i]; + if (!dev) + continue; + + tc_port_mqprio = &adap->tc_mqprio->port_mqprio[i]; + if (!tc_port_mqprio->mqprio.qopt.num_tc) + continue; + + cxgb4_mqprio_disable_offload(dev); + } +} + int cxgb4_init_tc_mqprio(struct adapter *adap) { struct cxgb4_tc_port_mqprio *tc_port_mqprio, *port_mqprio; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h index c532f1ef8451..ff8794132b22 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h @@ -38,6 +38,7 @@ struct cxgb4_tc_mqprio { int cxgb4_setup_tc_mqprio(struct net_device *dev, struct tc_mqprio_qopt_offload *mqprio); +void cxgb4_mqprio_stop_offload(struct adapter *adap); int cxgb4_init_tc_mqprio(struct adapter *adap); void cxgb4_cleanup_tc_mqprio(struct adapter *adap); #endif /* __CXGB4_TC_MQPRIO_H__ */ diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 835b7816e372..87236206366f 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1731,7 +1731,7 @@ static int ftgmac100_setup_clk(struct ftgmac100 *priv) if (rc) goto cleanup_clk; - /* RCLK is for RMII, typically used for NCSI. Optional because its not + /* RCLK is for RMII, typically used for NCSI. Optional because it's not * necessary if it's the AST2400 MAC, or the MAC is configured for * RGMII, or the controller is not an ASPEED-based controller. */ diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index 2f76908cae73..51117a5a6bbf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -150,14 +150,20 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, u8 prio = act->vlan.prio; u16 vid = act->vlan.vid; - return mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei, - act->id, vid, - proto, prio, extack); + err = mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei, + act->id, vid, + proto, prio, extack); + if (err) + return err; + break; } case FLOW_ACTION_PRIORITY: - return mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei, - act->priority, - extack); + err = mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei, + act->priority, + extack); + if (err) + return err; + break; case FLOW_ACTION_MANGLE: { enum flow_action_mangle_base htype = act->mangle.htype; __be32 be_mask = (__force __be32) act->mangle.mask; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c index 9096ffd89e50..fbf714d027d8 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c @@ -643,7 +643,7 @@ static int mlxsw_sp_trap_policer_bs(u64 burst, u8 *p_burst_size, { int bs = fls64(burst) - 1; - if (burst != (1 << bs)) { + if (burst != (BIT_ULL(bs))) { NL_SET_ERR_MSG_MOD(extack, "Policer burst size is not power of two"); return -EINVAL; } diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c index 1a5fc2ae351c..29810a1aa210 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_l2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c @@ -369,8 +369,8 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent = NULL; struct qed_sp_init_data init_data; u8 abs_vport_id = 0; - int rc = -EINVAL; u16 rx_mode = 0; + int rc; rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id); if (rc) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 1305522f72d6..40efe60eff8d 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -282,7 +282,6 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], { struct rmnet_priv *priv = netdev_priv(dev); struct net_device *real_dev; - struct rmnet_endpoint *ep; struct rmnet_port *port; u16 mux_id; @@ -297,19 +296,27 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[], if (data[IFLA_RMNET_MUX_ID]) { mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]); - if (rmnet_get_endpoint(port, mux_id)) { - NL_SET_ERR_MSG_MOD(extack, "MUX ID already exists"); - return -EINVAL; + + if (mux_id != priv->mux_id) { + struct rmnet_endpoint *ep; + + ep = rmnet_get_endpoint(port, priv->mux_id); + if (!ep) + return -ENODEV; + + if (rmnet_get_endpoint(port, mux_id)) { + NL_SET_ERR_MSG_MOD(extack, + "MUX ID already exists"); + return -EINVAL; + } + + hlist_del_init_rcu(&ep->hlnode); + hlist_add_head_rcu(&ep->hlnode, + &port->muxed_ep[mux_id]); + + ep->mux_id = mux_id; + priv->mux_id = mux_id; } - ep = rmnet_get_endpoint(port, priv->mux_id); - if (!ep) - return -ENODEV; - - hlist_del_init_rcu(&ep->hlnode); - hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]); - - ep->mux_id = mux_id; - priv->mux_id = mux_id; } if (data[IFLA_RMNET_FLAGS]) { diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 55cb5730beb6..bf5bf05970a2 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5441,9 +5441,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netif_napi_add(dev, &tp->napi, rtl8169_poll, NAPI_POLL_WEIGHT); - dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | - NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_CTAG_RX; + dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM | + NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | NETIF_F_HIGHDMA; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; @@ -5460,26 +5459,26 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Disallow toggling */ dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX; + if (rtl_chip_supports_csum_v2(tp)) + dev->hw_features |= NETIF_F_IPV6_CSUM; + + dev->features |= dev->hw_features; + + /* There has been a number of reports that using SG/TSO results in + * tx timeouts. However for a lot of people SG/TSO works fine. + * Therefore disable both features by default, but allow users to + * enable them. Use at own risk! + */ if (rtl_chip_supports_csum_v2(tp)) { - dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6; + dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6; dev->gso_max_size = RTL_GSO_MAX_SIZE_V2; dev->gso_max_segs = RTL_GSO_MAX_SEGS_V2; } else { + dev->hw_features |= NETIF_F_SG | NETIF_F_TSO; dev->gso_max_size = RTL_GSO_MAX_SIZE_V1; dev->gso_max_segs = RTL_GSO_MAX_SEGS_V1; } - /* RTL8168e-vl and one RTL8168c variant are known to have a - * HW issue with TSO. - */ - if (tp->mac_version == RTL_GIGA_MAC_VER_34 || - tp->mac_version == RTL_GIGA_MAC_VER_22) { - dev->vlan_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG); - dev->hw_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG); - } - - dev->features |= dev->hw_features; - dev->hw_features |= NETIF_F_RXALL; dev->hw_features |= NETIF_F_RXFCS; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index 542784300620..efc6ec1b8027 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -207,7 +207,7 @@ static void dwmac1000_set_filter(struct mac_device_info *hw, reg++; } - while (reg <= perfect_addr_number) { + while (reg < perfect_addr_number) { writel(0, ioaddr + GMAC_ADDR_HIGH(reg)); writel(0, ioaddr + GMAC_ADDR_LOW(reg)); reg++; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c index 0e4575f7bedb..ad4df9bddcf3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c @@ -577,8 +577,13 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash, value |= XGMAC_VLAN_EDVLP; value |= XGMAC_VLAN_ESVL; value |= XGMAC_VLAN_DOVLTC; + } else { + value &= ~XGMAC_VLAN_EDVLP; + value &= ~XGMAC_VLAN_ESVL; + value &= ~XGMAC_VLAN_DOVLTC; } + value &= ~XGMAC_VLAN_VID; writel(value, ioaddr + XGMAC_VLAN_TAG); } else if (perfect_match) { u32 value = readl(ioaddr + XGMAC_PACKET_FILTER); @@ -589,13 +594,19 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash, value = readl(ioaddr + XGMAC_VLAN_TAG); + value &= ~XGMAC_VLAN_VTHM; value |= XGMAC_VLAN_ETV; if (is_double) { value |= XGMAC_VLAN_EDVLP; value |= XGMAC_VLAN_ESVL; value |= XGMAC_VLAN_DOVLTC; + } else { + value &= ~XGMAC_VLAN_EDVLP; + value &= ~XGMAC_VLAN_ESVL; + value &= ~XGMAC_VLAN_DOVLTC; } + value &= ~XGMAC_VLAN_VID; writel(value | perfect_match, ioaddr + XGMAC_VLAN_TAG); } else { u32 value = readl(ioaddr + XGMAC_PACKET_FILTER); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 2fb671e61ee8..e6898fd5223f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4566,9 +4566,13 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid return ret; } - ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); + if (priv->hw->num_vlan) { + ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); + if (ret) + return ret; + } - return ret; + return 0; } static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid) @@ -4581,9 +4585,12 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi is_double = true; clear_bit(vid, priv->active_vlans); - ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); - if (ret) - return ret; + + if (priv->hw->num_vlan) { + ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid); + if (ret) + return ret; + } return stmmac_vlan_update(priv, is_double); } diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index da82d7f16a09..0d580d81d910 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -2594,6 +2594,9 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) return PTR_ERR(dev); macsec = macsec_priv(dev); + if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) + return -EINVAL; + offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); if (macsec->offload == offload) return 0; diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index 481cf48c9b9e..31f731e6df72 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -425,8 +425,8 @@ static int at803x_parse_dt(struct phy_device *phydev) */ if (at803x_match_phy_id(phydev, ATH8030_PHY_ID) || at803x_match_phy_id(phydev, ATH8035_PHY_ID)) { - priv->clk_25m_reg &= ~AT8035_CLK_OUT_MASK; - priv->clk_25m_mask &= ~AT8035_CLK_OUT_MASK; + priv->clk_25m_reg &= AT8035_CLK_OUT_MASK; + priv->clk_25m_mask &= AT8035_CLK_OUT_MASK; } } diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 2ec19e5540bf..05d20343b816 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -25,6 +25,7 @@ #include #include #include +#include /* Operation Mode Strap Override */ #define MII_KSZPHY_OMSO 0x16 @@ -952,6 +953,12 @@ static int kszphy_resume(struct phy_device *phydev) genphy_resume(phydev); + /* After switching from power-down to normal mode, an internal global + * reset is automatically generated. Wait a minimum of 1 ms before + * read/write access to the PHY registers. + */ + usleep_range(1000, 2000); + ret = kszphy_config_reset(phydev); if (ret) return ret; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 228fe449dc6d..07476c6510f2 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1678,8 +1678,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, alloc_frag->offset += buflen; } err = tun_xdp_act(tun, xdp_prog, &xdp, act); - if (err < 0) - goto err_xdp; + if (err < 0) { + if (act == XDP_REDIRECT || act == XDP_TX) + put_page(alloc_frag->page); + goto out; + } + if (err == XDP_REDIRECT) xdp_do_flush(); if (err != XDP_PASS) @@ -1693,8 +1697,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun, return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad); -err_xdp: - put_page(alloc_frag->page); out: rcu_read_unlock(); local_bh_enable(); diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c index 8783e2ab3ec0..0ef7e1f443e3 100644 --- a/drivers/net/usb/pegasus.c +++ b/drivers/net/usb/pegasus.c @@ -54,6 +54,7 @@ static const char driver_name[] = "pegasus"; #undef PEGASUS_WRITE_EEPROM #define BMSR_MEDIA (BMSR_10HALF | BMSR_10FULL | BMSR_100HALF | \ BMSR_100FULL | BMSR_ANEGCAPABLE) +#define CARRIER_CHECK_DELAY (2 * HZ) static bool loopback; static bool mii_mode; @@ -1089,17 +1090,12 @@ static inline void setup_pegasus_II(pegasus_t *pegasus) set_register(pegasus, Reg81, 2); } - -static int pegasus_count; -static struct workqueue_struct *pegasus_workqueue; -#define CARRIER_CHECK_DELAY (2 * HZ) - static void check_carrier(struct work_struct *work) { pegasus_t *pegasus = container_of(work, pegasus_t, carrier_check.work); set_carrier(pegasus->net); if (!(pegasus->flags & PEGASUS_UNPLUG)) { - queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check, + queue_delayed_work(system_long_wq, &pegasus->carrier_check, CARRIER_CHECK_DELAY); } } @@ -1120,18 +1116,6 @@ static int pegasus_blacklisted(struct usb_device *udev) return 0; } -/* we rely on probe() and remove() being serialized so we - * don't need extra locking on pegasus_count. - */ -static void pegasus_dec_workqueue(void) -{ - pegasus_count--; - if (pegasus_count == 0) { - destroy_workqueue(pegasus_workqueue); - pegasus_workqueue = NULL; - } -} - static int pegasus_probe(struct usb_interface *intf, const struct usb_device_id *id) { @@ -1144,14 +1128,6 @@ static int pegasus_probe(struct usb_interface *intf, if (pegasus_blacklisted(dev)) return -ENODEV; - if (pegasus_count == 0) { - pegasus_workqueue = alloc_workqueue("pegasus", WQ_MEM_RECLAIM, - 0); - if (!pegasus_workqueue) - return -ENOMEM; - } - pegasus_count++; - net = alloc_etherdev(sizeof(struct pegasus)); if (!net) goto out; @@ -1209,7 +1185,7 @@ static int pegasus_probe(struct usb_interface *intf, res = register_netdev(net); if (res) goto out3; - queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check, + queue_delayed_work(system_long_wq, &pegasus->carrier_check, CARRIER_CHECK_DELAY); dev_info(&intf->dev, "%s, %s, %pM\n", net->name, usb_dev_id[dev_index].name, net->dev_addr); @@ -1222,7 +1198,6 @@ out2: out1: free_netdev(net); out: - pegasus_dec_workqueue(); return res; } @@ -1237,7 +1212,7 @@ static void pegasus_disconnect(struct usb_interface *intf) } pegasus->flags |= PEGASUS_UNPLUG; - cancel_delayed_work(&pegasus->carrier_check); + cancel_delayed_work_sync(&pegasus->carrier_check); unregister_netdev(pegasus->net); unlink_all_urbs(pegasus); free_all_urbs(pegasus); @@ -1246,7 +1221,6 @@ static void pegasus_disconnect(struct usb_interface *intf) pegasus->rx_skb = NULL; } free_netdev(pegasus->net); - pegasus_dec_workqueue(); } static int pegasus_suspend(struct usb_interface *intf, pm_message_t message) @@ -1254,7 +1228,7 @@ static int pegasus_suspend(struct usb_interface *intf, pm_message_t message) struct pegasus *pegasus = usb_get_intfdata(intf); netif_device_detach(pegasus->net); - cancel_delayed_work(&pegasus->carrier_check); + cancel_delayed_work_sync(&pegasus->carrier_check); if (netif_running(pegasus->net)) { usb_kill_urb(pegasus->rx_urb); usb_kill_urb(pegasus->intr_urb); @@ -1276,7 +1250,7 @@ static int pegasus_resume(struct usb_interface *intf) pegasus->intr_urb->actual_length = 0; intr_callback(pegasus->intr_urb); } - queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check, + queue_delayed_work(system_long_wq, &pegasus->carrier_check, CARRIER_CHECK_DELAY); return 0; } diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index f66c0f8f6f4a..ecb3fccca603 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -740,9 +740,6 @@ EXPORT_SYMBOL_GPL(i2400m_error_recovery); static int i2400m_bm_buf_alloc(struct i2400m *i2400m) { - int result; - - result = -ENOMEM; i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL); if (i2400m->bm_cmd_buf == NULL) goto error_bm_cmd_kzalloc; @@ -754,7 +751,7 @@ int i2400m_bm_buf_alloc(struct i2400m *i2400m) error_bm_ack_buf_kzalloc: kfree(i2400m->bm_cmd_buf); error_bm_cmd_kzalloc: - return result; + return -ENOMEM; } @@ -843,7 +840,7 @@ EXPORT_SYMBOL_GPL(i2400m_reset); */ int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags) { - int result = -ENODEV; + int result; struct device *dev = i2400m_dev(i2400m); struct wimax_dev *wimax_dev = &i2400m->wimax_dev; struct net_device *net_dev = i2400m->wimax_dev.net_dev; diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c index 9d00a24277aa..f96e5eaee87e 100644 --- a/drivers/parisc/eisa.c +++ b/drivers/parisc/eisa.c @@ -243,11 +243,6 @@ static irqreturn_t dummy_irq2_handler(int _, void *dev) return IRQ_HANDLED; } -static struct irqaction irq2_action = { - .handler = dummy_irq2_handler, - .name = "cascade", -}; - static void init_eisa_pic(void) { unsigned long flags; @@ -335,7 +330,8 @@ static int __init eisa_probe(struct parisc_device *dev) } /* Reserve IRQ2 */ - setup_irq(2, &irq2_action); + if (request_irq(2, dummy_irq2_handler, 0, "cascade", NULL)) + pr_err("Failed to request irq 2 (cascade)\n"); for (i = 0; i < 16; i++) { irq_set_chip_and_handler(i, &eisa_interrupt_type, handle_simple_irq); diff --git a/drivers/pcmcia/cs_internal.h b/drivers/pcmcia/cs_internal.h index 33c9b6ea7364..fb9b17fa0fb5 100644 --- a/drivers/pcmcia/cs_internal.h +++ b/drivers/pcmcia/cs_internal.h @@ -40,7 +40,7 @@ struct cis_cache_entry { unsigned int addr; unsigned int len; unsigned int attr; - unsigned char cache[0]; + unsigned char cache[]; }; struct pccard_resource_ops { diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c index 0a04eb04f3a2..d3ef5534991e 100644 --- a/drivers/pcmcia/omap_cf.c +++ b/drivers/pcmcia/omap_cf.c @@ -329,7 +329,7 @@ static int __exit omap_cf_remove(struct platform_device *pdev) static struct platform_driver omap_cf_driver = { .driver = { - .name = (char *) driver_name, + .name = driver_name, }, .remove = __exit_p(omap_cf_remove), }; diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c index 9e6922c08ef6..3b05760e69d6 100644 --- a/drivers/pcmcia/rsrc_nonstatic.c +++ b/drivers/pcmcia/rsrc_nonstatic.c @@ -1076,7 +1076,7 @@ static ssize_t show_io_db(struct device *dev, for (p = data->io_db.next; p != &data->io_db; p = p->next) { if (ret > (PAGE_SIZE - 10)) continue; - ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1), + ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1), "0x%08lx - 0x%08lx\n", ((unsigned long) p->base), ((unsigned long) p->base + p->num - 1)); @@ -1133,7 +1133,7 @@ static ssize_t show_mem_db(struct device *dev, p = p->next) { if (ret > (PAGE_SIZE - 10)) continue; - ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1), + ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1), "0x%08lx - 0x%08lx\n", ((unsigned long) p->base), ((unsigned long) p->base + p->num - 1)); @@ -1142,7 +1142,7 @@ static ssize_t show_mem_db(struct device *dev, for (p = data->mem_db.next; p != &data->mem_db; p = p->next) { if (ret > (PAGE_SIZE - 10)) continue; - ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1), + ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1), "0x%08lx - 0x%08lx\n", ((unsigned long) p->base), ((unsigned long) p->base + p->num - 1)); diff --git a/drivers/pcmcia/sa1100_simpad.c b/drivers/pcmcia/sa1100_simpad.c index e2e8729afd9d..784ada5b8c4f 100644 --- a/drivers/pcmcia/sa1100_simpad.c +++ b/drivers/pcmcia/sa1100_simpad.c @@ -14,7 +14,7 @@ #include #include #include "sa1100_generic.h" - + static int simpad_pcmcia_hw_init(struct soc_pcmcia_socket *skt) { @@ -66,7 +66,7 @@ simpad_pcmcia_configure_socket(struct soc_pcmcia_socket *skt, simpad_clear_cs3_bit(VCC_3V_EN|VCC_5V_EN|EN0|EN1); break; - case 33: + case 33: simpad_clear_cs3_bit(VCC_3V_EN|EN1); simpad_set_cs3_bit(VCC_5V_EN|EN0); break; @@ -95,7 +95,7 @@ static void simpad_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt) simpad_set_cs3_bit(PCMCIA_RESET); } -static struct pcmcia_low_level simpad_pcmcia_ops = { +static struct pcmcia_low_level simpad_pcmcia_ops = { .owner = THIS_MODULE, .hw_init = simpad_pcmcia_hw_init, .hw_shutdown = simpad_pcmcia_hw_shutdown, diff --git a/drivers/pcmcia/soc_common.h b/drivers/pcmcia/soc_common.h index b7f993f1bbd0..222e81c79365 100644 --- a/drivers/pcmcia/soc_common.h +++ b/drivers/pcmcia/soc_common.h @@ -88,7 +88,7 @@ struct soc_pcmcia_socket { struct skt_dev_info { int nskt; - struct soc_pcmcia_socket skt[0]; + struct soc_pcmcia_socket skt[]; }; struct pcmcia_state { diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c index 49b1c6a1bdbe..bf6529b0b5b0 100644 --- a/drivers/pcmcia/yenta_socket.c +++ b/drivers/pcmcia/yenta_socket.c @@ -180,12 +180,12 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri for (i = 0; i < 0x24; i += 4) { unsigned val; if (!(i & 15)) - offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); val = cb_readl(socket, i); - offset += snprintf(buf + offset, PAGE_SIZE - offset, " %08x", val); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %08x", val); } - offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:"); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:"); for (i = 0; i < 0x45; i++) { unsigned char val; if (!(i & 7)) { @@ -193,10 +193,10 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri memcpy(buf + offset, " -", 2); offset += 2; } else - offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i); } val = exca_readb(socket, i); - offset += snprintf(buf + offset, PAGE_SIZE - offset, " %02x", val); + offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %02x", val); } buf[offset++] = '\n'; return offset; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e6b8c49076bb..c070c0d8e3e9 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino) static char *follow_link(char *link) { - int len, n; char *name, *resolved, *end; + int n; name = __getname(); if (!name) { @@ -164,15 +164,13 @@ static char *follow_link(char *link) return name; *(end + 1) = '\0'; - len = strlen(link) + strlen(name) + 1; - resolved = kmalloc(len, GFP_KERNEL); + resolved = kasprintf(GFP_KERNEL, "%s%s", link, name); if (resolved == NULL) { n = -ENOMEM; goto out_free; } - sprintf(resolved, "%s%s", link, name); __putname(name); kfree(link); return resolved; @@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_d_op = &simple_dentry_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; - /* NULL is printed as by sprintf: avoid that. */ + /* NULL is printed as '(null)' by printf(): avoid that. */ if (req_root == NULL) req_root = ""; err = -ENOMEM; sb->s_fs_info = host_root_path = - kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL); + kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root); if (host_root_path == NULL) goto out; - sprintf(host_root_path, "%s/%s", root_ino, req_root); - root_inode = new_inode(sb); if (!root_inode) goto out; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 28b1a2b4459e..3a2ac7072dbb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -47,8 +47,8 @@ * A. IP checksum related features * * Drivers advertise checksum offload capabilities in the features of a device. - * From the stack's point of view these are capabilities offered by the driver, - * a driver typically only advertises features that it is capable of offloading + * From the stack's point of view these are capabilities offered by the driver. + * A driver typically only advertises features that it is capable of offloading * to its device. * * The checksum related features are: @@ -63,7 +63,7 @@ * TCP or UDP packets over IPv4. These are specifically * unencapsulated packets of the form IPv4|TCP or * IPv4|UDP where the Protocol field in the IPv4 header - * is TCP or UDP. The IPv4 header may contain IP options + * is TCP or UDP. The IPv4 header may contain IP options. * This feature cannot be set in features for a device * with NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). @@ -79,13 +79,13 @@ * DEPRECATED (see below). * * NETIF_F_RXCSUM - Driver (device) performs receive checksum offload. - * This flag is used only used to disable the RX checksum + * This flag is only used to disable the RX checksum * feature for a device. The stack will accept receive * checksum indication in packets received on a device * regardless of whether NETIF_F_RXCSUM is set. * * B. Checksumming of received packets by device. Indication of checksum - * verification is in set skb->ip_summed. Possible values are: + * verification is set in skb->ip_summed. Possible values are: * * CHECKSUM_NONE: * @@ -115,16 +115,16 @@ * the packet minus one that have been verified as CHECKSUM_UNNECESSARY. * For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet * and a device is able to verify the checksums for UDP (possibly zero), - * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to + * GRE (checksum flag is set) and TCP, skb->csum_level would be set to * two. If the device were only able to verify the UDP checksum and not - * GRE, either because it doesn't support GRE checksum of because GRE + * GRE, either because it doesn't support GRE checksum or because GRE * checksum is bad, skb->csum_level would be set to zero (TCP checksum is * not considered in this case). * * CHECKSUM_COMPLETE: * * This is the most generic way. The device supplied checksum of the _whole_ - * packet as seen by netif_rx() and fills out in skb->csum. Meaning, the + * packet as seen by netif_rx() and fills in skb->csum. This means the * hardware doesn't need to parse L3/L4 headers to implement this. * * Notes: @@ -153,8 +153,8 @@ * from skb->csum_start up to the end, and to record/write the checksum at * offset skb->csum_start + skb->csum_offset. A driver may verify that the * csum_start and csum_offset values are valid values given the length and - * offset of the packet, however they should not attempt to validate that the - * checksum refers to a legitimate transport layer checksum-- it is the + * offset of the packet, but it should not attempt to validate that the + * checksum refers to a legitimate transport layer checksum -- it is the * purview of the stack to validate that csum_start and csum_offset are set * correctly. * @@ -178,18 +178,18 @@ * * CHECKSUM_UNNECESSARY: * - * This has the same meaning on as CHECKSUM_NONE for checksum offload on + * This has the same meaning as CHECKSUM_NONE for checksum offload on * output. * * CHECKSUM_COMPLETE: * Not used in checksum output. If a driver observes a packet with this value - * set in skbuff, if should treat as CHECKSUM_NONE being set. + * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set. * * D. Non-IP checksum (CRC) offloads * * NETIF_F_SCTP_CRC - This feature indicates that a device is capable of * offloading the SCTP CRC in a packet. To perform this offload the stack - * will set set csum_start and csum_offset accordingly, set ip_summed to + * will set csum_start and csum_offset accordingly, set ip_summed to * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c. * A driver that supports both IP checksum offload and SCTP CRC32c offload @@ -200,10 +200,10 @@ * NETIF_F_FCOE_CRC - This feature indicates that a device is capable of * offloading the FCOE CRC in a packet. To perform this offload the stack * will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset - * accordingly. Note the there is no indication in the skbuff that the - * CHECKSUM_PARTIAL refers to an FCOE checksum, a driver that supports + * accordingly. Note that there is no indication in the skbuff that the + * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports * both IP checksum offload and FCOE CRC offload must verify which offload - * is configured for a packet presumably by inspecting packet headers. + * is configured for a packet, presumably by inspecting packet headers. * * E. Checksumming on output with GSO. * @@ -211,9 +211,9 @@ * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the * gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as * part of the GSO operation is implied. If a checksum is being offloaded - * with GSO then ip_summed is CHECKSUM_PARTIAL, csum_start and csum_offset - * are set to refer to the outermost checksum being offload (two offloaded - * checksums are possible with UDP encapsulation). + * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and + * csum_offset are set to refer to the outermost checksum being offloaded + * (two offloaded checksums are possible with UDP encapsulation). */ /* Don't change this without changing skb_csum_unnecessary! */ diff --git a/include/uapi/linux/um_timetravel.h b/include/uapi/linux/um_timetravel.h new file mode 100644 index 000000000000..ca3238222b6d --- /dev/null +++ b/include/uapi/linux/um_timetravel.h @@ -0,0 +1,128 @@ +/* + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Copyright (C) 2019 Intel Corporation + */ +#ifndef _UAPI_LINUX_UM_TIMETRAVEL_H +#define _UAPI_LINUX_UM_TIMETRAVEL_H +#include + +/** + * struct um_timetravel_msg - UM time travel message + * + * This is the basic message type, going in both directions. + * + * This is the message passed between the host (user-mode Linux instance) + * and the calendar (the application on the other side of the socket) in + * order to implement common scheduling. + * + * Whenever UML has an event it will request runtime for it from the + * calendar, and then wait for its turn until it can run, etc. Note + * that it will only ever request the single next runtime, i.e. multiple + * REQUEST messages override each other. + */ +struct um_timetravel_msg { + /** + * @op: operation value from &enum um_timetravel_ops + */ + __u32 op; + + /** + * @seq: sequence number for the message - shall be reflected in + * the ACK response, and should be checked while processing + * the response to see if it matches + */ + __u32 seq; + + /** + * @time: time in nanoseconds + */ + __u64 time; +}; + +/** + * enum um_timetravel_ops - Operation codes + */ +enum um_timetravel_ops { + /** + * @UM_TIMETRAVEL_ACK: response (ACK) to any previous message, + * this usually doesn't carry any data in the 'time' field + * unless otherwise specified below + */ + UM_TIMETRAVEL_ACK = 0, + + /** + * @UM_TIMETRAVEL_START: initialize the connection, the time + * field contains an (arbitrary) ID to possibly be able + * to distinguish the connections. + */ + UM_TIMETRAVEL_START = 1, + + /** + * @UM_TIMETRAVEL_REQUEST: request to run at the given time + * (host -> calendar) + */ + UM_TIMETRAVEL_REQUEST = 2, + + /** + * @UM_TIMETRAVEL_WAIT: Indicate waiting for the previously requested + * runtime, new requests may be made while waiting (e.g. due to + * interrupts); the time field is ignored. The calendar must process + * this message and later send a %UM_TIMETRAVEL_RUN message when + * the host can run again. + * (host -> calendar) + */ + UM_TIMETRAVEL_WAIT = 3, + + /** + * @UM_TIMETRAVEL_GET: return the current time from the calendar in the + * ACK message, the time in the request message is ignored + * (host -> calendar) + */ + UM_TIMETRAVEL_GET = 4, + + /** + * @UM_TIMETRAVEL_UPDATE: time update to the calendar, must be sent e.g. + * before kicking an interrupt to another calendar + * (host -> calendar) + */ + UM_TIMETRAVEL_UPDATE = 5, + + /** + * @UM_TIMETRAVEL_RUN: run time request granted, current time is in + * the time field + * (calendar -> host) + */ + UM_TIMETRAVEL_RUN = 6, + + /** + * @UM_TIMETRAVEL_FREE_UNTIL: Enable free-running until the given time, + * this is a message from the calendar telling the host that it can + * freely do its own scheduling for anything before the indicated + * time. + * Note that if a calendar sends this message once, the host may + * assume that it will also do so in the future, if it implements + * wraparound semantics for the time field. + * (calendar -> host) + */ + UM_TIMETRAVEL_FREE_UNTIL = 7, + + /** + * @UM_TIMETRAVEL_GET_TOD: Return time of day, typically used once at + * boot by the virtual machines to get a synchronized time from + * the simulation. + */ + UM_TIMETRAVEL_GET_TOD = 8, +}; + +#endif /* _UAPI_LINUX_UM_TIMETRAVEL_H */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5bf8d22a47ec..39d37d0ef575 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t) neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/2); + HZ/100); neigh_add_timer(neigh, next); immediate_probe = true; } else { @@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->nud_state = NUD_INCOMPLETE; atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, - jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); diff --git a/net/core/sock.c b/net/core/sock.c index da32d9b6d09f..ce1d8dce9b7a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) /* Sorry... */ ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5390ff541658..e94eb1aac602 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1338,7 +1338,7 @@ static void dsa_hw_port_list_free(struct list_head *hw_port_list) } /* Make the hardware datapath to/from @dev limited to a common MTU */ -void dsa_bridge_mtu_normalization(struct dsa_port *dp) +static void dsa_bridge_mtu_normalization(struct dsa_port *dp) { struct list_head hw_port_list; struct dsa_switch_tree *dst; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ad73d59a40e1..6e7684af6a9a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1359,7 +1359,7 @@ retry: regen_advance = idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; /* recalculate max_desync_factor each time and update * idev->desync_factor if it's larger @@ -3320,6 +3320,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) if (netif_is_l3_master(idev->dev)) return; + /* no link local addresses on devices flagged as slaves */ + if (idev->dev->flags & IFF_SLAVE) + return; + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); switch (idev->cnf.addr_gen_mode) { @@ -4139,7 +4143,8 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_probes--; addrconf_mod_dad_work(ifp, - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), + HZ/100)); spin_unlock(&ifp->lock); write_unlock_bh(&idev->lock); @@ -4545,7 +4550,7 @@ restart: !(ifp->flags&IFA_F_TENTATIVE)) { unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * ifp->idev->cnf.dad_transmits * - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; if (age >= ifp->prefered_lft - regen_advance) { struct inet6_ifaddr *ifpub = ifp->ifpub; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 6ffa153e5166..1ecd4e9b0bdf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1359,8 +1359,8 @@ skip_defrtr: if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; - if (rtime < HZ/10) - rtime = HZ/10; + if (rtime < HZ/100) + rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index dc4f20e23bf7..d38b476fc7f2 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -48,7 +48,7 @@ void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = 0; outhdr->cmpre = 0; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr, ipv6_rpl_segdata_pos(inhdr, i), inhdr->cmpri); @@ -66,7 +66,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr int i; for (plen = 0; plen < sizeof(*daddr); plen++) { - for (i = 0; i <= n; i++) { + for (i = 0; i < n; i++) { if (daddr->s6_addr[plen] != inhdr->rpl_segaddr[i].s6_addr[plen]) return plen; @@ -114,7 +114,7 @@ void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = cmpri; outhdr->cmpre = cmpre; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i), &inhdr->rpl_segaddr[i], cmpri); diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index a49ddc6cd020..c3ececd7cfc1 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -210,7 +210,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct rpl_lwt *rlwt; - int err = -EINVAL; + int err; rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bd220ee4aac9..faf57585b892 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 - 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 064639f72487..977d9c8b1453 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -3,6 +3,8 @@ * * Copyright (c) 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a0ce7f324499..86d61ab34c7c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -4,6 +4,8 @@ * Copyright (c) 2020, Red Hat, Inc. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include #include #include diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1833bc1f4a43..939a5045181a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -57,10 +57,43 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk) return msk->first && !sk_is_mptcp(msk->first); } +static struct socket *mptcp_is_tcpsk(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sock->sk != sk) + return NULL; + + if (unlikely(sk->sk_prot == &tcp_prot)) { + /* we are being invoked after mptcp_accept() has + * accepted a non-mp-capable flow: sk is a tcp_sk, + * not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + sock->ops = &inet_stream_ops; + return sock; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { + sock->ops = &inet6_stream_ops; + return sock; +#endif + } + + return NULL; +} + static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk) { + struct socket *sock; + sock_owned_by_me((const struct sock *)msk); + sock = mptcp_is_tcpsk((struct sock *)msk); + if (unlikely(sock)) + return sock; + if (likely(!__mptcp_needs_tcp_fallback(msk))) return NULL; @@ -84,6 +117,10 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state) struct socket *ssock; int err; + ssock = __mptcp_tcp_fallback(msk); + if (unlikely(ssock)) + return ssock; + ssock = __mptcp_nmpc_socket(msk); if (ssock) goto set_state; @@ -121,6 +158,27 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->offset = offset; } +/* both sockets must be locked */ +static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk, + struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u64 dsn = mptcp_subflow_get_mapped_dsn(subflow); + + /* revalidate data sequence number. + * + * mptcp_subflow_data_available() is usually called + * without msk lock. Its unlikely (but possible) + * that msk->ack_seq has been advanced since the last + * call found in-sequence data. + */ + if (likely(dsn == msk->ack_seq)) + return true; + + subflow->data_avail = 0; + return mptcp_subflow_data_available(ssk); +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sock *ssk, unsigned int *bytes) @@ -132,6 +190,11 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct tcp_sock *tp; bool done = false; + if (!mptcp_subflow_dsn_valid(msk, ssk)) { + *bytes = 0; + return false; + } + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf); @@ -290,6 +353,15 @@ void mptcp_data_acked(struct sock *sk) sock_hold(sk); } +void mptcp_subflow_eof(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) && + schedule_work(&msk->work)) + sock_hold(sk); +} + static void mptcp_stop_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -994,6 +1066,27 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) return 0; } +static void mptcp_check_for_eof(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + int receivers = 0; + + mptcp_for_each_subflow(msk, subflow) + receivers += !subflow->rx_eof; + + if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + /* hopefully temporary hack: propagate shutdown status + * to msk, when all subflows agree on it + */ + sk->sk_shutdown |= RCV_SHUTDOWN; + + smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ + set_bit(MPTCP_DATA_READY, &msk->flags); + sk->sk_data_ready(sk); + } +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1010,6 +1103,9 @@ static void mptcp_worker(struct work_struct *work) __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) + mptcp_check_for_eof(msk); + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) goto unlock; @@ -1752,7 +1848,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, msk = mptcp_sk(sk); lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); + ssock = __mptcp_tcp_fallback(msk); + if (!ssock) + ssock = __mptcp_nmpc_socket(msk); if (ssock) { mask = ssock->ops->poll(file, ssock, wait); release_sock(sk); @@ -1762,9 +1860,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, release_sock(sk); sock_poll_wait(file, sock, wait); lock_sock(sk); - ssock = __mptcp_tcp_fallback(msk); - if (unlikely(ssock)) - return ssock->ops->poll(file, ssock, NULL); if (test_bit(MPTCP_DATA_READY, &msk->flags)) mask = EPOLLIN | EPOLLRDNORM; @@ -1783,11 +1878,17 @@ static int mptcp_shutdown(struct socket *sock, int how) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct mptcp_subflow_context *subflow; + struct socket *ssock; int ret = 0; pr_debug("sk=%p, how=%d", msk, how); lock_sock(sock->sk); + ssock = __mptcp_tcp_fallback(msk); + if (ssock) { + release_sock(sock->sk); + return inet_shutdown(ssock, how); + } if (how == SHUT_WR || how == SHUT_RDWR) inet_sk_state_store(sock->sk, TCP_FIN_WAIT1); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f733c5425552..67448002a2d7 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -89,6 +89,7 @@ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 #define MPTCP_WORK_RTX 2 +#define MPTCP_WORK_EOF 3 static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) { @@ -339,6 +340,7 @@ void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); void mptcp_data_acked(struct sock *sk); +void mptcp_subflow_eof(struct sock *sk); int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(u32 token); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index b5180c81588e..50a8bea987c6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -994,8 +994,7 @@ static void subflow_state_change(struct sock *sk) if (!(parent->sk_shutdown & RCV_SHUTDOWN) && !subflow->rx_eof && subflow_is_done(sk)) { subflow->rx_eof = 1; - parent->sk_shutdown |= RCV_SHUTDOWN; - __subflow_state_change(parent); + mptcp_subflow_eof(parent); } } diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 129a5ad1bc35..33352dd99d4d 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -40,7 +40,7 @@ static int token_used __read_mostly; /** * mptcp_token_new_request - create new key/idsn/token for subflow_request - * @req - the request socket + * @req: the request socket * * This function is called when a new mptcp connection is coming in. * @@ -80,7 +80,7 @@ int mptcp_token_new_request(struct request_sock *req) /** * mptcp_token_new_connect - create new key/idsn/token for subflow - * @sk - the socket that will initiate a connection + * @sk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. @@ -125,6 +125,7 @@ int mptcp_token_new_connect(struct sock *sk) /** * mptcp_token_new_accept - insert token for later processing * @token: the token to insert to the tree + * @conn: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. @@ -169,7 +170,7 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) /** * mptcp_token_destroy_request - remove mptcp connection/token - * @token - token of mptcp connection to remove + * @token: token of mptcp connection to remove * * Remove not-yet-fully-established incoming connection identified * by @token. @@ -183,7 +184,7 @@ void mptcp_token_destroy_request(u32 token) /** * mptcp_token_destroy - remove mptcp connection/token - * @token - token of mptcp connection to remove + * @token: token of mptcp connection to remove * * Remove the connection identified by @token. */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index fd8a01ca7a2d..2398d7238300 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old, struct hlist_head *head = &old->buckets[i]; if (ufid) - hlist_for_each_entry(flow, head, - ufid_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + ufid_table.node[old_ver], + lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else - hlist_for_each_entry(flow, head, - flow_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + flow_table.node[old_ver], + lockdep_ovsl_is_held()) table_instance_insert(new, flow); } diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9904299424a1..61e95029c18f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -26,9 +27,12 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ +struct tcindex_data; + struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct tcindex_data *p; struct rcu_work rwork; }; @@ -49,6 +53,7 @@ struct tcindex_data { u32 hash; /* hash table size; 0 if undefined */ u32 alloc_hash; /* allocated size */ u32 fall_through; /* 0: only classify if explicit match */ + refcount_t refcnt; /* a temporary refcnt for perfect hash */ struct rcu_work rwork; }; @@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) return tcf_exts_has_actions(&r->exts) || r->res.classid; } +static void tcindex_data_get(struct tcindex_data *p) +{ + refcount_inc(&p->refcnt); +} + +static void tcindex_data_put(struct tcindex_data *p) +{ + if (refcount_dec_and_test(&p->refcnt)) { + kfree(p->perfect); + kfree(p->h); + kfree(p); + } +} + static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, u16 key) { @@ -132,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp) p->mask = 0xffff; p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; + refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ rcu_assign_pointer(tp->root, p); return 0; @@ -141,6 +161,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) { tcf_exts_destroy(&r->exts); tcf_exts_put_net(&r->exts); + tcindex_data_put(r->p); } static void tcindex_destroy_rexts_work(struct work_struct *work) @@ -212,6 +233,8 @@ found: else __tcindex_destroy_fexts(f); } else { + tcindex_data_get(p); + if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); else @@ -228,9 +251,7 @@ static void tcindex_destroy_work(struct work_struct *work) struct tcindex_data, rwork); - kfree(p->perfect); - kfree(p->h); - kfree(p); + tcindex_data_put(p); } static inline int @@ -248,9 +269,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { }; static int tcindex_filter_result_init(struct tcindex_filter_result *r, + struct tcindex_data *p, struct net *net) { memset(r, 0, sizeof(*r)); + r->p = p; return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } @@ -290,6 +313,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) goto errout; + cp->perfect[i].p = cp; } return 0; @@ -334,6 +358,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->alloc_hash = p->alloc_hash; cp->fall_through = p->fall_through; cp->tp = tp; + refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ if (tb[TCA_TCINDEX_HASH]) cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -366,7 +391,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } cp->h = p->h; - err = tcindex_filter_result_init(&new_filter_result, net); + err = tcindex_filter_result_init(&new_filter_result, cp, net); if (err < 0) goto errout_alloc; if (old_r) @@ -434,7 +459,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, goto errout_alloc; f->key = handle; f->next = NULL; - err = tcindex_filter_result_init(&f->result, net); + err = tcindex_filter_result_init(&f->result, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -447,7 +472,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, net); + err = tcindex_filter_result_init(old_r, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -571,6 +596,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, for (i = 0; i < p->hash; i++) { struct tcindex_filter_result *r = p->perfect + i; + /* tcf_queue_work() does not guarantee the ordering we + * want, so we have to take this refcnt temporarily to + * ensure 'p' is freed after all tcindex_filter_result + * here. Imperfect hash does not need this, because it + * uses linked lists rather than an array. + */ + tcindex_data_get(p); + tcf_unbind_filter(tp, &r->res); if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork,