diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index d1bf77ef3bc1..c0fe711f14d3 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -434,6 +434,8 @@ prototypes:: void (*lm_break)(struct file_lock *); /* break_lease callback */ int (*lm_change)(struct file_lock **, int); bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *); + void (*lm_expire_lock)(void); locking rules: @@ -445,6 +447,8 @@ lm_grant: no no no lm_break: yes no no lm_change yes no no lm_breaker_owns_lease: yes no no +lm_lock_expirable yes no no +lm_expire_lock no no yes ====================== ============= ================= ========= buffer_head diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index e0a60a27e14d..9e3653253ef2 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -36,8 +36,10 @@ config ARC select HAVE_KERNEL_LZMA select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_MOD_ARCH_SPECIFIC select HAVE_PERF_EVENTS + select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN select MODULES_USE_ELF_RELA select OF diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 4c919c0f4b30..d5719a260864 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -63,4 +63,8 @@ struct arc_reg_cc_build { #define PERF_COUNT_ARC_HW_MAX (PERF_COUNT_HW_MAX + 8) +#ifdef CONFIG_PERF_EVENTS +#define perf_arch_bpf_user_pt_regs(regs) (struct user_regs_struct *)regs +#endif + #endif /* __ASM_PERF_EVENT_H */ diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h index cca8d6583e31..5869a74c0db2 100644 --- a/arch/arc/include/asm/ptrace.h +++ b/arch/arc/include/asm/ptrace.h @@ -8,6 +8,7 @@ #define __ASM_ARC_PTRACE_H #include +#include #ifndef __ASSEMBLY__ @@ -54,6 +55,9 @@ struct pt_regs { unsigned long user_r25; }; + +#define MAX_REG_OFFSET offsetof(struct pt_regs, user_r25) + #else struct pt_regs { @@ -102,6 +106,8 @@ struct pt_regs { unsigned long status32; }; +#define MAX_REG_OFFSET offsetof(struct pt_regs, status32) + #endif /* Callee saved registers - need to be saved only when you are scheduled out */ @@ -154,6 +160,27 @@ static inline void instruction_pointer_set(struct pt_regs *regs, { instruction_pointer(regs) = val; } + +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + return regs->sp; +} + +extern int regs_query_register_offset(const char *name); +extern const char *regs_query_register_name(unsigned int offset); +extern bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr); +extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, + unsigned int n); + +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + + return *(unsigned long *)((unsigned long)regs + offset); +} + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PTRACE_H */ diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index 94529e89dff0..9709256e31c8 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -12,6 +12,8 @@ #include #include /* in_syscall() */ +extern void *sys_call_table[]; + static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 1e0b2e3914d5..6ba7fe417095 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -78,9 +78,9 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ #define TIF_NOTIFY_SIGNAL 5 /* signal notifications exist */ #define TIF_SYSCALL_TRACE 15 /* syscall trace active */ - /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 16 +#define TIF_SYSCALL_TRACEPOINT 17 /* syscall tracepoint instrumentation */ #define _TIF_SYSCALL_TRACE (1< + +typedef struct user_regs_struct bpf_user_pt_regs_t; + +#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */ diff --git a/arch/arc/kernel/disasm.c b/arch/arc/kernel/disasm.c index 1e1db51b6941..ccc7e8c39eb3 100644 --- a/arch/arc/kernel/disasm.c +++ b/arch/arc/kernel/disasm.c @@ -434,14 +434,31 @@ long __kprobes get_reg(int reg, struct pt_regs *regs, { long *p; +#if defined(CONFIG_ISA_ARCOMPACT) if (reg <= 12) { p = ®s->r0; return p[-reg]; } +#else /* CONFIG_ISA_ARCV2 */ + if (reg <= 11) { + p = ®s->r0; + return p[reg]; + } + if (reg == 12) + return regs->r12; + if (reg == 30) + return regs->r30; +#ifdef CONFIG_ARC_HAS_ACCL_REGS + if (reg == 58) + return regs->r58; + if (reg == 59) + return regs->r59; +#endif +#endif if (cregs && (reg <= 25)) { p = &cregs->r13; - return p[13-reg]; + return p[13 - reg]; } if (reg == 26) @@ -461,6 +478,7 @@ void __kprobes set_reg(int reg, long val, struct pt_regs *regs, { long *p; +#if defined(CONFIG_ISA_ARCOMPACT) switch (reg) { case 0 ... 12: p = ®s->r0; @@ -469,7 +487,7 @@ void __kprobes set_reg(int reg, long val, struct pt_regs *regs, case 13 ... 25: if (cregs) { p = &cregs->r13; - p[13-reg] = val; + p[13 - reg] = val; } break; case 26: @@ -487,6 +505,48 @@ void __kprobes set_reg(int reg, long val, struct pt_regs *regs, default: break; } +#else /* CONFIG_ISA_ARCV2 */ + switch (reg) { + case 0 ... 11: + p = ®s->r0; + p[reg] = val; + break; + case 12: + regs->r12 = val; + break; + case 13 ... 25: + if (cregs) { + p = &cregs->r13; + p[13 - reg] = val; + } + break; + case 26: + regs->r26 = val; + break; + case 27: + regs->fp = val; + break; + case 28: + regs->sp = val; + break; + case 30: + regs->r30 = val; + break; + case 31: + regs->blink = val; + break; +#ifdef CONFIG_ARC_HAS_ACCL_REGS + case 58: + regs->r58 = val; + break; + case 59: + regs->r59 = val; + break; +#endif + default: + break; + } +#endif } /* diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 66ba549b520f..54e91df678dd 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -29,8 +29,8 @@ ENTRY(sys_clone_wrapper) DISCARD_CALLEE_SAVED_USER GET_CURR_THR_INFO_FLAGS r10 - btst r10, TIF_SYSCALL_TRACE - bnz tracesys_exit + and.f 0, r10, _TIF_SYSCALL_WORK + bnz tracesys_exit b .Lret_from_system_call END(sys_clone_wrapper) @@ -41,8 +41,8 @@ ENTRY(sys_clone3_wrapper) DISCARD_CALLEE_SAVED_USER GET_CURR_THR_INFO_FLAGS r10 - btst r10, TIF_SYSCALL_TRACE - bnz tracesys_exit + and.f 0, r10, _TIF_SYSCALL_WORK + bnz tracesys_exit b .Lret_from_system_call END(sys_clone3_wrapper) @@ -247,8 +247,8 @@ ENTRY(EV_Trap) ; If syscall tracing ongoing, invoke pre-post-hooks GET_CURR_THR_INFO_FLAGS r10 - btst r10, TIF_SYSCALL_TRACE - bnz tracesys ; this never comes back + and.f 0, r10, _TIF_SYSCALL_WORK + bnz tracesys ; this never comes back ;============ Normal syscall case diff --git a/arch/arc/kernel/ptrace.c b/arch/arc/kernel/ptrace.c index 54b419ac8bda..da7542cea0d8 100644 --- a/arch/arc/kernel/ptrace.c +++ b/arch/arc/kernel/ptrace.c @@ -9,6 +9,92 @@ #include #include +#define CREATE_TRACE_POINTS +#include + +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +#ifdef CONFIG_ISA_ARCOMPACT +static const struct pt_regs_offset regoffset_table[] = { + REG_OFFSET_NAME(bta), + REG_OFFSET_NAME(lp_start), + REG_OFFSET_NAME(lp_end), + REG_OFFSET_NAME(lp_count), + REG_OFFSET_NAME(status32), + REG_OFFSET_NAME(ret), + REG_OFFSET_NAME(blink), + REG_OFFSET_NAME(fp), + REG_OFFSET_NAME(r26), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), + REG_OFFSET_NAME(r7), + REG_OFFSET_NAME(r6), + REG_OFFSET_NAME(r5), + REG_OFFSET_NAME(r4), + REG_OFFSET_NAME(r3), + REG_OFFSET_NAME(r2), + REG_OFFSET_NAME(r1), + REG_OFFSET_NAME(r0), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(orig_r0), + REG_OFFSET_NAME(event), + REG_OFFSET_NAME(user_r25), + REG_OFFSET_END, +}; + +#else + +static const struct pt_regs_offset regoffset_table[] = { + REG_OFFSET_NAME(orig_r0), + REG_OFFSET_NAME(event), + REG_OFFSET_NAME(bta), + REG_OFFSET_NAME(user_r25), + REG_OFFSET_NAME(r26), + REG_OFFSET_NAME(fp), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r30), +#ifdef CONFIG_ARC_HAS_ACCL_REGS + REG_OFFSET_NAME(r58), + REG_OFFSET_NAME(r59), +#endif +#ifdef CONFIG_ARC_DSP_SAVE_RESTORE_REGS + REG_OFFSET_NAME(DSP_CTRL), +#endif + REG_OFFSET_NAME(r0), + REG_OFFSET_NAME(r1), + REG_OFFSET_NAME(r2), + REG_OFFSET_NAME(r3), + REG_OFFSET_NAME(r4), + REG_OFFSET_NAME(r5), + REG_OFFSET_NAME(r6), + REG_OFFSET_NAME(r7), + REG_OFFSET_NAME(r8), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(blink), + REG_OFFSET_NAME(lp_end), + REG_OFFSET_NAME(lp_start), + REG_OFFSET_NAME(lp_count), + REG_OFFSET_NAME(ei), + REG_OFFSET_NAME(ldi), + REG_OFFSET_NAME(jli), + REG_OFFSET_NAME(ret), + REG_OFFSET_NAME(status32), + REG_OFFSET_END, +}; +#endif + static struct callee_regs *task_callee_regs(struct task_struct *tsk) { struct callee_regs *tmp = (struct callee_regs *)tsk->thread.callee_reg; @@ -257,13 +343,61 @@ long arch_ptrace(struct task_struct *child, long request, asmlinkage int syscall_trace_entry(struct pt_regs *regs) { - if (ptrace_report_syscall_entry(regs)) - return ULONG_MAX; + if (test_thread_flag(TIF_SYSCALL_TRACE)) + if (ptrace_report_syscall_entry(regs)) + return ULONG_MAX; + +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, syscall_get_nr(current, regs)); +#endif return regs->r8; } asmlinkage void syscall_trace_exit(struct pt_regs *regs) { - ptrace_report_syscall_exit(regs, 0); + if (test_thread_flag(TIF_SYSCALL_TRACE)) + ptrace_report_syscall_exit(regs, 0); + +#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_exit(regs, regs_return_value(regs)); +#endif +} + +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + return (addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)); +} + +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; } diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 99f0e4a4cbbd..e814df4c483c 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -113,6 +113,59 @@ config OPENRISC_HAVE_INST_DIV default y help Select this if your implementation has a hardware divide instruction + +config OPENRISC_HAVE_INST_CMOV + bool "Have instruction l.cmov for conditional move" + default n + help + This config enables gcc to generate l.cmov instructions when compiling + the kernel which in general will improve performance and reduce the + binary size. + + Select this if your implementation has support for the Class II + l.cmov conistional move instruction. + + Say N if you are unsure. + +config OPENRISC_HAVE_INST_ROR + bool "Have instruction l.ror for rotate right" + default n + help + This config enables gcc to generate l.ror instructions when compiling + the kernel which in general will improve performance and reduce the + binary size. + + Select this if your implementation has support for the Class II + l.ror rotate right instruction. + + Say N if you are unsure. + +config OPENRISC_HAVE_INST_RORI + bool "Have instruction l.rori for rotate right with immediate" + default n + help + This config enables gcc to generate l.rori instructions when compiling + the kernel which in general will improve performance and reduce the + binary size. + + Select this if your implementation has support for the Class II + l.rori rotate right with immediate instruction. + + Say N if you are unsure. + +config OPENRISC_HAVE_INST_SEXT + bool "Have instructions l.ext* for sign extension" + default n + help + This config enables gcc to generate l.ext* instructions when compiling + the kernel which in general will improve performance and reduce the + binary size. + + Select this if your implementation has support for the Class II + l.exths, l.extbs, l.exthz and l.extbz size extend instructions. + + Say N if you are unsure. + endmenu config NR_CPUS diff --git a/arch/openrisc/Makefile b/arch/openrisc/Makefile index 760b734fb822..b446510173cd 100644 --- a/arch/openrisc/Makefile +++ b/arch/openrisc/Makefile @@ -21,6 +21,7 @@ OBJCOPYFLAGS := -O binary -R .note -R .comment -S LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) KBUILD_CFLAGS += -pipe -ffixed-r10 -D__linux__ +KBUILD_CFLAGS += -msfimm -mshftimm all: vmlinux.bin @@ -38,6 +39,22 @@ else KBUILD_CFLAGS += $(call cc-option,-msoft-div) endif +ifeq ($(CONFIG_OPENRISC_HAVE_INST_CMOV),y) + KBUILD_CFLAGS += $(call cc-option,-mcmov) +endif + +ifeq ($(CONFIG_OPENRISC_HAVE_INST_ROR),y) + KBUILD_CFLAGS += $(call cc-option,-mror) +endif + +ifeq ($(CONFIG_OPENRISC_HAVE_INST_RORI),y) + KBUILD_CFLAGS += $(call cc-option,-mrori) +endif + +ifeq ($(CONFIG_OPENRISC_HAVE_INST_SEXT),y) + KBUILD_CFLAGS += $(call cc-option,-msext) +endif + head-y := arch/openrisc/kernel/head.o libs-y += $(LIBGCC) diff --git a/arch/openrisc/configs/or1klitex_defconfig b/arch/openrisc/configs/or1klitex_defconfig index d695879a4d26..d3fb964b4f85 100644 --- a/arch/openrisc/configs/or1klitex_defconfig +++ b/arch/openrisc/configs/or1klitex_defconfig @@ -1,22 +1,54 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_CGROUPS=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SGETMASK_SYSCALL=y CONFIG_EMBEDDED=y CONFIG_OPENRISC_BUILTIN_DTB="or1klitex" CONFIG_HZ_100=y +CONFIG_OPENRISC_HAVE_SHADOW_GPRS=y CONFIG_NET=y CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y CONFIG_UNIX=y +CONFIG_UNIX_DIAG=y CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_INET_UDP_DIAG=y +CONFIG_INET_RAW_DIAG=y +# CONFIG_WIRELESS is not set +# CONFIG_ETHTOOL_NETLINK is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y CONFIG_OF_OVERLAY=y CONFIG_NETDEVICES=y CONFIG_LITEX_LITEETH=y +# CONFIG_WLAN is not set CONFIG_SERIAL_LITEUART=y CONFIG_SERIAL_LITEUART_CONSOLE=y CONFIG_TTY_PRINTK=y +# CONFIG_GPIO_CDEV is not set +CONFIG_MMC=y +CONFIG_MMC_LITEX=y +# CONFIG_VHOST_MENU is not set +# CONFIG_IOMMU_SUPPORT is not set CONFIG_LITEX_SOC_CONTROLLER=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_EXFAT_FS=y CONFIG_TMPFS=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity,bpf" CONFIG_PRINTK_TIME=y CONFIG_PANIC_ON_OOPS=y CONFIG_SOFTLOCKUP_DETECTOR=y diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index 3ca1b1f490b9..54a87bba35ca 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -601,7 +601,7 @@ UNHANDLED_EXCEPTION(_vector_0xb00,0xb00) */ _string_syscall_return: - .string "syscall return %ld \n\r\0" + .string "syscall r9:0x%08x -> syscall(%ld) return %ld\0" .align 4 ENTRY(_sys_call_handler) @@ -679,15 +679,25 @@ _syscall_return: _syscall_debug: l.movhi r3,hi(_string_syscall_return) l.ori r3,r3,lo(_string_syscall_return) - l.ori r27,r0,1 + l.ori r27,r0,2 l.sw -4(r1),r27 l.sw -8(r1),r11 - l.addi r1,r1,-8 + l.lwz r29,PT_ORIG_GPR11(r1) + l.sw -12(r1),r29 + l.lwz r29,PT_GPR9(r1) + l.sw -16(r1),r29 l.movhi r27,hi(_printk) l.ori r27,r27,lo(_printk) l.jalr r27 - l.nop - l.addi r1,r1,8 + l.addi r1,r1,-16 + l.addi r1,r1,16 +#endif +#if 0 +_syscall_show_regs: + l.movhi r27,hi(show_registers) + l.ori r27,r27,lo(show_registers) + l.jalr r27 + l.or r3,r1,r1 #endif _syscall_check_trace_leave: diff --git a/arch/openrisc/kernel/head.S b/arch/openrisc/kernel/head.S index 2fa6cefa62ca..e11699f3d6bd 100644 --- a/arch/openrisc/kernel/head.S +++ b/arch/openrisc/kernel/head.S @@ -297,19 +297,23 @@ /* temporary store r3, r9 into r1, r10 */ ;\ l.addi r1,r3,0x0 ;\ l.addi r10,r9,0x0 ;\ - /* the string referenced by r3 must be low enough */ ;\ + LOAD_SYMBOL_2_GPR(r9,_string_unhandled_exception) ;\ + tophys (r3,r9) ;\ l.jal _emergency_print ;\ - l.ori r3,r0,lo(_string_unhandled_exception) ;\ + l.nop ;\ l.mfspr r3,r0,SPR_NPC ;\ l.jal _emergency_print_nr ;\ - l.andi r3,r3,0x1f00 ;\ - /* the string referenced by r3 must be low enough */ ;\ + l.andi r3,r3,0x1f00 ;\ + LOAD_SYMBOL_2_GPR(r9,_string_epc_prefix) ;\ + tophys (r3,r9) ;\ l.jal _emergency_print ;\ - l.ori r3,r0,lo(_string_epc_prefix) ;\ + l.nop ;\ l.jal _emergency_print_nr ;\ - l.mfspr r3,r0,SPR_EPCR_BASE ;\ + l.mfspr r3,r0,SPR_EPCR_BASE ;\ + LOAD_SYMBOL_2_GPR(r9,_string_nl) ;\ + tophys (r3,r9) ;\ l.jal _emergency_print ;\ - l.ori r3,r0,lo(_string_nl) ;\ + l.nop ;\ /* end of printing */ ;\ l.addi r3,r1,0x0 ;\ l.addi r9,r10,0x0 ;\ @@ -1330,215 +1334,63 @@ i_pte_not_present: /* =================================================[ debugging aids ]=== */ - .align 64 -_immu_trampoline: - .space 64 -_immu_trampoline_top: +/* + * DESC: Prints ASCII character stored in r7 + * + * PRMS: r7 - a 32-bit value with an ASCII character in the first byte + * position. + * + * PREQ: The UART at UART_BASE_ADD has to be initialized + * + * POST: internally used but restores: + * r4 - to store UART_BASE_ADD + * r5 - for loading OFF_TXFULL / THRE,TEMT + * r6 - for storing bitmask (SERIAL_8250) + */ +ENTRY(_emergency_putc) + EMERGENCY_PRINT_STORE_GPR4 + EMERGENCY_PRINT_STORE_GPR5 + EMERGENCY_PRINT_STORE_GPR6 -#define TRAMP_SLOT_0 (0x0) -#define TRAMP_SLOT_1 (0x4) -#define TRAMP_SLOT_2 (0x8) -#define TRAMP_SLOT_3 (0xc) -#define TRAMP_SLOT_4 (0x10) -#define TRAMP_SLOT_5 (0x14) -#define TRAMP_FRAME_SIZE (0x18) + l.movhi r4,hi(UART_BASE_ADD) + l.ori r4,r4,lo(UART_BASE_ADD) -ENTRY(_immu_trampoline_workaround) - // r2 EEA - // r6 is physical EEA - tophys(r6,r2) +#if defined(CONFIG_SERIAL_LITEUART) + /* Check OFF_TXFULL status */ +1: l.lwz r5,4(r4) + l.andi r5,r5,0xff + l.sfnei r5,0 + l.bf 1b + l.nop - LOAD_SYMBOL_2_GPR(r5,_immu_trampoline) - tophys (r3,r5) // r3 is trampoline (physical) + /* Write character */ + l.andi r7,r7,0xff + l.sw 0(r4),r7 +#elif defined(CONFIG_SERIAL_8250) + /* Check UART LSR THRE (hold) bit */ + l.addi r6,r0,0x20 +1: l.lbz r5,5(r4) + l.andi r5,r5,0x20 + l.sfeq r5,r6 + l.bnf 1b + l.nop - LOAD_SYMBOL_2_GPR(r4,0x15000000) - l.sw TRAMP_SLOT_0(r3),r4 - l.sw TRAMP_SLOT_1(r3),r4 - l.sw TRAMP_SLOT_4(r3),r4 - l.sw TRAMP_SLOT_5(r3),r4 - - // EPC = EEA - 0x4 - l.lwz r4,0x0(r6) // load op @ EEA + 0x0 (fc address) - l.sw TRAMP_SLOT_3(r3),r4 // store it to _immu_trampoline_data - l.lwz r4,-0x4(r6) // load op @ EEA - 0x4 (f8 address) - l.sw TRAMP_SLOT_2(r3),r4 // store it to _immu_trampoline_data - - l.srli r5,r4,26 // check opcode for write access - l.sfeqi r5,0 // l.j - l.bf 0f - l.sfeqi r5,0x11 // l.jr - l.bf 1f - l.sfeqi r5,1 // l.jal - l.bf 2f - l.sfeqi r5,0x12 // l.jalr - l.bf 3f - l.sfeqi r5,3 // l.bnf - l.bf 4f - l.sfeqi r5,4 // l.bf - l.bf 5f -99: - l.nop - l.j 99b // should never happen - l.nop 1 - - // r2 is EEA - // r3 is trampoline address (physical) - // r4 is instruction - // r6 is physical(EEA) - // - // r5 - -2: // l.jal - - /* 19 20 aa aa l.movhi r9,0xaaaa - * a9 29 bb bb l.ori r9,0xbbbb - * - * where 0xaaaabbbb is EEA + 0x4 shifted right 2 - */ - - l.addi r6,r2,0x4 // this is 0xaaaabbbb - - // l.movhi r9,0xaaaa - l.ori r5,r0,0x1920 // 0x1920 == l.movhi r9 - l.sh (TRAMP_SLOT_0+0x0)(r3),r5 - l.srli r5,r6,16 - l.sh (TRAMP_SLOT_0+0x2)(r3),r5 - - // l.ori r9,0xbbbb - l.ori r5,r0,0xa929 // 0xa929 == l.ori r9 - l.sh (TRAMP_SLOT_1+0x0)(r3),r5 - l.andi r5,r6,0xffff - l.sh (TRAMP_SLOT_1+0x2)(r3),r5 - - /* falthrough, need to set up new jump offset */ - - -0: // l.j - l.slli r6,r4,6 // original offset shifted left 6 - 2 -// l.srli r6,r6,6 // original offset shifted right 2 - - l.slli r4,r2,4 // old jump position: EEA shifted left 4 -// l.srli r4,r4,6 // old jump position: shifted right 2 - - l.addi r5,r3,0xc // new jump position (physical) - l.slli r5,r5,4 // new jump position: shifted left 4 - - // calculate new jump offset - // new_off = old_off + (old_jump - new_jump) - - l.sub r5,r4,r5 // old_jump - new_jump - l.add r5,r6,r5 // orig_off + (old_jump - new_jump) - l.srli r5,r5,6 // new offset shifted right 2 - - // r5 is new jump offset - // l.j has opcode 0x0... - l.sw TRAMP_SLOT_2(r3),r5 // write it back - - l.j trampoline_out - l.nop - -/* ----------------------------- */ - -3: // l.jalr - - /* 19 20 aa aa l.movhi r9,0xaaaa - * a9 29 bb bb l.ori r9,0xbbbb - * - * where 0xaaaabbbb is EEA + 0x4 shifted right 2 - */ - - l.addi r6,r2,0x4 // this is 0xaaaabbbb - - // l.movhi r9,0xaaaa - l.ori r5,r0,0x1920 // 0x1920 == l.movhi r9 - l.sh (TRAMP_SLOT_0+0x0)(r3),r5 - l.srli r5,r6,16 - l.sh (TRAMP_SLOT_0+0x2)(r3),r5 - - // l.ori r9,0xbbbb - l.ori r5,r0,0xa929 // 0xa929 == l.ori r9 - l.sh (TRAMP_SLOT_1+0x0)(r3),r5 - l.andi r5,r6,0xffff - l.sh (TRAMP_SLOT_1+0x2)(r3),r5 - - l.lhz r5,(TRAMP_SLOT_2+0x0)(r3) // load hi part of jump instruction - l.andi r5,r5,0x3ff // clear out opcode part - l.ori r5,r5,0x4400 // opcode changed from l.jalr -> l.jr - l.sh (TRAMP_SLOT_2+0x0)(r3),r5 // write it back - - /* falthrough */ - -1: // l.jr - l.j trampoline_out - l.nop - -/* ----------------------------- */ - -4: // l.bnf -5: // l.bf - l.slli r6,r4,6 // original offset shifted left 6 - 2 -// l.srli r6,r6,6 // original offset shifted right 2 - - l.slli r4,r2,4 // old jump position: EEA shifted left 4 -// l.srli r4,r4,6 // old jump position: shifted right 2 - - l.addi r5,r3,0xc // new jump position (physical) - l.slli r5,r5,4 // new jump position: shifted left 4 - - // calculate new jump offset - // new_off = old_off + (old_jump - new_jump) - - l.add r6,r6,r4 // (orig_off + old_jump) - l.sub r6,r6,r5 // (orig_off + old_jump) - new_jump - l.srli r6,r6,6 // new offset shifted right 2 - - // r6 is new jump offset - l.lwz r4,(TRAMP_SLOT_2+0x0)(r3) // load jump instruction - l.srli r4,r4,16 - l.andi r4,r4,0xfc00 // get opcode part - l.slli r4,r4,16 - l.or r6,r4,r6 // l.b(n)f new offset - l.sw TRAMP_SLOT_2(r3),r6 // write it back - - /* we need to add l.j to EEA + 0x8 */ - tophys (r4,r2) // may not be needed (due to shifts down_ - l.addi r4,r4,(0x8 - 0x8) // jump target = r2 + 0x8 (compensate for 0x8) - // jump position = r5 + 0x8 (0x8 compensated) - l.sub r4,r4,r5 // jump offset = target - new_position + 0x8 - - l.slli r4,r4,4 // the amount of info in imediate of jump - l.srli r4,r4,6 // jump instruction with offset - l.sw TRAMP_SLOT_4(r3),r4 // write it to 4th slot - - /* fallthrough */ - -trampoline_out: - // set up new EPC to point to our trampoline code - LOAD_SYMBOL_2_GPR(r5,_immu_trampoline) - l.mtspr r0,r5,SPR_EPCR_BASE - - // immu_trampoline is (4x) CACHE_LINE aligned - // and only 6 instructions long, - // so we need to invalidate only 2 lines - - /* Establish cache block size - If BS=0, 16; - If BS=1, 32; - r14 contain block size - */ - l.mfspr r21,r0,SPR_ICCFGR - l.andi r21,r21,SPR_ICCFGR_CBS - l.srli r21,r21,7 - l.ori r23,r0,16 - l.sll r14,r23,r21 - - l.mtspr r0,r5,SPR_ICBIR - l.add r5,r5,r14 - l.mtspr r0,r5,SPR_ICBIR + /* Write character */ + l.sb 0(r4),r7 + /* Check UART LSR THRE|TEMT (hold, empty) bits */ + l.addi r6,r0,0x60 +1: l.lbz r5,5(r4) + l.andi r5,r5,0x60 + l.sfeq r5,r6 + l.bnf 1b + l.nop +#endif + EMERGENCY_PRINT_LOAD_GPR6 + EMERGENCY_PRINT_LOAD_GPR5 + EMERGENCY_PRINT_LOAD_GPR4 l.jr r9 - l.nop - + l.nop /* * DSCR: prints a string referenced by r3. @@ -1551,53 +1403,41 @@ trampoline_out: * POST: caller should be aware that r3, r9 are changed */ ENTRY(_emergency_print) - EMERGENCY_PRINT_STORE_GPR4 - EMERGENCY_PRINT_STORE_GPR5 - EMERGENCY_PRINT_STORE_GPR6 EMERGENCY_PRINT_STORE_GPR7 -2: - l.lbz r7,0(r3) - l.sfeq r7,r0 + EMERGENCY_PRINT_STORE_GPR9 + + /* Load character to r7, check for null terminator */ +2: l.lbz r7,0(r3) + l.sfeqi r7,0x0 l.bf 9f - l.nop + l.nop -// putc: - l.movhi r4,hi(UART_BASE_ADD) - - l.addi r6,r0,0x20 -1: l.lbz r5,5(r4) - l.andi r5,r5,0x20 - l.sfeq r5,r6 - l.bnf 1b - l.nop - - l.sb 0(r4),r7 - - l.addi r6,r0,0x60 -1: l.lbz r5,5(r4) - l.andi r5,r5,0x60 - l.sfeq r5,r6 - l.bnf 1b - l.nop + l.jal _emergency_putc + l.nop /* next character */ l.j 2b - l.addi r3,r3,0x1 + l.addi r3,r3,0x1 9: + EMERGENCY_PRINT_LOAD_GPR9 EMERGENCY_PRINT_LOAD_GPR7 - EMERGENCY_PRINT_LOAD_GPR6 - EMERGENCY_PRINT_LOAD_GPR5 - EMERGENCY_PRINT_LOAD_GPR4 l.jr r9 - l.nop + l.nop +/* + * DSCR: prints a number in r3 in hex. + * + * PRMS: r3 - a 32-bit unsigned integer + * + * PREQ: UART at UART_BASE_ADD has to be initialized + * + * POST: caller should be aware that r3, r9 are changed + */ ENTRY(_emergency_print_nr) - EMERGENCY_PRINT_STORE_GPR4 - EMERGENCY_PRINT_STORE_GPR5 - EMERGENCY_PRINT_STORE_GPR6 EMERGENCY_PRINT_STORE_GPR7 EMERGENCY_PRINT_STORE_GPR8 + EMERGENCY_PRINT_STORE_GPR9 l.addi r8,r0,32 // shift register @@ -1609,58 +1449,39 @@ ENTRY(_emergency_print_nr) /* don't skip the last zero if number == 0x0 */ l.sfeqi r8,0x4 l.bf 2f - l.nop + l.nop l.sfeq r7,r0 l.bf 1b - l.nop + l.nop 2: l.srl r7,r3,r8 l.andi r7,r7,0xf l.sflts r8,r0 - l.bf 9f + l.bf 9f + /* Numbers greater than 9 translate to a-f */ l.sfgtui r7,0x9 l.bnf 8f - l.nop + l.nop l.addi r7,r7,0x27 -8: - l.addi r7,r7,0x30 -// putc: - l.movhi r4,hi(UART_BASE_ADD) - - l.addi r6,r0,0x20 -1: l.lbz r5,5(r4) - l.andi r5,r5,0x20 - l.sfeq r5,r6 - l.bnf 1b - l.nop - - l.sb 0(r4),r7 - - l.addi r6,r0,0x60 -1: l.lbz r5,5(r4) - l.andi r5,r5,0x60 - l.sfeq r5,r6 - l.bnf 1b - l.nop + /* Convert to ascii and output character */ +8: l.jal _emergency_putc + l.addi r7,r7,0x30 /* next character */ l.j 2b l.addi r8,r8,-0x4 9: + EMERGENCY_PRINT_LOAD_GPR9 EMERGENCY_PRINT_LOAD_GPR8 EMERGENCY_PRINT_LOAD_GPR7 - EMERGENCY_PRINT_LOAD_GPR6 - EMERGENCY_PRINT_LOAD_GPR5 - EMERGENCY_PRINT_LOAD_GPR4 l.jr r9 - l.nop - + l.nop /* * This should be used for debugging only. @@ -1685,7 +1506,9 @@ ENTRY(_emergency_print_nr) ENTRY(_early_uart_init) l.movhi r3,hi(UART_BASE_ADD) + l.ori r3,r3,lo(UART_BASE_ADD) +#if defined(CONFIG_SERIAL_8250) l.addi r4,r0,0x7 l.sb 0x2(r3),r4 @@ -1703,9 +1526,10 @@ ENTRY(_early_uart_init) l.addi r4,r0,((UART_DIVISOR) & 0x000000ff) l.sb UART_DLL(r3),r4 l.sb 0x3(r3),r5 +#endif l.jr r9 - l.nop + l.nop .align 0x1000 .global _secondary_evbar @@ -1720,13 +1544,13 @@ _secondary_evbar: .section .rodata _string_unhandled_exception: - .string "\n\rRunarunaround: Unhandled exception 0x\0" + .string "\r\nRunarunaround: Unhandled exception 0x\0" _string_epc_prefix: .string ": EPC=0x\0" _string_nl: - .string "\n\r\0" + .string "\r\n\0" /* ========================================[ page aligned structures ]=== */ diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 3c0c91bcdcba..1d4c0921aafa 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -52,6 +52,8 @@ void machine_restart(char *cmd) { do_kernel_restart(cmd); + __asm__("l.nop 13"); + /* Give a grace period for failure to restart of 1s */ mdelay(1000); @@ -60,6 +62,16 @@ void machine_restart(char *cmd) while (1); } +/* + * This is used if pm_power_off has not been set by a power management + * driver, in this case we can assume we are on a simulator. On + * OpenRISC simulators l.nop 1 will trigger the simulator exit. + */ +static void default_power_off(void) +{ + __asm__("l.nop 1"); +} + /* * Similar to machine_power_off, but don't shut off power. Add code * here to freeze the system for e.g. post-mortem debug purpose when @@ -75,7 +87,10 @@ void machine_halt(void) void machine_power_off(void) { printk(KERN_INFO "*** MACHINE POWER OFF ***\n"); - __asm__("l.nop 1"); + if (pm_power_off != NULL) + pm_power_off(); + else + default_power_off(); } /* @@ -89,7 +104,7 @@ void arch_cpu_idle(void) mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME); } -void (*pm_power_off) (void) = machine_power_off; +void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); /* diff --git a/arch/openrisc/kernel/time.c b/arch/openrisc/kernel/time.c index 6d18989d63d0..8e26c1af5441 100644 --- a/arch/openrisc/kernel/time.c +++ b/arch/openrisc/kernel/time.c @@ -23,6 +23,7 @@ #include #include +#include /* Test the timer ticks to count, used in sync routine */ inline void openrisc_timer_set(unsigned long count) @@ -61,7 +62,7 @@ static int openrisc_timer_set_next_event(unsigned long delta, * timers) we cannot enable the PERIODIC feature. The tick timer can run using * one-shot events, so no problem. */ -DEFINE_PER_CPU(struct clock_event_device, clockevent_openrisc_timer); +static DEFINE_PER_CPU(struct clock_event_device, clockevent_openrisc_timer); void openrisc_clockevent_init(void) { diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 0446a3c34372..fd9a0f2b66c4 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -34,11 +34,11 @@ #include #include -int kstack_depth_to_print = 0x180; +static int kstack_depth_to_print = 0x180; int lwa_flag; -unsigned long __user *lwa_addr; +static unsigned long __user *lwa_addr; -void print_trace(void *data, unsigned long addr, int reliable) +static void print_trace(void *data, unsigned long addr, int reliable) { const char *loglvl = data; @@ -46,6 +46,14 @@ void print_trace(void *data, unsigned long addr, int reliable) (void *) addr); } +static void print_data(unsigned long base_addr, unsigned long word, int i) +{ + if (i == 0) + printk("(%08lx:)\t%08lx", base_addr + (i * 4), word); + else + printk(" %08lx:\t%08lx", base_addr + (i * 4), word); +} + /* displays a short stack trace */ void show_stack(struct task_struct *task, unsigned long *esp, const char *loglvl) { @@ -99,22 +107,36 @@ void show_registers(struct pt_regs *regs) printk("\nStack: "); show_stack(NULL, (unsigned long *)esp, KERN_EMERG); + if (esp < PAGE_OFFSET) + goto bad_stack; + + printk("\n"); + for (i = -8; i < 24; i += 1) { + unsigned long word; + + if (__get_user(word, &((unsigned long *)esp)[i])) { +bad_stack: + printk(" Bad Stack value."); + break; + } + + print_data(esp, word, i); + } + printk("\nCode: "); if (regs->pc < PAGE_OFFSET) goto bad; - for (i = -24; i < 24; i++) { - unsigned char c; - if (__get_user(c, &((unsigned char *)regs->pc)[i])) { + for (i = -6; i < 6; i += 1) { + unsigned long word; + + if (__get_user(word, &((unsigned long *)regs->pc)[i])) { bad: printk(" Bad PC value."); break; } - if (i == 0) - printk("(%02x) ", c); - else - printk("%02x ", c); + print_data(regs->pc, word, i); } } printk("\n"); @@ -185,13 +207,11 @@ void nommu_dump_state(struct pt_regs *regs, printk("\nCode: "); for (i = -24; i < 24; i++) { - unsigned char c; - c = ((unsigned char *)(__pa(regs->pc)))[i]; + unsigned long word; - if (i == 0) - printk("(%02x) ", c); - else - printk("%02x ", c); + word = ((unsigned long *)(__pa(regs->pc)))[i]; + + print_data(regs->pc, word, i); } printk("\n"); } @@ -215,16 +235,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err) make_task_dead(SIGSEGV); } -/* This is normally the 'Oops' routine */ -void die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (user_mode(regs)) - return; - - die(str, regs, err); -} - -void unhandled_exception(struct pt_regs *regs, int ea, int vector) +asmlinkage void unhandled_exception(struct pt_regs *regs, int ea, int vector) { printk("Unable to handle exception at EA =0x%x, vector 0x%x", ea, vector); diff --git a/arch/openrisc/lib/delay.c b/arch/openrisc/lib/delay.c index 036ae57180ef..5e89e4131304 100644 --- a/arch/openrisc/lib/delay.c +++ b/arch/openrisc/lib/delay.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 80bb66ad42f6..53b760af3bb7 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -18,15 +18,13 @@ #include #include +#include #include #include #define NUM_TLB_ENTRIES 64 #define TLB_OFFSET(add) (((add) >> PAGE_SHIFT) & (NUM_TLB_ENTRIES-1)) -unsigned long pte_misses; /* updated by do_page_fault() */ -unsigned long pte_errors; /* updated by do_page_fault() */ - /* __PHX__ :: - check the vmalloc_fault in do_page_fault() * - also look into include/asm/mmu_context.h */ @@ -223,8 +221,6 @@ no_context: { const struct exception_table_entry *entry; - __asm__ __volatile__("l.nop 42"); - if ((entry = search_exception_tables(regs->pc)) != NULL) { /* Adjust the instruction pointer in the stackframe */ regs->pc = entry->fixup; @@ -252,9 +248,6 @@ no_context: */ out_of_memory: - __asm__ __volatile__("l.nop 42"); - __asm__ __volatile__("l.nop 1"); - mmap_read_unlock(mm); if (!user_mode(regs)) goto no_context; diff --git a/arch/openrisc/mm/tlb.c b/arch/openrisc/mm/tlb.c index 2b6feabf6381..e2f2a3c3bb22 100644 --- a/arch/openrisc/mm/tlb.c +++ b/arch/openrisc/mm/tlb.c @@ -128,7 +128,7 @@ void local_flush_tlb_mm(struct mm_struct *mm) /* Was seeing bugs with the mm struct passed to us. Scrapped most of this function. */ - /* Several architctures do this */ + /* Several architectures do this */ local_flush_tlb_all(); } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 4deb60a3b43f..d275db195f1a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -58,6 +58,7 @@ struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); +static struct workqueue_struct *ib_unreg_wq; /* * Each of the three rwsem locks (devices, clients, client_data) protects the @@ -1602,7 +1603,7 @@ void ib_unregister_device_queued(struct ib_device *ib_dev) WARN_ON(!refcount_read(&ib_dev->refcount)); WARN_ON(!ib_dev->ops.dealloc_driver); get_device(&ib_dev->dev); - if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) + if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) put_device(&ib_dev->dev); } EXPORT_SYMBOL(ib_unregister_device_queued); @@ -2751,27 +2752,28 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { static int __init ib_core_init(void) { - int ret; + int ret = -ENOMEM; ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; + ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_unreg_wq) + goto err; + ib_comp_wq = alloc_workqueue("ib-comp-wq", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); - if (!ib_comp_wq) { - ret = -ENOMEM; - goto err; - } + if (!ib_comp_wq) + goto err_unbound; ib_comp_unbound_wq = alloc_workqueue("ib-comp-unb-wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); - if (!ib_comp_unbound_wq) { - ret = -ENOMEM; + if (!ib_comp_unbound_wq) goto err_comp; - } ret = class_register(&ib_class); if (ret) { @@ -2831,6 +2833,8 @@ err_comp_unbound: destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); +err_unbound: + destroy_workqueue(ib_unreg_wq); err: destroy_workqueue(ib_wq); return ret; @@ -2852,7 +2856,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); - flush_workqueue(system_unbound_wq); + destroy_workqueue(ib_unreg_wq); WARN_ON(!xa_empty(&clients)); WARN_ON(!xa_empty(&devices)); } diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ca24ce34da76..b92358f606d0 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -1739,7 +1739,7 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, if (!device) return -EINVAL; - if (!(device->attrs.device_cap_flags & IB_DEVICE_ALLOW_USER_UNREG)) { + if (!(device->attrs.kernel_cap_flags & IBK_ALLOW_USER_UNREG)) { ib_device_put(device); return -EINVAL; } diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8dc7d1f4b35d..003e504feca2 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1034,10 +1034,9 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct netlink_ext_ack *extack) { unsigned long flags; - struct ib_sa_query *query; + struct ib_sa_query *query = NULL, *iter; struct ib_mad_send_buf *send_buf; struct ib_mad_send_wc mad_send_wc; - int found = 0; int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || @@ -1045,20 +1044,21 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); - list_for_each_entry(query, &ib_nl_request_list, list) { + list_for_each_entry(iter, &ib_nl_request_list, list) { /* * If the query is cancelled, let the timeout routine * take care of it. */ - if (nlh->nlmsg_seq == query->seq) { - found = !ib_sa_query_cancelled(query); - if (found) - list_del(&query->list); + if (nlh->nlmsg_seq == iter->seq) { + if (!ib_sa_query_cancelled(iter)) { + list_del(&iter->list); + query = iter; + } break; } } - if (!found) { + if (!query) { spin_unlock_irqrestore(&ib_nl_request_lock, flags); goto resp_out; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index aead24c1a682..186ed8859920 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -455,7 +455,7 @@ retry: break; } } - /* upon sucesss lock should stay on hold for the callee */ + /* upon success lock should stay on hold for the callee */ if (!ret) ret = dma_index - start_idx; else diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6b6393176b3c..046376bd68e2 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -337,7 +337,7 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->hw_ver = attr->hw_ver; resp->max_qp = attr->max_qp; resp->max_qp_wr = attr->max_qp_wr; - resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); + resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge); resp->max_sge_rd = attr->max_sge_rd; resp->max_cq = attr->max_cq; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a9819c40a140..e54b3f1b730e 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -281,7 +281,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, } rdma_restrack_add(&pd->res); - if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else mr_access_flags |= IB_ACCESS_LOCAL_WRITE; @@ -308,7 +308,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, pd->__internal_mr = mr; - if (!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) + if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)) pd->local_dma_lkey = pd->__internal_mr->lkey; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) @@ -2131,8 +2131,8 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mr; if (access_flags & IB_ACCESS_ON_DEMAND) { - if (!(pd->device->attrs.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { + if (!(pd->device->attrs.kernel_cap_flags & + IBK_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); return ERR_PTR(-EINVAL); } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 3224f18a66e5..989edc789633 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -146,13 +146,13 @@ int bnxt_re_query_device(struct ib_device *ibdev, | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID - | IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_RESIZE_MAX_WR | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_N_NOTIFY_CQ | IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_MEM_MGT_EXTENSIONS; + ib_attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; ib_attr->max_send_sge = dev_attr->max_qp_sges; ib_attr->max_recv_sge = dev_attr->max_qp_sges; ib_attr->max_sge_rd = dev_attr->max_qp_sges; diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 12f33467c672..50cb2259bf87 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -314,7 +314,6 @@ enum db_state { struct c4iw_dev { struct ib_device ibdev; struct c4iw_rdev rdev; - u32 device_cap_flags; struct xarray cqs; struct xarray qps; struct xarray mrs; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 89f36a3a9af0..246b739ddb2b 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -269,7 +269,10 @@ static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *pro dev->rdev.lldi.ports[0]->dev_addr); props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type); props->fw_ver = dev->rdev.lldi.fw_vers; - props->device_cap_flags = dev->device_cap_flags; + props->device_cap_flags = IB_DEVICE_MEM_WINDOW; + props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + if (fastreg_support) + props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; props->page_size_cap = T4_PAGESIZE_MASK; props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor; props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device; @@ -529,9 +532,6 @@ void c4iw_register_device(struct work_struct *work) pr_debug("c4iw_dev %p\n", dev); addrconf_addr_eui48((u8 *)&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr); - dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW; - if (fastreg_support) - dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; dev->ibdev.local_dma_lkey = 0; dev->ibdev.node_type = RDMA_NODE_RNIC; BUILD_BUG_ON(sizeof(C4IW_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 995991d9709d..166ad6b828dc 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -137,61 +137,6 @@ #define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \ HFI1_USER_SWMINOR) -#ifndef HFI1_KERN_TYPE -#define HFI1_KERN_TYPE 0 -#endif - -/* - * Similarly, this is the kernel version going back to the user. It's - * slightly different, in that we want to tell if the driver was built as - * part of a Intel release, or from the driver from openfabrics.org, - * kernel.org, or a standard distribution, for support reasons. - * The high bit is 0 for non-Intel and 1 for Intel-built/supplied. - * - * It's returned by the driver to the user code during initialization in the - * spi_sw_version field of hfi1_base_info, so the user code can in turn - * check for compatibility with the kernel. -*/ -#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION) - -/* - * Define the driver version number. This is something that refers only - * to the driver itself, not the software interfaces it supports. - */ -#ifndef HFI1_DRIVER_VERSION_BASE -#define HFI1_DRIVER_VERSION_BASE "0.9-294" -#endif - -/* create the final driver version string */ -#ifdef HFI1_IDSTR -#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR -#else -#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE -#endif - -/* - * Diagnostics can send a packet by writing the following - * struct to the diag packet special file. - * - * This allows a custom PBC qword, so that special modes and deliberate - * changes to CRCs can be used. - */ -#define _DIAG_PKT_VERS 1 -struct diag_pkt { - __u16 version; /* structure version */ - __u16 unit; /* which device */ - __u16 sw_index; /* send sw index to use */ - __u16 len; /* data length, in bytes */ - __u16 port; /* port number */ - __u16 unused; - __u32 flags; /* call flags */ - __u64 data; /* user data pointer */ - __u64 pbc; /* PBC for the packet */ -}; - -/* diag_pkt flags */ -#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */ - /* * The next set of defines are for packet headers, and chip register * and memory bits that are visible to and/or used by user-mode software. diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index e2c634af40e9..8e71bef9d982 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -29,12 +29,6 @@ #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt -/* - * The size has to be longer than this string, so we can append - * board/chip information to it in the initialization code. - */ -const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n"; - DEFINE_MUTEX(hfi1_mutex); /* general driver use */ unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c index e8ed05516bf2..7741a1d69097 100644 --- a/drivers/infiniband/hw/hfi1/efivar.c +++ b/drivers/infiniband/hw/hfi1/efivar.c @@ -72,7 +72,7 @@ static int read_efi_var(const char *name, unsigned long *size, * is in the EFIVAR_FS code and may not be compiled in. * However, even that is insufficient since it does not cover * EFI_BUFFER_TOO_SMALL which could be an important return. - * For now, just split out succces or not found. + * For now, just split out success or not found. */ ret = status == EFI_SUCCESS ? 0 : status == EFI_NOT_FOUND ? -ENOENT : diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1783a6ea5427..2e4cf2b11653 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -265,6 +265,8 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) unsigned long dim = from->nr_segs; int idx; + if (!HFI1_CAP_IS_KSET(SDMA)) + return -EINVAL; idx = srcu_read_lock(&fd->pq_srcu); pq = srcu_dereference(fd->pq, &fd->pq_srcu); if (!cq || !pq) { @@ -1220,7 +1222,7 @@ static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) memset(&binfo, 0, sizeof(binfo)); binfo.hw_version = dd->revision; - binfo.sw_version = HFI1_KERN_SWVERSION; + binfo.sw_version = HFI1_USER_SWVERSION; binfo.bthqp = RVT_KDETH_QP_PREFIX; binfo.jkey = uctxt->jkey; /* diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4436ed41547c..436372b31431 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -489,7 +489,7 @@ void set_link_ipg(struct hfi1_pportdata *ppd) u16 shift, mult; u64 src; u32 current_egress_rate; /* Mbits /sec */ - u32 max_pkt_time; + u64 max_pkt_time; /* * max_pkt_time is the maximum packet egress time in units * of the fabric clock period 1/(805 MHz). diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index f07d328689d3..a95b654f5254 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1288,11 +1288,13 @@ void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) kvfree(sde->tx_ring); sde->tx_ring = NULL; } - spin_lock_irq(&dd->sde_map_lock); - sdma_map_free(rcu_access_pointer(dd->sdma_map)); - RCU_INIT_POINTER(dd->sdma_map, NULL); - spin_unlock_irq(&dd->sde_map_lock); - synchronize_rcu(); + if (rcu_access_pointer(dd->sdma_map)) { + spin_lock_irq(&dd->sde_map_lock); + sdma_map_free(rcu_access_pointer(dd->sdma_map)); + RCU_INIT_POINTER(dd->sdma_map, NULL); + spin_unlock_irq(&dd->sde_map_lock); + synchronize_rcu(); + } kfree(dd->per_sdma); dd->per_sdma = NULL; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 99d0743133ca..6988f6f21bde 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1300,8 +1300,8 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | - IB_DEVICE_MEM_MGT_EXTENSIONS | - IB_DEVICE_RDMA_NETDEV_OPA; + IB_DEVICE_MEM_MGT_EXTENSIONS; + rdi->dparms.props.kernel_cap_flags = IBK_RDMA_NETDEV_OPA; rdi->dparms.props.page_size_cap = PAGE_SIZE; rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; rdi->dparms.props.vendor_part_id = dd->pcidev->device; diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 3083d6db1d68..2855e9ad4b32 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -106,16 +106,6 @@ enum { SERV_TYPE_XRC = 5, }; -enum hns_roce_qp_state { - HNS_ROCE_QP_STATE_RST, - HNS_ROCE_QP_STATE_INIT, - HNS_ROCE_QP_STATE_RTR, - HNS_ROCE_QP_STATE_RTS, - HNS_ROCE_QP_STATE_SQD, - HNS_ROCE_QP_STATE_ERR, - HNS_ROCE_QP_NUM_STATE, -}; - enum hns_roce_event { HNS_ROCE_EVENT_TYPE_PATH_MIG = 0x01, HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED = 0x02, @@ -139,8 +129,6 @@ enum hns_roce_event { HNS_ROCE_EVENT_TYPE_INVALID_XRCETH = 0x17, }; -#define HNS_ROCE_CAP_FLAGS_EX_SHIFT 12 - enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), @@ -535,6 +523,11 @@ struct hns_roce_cmd_context { u16 busy; }; +enum hns_roce_cmdq_state { + HNS_ROCE_CMDQ_STATE_NORMAL, + HNS_ROCE_CMDQ_STATE_FATAL_ERR, +}; + struct hns_roce_cmdq { struct dma_pool *pool; struct semaphore poll_sem; @@ -554,6 +547,7 @@ struct hns_roce_cmdq { * close device, switch into poll mode(non event mode) */ u8 use_events; + enum hns_roce_cmdq_state state; }; struct hns_roce_cmd_mailbox { @@ -657,6 +651,11 @@ struct hns_roce_ceqe { __le32 rsv[15]; }; +#define CEQE_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_ceqe, h, l) + +#define CEQE_CQN CEQE_FIELD_LOC(23, 0) +#define CEQE_OWNER CEQE_FIELD_LOC(31, 31) + struct hns_roce_aeqe { __le32 asyn; union { @@ -676,6 +675,13 @@ struct hns_roce_aeqe { __le32 rsv[12]; }; +#define AEQE_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_aeqe, h, l) + +#define AEQE_EVENT_TYPE AEQE_FIELD_LOC(7, 0) +#define AEQE_SUB_TYPE AEQE_FIELD_LOC(15, 8) +#define AEQE_OWNER AEQE_FIELD_LOC(31, 31) +#define AEQE_EVENT_QUEUE_NUM AEQE_FIELD_LOC(55, 32) + struct hns_roce_eq { struct hns_roce_dev *hr_dev; void __iomem *db_reg; @@ -725,7 +731,6 @@ struct hns_roce_caps { u32 num_pi_qps; u32 reserved_qps; int num_qpc_timer; - int num_cqc_timer; u32 num_srqs; u32 max_wqes; u32 max_srq_wrs; @@ -1191,7 +1196,6 @@ void *hns_roce_get_send_wqe(struct hns_roce_qp *hr_qp, unsigned int n); void *hns_roce_get_extend_sge(struct hns_roce_qp *hr_qp, unsigned int n); bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, u32 nreq, struct ib_cq *ib_cq); -enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state); void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq); void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2b0cef17ad45..ba3c742258ef 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -149,8 +149,7 @@ static void set_atomic_seg(const struct ib_send_wr *wr, aseg->cmp_data = 0; } - roce_set_field(rc_sq_wqe->byte_16, V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); } static int fill_ext_sge_inl_data(struct hns_roce_qp *qp, @@ -271,8 +270,7 @@ static int set_rc_inl(struct hns_roce_qp *qp, const struct ib_send_wr *wr, dseg += sizeof(struct hns_roce_v2_rc_send_wqe); if (msg_len <= HNS_ROCE_V2_MAX_RC_INL_INN_SZ) { - roce_set_bit(rc_sq_wqe->byte_20, - V2_RC_SEND_WQE_BYTE_20_INL_TYPE_S, 0); + hr_reg_clear(rc_sq_wqe, RC_SEND_WQE_INL_TYPE); for (i = 0; i < wr->num_sge; i++) { memcpy(dseg, ((void *)wr->sg_list[i].addr), @@ -280,17 +278,13 @@ static int set_rc_inl(struct hns_roce_qp *qp, const struct ib_send_wr *wr, dseg += wr->sg_list[i].length; } } else { - roce_set_bit(rc_sq_wqe->byte_20, - V2_RC_SEND_WQE_BYTE_20_INL_TYPE_S, 1); + hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_INL_TYPE); ret = fill_ext_sge_inl_data(qp, wr, &curr_idx, msg_len); if (ret) return ret; - roce_set_field(rc_sq_wqe->byte_16, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, - curr_idx - *sge_idx); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, curr_idx - *sge_idx); } *sge_idx = curr_idx; @@ -309,12 +303,10 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, int j = 0; int i; - roce_set_field(rc_sq_wqe->byte_20, - V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, - V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, - (*sge_ind) & (qp->sge.sge_cnt - 1)); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_MSG_START_SGE_IDX, + (*sge_ind) & (qp->sge.sge_cnt - 1)); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_INLINE, !!(wr->send_flags & IB_SEND_INLINE)); if (wr->send_flags & IB_SEND_INLINE) return set_rc_inl(qp, wr, rc_sq_wqe, sge_ind); @@ -339,9 +331,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, valid_num_sge - HNS_ROCE_SGE_IN_WQE); } - roce_set_field(rc_sq_wqe->byte_16, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SGE_NUM, valid_num_sge); return 0; } @@ -412,8 +402,7 @@ static int set_ud_opcode(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe, ud_sq_wqe->immtdata = get_immtdata(wr); - roce_set_field(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_OPCODE_M, - V2_UD_SEND_WQE_BYTE_4_OPCODE_S, to_hr_opcode(ib_op)); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_OPCODE, to_hr_opcode(ib_op)); return 0; } @@ -424,21 +413,15 @@ static int fill_ud_av(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe, struct ib_device *ib_dev = ah->ibah.device; struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); - roce_set_field(ud_sq_wqe->byte_24, V2_UD_SEND_WQE_BYTE_24_UDPSPN_M, - V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, ah->av.udp_sport); - - roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M, - V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S, ah->av.hop_limit); - roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_TCLASS_M, - V2_UD_SEND_WQE_BYTE_36_TCLASS_S, ah->av.tclass); - roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M, - V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, ah->av.flowlabel); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_UDPSPN, ah->av.udp_sport); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_HOPLIMIT, ah->av.hop_limit); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_TCLASS, ah->av.tclass); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_FLOW_LABEL, ah->av.flowlabel); if (WARN_ON(ah->av.sl > MAX_SERVICE_LEVEL)) return -EINVAL; - roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_SL_M, - V2_UD_SEND_WQE_BYTE_40_SL_S, ah->av.sl); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SL, ah->av.sl); ud_sq_wqe->sgid_index = ah->av.gid_index; @@ -448,10 +431,8 @@ static int fill_ud_av(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe, if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) return 0; - roce_set_bit(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S, - ah->av.vlan_en); - roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_VLAN_M, - V2_UD_SEND_WQE_BYTE_36_VLAN_S, ah->av.vlan_id); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_VLAN_EN, ah->av.vlan_en); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_VLAN, ah->av.vlan_id); return 0; } @@ -476,27 +457,19 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, ud_sq_wqe->msg_len = cpu_to_le32(msg_len); - roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_CQE_S, + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_CQE, !!(wr->send_flags & IB_SEND_SIGNALED)); - - roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_SE_S, + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SE, !!(wr->send_flags & IB_SEND_SOLICITED)); - roce_set_field(ud_sq_wqe->byte_16, V2_UD_SEND_WQE_BYTE_16_PD_M, - V2_UD_SEND_WQE_BYTE_16_PD_S, to_hr_pd(qp->ibqp.pd)->pdn); - - roce_set_field(ud_sq_wqe->byte_16, V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M, - V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge); - - roce_set_field(ud_sq_wqe->byte_20, - V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, - V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, - curr_idx & (qp->sge.sge_cnt - 1)); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_PD, to_hr_pd(qp->ibqp.pd)->pdn); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SGE_NUM, valid_num_sge); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_MSG_START_SGE_IDX, + curr_idx & (qp->sge.sge_cnt - 1)); ud_sq_wqe->qkey = cpu_to_le32(ud_wr(wr)->remote_qkey & 0x80000000 ? qp->qkey : ud_wr(wr)->remote_qkey); - roce_set_field(ud_sq_wqe->byte_32, V2_UD_SEND_WQE_BYTE_32_DQPN_M, - V2_UD_SEND_WQE_BYTE_32_DQPN_S, ud_wr(wr)->remote_qpn); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_DQPN, ud_wr(wr)->remote_qpn); ret = fill_ud_av(ud_sq_wqe, ah); if (ret) @@ -516,8 +489,7 @@ static inline int set_ud_wqe(struct hns_roce_qp *qp, dma_wmb(); *sge_idx = curr_idx; - roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_OWNER_S, - owner_bit); + hr_reg_write(ud_sq_wqe, UD_SEND_WQE_OWNER, owner_bit); return 0; } @@ -553,7 +525,7 @@ static int set_rc_opcode(struct hns_roce_dev *hr_dev, ret = -EOPNOTSUPP; break; case IB_WR_LOCAL_INV: - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SO_S, 1); + hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_SO); fallthrough; case IB_WR_SEND_WITH_INV: rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey); @@ -565,11 +537,11 @@ static int set_rc_opcode(struct hns_roce_dev *hr_dev, if (unlikely(ret)) return ret; - roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, to_hr_opcode(ib_op)); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_OPCODE, to_hr_opcode(ib_op)); return ret; } + static inline int set_rc_wqe(struct hns_roce_qp *qp, const struct ib_send_wr *wr, void *wqe, unsigned int *sge_idx, @@ -590,13 +562,13 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, if (WARN_ON(ret)) return ret; - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FENCE_S, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_FENCE, (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SE_S, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_SE, (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_CQE_S, + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_CQE, (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || @@ -616,8 +588,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, dma_wmb(); *sge_idx = curr_idx; - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S, - owner_bit); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_OWNER, owner_bit); return ret; } @@ -682,14 +653,11 @@ static void write_dwqe(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; /* All kinds of DirectWQE have the same header field layout */ - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FLAG_S, 1); - roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_DB_SL_L_M, - V2_RC_SEND_WQE_BYTE_4_DB_SL_L_S, qp->sl); - roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_DB_SL_H_M, - V2_RC_SEND_WQE_BYTE_4_DB_SL_H_S, - qp->sl >> HNS_ROCE_SL_SHIFT); - roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_WQE_INDEX_M, - V2_RC_SEND_WQE_BYTE_4_WQE_INDEX_S, qp->sq.head); + hr_reg_enable(rc_sq_wqe, RC_SEND_WQE_FLAG); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_L, qp->sl); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_DB_SL_H, + qp->sl >> HNS_ROCE_SL_SHIFT); + hr_reg_write(rc_sq_wqe, RC_SEND_WQE_WQE_INDEX, qp->sq.head); hns_roce_write512(hr_dev, wqe, qp->sq.db_reg); } @@ -1265,6 +1233,16 @@ static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev) return tail == priv->cmq.csq.head; } +static void update_cmdq_status(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hnae3_handle *handle = priv->handle; + + if (handle->rinfo.reset_state == HNS_ROCE_STATE_RST_INIT || + handle->rinfo.instance_state == HNS_ROCE_STATE_INIT) + hr_dev->cmd.state = HNS_ROCE_CMDQ_STATE_FATAL_ERR; +} + static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { @@ -1296,7 +1274,8 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, } while (++timeout < priv->cmq.tx_timeout); if (hns_roce_cmq_csq_done(hr_dev)) { - for (ret = 0, i = 0; i < num; i++) { + ret = 0; + for (i = 0; i < num; i++) { /* check the result of hardware write back */ desc[i] = csq->desc[tail++]; if (tail == csq->desc_num) @@ -1318,6 +1297,8 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, csq->head, tail); csq->head = tail; + update_cmdq_status(hr_dev); + ret = -EAGAIN; } @@ -1332,6 +1313,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, bool busy; int ret; + if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR) + return -EIO; + if (!v2_chk_mbox_is_avail(hr_dev, &busy)) return busy ? -EBUSY : 0; @@ -1499,7 +1483,7 @@ static void __hns_roce_function_clear(struct hns_roce_dev *hr_dev, int vf_id) if (ret) continue; - if (roce_get_bit(resp->func_done, FUNC_CLEAR_RST_FUN_DONE_S)) { + if (hr_reg_read(resp, FUNC_CLEAR_RST_FUN_DONE)) { if (vf_id == 0) hr_dev->is_reset = true; return; @@ -1510,7 +1494,7 @@ out: hns_roce_func_clr_rst_proc(hr_dev, ret, fclr_write_fail_flag); } -static void hns_roce_free_vf_resource(struct hns_roce_dev *hr_dev, int vf_id) +static int hns_roce_free_vf_resource(struct hns_roce_dev *hr_dev, int vf_id) { enum hns_roce_opcode_type opcode = HNS_ROCE_OPC_ALLOC_VF_RES; struct hns_roce_cmq_desc desc[2]; @@ -1521,17 +1505,29 @@ static void hns_roce_free_vf_resource(struct hns_roce_dev *hr_dev, int vf_id) desc[0].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); hns_roce_cmq_setup_basic_desc(&desc[1], opcode, false); hr_reg_write(req_a, FUNC_RES_A_VF_ID, vf_id); - hns_roce_cmq_send(hr_dev, desc, 2); + + return hns_roce_cmq_send(hr_dev, desc, 2); } static void hns_roce_function_clear(struct hns_roce_dev *hr_dev) { + int ret; int i; + if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR) + return; + for (i = hr_dev->func_num - 1; i >= 0; i--) { __hns_roce_function_clear(hr_dev, i); - if (i != 0) - hns_roce_free_vf_resource(hr_dev, i); + + if (i == 0) + continue; + + ret = hns_roce_free_vf_resource(hr_dev, i); + if (ret) + ibdev_err(&hr_dev->ib_dev, + "failed to free vf resource, vf_id = %d, ret = %d.\n", + i, ret); } } @@ -1757,17 +1753,16 @@ static int __hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev, swt = (struct hns_roce_vf_switch *)desc.data; hns_roce_cmq_setup_basic_desc(&desc, HNS_SWITCH_PARAMETER_CFG, true); swt->rocee_sel |= cpu_to_le32(HNS_ICL_SWITCH_CMD_ROCEE_SEL); - roce_set_field(swt->fun_id, VF_SWITCH_DATA_FUN_ID_VF_ID_M, - VF_SWITCH_DATA_FUN_ID_VF_ID_S, vf_id); + hr_reg_write(swt, VF_SWITCH_VF_ID, vf_id); ret = hns_roce_cmq_send(hr_dev, &desc, 1); if (ret) return ret; desc.flag = cpu_to_le16(HNS_ROCE_CMD_FLAG_IN); desc.flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR); - roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_LPBK_S, 1); - roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_LCL_LPBK_S, 0); - roce_set_bit(swt->cfg, VF_SWITCH_DATA_CFG_ALW_DST_OVRD_S, 1); + hr_reg_enable(swt, VF_SWITCH_ALW_LPBK); + hr_reg_clear(swt, VF_SWITCH_ALW_LCL_LPBK); + hr_reg_enable(swt, VF_SWITCH_ALW_DST_OVRD); return hns_roce_cmq_send(hr_dev, &desc, 1); } @@ -1947,7 +1942,7 @@ static void set_default_caps(struct hns_roce_dev *hr_dev) caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM; caps->num_pds = HNS_ROCE_V2_MAX_PD_NUM; caps->num_qpc_timer = HNS_ROCE_V2_MAX_QPC_TIMER_NUM; - caps->num_cqc_timer = HNS_ROCE_V2_MAX_CQC_TIMER_NUM; + caps->cqc_timer_bt_num = HNS_ROCE_V2_MAX_CQC_TIMER_BT_NUM; caps->max_qp_init_rdma = HNS_ROCE_V2_MAX_QP_INIT_RDMA; caps->max_qp_dest_rdma = HNS_ROCE_V2_MAX_QP_DEST_RDMA; @@ -2243,7 +2238,6 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) caps->max_rq_sg = roundup_pow_of_two(caps->max_rq_sg); caps->max_extend_sg = le32_to_cpu(resp_a->max_extend_sg); caps->num_qpc_timer = le16_to_cpu(resp_a->num_qpc_timer); - caps->num_cqc_timer = le16_to_cpu(resp_a->num_cqc_timer); caps->max_srq_sges = le16_to_cpu(resp_a->max_srq_sges); caps->max_srq_sges = roundup_pow_of_two(caps->max_srq_sges); caps->num_aeq_vectors = resp_a->num_aeq_vectors; @@ -2270,87 +2264,39 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) ctx_hop_num = resp_b->ctx_hop_num; pbl_hop_num = resp_b->pbl_hop_num; - caps->num_pds = 1 << roce_get_field(resp_c->cap_flags_num_pds, - V2_QUERY_PF_CAPS_C_NUM_PDS_M, - V2_QUERY_PF_CAPS_C_NUM_PDS_S); - caps->flags = roce_get_field(resp_c->cap_flags_num_pds, - V2_QUERY_PF_CAPS_C_CAP_FLAGS_M, - V2_QUERY_PF_CAPS_C_CAP_FLAGS_S); + caps->num_pds = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_PDS); + + caps->flags = hr_reg_read(resp_c, PF_CAPS_C_CAP_FLAGS); caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) << HNS_ROCE_CAP_FLAGS_EX_SHIFT; - caps->num_cqs = 1 << roce_get_field(resp_c->max_gid_num_cqs, - V2_QUERY_PF_CAPS_C_NUM_CQS_M, - V2_QUERY_PF_CAPS_C_NUM_CQS_S); - caps->gid_table_len[0] = roce_get_field(resp_c->max_gid_num_cqs, - V2_QUERY_PF_CAPS_C_MAX_GID_M, - V2_QUERY_PF_CAPS_C_MAX_GID_S); - - caps->max_cqes = 1 << roce_get_field(resp_c->cq_depth, - V2_QUERY_PF_CAPS_C_CQ_DEPTH_M, - V2_QUERY_PF_CAPS_C_CQ_DEPTH_S); - caps->num_mtpts = 1 << roce_get_field(resp_c->num_mrws, - V2_QUERY_PF_CAPS_C_NUM_MRWS_M, - V2_QUERY_PF_CAPS_C_NUM_MRWS_S); - caps->num_qps = 1 << roce_get_field(resp_c->ord_num_qps, - V2_QUERY_PF_CAPS_C_NUM_QPS_M, - V2_QUERY_PF_CAPS_C_NUM_QPS_S); - caps->max_qp_init_rdma = roce_get_field(resp_c->ord_num_qps, - V2_QUERY_PF_CAPS_C_MAX_ORD_M, - V2_QUERY_PF_CAPS_C_MAX_ORD_S); + caps->num_cqs = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_CQS); + caps->gid_table_len[0] = hr_reg_read(resp_c, PF_CAPS_C_MAX_GID); + caps->max_cqes = 1 << hr_reg_read(resp_c, PF_CAPS_C_CQ_DEPTH); + caps->num_mtpts = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_MRWS); + caps->num_qps = 1 << hr_reg_read(resp_c, PF_CAPS_C_NUM_QPS); + caps->max_qp_init_rdma = hr_reg_read(resp_c, PF_CAPS_C_MAX_ORD); caps->max_qp_dest_rdma = caps->max_qp_init_rdma; caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth); - caps->num_srqs = 1 << roce_get_field(resp_d->wq_hop_num_max_srqs, - V2_QUERY_PF_CAPS_D_NUM_SRQS_M, - V2_QUERY_PF_CAPS_D_NUM_SRQS_S); - caps->cong_type = roce_get_field(resp_d->wq_hop_num_max_srqs, - V2_QUERY_PF_CAPS_D_CONG_TYPE_M, - V2_QUERY_PF_CAPS_D_CONG_TYPE_S); + + caps->num_srqs = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_SRQS); + caps->cong_type = hr_reg_read(resp_d, PF_CAPS_D_CONG_TYPE); caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth); + caps->ceqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_CEQ_DEPTH); + caps->num_comp_vectors = hr_reg_read(resp_d, PF_CAPS_D_NUM_CEQS); + caps->aeqe_depth = 1 << hr_reg_read(resp_d, PF_CAPS_D_AEQ_DEPTH); + caps->default_aeq_arm_st = hr_reg_read(resp_d, PF_CAPS_D_AEQ_ARM_ST); + caps->default_ceq_arm_st = hr_reg_read(resp_d, PF_CAPS_D_CEQ_ARM_ST); + caps->reserved_pds = hr_reg_read(resp_d, PF_CAPS_D_RSV_PDS); + caps->num_uars = 1 << hr_reg_read(resp_d, PF_CAPS_D_NUM_UARS); + caps->reserved_qps = hr_reg_read(resp_d, PF_CAPS_D_RSV_QPS); + caps->reserved_uars = hr_reg_read(resp_d, PF_CAPS_D_RSV_UARS); - caps->ceqe_depth = 1 << roce_get_field(resp_d->num_ceqs_ceq_depth, - V2_QUERY_PF_CAPS_D_CEQ_DEPTH_M, - V2_QUERY_PF_CAPS_D_CEQ_DEPTH_S); - caps->num_comp_vectors = roce_get_field(resp_d->num_ceqs_ceq_depth, - V2_QUERY_PF_CAPS_D_NUM_CEQS_M, - V2_QUERY_PF_CAPS_D_NUM_CEQS_S); - - caps->aeqe_depth = 1 << roce_get_field(resp_d->arm_st_aeq_depth, - V2_QUERY_PF_CAPS_D_AEQ_DEPTH_M, - V2_QUERY_PF_CAPS_D_AEQ_DEPTH_S); - caps->default_aeq_arm_st = roce_get_field(resp_d->arm_st_aeq_depth, - V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_M, - V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_S); - caps->default_ceq_arm_st = roce_get_field(resp_d->arm_st_aeq_depth, - V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_M, - V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_S); - caps->reserved_pds = roce_get_field(resp_d->num_uars_rsv_pds, - V2_QUERY_PF_CAPS_D_RSV_PDS_M, - V2_QUERY_PF_CAPS_D_RSV_PDS_S); - caps->num_uars = 1 << roce_get_field(resp_d->num_uars_rsv_pds, - V2_QUERY_PF_CAPS_D_NUM_UARS_M, - V2_QUERY_PF_CAPS_D_NUM_UARS_S); - caps->reserved_qps = roce_get_field(resp_d->rsv_uars_rsv_qps, - V2_QUERY_PF_CAPS_D_RSV_QPS_M, - V2_QUERY_PF_CAPS_D_RSV_QPS_S); - caps->reserved_uars = roce_get_field(resp_d->rsv_uars_rsv_qps, - V2_QUERY_PF_CAPS_D_RSV_UARS_M, - V2_QUERY_PF_CAPS_D_RSV_UARS_S); - caps->reserved_mrws = roce_get_field(resp_e->chunk_size_shift_rsv_mrws, - V2_QUERY_PF_CAPS_E_RSV_MRWS_M, - V2_QUERY_PF_CAPS_E_RSV_MRWS_S); - caps->chunk_sz = 1 << roce_get_field(resp_e->chunk_size_shift_rsv_mrws, - V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_M, - V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_S); - caps->reserved_cqs = roce_get_field(resp_e->rsv_cqs, - V2_QUERY_PF_CAPS_E_RSV_CQS_M, - V2_QUERY_PF_CAPS_E_RSV_CQS_S); - caps->reserved_srqs = roce_get_field(resp_e->rsv_srqs, - V2_QUERY_PF_CAPS_E_RSV_SRQS_M, - V2_QUERY_PF_CAPS_E_RSV_SRQS_S); - caps->reserved_lkey = roce_get_field(resp_e->rsv_lkey, - V2_QUERY_PF_CAPS_E_RSV_LKEYS_M, - V2_QUERY_PF_CAPS_E_RSV_LKEYS_S); + caps->reserved_mrws = hr_reg_read(resp_e, PF_CAPS_E_RSV_MRWS); + caps->chunk_sz = 1 << hr_reg_read(resp_e, PF_CAPS_E_CHUNK_SIZE_SHIFT); + caps->reserved_cqs = hr_reg_read(resp_e, PF_CAPS_E_RSV_CQS); + caps->reserved_srqs = hr_reg_read(resp_e, PF_CAPS_E_RSV_SRQS); + caps->reserved_lkey = hr_reg_read(resp_e, PF_CAPS_E_RSV_LKEYS); caps->default_ceq_max_cnt = le16_to_cpu(resp_e->ceq_max_cnt); caps->default_ceq_period = le16_to_cpu(resp_e->ceq_period); caps->default_aeq_max_cnt = le16_to_cpu(resp_e->aeq_max_cnt); @@ -2365,15 +2311,9 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) caps->cqe_hop_num = pbl_hop_num; caps->srqwqe_hop_num = pbl_hop_num; caps->idx_hop_num = pbl_hop_num; - caps->wqe_sq_hop_num = roce_get_field(resp_d->wq_hop_num_max_srqs, - V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_M, - V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_S); - caps->wqe_sge_hop_num = roce_get_field(resp_d->wq_hop_num_max_srqs, - V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_M, - V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_S); - caps->wqe_rq_hop_num = roce_get_field(resp_d->wq_hop_num_max_srqs, - V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_M, - V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_S); + caps->wqe_sq_hop_num = hr_reg_read(resp_d, PF_CAPS_D_SQWQE_HOP_NUM); + caps->wqe_sge_hop_num = hr_reg_read(resp_d, PF_CAPS_D_EX_SGE_HOP_NUM); + caps->wqe_rq_hop_num = hr_reg_read(resp_d, PF_CAPS_D_RQWQE_HOP_NUM); return 0; } @@ -3000,6 +2940,9 @@ static int v2_wait_mbox_complete(struct hns_roce_dev *hr_dev, u32 timeout, mb_st = (struct hns_roce_mbox_status *)desc.data; end = msecs_to_jiffies(timeout) + jiffies; while (v2_chk_mbox_is_avail(hr_dev, &busy)) { + if (hr_dev->cmd.state == HNS_ROCE_CMDQ_STATE_FATAL_ERR) + return -EIO; + status = 0; hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_MB_ST, true); @@ -3103,10 +3046,8 @@ static int config_sgid_table(struct hns_roce_dev *hr_dev, hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_SGID_TB, false); - roce_set_field(sgid_tb->table_idx_rsv, CFG_SGID_TB_TABLE_IDX_M, - CFG_SGID_TB_TABLE_IDX_S, gid_index); - roce_set_field(sgid_tb->vf_sgid_type_rsv, CFG_SGID_TB_VF_SGID_TYPE_M, - CFG_SGID_TB_VF_SGID_TYPE_S, sgid_type); + hr_reg_write(sgid_tb, CFG_SGID_TB_TABLE_IDX, gid_index); + hr_reg_write(sgid_tb, CFG_SGID_TB_VF_SGID_TYPE, sgid_type); copy_gid(&sgid_tb->vf_sgid_l, gid); @@ -3141,19 +3082,14 @@ static int config_gmv_table(struct hns_roce_dev *hr_dev, copy_gid(&tb_a->vf_sgid_l, gid); - roce_set_field(tb_a->vf_sgid_type_vlan, CFG_GMV_TB_VF_SGID_TYPE_M, - CFG_GMV_TB_VF_SGID_TYPE_S, sgid_type); - roce_set_bit(tb_a->vf_sgid_type_vlan, CFG_GMV_TB_VF_VLAN_EN_S, - vlan_id < VLAN_CFI_MASK); - roce_set_field(tb_a->vf_sgid_type_vlan, CFG_GMV_TB_VF_VLAN_ID_M, - CFG_GMV_TB_VF_VLAN_ID_S, vlan_id); + hr_reg_write(tb_a, GMV_TB_A_VF_SGID_TYPE, sgid_type); + hr_reg_write(tb_a, GMV_TB_A_VF_VLAN_EN, vlan_id < VLAN_CFI_MASK); + hr_reg_write(tb_a, GMV_TB_A_VF_VLAN_ID, vlan_id); tb_b->vf_smac_l = cpu_to_le32(*(u32 *)mac); - roce_set_field(tb_b->vf_smac_h, CFG_GMV_TB_SMAC_H_M, - CFG_GMV_TB_SMAC_H_S, *(u16 *)&mac[4]); - roce_set_field(tb_b->table_idx_rsv, CFG_GMV_TB_SGID_IDX_M, - CFG_GMV_TB_SGID_IDX_S, gid_index); + hr_reg_write(tb_b, GMV_TB_B_SMAC_H, *(u16 *)&mac[4]); + hr_reg_write(tb_b, GMV_TB_B_SGID_IDX, gid_index); return hns_roce_cmq_send(hr_dev, desc, 2); } @@ -3202,10 +3138,8 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, reg_smac_l = *(u32 *)(&addr[0]); reg_smac_h = *(u16 *)(&addr[4]); - roce_set_field(smac_tb->tb_idx_rsv, CFG_SMAC_TB_IDX_M, - CFG_SMAC_TB_IDX_S, phy_port); - roce_set_field(smac_tb->vf_smac_h_rsv, CFG_SMAC_TB_VF_SMAC_H_M, - CFG_SMAC_TB_VF_SMAC_H_S, reg_smac_h); + hr_reg_write(smac_tb, CFG_SMAC_TB_IDX, phy_port); + hr_reg_write(smac_tb, CFG_SMAC_TB_VF_SMAC_H, reg_smac_h); smac_tb->vf_smac_l = cpu_to_le32(reg_smac_l); return hns_roce_cmq_send(hr_dev, &desc, 1); @@ -3234,21 +3168,15 @@ static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, mpt_entry->pbl_size = cpu_to_le32(mr->npages); mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3); - roce_set_field(mpt_entry->byte_48_mode_ba, - V2_MPT_BYTE_48_PBL_BA_H_M, V2_MPT_BYTE_48_PBL_BA_H_S, - upper_32_bits(pbl_ba >> 3)); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); - roce_set_field(mpt_entry->byte_56_pa0_h, V2_MPT_BYTE_56_PA0_H_M, - V2_MPT_BYTE_56_PA0_H_S, upper_32_bits(pages[0])); + hr_reg_write(mpt_entry, MPT_PA0_H, upper_32_bits(pages[0])); mpt_entry->pa1_l = cpu_to_le32(lower_32_bits(pages[1])); - roce_set_field(mpt_entry->byte_64_buf_pa1, V2_MPT_BYTE_64_PA1_H_M, - V2_MPT_BYTE_64_PA1_H_S, upper_32_bits(pages[1])); - roce_set_field(mpt_entry->byte_64_buf_pa1, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, - to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); + hr_reg_write(mpt_entry, MPT_PA1_H, upper_32_bits(pages[1])); + hr_reg_write(mpt_entry, MPT_PBL_BUF_PG_SZ, + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); return 0; } @@ -3257,7 +3185,6 @@ static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev, void *mb_buf, struct hns_roce_mr *mr) { struct hns_roce_v2_mpt_entry *mpt_entry; - int ret; mpt_entry = mb_buf; memset(mpt_entry, 0, sizeof(*mpt_entry)); @@ -3296,9 +3223,7 @@ static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev, to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift)); hr_reg_enable(mpt_entry, MPT_INNER_PA_VLD); - ret = set_mtpt_pbl(hr_dev, mpt_entry, mr); - - return ret; + return set_mtpt_pbl(hr_dev, mpt_entry, mr); } static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, @@ -3309,24 +3234,19 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, u32 mr_access_flags = mr->access; int ret = 0; - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, - V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_VALID); - - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, - V2_MPT_BYTE_4_PD_S, mr->pd); + hr_reg_write(mpt_entry, MPT_ST, V2_MPT_ST_VALID); + hr_reg_write(mpt_entry, MPT_PD, mr->pd); if (flags & IB_MR_REREG_ACCESS) { - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, - V2_MPT_BYTE_8_BIND_EN_S, + hr_reg_write(mpt_entry, MPT_BIND_EN, (mr_access_flags & IB_ACCESS_MW_BIND ? 1 : 0)); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, - V2_MPT_BYTE_8_ATOMIC_EN_S, + hr_reg_write(mpt_entry, MPT_ATOMIC_EN, mr_access_flags & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S, + hr_reg_write(mpt_entry, MPT_RR_EN, mr_access_flags & IB_ACCESS_REMOTE_READ ? 1 : 0); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S, + hr_reg_write(mpt_entry, MPT_RW_EN, mr_access_flags & IB_ACCESS_REMOTE_WRITE ? 1 : 0); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_LW_EN_S, + hr_reg_write(mpt_entry, MPT_LW_EN, mr_access_flags & IB_ACCESS_LOCAL_WRITE ? 1 : 0); } @@ -3357,37 +3277,28 @@ static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, return -ENOBUFS; } - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, - V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M, - V2_MPT_BYTE_4_PBL_HOP_NUM_S, 1); - roce_set_field(mpt_entry->byte_4_pd_hop_st, - V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, - V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, - to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift)); - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, - V2_MPT_BYTE_4_PD_S, mr->pd); + hr_reg_write(mpt_entry, MPT_ST, V2_MPT_ST_FREE); + hr_reg_write(mpt_entry, MPT_PD, mr->pd); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 1); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); + hr_reg_enable(mpt_entry, MPT_RA_EN); + hr_reg_enable(mpt_entry, MPT_R_INV_EN); + hr_reg_enable(mpt_entry, MPT_L_INV_EN); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_FRE_S, 1); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); + hr_reg_enable(mpt_entry, MPT_FRE); + hr_reg_clear(mpt_entry, MPT_MR_MW); + hr_reg_enable(mpt_entry, MPT_BPD); + hr_reg_clear(mpt_entry, MPT_PA); + + hr_reg_write(mpt_entry, MPT_PBL_HOP_NUM, 1); + hr_reg_write(mpt_entry, MPT_PBL_BA_PG_SZ, + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift)); + hr_reg_write(mpt_entry, MPT_PBL_BUF_PG_SZ, + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); mpt_entry->pbl_size = cpu_to_le32(mr->npages); mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> 3)); - roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M, - V2_MPT_BYTE_48_PBL_BA_H_S, - upper_32_bits(pbl_ba >> 3)); - - roce_set_field(mpt_entry->byte_64_buf_pa1, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, - to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); + hr_reg_write(mpt_entry, MPT_PBL_BA_H, upper_32_bits(pbl_ba >> 3)); return 0; } @@ -3399,36 +3310,29 @@ static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) mpt_entry = mb_buf; memset(mpt_entry, 0, sizeof(*mpt_entry)); - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, - V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, - V2_MPT_BYTE_4_PD_S, mw->pdn); - roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M, - V2_MPT_BYTE_4_PBL_HOP_NUM_S, - mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : - mw->pbl_hop_num); - roce_set_field(mpt_entry->byte_4_pd_hop_st, - V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, - V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, - mw->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + hr_reg_write(mpt_entry, MPT_ST, V2_MPT_ST_FREE); + hr_reg_write(mpt_entry, MPT_PD, mw->pdn); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_LW_EN_S, 1); + hr_reg_enable(mpt_entry, MPT_R_INV_EN); + hr_reg_enable(mpt_entry, MPT_L_INV_EN); + hr_reg_enable(mpt_entry, MPT_LW_EN); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 1); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); - roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BQP_S, + hr_reg_enable(mpt_entry, MPT_MR_MW); + hr_reg_enable(mpt_entry, MPT_BPD); + hr_reg_clear(mpt_entry, MPT_PA); + hr_reg_write(mpt_entry, MPT_BQP, mw->ibmw.type == IB_MW_TYPE_1 ? 0 : 1); - roce_set_field(mpt_entry->byte_64_buf_pa1, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, - mw->pbl_buf_pg_sz + PG_SHIFT_OFFSET); - mpt_entry->lkey = cpu_to_le32(mw->rkey); + hr_reg_write(mpt_entry, MPT_PBL_HOP_NUM, + mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : + mw->pbl_hop_num); + hr_reg_write(mpt_entry, MPT_PBL_BA_PG_SZ, + mw->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + hr_reg_write(mpt_entry, MPT_PBL_BUF_PG_SZ, + mw->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + return 0; } @@ -4966,9 +4870,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, if (ret) return ret; - if (gid_attr) - is_udp = (gid_attr->gid_type == - IB_GID_TYPE_ROCE_UDP_ENCAP); + is_udp = (gid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); } /* Only HIP08 needs to set the vlan_en bits in QPC */ @@ -5949,7 +5851,7 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) (eq->cons_index & (eq->entries - 1)) * eq->eqe_size); - return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^ + return (hr_reg_read(aeqe, AEQE_OWNER) ^ !!(eq->cons_index & eq->entries)) ? aeqe : NULL; } @@ -5969,15 +5871,9 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, */ dma_rmb(); - event_type = roce_get_field(aeqe->asyn, - HNS_ROCE_V2_AEQE_EVENT_TYPE_M, - HNS_ROCE_V2_AEQE_EVENT_TYPE_S); - sub_type = roce_get_field(aeqe->asyn, - HNS_ROCE_V2_AEQE_SUB_TYPE_M, - HNS_ROCE_V2_AEQE_SUB_TYPE_S); - queue_num = roce_get_field(aeqe->event.queue_event.num, - HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, - HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + event_type = hr_reg_read(aeqe, AEQE_EVENT_TYPE); + sub_type = hr_reg_read(aeqe, AEQE_SUB_TYPE); + queue_num = hr_reg_read(aeqe, AEQE_EVENT_QUEUE_NUM); switch (event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: @@ -6037,8 +5933,8 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) (eq->cons_index & (eq->entries - 1)) * eq->eqe_size); - return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^ - (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; + return (hr_reg_read(ceqe, CEQE_OWNER) ^ + !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, @@ -6054,8 +5950,7 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, */ dma_rmb(); - cqn = roce_get_field(ceqe->comp, HNS_ROCE_V2_CEQE_COMP_CQN_M, - HNS_ROCE_V2_CEQE_COMP_CQN_S); + cqn = hr_reg_read(ceqe, CEQE_CQN); hns_roce_cq_completion(hr_dev, cqn); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 0d87b627601e..7ffb7824d268 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -41,7 +41,7 @@ #define HNS_ROCE_V2_MAX_SRQ_WR 0x8000 #define HNS_ROCE_V2_MAX_SRQ_SGE 64 #define HNS_ROCE_V2_MAX_CQ_NUM 0x100000 -#define HNS_ROCE_V2_MAX_CQC_TIMER_NUM 0x100 +#define HNS_ROCE_V2_MAX_CQC_TIMER_BT_NUM 0x100 #define HNS_ROCE_V2_MAX_SRQ_NUM 0x100000 #define HNS_ROCE_V2_MAX_CQE_NUM 0x400000 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM 64 @@ -303,33 +303,6 @@ struct hns_roce_v2_cq_context { #define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 #define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 -#define V2_CQC_BYTE_4_ARM_ST_S 6 -#define V2_CQC_BYTE_4_ARM_ST_M GENMASK(7, 6) - -#define V2_CQC_BYTE_4_CEQN_S 15 -#define V2_CQC_BYTE_4_CEQN_M GENMASK(23, 15) - -#define V2_CQC_BYTE_8_CQN_S 0 -#define V2_CQC_BYTE_8_CQN_M GENMASK(23, 0) - -#define V2_CQC_BYTE_16_CQE_HOP_NUM_S 30 -#define V2_CQC_BYTE_16_CQE_HOP_NUM_M GENMASK(31, 30) - -#define V2_CQC_BYTE_28_CQ_PRODUCER_IDX_S 0 -#define V2_CQC_BYTE_28_CQ_PRODUCER_IDX_M GENMASK(23, 0) - -#define V2_CQC_BYTE_32_CQ_CONSUMER_IDX_S 0 -#define V2_CQC_BYTE_32_CQ_CONSUMER_IDX_M GENMASK(23, 0) - -#define V2_CQC_BYTE_52_CQE_CNT_S 0 -#define V2_CQC_BYTE_52_CQE_CNT_M GENMASK(23, 0) - -#define V2_CQC_BYTE_56_CQ_MAX_CNT_S 0 -#define V2_CQC_BYTE_56_CQ_MAX_CNT_M GENMASK(15, 0) - -#define V2_CQC_BYTE_56_CQ_PERIOD_S 16 -#define V2_CQC_BYTE_56_CQ_PERIOD_M GENMASK(31, 16) - #define CQC_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_cq_context, h, l) #define CQC_CQ_ST CQC_FIELD_LOC(1, 0) @@ -788,12 +761,15 @@ struct hns_roce_v2_mpt_entry { #define MPT_LKEY MPT_FIELD_LOC(223, 192) #define MPT_VA MPT_FIELD_LOC(287, 224) #define MPT_PBL_SIZE MPT_FIELD_LOC(319, 288) -#define MPT_PBL_BA MPT_FIELD_LOC(380, 320) +#define MPT_PBL_BA_L MPT_FIELD_LOC(351, 320) +#define MPT_PBL_BA_H MPT_FIELD_LOC(380, 352) #define MPT_BLK_MODE MPT_FIELD_LOC(381, 381) #define MPT_RSV0 MPT_FIELD_LOC(383, 382) -#define MPT_PA0 MPT_FIELD_LOC(441, 384) +#define MPT_PA0_L MPT_FIELD_LOC(415, 384) +#define MPT_PA0_H MPT_FIELD_LOC(441, 416) #define MPT_BOUND_VA MPT_FIELD_LOC(447, 442) -#define MPT_PA1 MPT_FIELD_LOC(505, 448) +#define MPT_PA1_L MPT_FIELD_LOC(479, 448) +#define MPT_PA1_H MPT_FIELD_LOC(505, 480) #define MPT_PERSIST_EN MPT_FIELD_LOC(506, 506) #define MPT_RSV2 MPT_FIELD_LOC(507, 507) #define MPT_PBL_BUF_PG_SZ MPT_FIELD_LOC(511, 508) @@ -899,48 +875,24 @@ struct hns_roce_v2_ud_send_wqe { u8 dgid[GID_LEN_V2]; }; -#define V2_UD_SEND_WQE_BYTE_4_OPCODE_S 0 -#define V2_UD_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) +#define UD_SEND_WQE_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_ud_send_wqe, h, l) -#define V2_UD_SEND_WQE_BYTE_4_OWNER_S 7 - -#define V2_UD_SEND_WQE_BYTE_4_CQE_S 8 - -#define V2_UD_SEND_WQE_BYTE_4_SE_S 11 - -#define V2_UD_SEND_WQE_BYTE_16_PD_S 0 -#define V2_UD_SEND_WQE_BYTE_16_PD_M GENMASK(23, 0) - -#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S 24 -#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24) - -#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 -#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) - -#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_S 16 -#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_M GENMASK(31, 16) - -#define V2_UD_SEND_WQE_BYTE_32_DQPN_S 0 -#define V2_UD_SEND_WQE_BYTE_32_DQPN_M GENMASK(23, 0) - -#define V2_UD_SEND_WQE_BYTE_36_VLAN_S 0 -#define V2_UD_SEND_WQE_BYTE_36_VLAN_M GENMASK(15, 0) - -#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S 16 -#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M GENMASK(23, 16) - -#define V2_UD_SEND_WQE_BYTE_36_TCLASS_S 24 -#define V2_UD_SEND_WQE_BYTE_36_TCLASS_M GENMASK(31, 24) - -#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S 0 -#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M GENMASK(19, 0) - -#define V2_UD_SEND_WQE_BYTE_40_SL_S 20 -#define V2_UD_SEND_WQE_BYTE_40_SL_M GENMASK(23, 20) - -#define V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S 30 - -#define V2_UD_SEND_WQE_BYTE_40_LBI_S 31 +#define UD_SEND_WQE_OPCODE UD_SEND_WQE_FIELD_LOC(4, 0) +#define UD_SEND_WQE_OWNER UD_SEND_WQE_FIELD_LOC(7, 7) +#define UD_SEND_WQE_CQE UD_SEND_WQE_FIELD_LOC(8, 8) +#define UD_SEND_WQE_SE UD_SEND_WQE_FIELD_LOC(11, 11) +#define UD_SEND_WQE_PD UD_SEND_WQE_FIELD_LOC(119, 96) +#define UD_SEND_WQE_SGE_NUM UD_SEND_WQE_FIELD_LOC(127, 120) +#define UD_SEND_WQE_MSG_START_SGE_IDX UD_SEND_WQE_FIELD_LOC(151, 128) +#define UD_SEND_WQE_UDPSPN UD_SEND_WQE_FIELD_LOC(191, 176) +#define UD_SEND_WQE_DQPN UD_SEND_WQE_FIELD_LOC(247, 224) +#define UD_SEND_WQE_VLAN UD_SEND_WQE_FIELD_LOC(271, 256) +#define UD_SEND_WQE_HOPLIMIT UD_SEND_WQE_FIELD_LOC(279, 272) +#define UD_SEND_WQE_TCLASS UD_SEND_WQE_FIELD_LOC(287, 280) +#define UD_SEND_WQE_FLOW_LABEL UD_SEND_WQE_FIELD_LOC(307, 288) +#define UD_SEND_WQE_SL UD_SEND_WQE_FIELD_LOC(311, 308) +#define UD_SEND_WQE_VLAN_EN UD_SEND_WQE_FIELD_LOC(318, 318) +#define UD_SEND_WQE_LBI UD_SEND_WQE_FIELD_LOC(319, 319) struct hns_roce_v2_rc_send_wqe { __le32 byte_4; @@ -955,42 +907,23 @@ struct hns_roce_v2_rc_send_wqe { __le64 va; }; -#define V2_RC_SEND_WQE_BYTE_4_OPCODE_S 0 -#define V2_RC_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) +#define RC_SEND_WQE_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_v2_rc_send_wqe, h, l) -#define V2_RC_SEND_WQE_BYTE_4_DB_SL_L_S 5 -#define V2_RC_SEND_WQE_BYTE_4_DB_SL_L_M GENMASK(6, 5) - -#define V2_RC_SEND_WQE_BYTE_4_DB_SL_H_S 13 -#define V2_RC_SEND_WQE_BYTE_4_DB_SL_H_M GENMASK(14, 13) - -#define V2_RC_SEND_WQE_BYTE_4_WQE_INDEX_S 15 -#define V2_RC_SEND_WQE_BYTE_4_WQE_INDEX_M GENMASK(30, 15) - -#define V2_RC_SEND_WQE_BYTE_4_OWNER_S 7 - -#define V2_RC_SEND_WQE_BYTE_4_CQE_S 8 - -#define V2_RC_SEND_WQE_BYTE_4_FENCE_S 9 - -#define V2_RC_SEND_WQE_BYTE_4_SO_S 10 - -#define V2_RC_SEND_WQE_BYTE_4_SE_S 11 - -#define V2_RC_SEND_WQE_BYTE_4_INLINE_S 12 - -#define V2_RC_SEND_WQE_BYTE_4_FLAG_S 31 - -#define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_S 0 -#define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_M GENMASK(23, 0) - -#define V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S 24 -#define V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24) - -#define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 -#define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) - -#define V2_RC_SEND_WQE_BYTE_20_INL_TYPE_S 31 +#define RC_SEND_WQE_OPCODE RC_SEND_WQE_FIELD_LOC(4, 0) +#define RC_SEND_WQE_DB_SL_L RC_SEND_WQE_FIELD_LOC(6, 5) +#define RC_SEND_WQE_DB_SL_H RC_SEND_WQE_FIELD_LOC(14, 13) +#define RC_SEND_WQE_OWNER RC_SEND_WQE_FIELD_LOC(7, 7) +#define RC_SEND_WQE_CQE RC_SEND_WQE_FIELD_LOC(8, 8) +#define RC_SEND_WQE_FENCE RC_SEND_WQE_FIELD_LOC(9, 9) +#define RC_SEND_WQE_SO RC_SEND_WQE_FIELD_LOC(10, 10) +#define RC_SEND_WQE_SE RC_SEND_WQE_FIELD_LOC(11, 11) +#define RC_SEND_WQE_INLINE RC_SEND_WQE_FIELD_LOC(12, 12) +#define RC_SEND_WQE_WQE_INDEX RC_SEND_WQE_FIELD_LOC(30, 15) +#define RC_SEND_WQE_FLAG RC_SEND_WQE_FIELD_LOC(31, 31) +#define RC_SEND_WQE_XRC_SRQN RC_SEND_WQE_FIELD_LOC(119, 96) +#define RC_SEND_WQE_SGE_NUM RC_SEND_WQE_FIELD_LOC(127, 120) +#define RC_SEND_WQE_MSG_START_SGE_IDX RC_SEND_WQE_FIELD_LOC(151, 128) +#define RC_SEND_WQE_INL_TYPE RC_SEND_WQE_FIELD_LOC(159, 159) struct hns_roce_wqe_frmr_seg { __le32 pbl_size; @@ -1033,7 +966,10 @@ struct hns_roce_func_clear { __le32 rsv[4]; }; -#define FUNC_CLEAR_RST_FUN_DONE_S 0 +#define FUNC_CLEAR_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_func_clear, h, l) + +#define FUNC_CLEAR_RST_FUN_DONE FUNC_CLEAR_FIELD_LOC(32, 32) + /* Each physical function manages up to 248 virtual functions, it takes up to * 100ms for each function to execute clear. If an abnormal reset occurs, it is * executed twice at most, so it takes up to 249 * 2 * 100ms. @@ -1112,12 +1048,12 @@ struct hns_roce_vf_switch { __le32 resv3; }; -#define VF_SWITCH_DATA_FUN_ID_VF_ID_S 3 -#define VF_SWITCH_DATA_FUN_ID_VF_ID_M GENMASK(10, 3) +#define VF_SWITCH_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_vf_switch, h, l) -#define VF_SWITCH_DATA_CFG_ALW_LPBK_S 1 -#define VF_SWITCH_DATA_CFG_ALW_LCL_LPBK_S 2 -#define VF_SWITCH_DATA_CFG_ALW_DST_OVRD_S 3 +#define VF_SWITCH_VF_ID VF_SWITCH_FIELD_LOC(42, 35) +#define VF_SWITCH_ALW_LPBK VF_SWITCH_FIELD_LOC(65, 65) +#define VF_SWITCH_ALW_LCL_LPBK VF_SWITCH_FIELD_LOC(66, 66) +#define VF_SWITCH_ALW_DST_OVRD VF_SWITCH_FIELD_LOC(67, 67) struct hns_roce_post_mbox { __le32 in_param_l; @@ -1180,11 +1116,10 @@ struct hns_roce_cfg_sgid_tb { __le32 vf_sgid_type_rsv; }; -#define CFG_SGID_TB_TABLE_IDX_S 0 -#define CFG_SGID_TB_TABLE_IDX_M GENMASK(7, 0) +#define SGID_TB_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_cfg_sgid_tb, h, l) -#define CFG_SGID_TB_VF_SGID_TYPE_S 0 -#define CFG_SGID_TB_VF_SGID_TYPE_M GENMASK(1, 0) +#define CFG_SGID_TB_TABLE_IDX SGID_TB_FIELD_LOC(7, 0) +#define CFG_SGID_TB_VF_SGID_TYPE SGID_TB_FIELD_LOC(161, 160) struct hns_roce_cfg_smac_tb { __le32 tb_idx_rsv; @@ -1192,11 +1127,11 @@ struct hns_roce_cfg_smac_tb { __le32 vf_smac_h_rsv; __le32 rsv[3]; }; -#define CFG_SMAC_TB_IDX_S 0 -#define CFG_SMAC_TB_IDX_M GENMASK(7, 0) -#define CFG_SMAC_TB_VF_SMAC_H_S 0 -#define CFG_SMAC_TB_VF_SMAC_H_M GENMASK(15, 0) +#define SMAC_TB_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_cfg_smac_tb, h, l) + +#define CFG_SMAC_TB_IDX SMAC_TB_FIELD_LOC(7, 0) +#define CFG_SMAC_TB_VF_SMAC_H SMAC_TB_FIELD_LOC(79, 64) struct hns_roce_cfg_gmv_tb_a { __le32 vf_sgid_l; @@ -1207,16 +1142,11 @@ struct hns_roce_cfg_gmv_tb_a { __le32 resv; }; -#define CFG_GMV_TB_SGID_IDX_S 0 -#define CFG_GMV_TB_SGID_IDX_M GENMASK(7, 0) +#define GMV_TB_A_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_cfg_gmv_tb_a, h, l) -#define CFG_GMV_TB_VF_SGID_TYPE_S 0 -#define CFG_GMV_TB_VF_SGID_TYPE_M GENMASK(1, 0) - -#define CFG_GMV_TB_VF_VLAN_EN_S 2 - -#define CFG_GMV_TB_VF_VLAN_ID_S 16 -#define CFG_GMV_TB_VF_VLAN_ID_M GENMASK(27, 16) +#define GMV_TB_A_VF_SGID_TYPE GMV_TB_A_FIELD_LOC(129, 128) +#define GMV_TB_A_VF_VLAN_EN GMV_TB_A_FIELD_LOC(130, 130) +#define GMV_TB_A_VF_VLAN_ID GMV_TB_A_FIELD_LOC(155, 144) struct hns_roce_cfg_gmv_tb_b { __le32 vf_smac_l; @@ -1225,8 +1155,10 @@ struct hns_roce_cfg_gmv_tb_b { __le32 resv[3]; }; -#define CFG_GMV_TB_SMAC_H_S 0 -#define CFG_GMV_TB_SMAC_H_M GENMASK(15, 0) +#define GMV_TB_B_FIELD_LOC(h, l) FIELD_LOC(struct hns_roce_cfg_gmv_tb_b, h, l) + +#define GMV_TB_B_SMAC_H GMV_TB_B_FIELD_LOC(47, 32) +#define GMV_TB_B_SGID_IDX GMV_TB_B_FIELD_LOC(71, 64) #define HNS_ROCE_QUERY_PF_CAPS_CMD_NUM 5 struct hns_roce_query_pf_caps_a { @@ -1278,29 +1210,17 @@ struct hns_roce_query_pf_caps_c { __le16 rq_depth; }; -#define V2_QUERY_PF_CAPS_C_NUM_PDS_S 0 -#define V2_QUERY_PF_CAPS_C_NUM_PDS_M GENMASK(19, 0) +#define PF_CAPS_C_FIELD_LOC(h, l) \ + FIELD_LOC(struct hns_roce_query_pf_caps_c, h, l) -#define V2_QUERY_PF_CAPS_C_CAP_FLAGS_S 20 -#define V2_QUERY_PF_CAPS_C_CAP_FLAGS_M GENMASK(31, 20) - -#define V2_QUERY_PF_CAPS_C_NUM_CQS_S 0 -#define V2_QUERY_PF_CAPS_C_NUM_CQS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_C_MAX_GID_S 20 -#define V2_QUERY_PF_CAPS_C_MAX_GID_M GENMASK(28, 20) - -#define V2_QUERY_PF_CAPS_C_CQ_DEPTH_S 0 -#define V2_QUERY_PF_CAPS_C_CQ_DEPTH_M GENMASK(22, 0) - -#define V2_QUERY_PF_CAPS_C_NUM_MRWS_S 0 -#define V2_QUERY_PF_CAPS_C_NUM_MRWS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_C_NUM_QPS_S 0 -#define V2_QUERY_PF_CAPS_C_NUM_QPS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_C_MAX_ORD_S 20 -#define V2_QUERY_PF_CAPS_C_MAX_ORD_M GENMASK(27, 20) +#define PF_CAPS_C_NUM_PDS PF_CAPS_C_FIELD_LOC(19, 0) +#define PF_CAPS_C_CAP_FLAGS PF_CAPS_C_FIELD_LOC(31, 20) +#define PF_CAPS_C_NUM_CQS PF_CAPS_C_FIELD_LOC(51, 32) +#define PF_CAPS_C_MAX_GID PF_CAPS_C_FIELD_LOC(60, 52) +#define PF_CAPS_C_CQ_DEPTH PF_CAPS_C_FIELD_LOC(86, 64) +#define PF_CAPS_C_NUM_MRWS PF_CAPS_C_FIELD_LOC(115, 96) +#define PF_CAPS_C_NUM_QPS PF_CAPS_C_FIELD_LOC(147, 128) +#define PF_CAPS_C_MAX_ORD PF_CAPS_C_FIELD_LOC(155, 148) struct hns_roce_query_pf_caps_d { __le32 wq_hop_num_max_srqs; @@ -1311,20 +1231,26 @@ struct hns_roce_query_pf_caps_d { __le32 num_uars_rsv_pds; __le32 rsv_uars_rsv_qps; }; -#define V2_QUERY_PF_CAPS_D_NUM_SRQS_S 0 -#define V2_QUERY_PF_CAPS_D_NUM_SRQS_M GENMASK(19, 0) -#define V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_S 20 -#define V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_M GENMASK(21, 20) +#define PF_CAPS_D_FIELD_LOC(h, l) \ + FIELD_LOC(struct hns_roce_query_pf_caps_d, h, l) -#define V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_S 22 -#define V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_M GENMASK(23, 22) +#define PF_CAPS_D_NUM_SRQS PF_CAPS_D_FIELD_LOC(19, 0) +#define PF_CAPS_D_RQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(21, 20) +#define PF_CAPS_D_EX_SGE_HOP_NUM PF_CAPS_D_FIELD_LOC(23, 22) +#define PF_CAPS_D_SQWQE_HOP_NUM PF_CAPS_D_FIELD_LOC(25, 24) +#define PF_CAPS_D_CONG_TYPE PF_CAPS_D_FIELD_LOC(29, 26) +#define PF_CAPS_D_CEQ_DEPTH PF_CAPS_D_FIELD_LOC(85, 64) +#define PF_CAPS_D_NUM_CEQS PF_CAPS_D_FIELD_LOC(95, 86) +#define PF_CAPS_D_AEQ_DEPTH PF_CAPS_D_FIELD_LOC(117, 96) +#define PF_CAPS_D_AEQ_ARM_ST PF_CAPS_D_FIELD_LOC(119, 118) +#define PF_CAPS_D_CEQ_ARM_ST PF_CAPS_D_FIELD_LOC(121, 120) +#define PF_CAPS_D_RSV_PDS PF_CAPS_D_FIELD_LOC(147, 128) +#define PF_CAPS_D_NUM_UARS PF_CAPS_D_FIELD_LOC(155, 148) +#define PF_CAPS_D_RSV_QPS PF_CAPS_D_FIELD_LOC(179, 160) +#define PF_CAPS_D_RSV_UARS PF_CAPS_D_FIELD_LOC(187, 180) -#define V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_S 24 -#define V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_M GENMASK(25, 24) - -#define V2_QUERY_PF_CAPS_D_CONG_TYPE_S 26 -#define V2_QUERY_PF_CAPS_D_CONG_TYPE_M GENMASK(29, 26) +#define HNS_ROCE_CAP_FLAGS_EX_SHIFT 12 struct hns_roce_congestion_algorithm { u8 alg_sel; @@ -1333,33 +1259,6 @@ struct hns_roce_congestion_algorithm { u8 wnd_mode_sel; }; -#define V2_QUERY_PF_CAPS_D_CEQ_DEPTH_S 0 -#define V2_QUERY_PF_CAPS_D_CEQ_DEPTH_M GENMASK(21, 0) - -#define V2_QUERY_PF_CAPS_D_NUM_CEQS_S 22 -#define V2_QUERY_PF_CAPS_D_NUM_CEQS_M GENMASK(31, 22) - -#define V2_QUERY_PF_CAPS_D_AEQ_DEPTH_S 0 -#define V2_QUERY_PF_CAPS_D_AEQ_DEPTH_M GENMASK(21, 0) - -#define V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_S 22 -#define V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_M GENMASK(23, 22) - -#define V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_S 24 -#define V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_M GENMASK(25, 24) - -#define V2_QUERY_PF_CAPS_D_RSV_PDS_S 0 -#define V2_QUERY_PF_CAPS_D_RSV_PDS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_D_NUM_UARS_S 20 -#define V2_QUERY_PF_CAPS_D_NUM_UARS_M GENMASK(27, 20) - -#define V2_QUERY_PF_CAPS_D_RSV_QPS_S 0 -#define V2_QUERY_PF_CAPS_D_RSV_QPS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_D_RSV_UARS_S 20 -#define V2_QUERY_PF_CAPS_D_RSV_UARS_M GENMASK(27, 20) - struct hns_roce_query_pf_caps_e { __le32 chunk_size_shift_rsv_mrws; __le32 rsv_cqs; @@ -1371,20 +1270,14 @@ struct hns_roce_query_pf_caps_e { __le16 aeq_period; }; -#define V2_QUERY_PF_CAPS_E_RSV_MRWS_S 0 -#define V2_QUERY_PF_CAPS_E_RSV_MRWS_M GENMASK(19, 0) +#define PF_CAPS_E_FIELD_LOC(h, l) \ + FIELD_LOC(struct hns_roce_query_pf_caps_e, h, l) -#define V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_S 20 -#define V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_M GENMASK(31, 20) - -#define V2_QUERY_PF_CAPS_E_RSV_CQS_S 0 -#define V2_QUERY_PF_CAPS_E_RSV_CQS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_E_RSV_SRQS_S 0 -#define V2_QUERY_PF_CAPS_E_RSV_SRQS_M GENMASK(19, 0) - -#define V2_QUERY_PF_CAPS_E_RSV_LKEYS_S 0 -#define V2_QUERY_PF_CAPS_E_RSV_LKEYS_M GENMASK(19, 0) +#define PF_CAPS_E_RSV_MRWS PF_CAPS_E_FIELD_LOC(19, 0) +#define PF_CAPS_E_CHUNK_SIZE_SHIFT PF_CAPS_E_FIELD_LOC(31, 20) +#define PF_CAPS_E_RSV_CQS PF_CAPS_E_FIELD_LOC(51, 32) +#define PF_CAPS_E_RSV_SRQS PF_CAPS_E_FIELD_LOC(83, 64) +#define PF_CAPS_E_RSV_LKEYS PF_CAPS_E_FIELD_LOC(115, 96) struct hns_roce_cmq_req { __le32 data[6]; @@ -1485,9 +1378,6 @@ struct hns_roce_dip { #define HNS_ROCE_EQ_INIT_CONS_IDX 0 #define HNS_ROCE_EQ_INIT_NXT_EQE_BA 0 -#define HNS_ROCE_V2_CEQ_CEQE_OWNER_S 31 -#define HNS_ROCE_V2_AEQ_AEQE_OWNER_S 31 - #define HNS_ROCE_V2_COMP_EQE_NUM 0x1000 #define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 @@ -1544,18 +1434,6 @@ struct hns_roce_eq_context { #define EQC_NEX_EQE_BA_H EQC_FIELD_LOC(339, 320) #define EQC_EQE_SIZE EQC_FIELD_LOC(341, 340) -#define HNS_ROCE_V2_CEQE_COMP_CQN_S 0 -#define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0) - -#define HNS_ROCE_V2_AEQE_EVENT_TYPE_S 0 -#define HNS_ROCE_V2_AEQE_EVENT_TYPE_M GENMASK(7, 0) - -#define HNS_ROCE_V2_AEQE_SUB_TYPE_S 8 -#define HNS_ROCE_V2_AEQE_SUB_TYPE_M GENMASK(15, 8) - -#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0 -#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0) - #define MAX_SERVICE_LEVEL 0x7 struct hns_roce_wqe_atomic_seg { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index f73ba619f375..c8af4ebd7cbd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -737,7 +737,7 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) ret = hns_roce_init_hem_table(hr_dev, &hr_dev->cqc_timer_table, HEM_TYPE_CQC_TIMER, hr_dev->caps.cqc_timer_entry_sz, - hr_dev->caps.num_cqc_timer, 1); + hr_dev->caps.cqc_timer_bt_num, 1); if (ret) { dev_err(dev, "Failed to init CQC timer memory, aborting.\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index b389738d157f..867972c2a894 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -340,7 +340,6 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); struct hns_roce_mr *mr = to_hr_mr(ibmr); - int ret = 0; if (hr_dev->hw->dereg_mr) hr_dev->hw->dereg_mr(hr_dev); @@ -348,7 +347,7 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) hns_roce_mr_free(hr_dev, mr); kfree(mr); - return ret; + return 0; } struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index d78373e10aab..48d3616a6d71 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -243,26 +243,6 @@ static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) return 0; } -enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state) -{ - switch (state) { - case IB_QPS_RESET: - return HNS_ROCE_QP_STATE_RST; - case IB_QPS_INIT: - return HNS_ROCE_QP_STATE_INIT; - case IB_QPS_RTR: - return HNS_ROCE_QP_STATE_RTR; - case IB_QPS_RTS: - return HNS_ROCE_QP_STATE_RTS; - case IB_QPS_SQD: - return HNS_ROCE_QP_STATE_SQD; - case IB_QPS_ERR: - return HNS_ROCE_QP_STATE_ERR; - default: - return HNS_ROCE_QP_NUM_STATE; - } -} - static void add_qp_to_list(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_cq *send_cq, struct ib_cq *recv_cq) diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c index 259444c0a630..24a154d64630 100644 --- a/drivers/infiniband/hw/hns/hns_roce_restrack.c +++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c @@ -13,61 +13,40 @@ static int hns_roce_fill_cq(struct sk_buff *msg, struct hns_roce_v2_cq_context *context) { if (rdma_nl_put_driver_u32(msg, "state", - roce_get_field(context->byte_4_pg_ceqn, - V2_CQC_BYTE_4_ARM_ST_M, - V2_CQC_BYTE_4_ARM_ST_S))) + hr_reg_read(context, CQC_ARM_ST))) + goto err; if (rdma_nl_put_driver_u32(msg, "ceqn", - roce_get_field(context->byte_4_pg_ceqn, - V2_CQC_BYTE_4_CEQN_M, - V2_CQC_BYTE_4_CEQN_S))) + hr_reg_read(context, CQC_CEQN))) goto err; if (rdma_nl_put_driver_u32(msg, "cqn", - roce_get_field(context->byte_8_cqn, - V2_CQC_BYTE_8_CQN_M, - V2_CQC_BYTE_8_CQN_S))) + hr_reg_read(context, CQC_CQN))) goto err; if (rdma_nl_put_driver_u32(msg, "hopnum", - roce_get_field(context->byte_16_hop_addr, - V2_CQC_BYTE_16_CQE_HOP_NUM_M, - V2_CQC_BYTE_16_CQE_HOP_NUM_S))) + hr_reg_read(context, CQC_CQE_HOP_NUM))) goto err; - if (rdma_nl_put_driver_u32( - msg, "pi", - roce_get_field(context->byte_28_cq_pi, - V2_CQC_BYTE_28_CQ_PRODUCER_IDX_M, - V2_CQC_BYTE_28_CQ_PRODUCER_IDX_S))) + if (rdma_nl_put_driver_u32(msg, "pi", + hr_reg_read(context, CQC_CQ_PRODUCER_IDX))) goto err; - if (rdma_nl_put_driver_u32( - msg, "ci", - roce_get_field(context->byte_32_cq_ci, - V2_CQC_BYTE_32_CQ_CONSUMER_IDX_M, - V2_CQC_BYTE_32_CQ_CONSUMER_IDX_S))) + if (rdma_nl_put_driver_u32(msg, "ci", + hr_reg_read(context, CQC_CQ_CONSUMER_IDX))) goto err; - if (rdma_nl_put_driver_u32( - msg, "coalesce", - roce_get_field(context->byte_56_cqe_period_maxcnt, - V2_CQC_BYTE_56_CQ_MAX_CNT_M, - V2_CQC_BYTE_56_CQ_MAX_CNT_S))) + if (rdma_nl_put_driver_u32(msg, "coalesce", + hr_reg_read(context, CQC_CQ_MAX_CNT))) goto err; - if (rdma_nl_put_driver_u32( - msg, "period", - roce_get_field(context->byte_56_cqe_period_maxcnt, - V2_CQC_BYTE_56_CQ_PERIOD_M, - V2_CQC_BYTE_56_CQ_PERIOD_S))) + if (rdma_nl_put_driver_u32(msg, "period", + hr_reg_read(context, CQC_CQ_PERIOD))) goto err; if (rdma_nl_put_driver_u32(msg, "cnt", - roce_get_field(context->byte_52_cqe_cnt, - V2_CQC_BYTE_52_CQE_CNT_M, - V2_CQC_BYTE_52_CQE_CNT_S))) + hr_reg_read(context, CQC_CQE_CNT))) goto err; return 0; diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index 3dc9b5801da1..dd3943d22dc6 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -61,7 +61,7 @@ static void irdma_iwarp_ce_handler(struct irdma_sc_cq *iwcq) struct irdma_cq *cq = iwcq->back_cq; if (!cq->user_mode) - cq->armed = false; + atomic_set(&cq->armed, 0); if (cq->ibcq.comp_handler) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); } @@ -1827,10 +1827,6 @@ int irdma_rt_init_hw(struct irdma_device *iwdev, rf->rsrc_created = true; } - iwdev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | - IB_DEVICE_MEM_WINDOW | - IB_DEVICE_MEM_MGT_EXTENSIONS; - if (iwdev->rf->sc_dev.hw_attrs.uk_attrs.hw_rev == IRDMA_GEN_1) irdma_alloc_set_mac(iwdev); irdma_add_ip(iwdev); @@ -2693,24 +2689,29 @@ void irdma_flush_wqes(struct irdma_qp *iwqp, u32 flush_mask) info.sq = flush_mask & IRDMA_FLUSH_SQ; info.rq = flush_mask & IRDMA_FLUSH_RQ; - if (flush_mask & IRDMA_REFLUSH) { - if (info.sq) - iwqp->sc_qp.flush_sq = false; - if (info.rq) - iwqp->sc_qp.flush_rq = false; - } - /* Generate userflush errors in CQE */ info.sq_major_code = IRDMA_FLUSH_MAJOR_ERR; info.sq_minor_code = FLUSH_GENERAL_ERR; info.rq_major_code = IRDMA_FLUSH_MAJOR_ERR; info.rq_minor_code = FLUSH_GENERAL_ERR; info.userflushcode = true; - if (flush_code) { - if (info.sq && iwqp->sc_qp.sq_flush_code) - info.sq_minor_code = flush_code; - if (info.rq && iwqp->sc_qp.rq_flush_code) - info.rq_minor_code = flush_code; + + if (flush_mask & IRDMA_REFLUSH) { + if (info.sq) + iwqp->sc_qp.flush_sq = false; + if (info.rq) + iwqp->sc_qp.flush_rq = false; + } else { + if (flush_code) { + if (info.sq && iwqp->sc_qp.sq_flush_code) + info.sq_minor_code = flush_code; + if (info.rq && iwqp->sc_qp.rq_flush_code) + info.rq_minor_code = flush_code; + } + if (!iwqp->user_mode) + queue_delayed_work(iwqp->iwdev->cleanup_wq, + &iwqp->dwork_flush, + msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } /* Issue flush */ diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index 5123f5feaa2f..ef862bced20f 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -338,7 +338,6 @@ struct irdma_device { u32 roce_ackcreds; u32 vendor_id; u32 vendor_part_id; - u32 device_cap_flags; u32 push_mode; u32 rcv_wnd; u16 mac_ip_table_idx; diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c index 397f3d070f90..4ec9639f1bdb 100644 --- a/drivers/infiniband/hw/irdma/puda.c +++ b/drivers/infiniband/hw/irdma/puda.c @@ -191,7 +191,6 @@ static void irdma_puda_dele_buf(struct irdma_sc_dev *dev, static __le64 *irdma_puda_get_next_send_wqe(struct irdma_qp_uk *qp, u32 *wqe_idx) { - __le64 *wqe = NULL; int ret_code = 0; *wqe_idx = IRDMA_RING_CURRENT_HEAD(qp->sq_ring); @@ -199,11 +198,9 @@ static __le64 *irdma_puda_get_next_send_wqe(struct irdma_qp_uk *qp, qp->swqe_polarity = !qp->swqe_polarity; IRDMA_RING_MOVE_HEAD(qp->sq_ring, ret_code); if (ret_code) - return wqe; + return NULL; - wqe = qp->sq_base[*wqe_idx].elem; - - return wqe; + return qp->sq_base[*wqe_idx].elem; } /** diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 81760415d66c..ab3c5208a123 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -2495,3 +2495,150 @@ bool irdma_cq_empty(struct irdma_cq *iwcq) return polarity != ukcq->polarity; } + +void irdma_remove_cmpls_list(struct irdma_cq *iwcq) +{ + struct irdma_cmpl_gen *cmpl_node; + struct list_head *tmp_node, *list_node; + + list_for_each_safe (list_node, tmp_node, &iwcq->cmpl_generated) { + cmpl_node = list_entry(list_node, struct irdma_cmpl_gen, list); + list_del(&cmpl_node->list); + kfree(cmpl_node); + } +} + +int irdma_generated_cmpls(struct irdma_cq *iwcq, struct irdma_cq_poll_info *cq_poll_info) +{ + struct irdma_cmpl_gen *cmpl; + + if (list_empty(&iwcq->cmpl_generated)) + return -ENOENT; + cmpl = list_first_entry_or_null(&iwcq->cmpl_generated, struct irdma_cmpl_gen, list); + list_del(&cmpl->list); + memcpy(cq_poll_info, &cmpl->cpi, sizeof(*cq_poll_info)); + kfree(cmpl); + + ibdev_dbg(iwcq->ibcq.device, + "VERBS: %s: Poll artificially generated completion for QP 0x%X, op %u, wr_id=0x%llx\n", + __func__, cq_poll_info->qp_id, cq_poll_info->op_type, + cq_poll_info->wr_id); + + return 0; +} + +/** + * irdma_set_cpi_common_values - fill in values for polling info struct + * @cpi: resulting structure of cq_poll_info type + * @qp: QPair + * @qp_num: id of the QP + */ +static void irdma_set_cpi_common_values(struct irdma_cq_poll_info *cpi, + struct irdma_qp_uk *qp, u32 qp_num) +{ + cpi->comp_status = IRDMA_COMPL_STATUS_FLUSHED; + cpi->error = true; + cpi->major_err = IRDMA_FLUSH_MAJOR_ERR; + cpi->minor_err = FLUSH_GENERAL_ERR; + cpi->qp_handle = (irdma_qp_handle)(uintptr_t)qp; + cpi->qp_id = qp_num; +} + +static inline void irdma_comp_handler(struct irdma_cq *cq) +{ + if (!cq->ibcq.comp_handler) + return; + if (atomic_cmpxchg(&cq->armed, 1, 0)) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); +} + +void irdma_generate_flush_completions(struct irdma_qp *iwqp) +{ + struct irdma_qp_uk *qp = &iwqp->sc_qp.qp_uk; + struct irdma_ring *sq_ring = &qp->sq_ring; + struct irdma_ring *rq_ring = &qp->rq_ring; + struct irdma_cmpl_gen *cmpl; + __le64 *sw_wqe; + u64 wqe_qword; + u32 wqe_idx; + bool compl_generated = false; + unsigned long flags1; + + spin_lock_irqsave(&iwqp->iwscq->lock, flags1); + if (irdma_cq_empty(iwqp->iwscq)) { + unsigned long flags2; + + spin_lock_irqsave(&iwqp->lock, flags2); + while (IRDMA_RING_MORE_WORK(*sq_ring)) { + cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); + if (!cmpl) { + spin_unlock_irqrestore(&iwqp->lock, flags2); + spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + return; + } + + wqe_idx = sq_ring->tail; + irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id); + + cmpl->cpi.wr_id = qp->sq_wrtrk_array[wqe_idx].wrid; + sw_wqe = qp->sq_base[wqe_idx].elem; + get_64bit_val(sw_wqe, 24, &wqe_qword); + cmpl->cpi.op_type = (u8)FIELD_GET(IRDMAQPSQ_OPCODE, IRDMAQPSQ_OPCODE); + /* remove the SQ WR by moving SQ tail*/ + IRDMA_RING_SET_TAIL(*sq_ring, + sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta); + + ibdev_dbg(iwqp->iwscq->ibcq.device, + "DEV: %s: adding wr_id = 0x%llx SQ Completion to list qp_id=%d\n", + __func__, cmpl->cpi.wr_id, qp->qp_id); + list_add_tail(&cmpl->list, &iwqp->iwscq->cmpl_generated); + compl_generated = true; + } + spin_unlock_irqrestore(&iwqp->lock, flags2); + spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + if (compl_generated) + irdma_comp_handler(iwqp->iwrcq); + } else { + spin_unlock_irqrestore(&iwqp->iwscq->lock, flags1); + mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, + msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); + } + + spin_lock_irqsave(&iwqp->iwrcq->lock, flags1); + if (irdma_cq_empty(iwqp->iwrcq)) { + unsigned long flags2; + + spin_lock_irqsave(&iwqp->lock, flags2); + while (IRDMA_RING_MORE_WORK(*rq_ring)) { + cmpl = kzalloc(sizeof(*cmpl), GFP_ATOMIC); + if (!cmpl) { + spin_unlock_irqrestore(&iwqp->lock, flags2); + spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + return; + } + + wqe_idx = rq_ring->tail; + irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id); + + cmpl->cpi.wr_id = qp->rq_wrid_array[wqe_idx]; + cmpl->cpi.op_type = IRDMA_OP_TYPE_REC; + /* remove the RQ WR by moving RQ tail */ + IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1); + ibdev_dbg(iwqp->iwrcq->ibcq.device, + "DEV: %s: adding wr_id = 0x%llx RQ Completion to list qp_id=%d, wqe_idx=%d\n", + __func__, cmpl->cpi.wr_id, qp->qp_id, + wqe_idx); + list_add_tail(&cmpl->list, &iwqp->iwrcq->cmpl_generated); + + compl_generated = true; + } + spin_unlock_irqrestore(&iwqp->lock, flags2); + spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + if (compl_generated) + irdma_comp_handler(iwqp->iwrcq); + } else { + spin_unlock_irqrestore(&iwqp->iwrcq->lock, flags1); + mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, + msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); + } +} diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 52f3e88f8569..c4412ece5a6d 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -25,7 +25,9 @@ static int irdma_query_device(struct ib_device *ibdev, iwdev->netdev->dev_addr); props->fw_ver = (u64)irdma_fw_major_ver(&rf->sc_dev) << 32 | irdma_fw_minor_ver(&rf->sc_dev); - props->device_cap_flags = iwdev->device_cap_flags; + props->device_cap_flags = IB_DEVICE_MEM_WINDOW | + IB_DEVICE_MEM_MGT_EXTENSIONS; + props->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; props->vendor_id = pcidev->vendor; props->vendor_part_id = pcidev->device; @@ -533,6 +535,9 @@ static int irdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (iwqp->iwarp_state == IRDMA_QP_STATE_RTS) irdma_modify_qp_to_err(&iwqp->sc_qp); + if (!iwqp->user_mode) + cancel_delayed_work_sync(&iwqp->dwork_flush); + irdma_qp_rem_ref(&iwqp->ibqp); wait_for_completion(&iwqp->free_qp); irdma_free_lsmm_rsrc(iwqp); @@ -788,6 +793,14 @@ static int irdma_validate_qp_attrs(struct ib_qp_init_attr *init_attr, return 0; } +static void irdma_flush_worker(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct irdma_qp *iwqp = container_of(dwork, struct irdma_qp, dwork_flush); + + irdma_generate_flush_completions(iwqp); +} + /** * irdma_create_qp - create qp * @ibqp: ptr of qp @@ -907,6 +920,7 @@ static int irdma_create_qp(struct ib_qp *ibqp, init_info.qp_uk_init_info.abi_ver = iwpd->sc_pd.abi_ver; irdma_setup_virt_qp(iwdev, iwqp, &init_info); } else { + INIT_DELAYED_WORK(&iwqp->dwork_flush, irdma_flush_worker); init_info.qp_uk_init_info.abi_ver = IRDMA_ABI_VER; err_code = irdma_setup_kmode_qp(iwdev, iwqp, &init_info, init_attr); } @@ -1398,11 +1412,11 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, } if (iwqp->ibqp_state > IB_QPS_RTS && !iwqp->flush_issued) { - iwqp->flush_issued = 1; spin_unlock_irqrestore(&iwqp->lock, flags); irdma_flush_wqes(iwqp, IRDMA_FLUSH_SQ | IRDMA_FLUSH_RQ | IRDMA_FLUSH_WAIT); + iwqp->flush_issued = 1; } else { spin_unlock_irqrestore(&iwqp->lock, flags); } @@ -1755,6 +1769,8 @@ static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) unsigned long flags; spin_lock_irqsave(&iwcq->lock, flags); + if (!list_empty(&iwcq->cmpl_generated)) + irdma_remove_cmpls_list(iwcq); if (!list_empty(&iwcq->resize_list)) irdma_process_resize_list(iwcq, iwdev, NULL); spin_unlock_irqrestore(&iwcq->lock, flags); @@ -1959,6 +1975,7 @@ static int irdma_create_cq(struct ib_cq *ibcq, cq->back_cq = iwcq; spin_lock_init(&iwcq->lock); INIT_LIST_HEAD(&iwcq->resize_list); + INIT_LIST_HEAD(&iwcq->cmpl_generated); info.dev = dev; ukinfo->cq_size = max(entries, 4); ukinfo->cq_id = cq_num; @@ -3044,15 +3061,12 @@ static int irdma_post_send(struct ib_qp *ibqp, unsigned long flags; bool inv_stag; struct irdma_ah *ah; - bool reflush = false; iwqp = to_iwqp(ibqp); ukqp = &iwqp->sc_qp.qp_uk; dev = &iwqp->iwdev->rf->sc_dev; spin_lock_irqsave(&iwqp->lock, flags); - if (iwqp->flush_issued && ukqp->sq_flush_complete) - reflush = true; while (ib_wr) { memset(&info, 0, sizeof(info)); inv_stag = false; @@ -3202,15 +3216,14 @@ static int irdma_post_send(struct ib_qp *ibqp, ib_wr = ib_wr->next; } - if (!iwqp->flush_issued && iwqp->hw_iwarp_state <= IRDMA_QP_STATE_RTS) { - irdma_uk_qp_post_wr(ukqp); + if (!iwqp->flush_issued) { + if (iwqp->hw_iwarp_state <= IRDMA_QP_STATE_RTS) + irdma_uk_qp_post_wr(ukqp); spin_unlock_irqrestore(&iwqp->lock, flags); - } else if (reflush) { - ukqp->sq_flush_complete = false; - spin_unlock_irqrestore(&iwqp->lock, flags); - irdma_flush_wqes(iwqp, IRDMA_FLUSH_SQ | IRDMA_REFLUSH); } else { spin_unlock_irqrestore(&iwqp->lock, flags); + mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, + msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); } if (err) *bad_wr = ib_wr; @@ -3233,14 +3246,11 @@ static int irdma_post_recv(struct ib_qp *ibqp, struct irdma_post_rq_info post_recv = {}; unsigned long flags; int err = 0; - bool reflush = false; iwqp = to_iwqp(ibqp); ukqp = &iwqp->sc_qp.qp_uk; spin_lock_irqsave(&iwqp->lock, flags); - if (iwqp->flush_issued && ukqp->rq_flush_complete) - reflush = true; while (ib_wr) { post_recv.num_sges = ib_wr->num_sge; post_recv.wr_id = ib_wr->wr_id; @@ -3256,13 +3266,10 @@ static int irdma_post_recv(struct ib_qp *ibqp, } out: - if (reflush) { - ukqp->rq_flush_complete = false; - spin_unlock_irqrestore(&iwqp->lock, flags); - irdma_flush_wqes(iwqp, IRDMA_FLUSH_RQ | IRDMA_REFLUSH); - } else { - spin_unlock_irqrestore(&iwqp->lock, flags); - } + spin_unlock_irqrestore(&iwqp->lock, flags); + if (iwqp->flush_issued) + mod_delayed_work(iwqp->iwdev->cleanup_wq, &iwqp->dwork_flush, + msecs_to_jiffies(IRDMA_FLUSH_DELAY_MS)); if (err) *bad_wr = ib_wr; @@ -3474,6 +3481,11 @@ static int __irdma_poll_cq(struct irdma_cq *iwcq, int num_entries, struct ib_wc /* check the current CQ for new cqes */ while (npolled < num_entries) { ret = irdma_poll_one(ukcq, cur_cqe, entry + npolled); + if (ret == -ENOENT) { + ret = irdma_generated_cmpls(iwcq, cur_cqe); + if (!ret) + irdma_process_cqe(entry + npolled, cur_cqe); + } if (!ret) { ++npolled; cq_new_cqe = true; @@ -3555,13 +3567,13 @@ static int irdma_req_notify_cq(struct ib_cq *ibcq, if (iwcq->last_notify == IRDMA_CQ_COMPL_SOLICITED && notify_flags != IB_CQ_SOLICITED) promo_event = true; - if (!iwcq->armed || promo_event) { - iwcq->armed = true; + if (!atomic_cmpxchg(&iwcq->armed, 0, 1) || promo_event) { iwcq->last_notify = cq_notify; irdma_uk_cq_request_notification(ukcq, cq_notify); } - if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && !irdma_cq_empty(iwcq)) + if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && + (!irdma_cq_empty(iwcq) || !list_empty(&iwcq->cmpl_generated))) ret = 1; spin_unlock_irqrestore(&iwcq->lock, flags); diff --git a/drivers/infiniband/hw/irdma/verbs.h b/drivers/infiniband/hw/irdma/verbs.h index 08ba24d0b843..4309b7159f42 100644 --- a/drivers/infiniband/hw/irdma/verbs.h +++ b/drivers/infiniband/hw/irdma/verbs.h @@ -4,6 +4,7 @@ #define IRDMA_VERBS_H #define IRDMA_MAX_SAVED_PHY_PGADDR 4 +#define IRDMA_FLUSH_DELAY_MS 20 #define IRDMA_PKEY_TBL_SZ 1 #define IRDMA_DEFAULT_PKEY 0xFFFF @@ -115,7 +116,7 @@ struct irdma_cq { u16 cq_size; u16 cq_num; bool user_mode; - bool armed; + atomic_t armed; enum irdma_cmpl_notify last_notify; u32 polled_cmpls; u32 cq_mem_size; @@ -126,6 +127,12 @@ struct irdma_cq { struct irdma_pbl *iwpbl_shadow; struct list_head resize_list; struct irdma_cq_poll_info cur_cqe; + struct list_head cmpl_generated; +}; + +struct irdma_cmpl_gen { + struct list_head list; + struct irdma_cq_poll_info cpi; }; struct disconn_work { @@ -166,6 +173,7 @@ struct irdma_qp { refcount_t refcnt; struct iw_cm_id *cm_id; struct irdma_cm_node *cm_node; + struct delayed_work dwork_flush; struct ib_mr *lsmm_mr; atomic_t hw_mod_qp_pend; enum ib_qp_state ibqp_state; @@ -229,4 +237,7 @@ int irdma_ib_register_device(struct irdma_device *iwdev); void irdma_ib_unregister_device(struct irdma_device *iwdev); void irdma_ib_dealloc_device(struct ib_device *ibdev); void irdma_ib_qp_event(struct irdma_qp *iwqp, enum irdma_qp_event_type event); +void irdma_generate_flush_completions(struct irdma_qp *iwqp); +void irdma_remove_cmpls_list(struct irdma_cq *iwcq); +int irdma_generated_cmpls(struct irdma_cq *iwcq, struct irdma_cq_poll_info *cq_poll_info); #endif /* IRDMA_VERBS_H */ diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c index 4aff1c8298b1..12b481d138cf 100644 --- a/drivers/infiniband/hw/mlx4/cm.c +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -80,6 +80,7 @@ struct cm_req_msg { union ib_gid primary_path_sgid; }; +static struct workqueue_struct *cm_wq; static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) { @@ -288,10 +289,10 @@ static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) /*make sure that there is no schedule inside the scheduled work.*/ if (!sriov->is_going_down && !id->scheduled_delete) { id->scheduled_delete = 1; - schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + queue_delayed_work(cm_wq, &id->timeout, CM_CLEANUP_CACHE_TIMEOUT); } else if (id->scheduled_delete) { /* Adjust timeout if already scheduled */ - mod_delayed_work(system_wq, &id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + mod_delayed_work(cm_wq, &id->timeout, CM_CLEANUP_CACHE_TIMEOUT); } spin_unlock_irqrestore(&sriov->going_down_lock, flags); spin_unlock(&sriov->id_map_lock); @@ -370,7 +371,7 @@ static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int sl ret = xa_err(item); else /* If a retry, adjust delayed work */ - mod_delayed_work(system_wq, &item->timeout, CM_CLEANUP_CACHE_TIMEOUT); + mod_delayed_work(cm_wq, &item->timeout, CM_CLEANUP_CACHE_TIMEOUT); goto err_or_exists; } xa_unlock(&sriov->xa_rej_tmout); @@ -393,7 +394,7 @@ static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int sl return xa_err(old); } - schedule_delayed_work(&item->timeout, CM_CLEANUP_CACHE_TIMEOUT); + queue_delayed_work(cm_wq, &item->timeout, CM_CLEANUP_CACHE_TIMEOUT); return 0; @@ -500,7 +501,7 @@ static void rej_tmout_xa_cleanup(struct mlx4_ib_sriov *sriov, int slave) xa_lock(&sriov->xa_rej_tmout); xa_for_each(&sriov->xa_rej_tmout, id, item) { if (slave < 0 || slave == item->slave) { - mod_delayed_work(system_wq, &item->timeout, 0); + mod_delayed_work(cm_wq, &item->timeout, 0); flush_needed = true; ++cnt; } @@ -508,7 +509,7 @@ static void rej_tmout_xa_cleanup(struct mlx4_ib_sriov *sriov, int slave) xa_unlock(&sriov->xa_rej_tmout); if (flush_needed) { - flush_scheduled_work(); + flush_workqueue(cm_wq); pr_debug("Deleted %d entries in xarray for slave %d during cleanup\n", cnt, slave); } @@ -540,7 +541,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) spin_unlock(&sriov->id_map_lock); if (need_flush) - flush_scheduled_work(); /* make sure all timers were flushed */ + flush_workqueue(cm_wq); /* make sure all timers were flushed */ /* now, remove all leftover entries from databases*/ spin_lock(&sriov->id_map_lock); @@ -587,3 +588,17 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) rej_tmout_xa_cleanup(sriov, slave); } + +int mlx4_ib_cm_init(void) +{ + cm_wq = alloc_workqueue("mlx4_ib_cm", 0, 0); + if (!cm_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_cm_destroy(void) +{ + destroy_workqueue(cm_wq); +} diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 93b1650eacfa..ba47874f90d3 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -479,8 +479,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | - IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + IB_DEVICE_RC_RNR_NAK_GEN; + props->kernel_cap_flags = IBK_BLOCK_MULTICAST_LOOPBACK; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) @@ -494,9 +494,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, if (dev->dev->caps.max_gso_sz && (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) - props->device_cap_flags |= IB_DEVICE_UD_TSO; + props->kernel_cap_flags |= IBK_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) - props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; + props->kernel_cap_flags |= IBK_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) @@ -3307,10 +3307,14 @@ static int __init mlx4_ib_init(void) if (!wq) return -ENOMEM; - err = mlx4_ib_mcg_init(); + err = mlx4_ib_cm_init(); if (err) goto clean_wq; + err = mlx4_ib_mcg_init(); + if (err) + goto clean_cm; + err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; @@ -3320,6 +3324,9 @@ static int __init mlx4_ib_init(void) clean_mcg: mlx4_ib_mcg_destroy(); +clean_cm: + mlx4_ib_cm_destroy(); + clean_wq: destroy_workqueue(wq); return err; @@ -3329,6 +3336,7 @@ static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); + mlx4_ib_cm_destroy(); destroy_workqueue(wq); } diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index d84023b4b1b8..6a3b0f121045 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -937,4 +937,7 @@ mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table) int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, int *num_of_mtts); +int mlx4_ib_cm_init(void); +void mlx4_ib_cm_destroy(void); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index f43380106bd0..612ee8190a2d 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -19,6 +19,7 @@ mlx5_ib-y := ah.o \ restrack.o \ srq.o \ srq_cmd.o \ + umr.o \ wr.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 9c2886bc72cb..39ffb363ba0c 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -1095,11 +1095,6 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); - if (is_egress) { - err = -EINVAL; - goto free; - } - if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { struct mlx5_ib_mcounters *mcounters; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 61a3b767262f..b68fddeac0f1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -41,6 +41,7 @@ #include "wr.h" #include "restrack.h" #include "counters.h" +#include "umr.h" #include #include #include @@ -854,13 +855,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, IB_DEVICE_MEM_WINDOW_TYPE_2B; props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); /* We support 'Gappy' memory registration too */ - props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG; + props->kernel_cap_flags |= IBK_SG_GAPS_REG; } /* IB_WR_REG_MR always requires changing the entity size with UMR */ if (!MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (MLX5_CAP_GEN(mdev, sho)) { - props->device_cap_flags |= IB_DEVICE_INTEGRITY_HANDOVER; + props->kernel_cap_flags |= IBK_INTEGRITY_HANDOVER; /* At this stage no support for signature handover */ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_2 | @@ -869,7 +870,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, IB_GUARD_T10DIF_CSUM; } if (MLX5_CAP_GEN(mdev, block_lb_mc)) - props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + props->kernel_cap_flags |= IBK_BLOCK_MULTICAST_LOOPBACK; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) { if (MLX5_CAP_ETH(mdev, csum_cap)) { @@ -916,7 +917,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - props->device_cap_flags |= IB_DEVICE_UD_TSO; + props->kernel_cap_flags |= IBK_UD_TSO; } if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && @@ -992,7 +993,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) - props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; + props->kernel_cap_flags |= IBK_ON_DEMAND_PAGING; props->odp_caps = dev->odp_caps; if (!uhw) { /* ODP for kernel QPs is not implemented for receive @@ -1013,11 +1014,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (MLX5_CAP_GEN(mdev, cd)) - props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; - if (mlx5_core_is_vf(mdev)) - props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; + props->kernel_cap_flags |= IBK_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET && raw_support) { @@ -4008,12 +4006,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); - if (dev->umrc.qp) - ib_destroy_qp(dev->umrc.qp); - if (dev->umrc.cq) - ib_free_cq(dev->umrc.cq); - if (dev->umrc.pd) - ib_dealloc_pd(dev->umrc.pd); + mlx5r_umr_resource_cleanup(dev); } static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) @@ -4021,112 +4014,19 @@ static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) ib_unregister_device(&dev->ib_dev); } -enum { - MAX_UMR_WR = 128, -}; - static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) { - struct ib_qp_init_attr *init_attr = NULL; - struct ib_qp_attr *attr = NULL; - struct ib_pd *pd; - struct ib_cq *cq; - struct ib_qp *qp; int ret; - attr = kzalloc(sizeof(*attr), GFP_KERNEL); - init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL); - if (!attr || !init_attr) { - ret = -ENOMEM; - goto error_0; - } + ret = mlx5r_umr_resource_init(dev); + if (ret) + return ret; - pd = ib_alloc_pd(&dev->ib_dev, 0); - if (IS_ERR(pd)) { - mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); - ret = PTR_ERR(pd); - goto error_0; - } - - cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); - if (IS_ERR(cq)) { - mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); - ret = PTR_ERR(cq); - goto error_2; - } - - init_attr->send_cq = cq; - init_attr->recv_cq = cq; - init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; - init_attr->cap.max_send_wr = MAX_UMR_WR; - init_attr->cap.max_send_sge = 1; - init_attr->qp_type = MLX5_IB_QPT_REG_UMR; - init_attr->port_num = 1; - qp = ib_create_qp(pd, init_attr); - if (IS_ERR(qp)) { - mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); - ret = PTR_ERR(qp); - goto error_3; - } - - attr->qp_state = IB_QPS_INIT; - attr->port_num = 1; - ret = ib_modify_qp(qp, attr, - IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT); - if (ret) { - mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); - goto error_4; - } - - memset(attr, 0, sizeof(*attr)); - attr->qp_state = IB_QPS_RTR; - attr->path_mtu = IB_MTU_256; - - ret = ib_modify_qp(qp, attr, IB_QP_STATE); - if (ret) { - mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); - goto error_4; - } - - memset(attr, 0, sizeof(*attr)); - attr->qp_state = IB_QPS_RTS; - ret = ib_modify_qp(qp, attr, IB_QP_STATE); - if (ret) { - mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); - goto error_4; - } - - dev->umrc.qp = qp; - dev->umrc.cq = cq; - dev->umrc.pd = pd; - - sema_init(&dev->umrc.sem, MAX_UMR_WR); ret = mlx5_mr_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); - goto error_4; + mlx5r_umr_resource_cleanup(dev); } - - kfree(attr); - kfree(init_attr); - - return 0; - -error_4: - ib_destroy_qp(qp); - dev->umrc.qp = NULL; - -error_3: - ib_free_cq(cq); - dev->umrc.cq = NULL; - -error_2: - ib_dealloc_pd(pd); - dev->umrc.pd = NULL; - -error_0: - kfree(attr); - kfree(init_attr); return ret; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8b3c83c0b70a..998b67509a53 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -291,16 +291,9 @@ struct mlx5_ib_flow_db { }; /* Use macros here so that don't have to duplicate - * enum ib_send_flags and enum ib_qp_type for low-level driver + * enum ib_qp_type for low-level driver */ -#define MLX5_IB_SEND_UMR_ENABLE_MR (IB_SEND_RESERVED_START << 0) -#define MLX5_IB_SEND_UMR_DISABLE_MR (IB_SEND_RESERVED_START << 1) -#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 2) -#define MLX5_IB_SEND_UMR_UPDATE_XLT (IB_SEND_RESERVED_START << 3) -#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 4) -#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS IB_SEND_RESERVED_END - #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 /* * IB_QPT_GSI creates the software wrapper around GSI, and MLX5_IB_QPT_HW_GSI @@ -311,9 +304,6 @@ struct mlx5_ib_flow_db { #define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 -#define MLX5_IB_UMR_OCTOWORD 16 -#define MLX5_IB_UMR_XLT_ALIGNMENT 64 - #define MLX5_IB_UPD_XLT_ZAP BIT(0) #define MLX5_IB_UPD_XLT_ENABLE BIT(1) #define MLX5_IB_UPD_XLT_ATOMIC BIT(2) @@ -539,24 +529,6 @@ struct mlx5_ib_cq_buf { int nent; }; -struct mlx5_umr_wr { - struct ib_send_wr wr; - u64 virt_addr; - u64 offset; - struct ib_pd *pd; - unsigned int page_shift; - unsigned int xlt_size; - u64 length; - int access_flags; - u32 mkey; - u8 ignore_free_state:1; -}; - -static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr) -{ - return container_of(wr, struct mlx5_umr_wr, wr); -} - enum mlx5_ib_cq_pr_flags { MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD = 1 << 0, MLX5_IB_CQ_PR_FLAGS_REAL_TIME_TS = 1 << 1, @@ -1291,9 +1263,6 @@ int mlx5_ib_advise_mr(struct ib_pd *pd, struct uverbs_attr_bundle *attrs); int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); -int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, - int page_shift, int flags); -int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags); void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); @@ -1472,9 +1441,6 @@ static inline int is_qp1(enum ib_qp_type qp_type) return qp_type == MLX5_IB_QPT_HW_GSI || qp_type == IB_QPT_GSI; } -#define MLX5_MAX_UMR_SHIFT 16 -#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) - static inline u32 check_cq_create_flags(u32 flags) { /* @@ -1546,59 +1512,6 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, u32 bfregn, bool dyn_bfreg); -static inline bool mlx5_ib_can_load_pas_with_umr(struct mlx5_ib_dev *dev, - size_t length) -{ - /* - * umr_check_mkey_mask() rejects MLX5_MKEY_MASK_PAGE_SIZE which is - * always set if MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (aka - * MLX5_IB_UPD_XLT_ADDR and MLX5_IB_UPD_XLT_ENABLE) is set. Thus, a mkey - * can never be enabled without this capability. Simplify this weird - * quirky hardware by just saying it can't use PAS lists with UMR at - * all. - */ - if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) - return false; - - /* - * length is the size of the MR in bytes when mlx5_ib_update_xlt() is - * used. - */ - if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && - length >= MLX5_MAX_UMR_PAGES * PAGE_SIZE) - return false; - return true; -} - -/* - * true if an existing MR can be reconfigured to new access_flags using UMR. - * Older HW cannot use UMR to update certain elements of the MKC. See - * umr_check_mkey_mask(), get_umr_update_access_mask() and umr_check_mkey_mask() - */ -static inline bool mlx5_ib_can_reconfig_with_umr(struct mlx5_ib_dev *dev, - unsigned int current_access_flags, - unsigned int target_access_flags) -{ - unsigned int diffs = current_access_flags ^ target_access_flags; - - if ((diffs & IB_ACCESS_REMOTE_ATOMIC) && - MLX5_CAP_GEN(dev->mdev, atomic) && - MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) - return false; - - if ((diffs & IB_ACCESS_RELAXED_ORDERING) && - MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && - !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) - return false; - - if ((diffs & IB_ACCESS_RELAXED_ORDERING) && - MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) && - !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) - return false; - - return true; -} - static inline int mlx5r_store_odp_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mkey *mmkey) { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 32ef67e9a6a7..1e7653c997b5 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -44,13 +44,7 @@ #include #include "dm.h" #include "mlx5_ib.h" - -/* - * We can't use an array for xlt_emergency_page because dma_map_single doesn't - * work on kernel modules memory - */ -void *xlt_emergency_page; -static DEFINE_MUTEX(xlt_emergency_page_mutex); +#include "umr.h" enum { MAX_PENDING_REG_MR = 8, @@ -128,11 +122,6 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, static int mr_cache_max_order(struct mlx5_ib_dev *dev); static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); -static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) -{ - return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); -} - static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); @@ -600,7 +589,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr; /* Matches access in alloc_cache_mr() */ - if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) + if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) return ERR_PTR(-EOPNOTSUPP); spin_lock_irq(&ent->lock); @@ -741,7 +730,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && !dev->is_rep && mlx5_core_is_pf(dev->mdev) && - mlx5_ib_can_load_pas_with_umr(dev, 0)) + mlx5r_umr_can_load_pas(dev, 0)) ent->limit = dev->mdev->profile.mr_cache[i].limit; else ent->limit = 0; @@ -848,49 +837,6 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev) return MLX5_MAX_UMR_SHIFT; } -static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct mlx5_ib_umr_context *context = - container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); - - context->status = wc->status; - complete(&context->done); -} - -static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) -{ - context->cqe.done = mlx5_ib_umr_done; - context->status = -1; - init_completion(&context->done); -} - -static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, - struct mlx5_umr_wr *umrwr) -{ - struct umr_common *umrc = &dev->umrc; - const struct ib_send_wr *bad; - int err; - struct mlx5_ib_umr_context umr_context; - - mlx5_ib_init_umr_context(&umr_context); - umrwr->wr.wr_cqe = &umr_context.cqe; - - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr->wr, &bad); - if (err) { - mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed (%u)\n", - umr_context.status); - err = -EFAULT; - } - } - up(&umrc->sem); - return err; -} - static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, unsigned int order) { @@ -949,7 +895,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, * cache then synchronously create an uncached one. */ if (!ent || ent->limit == 0 || - !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) { + !mlx5r_umr_can_reconfig(dev, 0, access_flags)) { mutex_lock(&dev->slow_path_mutex); mr = reg_create(pd, umem, iova, access_flags, page_size, false); mutex_unlock(&dev->slow_path_mutex); @@ -968,289 +914,6 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } -#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ - MLX5_UMR_MTT_ALIGNMENT) -#define MLX5_SPARE_UMR_CHUNK 0x10000 - -/* - * Allocate a temporary buffer to hold the per-page information to transfer to - * HW. For efficiency this should be as large as it can be, but buffer - * allocation failure is not allowed, so try smaller sizes. - */ -static void *mlx5_ib_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) -{ - const size_t xlt_chunk_align = - MLX5_UMR_MTT_ALIGNMENT / ent_size; - size_t size; - void *res = NULL; - - static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); - - /* - * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the - * allocation can't trigger any kind of reclaim. - */ - might_sleep(); - - gfp_mask |= __GFP_ZERO | __GFP_NORETRY; - - /* - * If the system already has a suitable high order page then just use - * that, but don't try hard to create one. This max is about 1M, so a - * free x86 huge page will satisfy it. - */ - size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), - MLX5_MAX_UMR_CHUNK); - *nents = size / ent_size; - res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, - get_order(size)); - if (res) - return res; - - if (size > MLX5_SPARE_UMR_CHUNK) { - size = MLX5_SPARE_UMR_CHUNK; - *nents = size / ent_size; - res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, - get_order(size)); - if (res) - return res; - } - - *nents = PAGE_SIZE / ent_size; - res = (void *)__get_free_page(gfp_mask); - if (res) - return res; - - mutex_lock(&xlt_emergency_page_mutex); - memset(xlt_emergency_page, 0, PAGE_SIZE); - return xlt_emergency_page; -} - -static void mlx5_ib_free_xlt(void *xlt, size_t length) -{ - if (xlt == xlt_emergency_page) { - mutex_unlock(&xlt_emergency_page_mutex); - return; - } - - free_pages((unsigned long)xlt, get_order(length)); -} - -/* - * Create a MLX5_IB_SEND_UMR_UPDATE_XLT work request and XLT buffer ready for - * submission. - */ -static void *mlx5_ib_create_xlt_wr(struct mlx5_ib_mr *mr, - struct mlx5_umr_wr *wr, struct ib_sge *sg, - size_t nents, size_t ent_size, - unsigned int flags) -{ - struct mlx5_ib_dev *dev = mr_to_mdev(mr); - struct device *ddev = &dev->mdev->pdev->dev; - dma_addr_t dma; - void *xlt; - - xlt = mlx5_ib_alloc_xlt(&nents, ent_size, - flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : - GFP_KERNEL); - sg->length = nents * ent_size; - dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); - if (dma_mapping_error(ddev, dma)) { - mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); - mlx5_ib_free_xlt(xlt, sg->length); - return NULL; - } - sg->addr = dma; - sg->lkey = dev->umrc.pd->local_dma_lkey; - - memset(wr, 0, sizeof(*wr)); - wr->wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; - if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) - wr->wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; - wr->wr.sg_list = sg; - wr->wr.num_sge = 1; - wr->wr.opcode = MLX5_IB_WR_UMR; - wr->pd = mr->ibmr.pd; - wr->mkey = mr->mmkey.key; - wr->length = mr->ibmr.length; - wr->virt_addr = mr->ibmr.iova; - wr->access_flags = mr->access_flags; - wr->page_shift = mr->page_shift; - wr->xlt_size = sg->length; - return xlt; -} - -static void mlx5_ib_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, - struct ib_sge *sg) -{ - struct device *ddev = &dev->mdev->pdev->dev; - - dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); - mlx5_ib_free_xlt(xlt, sg->length); -} - -static unsigned int xlt_wr_final_send_flags(unsigned int flags) -{ - unsigned int res = 0; - - if (flags & MLX5_IB_UPD_XLT_ENABLE) - res |= MLX5_IB_SEND_UMR_ENABLE_MR | - MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | - MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; - if (flags & MLX5_IB_UPD_XLT_PD || flags & MLX5_IB_UPD_XLT_ACCESS) - res |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; - if (flags & MLX5_IB_UPD_XLT_ADDR) - res |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; - return res; -} - -int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, - int page_shift, int flags) -{ - struct mlx5_ib_dev *dev = mr_to_mdev(mr); - struct device *ddev = &dev->mdev->pdev->dev; - void *xlt; - struct mlx5_umr_wr wr; - struct ib_sge sg; - int err = 0; - int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) - ? sizeof(struct mlx5_klm) - : sizeof(struct mlx5_mtt); - const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; - const int page_mask = page_align - 1; - size_t pages_mapped = 0; - size_t pages_to_map = 0; - size_t pages_iter; - size_t size_to_map = 0; - size_t orig_sg_length; - - if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && - !umr_can_use_indirect_mkey(dev)) - return -EPERM; - - if (WARN_ON(!mr->umem->is_odp)) - return -EINVAL; - - /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, - * so we need to align the offset and length accordingly - */ - if (idx & page_mask) { - npages += idx & page_mask; - idx &= ~page_mask; - } - pages_to_map = ALIGN(npages, page_align); - - xlt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, npages, desc_size, flags); - if (!xlt) - return -ENOMEM; - pages_iter = sg.length / desc_size; - orig_sg_length = sg.length; - - if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { - struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - size_t max_pages = ib_umem_odp_num_pages(odp) - idx; - - pages_to_map = min_t(size_t, pages_to_map, max_pages); - } - - wr.page_shift = page_shift; - - for (pages_mapped = 0; - pages_mapped < pages_to_map && !err; - pages_mapped += pages_iter, idx += pages_iter) { - npages = min_t(int, pages_iter, pages_to_map - pages_mapped); - size_to_map = npages * desc_size; - dma_sync_single_for_cpu(ddev, sg.addr, sg.length, - DMA_TO_DEVICE); - mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); - dma_sync_single_for_device(ddev, sg.addr, sg.length, - DMA_TO_DEVICE); - - sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT); - - if (pages_mapped + pages_iter >= pages_to_map) - wr.wr.send_flags |= xlt_wr_final_send_flags(flags); - - wr.offset = idx * desc_size; - wr.xlt_size = sg.length; - - err = mlx5_ib_post_send_wait(dev, &wr); - } - sg.length = orig_sg_length; - mlx5_ib_unmap_free_xlt(dev, xlt, &sg); - return err; -} - -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. - */ -int mlx5_ib_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) -{ - struct mlx5_ib_dev *dev = mr_to_mdev(mr); - struct device *ddev = &dev->mdev->pdev->dev; - struct ib_block_iter biter; - struct mlx5_mtt *cur_mtt; - struct mlx5_umr_wr wr; - size_t orig_sg_length; - struct mlx5_mtt *mtt; - size_t final_size; - struct ib_sge sg; - int err = 0; - - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5_ib_create_xlt_wr(mr, &wr, &sg, - ib_umem_num_dma_blocks(mr->umem, - 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) - return -ENOMEM; - orig_sg_length = sg.length; - - cur_mtt = mtt; - rdma_for_each_block (mr->umem->sgt_append.sgt.sgl, &biter, - mr->umem->sgt_append.sgt.nents, - BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { - dma_sync_single_for_device(ddev, sg.addr, sg.length, - DMA_TO_DEVICE); - err = mlx5_ib_post_send_wait(dev, &wr); - if (err) - goto err; - dma_sync_single_for_cpu(ddev, sg.addr, sg.length, - DMA_TO_DEVICE); - wr.offset += sg.length; - cur_mtt = mtt; - } - - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; - } - - final_size = (void *)cur_mtt - (void *)mtt; - sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); - wr.wr.send_flags |= xlt_wr_final_send_flags(flags); - wr.xlt_size = sg.length; - - dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); - err = mlx5_ib_post_send_wait(dev, &wr); - -err: - sg.length = orig_sg_length; - mlx5_ib_unmap_free_xlt(dev, mtt, &sg); - return err; -} - /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. @@ -1441,7 +1104,7 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, bool xlt_with_umr; int err; - xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, umem->length); + xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { mr = alloc_cacheable_mr(pd, umem, iova, access_flags); } else { @@ -1467,7 +1130,7 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, * configured properly but left disabled. It is safe to go ahead * and configure it again via UMR while enabling it. */ - err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); + err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); if (err) { mlx5_ib_dereg_mr(&mr->ibmr, NULL); return ERR_PTR(err); @@ -1504,7 +1167,7 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, } /* ODP requires xlt update via umr to work. */ - if (!mlx5_ib_can_load_pas_with_umr(dev, length)) + if (!mlx5r_umr_can_load_pas(dev, length)) return ERR_PTR(-EINVAL); odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, @@ -1566,7 +1229,7 @@ static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) if (!umem_dmabuf->sgt) return; - mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); + mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); ib_umem_dmabuf_unmap_pages(umem_dmabuf); } @@ -1594,7 +1257,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, offset, virt_addr, length, fd, access_flags); /* dmabuf requires xlt update via umr to work. */ - if (!mlx5_ib_can_load_pas_with_umr(dev, length)) + if (!mlx5r_umr_can_load_pas(dev, length)) return ERR_PTR(-EINVAL); umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, @@ -1631,31 +1294,6 @@ err_dereg_mr: return ERR_PTR(err); } -/** - * revoke_mr - Fence all DMA on the MR - * @mr: The MR to fence - * - * Upon return the NIC will not be doing any DMA to the pages under the MR, - * and any DMA in progress will be completed. Failure of this function - * indicates the HW has failed catastrophically. - */ -static int revoke_mr(struct mlx5_ib_mr *mr) -{ - struct mlx5_umr_wr umrwr = {}; - - if (mr_to_mdev(mr)->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) - return 0; - - umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | - MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; - umrwr.wr.opcode = MLX5_IB_WR_UMR; - umrwr.pd = mr_to_mdev(mr)->umrc.pd; - umrwr.mkey = mr->mmkey.key; - umrwr.ignore_free_state = 1; - - return mlx5_ib_post_send_wait(mr_to_mdev(mr), &umrwr); -} - /* * True if the change in access flags can be done via UMR, only some access * flags can be updated. @@ -1669,32 +1307,8 @@ static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) return false; - return mlx5_ib_can_reconfig_with_umr(dev, current_access_flags, - target_access_flags); -} - -static int umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, - int access_flags) -{ - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); - struct mlx5_umr_wr umrwr = { - .wr = { - .send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | - MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS, - .opcode = MLX5_IB_WR_UMR, - }, - .mkey = mr->mmkey.key, - .pd = pd, - .access_flags = access_flags, - }; - int err; - - err = mlx5_ib_post_send_wait(dev, &umrwr); - if (err) - return err; - - mr->access_flags = access_flags; - return 0; + return mlx5r_umr_can_reconfig(dev, current_access_flags, + target_access_flags); } static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, @@ -1707,7 +1321,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, /* We only track the allocated sizes of MRs from the cache */ if (!mr->cache_ent) return false; - if (!mlx5_ib_can_load_pas_with_umr(dev, new_umem->length)) + if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; *page_size = @@ -1732,7 +1346,7 @@ static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, * with it. This ensure the change is atomic relative to any use of the * MR. */ - err = revoke_mr(mr); + err = mlx5r_umr_revoke_mr(mr); if (err) return err; @@ -1750,7 +1364,7 @@ static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, mr->ibmr.length = new_umem->length; mr->page_shift = order_base_2(page_size); mr->umem = new_umem; - err = mlx5_ib_update_mr_pas(mr, upd_flags); + err = mlx5r_umr_update_mr_pas(mr, upd_flags); if (err) { /* * The MR is revoked at this point so there is no issue to free @@ -1797,7 +1411,8 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* Fast path for PD/access change */ if (can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { - err = umr_rereg_pd_access(mr, new_pd, new_access_flags); + err = mlx5r_umr_rereg_pd_access(mr, new_pd, + new_access_flags); if (err) return ERR_PTR(err); return NULL; @@ -1810,7 +1425,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, * Only one active MR can refer to a umem at one time, revoke * the old MR before assigning the umem to the new one. */ - err = revoke_mr(mr); + err = mlx5r_umr_revoke_mr(mr); if (err) return ERR_PTR(err); umem = mr->umem; @@ -1955,7 +1570,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) /* Stop DMA */ if (mr->cache_ent) { - if (revoke_mr(mr)) { + if (mlx5r_umr_revoke_mr(mr)) { spin_lock_irq(&mr->cache_ent->lock); mr->cache_ent->total_mrs--; spin_unlock_irq(&mr->cache_ent->lock); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 41c964a45f89..84da5674e1ab 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -38,6 +38,7 @@ #include "mlx5_ib.h" #include "cmd.h" +#include "umr.h" #include "qp.h" #include @@ -117,7 +118,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, * * xa_store() * mutex_lock(umem_mutex) - * mlx5_ib_update_xlt() + * mlx5r_umr_update_xlt() * mutex_unlock(umem_mutex) * destroy lkey * @@ -198,9 +199,9 @@ static void free_implicit_child_mr_work(struct work_struct *work) mlx5r_deref_wait_odp_mkey(&mr->mmkey); mutex_lock(&odp_imr->umem_mutex); - mlx5_ib_update_xlt(mr->parent, ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, - 1, 0, - MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); + mlx5r_umr_update_xlt(mr->parent, + ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0, + MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); mlx5_ib_dereg_mr(&mr->ibmr, NULL); @@ -282,19 +283,19 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, u64 umr_offset = idx & umr_block_mask; if (in_block && umr_offset == 0) { - mlx5_ib_update_xlt(mr, blk_start_idx, - idx - blk_start_idx, 0, - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ATOMIC); + mlx5r_umr_update_xlt(mr, blk_start_idx, + idx - blk_start_idx, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); in_block = 0; } } } if (in_block) - mlx5_ib_update_xlt(mr, blk_start_idx, - idx - blk_start_idx + 1, 0, - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ATOMIC); + mlx5r_umr_update_xlt(mr, blk_start_idx, + idx - blk_start_idx + 1, 0, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); mlx5_update_odp_stats(mr, invalidations, invalidations); @@ -323,8 +324,7 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) memset(caps, 0, sizeof(*caps)); - if (!MLX5_CAP_GEN(dev->mdev, pg) || - !mlx5_ib_can_load_pas_with_umr(dev, 0)) + if (!MLX5_CAP_GEN(dev->mdev, pg) || !mlx5r_umr_can_load_pas(dev, 0)) return; caps->general_caps = IB_ODP_SUPPORT; @@ -442,11 +442,11 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, */ refcount_set(&mr->mmkey.usecount, 2); - err = mlx5_ib_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, - PAGE_SHIFT, - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE); + err = mlx5r_umr_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); if (err) { ret = ERR_PTR(err); goto out_mr; @@ -487,8 +487,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct mlx5_ib_mr *imr; int err; - if (!mlx5_ib_can_load_pas_with_umr(dev, - MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) + if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE)) return ERR_PTR(-EOPNOTSUPP); umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags); @@ -510,16 +509,15 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->ibmr.lkey = imr->mmkey.key; imr->ibmr.rkey = imr->mmkey.key; imr->ibmr.device = &dev->ib_dev; - imr->umem = &umem_odp->umem; imr->is_odp_implicit = true; xa_init(&imr->implicit_children); - err = mlx5_ib_update_xlt(imr, 0, - mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE); + err = mlx5r_umr_update_xlt(imr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); if (err) goto out_mr; @@ -582,7 +580,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, * No need to check whether the MTTs really belong to this MR, since * ib_umem_odp_map_dma_and_lock already checks this. */ - ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags); + ret = mlx5r_umr_update_xlt(mr, start_idx, np, page_shift, xlt_flags); mutex_unlock(&odp->umem_mutex); if (ret < 0) { @@ -680,9 +678,9 @@ out: * next pagefault handler will see the new information. */ mutex_lock(&odp_imr->umem_mutex); - err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ATOMIC); + err = mlx5r_umr_update_xlt(imr, upd_start_idx, upd_len, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); if (err) { mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n"); @@ -716,7 +714,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5_ib_update_mr_pas(mr, xlt_flags); + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index fb8669c02546..40d9410ec303 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -40,6 +40,7 @@ #include "ib_rep.h" #include "counters.h" #include "cmd.h" +#include "umr.h" #include "qp.h" #include "wr.h" diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c new file mode 100644 index 000000000000..3a48364c0918 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -0,0 +1,700 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */ + +#include +#include "mlx5_ib.h" +#include "umr.h" +#include "wr.h" + +/* + * We can't use an array for xlt_emergency_page because dma_map_single doesn't + * work on kernel modules memory + */ +void *xlt_emergency_page; +static DEFINE_MUTEX(xlt_emergency_page_mutex); + +static __be64 get_umr_enable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_disable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(struct mlx5_ib_dev *dev) +{ + u64 result; + + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW; + + if (MLX5_CAP_GEN(dev->mdev, atomic)) + result |= MLX5_MKEY_MASK_A; + + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE; + + if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD; + + return cpu_to_be64(result); +} + +static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) +{ + if (mask & MLX5_MKEY_MASK_PAGE_SIZE && + MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + return -EPERM; + + if (mask & MLX5_MKEY_MASK_A && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) + return -EPERM; + + if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + return -EPERM; + + if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + return -EPERM; + + return 0; +} + +enum { + MAX_UMR_WR = 128, +}; + +static int mlx5r_umr_qp_rst2rts(struct mlx5_ib_dev *dev, struct ib_qp *qp) +{ + struct ib_qp_attr attr = {}; + int ret; + + attr.qp_state = IB_QPS_INIT; + attr.port_num = 1; + ret = ib_modify_qp(qp, &attr, + IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + return ret; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTR; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n"); + return ret; + } + + memset(&attr, 0, sizeof(attr)); + attr.qp_state = IB_QPS_RTS; + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret) { + mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n"); + return ret; + } + + return 0; +} + +int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) +{ + struct ib_qp_init_attr init_attr = {}; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + int ret; + + pd = ib_alloc_pd(&dev->ib_dev, 0); + if (IS_ERR(pd)) { + mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n"); + return PTR_ERR(pd); + } + + cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ); + if (IS_ERR(cq)) { + mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n"); + ret = PTR_ERR(cq); + goto destroy_pd; + } + + init_attr.send_cq = cq; + init_attr.recv_cq = cq; + init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + init_attr.cap.max_send_wr = MAX_UMR_WR; + init_attr.cap.max_send_sge = 1; + init_attr.qp_type = MLX5_IB_QPT_REG_UMR; + init_attr.port_num = 1; + qp = ib_create_qp(pd, &init_attr); + if (IS_ERR(qp)) { + mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n"); + ret = PTR_ERR(qp); + goto destroy_cq; + } + + ret = mlx5r_umr_qp_rst2rts(dev, qp); + if (ret) + goto destroy_qp; + + dev->umrc.qp = qp; + dev->umrc.cq = cq; + dev->umrc.pd = pd; + + sema_init(&dev->umrc.sem, MAX_UMR_WR); + + return 0; + +destroy_qp: + ib_destroy_qp(qp); +destroy_cq: + ib_free_cq(cq); +destroy_pd: + ib_dealloc_pd(pd); + return ret; +} + +void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) +{ + ib_destroy_qp(dev->umrc.qp); + ib_free_cq(dev->umrc.cq); + ib_dealloc_pd(dev->umrc.pd); +} + +static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, + struct mlx5r_umr_wqe *wqe, bool with_data) +{ + unsigned int wqe_size = + with_data ? sizeof(struct mlx5r_umr_wqe) : + sizeof(struct mlx5r_umr_wqe) - + sizeof(struct mlx5_wqe_data_seg); + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_ctrl_seg *ctrl; + union { + struct ib_cqe *ib_cqe; + u64 wr_id; + } id; + void *cur_edge, *seg; + unsigned long flags; + unsigned int idx; + int size, err; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) + return -EIO; + + spin_lock_irqsave(&qp->sq.lock, flags); + + err = mlx5r_begin_wqe(qp, &seg, &ctrl, &idx, &size, &cur_edge, 0, + cpu_to_be32(mkey), false, false); + if (WARN_ON(err)) + goto out; + + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + + mlx5r_memcpy_send_wqe(&qp->sq, &cur_edge, &seg, &size, wqe, wqe_size); + + id.ib_cqe = cqe; + mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, + MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR); + + mlx5r_ring_db(qp, 1, ctrl); + +out: + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void mlx5r_umr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_umr_context *context = + container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe); + + context->status = wc->status; + complete(&context->done); +} + +static inline void mlx5r_umr_init_context(struct mlx5r_umr_context *context) +{ + context->cqe.done = mlx5r_umr_done; + init_completion(&context->done); +} + +static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, + struct mlx5r_umr_wqe *wqe, bool with_data) +{ + struct umr_common *umrc = &dev->umrc; + struct mlx5r_umr_context umr_context; + int err; + + err = umr_check_mkey_mask(dev, be64_to_cpu(wqe->ctrl_seg.mkey_mask)); + if (WARN_ON(err)) + return err; + + mlx5r_umr_init_context(&umr_context); + + down(&umrc->sem); + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, + with_data); + if (err) + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); + else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + return err; +} + +/** + * mlx5r_umr_revoke_mr - Fence all DMA on the MR + * @mr: The MR to fence + * + * Upon return the NIC will not be doing any DMA to the pages under the MR, + * and any DMA in progress will be completed. Failure of this function + * indicates the HW has failed catastrophically. + */ +int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct mlx5r_umr_wqe wqe = {}; + + if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + return 0; + + wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask(); + wqe.ctrl_seg.mkey_mask |= get_umr_disable_mr_mask(); + wqe.ctrl_seg.flags |= MLX5_UMR_INLINE; + + MLX5_SET(mkc, &wqe.mkey_seg, free, 1); + MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(dev->umrc.pd)->pdn); + MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); + MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, + mlx5_mkey_variant(mr->mmkey.key)); + + return mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); +} + +static void mlx5r_umr_set_access_flags(struct mlx5_ib_dev *dev, + struct mlx5_mkey_seg *seg, + unsigned int access_flags) +{ + MLX5_SET(mkc, seg, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + MLX5_SET(mkc, seg, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + MLX5_SET(mkc, seg, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); + MLX5_SET(mkc, seg, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); + MLX5_SET(mkc, seg, lr, 1); + MLX5_SET(mkc, seg, relaxed_ordering_write, + !!(access_flags & IB_ACCESS_RELAXED_ORDERING)); + MLX5_SET(mkc, seg, relaxed_ordering_read, + !!(access_flags & IB_ACCESS_RELAXED_ORDERING)); +} + +int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, + int access_flags) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct mlx5r_umr_wqe wqe = {}; + int err; + + wqe.ctrl_seg.mkey_mask = get_umr_update_access_mask(dev); + wqe.ctrl_seg.mkey_mask |= get_umr_update_pd_mask(); + wqe.ctrl_seg.flags = MLX5_UMR_CHECK_FREE; + wqe.ctrl_seg.flags |= MLX5_UMR_INLINE; + + mlx5r_umr_set_access_flags(dev, &wqe.mkey_seg, access_flags); + MLX5_SET(mkc, &wqe.mkey_seg, pd, to_mpd(pd)->pdn); + MLX5_SET(mkc, &wqe.mkey_seg, qpn, 0xffffff); + MLX5_SET(mkc, &wqe.mkey_seg, mkey_7_0, + mlx5_mkey_variant(mr->mmkey.key)); + + err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, false); + if (err) + return err; + + mr->access_flags = access_flags; + return 0; +} + +#define MLX5_MAX_UMR_CHUNK \ + ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - MLX5_UMR_MTT_ALIGNMENT) +#define MLX5_SPARE_UMR_CHUNK 0x10000 + +/* + * Allocate a temporary buffer to hold the per-page information to transfer to + * HW. For efficiency this should be as large as it can be, but buffer + * allocation failure is not allowed, so try smaller sizes. + */ +static void *mlx5r_umr_alloc_xlt(size_t *nents, size_t ent_size, gfp_t gfp_mask) +{ + const size_t xlt_chunk_align = MLX5_UMR_MTT_ALIGNMENT / ent_size; + size_t size; + void *res = NULL; + + static_assert(PAGE_SIZE % MLX5_UMR_MTT_ALIGNMENT == 0); + + /* + * MLX5_IB_UPD_XLT_ATOMIC doesn't signal an atomic context just that the + * allocation can't trigger any kind of reclaim. + */ + might_sleep(); + + gfp_mask |= __GFP_ZERO | __GFP_NORETRY; + + /* + * If the system already has a suitable high order page then just use + * that, but don't try hard to create one. This max is about 1M, so a + * free x86 huge page will satisfy it. + */ + size = min_t(size_t, ent_size * ALIGN(*nents, xlt_chunk_align), + MLX5_MAX_UMR_CHUNK); + *nents = size / ent_size; + res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, + get_order(size)); + if (res) + return res; + + if (size > MLX5_SPARE_UMR_CHUNK) { + size = MLX5_SPARE_UMR_CHUNK; + *nents = size / ent_size; + res = (void *)__get_free_pages(gfp_mask | __GFP_NOWARN, + get_order(size)); + if (res) + return res; + } + + *nents = PAGE_SIZE / ent_size; + res = (void *)__get_free_page(gfp_mask); + if (res) + return res; + + mutex_lock(&xlt_emergency_page_mutex); + memset(xlt_emergency_page, 0, PAGE_SIZE); + return xlt_emergency_page; +} + +static void mlx5r_umr_free_xlt(void *xlt, size_t length) +{ + if (xlt == xlt_emergency_page) { + mutex_unlock(&xlt_emergency_page_mutex); + return; + } + + free_pages((unsigned long)xlt, get_order(length)); +} + +static void mlx5r_umr_unmap_free_xlt(struct mlx5_ib_dev *dev, void *xlt, + struct ib_sge *sg) +{ + struct device *ddev = &dev->mdev->pdev->dev; + + dma_unmap_single(ddev, sg->addr, sg->length, DMA_TO_DEVICE); + mlx5r_umr_free_xlt(xlt, sg->length); +} + +/* + * Create an XLT buffer ready for submission. + */ +static void *mlx5r_umr_create_xlt(struct mlx5_ib_dev *dev, struct ib_sge *sg, + size_t nents, size_t ent_size, + unsigned int flags) +{ + struct device *ddev = &dev->mdev->pdev->dev; + dma_addr_t dma; + void *xlt; + + xlt = mlx5r_umr_alloc_xlt(&nents, ent_size, + flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : + GFP_KERNEL); + sg->length = nents * ent_size; + dma = dma_map_single(ddev, xlt, sg->length, DMA_TO_DEVICE); + if (dma_mapping_error(ddev, dma)) { + mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); + mlx5r_umr_free_xlt(xlt, sg->length); + return NULL; + } + sg->addr = dma; + sg->lkey = dev->umrc.pd->local_dma_lkey; + + return xlt; +} + +static void +mlx5r_umr_set_update_xlt_ctrl_seg(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg, + unsigned int flags, struct ib_sge *sg) +{ + if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) + /* fail if free */ + ctrl_seg->flags = MLX5_UMR_CHECK_FREE; + else + /* fail if not free */ + ctrl_seg->flags = MLX5_UMR_CHECK_NOT_FREE; + ctrl_seg->xlt_octowords = + cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length)); +} + +static void mlx5r_umr_set_update_xlt_mkey_seg(struct mlx5_ib_dev *dev, + struct mlx5_mkey_seg *mkey_seg, + struct mlx5_ib_mr *mr, + unsigned int page_shift) +{ + mlx5r_umr_set_access_flags(dev, mkey_seg, mr->access_flags); + MLX5_SET(mkc, mkey_seg, pd, to_mpd(mr->ibmr.pd)->pdn); + MLX5_SET64(mkc, mkey_seg, start_addr, mr->ibmr.iova); + MLX5_SET64(mkc, mkey_seg, len, mr->ibmr.length); + MLX5_SET(mkc, mkey_seg, log_page_size, page_shift); + MLX5_SET(mkc, mkey_seg, qpn, 0xffffff); + MLX5_SET(mkc, mkey_seg, mkey_7_0, mlx5_mkey_variant(mr->mmkey.key)); +} + +static void +mlx5r_umr_set_update_xlt_data_seg(struct mlx5_wqe_data_seg *data_seg, + struct ib_sge *sg) +{ + data_seg->byte_count = cpu_to_be32(sg->length); + data_seg->lkey = cpu_to_be32(sg->lkey); + data_seg->addr = cpu_to_be64(sg->addr); +} + +static void mlx5r_umr_update_offset(struct mlx5_wqe_umr_ctrl_seg *ctrl_seg, + u64 offset) +{ + u64 octo_offset = mlx5r_umr_get_xlt_octo(offset); + + ctrl_seg->xlt_offset = cpu_to_be16(octo_offset & 0xffff); + ctrl_seg->xlt_offset_47_16 = cpu_to_be32(octo_offset >> 16); + ctrl_seg->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; +} + +static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, + struct mlx5r_umr_wqe *wqe, + struct mlx5_ib_mr *mr, struct ib_sge *sg, + unsigned int flags) +{ + bool update_pd_access, update_translation; + + if (flags & MLX5_IB_UPD_XLT_ENABLE) + wqe->ctrl_seg.mkey_mask |= get_umr_enable_mr_mask(); + + update_pd_access = flags & MLX5_IB_UPD_XLT_ENABLE || + flags & MLX5_IB_UPD_XLT_PD || + flags & MLX5_IB_UPD_XLT_ACCESS; + + if (update_pd_access) { + wqe->ctrl_seg.mkey_mask |= get_umr_update_access_mask(dev); + wqe->ctrl_seg.mkey_mask |= get_umr_update_pd_mask(); + } + + update_translation = + flags & MLX5_IB_UPD_XLT_ENABLE || flags & MLX5_IB_UPD_XLT_ADDR; + + if (update_translation) { + wqe->ctrl_seg.mkey_mask |= get_umr_update_translation_mask(); + if (!mr->ibmr.length) + MLX5_SET(mkc, &wqe->mkey_seg, length64, 1); + } + + wqe->ctrl_seg.xlt_octowords = + cpu_to_be16(mlx5r_umr_get_xlt_octo(sg->length)); + wqe->data_seg.byte_count = cpu_to_be32(sg->length); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. + */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct device *ddev = &dev->mdev->pdev->dev; + struct mlx5r_umr_wqe wqe = {}; + struct ib_block_iter biter; + struct mlx5_mtt *cur_mtt; + size_t orig_sg_length; + struct mlx5_mtt *mtt; + size_t final_size; + struct ib_sge sg; + u64 offset = 0; + int err = 0; + + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + mtt = mlx5r_umr_create_xlt( + dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + sizeof(*mtt), flags); + if (!mtt) + return -ENOMEM; + + orig_sg_length = sg.length; + + mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); + mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, + mr->page_shift); + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); + + cur_mtt = mtt; + rdma_for_each_block(mr->umem->sgt_append.sgt.sgl, &biter, + mr->umem->sgt_append.sgt.nents, + BIT(mr->page_shift)) { + if (cur_mtt == (void *)mtt + sg.length) { + dma_sync_single_for_device(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + + err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, + true); + if (err) + goto err; + dma_sync_single_for_cpu(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + offset += sg.length; + mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); + + cur_mtt = mtt; + } + + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + + cur_mtt++; + } + + final_size = (void *)cur_mtt - (void *)mtt; + sg.length = ALIGN(final_size, MLX5_UMR_MTT_ALIGNMENT); + memset(cur_mtt, 0, sg.length - final_size); + mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); + + dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); + err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true); + +err: + sg.length = orig_sg_length; + mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + return err; +} + +static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) +{ + return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); +} + +int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags) +{ + int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) + ? sizeof(struct mlx5_klm) + : sizeof(struct mlx5_mtt); + const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + struct mlx5_ib_dev *dev = mr_to_mdev(mr); + struct device *ddev = &dev->mdev->pdev->dev; + const int page_mask = page_align - 1; + struct mlx5r_umr_wqe wqe = {}; + size_t pages_mapped = 0; + size_t pages_to_map = 0; + size_t size_to_map = 0; + size_t orig_sg_length; + size_t pages_iter; + struct ib_sge sg; + int err = 0; + void *xlt; + + if ((flags & MLX5_IB_UPD_XLT_INDIRECT) && + !umr_can_use_indirect_mkey(dev)) + return -EPERM; + + if (WARN_ON(!mr->umem->is_odp)) + return -EINVAL; + + /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, + * so we need to align the offset and length accordingly + */ + if (idx & page_mask) { + npages += idx & page_mask; + idx &= ~page_mask; + } + pages_to_map = ALIGN(npages, page_align); + + xlt = mlx5r_umr_create_xlt(dev, &sg, npages, desc_size, flags); + if (!xlt) + return -ENOMEM; + + pages_iter = sg.length / desc_size; + orig_sg_length = sg.length; + + if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) { + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + size_t max_pages = ib_umem_odp_num_pages(odp) - idx; + + pages_to_map = min_t(size_t, pages_to_map, max_pages); + } + + mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); + mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, page_shift); + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); + + for (pages_mapped = 0; + pages_mapped < pages_to_map && !err; + pages_mapped += pages_iter, idx += pages_iter) { + npages = min_t(int, pages_iter, pages_to_map - pages_mapped); + size_to_map = npages * desc_size; + dma_sync_single_for_cpu(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags); + dma_sync_single_for_device(ddev, sg.addr, sg.length, + DMA_TO_DEVICE); + sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT); + + if (pages_mapped + pages_iter >= pages_to_map) + mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); + mlx5r_umr_update_offset(&wqe.ctrl_seg, idx * desc_size); + err = mlx5r_umr_post_send_wait(dev, mr->mmkey.key, &wqe, true); + } + sg.length = orig_sg_length; + mlx5r_umr_unmap_free_xlt(dev, xlt, &sg); + return err; +} diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h new file mode 100644 index 000000000000..c9d0021381a2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. */ + +#ifndef _MLX5_IB_UMR_H +#define _MLX5_IB_UMR_H + +#include "mlx5_ib.h" + + +#define MLX5_MAX_UMR_SHIFT 16 +#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) + +#define MLX5_IB_UMR_OCTOWORD 16 +#define MLX5_IB_UMR_XLT_ALIGNMENT 64 + +int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev); +void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev); + +static inline bool mlx5r_umr_can_load_pas(struct mlx5_ib_dev *dev, + size_t length) +{ + /* + * umr_check_mkey_mask() rejects MLX5_MKEY_MASK_PAGE_SIZE which is + * always set if MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (aka + * MLX5_IB_UPD_XLT_ADDR and MLX5_IB_UPD_XLT_ENABLE) is set. Thus, a mkey + * can never be enabled without this capability. Simplify this weird + * quirky hardware by just saying it can't use PAS lists with UMR at + * all. + */ + if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) + return false; + + /* + * length is the size of the MR in bytes when mlx5_ib_update_xlt() is + * used. + */ + if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) && + length >= MLX5_MAX_UMR_PAGES * PAGE_SIZE) + return false; + return true; +} + +/* + * true if an existing MR can be reconfigured to new access_flags using UMR. + * Older HW cannot use UMR to update certain elements of the MKC. See + * get_umr_update_access_mask() and umr_check_mkey_mask() + */ +static inline bool mlx5r_umr_can_reconfig(struct mlx5_ib_dev *dev, + unsigned int current_access_flags, + unsigned int target_access_flags) +{ + unsigned int diffs = current_access_flags ^ target_access_flags; + + if ((diffs & IB_ACCESS_REMOTE_ATOMIC) && + MLX5_CAP_GEN(dev->mdev, atomic) && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) + return false; + + if ((diffs & IB_ACCESS_RELAXED_ORDERING) && + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) + return false; + + if ((diffs & IB_ACCESS_RELAXED_ORDERING) && + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) && + !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) + return false; + + return true; +} + +static inline u64 mlx5r_umr_get_xlt_octo(u64 bytes) +{ + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; +} + +struct mlx5r_umr_context { + struct ib_cqe cqe; + enum ib_wc_status status; + struct completion done; +}; + +struct mlx5r_umr_wqe { + struct mlx5_wqe_umr_ctrl_seg ctrl_seg; + struct mlx5_mkey_seg mkey_seg; + struct mlx5_wqe_data_seg data_seg; +}; + +int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); +int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, + int access_flags); +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags); + +#endif /* _MLX5_IB_UMR_H */ diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c index 51e48ca9016e..855f3f4fefad 100644 --- a/drivers/infiniband/hw/mlx5/wr.c +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -7,6 +7,7 @@ #include #include #include "wr.h" +#include "umr.h" static const u32 mlx5_ib_opcode[] = { [IB_WR_SEND] = MLX5_OPCODE_SEND, @@ -25,58 +26,7 @@ static const u32 mlx5_ib_opcode[] = { [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, }; -/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the - * next nearby edge and get new address translation for current WQE position. - * @sq - SQ buffer. - * @seg: Current WQE position (16B aligned). - * @wqe_sz: Total current WQE size [16B]. - * @cur_edge: Updated current edge. - */ -static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg, - u32 wqe_sz, void **cur_edge) -{ - u32 idx; - - if (likely(*seg != *cur_edge)) - return; - - idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1); - *cur_edge = get_sq_edge(sq, idx); - - *seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx); -} - -/* memcpy_send_wqe - copy data from src to WQE and update the relevant WQ's - * pointers. At the end @seg is aligned to 16B regardless the copied size. - * @sq - SQ buffer. - * @cur_edge: Updated current edge. - * @seg: Current WQE position (16B aligned). - * @wqe_sz: Total current WQE size [16B]. - * @src: Pointer to copy from. - * @n: Number of bytes to copy. - */ -static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge, - void **seg, u32 *wqe_sz, const void *src, - size_t n) -{ - while (likely(n)) { - size_t leftlen = *cur_edge - *seg; - size_t copysz = min_t(size_t, leftlen, n); - size_t stride; - - memcpy(*seg, src, copysz); - - n -= copysz; - src += copysz; - stride = !n ? ALIGN(copysz, 16) : copysz; - *seg += stride; - *wqe_sz += stride >> 4; - handle_post_send_edge(sq, seg, *wqe_sz, cur_edge); - } -} - -static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, - struct ib_cq *ib_cq) +int mlx5r_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) { struct mlx5_ib_cq *cq; unsigned int cur; @@ -122,9 +72,9 @@ static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, eseg->mss = cpu_to_be16(ud_wr->mss); eseg->inline_hdr.sz = cpu_to_be16(left); - /* memcpy_send_wqe should get a 16B align address. Hence, we - * first copy up to the current edge and then, if needed, - * continue to memcpy_send_wqe. + /* mlx5r_memcpy_send_wqe should get a 16B align address. Hence, + * we first copy up to the current edge and then, if needed, + * continue to mlx5r_memcpy_send_wqe. */ copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start, left); @@ -138,8 +88,8 @@ static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, handle_post_send_edge(&qp->sq, seg, *size, cur_edge); left -= copysz; pdata += copysz; - memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata, - left); + mlx5r_memcpy_send_wqe(&qp->sq, cur_edge, seg, size, + pdata, left); } return; @@ -165,12 +115,6 @@ static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) dseg->addr = cpu_to_be64(sg->addr); } -static u64 get_xlt_octo(u64 bytes) -{ - return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / - MLX5_IB_UMR_OCTOWORD; -} - static __be64 frwr_mkey_mask(bool atomic) { u64 result; @@ -222,7 +166,7 @@ static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, memset(umr, 0, sizeof(*umr)); umr->flags = flags; - umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->xlt_octowords = cpu_to_be16(mlx5r_umr_get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(atomic); } @@ -233,134 +177,6 @@ static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) umr->flags = MLX5_UMR_INLINE; } -static __be64 get_umr_enable_mr_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; - - return cpu_to_be64(result); -} - -static __be64 get_umr_disable_mr_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_FREE; - - return cpu_to_be64(result); -} - -static __be64 get_umr_update_translation_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR; - - return cpu_to_be64(result); -} - -static __be64 get_umr_update_access_mask(int atomic, - int relaxed_ordering_write, - int relaxed_ordering_read) -{ - u64 result; - - result = MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW; - - if (atomic) - result |= MLX5_MKEY_MASK_A; - - if (relaxed_ordering_write) - result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE; - - if (relaxed_ordering_read) - result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ; - - return cpu_to_be64(result); -} - -static __be64 get_umr_update_pd_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_PD; - - return cpu_to_be64(result); -} - -static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) -{ - if (mask & MLX5_MKEY_MASK_PAGE_SIZE && - MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) - return -EPERM; - - if (mask & MLX5_MKEY_MASK_A && - MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) - return -EPERM; - - if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE && - !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) - return -EPERM; - - if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ && - !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) - return -EPERM; - - return 0; -} - -static int set_reg_umr_segment(struct mlx5_ib_dev *dev, - struct mlx5_wqe_umr_ctrl_seg *umr, - const struct ib_send_wr *wr) -{ - const struct mlx5_umr_wr *umrwr = umr_wr(wr); - - memset(umr, 0, sizeof(*umr)); - - if (!umrwr->ignore_free_state) { - if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) - /* fail if free */ - umr->flags = MLX5_UMR_CHECK_FREE; - else - /* fail if not free */ - umr->flags = MLX5_UMR_CHECK_NOT_FREE; - } - - umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { - u64 offset = get_xlt_octo(umrwr->offset); - - umr->xlt_offset = cpu_to_be16(offset & 0xffff); - umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); - umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) - umr->mkey_mask |= get_umr_update_translation_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { - umr->mkey_mask |= get_umr_update_access_mask( - !!(MLX5_CAP_GEN(dev->mdev, atomic)), - !!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)), - !!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))); - umr->mkey_mask |= get_umr_update_pd_mask(); - } - if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) - umr->mkey_mask |= get_umr_enable_mr_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) - umr->mkey_mask |= get_umr_disable_mr_mask(); - - if (!wr->num_sge) - umr->flags |= MLX5_UMR_INLINE; - - return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); -} - static u8 get_umr_flags(int acc) { return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | @@ -398,43 +214,6 @@ static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) seg->status = MLX5_MKEY_STATUS_FREE; } -static void set_reg_mkey_segment(struct mlx5_ib_dev *dev, - struct mlx5_mkey_seg *seg, - const struct ib_send_wr *wr) -{ - const struct mlx5_umr_wr *umrwr = umr_wr(wr); - - memset(seg, 0, sizeof(*seg)); - if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) - MLX5_SET(mkc, seg, free, 1); - - MLX5_SET(mkc, seg, a, - !!(umrwr->access_flags & IB_ACCESS_REMOTE_ATOMIC)); - MLX5_SET(mkc, seg, rw, - !!(umrwr->access_flags & IB_ACCESS_REMOTE_WRITE)); - MLX5_SET(mkc, seg, rr, !!(umrwr->access_flags & IB_ACCESS_REMOTE_READ)); - MLX5_SET(mkc, seg, lw, !!(umrwr->access_flags & IB_ACCESS_LOCAL_WRITE)); - MLX5_SET(mkc, seg, lr, 1); - if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) - MLX5_SET(mkc, seg, relaxed_ordering_write, - !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING)); - if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) - MLX5_SET(mkc, seg, relaxed_ordering_read, - !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING)); - - if (umrwr->pd) - MLX5_SET(mkc, seg, pd, to_mpd(umrwr->pd)->pdn); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && - !umrwr->length) - MLX5_SET(mkc, seg, length64, 1); - - MLX5_SET64(mkc, seg, start_addr, umrwr->virt_addr); - MLX5_SET64(mkc, seg, len, umrwr->length); - MLX5_SET(mkc, seg, log_page_size, umrwr->page_shift); - MLX5_SET(mkc, seg, qpn, 0xffffff); - MLX5_SET(mkc, seg, mkey_7_0, mlx5_mkey_variant(umrwr->mkey)); -} - static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, struct mlx5_ib_mr *mr, struct mlx5_ib_pd *pd) @@ -760,7 +539,7 @@ static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); seg->len = cpu_to_be64(length); - seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); + seg->xlt_oct_size = cpu_to_be32(mlx5r_umr_get_xlt_octo(size)); seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); } @@ -770,7 +549,7 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; - umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->xlt_octowords = cpu_to_be16(mlx5r_umr_get_xlt_octo(size)); umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); umr->mkey_mask = sig_mkey_mask(); } @@ -870,7 +649,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, * Relaxed Ordering is set implicitly in mlx5_set_umr_free_mkey() and * kernel ULPs are not aware of it, so we don't set it here. */ - if (!mlx5_ib_can_reconfig_with_umr(dev, 0, wr->access)) { + if (!mlx5r_umr_can_reconfig(dev, 0, wr->access)) { mlx5_ib_warn( to_mdev(qp->ibqp.device), "Fast update for MR access flags is not possible\n"); @@ -899,8 +678,8 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, handle_post_send_edge(&qp->sq, seg, *size, cur_edge); if (umr_inline) { - memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs, - mr_list_size); + mlx5r_memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs, + mr_list_size); *size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4); } else { set_reg_data_seg(*seg, mr, pd); @@ -942,23 +721,22 @@ static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16) } } -static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, - struct mlx5_wqe_ctrl_seg **ctrl, - const struct ib_send_wr *wr, unsigned int *idx, - int *size, void **cur_edge, int nreq, - bool send_signaled, bool solicited) +int mlx5r_begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, unsigned int *idx, + int *size, void **cur_edge, int nreq, __be32 general_id, + bool send_signaled, bool solicited) { - if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) + if (unlikely(mlx5r_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) return -ENOMEM; *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); *seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx); *ctrl = *seg; *(uint32_t *)(*seg + 8) = 0; - (*ctrl)->imm = send_ieth(wr); + (*ctrl)->general_id = general_id; (*ctrl)->fm_ce_se = qp->sq_signal_bits | - (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | - (solicited ? MLX5_WQE_CTRL_SOLICITED : 0); + (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (solicited ? MLX5_WQE_CTRL_SOLICITED : 0); *seg += sizeof(**ctrl); *size = sizeof(**ctrl) / 16; @@ -972,16 +750,14 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, const struct ib_send_wr *wr, unsigned int *idx, int *size, void **cur_edge, int nreq) { - return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, - wr->send_flags & IB_SEND_SIGNALED, - wr->send_flags & IB_SEND_SOLICITED); + return mlx5r_begin_wqe(qp, seg, ctrl, idx, size, cur_edge, nreq, + send_ieth(wr), wr->send_flags & IB_SEND_SIGNALED, + wr->send_flags & IB_SEND_SOLICITED); } -static void finish_wqe(struct mlx5_ib_qp *qp, - struct mlx5_wqe_ctrl_seg *ctrl, - void *seg, u8 size, void *cur_edge, - unsigned int idx, u64 wr_id, int nreq, u8 fence, - u32 mlx5_opcode) +void mlx5r_finish_wqe(struct mlx5_ib_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, + void *seg, u8 size, void *cur_edge, unsigned int idx, + u64 wr_id, int nreq, u8 fence, u32 mlx5_opcode) { u8 opmod = 0; @@ -1045,8 +821,8 @@ static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, /* * SET_PSV WQEs are not signaled and solicited on error. */ - err = __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, - false, true); + err = mlx5r_begin_wqe(qp, seg, ctrl, idx, size, cur_edge, nreq, + send_ieth(wr), false, true); if (unlikely(err)) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; @@ -1057,8 +833,8 @@ static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, mlx5_ib_warn(dev, "\n"); goto out; } - finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, - next_fence, MLX5_OPCODE_SET_PSV); + mlx5r_finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, + nreq, next_fence, MLX5_OPCODE_SET_PSV); out: return err; @@ -1098,8 +874,8 @@ static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev, if (unlikely(err)) goto out; - finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, - nreq, fence, MLX5_OPCODE_UMR); + mlx5r_finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, + wr->wr_id, nreq, fence, MLX5_OPCODE_UMR); err = begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq); if (unlikely(err)) { @@ -1130,8 +906,8 @@ static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev, mlx5_ib_warn(dev, "\n"); goto out; } - finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, - fence, MLX5_OPCODE_UMR); + mlx5r_finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, + nreq, fence, MLX5_OPCODE_UMR); sig_attrs = mr->ibmr.sig_attrs; err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, @@ -1246,33 +1022,30 @@ static void handle_qpt_ud(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, } } -static int handle_qpt_reg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, - int *size, void **cur_edge, unsigned int idx) +void mlx5r_ring_db(struct mlx5_ib_qp *qp, unsigned int nreq, + struct mlx5_wqe_ctrl_seg *ctrl) { - int err = 0; + struct mlx5_bf *bf = &qp->bf; - if (unlikely(wr->opcode != MLX5_IB_WR_UMR)) { - err = -EINVAL; - mlx5_ib_warn(dev, "bad opcode %d\n", wr->opcode); - goto out; - } + qp->sq.head += nreq; - qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; - (*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey); - err = set_reg_umr_segment(dev, *seg, wr); - if (unlikely(err)) - goto out; - *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - set_reg_mkey_segment(dev, *seg, wr); - *seg += sizeof(struct mlx5_mkey_seg); - *size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); -out: - return err; + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell. + */ + wmb(); + + mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + bf->offset ^= bf->buf_size; } int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, @@ -1283,7 +1056,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_wqe_xrc_seg *xrc; - struct mlx5_bf *bf; void *cur_edge; int size; unsigned long flags; @@ -1305,8 +1077,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->type == IB_QPT_GSI) return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); - bf = &qp->bf; - spin_lock_irqsave(&qp->sq.lock, flags); for (nreq = 0; wr; nreq++, wr = wr->next) { @@ -1384,12 +1154,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, case IB_QPT_UD: handle_qpt_ud(qp, wr, &seg, &size, &cur_edge); break; - case MLX5_IB_QPT_REG_UMR: - err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg, - &size, &cur_edge, idx); - if (unlikely(err)) - goto out; - break; default: break; @@ -1418,35 +1182,16 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, } qp->next_fence = next_fence; - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq, - fence, mlx5_ib_opcode[wr->opcode]); + mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, + nreq, fence, mlx5_ib_opcode[wr->opcode]); skip_psv: if (0) dump_wqe(qp, idx, size); } out: - if (likely(nreq)) { - qp->sq.head += nreq; - - /* Make sure that descriptors are written before - * updating doorbell record and ringing the doorbell - */ - wmb(); - - qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); - - /* Make sure doorbell record is visible to the HCA before - * we hit doorbell. - */ - wmb(); - - mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset); - /* Make sure doorbells don't leak out of SQ spinlock - * and reach the HCA out of order. - */ - bf->offset ^= bf->buf_size; - } + if (likely(nreq)) + mlx5r_ring_db(qp, nreq, ctrl); spin_unlock_irqrestore(&qp->sq.lock, flags); @@ -1486,7 +1231,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; nreq++, wr = wr->next) { - if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + if (mlx5r_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { err = -ENOMEM; *bad_wr = wr; goto out; diff --git a/drivers/infiniband/hw/mlx5/wr.h b/drivers/infiniband/hw/mlx5/wr.h index 4f0057516402..2dc89438000d 100644 --- a/drivers/infiniband/hw/mlx5/wr.h +++ b/drivers/infiniband/hw/mlx5/wr.h @@ -41,6 +41,66 @@ static inline void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx) return fragment_end + MLX5_SEND_WQE_BB; } +/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the + * next nearby edge and get new address translation for current WQE position. + * @sq: SQ buffer. + * @seg: Current WQE position (16B aligned). + * @wqe_sz: Total current WQE size [16B]. + * @cur_edge: Updated current edge. + */ +static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg, + u32 wqe_sz, void **cur_edge) +{ + u32 idx; + + if (likely(*seg != *cur_edge)) + return; + + idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1); + *cur_edge = get_sq_edge(sq, idx); + + *seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx); +} + +/* mlx5r_memcpy_send_wqe - copy data from src to WQE and update the relevant + * WQ's pointers. At the end @seg is aligned to 16B regardless the copied size. + * @sq: SQ buffer. + * @cur_edge: Updated current edge. + * @seg: Current WQE position (16B aligned). + * @wqe_sz: Total current WQE size [16B]. + * @src: Pointer to copy from. + * @n: Number of bytes to copy. + */ +static inline void mlx5r_memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge, + void **seg, u32 *wqe_sz, + const void *src, size_t n) +{ + while (likely(n)) { + size_t leftlen = *cur_edge - *seg; + size_t copysz = min_t(size_t, leftlen, n); + size_t stride; + + memcpy(*seg, src, copysz); + + n -= copysz; + src += copysz; + stride = !n ? ALIGN(copysz, 16) : copysz; + *seg += stride; + *wqe_sz += stride >> 4; + handle_post_send_edge(sq, seg, *wqe_sz, cur_edge); + } +} + +int mlx5r_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq); +int mlx5r_begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, unsigned int *idx, + int *size, void **cur_edge, int nreq, __be32 general_id, + bool send_signaled, bool solicited); +void mlx5r_finish_wqe(struct mlx5_ib_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, + void *seg, u8 size, void *cur_edge, unsigned int idx, + u64 wr_id, int nreq, u8 fence, u32 mlx5_opcode); +void mlx5r_ring_db(struct mlx5_ib_qp *qp, unsigned int nreq, + struct mlx5_wqe_ctrl_seg *ctrl); int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr, bool drain); int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index acf9970ec245..dd4021b11963 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -90,8 +90,8 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_SHUTDOWN_PORT | IB_DEVICE_SYS_IMAGE_GUID | - IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; attr->max_send_sge = dev->attr.max_send_sge; attr->max_recv_sge = dev->attr.max_recv_sge; attr->max_sge_rd = dev->attr.max_rdma_sge; diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 65ce6d0f1885..5152f10d2e6d 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -500,7 +500,6 @@ static void qedr_sync_free_irqs(struct qedr_dev *dev) if (dev->int_info.msix_cnt) { idx = i * dev->num_hwfns + dev->affin_hwfn_idx; vector = dev->int_info.msix[idx].vector; - synchronize_irq(vector); free_irq(vector, &dev->cnq_array[i]); } } diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index a53476653b0d..f0f43b6db89e 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -134,7 +134,8 @@ int qedr_query_device(struct ib_device *ibdev, attr->max_qp_wr = max_t(u32, qattr->max_sqe, qattr->max_rqe); attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD | IB_DEVICE_RC_RNR_NAK_GEN | - IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; + IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; if (!rdma_protocol_iwarp(&dev->ibdev, 1)) attr->device_cap_flags |= IB_DEVICE_XRC; diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index a8e1c30c370f..b37b1c6d35c6 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -678,7 +678,7 @@ struct qib_pportdata { /* Observers. Not to be taken lightly, possibly not to ship. */ /* * If a diag read or write is to (bottom <= offset <= top), - * the "hoook" is called, allowing, e.g. shadows to be + * the "hook" is called, allowing, e.g. shadows to be * updated in sync with the driver. struct diag_observer * is the "visible" part. */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index d346dd48e731..46653ad56f5a 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -534,6 +534,11 @@ static int usnic_ib_pci_probe(struct pci_dev *pdev, struct usnic_ib_vf *vf; enum usnic_vnic_res_type res_type; + if (!device_iommu_mapped(&pdev->dev)) { + usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); + return -EPERM; + } + vf = kzalloc(sizeof(*vf), GFP_KERNEL); if (!vf) return -ENOMEM; @@ -642,12 +647,6 @@ static int __init usnic_ib_init(void) printk_once(KERN_INFO "%s", usnic_version); - err = usnic_uiom_init(DRV_NAME); - if (err) { - usnic_err("Unable to initialize umem with err %d\n", err); - return err; - } - err = pci_register_driver(&usnic_ib_pci_driver); if (err) { usnic_err("Unable to register with PCI\n"); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index d3a9670bf971..6e8c4fbb8083 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -305,7 +305,8 @@ int usnic_ib_query_device(struct ib_device *ibdev, props->max_qp = qp_per_vf * kref_read(&us_ibdev->vf_cnt); props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | - IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + IB_DEVICE_SYS_IMAGE_GUID; + props->kernel_cap_flags = IBK_BLOCK_MULTICAST_LOOPBACK; props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] * kref_read(&us_ibdev->vf_cnt); props->max_pd = USNIC_UIOM_MAX_PD_CNT; @@ -442,7 +443,7 @@ int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct usnic_ib_pd *pd = to_upd(ibpd); - pd->umem_pd = usnic_uiom_alloc_pd(); + pd->umem_pd = usnic_uiom_alloc_pd(ibpd->device->dev.parent); if (IS_ERR(pd->umem_pd)) return PTR_ERR(pd->umem_pd); @@ -706,4 +707,3 @@ int usnic_ib_mmap(struct ib_ucontext *context, usnic_err("No VF %u found\n", vfid); return -EINVAL; } - diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 760b254ba42d..e212929369df 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include "usnic_log.h" @@ -439,7 +438,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr) __usnic_uiom_release_tail(uiomr); } -struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) +struct usnic_uiom_pd *usnic_uiom_alloc_pd(struct device *dev) { struct usnic_uiom_pd *pd; void *domain; @@ -448,7 +447,7 @@ struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) if (!pd) return ERR_PTR(-ENOMEM); - pd->domain = domain = iommu_domain_alloc(&pci_bus_type); + pd->domain = domain = iommu_domain_alloc(dev->bus); if (!domain) { usnic_err("Failed to allocate IOMMU domain"); kfree(pd); @@ -556,13 +555,3 @@ void usnic_uiom_free_dev_list(struct device **devs) { kfree(devs); } - -int usnic_uiom_init(char *drv_name) -{ - if (!iommu_present(&pci_bus_type)) { - usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function w/o enabling IOMMU\n"); - return -EPERM; - } - - return 0; -} diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h index 7ec8991ace67..5a9acf941510 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -80,7 +80,7 @@ struct usnic_uiom_chunk { struct scatterlist page_list[]; }; -struct usnic_uiom_pd *usnic_uiom_alloc_pd(void); +struct usnic_uiom_pd *usnic_uiom_alloc_pd(struct device *dev); void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd); int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev); void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, @@ -91,5 +91,4 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, unsigned long addr, size_t size, int access, int dmasync); void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr); -int usnic_uiom_init(char *drv_name); #endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 8ef112f883a7..3acab569fbb9 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2775,7 +2775,7 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, EXPORT_SYMBOL(rvt_qp_iter); /* - * This should be called with s_lock held. + * This should be called with s_lock and r_lock held. */ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status) @@ -3134,7 +3134,9 @@ send_comp: rvp->n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + spin_lock(&sqp->r_lock); rvt_send_complete(sqp, wqe, send_status); + spin_unlock(&sqp->r_lock); if (local_ops) { atomic_dec(&sqp->local_ops_pending); local_ops = 0; @@ -3188,7 +3190,9 @@ serr: spin_unlock_irqrestore(&qp->r_lock, flags); serr_no_r_lock: spin_lock_irqsave(&sqp->s_lock, flags); + spin_lock(&sqp->r_lock); rvt_send_complete(sqp, wqe, send_status); + spin_unlock(&sqp->r_lock); if (sqp->ibqp.qp_type == IB_QPT_RC) { int lastwqe; diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 2dae7538a2ea..51daac5c4feb 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -46,6 +46,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_qp = RXE_MAX_QP; rxe->attr.max_qp_wr = RXE_MAX_QP_WR; rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; + rxe->attr.kernel_cap_flags = IBK_ALLOW_USER_UNREG; rxe->attr.max_send_sge = RXE_MAX_SGE; rxe->attr.max_recv_sge = RXE_MAX_SGE; rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 138b3e7d3a5f..da3a398053b8 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -562,7 +562,8 @@ int rxe_completer(void *arg) enum comp_state state; int ret = 0; - rxe_get(qp); + if (!rxe_get(qp)) + return -EAGAIN; if (!qp->valid || qp->req.state == QP_STATE_ERROR || qp->req.state == QP_STATE_RESET) { diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 2ffbe3390668..0e022ae1b8a5 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -37,7 +37,7 @@ int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited); void rxe_cq_disable(struct rxe_cq *cq); -void rxe_cq_cleanup(struct rxe_pool_elem *arg); +void rxe_cq_cleanup(struct rxe_pool_elem *elem); /* rxe_mcast.c */ struct rxe_mcg *rxe_lookup_mcg(struct rxe_dev *rxe, union ib_gid *mgid); @@ -81,7 +81,7 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 rkey); int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); int rxe_mr_set_page(struct ib_mr *ibmr, u64 addr); int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); -void rxe_mr_cleanup(struct rxe_pool_elem *arg); +void rxe_mr_cleanup(struct rxe_pool_elem *elem); /* rxe_mw.c */ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); @@ -89,7 +89,7 @@ int rxe_dealloc_mw(struct ib_mw *ibmw); int rxe_bind_mw(struct rxe_qp *qp, struct rxe_send_wqe *wqe); int rxe_invalidate_mw(struct rxe_qp *qp, u32 rkey); struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey); -void rxe_mw_cleanup(struct rxe_pool_elem *arg); +void rxe_mw_cleanup(struct rxe_pool_elem *elem); /* rxe_net.c */ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, @@ -114,7 +114,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask); void rxe_qp_error(struct rxe_qp *qp); int rxe_qp_chk_destroy(struct rxe_qp *qp); -void rxe_qp_destroy(struct rxe_qp *qp); void rxe_qp_cleanup(struct rxe_pool_elem *elem); static inline int qp_num(struct rxe_qp *qp) @@ -159,18 +158,16 @@ void retransmit_timer(struct timer_list *t); void rnr_nak_timer(struct timer_list *t); /* rxe_srq.c */ -#define IB_SRQ_INIT_MASK (~IB_SRQ_LIMIT) - -int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, - struct ib_srq_attr *attr, enum ib_srq_attr_mask mask); - +int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init); int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_init_attr *init, struct ib_udata *udata, struct rxe_create_srq_resp __user *uresp); - +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask); int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata); +void rxe_srq_cleanup(struct rxe_pool_elem *elem); void rxe_dealloc(struct ib_device *ib_dev); diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 873a9b10307c..86cc2e18a7fd 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -206,8 +206,10 @@ static struct rxe_mcg *rxe_get_mcg(struct rxe_dev *rxe, union ib_gid *mgid) /* speculative alloc of new mcg */ mcg = kzalloc(sizeof(*mcg), GFP_KERNEL); - if (!mcg) - return ERR_PTR(-ENOMEM); + if (!mcg) { + err = -ENOMEM; + goto err_dec; + } spin_lock_bh(&rxe->mcg_lock); /* re-check to see if someone else just added it */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 60a31b718774..fc3942e04a1f 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -683,14 +683,10 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); - if (atomic_read(&mr->num_mw) > 0) { - pr_warn("%s: Attempt to deregister an MR while bound to MWs\n", - __func__); + /* See IBA 10.6.7.2.6 */ + if (atomic_read(&mr->num_mw) > 0) return -EINVAL; - } - mr->state = RXE_MR_STATE_INVALID; - rxe_put(mr_pd(mr)); rxe_put(mr); return 0; @@ -700,6 +696,8 @@ void rxe_mr_cleanup(struct rxe_pool_elem *elem) { struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); + rxe_put(mr_pd(mr)); + ib_umem_release(mr->umem); if (mr->cur_map_set) diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c index c86b2efd58f2..2e1fa844fabf 100644 --- a/drivers/infiniband/sw/rxe/rxe_mw.c +++ b/drivers/infiniband/sw/rxe/rxe_mw.c @@ -3,6 +3,14 @@ * Copyright (c) 2020 Hewlett Packard Enterprise, Inc. All rights reserved. */ +/* + * The rdma_rxe driver supports type 1 or type 2B memory windows. + * Type 1 MWs are created by ibv_alloc_mw() verbs calls and bound by + * ibv_bind_mw() calls. Type 2 MWs are also created by ibv_alloc_mw() + * but bound by bind_mw work requests. The ibv_bind_mw() call is converted + * by libibverbs to a bind_mw work request. + */ + #include "rxe.h" int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) @@ -28,40 +36,11 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) return 0; } -static void rxe_do_dealloc_mw(struct rxe_mw *mw) -{ - if (mw->mr) { - struct rxe_mr *mr = mw->mr; - - mw->mr = NULL; - atomic_dec(&mr->num_mw); - rxe_put(mr); - } - - if (mw->qp) { - struct rxe_qp *qp = mw->qp; - - mw->qp = NULL; - rxe_put(qp); - } - - mw->access = 0; - mw->addr = 0; - mw->length = 0; - mw->state = RXE_MW_STATE_INVALID; -} - int rxe_dealloc_mw(struct ib_mw *ibmw) { struct rxe_mw *mw = to_rmw(ibmw); - struct rxe_pd *pd = to_rpd(ibmw->pd); - - spin_lock_bh(&mw->lock); - rxe_do_dealloc_mw(mw); - spin_unlock_bh(&mw->lock); rxe_put(mw); - rxe_put(pd); return 0; } @@ -328,3 +307,31 @@ struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey) return mw; } + +void rxe_mw_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_mw *mw = container_of(elem, typeof(*mw), elem); + struct rxe_pd *pd = to_rpd(mw->ibmw.pd); + + rxe_put(pd); + + if (mw->mr) { + struct rxe_mr *mr = mw->mr; + + mw->mr = NULL; + atomic_dec(&mr->num_mw); + rxe_put(mr); + } + + if (mw->qp) { + struct rxe_qp *qp = mw->qp; + + mw->qp = NULL; + rxe_put(qp); + } + + mw->access = 0; + mw->addr = 0; + mw->length = 0; + mw->state = RXE_MW_STATE_INVALID; +} diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.c b/drivers/infiniband/sw/rxe/rxe_opcode.c index df596ba7527d..d4ba4d506f17 100644 --- a/drivers/infiniband/sw/rxe/rxe_opcode.c +++ b/drivers/infiniband/sw/rxe/rxe_opcode.c @@ -29,7 +29,6 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { [IB_WR_SEND] = { .name = "IB_WR_SEND", .mask = { - [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, @@ -39,7 +38,6 @@ struct rxe_wr_opcode_info rxe_wr_opcode_info[] = { [IB_WR_SEND_WITH_IMM] = { .name = "IB_WR_SEND_WITH_IMM", .mask = { - [IB_QPT_SMI] = WR_INLINE_MASK | WR_SEND_MASK, [IB_QPT_GSI] = WR_INLINE_MASK | WR_SEND_MASK, [IB_QPT_RC] = WR_INLINE_MASK | WR_SEND_MASK, [IB_QPT_UC] = WR_INLINE_MASK | WR_SEND_MASK, diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 918270e34a35..568a7cbd13d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -50,9 +50,7 @@ enum rxe_device_param { | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS - | IB_DEVICE_ALLOW_USER_UNREG | IB_DEVICE_MEM_WINDOW - | IB_DEVICE_MEM_WINDOW_TYPE_2A | IB_DEVICE_MEM_WINDOW_TYPE_2B, RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 87066d04ed18..19b14826385b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -13,7 +13,6 @@ static const struct rxe_type_info { size_t size; size_t elem_offset; void (*cleanup)(struct rxe_pool_elem *elem); - enum rxe_pool_flags flags; u32 min_index; u32 max_index; u32 max_elem; @@ -46,6 +45,7 @@ static const struct rxe_type_info { .name = "srq", .size = sizeof(struct rxe_srq), .elem_offset = offsetof(struct rxe_srq, elem), + .cleanup = rxe_srq_cleanup, .min_index = RXE_MIN_SRQ_INDEX, .max_index = RXE_MAX_SRQ_INDEX, .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, @@ -73,7 +73,6 @@ static const struct rxe_type_info { .size = sizeof(struct rxe_mr), .elem_offset = offsetof(struct rxe_mr, elem), .cleanup = rxe_mr_cleanup, - .flags = RXE_POOL_ALLOC, .min_index = RXE_MIN_MR_INDEX, .max_index = RXE_MAX_MR_INDEX, .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, @@ -82,6 +81,7 @@ static const struct rxe_type_info { .name = "mw", .size = sizeof(struct rxe_mw), .elem_offset = offsetof(struct rxe_mw, elem), + .cleanup = rxe_mw_cleanup, .min_index = RXE_MIN_MW_INDEX, .max_index = RXE_MAX_MW_INDEX, .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1, @@ -101,7 +101,6 @@ void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, pool->max_elem = info->max_elem; pool->elem_size = ALIGN(info->size, RXE_POOL_ALIGN); pool->elem_offset = info->elem_offset; - pool->flags = info->flags; pool->cleanup = info->cleanup; atomic_set(&pool->num_elem, 0); @@ -122,7 +121,7 @@ void *rxe_alloc(struct rxe_pool *pool) void *obj; int err; - if (WARN_ON(!(pool->flags & RXE_POOL_ALLOC))) + if (WARN_ON(!(pool->type == RXE_TYPE_MR))) return NULL; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) @@ -156,7 +155,7 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem) { int err; - if (WARN_ON(pool->flags & RXE_POOL_ALLOC)) + if (WARN_ON(pool->type == RXE_TYPE_MR)) return -EINVAL; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) @@ -206,7 +205,7 @@ static void rxe_elem_release(struct kref *kref) if (pool->cleanup) pool->cleanup(elem); - if (pool->flags & RXE_POOL_ALLOC) + if (pool->type == RXE_TYPE_MR) kfree(elem->obj); atomic_dec(&pool->num_elem); diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 24bcc786c1b3..0860660d65ec 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -7,10 +7,6 @@ #ifndef RXE_POOL_H #define RXE_POOL_H -enum rxe_pool_flags { - RXE_POOL_ALLOC = BIT(1), -}; - enum rxe_elem_type { RXE_TYPE_UC, RXE_TYPE_PD, @@ -35,7 +31,6 @@ struct rxe_pool { struct rxe_dev *rxe; const char *name; void (*cleanup)(struct rxe_pool_elem *elem); - enum rxe_pool_flags flags; enum rxe_elem_type type; unsigned int max_elem; diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 62acf890af6c..22e9b85344c3 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -63,7 +63,6 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) int port_num = init->port_num; switch (init->qp_type) { - case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_RC: case IB_QPT_UC: @@ -81,7 +80,7 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) if (rxe_qp_chk_cap(rxe, cap, !!init->srq)) goto err1; - if (init->qp_type == IB_QPT_SMI || init->qp_type == IB_QPT_GSI) { + if (init->qp_type == IB_QPT_GSI) { if (!rdma_is_port_valid(&rxe->ib_dev, port_num)) { pr_warn("invalid port = %d\n", port_num); goto err1; @@ -89,11 +88,6 @@ int rxe_qp_chk_init(struct rxe_dev *rxe, struct ib_qp_init_attr *init) port = &rxe->port; - if (init->qp_type == IB_QPT_SMI && port->qp_smi_index) { - pr_warn("SMI QP exists for port %d\n", port_num); - goto err1; - } - if (init->qp_type == IB_QPT_GSI && port->qp_gsi_index) { pr_warn("GSI QP exists for port %d\n", port_num); goto err1; @@ -167,12 +161,6 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, port = &rxe->port; switch (init->qp_type) { - case IB_QPT_SMI: - qp->ibqp.qp_num = 0; - port->qp_smi_index = qpn; - qp->attr.port_num = init->port_num; - break; - case IB_QPT_GSI: qp->ibqp.qp_num = 1; port->qp_gsi_index = qpn; @@ -334,6 +322,9 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, qp->scq = scq; qp->srq = srq; + atomic_inc(&rcq->num_wq); + atomic_inc(&scq->num_wq); + rxe_qp_init_misc(rxe, qp, init); err = rxe_qp_init_req(rxe, qp, init, udata, uresp); @@ -353,6 +344,9 @@ err2: rxe_queue_cleanup(qp->sq.queue); qp->sq.queue = NULL; err1: + atomic_dec(&rcq->num_wq); + atomic_dec(&scq->num_wq); + qp->pd = NULL; qp->rcq = NULL; qp->scq = NULL; @@ -777,9 +771,11 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp) return 0; } -/* called by the destroy qp verb */ -void rxe_qp_destroy(struct rxe_qp *qp) +/* called when the last reference to the qp is dropped */ +static void rxe_qp_do_cleanup(struct work_struct *work) { + struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); + qp->valid = 0; qp->qp_timeout_jiffies = 0; rxe_cleanup_task(&qp->resp.task); @@ -798,12 +794,6 @@ void rxe_qp_destroy(struct rxe_qp *qp) __rxe_do_task(&qp->comp.task); __rxe_do_task(&qp->req.task); } -} - -/* called when the last reference to the qp is dropped */ -static void rxe_qp_do_cleanup(struct work_struct *work) -{ - struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); @@ -814,10 +804,14 @@ static void rxe_qp_do_cleanup(struct work_struct *work) if (qp->rq.queue) rxe_queue_cleanup(qp->rq.queue); + atomic_dec(&qp->scq->num_wq); if (qp->scq) rxe_put(qp->scq); + + atomic_dec(&qp->rcq->num_wq); if (qp->rcq) rxe_put(qp->rcq); + if (qp->pd) rxe_put(qp->pd); diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index d09a8b68c962..f3ad7b6dbd97 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -34,7 +34,6 @@ static int check_type_state(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, } break; case IB_QPT_UD: - case IB_QPT_SMI: case IB_QPT_GSI: if (unlikely(pkt_type != IB_OPCODE_UD)) { pr_warn_ratelimited("bad qp type\n"); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index ae5fbc79dd5c..9d98237389cf 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -33,8 +33,6 @@ static inline void retry_first_write_send(struct rxe_qp *qp, } else { advance_dma_data(&wqe->dma, to_send); } - if (mask & WR_WRITE_MASK) - wqe->iova += qp->mtu; } } @@ -308,7 +306,6 @@ static int next_opcode(struct rxe_qp *qp, struct rxe_send_wqe *wqe, case IB_QPT_UC: return next_opcode_uc(qp, opcode, fits); - case IB_QPT_SMI: case IB_QPT_UD: case IB_QPT_GSI: switch (opcode) { @@ -414,8 +411,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, if (pkt->mask & RXE_ATMETH_MASK) { atmeth_set_va(pkt, wqe->iova); - if (opcode == IB_OPCODE_RC_COMPARE_SWAP || - opcode == IB_OPCODE_RD_COMPARE_SWAP) { + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { atmeth_set_swap_add(pkt, ibwr->wr.atomic.swap); atmeth_set_comp(pkt, ibwr->wr.atomic.compare_add); } else { @@ -437,7 +433,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, struct rxe_send_wqe *wqe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 paylen) + struct sk_buff *skb, u32 payload) { int err; @@ -449,19 +445,19 @@ static int finish_packet(struct rxe_qp *qp, struct rxe_av *av, if (wqe->wr.send_flags & IB_SEND_INLINE) { u8 *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset]; - memcpy(payload_addr(pkt), tmp, paylen); + memcpy(payload_addr(pkt), tmp, payload); - wqe->dma.resid -= paylen; - wqe->dma.sge_offset += paylen; + wqe->dma.resid -= payload; + wqe->dma.sge_offset += payload; } else { err = copy_data(qp->pd, 0, &wqe->dma, - payload_addr(pkt), paylen, + payload_addr(pkt), payload, RXE_FROM_MR_OBJ); if (err) return err; } if (bth_pad(pkt)) { - u8 *pad = payload_addr(pkt) + paylen; + u8 *pad = payload_addr(pkt) + payload; memset(pad, 0, bth_pad(pkt)); } @@ -527,8 +523,7 @@ static void rollback_state(struct rxe_send_wqe *wqe, qp->req.psn = rollback_psn; } -static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe, - struct rxe_pkt_info *pkt) +static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { qp->req.opcode = pkt->opcode; @@ -611,7 +606,8 @@ int rxe_requester(void *arg) struct rxe_ah *ah; struct rxe_av *av; - rxe_get(qp); + if (!rxe_get(qp)) + return -EAGAIN; next_wqe: if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) @@ -661,7 +657,7 @@ next_wqe: opcode = next_opcode(qp, wqe, wqe->wr.opcode); if (unlikely(opcode < 0)) { wqe->status = IB_WC_LOC_QP_OP_ERR; - goto exit; + goto err; } mask = rxe_opcode[opcode].mask; @@ -755,7 +751,7 @@ next_wqe: goto err; } - update_state(qp, wqe, &pkt); + update_state(qp, &pkt); goto next_wqe; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 9cd0eaff98de..f4f6ee5d81fe 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -277,7 +277,6 @@ static enum resp_states check_op_valid(struct rxe_qp *qp, break; case IB_QPT_UD: - case IB_QPT_SMI: case IB_QPT_GSI: break; @@ -577,8 +576,7 @@ static enum resp_states process_atomic(struct rxe_qp *qp, qp->resp.atomic_orig = *vaddr; - if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP || - pkt->opcode == IB_OPCODE_RD_COMPARE_SWAP) { + if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { if (*vaddr == atmeth_comp(pkt)) *vaddr = atmeth_swap_add(pkt); } else { @@ -834,7 +832,6 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) if (pkt->mask & RXE_SEND_MASK) { if (qp_type(qp) == IB_QPT_UD || - qp_type(qp) == IB_QPT_SMI || qp_type(qp) == IB_QPT_GSI) { if (skb->protocol == htons(ETH_P_IP)) { memset(&hdr.reserved, 0, @@ -1265,7 +1262,8 @@ int rxe_responder(void *arg) struct rxe_pkt_info *pkt = NULL; int ret = 0; - rxe_get(qp); + if (!rxe_get(qp)) + return -EAGAIN; qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED; diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 0c0721f04357..02b39498c370 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -6,64 +6,34 @@ #include #include "rxe.h" -#include "rxe_loc.h" #include "rxe_queue.h" -int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, - struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +int rxe_srq_chk_init(struct rxe_dev *rxe, struct ib_srq_init_attr *init) { - if (srq && srq->error) { - pr_warn("srq in error state\n"); + struct ib_srq_attr *attr = &init->attr; + + if (attr->max_wr > rxe->attr.max_srq_wr) { + pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); goto err1; } - if (mask & IB_SRQ_MAX_WR) { - if (attr->max_wr > rxe->attr.max_srq_wr) { - pr_warn("max_wr(%d) > max_srq_wr(%d)\n", - attr->max_wr, rxe->attr.max_srq_wr); - goto err1; - } - - if (attr->max_wr <= 0) { - pr_warn("max_wr(%d) <= 0\n", attr->max_wr); - goto err1; - } - - if (srq && srq->limit && (attr->max_wr < srq->limit)) { - pr_warn("max_wr (%d) < srq->limit (%d)\n", - attr->max_wr, srq->limit); - goto err1; - } - - if (attr->max_wr < RXE_MIN_SRQ_WR) - attr->max_wr = RXE_MIN_SRQ_WR; + if (attr->max_wr <= 0) { + pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + goto err1; } - if (mask & IB_SRQ_LIMIT) { - if (attr->srq_limit > rxe->attr.max_srq_wr) { - pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", - attr->srq_limit, rxe->attr.max_srq_wr); - goto err1; - } + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; - if (srq && (attr->srq_limit > srq->rq.queue->buf->index_mask)) { - pr_warn("srq_limit (%d) > cur limit(%d)\n", - attr->srq_limit, - srq->rq.queue->buf->index_mask); - goto err1; - } + if (attr->max_sge > rxe->attr.max_srq_sge) { + pr_warn("max_sge(%d) > max_srq_sge(%d)\n", + attr->max_sge, rxe->attr.max_srq_sge); + goto err1; } - if (mask == IB_SRQ_INIT_MASK) { - if (attr->max_sge > rxe->attr.max_srq_sge) { - pr_warn("max_sge(%d) > max_srq_sge(%d)\n", - attr->max_sge, rxe->attr.max_srq_sge); - goto err1; - } - - if (attr->max_sge < RXE_MIN_SRQ_SGE) - attr->max_sge = RXE_MIN_SRQ_SGE; - } + if (attr->max_sge < RXE_MIN_SRQ_SGE) + attr->max_sge = RXE_MIN_SRQ_SGE; return 0; @@ -93,8 +63,7 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, spin_lock_init(&srq->rq.consumer_lock); type = QUEUE_TYPE_FROM_CLIENT; - q = rxe_queue_init(rxe, &srq->rq.max_wr, - srq_wqe_size, type); + q = rxe_queue_init(rxe, &srq->rq.max_wr, srq_wqe_size, type); if (!q) { pr_warn("unable to allocate queue for srq\n"); return -ENOMEM; @@ -121,6 +90,57 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, return 0; } +int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, + struct ib_srq_attr *attr, enum ib_srq_attr_mask mask) +{ + if (srq->error) { + pr_warn("srq in error state\n"); + goto err1; + } + + if (mask & IB_SRQ_MAX_WR) { + if (attr->max_wr > rxe->attr.max_srq_wr) { + pr_warn("max_wr(%d) > max_srq_wr(%d)\n", + attr->max_wr, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->max_wr <= 0) { + pr_warn("max_wr(%d) <= 0\n", attr->max_wr); + goto err1; + } + + if (srq->limit && (attr->max_wr < srq->limit)) { + pr_warn("max_wr (%d) < srq->limit (%d)\n", + attr->max_wr, srq->limit); + goto err1; + } + + if (attr->max_wr < RXE_MIN_SRQ_WR) + attr->max_wr = RXE_MIN_SRQ_WR; + } + + if (mask & IB_SRQ_LIMIT) { + if (attr->srq_limit > rxe->attr.max_srq_wr) { + pr_warn("srq_limit(%d) > max_srq_wr(%d)\n", + attr->srq_limit, rxe->attr.max_srq_wr); + goto err1; + } + + if (attr->srq_limit > srq->rq.queue->buf->index_mask) { + pr_warn("srq_limit (%d) > cur limit(%d)\n", + attr->srq_limit, + srq->rq.queue->buf->index_mask); + goto err1; + } + } + + return 0; + +err1: + return -EINVAL; +} + int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata) @@ -154,3 +174,14 @@ err2: srq->rq.queue = NULL; return err; } + +void rxe_srq_cleanup(struct rxe_pool_elem *elem) +{ + struct rxe_srq *srq = container_of(elem, typeof(*srq), elem); + + if (srq->pd) + rxe_put(srq->pd); + + if (srq->rq.queue) + rxe_queue_cleanup(srq->rq.queue); +} diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 67184b0281a0..9d995854a174 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -7,8 +7,8 @@ #include #include #include + #include "rxe.h" -#include "rxe_loc.h" #include "rxe_queue.h" #include "rxe_hw_counters.h" @@ -286,36 +286,34 @@ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_create_srq_resp __user *uresp = NULL; - if (init->srq_type != IB_SRQT_BASIC) - return -EOPNOTSUPP; - if (udata) { if (udata->outlen < sizeof(*uresp)) return -EINVAL; uresp = udata->outbuf; } - err = rxe_srq_chk_attr(rxe, NULL, &init->attr, IB_SRQ_INIT_MASK); + if (init->srq_type != IB_SRQT_BASIC) + return -EOPNOTSUPP; + + err = rxe_srq_chk_init(rxe, init); if (err) - goto err1; + return err; err = rxe_add_to_pool(&rxe->srq_pool, srq); if (err) - goto err1; + return err; rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) - goto err2; + goto err_put; return 0; -err2: - rxe_put(pd); +err_put: rxe_put(srq); -err1: return err; } @@ -339,16 +337,12 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, err = rxe_srq_chk_attr(rxe, srq, attr, mask); if (err) - goto err1; + return err; err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata); if (err) - goto err1; - + return err; return 0; - -err1: - return err; } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) @@ -368,10 +362,6 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); - if (srq->rq.queue) - rxe_queue_cleanup(srq->rq.queue); - - rxe_put(srq->pd); rxe_put(srq); return 0; } @@ -495,7 +485,6 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (ret) return ret; - rxe_qp_destroy(qp); rxe_put(qp); return 0; } @@ -536,7 +525,6 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, wr->send_flags = ibwr->send_flags; if (qp_type(qp) == IB_QPT_UD || - qp_type(qp) == IB_QPT_SMI || qp_type(qp) == IB_QPT_GSI) { struct ib_ah *ibah = ud_wr(ibwr)->ah; @@ -807,6 +795,12 @@ static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); + /* See IBA C11-17: The CI shall return an error if this Verb is + * invoked while a Work Queue is still associated with the CQ. + */ + if (atomic_read(&cq->num_wq)) + return -EINVAL; + rxe_cq_disable(cq); rxe_put(cq); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index e7eff1ca75e9..ac464e68c923 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -67,6 +67,7 @@ struct rxe_cq { bool is_dying; bool is_user; struct tasklet_struct comp_task; + atomic_t num_wq; }; enum wqe_state { @@ -373,7 +374,6 @@ struct rxe_port { spinlock_t port_lock; /* guard port */ unsigned int mtu_cap; /* special QPs */ - u32 qp_smi_index; u32 qp_gsi_index; }; @@ -394,7 +394,6 @@ struct rxe_dev { struct rxe_pool cq_pool; struct rxe_pool mr_pool; struct rxe_pool mw_pool; - struct rxe_pool mc_grp_pool; /* multicast support */ spinlock_t mcg_lock; diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index e5c586913d0b..dacc174604bf 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -119,6 +119,7 @@ static int siw_dev_qualified(struct net_device *netdev) * for type identifiers. */ if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || + netdev->type == ARPHRD_NONE || (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) return 1; @@ -315,12 +316,12 @@ static struct siw_device *siw_device_create(struct net_device *netdev) sdev->netdev = netdev; - if (netdev->type != ARPHRD_LOOPBACK) { + if (netdev->type != ARPHRD_LOOPBACK && netdev->type != ARPHRD_NONE) { addrconf_addr_eui48((unsigned char *)&base_dev->node_guid, netdev->dev_addr); } else { /* - * The loopback device does not have a HW address, + * This device does not have a HW address, * but connection mangagement lib expects gid != 0 */ size_t len = min_t(size_t, strlen(base_dev->name), 6); diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 54ef367b074a..09316072b789 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -132,8 +132,8 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, /* Revisit atomic caps if RFC 7306 gets supported */ attr->atomic_cap = 0; - attr->device_cap_flags = - IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG; + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG; attr->max_cq = sdev->attrs.max_cq; attr->max_cqe = sdev->attrs.max_cqe; attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 44d8d151ff90..35e9c8a330e2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -411,6 +411,7 @@ struct ipoib_dev_priv { struct dentry *path_dentry; #endif u64 hca_caps; + u64 kernel_caps; struct ipoib_ethtool_st ethtool; unsigned int max_send_sge; const struct net_device_ops *rn_ops; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 9934b8bd7f56..2a8961b685c2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1850,11 +1850,12 @@ static void ipoib_parent_unregister_pre(struct net_device *ndev) static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) { priv->hca_caps = priv->ca->attrs.device_cap_flags; + priv->kernel_caps = priv->ca->attrs.kernel_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - if (priv->hca_caps & IB_DEVICE_UD_TSO) + if (priv->kernel_caps & IBK_UD_TSO) priv->dev->hw_features |= NETIF_F_TSO; priv->dev->features |= priv->dev->hw_features; @@ -2201,7 +2202,7 @@ int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, priv->rn_ops = dev->netdev_ops; - if (hca->attrs.device_cap_flags & IB_DEVICE_VIRTUAL_FUNCTION) + if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION) dev->netdev_ops = &ipoib_netdev_ops_vf; else dev->netdev_ops = &ipoib_netdev_ops_pf; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 5a150a080ac2..368e5d77416d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -197,16 +197,16 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) init_attr.send_cq = priv->send_cq; init_attr.recv_cq = priv->recv_cq; - if (priv->hca_caps & IB_DEVICE_UD_TSO) + if (priv->kernel_caps & IBK_UD_TSO) init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; - if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK) + if (priv->kernel_caps & IBK_BLOCK_MULTICAST_LOOPBACK) init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; - if (priv->hca_caps & IB_DEVICE_RDMA_NETDEV_OPA) + if (priv->kernel_caps & IBK_RDMA_NETDEV_OPA) init_attr.create_flags |= IB_QP_CREATE_NETDEV_USE; priv->qp = ib_create_qp(priv->pd, &init_attr); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index f8d0bab4424c..321949a570ed 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -650,7 +650,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, SHOST_DIX_GUARD_CRC); } - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + if (!(ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)) shost->virt_boundary_mask = SZ_4K - 1; if (iscsi_host_add(shost, ib_dev->dev.parent)) { diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 7e4faf9c5e9e..dee8c97ff056 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -363,7 +363,7 @@ struct iser_fr_pool { * @cq: Connection completion queue * @cq_size: The number of max outstanding completions * @device: reference to iser device - * @fr_pool: connection fast registration poool + * @fr_pool: connection fast registration pool * @pi_support: Indicate device T10-PI support * @reg_cqe: completion handler */ diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 5dbad68c7390..c08f2d9133b6 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -115,7 +115,7 @@ iser_create_fastreg_desc(struct iser_device *device, if (!desc) return ERR_PTR(-ENOMEM); - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) mr_type = IB_MR_TYPE_SG_GAPS; else mr_type = IB_MR_TYPE_MEM_REG; @@ -517,7 +517,7 @@ static void iser_calc_scsi_params(struct iser_conn *iser_conn, * (head and tail) for a single page worth data, so one additional * entry is required. */ - if (attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (attr->kernel_cap_flags & IBK_SG_GAPS_REG) reserved_mr_pages = 0; else reserved_mr_pages = 1; @@ -562,8 +562,8 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) /* connection T10-PI support */ if (iser_pi_enable) { - if (!(device->ib_device->attrs.device_cap_flags & - IB_DEVICE_INTEGRITY_HANDOVER)) { + if (!(device->ib_device->attrs.kernel_cap_flags & + IBK_INTEGRITY_HANDOVER)) { iser_warn("T10-PI requested but not supported on %s, " "continue without T10-PI\n", dev_name(&ib_conn->device->ib_device->dev)); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 48064bd8aa2c..b360a1527cd1 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -42,6 +42,7 @@ MODULE_PARM_DESC(sg_tablesize, static DEFINE_MUTEX(device_list_mutex); static LIST_HEAD(device_list); +static struct workqueue_struct *isert_login_wq; static struct workqueue_struct *isert_comp_wq; static struct workqueue_struct *isert_release_wq; @@ -230,7 +231,7 @@ isert_create_device_ib_res(struct isert_device *device) } /* Check signature cap */ - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER) + if (ib_dev->attrs.kernel_cap_flags & IBK_INTEGRITY_HANDOVER) device->pi_capable = true; else device->pi_capable = false; @@ -1017,7 +1018,7 @@ isert_rx_login_req(struct isert_conn *isert_conn) complete(&isert_conn->login_comp); return; } - schedule_delayed_work(&conn->login_work, 0); + queue_delayed_work(isert_login_wq, &conn->login_work, 0); } static struct iscsit_cmd @@ -2348,9 +2349,9 @@ isert_get_login_rx(struct iscsit_conn *conn, struct iscsi_login *login) /* * For login requests after the first PDU, isert_rx_login_req() will - * kick schedule_delayed_work(&conn->login_work) as the packet is - * received, which turns this callback from iscsi_target_do_login_rx() - * into a NOP. + * kick queue_delayed_work(isert_login_wq, &conn->login_work) as + * the packet is received, which turns this callback from + * iscsi_target_do_login_rx() into a NOP. */ if (!login->first_request) return 0; @@ -2606,20 +2607,23 @@ static struct iscsit_transport iser_target_transport = { static int __init isert_init(void) { - int ret; + isert_login_wq = alloc_workqueue("isert_login_wq", 0, 0); + if (!isert_login_wq) { + isert_err("Unable to allocate isert_login_wq\n"); + return -ENOMEM; + } isert_comp_wq = alloc_workqueue("isert_comp_wq", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!isert_comp_wq) { isert_err("Unable to allocate isert_comp_wq\n"); - return -ENOMEM; + goto destroy_login_wq; } isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE); if (!isert_release_wq) { isert_err("Unable to allocate isert_release_wq\n"); - ret = -ENOMEM; goto destroy_comp_wq; } @@ -2630,17 +2634,20 @@ static int __init isert_init(void) destroy_comp_wq: destroy_workqueue(isert_comp_wq); +destroy_login_wq: + destroy_workqueue(isert_login_wq); - return ret; + return -ENOMEM; } static void __exit isert_exit(void) { - flush_scheduled_work(); + flush_workqueue(isert_login_wq); destroy_workqueue(isert_release_wq); destroy_workqueue(isert_comp_wq); iscsit_unregister_transport(&iser_target_transport); isert_info("iSER_TARGET[0] - Released iser_target_transport\n"); + destroy_workqueue(isert_login_wq); } MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure"); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index c2c860d0c56e..9809c3883979 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -2785,7 +2785,7 @@ static void free_clt(struct rtrs_clt_sess *clt) /** * rtrs_clt_open() - Open a path to an RTRS server * @ops: holds the link event callback and the private pointer. - * @sessname: name of the session + * @pathname: name of the path to an RTRS server * @paths: Paths to be established defined by their src and dst addresses * @paths_num: Number of elements in the @paths array * @port: port to be used by the RTRS session diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 285b766e4e70..6058abf42ba7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -430,7 +430,7 @@ static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device, spin_lock_init(&pool->lock); INIT_LIST_HEAD(&pool->free_list); - if (device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) mr_type = IB_MR_TYPE_SG_GAPS; else mr_type = IB_MR_TYPE_MEM_REG; @@ -3650,7 +3650,7 @@ static ssize_t add_target_store(struct device *dev, target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; target_host->max_segment_size = ib_dma_max_seg_size(ibdev); - if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + if (!(ibdev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)) target_host->virt_boundary_mask = ~srp_dev->mr_page_mask; target = host_to_target(target_host); @@ -3706,8 +3706,8 @@ static ssize_t add_target_store(struct device *dev, } if (srp_dev->use_fast_reg) { - bool gaps_reg = (ibdev->attrs.device_cap_flags & - IB_DEVICE_SG_GAPS_REG); + bool gaps_reg = ibdev->attrs.kernel_cap_flags & + IBK_SG_GAPS_REG; max_sectors_per_mr = srp_dev->max_pages_per_mr << (ilog2(srp_dev->mr_page_size) - 9); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 2874f222c313..ab13b7380265 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1509,7 +1509,6 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd, static int __dump_mapping(void *context, uint64_t cblock, void *leaf) { - int r = 0; __le64 value; dm_oblock_t oblock; unsigned flags; @@ -1517,7 +1516,7 @@ static int __dump_mapping(void *context, uint64_t cblock, void *leaf) memcpy(&value, leaf, sizeof(value)); unpack_value(value, &oblock, &flags); - return r; + return 0; } static int __dump_mappings(struct dm_cache_metadata *cmd) diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 4277853c7535..d21648a923ea 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -154,6 +155,10 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md) return &md->stats; } +DECLARE_STATIC_KEY_FALSE(stats_enabled); +DECLARE_STATIC_KEY_FALSE(swap_bios_enabled); +DECLARE_STATIC_KEY_FALSE(zoned_enabled); + static inline bool dm_emulate_zone_append(struct mapped_device *md) { if (blk_queue_is_zoned(md->queue)) @@ -237,6 +242,12 @@ static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) tio->flags |= (1U << bit); } +static inline bool dm_tio_is_normal(struct dm_target_io *tio) +{ + return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && + !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); +} + /* * One of these is allocated per original bio. * It contains the first clone used for that original. @@ -245,16 +256,20 @@ static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) struct dm_io { unsigned short magic; blk_short_t flags; - atomic_t io_count; - struct mapped_device *md; - struct bio *orig_bio; - blk_status_t status; spinlock_t lock; unsigned long start_time; void *data; - struct hlist_node node; - struct task_struct *map_task; + struct dm_io *next; struct dm_stats_aux stats_aux; + blk_status_t status; + atomic_t io_count; + struct mapped_device *md; + + /* The three fields represent mapped part of original bio */ + struct bio *orig_bio; + unsigned int sector_offset; /* offset to end of orig_bio */ + unsigned int sectors; + /* last member of dm_target_io is 'struct bio' */ struct dm_target_io tio; }; @@ -263,8 +278,8 @@ struct dm_io { * dm_io flags */ enum { - DM_IO_START_ACCT, - DM_IO_ACCOUNTED + DM_IO_ACCOUNTED, + DM_IO_WAS_SPLIT }; static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit) @@ -277,13 +292,6 @@ static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) io->flags |= (1U << bit); } -static inline void dm_io_inc_pending(struct dm_io *io) -{ - atomic_inc(&io->io_count); -} - -void dm_io_dec_pending(struct dm_io *io, blk_status_t error); - static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index fb80539865d7..159c6806c19b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3439,6 +3439,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static char hex2asc(unsigned char c) +{ + return c + '0' + ((unsigned)(9 - c) >> 4 & 0x27); +} + static void crypt_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -3457,9 +3462,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type, if (cc->key_size > 0) { if (cc->key_string) DMEMIT(":%u:%s", cc->key_size, cc->key_string); - else - for (i = 0; i < cc->key_size; i++) - DMEMIT("%02x", cc->key[i]); + else { + for (i = 0; i < cc->key_size; i++) { + DMEMIT("%c%c", hex2asc(cc->key[i] >> 4), + hex2asc(cc->key[i] & 0xf)); + } + } } else DMEMIT("-"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 9a51bf51a859..869afef5654a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -296,8 +296,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) } delayed->class = c; bio_set_dev(bio, c->dev->bdev); - if (bio_sectors(bio)) - bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, c, bio); } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 345229d7e59c..f2305eb758a2 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,9 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - flakey_map_sector(ti, bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 36ae30b73a6e..3d5a0ce123c9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4494,8 +4494,6 @@ try_smaller_buffer: } if (should_write_sb) { - int r; - init_journal(ic, 0, ic->journal_sections, 0); r = dm_integrity_failed(ic); if (unlikely(r)) { diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 76b486e4d2be..0a6abbbe3745 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -84,19 +84,12 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) return lc->start + dm_target_offset(ti, bi_sector); } -static void linear_map_bio(struct dm_target *ti, struct bio *bio) +static int linear_map(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - linear_map_sector(ti, bio->bi_iter.bi_sector); -} - -static int linear_map(struct dm_target *ti, struct bio *bio) -{ - linear_map_bio(ti, bio); + bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); return DM_MAPIO_REMAPPED; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6ed9d2731254..0e325469a252 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -105,6 +105,7 @@ struct multipath { struct dm_mpath_io { struct pgpath *pgpath; size_t nr_bytes; + u64 start_time_ns; }; typedef int (*action_fn) (struct pgpath *pgpath); @@ -295,6 +296,7 @@ static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mp mpio->nr_bytes = bio->bi_iter.bi_size; mpio->pgpath = NULL; + mpio->start_time_ns = 0; *mpio_p = mpio; dm_bio_record(bio_details, bio); @@ -647,6 +649,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, mpio->pgpath = pgpath; + if (dm_ps_use_hr_timer(pgpath->pg->ps.type)) + mpio->start_time_ns = ktime_get_ns(); + bio->bi_status = 0; bio_set_dev(bio, pgpath->path.dev->bdev); bio->bi_opf |= REQ_FAILFAST_TRANSPORT; @@ -1713,7 +1718,8 @@ done: if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes, - dm_start_time_ns_from_clone(clone)); + (mpio->start_time_ns ?: + dm_start_time_ns_from_clone(clone))); } return r; diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index c47bc0e20275..83cac2b04b66 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -26,11 +26,26 @@ struct path_selector { void *context; }; +/* + * If a path selector uses this flag, a high resolution timer is used + * (via ktime_get_ns) to account for IO start time in BIO-based mpath. + * This improves performance of some path selectors (i.e. HST), in + * exchange for slightly higher overhead when submitting the BIO. + * The extra cost is usually offset by improved path selection for + * some benchmarks. + * + * This has no effect for request-based mpath, since it already uses a + * higher precision timer by default. + */ +#define DM_PS_USE_HR_TIMER 0x00000001 +#define dm_ps_use_hr_timer(type) ((type)->features & DM_PS_USE_HR_TIMER) + /* Information about a path selector type */ struct path_selector_type { char *name; struct module *module; + unsigned int features; unsigned int table_args; unsigned int info_args; diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c index 82f2a06153dc..1d82c95d323d 100644 --- a/drivers/md/dm-ps-historical-service-time.c +++ b/drivers/md/dm-ps-historical-service-time.c @@ -523,6 +523,7 @@ static int hst_end_io(struct path_selector *ps, struct dm_path *path, static struct path_selector_type hst_ps = { .name = "historical-service-time", .module = THIS_MODULE, + .features = DM_PS_USE_HR_TIMER, .table_args = 1, .info_args = 3, .create = hst_create, diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 0e039a8c0bf2..8326f9fe0e91 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -225,6 +225,7 @@ void dm_stats_cleanup(struct dm_stats *stats) atomic_read(&shared->in_flight[READ]), atomic_read(&shared->in_flight[WRITE])); } + cond_resched(); } dm_stat_free(&s->rcu_head); } @@ -330,6 +331,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { atomic_set(&s->stat_shared[ni].in_flight[READ], 0); atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); + cond_resched(); } if (s->n_histogram_entries) { @@ -342,6 +344,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { s->stat_shared[ni].tmp.histogram = hi; hi += s->n_histogram_entries + 1; + cond_resched(); } } @@ -362,6 +365,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { p[ni].histogram = hi; hi += s->n_histogram_entries + 1; + cond_resched(); } } } @@ -396,6 +400,9 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, dm_stats_recalc_precise_timestamps(stats); + if (!static_key_enabled(&stats_enabled.key)) + static_branch_enable(&stats_enabled); + mutex_unlock(&stats->mutex); resume_callback(md); @@ -497,6 +504,7 @@ static int dm_stats_list(struct dm_stats *stats, const char *program, } DMEMIT("\n"); } + cond_resched(); } mutex_unlock(&stats->mutex); @@ -774,6 +782,7 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, local_irq_enable(); } } + cond_resched(); } } @@ -889,6 +898,8 @@ static int dm_stats_print(struct dm_stats *stats, int id, if (unlikely(sz + 1 >= maxlen)) goto buffer_overflow; + + cond_resched(); } if (clear) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 11e48c0806f5..9e1d4a98f6ed 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -719,6 +719,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", dm_device_name(t->md), type); + if (tgt->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) + static_branch_enable(&swap_bios_enabled); + return 0; bad: @@ -1002,6 +1005,8 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } +static int dm_table_supports_poll(struct dm_table *t); + static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); @@ -1009,21 +1014,24 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned min_pool_size = 0; struct dm_target *ti; unsigned i; + bool poll_supported = false; if (unlikely(type == DM_TYPE_NONE)) { DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } - if (__table_type_bio_based(type)) + if (__table_type_bio_based(type)) { for (i = 0; i < t->num_targets; i++) { ti = t->targets + i; per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); } + poll_supported = !!dm_table_supports_poll(t); + } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, - per_io_data_size, min_pool_size); + t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, + t->integrity_supported, poll_supported); if (!t->mempools) return -ENOMEM; @@ -2101,6 +2109,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); } dm_update_crypto_profile(q, t); diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 57daa86c19cf..3e7b1fe1580b 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -550,13 +550,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) return DM_MAPIO_KILL; } - /* - * The target map function may issue and complete the IO quickly. - * Take an extra reference on the IO to make sure it does disappear - * until we run dm_zone_map_bio_end(). - */ - dm_io_inc_pending(io); - /* Let the target do its work */ r = ti->type->map(ti, clone); switch (r) { @@ -587,9 +580,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) break; } - /* Drop the extra reference on the IO */ - dm_io_dec_pending(io, sts); - if (sts != BLK_STS_OK) return DM_MAPIO_KILL; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39081338ca61..d62f1354ecbf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,6 +71,10 @@ void dm_issue_global_event(void) wake_up(&dm_global_eventq); } +DEFINE_STATIC_KEY_FALSE(stats_enabled); +DEFINE_STATIC_KEY_FALSE(swap_bios_enabled); +DEFINE_STATIC_KEY_FALSE(zoned_enabled); + /* * One of these is allocated (on-stack) per original bio. */ @@ -80,7 +84,8 @@ struct clone_info { struct dm_io *io; sector_t sector; unsigned sector_count; - bool submit_as_polled; + bool is_abnormal_io:1; + bool submit_as_polled:1; }; #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone)) @@ -498,69 +503,76 @@ static bool bio_is_flush_with_data(struct bio *bio) return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); } -static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, - unsigned long start_time, struct dm_stats_aux *stats_aux) +static void dm_io_acct(struct dm_io *io, bool end) { - bool is_flush_with_data; - unsigned int bi_size; + struct dm_stats_aux *stats_aux = &io->stats_aux; + unsigned long start_time = io->start_time; + struct mapped_device *md = io->md; + struct bio *bio = io->orig_bio; + unsigned int sectors; - /* If REQ_PREFLUSH set save any payload but do not account it */ - is_flush_with_data = bio_is_flush_with_data(bio); - if (is_flush_with_data) { - bi_size = bio->bi_iter.bi_size; - bio->bi_iter.bi_size = 0; - } + /* + * If REQ_PREFLUSH set, don't account payload, it will be + * submitted (and accounted) after this flush completes. + */ + if (bio_is_flush_with_data(bio)) + sectors = 0; + else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT)))) + sectors = bio_sectors(bio); + else + sectors = io->sectors; if (!end) - bio_start_io_acct_time(bio, start_time); + bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), + start_time); else - bio_end_io_acct(bio, start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&md->stats))) { + sector_t sector; + + if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT))) + sector = bio->bi_iter.bi_sector; + else + sector = bio_end_sector(bio) - io->sector_offset; - if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), - bio->bi_iter.bi_sector, bio_sectors(bio), + sector, sectors, end, start_time, stats_aux); - - /* Restore bio's payload so it does get accounted upon requeue */ - if (is_flush_with_data) - bio->bi_iter.bi_size = bi_size; + } } -static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) +static void __dm_start_io_acct(struct dm_io *io) { - dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(io, false); } static void dm_start_io_acct(struct dm_io *io, struct bio *clone) { - /* Must account IO to DM device in terms of orig_bio */ - struct bio *bio = io->orig_bio; - /* * Ensure IO accounting is only ever started once. - * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ - if (!clone || - likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { - if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) - return; + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) + return; + + /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ + if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) { dm_io_set_flag(io, DM_IO_ACCOUNTED); } else { unsigned long flags; - if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - return; /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ spin_lock_irqsave(&io->lock, flags); dm_io_set_flag(io, DM_IO_ACCOUNTED); spin_unlock_irqrestore(&io->lock, flags); } - __dm_start_io_acct(io, bio); + __dm_start_io_acct(io); } -static void dm_end_io_acct(struct dm_io *io, struct bio *bio) +static void dm_end_io_acct(struct dm_io *io) { - dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(io, true); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) @@ -569,7 +581,9 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs); + clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs); + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; tio = clone_to_tio(clone); tio->flags = 0; @@ -578,17 +592,19 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; - io->status = 0; - atomic_set(&io->io_count, 1); + io->status = BLK_STS_OK; + + /* one ref is for submission, the other is for completion */ + atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); - io->orig_bio = NULL; + io->orig_bio = bio; io->md = md; - io->map_task = current; spin_lock_init(&io->lock); io->start_time = jiffies; io->flags = 0; - dm_stats_record_start(&md->stats, &io->stats_aux); + if (static_branch_unlikely(&stats_enabled)) + dm_stats_record_start(&md->stats, &io->stats_aux); return io; } @@ -599,7 +615,7 @@ static void free_io(struct dm_io *io) } static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) + unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask) { struct dm_target_io *tio; struct bio *clone; @@ -610,10 +626,13 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti, /* alloc_io() already initialized embedded clone */ clone = &tio->clone; } else { - clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio, - gfp_mask, &ci->io->md->bs); + struct mapped_device *md = ci->io->md; + + clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs); if (!clone) return NULL; + /* Set default bdev, but target must bio_set_dev() before issuing IO */ + clone->bi_bdev = md->disk->part0; /* REQ_DM_POLL_LIST shouldn't be inherited */ clone->bi_opf &= ~REQ_DM_POLL_LIST; @@ -663,14 +682,16 @@ static void queue_io(struct mapped_device *md, struct bio *bio) * function to access the md->map field, and make sure they call * dm_put_live_table() when finished. */ -struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) +struct dm_table *dm_get_live_table(struct mapped_device *md, + int *srcu_idx) __acquires(md->io_barrier) { *srcu_idx = srcu_read_lock(&md->io_barrier); return srcu_dereference(md->map, &md->io_barrier); } -void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +void dm_put_live_table(struct mapped_device *md, + int srcu_idx) __releases(md->io_barrier) { srcu_read_unlock(&md->io_barrier, srcu_idx); } @@ -696,6 +717,24 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, + int *srcu_idx, struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + return dm_get_live_table_fast(md); + else + return dm_get_live_table(md, srcu_idx); +} + +static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx, + struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + dm_put_live_table_fast(md); + else + dm_put_live_table(md, srcu_idx); +} + static char *_dm_claim_ptr = "I belong to device-mapper"; /* @@ -875,14 +914,14 @@ static void dm_io_complete(struct dm_io *io) io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - dm_end_io_acct(io, bio); + dm_end_io_acct(io); else if (!io_error) { /* * Must handle target that DM_MAPIO_SUBMITTED only to * then bio_endio() rather than dm_submit_bio_remap() */ - __dm_start_io_acct(io, bio); - dm_end_io_acct(io, bio); + __dm_start_io_acct(io); + dm_end_io_acct(io); } free_io(io); smp_wmb(); @@ -899,7 +938,7 @@ static void dm_io_complete(struct dm_io *io) * may only reflect a subset of the pre-split original) * so clear REQ_POLLED in case of requeue. */ - bio->bi_opf &= ~REQ_POLLED; + bio_clear_polled(bio); if (io_error == BLK_STS_AGAIN) { /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ queue_io(md, bio); @@ -923,32 +962,37 @@ static void dm_io_complete(struct dm_io *io) } } -static inline bool dm_tio_is_normal(struct dm_target_io *tio) -{ - return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && - !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); -} - /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +static inline void __dm_io_dec_pending(struct dm_io *io) { - /* Push-back supersedes any I/O errors */ - if (unlikely(error)) { - unsigned long flags; - spin_lock_irqsave(&io->lock, flags); - if (!(io->status == BLK_STS_DM_REQUEUE && - __noflush_suspending(io->md))) - io->status = error; - spin_unlock_irqrestore(&io->lock, flags); - } - if (atomic_dec_and_test(&io->io_count)) dm_io_complete(io); } +static void dm_io_set_error(struct dm_io *io, blk_status_t error) +{ + unsigned long flags; + + /* Push-back supersedes any I/O errors */ + spin_lock_irqsave(&io->lock, flags); + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(io->md))) { + io->status = error; + } + spin_unlock_irqrestore(&io->lock, flags); +} + +static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +{ + if (unlikely(error)) + dm_io_set_error(io, error); + + __dm_io_dec_pending(io); +} + void disable_discard(struct mapped_device *md) { struct queue_limits *limits = dm_get_queue_limits(md); @@ -974,35 +1018,43 @@ static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; struct dm_target_io *tio = clone_to_tio(bio); + struct dm_target *ti = tio->ti; + dm_endio_fn endio = ti->type->end_io; struct dm_io *io = tio->io; - struct mapped_device *md = tio->io->md; - dm_endio_fn endio = tio->ti->type->end_io; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct mapped_device *md = io->md; - if (unlikely(error == BLK_STS_TARGET)) { - if (bio_op(bio) == REQ_OP_DISCARD && - !bdev_max_discard_sectors(bio->bi_bdev)) - disable_discard(md); - else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !q->limits.max_write_zeroes_sectors) - disable_write_zeroes(md); + if (likely(bio->bi_bdev != md->disk->part0)) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (unlikely(error == BLK_STS_TARGET)) { + if (bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev)) + disable_discard(md); + else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && + !q->limits.max_write_zeroes_sectors) + disable_write_zeroes(md); + } + + if (static_branch_unlikely(&zoned_enabled) && + unlikely(blk_queue_is_zoned(q))) + dm_zone_endio(io, bio); } - if (blk_queue_is_zoned(q)) - dm_zone_endio(io, bio); - if (endio) { - int r = endio(tio->ti, bio, &error); + int r = endio(ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - /* - * Requeuing writes to a sequential zone of a zoned - * target will break the sequential write pattern: - * fail such IO. - */ - if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) - error = BLK_STS_IOERR; - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. + */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; + } else error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: @@ -1016,10 +1068,9 @@ static void clone_endio(struct bio *bio) } } - if (unlikely(swap_bios_limit(tio->ti, bio))) { - struct mapped_device *md = io->md; + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); - } free_tio(bio); dm_io_dec_pending(io, error); @@ -1161,7 +1212,7 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, * +--------------------+---------------+-------+ * * <-------------- *tio->len_ptr ---------------> - * <------- bi_size -------> + * <----- bio_sectors -----> * <-- n_sectors --> * * Region 1 was already iterated over with bio_advance or similar function. @@ -1178,26 +1229,26 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); - unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + unsigned bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(op_is_zone_mgmt(bio_op(bio))); BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); - BUG_ON(bi_size > *tio->len_ptr); - BUG_ON(n_sectors > bi_size); + BUG_ON(bio_sectors > *tio->len_ptr); + BUG_ON(n_sectors > bio_sectors); - *tio->len_ptr -= bi_size - n_sectors; + *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; + + /* + * __split_and_process_bio() may have already saved mapped part + * for accounting but it is being reduced so update accordingly. + */ + dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT); + tio->io->sectors = n_sectors; } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); -static inline void __dm_submit_bio_remap(struct bio *clone, - dev_t dev, sector_t old_sector) -{ - trace_block_bio_remap(clone, dev, old_sector); - submit_bio_noacct(clone); -} - /* * @clone: clone bio that DM core passed to target's .map function * @tgt_clone: clone of @clone bio that target needs submitted @@ -1212,8 +1263,6 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) struct dm_target_io *tio = clone_to_tio(clone); struct dm_io *io = tio->io; - WARN_ON_ONCE(!tio->ti->accounts_remapped_io); - /* establish bio that will get submitted */ if (!tgt_clone) tgt_clone = clone; @@ -1222,22 +1271,11 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) * Account io->origin_bio to DM dev on behalf of target * that took ownership of IO with DM_MAPIO_SUBMITTED. */ - if (io->map_task == current) { - /* Still in target's map function */ - dm_io_set_flag(io, DM_IO_START_ACCT); - } else { - /* - * Called by another thread, managed by DM target, - * wait for dm_split_and_process_bio() to store - * io->orig_bio - */ - while (unlikely(!smp_load_acquire(&io->orig_bio))) - msleep(1); - dm_start_io_acct(io, clone); - } + dm_start_io_acct(io, clone); - __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk), + trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk), tio->old_sector); + submit_bio_noacct(tgt_clone); } EXPORT_SYMBOL_GPL(dm_submit_bio_remap); @@ -1260,55 +1298,53 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) static void __map_bio(struct bio *clone) { struct dm_target_io *tio = clone_to_tio(clone); - int r; - struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; + int r; clone->bi_end_io = clone_endio; /* * Map the clone. */ - dm_io_inc_pending(io); tio->old_sector = clone->bi_iter.bi_sector; - if (unlikely(swap_bios_limit(ti, clone))) { - struct mapped_device *md = io->md; + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) { int latch = get_swap_bios(); if (unlikely(latch != md->swap_bios)) __set_swap_bios_limit(md, latch); down(&md->swap_bios_semaphore); } - /* - * Check if the IO needs a special mapping due to zone append emulation - * on zoned target. In this case, dm_zone_map_bio() calls the target - * map operation. - */ - if (dm_emulate_zone_append(io->md)) - r = dm_zone_map_bio(tio); - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Check if the IO needs a special mapping due to zone append + * emulation on zoned target. In this case, dm_zone_map_bio() + * calls the target map operation. + */ + if (unlikely(dm_emulate_zone_append(md))) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + } else r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: /* target has assumed ownership of this io */ if (!ti->accounts_remapped_io) - dm_io_set_flag(io, DM_IO_START_ACCT); + dm_start_io_acct(io, clone); break; case DM_MAPIO_REMAPPED: - /* - * the bio has been remapped so dispatch it, but defer - * dm_start_io_acct() until after possible bio_split(). - */ - __dm_submit_bio_remap(clone, disk_devt(io->md->disk), - tio->old_sector); - dm_io_set_flag(io, DM_IO_START_ACCT); + dm_submit_bio_remap(clone, NULL); break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: - if (unlikely(swap_bios_limit(ti, clone))) - up(&io->md->swap_bios_semaphore); + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) + up(&md->swap_bios_semaphore); free_tio(clone); if (r == DM_MAPIO_KILL) dm_io_dec_pending(io, BLK_STS_IOERR); @@ -1321,6 +1357,31 @@ static void __map_bio(struct bio *clone) } } +static void setup_split_accounting(struct clone_info *ci, unsigned len) +{ + struct dm_io *io = ci->io; + + if (ci->sector_count > len) { + /* + * Split needed, save the mapped part for accounting. + * NOTE: dm_accept_partial_bio() will update accordingly. + */ + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = len; + } + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&io->md->stats))) { + /* + * Save bi_sector in terms of its offset from end of + * original bio, only needed for DM-stats' benefit. + * - saved regardless of whether split needed so that + * dm_accept_partial_bio() doesn't need to. + */ + io->sector_offset = bio_end_sector(ci->bio) - ci->sector; + } +} + static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { @@ -1350,18 +1411,22 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, +static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; + int ret = 0; switch (num_bios) { case 0: break; case 1: + if (len) + setup_split_accounting(ci, *len); clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); __map_bio(clone); + ret = 1; break; default: /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ @@ -1369,9 +1434,12 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, while ((clone = bio_list_pop(&blist))) { dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); __map_bio(clone); + ret += 1; } break; } + + return ret; } static void __send_empty_flush(struct clone_info *ci) @@ -1392,8 +1460,19 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + while ((ti = dm_table_get_target(ci->map, target_nr++))) { + int bios; + + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following subtraction + */ + atomic_sub(1, &ci->io->io_count); bio_uninit(ci->bio); } @@ -1402,11 +1481,18 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target unsigned num_bios) { unsigned len; + int bios; len = min_t(sector_t, ci->sector_count, max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); - __send_duplicate_bios(ci, ti, num_bios, &len); + atomic_add(num_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following (+1) subtraction + */ + atomic_sub(num_bios - bios + 1, &ci->io->io_count); ci->sector += len; ci->sector_count -= len; @@ -1414,21 +1500,24 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target static bool is_abnormal_io(struct bio *bio) { - bool r = false; + unsigned int op = bio_op(bio); - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - r = true; - break; + if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) { + switch (op) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return true; + default: + break; + } } - return r; + return false; } -static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, - int *result) +static blk_status_t __process_abnormal_io(struct clone_info *ci, + struct dm_target *ti) { unsigned num_bios = 0; @@ -1442,8 +1531,6 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, case REQ_OP_WRITE_ZEROES: num_bios = ti->num_write_zeroes_bios; break; - default: - return false; } /* @@ -1452,17 +1539,15 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, * reconfiguration might also have changed that since the * check was performed. */ - if (!num_bios) - *result = -EOPNOTSUPP; - else { - __send_changing_extent_only(ci, ti, num_bios); - *result = 0; - } - return true; + if (unlikely(!num_bios)) + return BLK_STS_NOTSUPP; + + __send_changing_extent_only(ci, ti, num_bios); + return BLK_STS_OK; } /* - * Reuse ->bi_private as hlist head for storing all dm_io instances + * Reuse ->bi_private as dm_io list head for storing all dm_io instances * associated with this bio, and this bio's bi_private needs to be * stored in dm_io->data before the reuse. * @@ -1470,54 +1555,53 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, * touch it after splitting. Meantime it won't be changed by anyone after * bio is submitted. So this reuse is safe. */ -static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) +static inline struct dm_io **dm_poll_list_head(struct bio *bio) { - return (struct hlist_head *)&bio->bi_private; + return (struct dm_io **)&bio->bi_private; } static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); + struct dm_io **head = dm_poll_list_head(bio); if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { bio->bi_opf |= REQ_DM_POLL_LIST; /* * Save .bi_private into dm_io, so that we can reuse - * .bi_private as hlist head for storing dm_io list + * .bi_private as dm_io list head for storing dm_io list */ io->data = bio->bi_private; - INIT_HLIST_HEAD(head); - /* tell block layer to poll for completion */ bio->bi_cookie = ~BLK_QC_T_NONE; + + io->next = NULL; } else { /* * bio recursed due to split, reuse original poll list, * and save bio->bi_private too. */ - io->data = hlist_entry(head->first, struct dm_io, node)->data; + io->data = (*head)->data; + io->next = *head; } - hlist_add_head(&io->node, head); + *head = io; } /* * Select the correct strategy for processing a non-flush bio. */ -static int __split_and_process_bio(struct clone_info *ci) +static blk_status_t __split_and_process_bio(struct clone_info *ci) { struct bio *clone; struct dm_target *ti; unsigned len; - int r; ti = dm_table_find_target(ci->map, ci->sector); - if (!ti) - return -EIO; - - if (__process_abnormal_io(ci, ti, &r)) - return r; + if (unlikely(!ti)) + return BLK_STS_IOERR; + else if (unlikely(ci->is_abnormal_io)) + return __process_abnormal_io(ci, ti); /* * Only support bio polling for normal IO, and the target io is @@ -1526,27 +1610,30 @@ static int __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + setup_split_accounting(ci, len); clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); __map_bio(clone); ci->sector += len; ci->sector_count -= len; - return 0; + return BLK_STS_OK; } static void init_clone_info(struct clone_info *ci, struct mapped_device *md, - struct dm_table *map, struct bio *bio) + struct dm_table *map, struct bio *bio, bool is_abnormal) { ci->map = map; ci->io = alloc_io(md, bio); ci->bio = bio; + ci->is_abnormal_io = is_abnormal; ci->submit_as_polled = false; ci->sector = bio->bi_iter.bi_sector; ci->sector_count = bio_sectors(bio); /* Shouldn't happen but sector_count was being set to 0 so... */ - if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) + if (static_branch_unlikely(&zoned_enabled) && + WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) ci->sector_count = 0; } @@ -1557,10 +1644,21 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { struct clone_info ci; - struct bio *orig_bio = NULL; - int error = 0; + struct dm_io *io; + blk_status_t error = BLK_STS_OK; + bool is_abnormal; - init_clone_info(&ci, md, map, bio); + is_abnormal = is_abnormal_io(bio); + if (unlikely(is_abnormal)) { + /* + * Use blk_queue_split() for abnormal IO (e.g. discard, etc) + * otherwise associated queue_limits won't be imposed. + */ + blk_queue_split(&bio); + } + + init_clone_info(&ci, md, map, bio, is_abnormal); + io = ci.io; if (bio->bi_opf & REQ_PREFLUSH) { __send_empty_flush(&ci); @@ -1569,40 +1667,34 @@ static void dm_split_and_process_bio(struct mapped_device *md, } error = __split_and_process_bio(&ci); - ci.io->map_task = NULL; if (error || !ci.sector_count) goto out; - /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. - * We take a clone of the original to store in ci.io->orig_bio to be - * used by dm_end_io_acct() and for dm_io_complete() to use for - * completion handling. */ - orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, - GFP_NOIO, &md->queue->bio_split); - bio_chain(orig_bio, bio); - trace_block_split(orig_bio, bio->bi_iter.bi_sector); + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); submit_bio_noacct(bio); out: - if (!orig_bio) - orig_bio = bio; - smp_store_release(&ci.io->orig_bio, orig_bio); - if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) - dm_start_io_acct(ci.io, NULL); - /* * Drop the extra reference count for non-POLLED bio, and hold one * reference for POLLED bio, which will be released in dm_poll_bio * - * Add every dm_io instance into the hlist_head which is stored in - * bio->bi_private, so that dm_poll_bio can poll them all. + * Add every dm_io instance into the dm_io list head which is stored + * in bio->bi_private, so that dm_poll_bio can poll them all. */ - if (error || !ci.submit_as_polled) - dm_io_dec_pending(ci.io, errno_to_blk_status(error)); - else - dm_queue_poll_io(bio, ci.io); + if (error || !ci.submit_as_polled) { + /* + * In case of submission failure, the extra reference for + * submitting io isn't consumed yet + */ + if (error) + atomic_dec(&io->io_count); + dm_io_dec_pending(io, error); + } else + dm_queue_poll_io(bio, io); } static void dm_submit_bio(struct bio *bio) @@ -1611,7 +1703,7 @@ static void dm_submit_bio(struct bio *bio) int srcu_idx; struct dm_table *map; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table_bio(md, &srcu_idx, bio); /* If suspended, or map not yet available, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || @@ -1625,16 +1717,9 @@ static void dm_submit_bio(struct bio *bio) goto out; } - /* - * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) - * otherwise associated queue_limits won't be imposed. - */ - if (is_abnormal_io(bio)) - blk_queue_split(&bio); - dm_split_and_process_bio(md, map, bio); out: - dm_put_live_table(md, srcu_idx); + dm_put_live_table_bio(md, srcu_idx, bio); } static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, @@ -1653,18 +1738,16 @@ static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); - struct hlist_head tmp = HLIST_HEAD_INIT; - struct hlist_node *next; - struct dm_io *io; + struct dm_io **head = dm_poll_list_head(bio); + struct dm_io *list = *head; + struct dm_io *tmp = NULL; + struct dm_io *curr, *next; /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ if (!(bio->bi_opf & REQ_DM_POLL_LIST)) return 0; - WARN_ON_ONCE(hlist_empty(head)); - - hlist_move_list(head, &tmp); + WARN_ON_ONCE(!list); /* * Restore .bi_private before possibly completing dm_io. @@ -1675,24 +1758,27 @@ static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, * clearing REQ_DM_POLL_LIST here. */ bio->bi_opf &= ~REQ_DM_POLL_LIST; - bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; + bio->bi_private = list->data; - hlist_for_each_entry_safe(io, next, &tmp, node) { - if (dm_poll_dm_io(io, iob, flags)) { - hlist_del_init(&io->node); + for (curr = list, next = curr->next; curr; curr = next, next = + curr ? curr->next : NULL) { + if (dm_poll_dm_io(curr, iob, flags)) { /* - * clone_endio() has already occurred, so passing - * error as 0 here doesn't override io->status + * clone_endio() has already occurred, so no + * error handling is needed here. */ - dm_io_dec_pending(io, 0); + __dm_io_dec_pending(curr); + } else { + curr->next = tmp; + tmp = curr; } } /* Not done? */ - if (!hlist_empty(&tmp)) { + if (tmp) { bio->bi_opf |= REQ_DM_POLL_LIST; /* Reset bio->bi_private to dm_io list head */ - hlist_move_list(&tmp, head); + *head = tmp; return 0; } return 1; @@ -2901,8 +2987,8 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_io_data_size, - unsigned min_pool_size) + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; @@ -2918,7 +3004,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); + ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? BIOSET_PERCPU_CACHE : 0); if (ret) goto out; if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 9013dc1a7b00..3f89664fea01 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -221,8 +221,8 @@ void dm_kcopyd_exit(void); * Mempool operations */ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_bio_data_size, - unsigned min_pool_size); + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll); void dm_free_md_mempools(struct dm_md_mempools *pools); /* diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b87c8ae41d9b..f2a5e1ea508a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -867,8 +867,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev); /* T10-PI support */ - if (ctrl->device->dev->attrs.device_cap_flags & - IB_DEVICE_INTEGRITY_HANDOVER) + if (ctrl->device->dev->attrs.kernel_cap_flags & + IBK_INTEGRITY_HANDOVER) pi_capable = true; ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev, diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 2fab0b219b25..09fdcac87d17 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1221,8 +1221,8 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; - if (nport->pi_enable && !(cm_id->device->attrs.device_cap_flags & - IB_DEVICE_INTEGRITY_HANDOVER)) { + if (nport->pi_enable && !(cm_id->device->attrs.kernel_cap_flags & + IBK_INTEGRITY_HANDOVER)) { pr_warn("T10-PI is not supported by device %s. Disabling it\n", cm_id->device->name); nport->pi_enable = false; diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index 31ef64eb7fbb..b3a1265711cc 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -649,7 +649,7 @@ static int smbd_ia_open( smbd_max_frmr_depth, info->id->device->attrs.max_fast_reg_page_list_len); info->mr_type = IB_MR_TYPE_MEM_REG; - if (info->id->device->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) info->mr_type = IB_MR_TYPE_SG_GAPS; info->pd = ib_alloc_pd(info->id->device, 0); diff --git a/fs/locks.c b/fs/locks.c index 8c6df10cd9ed..ca28e0e50e56 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -300,6 +300,34 @@ void locks_release_private(struct file_lock *fl) } EXPORT_SYMBOL_GPL(locks_release_private); +/** + * locks_owner_has_blockers - Check for blocking lock requests + * @flctx: file lock context + * @owner: lock owner + * + * Return values: + * %true: @owner has at least one blocker + * %false: @owner has no blockers + */ +bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + struct file_lock *fl; + + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner != owner) + continue; + if (!list_empty(&fl->fl_blocked_requests)) { + spin_unlock(&flctx->flc_lock); + return true; + } + } + spin_unlock(&flctx->flc_lock); + return false; +} +EXPORT_SYMBOL_GPL(locks_owner_has_blockers); + /* Free a lock which is not in use. */ void locks_free_lock(struct file_lock *fl) { @@ -874,6 +902,8 @@ posix_test_lock(struct file *filp, struct file_lock *fl) struct file_lock *cfl; struct file_lock_context *ctx; struct inode *inode = locks_inode(filp); + void *owner; + void (*func)(void); ctx = smp_load_acquire(&inode->i_flctx); if (!ctx || list_empty_careful(&ctx->flc_posix)) { @@ -881,12 +911,23 @@ posix_test_lock(struct file *filp, struct file_lock *fl) return; } +retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (posix_locks_conflict(fl, cfl)) { - locks_copy_conflock(fl, cfl); - goto out; + if (!posix_locks_conflict(fl, cfl)) + continue; + if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable + && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { + owner = cfl->fl_lmops->lm_mod_owner; + func = cfl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + (*func)(); + module_put(owner); + goto retry; } + locks_copy_conflock(fl, cfl); + goto out; } fl->fl_type = F_UNLCK; out: @@ -1060,6 +1101,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, int error; bool added = false; LIST_HEAD(dispose); + void *owner; + void (*func)(void); ctx = locks_get_lock_context(inode, request->fl_type); if (!ctx) @@ -1078,6 +1121,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, new_fl2 = locks_alloc_lock(); } +retry: percpu_down_read(&file_rwsem); spin_lock(&ctx->flc_lock); /* @@ -1089,6 +1133,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!posix_locks_conflict(request, fl)) continue; + if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable + && (*fl->fl_lmops->lm_lock_expirable)(fl)) { + owner = fl->fl_lmops->lm_mod_owner; + func = fl->fl_lmops->lm_expire_lock; + __module_get(owner); + spin_unlock(&ctx->flc_lock); + percpu_up_read(&file_rwsem); + (*func)(); + module_put(owner); + goto retry; + } if (conflock) locks_copy_conflock(conflock, fl); error = -EAGAIN; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 489c9c1d8f31..f172412447f5 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -303,6 +303,8 @@ nfsd_file_put_noref(struct nfsd_file *nf) void nfsd_file_put(struct nfsd_file *nf) { + might_sleep(); + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { nfsd_file_flush(nf); @@ -899,9 +901,9 @@ nfsd_file_is_cached(struct inode *inode) return ret; } -__be32 -nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - unsigned int may_flags, struct nfsd_file **pnf) +static __be32 +nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf, bool open) { __be32 status; struct net *net = SVC_NET(rqstp); @@ -996,10 +998,14 @@ open_file: nfsd_file_gc(); nf->nf_mark = nfsd_file_mark_find_or_create(nf); - if (nf->nf_mark) - status = nfsd_open_verified(rqstp, fhp, S_IFREG, - may_flags, &nf->nf_file); - else + if (nf->nf_mark) { + if (open) { + status = nfsd_open_verified(rqstp, fhp, may_flags, + &nf->nf_file); + trace_nfsd_file_open(nf, status); + } else + status = nfs_ok; + } else status = nfserr_jukebox; /* * If construction failed, or we raced with a call to unlink() @@ -1019,6 +1025,40 @@ open_file: goto out; } +/** + * nfsd_file_acquire - Get a struct nfsd_file with an open file + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file to be opened + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true); +} + +/** + * nfsd_file_create - Get a struct nfsd_file, do not open + * @rqstp: the RPC transaction being executed + * @fhp: the NFS filehandle of the file just created + * @may_flags: NFSD_MAY_ settings for the file + * @pnf: OUT: new or found "struct nfsd_file" object + * + * Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in + * network byte order is returned. + */ +__be32 +nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false); +} + /* * Note that fields may be added, removed or reordered in the future. Programs * scraping this file for info should test the labels to ensure they're diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 435ceab27897..1da0c79a5580 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -59,5 +59,7 @@ void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); +__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); int nfsd_file_cache_stats_open(struct inode *, struct file *); #endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 936eebd4c56d..981a3a7a6e16 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "cache.h" #include "xdr3.h" @@ -220,17 +221,132 @@ nfsd3_proc_write(struct svc_rqst *rqstp) } /* - * With NFSv3, CREATE processing is a lot easier than with NFSv2. - * At least in theory; we'll see how it fares in practice when the - * first reports about SunOS compatibility problems start to pour in... + * Implement NFSv3's unchecked, guarded, and exclusive CREATE + * semantics for regular files. Except for the created file, + * this operation is stateless on the server. + * + * Upon return, caller must release @fhp and @resfhp. */ +static __be32 +nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd3_createargs *argp) +{ + struct iattr *iap = &argp->attrs; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(argp->name, argp->len)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + child = lookup_one_len(argp->name, parent, argp->len); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + u32 *verifier = (u32 *)argp->verf; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (argp->createmode) { + case NFS3_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + case NFS3_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS3_CREATE_EXCLUSIVE: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + break; + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + host_err = vfs_create(&init_user_ns, inode, child, iap->ia_mode, true); + if (host_err < 0) { + status = nfserrno(host_err); + goto out; + } + + /* A newly created file already has a file size of zero. */ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (argp->createmode == NFS3_CREATE_EXCLUSIVE) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET | ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, iap); + +out: + fh_unlock(fhp); + if (child && !IS_ERR(child)) + dput(child); + fh_drop_write(fhp); + return status; +} + static __be32 nfsd3_proc_create(struct svc_rqst *rqstp) { struct nfsd3_createargs *argp = rqstp->rq_argp; struct nfsd3_diropres *resp = rqstp->rq_resp; - svc_fh *dirfhp, *newfhp = NULL; - struct iattr *attr; + svc_fh *dirfhp, *newfhp; dprintk("nfsd: CREATE(3) %s %.*s\n", SVCFH_fmt(&argp->fh), @@ -239,21 +355,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp) dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); - attr = &argp->attrs; - /* Unfudge the mode bits */ - attr->ia_mode &= ~S_IFMT; - if (!(attr->ia_valid & ATTR_MODE)) { - attr->ia_valid |= ATTR_MODE; - attr->ia_mode = S_IFREG; - } else { - attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG; - } - - /* Now create the file and set attributes */ - resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len, - attr, newfhp, argp->createmode, - (u32 *)argp->verf, NULL, NULL); + resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp); return rpc_success; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b207c76a873f..3895eb52d2b1 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -37,6 +37,8 @@ #include #include #include +#include + #include #include @@ -235,6 +237,183 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate &resfh->fh_handle); } +static inline bool nfsd4_create_is_exclusive(int createmode) +{ + return createmode == NFS4_CREATE_EXCLUSIVE || + createmode == NFS4_CREATE_EXCLUSIVE4_1; +} + +static __be32 +nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child, + struct nfsd4_open *open) +{ + struct file *filp; + struct path path; + int oflags; + + oflags = O_CREAT | O_LARGEFILE; + switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) { + case NFS4_SHARE_ACCESS_WRITE: + oflags |= O_WRONLY; + break; + case NFS4_SHARE_ACCESS_BOTH: + oflags |= O_RDWR; + break; + default: + oflags |= O_RDONLY; + } + + path.mnt = fhp->fh_export->ex_path.mnt; + path.dentry = child; + filp = dentry_create(&path, oflags, open->op_iattr.ia_mode, + current_cred()); + if (IS_ERR(filp)) + return nfserrno(PTR_ERR(filp)); + + open->op_filp = filp; + return nfs_ok; +} + +/* + * Implement NFSv4's unchecked, guarded, and exclusive create + * semantics for regular files. Open state for this new file is + * subsequently fabricated in nfsd4_process_open2(). + * + * Upon return, caller must release @fhp and @resfhp. + */ +static __be32 +nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct nfsd4_open *open) +{ + struct iattr *iap = &open->op_iattr; + struct dentry *parent, *child; + __u32 v_mtime, v_atime; + struct inode *inode; + __be32 status; + int host_err; + + if (isdotent(open->op_fname, open->op_fnamelen)) + return nfserr_exist; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); + if (status != nfs_ok) + return status; + parent = fhp->fh_dentry; + inode = d_inode(parent); + + host_err = fh_want_write(fhp); + if (host_err) + return nfserrno(host_err); + + fh_lock_nested(fhp, I_MUTEX_PARENT); + + child = lookup_one_len(open->op_fname, parent, open->op_fnamelen); + if (IS_ERR(child)) { + status = nfserrno(PTR_ERR(child)); + goto out; + } + + if (d_really_is_negative(child)) { + status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (status != nfs_ok) + goto out; + } + + status = fh_compose(resfhp, fhp->fh_export, child, fhp); + if (status != nfs_ok) + goto out; + + v_mtime = 0; + v_atime = 0; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + u32 *verifier = (u32 *)open->op_verf.data; + + /* + * Solaris 7 gets confused (bugid 4218508) if these have + * the high bit set, as do xfs filesystems without the + * "bigtime" feature. So just clear the high bits. If this + * is ever changed to use different attrs for storing the + * verifier, then do_open_lookup() will also need to be + * fixed accordingly. + */ + v_mtime = verifier[0] & 0x7fffffff; + v_atime = verifier[1] & 0x7fffffff; + } + + if (d_really_is_positive(child)) { + status = nfs_ok; + + switch (open->op_createmode) { + case NFS4_CREATE_UNCHECKED: + if (!d_is_reg(child)) + break; + + /* + * In NFSv4, we don't want to truncate the file + * now. This would be wrong if the OPEN fails for + * some other reason. Furthermore, if the size is + * nonzero, we should ignore it according to spec! + */ + open->op_truncate = (iap->ia_valid & ATTR_SIZE) && + !iap->ia_size; + break; + case NFS4_CREATE_GUARDED: + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + break; /* subtle */ + } + status = nfserr_exist; + break; + case NFS4_CREATE_EXCLUSIVE4_1: + if (d_inode(child)->i_mtime.tv_sec == v_mtime && + d_inode(child)->i_atime.tv_sec == v_atime && + d_inode(child)->i_size == 0) { + open->op_created = true; + goto set_attr; /* subtle */ + } + status = nfserr_exist; + } + goto out; + } + + if (!IS_POSIXACL(inode)) + iap->ia_mode &= ~current_umask(); + + status = nfsd4_vfs_create(fhp, child, open); + if (status != nfs_ok) + goto out; + open->op_created = true; + + /* A newly created file already has a file size of zero. */ + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + if (nfsd4_create_is_exclusive(open->op_createmode)) { + iap->ia_valid = ATTR_MTIME | ATTR_ATIME | + ATTR_MTIME_SET|ATTR_ATIME_SET; + iap->ia_mtime.tv_sec = v_mtime; + iap->ia_atime.tv_sec = v_atime; + iap->ia_mtime.tv_nsec = 0; + iap->ia_atime.tv_nsec = 0; + } + +set_attr: + status = nfsd_create_setattr(rqstp, fhp, resfhp, iap); + +out: + fh_unlock(fhp); + if (child && !IS_ERR(child)) + dput(child); + fh_drop_write(fhp); + return status; +} + static __be32 do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh) { @@ -264,16 +443,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * yes | yes | GUARDED4 | GUARDED4 */ - /* - * Note: create modes (UNCHECKED,GUARDED...) are the same - * in NFSv4 as in v3 except EXCLUSIVE4_1. - */ current->fs->umask = open->op_umask; - status = do_nfsd_create(rqstp, current_fh, open->op_fname, - open->op_fnamelen, &open->op_iattr, - *resfh, open->op_createmode, - (u32 *)open->op_verf.data, - &open->op_truncate, &open->op_created); + status = nfsd4_create_file(rqstp, current_fh, *resfh, open); current->fs->umask = 0; if (!status && open->op_label.len) @@ -284,7 +455,7 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru * use the returned bitmask to indicate which attributes * we used to store the verifier: */ - if (nfsd_create_is_exclusive(open->op_createmode) && status == 0) + if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0) open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_MODIFY); } else @@ -375,6 +546,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, (int)open->op_fnamelen, open->op_fname, open->op_openowner); + open->op_filp = NULL; + /* This check required by spec. */ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL) return nfserr_inval; @@ -427,43 +600,35 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; switch (open->op_claim_type) { - case NFS4_OPEN_CLAIM_DELEGATE_CUR: - case NFS4_OPEN_CLAIM_NULL: - status = do_open_lookup(rqstp, cstate, open, &resfh); - if (status) - goto out; - break; - case NFS4_OPEN_CLAIM_PREVIOUS: - status = nfs4_check_open_reclaim(cstate->clp); - if (status) - goto out; - open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; - reclaim = true; - fallthrough; - case NFS4_OPEN_CLAIM_FH: - case NFS4_OPEN_CLAIM_DELEG_CUR_FH: - status = do_open_fhandle(rqstp, cstate, open); - if (status) - goto out; - resfh = &cstate->current_fh; - break; - case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - case NFS4_OPEN_CLAIM_DELEGATE_PREV: - dprintk("NFSD: unsupported OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_notsupp; + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_NULL: + status = do_open_lookup(rqstp, cstate, open, &resfh); + if (status) goto out; - default: - dprintk("NFSD: Invalid OPEN claim type %d\n", - open->op_claim_type); - status = nfserr_inval; + break; + case NFS4_OPEN_CLAIM_PREVIOUS: + status = nfs4_check_open_reclaim(cstate->clp); + if (status) goto out; + open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; + reclaim = true; + fallthrough; + case NFS4_OPEN_CLAIM_FH: + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + status = do_open_fhandle(rqstp, cstate, open); + if (status) + goto out; + resfh = &cstate->current_fh; + break; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + status = nfserr_notsupp; + goto out; + default: + status = nfserr_inval; + goto out; } - /* - * nfsd4_process_open2() does the actual opening of the file. If - * successful, it (1) truncates the file if open->op_truncate was - * set, (2) sets open->op_stateid, (3) sets open->op_delegation. - */ + status = nfsd4_process_open2(rqstp, resfh, open); WARN(status && open->op_created, "nfsd4_process_open2 failed to open newly-created file! status=%u\n", @@ -471,6 +636,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (reclaim && !status) nn->somebody_reclaimed = true; out: + if (open->op_filp) { + fput(open->op_filp); + open->op_filp = NULL; + } if (resfh && resfh != &cstate->current_fh) { fh_dup2(&cstate->current_fh, resfh); fh_put(resfh); @@ -801,7 +970,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, @@ -2481,11 +2650,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) cstate->minorversion = args->minorversion; fh_init(current_fh, NFS4_FHSIZE); fh_init(save_fh, NFS4_FHSIZE); + /* * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. @@ -2600,7 +2770,7 @@ encode_op: out: cstate->status = status; /* Reset deferral mechanism for RPC deferrals */ - set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); return rpc_success; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 234e852fcdfa..9409a0dc1b76 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -125,6 +125,23 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static struct workqueue_struct *laundry_wq; + +int nfsd4_create_laundry_wq(void) +{ + int rc = 0; + + laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); + if (laundry_wq == NULL) + rc = -ENOMEM; + return rc; +} + +void nfsd4_destroy_laundry_wq(void) +{ + destroy_workqueue(laundry_wq); +} + static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_flags & NFS4_SESSION_DEAD; @@ -152,6 +169,7 @@ static __be32 get_client_locked(struct nfs4_client *clp) if (is_client_expired(clp)) return nfserr_expired; atomic_inc(&clp->cl_rpc_users); + clp->cl_state = NFSD4_ACTIVE; return nfs_ok; } @@ -172,6 +190,7 @@ renew_client_locked(struct nfs4_client *clp) list_move_tail(&clp->cl_lru, &nn->client_lru); clp->cl_time = ktime_get_boottime_seconds(); + clp->cl_state = NFSD4_ACTIVE; } static void put_client_renew_locked(struct nfs4_client *clp) @@ -690,6 +709,57 @@ static unsigned int file_hashval(struct svc_fh *fh) static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; +/* + * Check if courtesy clients have conflicting access and resolve it if possible + * + * access: is op_share_access if share_access is true. + * Check if access mode, op_share_access, would conflict with + * the current deny mode of the file 'fp'. + * access: is op_share_deny if share_access is false. + * Check if the deny mode, op_share_deny, would conflict with + * current access of the file 'fp'. + * stp: skip checking this entry. + * new_stp: normal open, not open upgrade. + * + * Function returns: + * false - access/deny mode conflict with normal client. + * true - no conflict or conflict with courtesy client(s) is resolved. + */ +static bool +nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp, + struct nfs4_ol_stateid *stp, u32 access, bool share_access) +{ + struct nfs4_ol_stateid *st; + bool resolvable = true; + unsigned char bmap; + struct nfsd_net *nn; + struct nfs4_client *clp; + + lockdep_assert_held(&fp->fi_lock); + list_for_each_entry(st, &fp->fi_stateids, st_perfile) { + /* ignore lock stateid */ + if (st->st_openstp) + continue; + if (st == stp && new_stp) + continue; + /* check file access against deny mode or vice versa */ + bmap = share_access ? st->st_deny_bmap : st->st_access_bmap; + if (!(access & bmap_to_share_mode(bmap))) + continue; + clp = st->st_stid.sc_client; + if (try_to_expire_client(clp)) + continue; + resolvable = false; + break; + } + if (resolvable) { + clp = stp->st_stid.sc_client; + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + return resolvable; +} + static void __nfs4_file_get_access(struct nfs4_file *fp, u32 access) { @@ -1090,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; + dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); get_nfs4_file(fp); @@ -2004,6 +2075,8 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) idr_init(&clp->cl_stateids); atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; + clp->cl_state = NFSD4_ACTIVE; + atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); @@ -2408,10 +2481,17 @@ static int client_info_show(struct seq_file *m, void *v) memcpy(&clid, &clp->cl_clientid, sizeof(clid)); seq_printf(m, "clientid: 0x%llx\n", clid); seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr); - if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) + + if (clp->cl_state == NFSD4_COURTESY) + seq_puts(m, "status: courtesy\n"); + else if (clp->cl_state == NFSD4_EXPIRABLE) + seq_puts(m, "status: expirable\n"); + else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags)) seq_puts(m, "status: confirmed\n"); else seq_puts(m, "status: unconfirmed\n"); + seq_printf(m, "seconds from last renew: %lld\n", + ktime_get_boottime_seconds() - clp->cl_time); seq_printf(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); @@ -4694,9 +4774,18 @@ nfsd_break_deleg_cb(struct file_lock *fl) bool ret = false; struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner; struct nfs4_file *fp = dp->dl_stid.sc_file; + struct nfs4_client *clp = dp->dl_stid.sc_client; + struct nfsd_net *nn; trace_nfsd_cb_recall(&dp->dl_stid); + dp->dl_recalled = true; + atomic_inc(&clp->cl_delegs_in_recall); + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + } + /* * We don't want the locks code to timeout the lease for us; * we'll remove it ourself if a delegation isn't returned @@ -4739,9 +4828,14 @@ static int nfsd_change_deleg_cb(struct file_lock *onlist, int arg, struct list_head *dispose) { - if (arg & F_UNLCK) + struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner; + struct nfs4_client *clp = dp->dl_stid.sc_client; + + if (arg & F_UNLCK) { + if (dp->dl_recalled) + atomic_dec(&clp->cl_delegs_in_recall); return lease_modify(onlist, arg, dispose); - else + } else return -EAGAIN; } @@ -4947,7 +5041,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, - struct nfsd4_open *open) + struct nfsd4_open *open, bool new_stp) { struct nfsd_file *nf = NULL; __be32 status; @@ -4963,6 +5057,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, */ status = nfs4_file_check_deny(fp, open->op_share_deny); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4970,6 +5071,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, /* set access to the file */ status = nfs4_file_get_access(fp, open->op_share_access); if (status != nfs_ok) { + if (status != nfserr_share_denied) { + spin_unlock(&fp->fi_lock); + goto out; + } + if (nfs4_resolve_deny_conflicts_locked(fp, new_stp, + stp, open->op_share_access, true)) + status = nfserr_jukebox; spin_unlock(&fp->fi_lock); goto out; } @@ -4985,9 +5093,19 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); - if (status) - goto out_put_access; + + if (!open->op_filp) { + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + } else { + status = nfsd_file_create(rqstp, cur_fh, access, &nf); + if (status != nfs_ok) + goto out_put_access; + nf->nf_file = open->op_filp; + open->op_filp = NULL; + } + spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { fp->fi_fds[oflag] = nf; @@ -5016,21 +5134,29 @@ out_put_access: } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, + struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, + struct nfsd4_open *open) { __be32 status; unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) - return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); + return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false); /* test and set deny mode */ spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); if (status == nfs_ok) { - set_deny(open->op_share_deny, stp); - fp->fi_share_deny |= + if (status != nfserr_share_denied) { + set_deny(open->op_share_deny, stp); + fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); + } else { + if (nfs4_resolve_deny_conflicts_locked(fp, false, + stp, open->op_share_deny, false)) + status = nfserr_jukebox; + } } spin_unlock(&fp->fi_lock); @@ -5322,6 +5448,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open, */ } +/** + * nfsd4_process_open2 - finish open processing + * @rqstp: the RPC transaction being executed + * @current_fh: NFSv4 COMPOUND's current filehandle + * @open: OPEN arguments + * + * If successful, (1) truncate the file if open->op_truncate was + * set, (2) set open->op_stateid, (3) set open->op_delegation. + * + * Returns %nfs_ok on success; otherwise an nfs4stat value in + * network byte order is returned. + */ __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) { @@ -5371,7 +5509,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } } else { - status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); @@ -5605,6 +5743,81 @@ static void nfsd4_ssc_expire_umount(struct nfsd_net *nn) } #endif +/* Check if any lock belonging to this lockowner has any blockers */ +static bool +nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo) +{ + struct file_lock_context *ctx; + struct nfs4_ol_stateid *stp; + struct nfs4_file *nf; + + list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) { + nf = stp->st_stid.sc_file; + ctx = nf->fi_inode->i_flctx; + if (!ctx) + continue; + if (locks_owner_has_blockers(ctx, lo)) + return true; + } + return false; +} + +static bool +nfs4_anylock_blockers(struct nfs4_client *clp) +{ + int i; + struct nfs4_stateowner *so; + struct nfs4_lockowner *lo; + + if (atomic_read(&clp->cl_delegs_in_recall)) + return true; + spin_lock(&clp->cl_lock); + for (i = 0; i < OWNER_HASH_SIZE; i++) { + list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + if (so->so_is_open_owner) + continue; + lo = lockowner(so); + if (nfs4_lockowner_has_blockers(lo)) { + spin_unlock(&clp->cl_lock); + return true; + } + } + } + spin_unlock(&clp->cl_lock); + return false; +} + +static void +nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, + struct laundry_time *lt) +{ + struct list_head *pos, *next; + struct nfs4_client *clp; + + INIT_LIST_HEAD(reaplist); + spin_lock(&nn->client_lock); + list_for_each_safe(pos, next, &nn->client_lru) { + clp = list_entry(pos, struct nfs4_client, cl_lru); + if (clp->cl_state == NFSD4_EXPIRABLE) + goto exp_client; + if (!state_expired(lt, clp->cl_time)) + break; + if (!atomic_read(&clp->cl_rpc_users)) + clp->cl_state = NFSD4_COURTESY; + if (!client_has_state(clp) || + ktime_get_boottime_seconds() >= + (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT)) + goto exp_client; + if (nfs4_anylock_blockers(clp)) { +exp_client: + if (!mark_client_expired_locked(clp)) + list_add(&clp->cl_lru, reaplist); + } + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -5627,7 +5840,6 @@ nfs4_laundromat(struct nfsd_net *nn) goto out; } nfsd4_end_grace(nn); - INIT_LIST_HEAD(&reaplist); spin_lock(&nn->s2s_cp_lock); idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) { @@ -5637,17 +5849,7 @@ nfs4_laundromat(struct nfsd_net *nn) _free_cpntf_state_locked(nn, cps); } spin_unlock(&nn->s2s_cp_lock); - - spin_lock(&nn->client_lock); - list_for_each_safe(pos, next, &nn->client_lru) { - clp = list_entry(pos, struct nfs4_client, cl_lru); - if (!state_expired(<, clp->cl_time)) - break; - if (mark_client_expired_locked(clp)) - continue; - list_add(&clp->cl_lru, &reaplist); - } - spin_unlock(&nn->client_lock); + nfs4_get_client_reaplist(nn, &reaplist, <); list_for_each_safe(pos, next, &reaplist) { clp = list_entry(pos, struct nfs4_client, cl_lru); trace_nfsd_clid_purged(&clp->cl_clientid); @@ -5722,7 +5924,6 @@ out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } -static struct workqueue_struct *laundry_wq; static void laundromat_main(struct work_struct *); static void @@ -6551,6 +6752,29 @@ nfsd4_lm_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +/* return pointer to struct nfs4_client if client is expirable */ +static bool +nfsd4_lm_lock_expirable(struct file_lock *cfl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner; + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn; + + if (try_to_expire_client(clp)) { + nn = net_generic(clp->net, nfsd_net_id); + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + return true; + } + return false; +} + +/* schedule laundromat to run immediately and wait for it to complete */ +static void +nfsd4_lm_expire_lock(void) +{ + flush_workqueue(laundry_wq); +} + static void nfsd4_lm_notify(struct file_lock *fl) { @@ -6577,9 +6801,12 @@ nfsd4_lm_notify(struct file_lock *fl) } static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_mod_owner = THIS_MODULE, .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_lm_get_owner, .lm_put_owner = nfsd4_lm_put_owner, + .lm_lock_expirable = nfsd4_lm_lock_expirable, + .lm_expire_lock = nfsd4_lm_expire_lock, }; static inline void @@ -7297,22 +7524,36 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) return status; } +/** + * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations + * @rqstp: RPC transaction + * @cstate: NFSv4 COMPOUND state + * @u: RELEASE_LOCKOWNER arguments + * + * The lockowner's so_count is bumped when a lock record is added + * or when copying a conflicting lock. The latter case is brief, + * but can lead to fleeting false positives when looking for + * locks-in-use. + * + * Return values: + * %nfs_ok: lockowner released or not found + * %nfserr_locks_held: lockowner still in use + * %nfserr_stale_clientid: clientid no longer active + * %nfserr_expired: clientid not recognized + */ __be32 nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner; - clientid_t *clid = &rlockowner->rl_clientid; - struct nfs4_stateowner *sop; - struct nfs4_lockowner *lo = NULL; - struct nfs4_ol_stateid *stp; - struct xdr_netobj *owner = &rlockowner->rl_owner; - unsigned int hashval = ownerstr_hashval(owner); - __be32 status; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + clientid_t *clid = &rlockowner->rl_clientid; + struct nfs4_ol_stateid *stp; + struct nfs4_lockowner *lo; struct nfs4_client *clp; - LIST_HEAD (reaplist); + LIST_HEAD(reaplist); + __be32 status; dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", clid->cl_boot, clid->cl_id); @@ -7320,34 +7561,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, status = set_client(clid, cstate, nn); if (status) return status; - clp = cstate->clp; - /* Find the matching lock stateowner */ + spin_lock(&clp->cl_lock); - list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval], - so_strhash) { - - if (sop->so_is_open_owner || !same_owner_str(sop, owner)) - continue; - - /* see if there are still any locks associated with it */ - lo = lockowner(sop); - list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if (check_for_locks(stp->st_stid.sc_file, lo)) { - status = nfserr_locks_held; - spin_unlock(&clp->cl_lock); - return status; - } - } - - nfs4_get_stateowner(sop); - break; - } + lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner); if (!lo) { spin_unlock(&clp->cl_lock); - return status; + return nfs_ok; + } + if (atomic_read(&lo->lo_owner.so_count) != 2) { + spin_unlock(&clp->cl_lock); + nfs4_put_stateowner(&lo->lo_owner); + return nfserr_locks_held; } - unhash_lockowner_locked(lo); while (!list_empty(&lo->lo_owner.so_stateids)) { stp = list_first_entry(&lo->lo_owner.so_stateids, @@ -7357,11 +7583,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); + free_ol_stateid_reaplist(&reaplist); remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); - - return status; + return nfs_ok; } static inline struct nfs4_client_reclaim * @@ -7602,22 +7828,12 @@ nfs4_state_start(void) { int ret; - laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4"); - if (laundry_wq == NULL) { - ret = -ENOMEM; - goto out; - } ret = nfsd4_create_callback_queue(); if (ret) - goto out_free_laundry; + return ret; set_max_delegations(); return 0; - -out_free_laundry: - destroy_workqueue(laundry_wq); -out: - return ret; } void @@ -7654,7 +7870,6 @@ nfs4_state_shutdown_net(struct net *net) void nfs4_state_shutdown(void) { - destroy_workqueue(laundry_wq); nfsd4_destroy_callback_queue(); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index da92e7d2ab6a..61b2aae81abb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2411,7 +2411,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); return true; } diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 0b3f12aa37ff..7da88bdc0d6c 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -206,7 +206,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct svc_cacherep *rp; unsigned int i; - nfsd_reply_cache_stats_destroy(nn); unregister_shrinker(&nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { @@ -217,6 +216,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) rp, nn); } } + nfsd_reply_cache_stats_destroy(nn); kvfree(nn->drc_hashtbl); nn->drc_hashtbl = NULL; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 16920e4512bd..0621c2faf242 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1535,20 +1535,25 @@ static int __init init_nfsd(void) retval = create_proc_exports_entry(); if (retval) goto out_free_lockd; - retval = register_filesystem(&nfsd_fs_type); - if (retval) - goto out_free_exports; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_filesystem; + goto out_free_exports; retval = register_cld_notifier(); + if (retval) + goto out_free_subsys; + retval = nfsd4_create_laundry_wq(); + if (retval) + goto out_free_cld; + retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_all; return 0; out_free_all: + nfsd4_destroy_laundry_wq(); +out_free_cld: + unregister_cld_notifier(); +out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_filesystem: - unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); @@ -1566,6 +1571,8 @@ out_free_slabs: static void __exit exit_nfsd(void) { + unregister_filesystem(&nfsd_fs_type); + nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); @@ -1575,7 +1582,6 @@ static void __exit exit_nfsd(void) nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); - unregister_filesystem(&nfsd_fs_type); } MODULE_AUTHOR("Olaf Kirch "); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 4fc1fd639527..847b482155ae 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -162,6 +162,8 @@ void nfs4_state_shutdown_net(struct net *net); int nfs4_reset_recoverydir(char *recdir); char * nfs4_recoverydir(void); bool nfsd4_spo_must_allow(struct svc_rqst *rqstp); +int nfsd4_create_laundry_wq(void); +void nfsd4_destroy_laundry_wq(void); #else static inline int nfsd4_init_slabs(void) { return 0; } static inline void nfsd4_free_slabs(void) { } @@ -175,6 +177,8 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) { return false; } +static inline int nfsd4_create_laundry_wq(void) { return 0; }; +static inline void nfsd4_destroy_laundry_wq(void) {}; #endif /* @@ -336,6 +340,7 @@ void nfsd_lockd_shutdown(void); #define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ +#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 95457cfd37fc..f3d6313914ed 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -149,6 +149,7 @@ struct nfs4_delegation { /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; + bool dl_recalled; }; #define cb_to_delegation(cb) \ @@ -282,6 +283,28 @@ struct nfsd4_sessionid { #define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ +/* + * State Meaning Where set + * -------------------------------------------------------------------------- + * | NFSD4_ACTIVE | Confirmed, active | Default | + * |------------------- ----------------------------------------------------| + * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist | + * | | Lease/lock/share | | + * | | reservation conflict | | + * | | can cause Courtesy | | + * | | client to be expired | | + * |------------------------------------------------------------------------| + * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat | + * | | expired by Laundromat| try_to_expire_client | + * | | due to conflict | | + * |------------------------------------------------------------------------| + */ +enum { + NFSD4_ACTIVE = 0, + NFSD4_COURTESY, + NFSD4_EXPIRABLE, +}; + /* * struct nfs4_client - one per client. Clientids live here. * @@ -385,6 +408,9 @@ struct nfs4_client { struct list_head async_copies; /* list of async copies */ spinlock_t async_lock; /* lock for async copies */ atomic_t cl_cb_inflight; /* Outstanding callbacks */ + + unsigned int cl_state; + atomic_t cl_delegs_in_recall; }; /* struct nfs4_client_reset @@ -702,4 +728,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); extern void nfsd4_record_grace_done(struct nfsd_net *nn); +static inline bool try_to_expire_client(struct nfs4_client *clp) +{ + cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE); + return clp->cl_state == NFSD4_EXPIRABLE; +} #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 242fa123e0e9..a60ead3b227a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -692,12 +692,6 @@ DEFINE_CLID_EVENT(confirmed_r); /* * from fs/nfsd/filecache.h */ -TRACE_DEFINE_ENUM(NFSD_FILE_HASHED); -TRACE_DEFINE_ENUM(NFSD_FILE_PENDING); -TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ); -TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE); -TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED); - #define show_nf_flags(val) \ __print_flags(val, "|", \ { 1 << NFSD_FILE_HASHED, "HASHED" }, \ @@ -784,6 +778,34 @@ TRACE_EVENT(nfsd_file_acquire, __entry->nf_file, __entry->status) ); +TRACE_EVENT(nfsd_file_open, + TP_PROTO(struct nfsd_file *nf, __be32 status), + TP_ARGS(nf, status), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) /* cannot be dereferenced */ + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(void *, nf_file) /* cannot be dereferenced */ + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), + __entry->nf_file) +) + DECLARE_EVENT_CLASS(nfsd_file_search_class, TP_PROTO(struct inode *inode, unsigned int hash, int found), TP_ARGS(inode, hash, found), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c22ad0532e8e..840e3af63a6f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -827,14 +827,23 @@ retry: return err; } +/** + * nfsd_open_verified - Open a regular file for the filecache + * @rqstp: RPC request + * @fhp: NFS filehandle of the file to open + * @may_flags: internal permission flags + * @filp: OUT: open "struct file *" + * + * Returns an nfsstat value in network byte order. + */ __be32 -nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, - int may_flags, struct file **filp) +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, + struct file **filp) { __be32 err; validate_process_creds(); - err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); validate_process_creds(); return err; } @@ -849,17 +858,11 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct svc_rqst *rqstp = sd->u.data; - struct page **pp = rqstp->rq_next_page; - struct page *page = buf->page; - if (rqstp->rq_res.page_len == 0) { - svc_rqst_replace_page(rqstp, page); + svc_rqst_replace_page(rqstp, buf->page); + if (rqstp->rq_res.page_len == 0) rqstp->rq_res.page_base = buf->offset; - } else if (page != pp[-1]) { - svc_rqst_replace_page(rqstp, page); - } rqstp->rq_res.page_len += sd->len; - return sd->len; } @@ -1187,14 +1190,26 @@ out: return err; } -static __be32 -nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, - struct iattr *iap) +/** + * nfsd_create_setattr - Set a created file's attributes + * @rqstp: RPC transaction being executed + * @fhp: NFS filehandle of parent directory + * @resfhp: NFS filehandle of new object + * @iap: requested attributes of new object + * + * Returns nfs_ok on success, or an nfsstat in network byte order. + */ +__be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct iattr *iap) { + __be32 status; + /* - * Mode has already been set earlier in create: + * Mode has already been set by file creation. */ iap->ia_valid &= ~ATTR_MODE; + /* * Setting uid/gid works only for root. Irix appears to * send along the gid on create when it tries to implement @@ -1202,10 +1217,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, */ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + + /* + * Callers expect new file metadata to be committed even + * if the attributes have not changed. + */ if (iap->ia_valid) - return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); - /* Callers expect file metadata to be committed here */ - return nfserrno(commit_metadata(resfhp)); + status = nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0); + else + status = nfserrno(commit_metadata(resfhp)); + + /* + * Transactional filesystems had a chance to commit changes + * for both parent and child simultaneously making the + * following commit_metadata a noop in many cases. + */ + if (!status) + status = nfserrno(commit_metadata(fhp)); + + /* + * Update the new filehandle to pick up the new attributes. + */ + if (!status) + status = fh_update(resfhp); + + return status; } /* HPUX client sometimes creates a file in mode 000, and sets size to 0. @@ -1232,7 +1268,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild; struct inode *dirp; __be32 err; - __be32 err2; int host_err; dentry = fhp->fh_dentry; @@ -1305,22 +1340,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, if (host_err < 0) goto out_nfserr; - err = nfsd_create_setattr(rqstp, resfhp, iap); + err = nfsd_create_setattr(rqstp, fhp, resfhp, iap); - /* - * nfsd_create_setattr already committed the child. Transactional - * filesystems had a chance to commit changes for both parent and - * child simultaneously making the following commit_metadata a - * noop. - */ - err2 = nfserrno(commit_metadata(fhp)); - if (err2) - err = err2; - /* - * Update the file handle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); out: dput(dchild); return err; @@ -1375,172 +1396,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, rdev, resfhp); } -/* - * NFSv3 and NFSv4 version of nfsd_create - */ -__be32 -do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, - char *fname, int flen, struct iattr *iap, - struct svc_fh *resfhp, int createmode, u32 *verifier, - bool *truncp, bool *created) -{ - struct dentry *dentry, *dchild = NULL; - struct inode *dirp; - __be32 err; - int host_err; - __u32 v_mtime=0, v_atime=0; - - err = nfserr_perm; - if (!flen) - goto out; - err = nfserr_exist; - if (isdotent(fname, flen)) - goto out; - if (!(iap->ia_valid & ATTR_MODE)) - iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); - if (err) - goto out; - - dentry = fhp->fh_dentry; - dirp = d_inode(dentry); - - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - - fh_lock_nested(fhp, I_MUTEX_PARENT); - - /* - * Compose the response file handle. - */ - dchild = lookup_one_len(fname, dentry, flen); - host_err = PTR_ERR(dchild); - if (IS_ERR(dchild)) - goto out_nfserr; - - /* If file doesn't exist, check for permissions to create one */ - if (d_really_is_negative(dchild)) { - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); - if (err) - goto out; - } - - err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); - if (err) - goto out; - - if (nfsd_create_is_exclusive(createmode)) { - /* solaris7 gets confused (bugid 4218508) if these have - * the high bit set, as do xfs filesystems without the - * "bigtime" feature. So just clear the high bits. If this is - * ever changed to use different attrs for storing the - * verifier, then do_open_lookup() will also need to be fixed - * accordingly. - */ - v_mtime = verifier[0]&0x7fffffff; - v_atime = verifier[1]&0x7fffffff; - } - - if (d_really_is_positive(dchild)) { - err = 0; - - switch (createmode) { - case NFS3_CREATE_UNCHECKED: - if (! d_is_reg(dchild)) - goto out; - else if (truncp) { - /* in nfsv4, we need to treat this case a little - * differently. we don't want to truncate the - * file now; this would be wrong if the OPEN - * fails for some other reason. furthermore, - * if the size is nonzero, we should ignore it - * according to spec! - */ - *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; - } - else { - iap->ia_valid &= ATTR_SIZE; - goto set_attr; - } - break; - case NFS3_CREATE_EXCLUSIVE: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = true; - break; - } - fallthrough; - case NFS4_CREATE_EXCLUSIVE4_1: - if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime - && d_inode(dchild)->i_atime.tv_sec == v_atime - && d_inode(dchild)->i_size == 0 ) { - if (created) - *created = true; - goto set_attr; - } - fallthrough; - case NFS3_CREATE_GUARDED: - err = nfserr_exist; - } - fh_drop_write(fhp); - goto out; - } - - if (!IS_POSIXACL(dirp)) - iap->ia_mode &= ~current_umask(); - - host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); - if (host_err < 0) { - fh_drop_write(fhp); - goto out_nfserr; - } - if (created) - *created = true; - - nfsd_check_ignore_resizing(iap); - - if (nfsd_create_is_exclusive(createmode)) { - /* Cram the verifier into atime/mtime */ - iap->ia_valid = ATTR_MTIME|ATTR_ATIME - | ATTR_MTIME_SET|ATTR_ATIME_SET; - /* XXX someone who knows this better please fix it for nsec */ - iap->ia_mtime.tv_sec = v_mtime; - iap->ia_atime.tv_sec = v_atime; - iap->ia_mtime.tv_nsec = 0; - iap->ia_atime.tv_nsec = 0; - } - - set_attr: - err = nfsd_create_setattr(rqstp, resfhp, iap); - - /* - * nfsd_create_setattr already committed the child - * (and possibly also the parent). - */ - if (!err) - err = nfserrno(commit_metadata(fhp)); - - /* - * Update the filehandle to get the new inode info. - */ - if (!err) - err = fh_update(resfhp); - - out: - fh_unlock(fhp); - if (dchild && !IS_ERR(dchild)) - dput(dchild); - fh_drop_write(fhp); - return err; - - out_nfserr: - err = nfserrno(host_err); - goto out; -} - /* * Read a symlink. On entry, *lenp must contain the maximum path length that * fits into the buffer. On return, it contains the true length. diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ccb87b2864f6..26347d76f44a 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -69,10 +69,8 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, int type, dev_t rdev, struct svc_fh *res); __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); -__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, - char *name, int len, struct iattr *attrs, - struct svc_fh *res, int createmode, - u32 *verifier, bool *truncp, bool *created); +__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct svc_fh *resfhp, struct iattr *iap); __be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp, u64 offset, u32 count, __be32 *verf); #ifdef CONFIG_NFSD_V4 @@ -88,7 +86,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, @@ -159,10 +157,4 @@ static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) AT_STATX_SYNC_AS_STAT)); } -static inline int nfsd_create_is_exclusive(int createmode) -{ - return createmode == NFS3_CREATE_EXCLUSIVE - || createmode == NFS4_CREATE_EXCLUSIVE4_1; -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 846ab6df9d48..7b744011f2d3 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,6 +273,7 @@ struct nfsd4_open { bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ + struct file *op_filp; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ struct nfs4_clnt_odstate *op_odstate; /* used during processing */ diff --git a/fs/open.c b/fs/open.c index c535ca7a3d26..11e233e23a8e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -982,6 +982,48 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * dentry_create - Create and open a file + * @path: path to create + * @flags: O_ flags + * @mode: mode bits for new file + * @cred: credentials to use + * + * Caller must hold the parent directory's lock, and have prepared + * a negative dentry, placed in @path->dentry, for the new file. + * + * Caller sets @path->mnt to the vfsmount of the filesystem where + * the new file is to be created. The parent directory and the + * negative dentry must reside on the same filesystem instance. + * + * On success, returns a "struct file *". Otherwise a ERR_PTR + * is returned. + */ +struct file *dentry_create(const struct path *path, int flags, umode_t mode, + const struct cred *cred) +{ + struct file *f; + int error; + + validate_creds(cred); + f = alloc_empty_file(flags, cred); + if (IS_ERR(f)) + return f; + + error = vfs_create(mnt_user_ns(path->mnt), + d_inode(path->dentry->d_parent), + path->dentry, mode, true); + if (!error) + error = vfs_open(path, f); + + if (unlikely(error)) { + fput(f); + return ERR_PTR(error); + } + return f; +} +EXPORT_SYMBOL(dentry_create); + struct file *open_with_fake_path(const struct path *path, int flags, struct inode *inode, const struct cred *cred) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 4fd63af9aab1..1eeb6cabbdb2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1022,6 +1022,7 @@ struct file_lock_operations { }; struct lock_manager_operations { + void *lm_mod_owner; fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ @@ -1030,6 +1031,8 @@ struct lock_manager_operations { int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); + bool (*lm_lock_expirable)(struct file_lock *cfl); + void (*lm_expire_lock)(void); }; struct lock_manager { @@ -1167,6 +1170,8 @@ extern void lease_unregister_notifier(struct notifier_block *); struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); +extern bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1302,6 +1307,11 @@ static inline int lease_modify(struct file_lock *fl, int arg, struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} +static inline bool locks_owner_has_blockers(struct file_lock_context *flctx, + fl_owner_t owner) +{ + return false; +} #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) @@ -2630,6 +2640,8 @@ static inline struct file *file_open_root_mnt(struct vfsmount *mnt, name, flags, mode); } extern struct file * dentry_open(const struct path *, int, const struct cred *); +extern struct file *dentry_create(const struct path *path, int flags, + umode_t mode, const struct cred *cred); extern struct file * open_with_fake_path(const struct path *, int, struct inode*, const struct cred *); static inline struct file *file_clone_open(struct file *file) diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index b134b2b3371c..ec5a555df96f 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -121,17 +121,17 @@ struct cache_detail { struct net *net; }; - /* this must be embedded in any request structure that * identifies an object that will want a callback on * a cache fill */ struct cache_req { struct cache_deferred_req *(*defer)(struct cache_req *req); - int thread_wait; /* How long (jiffies) we can block the - * current thread to wait for updates. - */ + unsigned long thread_wait; /* How long (jiffies) we can block the + * current thread to wait for updates. + */ }; + /* this must be embedded in a deferred_request that is being * delayed awaiting cache-fill */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 217711fc9cac..daecb009c05b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -257,7 +257,6 @@ struct svc_rqst { void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ - size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; struct xdr_stream rq_arg_stream; struct xdr_stream rq_res_stream; @@ -397,7 +396,6 @@ struct svc_deferred_req { size_t daddrlen; void *xprt_ctxt; struct cache_deferred_req handle; - size_t xprt_hlen; int argslen; __be32 args[]; }; @@ -506,7 +504,7 @@ int svc_register(const struct svc_serv *, struct net *, const int, void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); -struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); +struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv); char * svc_print_addr(struct svc_rqst *, char *, size_t); const char * svc_proc_name(const struct svc_rqst *rqstp); int svc_encode_result_payload(struct svc_rqst *rqstp, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 11ee4eaf84bd..9c6317cf80d5 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -220,32 +220,24 @@ enum rdma_link_layer { }; enum ib_device_cap_flags { - IB_DEVICE_RESIZE_MAX_WR = (1 << 0), - IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), - IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), - IB_DEVICE_RAW_MULTI = (1 << 3), - IB_DEVICE_AUTO_PATH_MIG = (1 << 4), - IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), - IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), - IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), - IB_DEVICE_SHUTDOWN_PORT = (1 << 8), - /* Not in use, former INIT_TYPE = (1 << 9),*/ - IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), - IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), - IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), - IB_DEVICE_SRQ_RESIZE = (1 << 13), - IB_DEVICE_N_NOTIFY_CQ = (1 << 14), + IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR, + IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR, + IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR, + IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI, + IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG, + IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT, + IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE, + IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD, + IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT, + /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */ + IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT, + IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID, + IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN, + IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE, + IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ, - /* - * This device supports a per-device lkey or stag that can be - * used without performing a memory registration for the local - * memory. Note that ULPs should never check this flag, but - * instead of use the local_dma_lkey flag in the ib_pd structure, - * which will always contain a usable lkey. - */ - IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), - /* Reserved, old SEND_W_INV = (1 << 16),*/ - IB_DEVICE_MEM_WINDOW = (1 << 17), + /* Reserved, old SEND_W_INV = 1 << 16,*/ + IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW, /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB @@ -253,9 +245,8 @@ enum ib_device_cap_flags { * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ - IB_DEVICE_UD_IP_CSUM = (1 << 18), - IB_DEVICE_UD_TSO = (1 << 19), - IB_DEVICE_XRC = (1 << 20), + IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM, + IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC, /* * This device supports the IB "base memory management extension", @@ -266,31 +257,53 @@ enum ib_device_cap_flags { * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. */ - IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), - IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), - IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), - IB_DEVICE_RC_IP_CSUM = (1 << 25), + IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS, + IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A, + IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B, + IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM, /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */ - IB_DEVICE_RAW_IP_CSUM = (1 << 26), - /* - * Devices should set IB_DEVICE_CROSS_CHANNEL if they - * support execution of WQEs that involve synchronization - * of I/O operations with single completion queue managed - * by hardware. - */ - IB_DEVICE_CROSS_CHANNEL = (1 << 27), - IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), - IB_DEVICE_INTEGRITY_HANDOVER = (1 << 30), - IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), - IB_DEVICE_SG_GAPS_REG = (1ULL << 32), - IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), + IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM, + IB_DEVICE_MANAGED_FLOW_STEERING = + IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING, /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ - IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), - IB_DEVICE_RDMA_NETDEV_OPA = (1ULL << 35), + IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS, /* The device supports padding incoming writes to cacheline. */ - IB_DEVICE_PCI_WRITE_END_PADDING = (1ULL << 36), - IB_DEVICE_ALLOW_USER_UNREG = (1ULL << 37), + IB_DEVICE_PCI_WRITE_END_PADDING = + IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, +}; + +enum ib_kernel_cap_flags { + /* + * This device supports a per-device lkey or stag that can be + * used without performing a memory registration for the local + * memory. Note that ULPs should never check this flag, but + * instead of use the local_dma_lkey flag in the ib_pd structure, + * which will always contain a usable lkey. + */ + IBK_LOCAL_DMA_LKEY = 1 << 0, + /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */ + IBK_INTEGRITY_HANDOVER = 1 << 1, + /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */ + IBK_ON_DEMAND_PAGING = 1 << 2, + /* IB_MR_TYPE_SG_GAPS is supported */ + IBK_SG_GAPS_REG = 1 << 3, + /* Driver supports RDMA_NLDEV_CMD_DELLINK */ + IBK_ALLOW_USER_UNREG = 1 << 4, + + /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */ + IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5, + /* iopib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */ + IBK_UD_TSO = 1 << 6, + /* iopib will use the device ops: + * get_vf_config + * get_vf_guid + * get_vf_stats + * set_vf_guid + * set_vf_link_state + */ + IBK_VIRTUAL_FUNCTION = 1 << 7, + /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */ + IBK_RDMA_NETDEV_OPA = 1 << 8, }; enum ib_atomic_cap { @@ -389,6 +402,7 @@ struct ib_device_attr { int max_qp; int max_qp_wr; u64 device_cap_flags; + u64 kernel_cap_flags; int max_send_sge; int max_recv_sge; int max_sge_rd; @@ -564,7 +578,7 @@ struct rdma_stat_desc { /** * struct rdma_hw_stats * @lock - Mutex to protect parallel write access to lifespan and values - * of counters, which are 64bits and not guaranteeed to be written + * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. * @timestamp - Used by the core code to track when the last update was * @lifespan - Used by the core code to determine how old the counters @@ -1621,19 +1635,23 @@ struct ib_srq { }; enum ib_raw_packet_caps { - /* Strip cvlan from incoming packet and report it in the matching work + /* + * Strip cvlan from incoming packet and report it in the matching work * completion is supported. */ - IB_RAW_PACKET_CAP_CVLAN_STRIPPING = (1 << 0), - /* Scatter FCS field of an incoming packet to host memory is supported. + IB_RAW_PACKET_CAP_CVLAN_STRIPPING = + IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING, + /* + * Scatter FCS field of an incoming packet to host memory is supported. */ - IB_RAW_PACKET_CAP_SCATTER_FCS = (1 << 1), + IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS, /* Checksum offloads are supported (for both send and receive). */ - IB_RAW_PACKET_CAP_IP_CSUM = (1 << 2), - /* When a packet is received for an RQ with no receive WQEs, the + IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM, + /* + * When a packet is received for an RQ with no receive WQEs, the * packet processing is delayed. */ - IB_RAW_PACKET_CAP_DELAY_DROP = (1 << 3), + IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP, }; enum ib_wq_type { @@ -4304,7 +4322,7 @@ static inline int ib_check_mr_access(struct ib_device *ib_dev, return -EINVAL; if (flags & IB_ACCESS_ON_DEMAND && - !(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EINVAL; return 0; } diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index cbe3c2811455..f3d5377b217a 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -90,8 +90,7 @@ struct opa_vnic_stats { static inline bool rdma_cap_opa_vnic(struct ib_device *device) { - return !!(device->attrs.device_cap_flags & - IB_DEVICE_RDMA_NETDEV_OPA); + return !!(device->attrs.kernel_cap_flags & IBK_RDMA_NETDEV_OPA); } #endif /* _OPA_VNIC_H */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 3995c58a1c51..b61d9c90fa26 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -2015,19 +2015,17 @@ DECLARE_EVENT_CLASS(svc_deferred_event, TP_STRUCT__entry( __field(const void *, dr) __field(u32, xid) - __array(__u8, addr, INET6_ADDRSTRLEN + 10) + __sockaddr(addr, dr->addrlen) ), TP_fast_assign( __entry->dr = dr; - __entry->xid = be32_to_cpu(*(__be32 *)(dr->args + - (dr->xprt_hlen>>2))); - snprintf(__entry->addr, sizeof(__entry->addr) - 1, - "%pISpc", (struct sockaddr *)&dr->addr); + __entry->xid = be32_to_cpu(*(__be32 *)dr->args); + __assign_sockaddr(addr, &dr->addr, dr->addrlen); ), - TP_printk("addr=%s dr=%p xid=0x%08x", __entry->addr, __entry->dr, - __entry->xid) + TP_printk("addr=%pISpc dr=%p xid=0x%08x", __get_sockaddr(addr), + __entry->dr, __entry->xid) ); #define DEFINE_SVC_DEFERRED_EVENT(name) \ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 7ee73a0652f1..7dd903d932e5 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -1298,4 +1298,46 @@ struct ib_uverbs_ex_modify_cq { #define IB_DEVICE_NAME_MAX 64 +/* + * bits 9, 15, 16, 19, 22, 27, 30, 31, 32, 33, 35 and 37 may be set by old + * kernels and should not be used. + */ +enum ib_uverbs_device_cap_flags { + IB_UVERBS_DEVICE_RESIZE_MAX_WR = 1 << 0, + IB_UVERBS_DEVICE_BAD_PKEY_CNTR = 1 << 1, + IB_UVERBS_DEVICE_BAD_QKEY_CNTR = 1 << 2, + IB_UVERBS_DEVICE_RAW_MULTI = 1 << 3, + IB_UVERBS_DEVICE_AUTO_PATH_MIG = 1 << 4, + IB_UVERBS_DEVICE_CHANGE_PHY_PORT = 1 << 5, + IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, + IB_UVERBS_DEVICE_CURR_QP_STATE_MOD = 1 << 7, + IB_UVERBS_DEVICE_SHUTDOWN_PORT = 1 << 8, + /* IB_UVERBS_DEVICE_INIT_TYPE = 1 << 9, (not in use) */ + IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, + IB_UVERBS_DEVICE_SYS_IMAGE_GUID = 1 << 11, + IB_UVERBS_DEVICE_RC_RNR_NAK_GEN = 1 << 12, + IB_UVERBS_DEVICE_SRQ_RESIZE = 1 << 13, + IB_UVERBS_DEVICE_N_NOTIFY_CQ = 1 << 14, + IB_UVERBS_DEVICE_MEM_WINDOW = 1 << 17, + IB_UVERBS_DEVICE_UD_IP_CSUM = 1 << 18, + IB_UVERBS_DEVICE_XRC = 1 << 20, + IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS = 1 << 21, + IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A = 1 << 23, + IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B = 1 << 24, + IB_UVERBS_DEVICE_RC_IP_CSUM = 1 << 25, + /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_IP_CSUM. */ + IB_UVERBS_DEVICE_RAW_IP_CSUM = 1 << 26, + IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING = 1 << 29, + /* Deprecated. Please use IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS. */ + IB_UVERBS_DEVICE_RAW_SCATTER_FCS = 1ULL << 34, + IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING = 1ULL << 36, +}; + +enum ib_uverbs_raw_packet_caps { + IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING = 1 << 0, + IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS = 1 << 1, + IB_UVERBS_RAW_PACKET_CAP_IP_CSUM = 1 << 2, + IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP = 1 << 3, +}; + #endif /* IB_USER_VERBS_H */ diff --git a/net/rds/ib.c b/net/rds/ib.c index 24c9a9005a6f..9826fe7f9d00 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -154,8 +154,8 @@ static int rds_ib_add_one(struct ib_device *device) rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); rds_ibdev->odp_capable = - !!(device->attrs.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING) && + !!(device->attrs.kernel_cap_flags & + IBK_ON_DEMAND_PAGING) && !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_WRITE) && !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index c2ba9d4cd2c7..bcd74dddbe2d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -900,7 +900,7 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g * rejecting the server-computed MIC in this somewhat rare case, * do not use splice with the GSS integrity service. */ - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Did we already verify the signature on the original pass through? */ if (rqstp->rq_deferred) @@ -972,7 +972,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs int pad, remaining_len, offset; u32 rseqno; - clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index bb1177395b99..c3c693b51c94 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -33,7 +33,9 @@ #include #include #include + #include "netns.h" +#include "fail.h" #define RPCDBG_FACILITY RPCDBG_CACHE @@ -688,16 +690,30 @@ static void cache_limit_defers(void) discard->revisit(discard, 1); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static inline bool cache_defer_immediately(void) +{ + return !fail_sunrpc.ignore_cache_wait && + should_fail(&fail_sunrpc.attr, 1); +} +#else +static inline bool cache_defer_immediately(void) +{ + return false; +} +#endif + /* Return true if and only if a deferred request is queued. */ static bool cache_defer_req(struct cache_req *req, struct cache_head *item) { struct cache_deferred_req *dreq; - if (req->thread_wait) { + if (!cache_defer_immediately()) { cache_wait_req(req, item); if (!test_bit(CACHE_PENDING, &item->flags)) return false; } + dreq = req->defer(req); if (dreq == NULL) return false; diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 7dc9cc929bfd..a176d5a0b0ee 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -262,6 +262,9 @@ static void fail_sunrpc_init(void) debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir, &fail_sunrpc.ignore_server_disconnect); + + debugfs_create_bool("ignore-cache-wait", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_cache_wait); } #else static void fail_sunrpc_init(void) diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h index 69dc30cc44b8..4b4b500df428 100644 --- a/net/sunrpc/fail.h +++ b/net/sunrpc/fail.h @@ -14,8 +14,8 @@ struct fail_sunrpc_attr { struct fault_attr attr; bool ignore_client_disconnect; - bool ignore_server_disconnect; + bool ignore_cache_wait; }; extern struct fail_sunrpc_attr fail_sunrpc; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 557004017548..7c9a0d0b1230 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -356,15 +356,21 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx) } } -/* - * Use the mapping mode to choose a pool for a given CPU. - * Used when enqueueing an incoming RPC. Always returns - * a non-NULL pool pointer. +/** + * svc_pool_for_cpu - Select pool to run a thread on this cpu + * @serv: An RPC service + * + * Use the active CPU and the svc_pool_map's mode setting to + * select the svc thread pool to use. Once initialized, the + * svc_pool_map does not change. + * + * Return value: + * A pointer to an svc_pool */ -struct svc_pool * -svc_pool_for_cpu(struct svc_serv *serv, int cpu) +struct svc_pool *svc_pool_for_cpu(struct svc_serv *serv) { struct svc_pool_map *m = &svc_pool_map; + int cpu = raw_smp_processor_id(); unsigned int pidx = 0; if (serv->sv_nrpools <= 1) @@ -1238,10 +1244,10 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off by GSS integrity and privacy services */ - set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); + __set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ - set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - clear_bit(RQ_DROPME, &rqstp->rq_flags); + __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); + __clear_bit(RQ_DROPME, &rqstp->rq_flags); svc_putu32(resv, rqstp->rq_xid); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5b59e2103526..2c4dd7ca95b0 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -448,7 +448,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; struct svc_rqst *rqstp = NULL; - int cpu; if (!svc_xprt_ready(xprt)) return; @@ -461,8 +460,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) return; - cpu = get_cpu(); - pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + pool = svc_pool_for_cpu(xprt->xpt_server); atomic_long_inc(&pool->sp_stats.packets); @@ -485,7 +483,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) rqstp = NULL; out_unlock: rcu_read_unlock(); - put_cpu(); trace_svc_xprt_enqueue(xprt, rqstp); } EXPORT_SYMBOL_GPL(svc_xprt_enqueue); @@ -1230,7 +1227,6 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->addrlen = rqstp->rq_addrlen; dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; - dr->xprt_hlen = rqstp->rq_xprt_hlen; dr->xprt_ctxt = rqstp->rq_xprt_ctxt; rqstp->rq_xprt_ctxt = NULL; @@ -1242,7 +1238,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; - set_bit(RQ_DROPME, &rqstp->rq_flags); + __set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; return &dr->handle; @@ -1258,22 +1254,21 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) trace_svc_defer_recv(dr); /* setup iov_base past transport header */ - rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); + rqstp->rq_arg.head[0].iov_base = dr->args; /* The iov_len does not include the transport header bytes */ - rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; + rqstp->rq_arg.head[0].iov_len = dr->argslen << 2; rqstp->rq_arg.page_len = 0; /* The rq_arg.len includes the transport header bytes */ - rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_arg.len = dr->argslen << 2; rqstp->rq_prot = dr->prot; memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); rqstp->rq_addrlen = dr->addrlen; /* Save off transport header len in case we get deferred again */ - rqstp->rq_xprt_hlen = dr->xprt_hlen; rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; svc_xprt_received(rqstp->rq_xprt); - return (dr->argslen<<2) - dr->xprt_hlen; + return dr->argslen << 2; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 45336e68bf79..2fc98fea59b4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -117,15 +117,6 @@ static void svc_reclassify_socket(struct socket *sock) */ static void svc_tcp_release_rqst(struct svc_rqst *rqstp) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; - - if (skb) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - - rqstp->rq_xprt_ctxt = NULL; - skb_free_datagram_locked(svsk->sk_sk, skb); - } } /** @@ -259,8 +250,6 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, ssize_t len; size_t t; - rqstp->rq_xprt_hlen = 0; - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) { @@ -309,9 +298,9 @@ static void svc_sock_setbufsize(struct svc_sock *svsk, unsigned int nreqs) static void svc_sock_secure_port(struct svc_rqst *rqstp) { if (svc_port_is_privileged(svc_addr(rqstp))) - set_bit(RQ_SECURE, &rqstp->rq_flags); + __set_bit(RQ_SECURE, &rqstp->rq_flags); else - clear_bit(RQ_SECURE, &rqstp->rq_flags); + __clear_bit(RQ_SECURE, &rqstp->rq_flags); } /* @@ -1019,9 +1008,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags)) - set_bit(RQ_LOCAL, &rqstp->rq_flags); + __set_bit(RQ_LOCAL, &rqstp->rq_flags); else - clear_bit(RQ_LOCAL, &rqstp->rq_flags); + __clear_bit(RQ_LOCAL, &rqstp->rq_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 3fcd8e1b2550..de0bdb6b729f 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -195,7 +195,7 @@ int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) ep->re_attr.cap.max_recv_sge = 1; ep->re_mrtype = IB_MR_TYPE_MEM_REG; - if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) + if (attrs->kernel_cap_flags & IBK_SG_GAPS_REG) ep->re_mrtype = IB_MR_TYPE_SG_GAPS; /* Quirk: Some devices advertise a large max_fast_reg_page_list_len diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 864131a9fc6e..5242ad121450 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -831,7 +831,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_err; if (ret == 0) goto out_drop; - rqstp->rq_xprt_hlen = 0; if (svc_rdma_is_reverse_direction_reply(xprt, ctxt)) goto out_backchannel; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 94b20fb47135..199fa012f18a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -602,7 +602,7 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) static void svc_rdma_secure_port(struct svc_rqst *rqstp) { - set_bit(RQ_SECURE, &rqstp->rq_flags); + __set_bit(RQ_SECURE, &rqstp->rq_flags); } static void svc_rdma_kill_temp_xprt(struct svc_xprt *xprt) diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile index 6f0aecad5d67..b34d11e22636 100644 --- a/scripts/gcc-plugins/Makefile +++ b/scripts/gcc-plugins/Makefile @@ -28,7 +28,7 @@ GCC_PLUGINS_DIR = $(shell $(CC) -print-file-name=plugin) plugin_cxxflags = -Wp,-MMD,$(depfile) $(KBUILD_HOSTCXXFLAGS) -fPIC \ -include $(srctree)/include/linux/compiler-version.h \ - -include $(objtree)/include/generated/utsrelease.h \ + -DPLUGIN_VERSION=$(call stringify,$(KERNELVERSION)) \ -I $(GCC_PLUGINS_DIR)/include -I $(obj) -std=gnu++11 \ -fno-rtti -fno-exceptions -fasynchronous-unwind-tables \ -ggdb -Wno-narrowing -Wno-unused-variable \ diff --git a/scripts/gcc-plugins/latent_entropy_plugin.c b/scripts/gcc-plugins/latent_entropy_plugin.c index 5d415b2572a8..848918764174 100644 --- a/scripts/gcc-plugins/latent_entropy_plugin.c +++ b/scripts/gcc-plugins/latent_entropy_plugin.c @@ -82,7 +82,7 @@ __visible int plugin_is_GPL_compatible; static GTY(()) tree latent_entropy_decl; static struct plugin_info latent_entropy_plugin_info = { - .version = UTS_RELEASE, + .version = PLUGIN_VERSION, .help = "disable\tturn off latent entropy instrumentation\n", }; diff --git a/scripts/gcc-plugins/randomize_layout_plugin.c b/scripts/gcc-plugins/randomize_layout_plugin.c index ea2aea570404..951b74ba1b24 100644 --- a/scripts/gcc-plugins/randomize_layout_plugin.c +++ b/scripts/gcc-plugins/randomize_layout_plugin.c @@ -34,7 +34,7 @@ __visible int plugin_is_GPL_compatible; static int performance_mode; static struct plugin_info randomize_layout_plugin_info = { - .version = UTS_RELEASE, + .version = PLUGIN_VERSION, .help = "disable\t\t\tdo not activate plugin\n" "performance-mode\tenable cacheline-aware layout randomization\n" }; diff --git a/scripts/gcc-plugins/sancov_plugin.c b/scripts/gcc-plugins/sancov_plugin.c index f3d629555b84..b76cb9c42cec 100644 --- a/scripts/gcc-plugins/sancov_plugin.c +++ b/scripts/gcc-plugins/sancov_plugin.c @@ -26,7 +26,7 @@ __visible int plugin_is_GPL_compatible; tree sancov_fndecl; static struct plugin_info sancov_plugin_info = { - .version = UTS_RELEASE, + .version = PLUGIN_VERSION, .help = "sancov plugin\n", }; diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index de817d54b8af..ff91885f9470 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -44,7 +44,7 @@ static bool verbose = false; static GTY(()) tree track_function_decl; static struct plugin_info stackleak_plugin_info = { - .version = UTS_RELEASE, + .version = PLUGIN_VERSION, .help = "track-min-size=nn\ttrack stack for functions with a stack frame size >= nn bytes\n" "arch=target_arch\tspecify target build arch\n" "disable\t\tdo not activate the plugin\n" diff --git a/scripts/gcc-plugins/structleak_plugin.c b/scripts/gcc-plugins/structleak_plugin.c index 86b608a24ec0..8bc04068ed39 100644 --- a/scripts/gcc-plugins/structleak_plugin.c +++ b/scripts/gcc-plugins/structleak_plugin.c @@ -37,7 +37,7 @@ __visible int plugin_is_GPL_compatible; static struct plugin_info structleak_plugin_info = { - .version = UTS_RELEASE, + .version = PLUGIN_VERSION, .help = "disable\tdo not activate plugin\n" "byref\tinit structs passed by reference\n" "byref-all\tinit anything passed by reference\n"