From b410ed2a8572d41c68bd9208555610e4b07d0703 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 8 Mar 2021 17:11:43 +0200 Subject: [PATCH 01/51] perf auxtrace: Fix auxtrace queue conflict The only requirement of an auxtrace queue is that the buffers are in time order. That is achieved by making separate queues for separate perf buffer or AUX area buffer mmaps. That generally means a separate queue per cpu for per-cpu contexts, and a separate queue per thread for per-task contexts. When buffers are added to a queue, perf checks that the buffer cpu and thread id (tid) match the queue cpu and thread id. However, generally, that need not be true, and perf will queue buffers correctly anyway, so the check is not needed. In addition, the check gets erroneously hit when using sample mode to trace multiple threads. Consequently, fix that case by removing the check. Fixes: e502789302a6 ("perf auxtrace: Add helpers for queuing AUX area tracing data") Reported-by: Andi Kleen Signed-off-by: Adrian Hunter Reviewed-by: Andi Kleen Cc: Jiri Olsa Link: http://lore.kernel.org/lkml/20210308151143.18338-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/auxtrace.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 953f4afacd3b..5b6ccb90b397 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -298,10 +298,6 @@ static int auxtrace_queues__queue_buffer(struct auxtrace_queues *queues, queue->set = true; queue->tid = buffer->tid; queue->cpu = buffer->cpu; - } else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) { - pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n", - queue->cpu, queue->tid, buffer->cpu, buffer->tid); - return -EINVAL; } buffer->buffer_nr = queues->next_buffer_nr++; From c3d59cfde9cc1fa699eb6bf0d3ce4156354e3a98 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 9 Mar 2021 12:04:47 +0100 Subject: [PATCH 02/51] perf synthetic-events: Fix uninitialized 'kernel_thread' variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit perf build fails on 5.12.0rc2 on s390 with this error message: util/synthetic-events.c: In function ‘__event__synthesize_thread.part.0.isra’: util/synthetic-events.c:787:19: error: ‘kernel_thread’ may be used uninitialized in this function [-Werror=maybe-uninitialized] 787 | if (_pid == pid && !kernel_thread) { | ~~~~~~~~~~~~^~~~~~~~~~~~~~~~~ The build succeeds using command 'make DEBUG=y'. The variable kernel_thread is set by this function sequence: __event__synthesize_thread() | defines bool kernel_thread; as local variable and calls +--> perf_event__prepare_comm(..., &kernel_thread) +--> perf_event__get_comm_ids(..., bool *kernel); On return of this function variable kernel is always set to true or false. To prevent this compile error, assign variable kernel_thread a value when it is defined. Output after: [root@m35lp76 perf]# make util/synthetic-events.o .... CC util/synthetic-events.o [root@m35lp76 perf]# Fixes: c1b907953b2cd9ff ("perf tools: Skip PERF_RECORD_MMAP event synthesis for kernel threads") Signed-off-by: Thomas Richter Cc: Heiko Carstens Cc: Namhyung Kim Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20210309110447.834292-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b698046ec2db..5dd451695f33 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -758,7 +758,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, for (i = 0; i < n; i++) { char *end; pid_t _pid; - bool kernel_thread; + bool kernel_thread = false; _pid = strtol(dirent[i]->d_name, &end, 10); if (*end) From 49f2675f5b4d5d0af22c963f9f6152abb1bb15aa Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Mar 2021 09:18:30 -0300 Subject: [PATCH 03/51] tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 30b5c851af7991ad ("KVM: x86/xen: Add support for vCPU runstate information") That don't cause any change in tooling as it doesn't introduce any new ioctl, just parameters to existing one. This silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: David Woodhouse Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 8b281f722e5b..f6afee209620 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1154,6 +1154,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) struct kvm_xen_hvm_config { __u32 flags; @@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr { union { __u64 gpa; __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; } u; }; /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { From 2a76f6de07906f0bb5f2a13fb02845db1695cc29 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 9 Mar 2021 15:49:45 -0800 Subject: [PATCH 04/51] perf synthetic events: Avoid write of uninitialized memory when generating PERF_RECORD_MMAP* records Account for alignment bytes in the zero-ing memset. Fixes: 1a853e36871b533c ("perf record: Allow specifying a pid to record") Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Ingo Molnar Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210309234945.419254-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/synthetic-events.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index 5dd451695f33..dff178103ce5 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -424,7 +424,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, while (!io.eof) { static const char anonstr[] = "//anon"; - size_t size; + size_t size, aligned_size; /* ensure null termination since stack will be reused. */ event->mmap2.filename[0] = '\0'; @@ -484,11 +484,12 @@ out: } size = strlen(event->mmap2.filename) + 1; - size = PERF_ALIGN(size, sizeof(u64)); + aligned_size = PERF_ALIGN(size, sizeof(u64)); event->mmap2.len -= event->mmap.start; event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); + (sizeof(event->mmap2.filename) - aligned_size)); + memset(event->mmap2.filename + size, 0, machine->id_hdr_size + + (aligned_size - size)); event->mmap2.header.size += machine->id_hdr_size; event->mmap2.pid = tgid; event->mmap2.tid = pid; From e40647762fb5881360874e08e03e972d58d63c42 Mon Sep 17 00:00:00 2001 From: Jin Yao Date: Wed, 10 Mar 2021 13:11:38 +0800 Subject: [PATCH 05/51] perf pmu: Validate raw event with sysfs exported format bits A raw PMU event (eventsel+umask) in the form of rNNN is supported by perf but lacks of checking for the validity of raw encoding. For example, bit 16 and bit 17 are not valid on KBL but perf doesn't report warning when encoding with these bits. Before: # ./perf stat -e cpu/r031234/ -a -- sleep 1 Performance counter stats for 'system wide': 0 cpu/r031234/ 1.003798924 seconds time elapsed It may silently measure the wrong event! The kernel supported bits have been exported through /sys/devices//format/. Perf collects the information to 'struct perf_pmu_format' and links it to 'pmu->format' list. The 'struct perf_pmu_format' has a bitmap which records the valid bits for this format. For example, root@kbl-ppc:/sys/devices/cpu/format# cat umask config:8-15 The valid bits (bit8-bit15) are recorded in bitmap of format 'umask'. We collect total valid bits of all formats, save to a local variable 'masks' and reverse it. Now '~masks' represents total invalid bits. bits = config & ~masks; The set bits in 'bits' indicate the invalid bits used in config. Finally we use bitmap_scnprintf to report the invalid bits. Some architectures may not export supported bits through sysfs, so if masks is 0, perf_pmu__warn_invalid_config directly returns. After: Single event without name: # ./perf stat -e cpu/r031234/ -a -- sleep 1 WARNING: event 'N/A' not valid (bits 16-17 of config '31234' not supported by kernel)! Performance counter stats for 'system wide': 0 cpu/r031234/ 1.001597373 seconds time elapsed Multiple events with names: # ./perf stat -e cpu/rf01234,name=aaa/,cpu/r031234,name=bbb/ -a -- sleep 1 WARNING: event 'aaa' not valid (bits 20,22 of config 'f01234' not supported by kernel)! WARNING: event 'bbb' not valid (bits 16-17 of config '31234' not supported by kernel)! Performance counter stats for 'system wide': 0 aaa 0 bbb 1.001573787 seconds time elapsed Warnings are reported for invalid bits. Co-developed-by: Jiri Olsa Signed-off-by: Jin Yao Reviewed-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Jin Yao Cc: Kan Liang Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210310051138.12154-1-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 3 +++ tools/perf/util/pmu.c | 33 +++++++++++++++++++++++++++++++++ tools/perf/util/pmu.h | 3 +++ 3 files changed, 39 insertions(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 42c84adeb2fb..c0c0fab22cb8 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -356,6 +356,9 @@ __add_event(struct list_head *list, int *idx, struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : cpu_list ? perf_cpu_map__new(cpu_list) : NULL; + if (pmu && attr->type == PERF_TYPE_RAW) + perf_pmu__warn_invalid_config(pmu, attr->config, name); + if (init_attr) event_attr_init(attr); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 44ef28302fc7..46fd0f998484 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1812,3 +1812,36 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) return nr_caps; } + +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name) +{ + struct perf_pmu_format *format; + __u64 masks = 0, bits; + char buf[100]; + unsigned int i; + + list_for_each_entry(format, &pmu->format, list) { + if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG) + continue; + + for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS) + masks |= 1ULL << i; + } + + /* + * Kernel doesn't export any valid format bits. + */ + if (masks == 0) + return; + + bits = config & ~masks; + if (bits == 0) + return; + + bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf)); + + pr_warning("WARNING: event '%s' not valid (bits %s of config " + "'%llx' not supported by kernel)!\n", + name ?: "N/A", buf, config); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 8164388478c6..160b0f561771 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -123,4 +123,7 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval); int perf_pmu__caps_parse(struct perf_pmu *pmu); +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name); + #endif /* __PMU_H */ From 1a096ae46e21b73f83a581e617f76326c1de592d Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Tue, 16 Mar 2021 09:24:53 +0800 Subject: [PATCH 06/51] perf top: Fix BPF support related crash with perf_event_paranoid=3 + kptr_restrict After installing the libelf-dev package and compiling perf, if we have kptr_restrict=2 and perf_event_paranoid=3 'perf top' will crash because the value of /proc/kallsyms cannot be obtained, which leads to info->jited_ksyms == NULL. In order to solve this problem, Add a check before use. Also plug some leaks on the error path. Suggested-by: Jiri Olsa Signed-off-by: Jackie Liu Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: jackie liu Link: http://lore.kernel.org/lkml/20210316012453.1156-1-liuyun01@kylinos.cn Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/bpf-event.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 57d58c81a5f8..cdecda1ddd36 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -196,25 +196,32 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, } if (info_linear->info_len < offsetof(struct bpf_prog_info, prog_tags)) { + free(info_linear); pr_debug("%s: the kernel is too old, aborting\n", __func__); return -2; } info = &info_linear->info; + if (!info->jited_ksyms) { + free(info_linear); + return -1; + } /* number of ksyms, func_lengths, and tags should match */ sub_prog_cnt = info->nr_jited_ksyms; if (sub_prog_cnt != info->nr_prog_tags || - sub_prog_cnt != info->nr_jited_func_lens) + sub_prog_cnt != info->nr_jited_func_lens) { + free(info_linear); return -1; + } /* check BTF func info support */ if (info->btf_id && info->nr_func_info && info->func_info_rec_size) { /* btf func info number should be same as sub_prog_cnt */ if (sub_prog_cnt != info->nr_func_info) { pr_debug("%s: mismatch in BPF sub program count and BTF function info count, aborting\n", __func__); - err = -1; - goto out; + free(info_linear); + return -1; } if (btf__get_from_id(info->btf_id, &btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); From 701454bce906241ba7f50e2773881560d6404d29 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 16 Feb 2021 19:27:14 +0100 Subject: [PATCH 07/51] auxdisplay: Remove in_interrupt() usage. charlcd_write() is invoked as a VFS->write() callback and as such it is always invoked from preemptible context and may sleep. charlcd_puts() is invoked from register/unregister callback which is preemptible. The reboot notifier callback is also invoked from preemptible context. Therefore there is no need to use in_interrupt() to figure out if it is safe to sleep because it always is. in_interrupt() and related context checks are being removed from non-core code. Using schedule() to schedule (and be friendly to others) is discouraged and cond_resched() should be used instead. Remove in_interrupt() and use cond_resched() to schedule every 32 iterations if needed. Link: https://lkml.kernel.org/r/20200914204209.256266093@linutronix.de Signed-off-by: Sebastian Andrzej Siewior [mo: fixed a couple typos in comment and commit message] Signed-off-by: Miguel Ojeda --- drivers/auxdisplay/charlcd.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index f43430e9dcee..24fd6f369ebe 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -470,12 +470,14 @@ static ssize_t charlcd_write(struct file *file, const char __user *buf, char c; for (; count-- > 0; (*ppos)++, tmp++) { - if (!in_interrupt() && (((count + 1) & 0x1f) == 0)) + if (((count + 1) & 0x1f) == 0) { /* - * let's be a little nice with other processes - * that need some CPU + * charlcd_write() is invoked as a VFS->write() callback + * and as such it is always invoked from preemptible + * context and may sleep. */ - schedule(); + cond_resched(); + } if (get_user(c, tmp)) return -EFAULT; @@ -537,12 +539,8 @@ static void charlcd_puts(struct charlcd *lcd, const char *s) int count = strlen(s); for (; count-- > 0; tmp++) { - if (!in_interrupt() && (((count + 1) & 0x1f) == 0)) - /* - * let's be a little nice with other processes - * that need some CPU - */ - schedule(); + if (((count + 1) & 0x1f) == 0) + cond_resched(); charlcd_write_char(lcd, *tmp); } From 0b8cfa974dfc964e6382c9e25fa6c1bdac6ef499 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 21 Mar 2021 14:16:08 -0600 Subject: [PATCH 08/51] io_uring: don't use {test,clear}_tsk_thread_flag() for current Linus correctly points out that this is both unnecessary and generates much worse code on some archs as going from current to thread_info is actually backwards - and obviously just wasteful, since the thread_info is what we care about. Since io_uring only operates on current for these operations, just use test_thread_flag() instead. For io-wq, we can further simplify and use tracehook_notify_signal() to handle the TIF_NOTIFY_SIGNAL work and clear the flag. The latter isn't an actual bug right now, but it may very well be in the future if we place other work items under TIF_NOTIFY_SIGNAL. Reported-by: Linus Torvalds Link: https://lore.kernel.org/io-uring/CAHk-=wgYhNck33YHKZ14mFB5MzTTk8gqXHcfj=RWTAXKwgQJgg@mail.gmail.com/ Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 ++---- fs/io_uring.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 3dc10bfd8c3b..2dd43bdb9c7b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -388,11 +388,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) static bool io_flush_signals(void) { - if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) { + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { __set_current_state(TASK_RUNNING); - if (current->task_works) - task_work_run(); - clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL); + tracehook_notify_signal(); return true; } return false; diff --git a/fs/io_uring.c b/fs/io_uring.c index 543551d70327..be04bc6b5b99 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6873,7 +6873,7 @@ static int io_run_task_work_sig(void) return 1; if (!signal_pending(current)) return 0; - if (test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL)) + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) return -ERESTARTSYS; return -EINTR; } From d07f1e8a42614cc938c9c88866d4474a5a7fee31 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:45:58 +0000 Subject: [PATCH 09/51] io_uring: correct io_queue_async_work() traces Request's io-wq work is hashed in io_prep_async_link(), so as trace_io_uring_queue_async_work() looks at it should follow after prep has been done. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/709c9f872f4d2e198c7aed9c49019ca7095dd24d.1616366969.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index be04bc6b5b99..50acd6dfd966 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1239,10 +1239,10 @@ static void io_queue_async_work(struct io_kiocb *req) BUG_ON(!tctx); BUG_ON(!tctx->io_wq); - trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, - &req->work, req->flags); /* init ->work of the whole link before punting */ io_prep_async_link(req); + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, + &req->work, req->flags); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); From b65c128f963df367a8adcfb08f5ecf8721052723 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 22 Mar 2021 01:45:59 +0000 Subject: [PATCH 10/51] io_uring: don't skip file_end_write() on reissue Don't miss to call kiocb_end_write() from __io_complete_rw() on reissue. Shouldn't be much of a problem as the function actually does some work only for ISREG, and NONBLOCK won't be reissued. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/32af9b77c5b874e1bee1a3c46396094bd969e577.1616366969.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 50acd6dfd966..a664b44a5a94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2524,13 +2524,12 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, { int cflags = 0; + if (req->rw.kiocb.ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_reissue(req)) return; if (res != req->result) req_set_fail_links(req); - - if (req->rw.kiocb.ki_flags & IOCB_WRITE) - kiocb_end_write(req); if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); __io_req_complete(req, issue_flags, res, cflags); From d81269fecb8ce16eb07efafc9ff5520b2a31c486 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Mar 2021 10:21:19 +0000 Subject: [PATCH 11/51] io_uring: fix provide_buffers sign extension io_provide_buffers_prep()'s "p->len * p->nbufs" to sign extension problems. Not a huge problem as it's only used for access_ok() and increases the checked length, but better to keep typing right. Reported-by: Colin Ian King Fixes: efe68c1ca8f49 ("io_uring: validate the full range of provided buffers for access") Signed-off-by: Pavel Begunkov Reviewed-by: Colin Ian King Link: https://lore.kernel.org/r/562376a39509e260d8532186a06226e56eb1f594.1616149233.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a664b44a5a94..f3ae83a2d7bc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -3977,6 +3977,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + unsigned long size; struct io_provide_buf *p = &req->pbuf; u64 tmp; @@ -3990,7 +3991,8 @@ static int io_provide_buffers_prep(struct io_kiocb *req, p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); - if (!access_ok(u64_to_user_ptr(p->addr), (p->len * p->nbufs))) + size = (unsigned long)p->len * p->nbufs; + if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; p->bgid = READ_ONCE(sqe->buf_group); From 8249d17d3194eac064a8ca5bc5ca0abc86feecde Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 18 Mar 2021 13:26:57 -0700 Subject: [PATCH 12/51] x86/mem_encrypt: Correct physical address calculation in __set_clr_pte_enc() The pfn variable contains the page frame number as returned by the pXX_pfn() functions, shifted to the right by PAGE_SHIFT to remove the page bits. After page protection computations are done to it, it gets shifted back to the physical address using page_level_shift(). That is wrong, of course, because that function determines the shift length based on the level of the page in the page table but in all the cases, it was shifted by PAGE_SHIFT before. Therefore, shift it back using PAGE_SHIFT to get the correct physical address. [ bp: Rewrite commit message. ] Fixes: dfaaec9033b8 ("x86: Add support for changing memory encryption attribute in early boot") Signed-off-by: Isaku Yamahata Signed-off-by: Borislav Petkov Reviewed-by: Kirill A. Shutemov Reviewed-by: Tom Lendacky Cc: Link: https://lkml.kernel.org/r/81abbae1657053eccc535c16151f63cd049dcb97.1616098294.git.isaku.yamahata@intel.com --- arch/x86/mm/mem_encrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 4b01f7dbaf30..ae78cef79980 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -262,7 +262,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) if (pgprot_val(old_prot) == pgprot_val(new_prot)) return; - pa = pfn << page_level_shift(level); + pa = pfn << PAGE_SHIFT; size = page_level_size(level); /* From 291da9d4a9eb3a1cb0610b7f4480f5b52b1825e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Mar 2021 09:46:13 +0100 Subject: [PATCH 13/51] locking/mutex: Fix non debug version of mutex_lock_io_nested() If CONFIG_DEBUG_LOCK_ALLOC=n then mutex_lock_io_nested() maps to mutex_lock() which is clearly wrong because mutex_lock() lacks the io_schedule_prepare()/finish() invocations. Map it to mutex_lock_io(). Fixes: f21860bac05b ("locking/mutex, sched/wait: Fix the mutex_lock_io_nested() define") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/878s6fshii.fsf@nanos.tec.linutronix.de --- include/linux/mutex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 0cd631a19727..515cff77a4f4 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -185,7 +185,7 @@ extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) -# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock) +# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock) #endif /* From 9fcb51c14da2953de585c5c6e50697b8a6e91a7b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2021 13:48:36 +0100 Subject: [PATCH 14/51] x86/build: Turn off -fcf-protection for realmode targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new Ubuntu GCC packages turn on -fcf-protection globally, which causes a build failure in the x86 realmode code: cc1: error: ‘-fcf-protection’ is not compatible with this target Turn it off explicitly on compilers that understand this option. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210323124846.1584944-1-arnd@kernel.org --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d6d5a28c3bf..9a85eae37b17 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,7 +27,7 @@ endif REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += -ffreestanding REALMODE_CFLAGS += -fno-stack-protector From 5116784039f0421e9a619023cfba3e302c3d9adc Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Tue, 23 Mar 2021 16:52:19 +0800 Subject: [PATCH 15/51] block: clear GD_NEED_PART_SCAN later in bdev_disk_changed The GD_NEED_PART_SCAN is set by bdev_check_media_change to initiate a partition scan while removing a block device. It should be cleared after blk_drop_paritions because blk_drop_paritions could return -EBUSY and then the consequence __blkdev_get has no chance to do delete_partition if GD_NEED_PART_SCAN already cleared. It causes some problems on some card readers. Ex. Realtek card reader 0bda:0328 and 0bda:0158. The device node of the partition will not disappear after the memory card removed. Thus the user applications can not update the device mapping correctly. BugLink: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1920874 Signed-off-by: Chris Chiu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210323085219.24428-1-chris.chiu@canonical.com Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 92ed7d5df677..28d583fcdc2c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1240,13 +1240,13 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); - clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - rescan: ret = blk_drop_partitions(bdev); if (ret) return ret; + clear_bit(GD_NEED_PART_SCAN, &disk->state); + /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). From a958937ff166fc60d1c3a721036f6ff41bfa2821 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Thu, 11 Feb 2021 09:38:07 -0500 Subject: [PATCH 16/51] block: recalculate segment count for multi-segment discards correctly When a stacked block device inserts a request into another block device using blk_insert_cloned_request, the request's nr_phys_segments field gets recalculated by a call to blk_recalc_rq_segments in blk_cloned_rq_check_limits. But blk_recalc_rq_segments does not know how to handle multi-segment discards. For disk types which can handle multi-segment discards like nvme, this results in discard requests which claim a single segment when it should report several, triggering a warning in nvme and causing nvme to fail the discard from the invalid state. WARNING: CPU: 5 PID: 191 at drivers/nvme/host/core.c:700 nvme_setup_discard+0x170/0x1e0 [nvme_core] ... nvme_setup_cmd+0x217/0x270 [nvme_core] nvme_loop_queue_rq+0x51/0x1b0 [nvme_loop] __blk_mq_try_issue_directly+0xe7/0x1b0 blk_mq_request_issue_directly+0x41/0x70 ? blk_account_io_start+0x40/0x50 dm_mq_queue_rq+0x200/0x3e0 blk_mq_dispatch_rq_list+0x10a/0x7d0 ? __sbitmap_queue_get+0x25/0x90 ? elv_rb_del+0x1f/0x30 ? deadline_remove_request+0x55/0xb0 ? dd_dispatch_request+0x181/0x210 __blk_mq_do_dispatch_sched+0x144/0x290 ? bio_attempt_discard_merge+0x134/0x1f0 __blk_mq_sched_dispatch_requests+0x129/0x180 blk_mq_sched_dispatch_requests+0x30/0x60 __blk_mq_run_hw_queue+0x47/0xe0 __blk_mq_delay_run_hw_queue+0x15b/0x170 blk_mq_sched_insert_requests+0x68/0xe0 blk_mq_flush_plug_list+0xf0/0x170 blk_finish_plug+0x36/0x50 xlog_cil_committed+0x19f/0x290 [xfs] xlog_cil_process_committed+0x57/0x80 [xfs] xlog_state_do_callback+0x1e0/0x2a0 [xfs] xlog_ioend_work+0x2f/0x80 [xfs] process_one_work+0x1b6/0x350 worker_thread+0x53/0x3e0 ? process_one_work+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 This patch fixes blk_recalc_rq_segments to be aware of devices which can have multi-segment discards. It calculates the correct discard segment count by counting the number of bio as each discard bio is considered its own segment. Fixes: 1e739730c5b9 ("block: optionally merge discontiguous discard bios into a single request") Signed-off-by: David Jeffery Reviewed-by: Ming Lei Reviewed-by: Laurence Oberman Link: https://lore.kernel.org/r/20210211143807.GA115624@redhat Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index ffb4aa0ea68b..4d97fb6dd226 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -382,6 +382,14 @@ unsigned int blk_recalc_rq_segments(struct request *rq) switch (bio_op(rq->bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + if (queue_max_discard_segments(rq->q) > 1) { + struct bio *bio = rq->bio; + + for_each_bio(bio) + nr_phys_segs++; + return nr_phys_segs; + } + return 1; case REQ_OP_WRITE_ZEROES: return 0; case REQ_OP_WRITE_SAME: From a185f1db59f13de73aa470559030e90e50b34d93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 23 Mar 2021 10:52:38 +0000 Subject: [PATCH 17/51] io_uring: do ctx sqd ejection in a clear context WARNING: CPU: 1 PID: 27907 at fs/io_uring.c:7147 io_sq_thread_park+0xb5/0xd0 fs/io_uring.c:7147 CPU: 1 PID: 27907 Comm: iou-sqp-27905 Not tainted 5.12.0-rc4-syzkaller #0 RIP: 0010:io_sq_thread_park+0xb5/0xd0 fs/io_uring.c:7147 Call Trace: io_ring_ctx_wait_and_kill+0x214/0x700 fs/io_uring.c:8619 io_uring_release+0x3e/0x50 fs/io_uring.c:8646 __fput+0x288/0x920 fs/file_table.c:280 task_work_run+0xdd/0x1a0 kernel/task_work.c:140 io_run_task_work fs/io_uring.c:2238 [inline] io_run_task_work fs/io_uring.c:2228 [inline] io_uring_try_cancel_requests+0x8ec/0xc60 fs/io_uring.c:8770 io_uring_cancel_sqpoll+0x1cf/0x290 fs/io_uring.c:8974 io_sqpoll_cancel_cb+0x87/0xb0 fs/io_uring.c:8907 io_run_task_work_head+0x58/0xb0 fs/io_uring.c:1961 io_sq_thread+0x3e2/0x18d0 fs/io_uring.c:6763 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 May happen that last ctx ref is killed in io_uring_cancel_sqpoll(), so fput callback (i.e. io_uring_release()) is enqueued through task_work, and run by same cancellation. As it's deeply nested we can't do parking or taking sqd->lock there, because its state is unclear. So avoid ctx ejection from sqd list from io_ring_ctx_wait_and_kill() and do it in a clear context in io_ring_exit_work(). Fixes: f6d54255f423 ("io_uring: halt SQO submission on ctx exit") Reported-by: syzbot+e3a3f84f5cecf61f0583@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e90df88b8ff2cabb14a7534601d35d62ab4cb8c7.1616496707.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index f3ae83a2d7bc..8c5789b96dbb 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8564,6 +8564,14 @@ static void io_ring_exit_work(struct work_struct *work) struct io_tctx_node *node; int ret; + /* prevent SQPOLL from submitting new requests */ + if (ctx->sq_data) { + io_sq_thread_park(ctx->sq_data); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(ctx->sq_data); + io_sq_thread_unpark(ctx->sq_data); + } + /* * If we're doing polled IO and end up having requests being * submitted async (out-of-line), then completions can come in while @@ -8615,14 +8623,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); mutex_unlock(&ctx->uring_lock); - /* prevent SQPOLL from submitting new requests */ - if (ctx->sq_data) { - io_sq_thread_park(ctx->sq_data); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(ctx->sq_data); - io_sq_thread_unpark(ctx->sq_data); - } - io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); From 1833b64fee1032d1f48afaa3956bc0ea6b10d5e0 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 20 Mar 2021 23:10:12 +0100 Subject: [PATCH 18/51] perf daemon: Force waipid for all session on SIGCHLD delivery If we don't process SIGCHLD before another comes, we will see just one SIGCHLD as a result. In this case current code will miss exit notification for a session and wait forever. Adding extra waitpid check for all sessions when SIGCHLD is received, to make sure we don't miss any session exit. Also fix close condition for signal_fd. Reported-by: Ian Rogers Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210320221013.1619613-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 52 +++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index ace8772a4f03..4697493842f5 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -402,35 +402,42 @@ static pid_t handle_signalfd(struct daemon *daemon) int status; pid_t pid; + /* + * Take signal fd data as pure signal notification and check all + * the sessions state. The reason is that multiple signals can get + * coalesced in kernel and we can receive only single signal even + * if multiple SIGCHLD were generated. + */ err = read(daemon->signal_fd, &si, sizeof(struct signalfd_siginfo)); - if (err != sizeof(struct signalfd_siginfo)) + if (err != sizeof(struct signalfd_siginfo)) { + pr_err("failed to read signal fd\n"); return -1; + } list_for_each_entry(session, &daemon->sessions, list) { - - if (session->pid != (int) si.ssi_pid) + if (session->pid == -1) continue; - pid = waitpid(session->pid, &status, 0); - if (pid == session->pid) { - if (WIFEXITED(status)) { - pr_info("session '%s' exited, status=%d\n", - session->name, WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { - pr_info("session '%s' killed (signal %d)\n", - session->name, WTERMSIG(status)); - } else if (WIFSTOPPED(status)) { - pr_info("session '%s' stopped (signal %d)\n", - session->name, WSTOPSIG(status)); - } else { - pr_info("session '%s' Unexpected status (0x%x)\n", - session->name, status); - } + pid = waitpid(session->pid, &status, WNOHANG); + if (pid <= 0) + continue; + + if (WIFEXITED(status)) { + pr_info("session '%s' exited, status=%d\n", + session->name, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + pr_info("session '%s' killed (signal %d)\n", + session->name, WTERMSIG(status)); + } else if (WIFSTOPPED(status)) { + pr_info("session '%s' stopped (signal %d)\n", + session->name, WSTOPSIG(status)); + } else { + pr_info("session '%s' Unexpected status (0x%x)\n", + session->name, status); } session->state = KILL; session->pid = -1; - return pid; } return 0; @@ -443,7 +450,6 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d .fd = daemon->signal_fd, .events = POLLIN, }; - pid_t wpid = 0, pid = session->pid; time_t start; start = time(NULL); @@ -452,7 +458,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d int err = poll(&pollfd, 1, 1000); if (err > 0) { - wpid = handle_signalfd(daemon); + handle_signalfd(daemon); } else if (err < 0) { perror("failed: poll\n"); return -1; @@ -460,7 +466,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d if (start + secs < time(NULL)) return -1; - } while (wpid != pid); + } while (session->pid != -1); return 0; } @@ -1344,7 +1350,7 @@ out: close(sock_fd); if (conf_fd != -1) close(conf_fd); - if (conf_fd != -1) + if (signal_fd != -1) close(signal_fd); pr_info("daemon exited\n"); From 9f177fd8f20b46bbd76dbcc90184caf3b8548a9f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 20 Mar 2021 23:10:13 +0100 Subject: [PATCH 19/51] perf daemon: Return from kill functions We should return correctly and warn in both daemon_session__kill() and daemon__kill() after we tried everything to kill sessions. The current code will keep on looping and waiting. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Ian Rogers Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210320221013.1619613-2-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-daemon.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index 4697493842f5..7c4a9d424a64 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -908,7 +908,9 @@ static void daemon_session__kill(struct daemon_session *session, daemon_session__signal(session, SIGKILL); break; default: - break; + pr_err("failed to wait for session %s\n", + session->name); + return; } how++; @@ -961,7 +963,8 @@ static void daemon__kill(struct daemon *daemon) daemon__signal(daemon, SIGKILL); break; default: - break; + pr_err("failed to wait for sessions\n"); + return; } how++; From eb8f998bbc3d51042ff290b9f6480c1886f6cfb9 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 24 Mar 2021 09:37:34 +0100 Subject: [PATCH 20/51] perf test: Remove now useless failing sub test "BPF relocation checker" For some time now the 'perf test 42: BPF filter' returns an error on bpf relocation subtest, at least on x86 and s390. This is caused by d859900c4c56dc4f ("bpf, libbpf: support global data/bss/rodata sections") which introduces support for global variables in eBPF programs. Perf test 42.4 checks that the eBPF relocation fails when the eBPF program contains a global variable. It returns OK when the eBPF program could not be loaded and FAILED otherwise. With above commit the test logic for the eBPF relocation is obsolete. The loading of the eBPF now succeeds and the test always shows FAILED. This patch removes the sub test completely. Also a lot of eBPF program testing is done in the eBPF test suite, it also contains tests for global variables. Output before: 42: BPF filter : 42.1: Basic BPF filtering : Ok 42.2: BPF pinning : Ok 42.3: BPF prologue generation : Ok 42.4: BPF relocation checker : Failed # Output after: # ./perf test -F 42 42: BPF filter : 42.1: Basic BPF filtering : Ok 42.2: BPF pinning : Ok 42.3: BPF prologue generation : Ok # Signed-off-by: Thomas Richter Suggested-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Cc: Heiko Carstens Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: http://lore.kernel.org/lkml/20210324083734.1953123-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/bpf.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index f57e075b0ed2..c72adbd67386 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -86,7 +86,7 @@ static struct { .msg_load_fail = "check your vmlinux setting?", .target_func = &epoll_pwait_loop, .expect_result = (NR_ITERS + 1) / 2, - .pin = true, + .pin = true, }, #ifdef HAVE_BPF_PROLOGUE { @@ -99,13 +99,6 @@ static struct { .expect_result = (NR_ITERS + 1) / 4, }, #endif - { - .prog_id = LLVM_TESTCASE_BPF_RELOCATION, - .desc = "BPF relocation checker", - .name = "[bpf_relocation_test]", - .msg_compile_fail = "fix 'perf test LLVM' first", - .msg_load_fail = "libbpf error when dealing with relocation", - }, }; static int do_test(struct bpf_object *obj, int (*func)(void), From 41d585411311abf187e5f09042978fe7073a9375 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 15 Mar 2021 13:56:41 +0900 Subject: [PATCH 21/51] perf record: Fix memory leak in vDSO found using ASAN I got several memory leak reports from Asan with a simple command. It was because VDSO is not released due to the refcount. Like in __dsos_addnew_id(), it should put the refcount after adding to the list. $ perf record true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.030 MB perf.data (10 samples) ] ================================================================= ==692599==ERROR: LeakSanitizer: detected memory leaks Direct leak of 439 byte(s) in 1 object(s) allocated from: #0 0x7fea52341037 in __interceptor_calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:154 #1 0x559bce4aa8ee in dso__new_id util/dso.c:1256 #2 0x559bce59245a in __machine__addnew_vdso util/vdso.c:132 #3 0x559bce59245a in machine__findnew_vdso util/vdso.c:347 #4 0x559bce50826c in map__new util/map.c:175 #5 0x559bce503c92 in machine__process_mmap2_event util/machine.c:1787 #6 0x559bce512f6b in machines__deliver_event util/session.c:1481 #7 0x559bce515107 in perf_session__deliver_event util/session.c:1551 #8 0x559bce51d4d2 in do_flush util/ordered-events.c:244 #9 0x559bce51d4d2 in __ordered_events__flush util/ordered-events.c:323 #10 0x559bce519bea in __perf_session__process_events util/session.c:2268 #11 0x559bce519bea in perf_session__process_events util/session.c:2297 #12 0x559bce2e7a52 in process_buildids /home/namhyung/project/linux/tools/perf/builtin-record.c:1017 #13 0x559bce2e7a52 in record__finish_output /home/namhyung/project/linux/tools/perf/builtin-record.c:1234 #14 0x559bce2ed4f6 in __cmd_record /home/namhyung/project/linux/tools/perf/builtin-record.c:2026 #15 0x559bce2ed4f6 in cmd_record /home/namhyung/project/linux/tools/perf/builtin-record.c:2858 #16 0x559bce422db4 in run_builtin /home/namhyung/project/linux/tools/perf/perf.c:313 #17 0x559bce2acac8 in handle_internal_command /home/namhyung/project/linux/tools/perf/perf.c:365 #18 0x559bce2acac8 in run_argv /home/namhyung/project/linux/tools/perf/perf.c:409 #19 0x559bce2acac8 in main /home/namhyung/project/linux/tools/perf/perf.c:539 #20 0x7fea51e76d09 in __libc_start_main ../csu/libc-start.c:308 Indirect leak of 32 byte(s) in 1 object(s) allocated from: #0 0x7fea52341037 in __interceptor_calloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:154 #1 0x559bce520907 in nsinfo__copy util/namespaces.c:169 #2 0x559bce50821b in map__new util/map.c:168 #3 0x559bce503c92 in machine__process_mmap2_event util/machine.c:1787 #4 0x559bce512f6b in machines__deliver_event util/session.c:1481 #5 0x559bce515107 in perf_session__deliver_event util/session.c:1551 #6 0x559bce51d4d2 in do_flush util/ordered-events.c:244 #7 0x559bce51d4d2 in __ordered_events__flush util/ordered-events.c:323 #8 0x559bce519bea in __perf_session__process_events util/session.c:2268 #9 0x559bce519bea in perf_session__process_events util/session.c:2297 #10 0x559bce2e7a52 in process_buildids /home/namhyung/project/linux/tools/perf/builtin-record.c:1017 #11 0x559bce2e7a52 in record__finish_output /home/namhyung/project/linux/tools/perf/builtin-record.c:1234 #12 0x559bce2ed4f6 in __cmd_record /home/namhyung/project/linux/tools/perf/builtin-record.c:2026 #13 0x559bce2ed4f6 in cmd_record /home/namhyung/project/linux/tools/perf/builtin-record.c:2858 #14 0x559bce422db4 in run_builtin /home/namhyung/project/linux/tools/perf/perf.c:313 #15 0x559bce2acac8 in handle_internal_command /home/namhyung/project/linux/tools/perf/perf.c:365 #16 0x559bce2acac8 in run_argv /home/namhyung/project/linux/tools/perf/perf.c:409 #17 0x559bce2acac8 in main /home/namhyung/project/linux/tools/perf/perf.c:539 #18 0x7fea51e76d09 in __libc_start_main ../csu/libc-start.c:308 SUMMARY: AddressSanitizer: 471 byte(s) leaked in 2 allocation(s). Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210315045641.700430-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/vdso.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index 3cc91ad048ea..43beb169631d 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -133,6 +133,8 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s if (dso != NULL) { __dsos__add(&machine->dsos, dso); dso__set_long_name(dso, long_name, false); + /* Put dso here because __dsos_add already got it */ + dso__put(dso); } return dso; From 7de55b7d6f09a2865279d3c41c0fbdbfdb87486a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 25 Mar 2021 00:47:26 +0900 Subject: [PATCH 22/51] block: support zone append bvecs Christoph reported that we'll likely trigger the WARN_ON_ONCE() checking that we're not submitting a bvec with REQ_OP_ZONE_APPEND in bio_iov_iter_get_pages() some time ago using zoned btrfs, but I couldn't reproduce it back then. Now Naohiro was able to trigger the bug as well with xfstests generic/095 on a zoned btrfs. There is nothing that prevents bvec submissions via REQ_OP_ZONE_APPEND if the hardware's zone append limit is met. Reported-by: Naohiro Aota Reported-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/10bd414d9326c90cd69029077db63b363854eee5.1616600835.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- block/bio.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/block/bio.c b/block/bio.c index 26b7f721cda8..963d1d406b3a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -949,7 +949,7 @@ void bio_release_pages(struct bio *bio, bool mark_dirty) } EXPORT_SYMBOL_GPL(bio_release_pages); -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { WARN_ON_ONCE(bio->bi_max_vecs); @@ -959,11 +959,26 @@ static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_iter.bi_size = iter->count; bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); +} +static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +{ + __bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, iter->count); return 0; } +static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) +{ + struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct iov_iter i = *iter; + + iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); + __bio_iov_bvec_set(bio, &i); + iov_iter_advance(iter, i.count); + return 0; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1094,8 +1109,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) int ret = 0; if (iov_iter_is_bvec(iter)) { - if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) - return -EINVAL; + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + return bio_iov_bvec_set_append(bio, iter); return bio_iov_bvec_set(bio, iter); } From 8b1c9b2025491d7c86255fb773b00ecf94b53acc Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Fri, 19 Mar 2021 14:50:28 -0600 Subject: [PATCH 23/51] scsi: ibmvfc: Fix potential race in ibmvfc_wait_for_ops() For various EH activities the ibmvfc driver uses ibmvfc_wait_for_ops() to wait for the completion of commands that match a given criteria be it cancel key, or specific LUN. With recent changes commands are completed outside the lock in bulk by removing them from the sent list and adding them to a private completion list. This introduces a potential race in ibmvfc_wait_for_ops() since the criteria for a command to be outstanding is no longer simply being on the sent list, but instead not being on the free list. Avoid this race by scanning the entire command event pool and checking that any matching command that ibmvfc needs to wait on is not already on the free list. Link: https://lore.kernel.org/r/20210319205029.312969-2-tyreld@linux.ibm.com Fixes: 1f4a4a19508d ("scsi: ibmvfc: Complete commands outside the host/queue lock") Reviewed-by: Brian King Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi/ibmvfc.c | 42 ++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 6a92891ac488..94c07cc82563 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -2371,6 +2371,24 @@ static int ibmvfc_match_lun(struct ibmvfc_event *evt, void *device) return 0; } +/** + * ibmvfc_event_is_free - Check if event is free or not + * @evt: ibmvfc event struct + * + * Returns: + * true / false + **/ +static bool ibmvfc_event_is_free(struct ibmvfc_event *evt) +{ + struct ibmvfc_event *loop_evt; + + list_for_each_entry(loop_evt, &evt->queue->free, queue_list) + if (loop_evt == evt) + return true; + + return false; +} + /** * ibmvfc_wait_for_ops - Wait for ops to complete * @vhost: ibmvfc host struct @@ -2385,7 +2403,7 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device, { struct ibmvfc_event *evt; DECLARE_COMPLETION_ONSTACK(comp); - int wait; + int wait, i; unsigned long flags; signed long timeout = IBMVFC_ABORT_WAIT_TIMEOUT * HZ; @@ -2393,10 +2411,13 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device, do { wait = 0; spin_lock_irqsave(&vhost->crq.l_lock, flags); - list_for_each_entry(evt, &vhost->crq.sent, queue_list) { - if (match(evt, device)) { - evt->eh_comp = ∁ - wait++; + for (i = 0; i < vhost->crq.evt_pool.size; i++) { + evt = &vhost->crq.evt_pool.events[i]; + if (!ibmvfc_event_is_free(evt)) { + if (match(evt, device)) { + evt->eh_comp = ∁ + wait++; + } } } spin_unlock_irqrestore(&vhost->crq.l_lock, flags); @@ -2407,10 +2428,13 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device, if (!timeout) { wait = 0; spin_lock_irqsave(&vhost->crq.l_lock, flags); - list_for_each_entry(evt, &vhost->crq.sent, queue_list) { - if (match(evt, device)) { - evt->eh_comp = NULL; - wait++; + for (i = 0; i < vhost->crq.evt_pool.size; i++) { + evt = &vhost->crq.evt_pool.events[i]; + if (!ibmvfc_event_is_free(evt)) { + if (match(evt, device)) { + evt->eh_comp = NULL; + wait++; + } } } spin_unlock_irqrestore(&vhost->crq.l_lock, flags); From 62fc2661482b6beccfab8a5987419e96a9499fb4 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Fri, 19 Mar 2021 14:50:29 -0600 Subject: [PATCH 24/51] scsi: ibmvfc: Make ibmvfc_wait_for_ops() MQ aware During MQ enablement of the ibmvfc driver ibmvfc_wait_for_ops() was missed. This function is responsible for waiting on commands to complete that match a certain criteria such as LUN or cancel key. The implementation as is only scans the CRQ for events ignoring any sub-queues and as a result will exit successfully without doing anything when operating in MQ channelized mode. Check the MQ and channel use flags to determine which queues are applicable, and scan each queue accordingly. Note in MQ mode SCSI commands are only issued down sub-queues and the CRQ is only used for driver specific management commands. As such the CRQ events are ignored when operating in MQ mode with channels. Link: https://lore.kernel.org/r/20210319205029.312969-3-tyreld@linux.ibm.com Fixes: 9000cb998bcf ("scsi: ibmvfc: Enable MQ and set reasonable defaults") Reviewed-by: Brian King Signed-off-by: Tyrel Datwyler Signed-off-by: Martin K. Petersen --- drivers/scsi/ibmvscsi/ibmvfc.c | 51 ++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 94c07cc82563..bb64e3247a6c 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -2403,41 +2403,58 @@ static int ibmvfc_wait_for_ops(struct ibmvfc_host *vhost, void *device, { struct ibmvfc_event *evt; DECLARE_COMPLETION_ONSTACK(comp); - int wait, i; + int wait, i, q_index, q_size; unsigned long flags; signed long timeout = IBMVFC_ABORT_WAIT_TIMEOUT * HZ; + struct ibmvfc_queue *queues; ENTER; + if (vhost->mq_enabled && vhost->using_channels) { + queues = vhost->scsi_scrqs.scrqs; + q_size = vhost->scsi_scrqs.active_queues; + } else { + queues = &vhost->crq; + q_size = 1; + } + do { wait = 0; - spin_lock_irqsave(&vhost->crq.l_lock, flags); - for (i = 0; i < vhost->crq.evt_pool.size; i++) { - evt = &vhost->crq.evt_pool.events[i]; - if (!ibmvfc_event_is_free(evt)) { - if (match(evt, device)) { - evt->eh_comp = ∁ - wait++; + spin_lock_irqsave(vhost->host->host_lock, flags); + for (q_index = 0; q_index < q_size; q_index++) { + spin_lock(&queues[q_index].l_lock); + for (i = 0; i < queues[q_index].evt_pool.size; i++) { + evt = &queues[q_index].evt_pool.events[i]; + if (!ibmvfc_event_is_free(evt)) { + if (match(evt, device)) { + evt->eh_comp = ∁ + wait++; + } } } + spin_unlock(&queues[q_index].l_lock); } - spin_unlock_irqrestore(&vhost->crq.l_lock, flags); + spin_unlock_irqrestore(vhost->host->host_lock, flags); if (wait) { timeout = wait_for_completion_timeout(&comp, timeout); if (!timeout) { wait = 0; - spin_lock_irqsave(&vhost->crq.l_lock, flags); - for (i = 0; i < vhost->crq.evt_pool.size; i++) { - evt = &vhost->crq.evt_pool.events[i]; - if (!ibmvfc_event_is_free(evt)) { - if (match(evt, device)) { - evt->eh_comp = NULL; - wait++; + spin_lock_irqsave(vhost->host->host_lock, flags); + for (q_index = 0; q_index < q_size; q_index++) { + spin_lock(&queues[q_index].l_lock); + for (i = 0; i < queues[q_index].evt_pool.size; i++) { + evt = &queues[q_index].evt_pool.events[i]; + if (!ibmvfc_event_is_free(evt)) { + if (match(evt, device)) { + evt->eh_comp = NULL; + wait++; + } } } + spin_unlock(&queues[q_index].l_lock); } - spin_unlock_irqrestore(&vhost->crq.l_lock, flags); + spin_unlock_irqrestore(vhost->host->host_lock, flags); if (wait) dev_err(vhost->dev, "Timed out waiting for aborted commands\n"); LEAVE; From 39c0c8553bfb5a3d108aa47f1256076d507605e3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 20 Mar 2021 16:23:53 -0700 Subject: [PATCH 25/51] scsi: Revert "qla2xxx: Make sure that aborted commands are freed" Calling vha->hw->tgt.tgt_ops->free_cmd() from qlt_xmit_response() is wrong since the command for which a response is sent must remain valid until the SCSI target core calls .release_cmd(). It has been observed that the following scenario triggers a kernel crash: - qlt_xmit_response() calls qlt_check_reserve_free_req() - qlt_check_reserve_free_req() returns -EAGAIN - qlt_xmit_response() calls vha->hw->tgt.tgt_ops->free_cmd(cmd) - transport_handle_queue_full() tries to retransmit the response Fix this crash by reverting the patch that introduced it. Link: https://lore.kernel.org/r/20210320232359.941-2-bvanassche@acm.org Fixes: 0dcec41acb85 ("scsi: qla2xxx: Make sure that aborted commands are freed") Cc: Quinn Tran Cc: Mike Christie Reviewed-by: Daniel Wagner Reviewed-by: Himanshu Madhani Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_target.c | 13 +++++-------- drivers/scsi/qla2xxx/tcm_qla2xxx.c | 4 ---- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index c48daf52725d..480e7d2dcf3e 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -3222,8 +3222,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, if (!qpair->fw_started || (cmd->reset_count != qpair->chip_reset) || (cmd->sess && cmd->sess->deleted)) { cmd->state = QLA_TGT_STATE_PROCESSED; - res = 0; - goto free; + return 0; } ql_dbg_qp(ql_dbg_tgt, qpair, 0xe018, @@ -3234,8 +3233,9 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, res = qlt_pre_xmit_response(cmd, &prm, xmit_type, scsi_status, &full_req_cnt); - if (unlikely(res != 0)) - goto free; + if (unlikely(res != 0)) { + return res; + } spin_lock_irqsave(qpair->qp_lock_ptr, flags); @@ -3255,8 +3255,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, vha->flags.online, qla2x00_reset_active(vha), cmd->reset_count, qpair->chip_reset); spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); - res = 0; - goto free; + return 0; } /* Does F/W have an IOCBs for this request */ @@ -3359,8 +3358,6 @@ out_unmap_unlock: qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); -free: - vha->hw->tgt.tgt_ops->free_cmd(cmd); return res; } EXPORT_SYMBOL(qlt_xmit_response); diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index b55fc768a2a7..8b4890cdd4ca 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -644,7 +644,6 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd) { struct qla_tgt_cmd *cmd = container_of(se_cmd, struct qla_tgt_cmd, se_cmd); - struct scsi_qla_host *vha = cmd->vha; if (cmd->aborted) { /* Cmd can loop during Q-full. tcm_qla2xxx_aborted_task @@ -657,7 +656,6 @@ static int tcm_qla2xxx_queue_data_in(struct se_cmd *se_cmd) cmd->se_cmd.transport_state, cmd->se_cmd.t_state, cmd->se_cmd.se_cmd_flags); - vha->hw->tgt.tgt_ops->free_cmd(cmd); return 0; } @@ -685,7 +683,6 @@ static int tcm_qla2xxx_queue_status(struct se_cmd *se_cmd) { struct qla_tgt_cmd *cmd = container_of(se_cmd, struct qla_tgt_cmd, se_cmd); - struct scsi_qla_host *vha = cmd->vha; int xmit_type = QLA_TGT_XMIT_STATUS; if (cmd->aborted) { @@ -699,7 +696,6 @@ static int tcm_qla2xxx_queue_status(struct se_cmd *se_cmd) cmd, kref_read(&cmd->se_cmd.cmd_kref), cmd->se_cmd.transport_state, cmd->se_cmd.t_state, cmd->se_cmd.se_cmd_flags); - vha->hw->tgt.tgt_ops->free_cmd(cmd); return 0; } cmd->bufflen = se_cmd->data_length; From f69953837ca5d98aa983a138dc0b90a411e9c763 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 7 Mar 2021 19:30:24 -0800 Subject: [PATCH 26/51] scsi: qedi: Fix error return code of qedi_alloc_global_queues() When kzalloc() returns NULL to qedi->global_queues[i], no error return code of qedi_alloc_global_queues() is assigned. To fix this bug, status is assigned with -ENOMEM in this case. Link: https://lore.kernel.org/r/20210308033024.27147-1-baijiaju1990@gmail.com Fixes: ace7f46ba5fd ("scsi: qedi: Add QLogic FastLinQ offload iSCSI driver framework.") Reported-by: TOTE Robot Acked-by: Manish Rangankar Signed-off-by: Jia-Ju Bai Signed-off-by: Martin K. Petersen --- drivers/scsi/qedi/qedi_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 47ad64b06623..69c5b5ee2169 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1675,6 +1675,7 @@ static int qedi_alloc_global_queues(struct qedi_ctx *qedi) if (!qedi->global_queues[i]) { QEDI_ERR(&qedi->dbg_ctx, "Unable to allocation global queue %d.\n", i); + status = -ENOMEM; goto mem_alloc_failure; } From 3401ecf7fc1b9458a19d42c0e26a228f18ac7dda Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 7 Mar 2021 19:52:41 -0800 Subject: [PATCH 27/51] scsi: mpt3sas: Fix error return code of mpt3sas_base_attach() When kzalloc() returns NULL, no error return code of mpt3sas_base_attach() is assigned. To fix this bug, r is assigned with -ENOMEM in this case. Link: https://lore.kernel.org/r/20210308035241.3288-1-baijiaju1990@gmail.com Fixes: c696f7b83ede ("scsi: mpt3sas: Implement device_remove_in_progress check in IOCTL path") Reported-by: TOTE Robot Signed-off-by: Jia-Ju Bai Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index ac066f86bb14..ac0eef975f17 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -7806,14 +7806,18 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc) ioc->pend_os_device_add_sz++; ioc->pend_os_device_add = kzalloc(ioc->pend_os_device_add_sz, GFP_KERNEL); - if (!ioc->pend_os_device_add) + if (!ioc->pend_os_device_add) { + r = -ENOMEM; goto out_free_resources; + } ioc->device_remove_in_progress_sz = ioc->pend_os_device_add_sz; ioc->device_remove_in_progress = kzalloc(ioc->device_remove_in_progress_sz, GFP_KERNEL); - if (!ioc->device_remove_in_progress) + if (!ioc->device_remove_in_progress) { + r = -ENOMEM; goto out_free_resources; + } ioc->fwfault_debug = mpt3sas_fwfault_debug; From 077ce028b8e0684d5ee7da573bd835b14b591546 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Tue, 23 Mar 2021 22:24:30 +0100 Subject: [PATCH 28/51] scsi: target: pscsi: Avoid OOM in pscsi_map_sg() pscsi_map_sg() uses the variable nr_pages as a hint for bio_kmalloc() how many vector elements to allocate. If nr_pages is < BIO_MAX_PAGES, it will be reset to 0 after successful allocation of the bio. If bio_add_pc_page() fails later for whatever reason, pscsi_map_sg() tries to allocate another bio, passing nr_vecs = 0. This causes bio_add_pc_page() to fail immediately in the next call. pci_map_sg() continues to allocate zero-length bios until memory is exhausted and the kernel crashes with OOM. This can be easily observed by exporting a SATA DVD drive via pscsi. The target crashes as soon as the client tries to access the DVD LUN. In the case I analyzed, bio_add_pc_page() would fail because the DVD device's max_sectors_kb (128) was exceeded. Avoid this by simply not resetting nr_pages to 0 after allocating the bio. This way, the client receives an I/O error when it tries to send requests exceeding the devices max_sectors_kb, and eventually gets it right. The client must still limit max_sectors_kb e.g. by an udev rule if (like in my case) the driver doesn't report valid block limits, otherwise it encounters I/O errors. Link: https://lore.kernel.org/r/20210323212431.15306-1-mwilck@suse.com Reviewed-by: Christoph Hellwig Reviewed-by: Lee Duncan Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pscsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 3cbc074992bc..6fa9fddc2e1e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -882,7 +882,6 @@ pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, if (!bio) { new_bio: nr_vecs = bio_max_segs(nr_pages); - nr_pages -= nr_vecs; /* * Calls bio_kmalloc() and sets bio->bi_end_io() */ From 36fa766faa0c822c860e636fe82b1affcd022974 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Tue, 23 Mar 2021 22:24:31 +0100 Subject: [PATCH 29/51] scsi: target: pscsi: Clean up after failure in pscsi_map_sg() If pscsi_map_sg() fails, make sure to drop references to already allocated bios. Link: https://lore.kernel.org/r/20210323212431.15306-2-mwilck@suse.com Reviewed-by: Christoph Hellwig Reviewed-by: Lee Duncan Signed-off-by: Martin Wilck Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pscsi.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 6fa9fddc2e1e..9ee797b8cb7e 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -938,6 +938,14 @@ new_bio: return 0; fail: + if (bio) + bio_put(bio); + while (req->bio) { + bio = req->bio; + req->bio = bio->bi_next; + bio_put(bio); + } + req->biotail = NULL; return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } From f5d2d23bf0d948ce0b9307b7bacae7ff0bc03c71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 10:16:12 -0600 Subject: [PATCH 30/51] io-wq: fix race around pending work on teardown syzbot reports that it's triggering the warning condition on having pending work on shutdown: WARNING: CPU: 1 PID: 12346 at fs/io-wq.c:1061 io_wq_destroy fs/io-wq.c:1061 [inline] WARNING: CPU: 1 PID: 12346 at fs/io-wq.c:1061 io_wq_put+0x153/0x260 fs/io-wq.c:1072 Modules linked in: CPU: 1 PID: 12346 Comm: syz-executor.5 Not tainted 5.12.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:io_wq_destroy fs/io-wq.c:1061 [inline] RIP: 0010:io_wq_put+0x153/0x260 fs/io-wq.c:1072 Code: 8d e8 71 90 ea 01 49 89 c4 41 83 fc 40 7d 4f e8 33 4d 97 ff 42 80 7c 2d 00 00 0f 85 77 ff ff ff e9 7a ff ff ff e8 1d 4d 97 ff <0f> 0b eb b9 8d 6b ff 89 ee 09 de bf ff ff ff ff e8 18 51 97 ff 09 RSP: 0018:ffffc90001ebfb08 EFLAGS: 00010293 RAX: ffffffff81e16083 RBX: ffff888019038040 RCX: ffff88801e86b780 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000040 RBP: 1ffff1100b2f8a80 R08: ffffffff81e15fce R09: ffffed100b2f8a82 R10: ffffed100b2f8a82 R11: 0000000000000000 R12: 0000000000000000 R13: dffffc0000000000 R14: ffff8880597c5400 R15: ffff888019038000 FS: 00007f8dcd89c700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055e9a054e160 CR3: 000000001dfb8000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: io_uring_clean_tctx+0x1b7/0x210 fs/io_uring.c:8802 __io_uring_files_cancel+0x13c/0x170 fs/io_uring.c:8820 io_uring_files_cancel include/linux/io_uring.h:47 [inline] do_exit+0x258/0x2340 kernel/exit.c:780 do_group_exit+0x168/0x2d0 kernel/exit.c:922 get_signal+0x1734/0x1ef0 kernel/signal.c:2773 arch_do_signal_or_restart+0x3c/0x610 arch/x86/kernel/signal.c:811 handle_signal_work kernel/entry/common.c:147 [inline] exit_to_user_mode_loop kernel/entry/common.c:171 [inline] exit_to_user_mode_prepare+0xac/0x1e0 kernel/entry/common.c:208 __syscall_exit_to_user_mode_work kernel/entry/common.c:290 [inline] syscall_exit_to_user_mode+0x48/0x180 kernel/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x465f69 which shouldn't happen, but seems to be possible due to a race on whether or not the io-wq manager sees a fatal signal first, or whether the io-wq workers do. If we race with queueing work and then send a fatal signal to the owning task, and the io-wq worker sees that before the manager sets IO_WQ_BIT_EXIT, then it's possible to have the worker exit and leave work behind. Just turn the WARN_ON_ONCE() into a cancelation condition instead. Reported-by: syzbot+77a738a6bc947bf639ca@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 2dd43bdb9c7b..b7c1fa932cb3 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -1063,7 +1063,11 @@ static void io_wq_destroy(struct io_wq *wq) for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; - WARN_ON_ONCE(!wq_list_empty(&wqe->work_list)); + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; + io_wqe_cancel_pending_work(wqe, &match); kfree(wqe); } io_wq_put_hash(wq->hash); From 90b8749022bbdd0c94a13182a78f4903b98fd0d7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 19:05:14 +0000 Subject: [PATCH 31/51] io_uring: maintain CQE order of a failed link Arguably we want CQEs of linked requests be in a strict order of submission as it always was. Now if init of a request fails its CQE may be posted before all prior linked requests including the head of the link. Fix it by failing it last. Fixes: de59bc104c24f ("io_uring: fail links more in io_submit_sqe()") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7a96b05832e7ab23ad55f84092a2548c4a888b0.1616699075.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8c5789b96dbb..54ea561db4a5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6480,8 +6480,6 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: - io_put_req(req); - io_req_complete(req, ret); if (link->head) { /* fail even hard links since we don't submit */ link->head->flags |= REQ_F_FAIL_LINK; @@ -6489,6 +6487,8 @@ fail_req: io_req_complete(link->head, -ECANCELED); link->head = NULL; } + io_put_req(req); + io_req_complete(req, ret); return ret; } ret = io_req_prep(req, sqe); From 1dc481c0b0cf18d3952d93a73c4ece90dec277f0 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Sat, 20 Mar 2021 18:45:54 +0800 Subject: [PATCH 32/51] perf test: Change to use bash for daemon test When executing the daemon test on Arm64 and x86 with Debian (Buster) distro, both skip the test case with the log: # ./perf test -v 76 76: daemon operations : --- start --- test child forked, pid 11687 test daemon list trap: SIGINT: bad trap ./tests/shell/daemon.sh: 173: local: cpu-clock: bad variable name test child finished with -2 ---- end ---- daemon operations: Skip So the error happens for the variable expansion when use local variable in the shell script. Since Debian Buster uses dash but not bash as non-interactive shell, when execute the daemon testing, it hits a known issue for dash which was reported [1]. To resolve this issue, one option is to add double quotes for all local variables assignment, so need to change the code from: local line=`perf daemon --config ${config} -x: | head -2 | tail -1` ... to: local line="`perf daemon --config ${config} -x: | head -2 | tail -1`" But the testing script has bunch of local variables, this leads to big changes for whole script. On the other hand, the testing script asks to use the "local" feature which is bash-specific, so this patch explicitly uses "#!/bin/bash" to ensure running the script with bash. After: # ./perf test -v 76 76: daemon operations : --- start --- test child forked, pid 11329 test daemon list test daemon reconfig test daemon stop test daemon signal signal 12 sent to session 'test [11596]' signal 12 sent to session 'test [11596]' test daemon ping test daemon lock test child finished with 0 ---- end ---- daemon operations: Ok [1] https://bugs.launchpad.net/ubuntu/+source/dash/+bug/139097 Fixes: 2291bb915b55 ("perf tests: Add daemon 'list' command test") Signed-off-by: Leo Yan Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Ian Rogers Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20210320104554.529213-1-leo.yan@linaro.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/daemon.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/tests/shell/daemon.sh b/tools/perf/tests/shell/daemon.sh index 5ad3ca8d681b..58984380b211 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # daemon operations # SPDX-License-Identifier: GPL-2.0 From 45a4546c6167a2da348a31ca439d8a8ff773b6ea Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Thu, 25 Mar 2021 12:34:54 +0000 Subject: [PATCH 33/51] cifs: Adjust key sizes and key generation routines for AES256 encryption For AES256 encryption (GCM and CCM), we need to adjust the size of a few fields to 32 bytes instead of 16 to accommodate the larger keys. Also, the L value supplied to the key generator needs to be changed from to 256 when these algorithms are used. Keeping the ioctl struct for dumping keys of the same size for now. Will send out a different patch for that one. Signed-off-by: Shyam Prasad N Reviewed-by: Ronnie Sahlberg CC: # v5.10+ Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 4 ++-- fs/cifs/cifspdu.h | 5 +++++ fs/cifs/smb2glob.h | 1 + fs/cifs/smb2ops.c | 9 +++++---- fs/cifs/smb2transport.c | 37 ++++++++++++++++++++++++++++--------- 5 files changed, 41 insertions(+), 15 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 31fc8695abd6..67c056a9a519 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -919,8 +919,8 @@ struct cifs_ses { bool binding:1; /* are we binding the session? */ __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; - __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; - __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; __u8 binding_preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 64fe5a47b5e8..9adc74bd9f8f 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -147,6 +147,11 @@ */ #define SMB3_SIGN_KEY_SIZE (16) +/* + * Size of the smb3 encryption/decryption keys + */ +#define SMB3_ENC_DEC_KEY_SIZE (32) + #define CIFS_CLIENT_CHALLENGE_SIZE (8) #define CIFS_SERVER_CHALLENGE_SIZE (8) #define CIFS_HMAC_MD5_HASH_SIZE (16) diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index 99a1951a01ec..d9a990c99121 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -58,6 +58,7 @@ #define SMB2_HMACSHA256_SIZE (32) #define SMB2_CMACAES_SIZE (16) #define SMB3_SIGNKEY_SIZE (16) +#define SMB3_GCM128_CRYPTKEY_SIZE (16) #define SMB3_GCM256_CRYPTKEY_SIZE (32) /* Maximum buffer size value we can send with 1 credit */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9bae7e8deb09..3e94f83897e9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4158,7 +4158,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) if (ses->Suid == ses_id) { ses_enc_key = enc ? ses->smb3encryptionkey : ses->smb3decryptionkey; - memcpy(key, ses_enc_key, SMB3_SIGN_KEY_SIZE); + memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); spin_unlock(&cifs_tcp_ses_lock); return 0; } @@ -4185,7 +4185,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, int rc = 0; struct scatterlist *sg; u8 sign[SMB2_SIGNATURE_SIZE] = {}; - u8 key[SMB3_SIGN_KEY_SIZE]; + u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; char *iv; unsigned int iv_len; @@ -4209,10 +4209,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, tfm = enc ? server->secmech.ccmaesencrypt : server->secmech.ccmaesdecrypt; - if (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); else - rc = crypto_aead_setkey(tfm, key, SMB3_SIGN_KEY_SIZE); + rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE); if (rc) { cifs_server_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index ebccd71cc60a..e6fa76ab70be 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -298,7 +298,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; - __u8 L[4] = {0, 0, 0, 128}; + __u8 L128[4] = {0, 0, 0, 128}; + __u8 L256[4] = {0, 0, 1, 0}; int rc = 0; unsigned char prfhash[SMB2_HMACSHA256_SIZE]; unsigned char *hashptr = prfhash; @@ -354,8 +355,14 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, goto smb3signkey_ret; } - rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, - L, 4); + if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + L256, 4); + } else { + rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash, + L128, 4); + } if (rc) { cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); goto smb3signkey_ret; @@ -390,6 +397,9 @@ generate_smb3signingkey(struct cifs_ses *ses, const struct derivation_triplet *ptriplet) { int rc; +#ifdef CONFIG_CIFS_DEBUG_DUMP_KEYS + struct TCP_Server_Info *server = ses->server; +#endif /* * All channels use the same encryption/decryption keys but @@ -422,11 +432,11 @@ generate_smb3signingkey(struct cifs_ses *ses, rc = generate_key(ses, ptriplet->encryption.label, ptriplet->encryption.context, ses->smb3encryptionkey, - SMB3_SIGN_KEY_SIZE); + SMB3_ENC_DEC_KEY_SIZE); rc = generate_key(ses, ptriplet->decryption.label, ptriplet->decryption.context, ses->smb3decryptionkey, - SMB3_SIGN_KEY_SIZE); + SMB3_ENC_DEC_KEY_SIZE); if (rc) return rc; } @@ -442,14 +452,23 @@ generate_smb3signingkey(struct cifs_ses *ses, */ cifs_dbg(VFS, "Session Id %*ph\n", (int)sizeof(ses->Suid), &ses->Suid); + cifs_dbg(VFS, "Cipher type %d\n", server->cipher_type); cifs_dbg(VFS, "Session Key %*ph\n", SMB2_NTLMV2_SESSKEY_SIZE, ses->auth_key.response); cifs_dbg(VFS, "Signing Key %*ph\n", SMB3_SIGN_KEY_SIZE, ses->smb3signingkey); - cifs_dbg(VFS, "ServerIn Key %*ph\n", - SMB3_SIGN_KEY_SIZE, ses->smb3encryptionkey); - cifs_dbg(VFS, "ServerOut Key %*ph\n", - SMB3_SIGN_KEY_SIZE, ses->smb3decryptionkey); + if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, ses->smb3decryptionkey); + } else { + cifs_dbg(VFS, "ServerIn Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3encryptionkey); + cifs_dbg(VFS, "ServerOut Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, ses->smb3decryptionkey); + } #endif return rc; } From 10442994ba195efef6fdcc0c3699e4633cb5161b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Mar 2021 08:57:10 -0600 Subject: [PATCH 34/51] kernel: don't call do_exit() for PF_IO_WORKER threads Right now we're never calling get_signal() from PF_IO_WORKER threads, but in preparation for doing so, don't handle a fatal signal for them. The workers have state they need to cleanup when exiting, so just return instead of calling do_exit() on their behalf. The threads themselves will detect a fatal signal and do proper shutdown. Signed-off-by: Jens Axboe --- kernel/signal.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/signal.c b/kernel/signal.c index f2a1b898da29..d22177d37b21 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2771,6 +2771,14 @@ relock: do_coredump(&ksig->info); } + /* + * PF_IO_WORKER threads will catch and exit on fatal signals + * themselves. They have cleanup that must be performed, so + * we cannot call do_exit() on their behalf. + */ + if (current->flags & PF_IO_WORKER) + goto out; + /* * Death signals, no core dump. */ @@ -2778,7 +2786,7 @@ relock: /* NOTREACHED */ } spin_unlock_irq(&sighand->siglock); - +out: ksig->sig = signr; if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) From 3bffbe9e0b2721bb62d226a4d4211bddae52b00a Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 26 Mar 2021 10:28:16 +0000 Subject: [PATCH 35/51] cifs: Fix chmod with modefromsid when an older ACE already exists. My recent fixes to cifsacl to maintain inherited ACEs had regressed modefromsid when an older ACL already exists. Found testing xfstest 495 with modefromsid mount option Fixes: f5065508897a ("cifs: Retain old ACEs when converting between mode bits and ACL") Signed-off-by: Shyam Prasad N Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 2be22a5c690f..d178cf85e926 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1130,8 +1130,7 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, } /* If it's any one of the ACE we're replacing, skip! */ - if (!mode_from_sid && - ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + if (((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || (compare_sids(&pntace->sid, pownersid) == 0) || (compare_sids(&pntace->sid, pgrpsid) == 0) || (compare_sids(&pntace->sid, &sid_everyone) == 0) || From cee8f4f6fcabfdf229542926128e9874d19016d5 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 25 Mar 2021 16:26:35 +1000 Subject: [PATCH 36/51] cifs: revalidate mapping when we open files for SMB1 POSIX RHBZ: 1933527 Under SMB1 + POSIX, if an inode is reused on a server after we have read and cached a part of a file, when we then open the new file with the re-cycled inode there is a chance that we may serve the old data out of cache to the application. This only happens for SMB1 (deprecated) and when posix are used. The simplest solution to avoid this race is to force a revalidate on smb1-posix open. Signed-off-by: Ronnie Sahlberg Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 26de4329d161..042e24aad410 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -165,6 +165,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, goto posix_open_ret; } } else { + cifs_revalidate_mapping(*pinode); cifs_fattr_to_inode(*pinode, &fattr); } From 219481a8f90ec3a5eed9638fb35609e4b1aeece7 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 19 Mar 2021 14:57:11 +0100 Subject: [PATCH 37/51] cifs: Silently ignore unknown oplock break handle Make SMB2 not print out an error when an oplock break is received for an unknown handle, similar to SMB1. The debug message which is printed for these unknown handles may also be misleading, so fix that too. The SMB2 lease break path is not affected by this patch. Without this, a program which writes to a file from one thread, and opens, reads, and writes the same file from another thread triggers the below errors several times a minute when run against a Samba server configured with "smb2 leases = no". CIFS: VFS: \\192.168.0.1 No task to wake, unknown frame received! NumMids 2 00000000: 424d53fe 00000040 00000000 00000012 .SMB@........... 00000010: 00000001 00000000 ffffffff ffffffff ................ 00000020: 00000000 00000000 00000000 00000000 ................ 00000030: 00000000 00000000 00000000 00000000 ................ Signed-off-by: Vincent Whitchurch Reviewed-by: Tom Talpey Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/cifs/smb2misc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b50164e2c88d..aac384f69f74 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -754,8 +754,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) } } spin_unlock(&cifs_tcp_ses_lock); - cifs_dbg(FYI, "Can not process oplock break for non-existent connection\n"); - return false; + cifs_dbg(FYI, "No file id matched, oplock break ignored\n"); + return true; } void From cfc63fc8126a93cbf95379bc4cad79a7b15b6ece Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 26 Mar 2021 18:41:55 -0500 Subject: [PATCH 38/51] smb3: fix cached file size problems in duplicate extents (reflink) There were two problems (one of which could cause data corruption) that were noticed with duplicate extents (ie reflink) when debugging why various xfstests were being incorrectly skipped (e.g. generic/138, generic/140, generic/142). First, we were not updating the file size locally in the cache when extending a file due to reflink (it would refresh after actimeo expires) but xfstest was checking the size immediately which was still 0 so caused the test to be skipped. Second, we were setting the target file size (which could shrink the file) in all cases to the end of the reflinked range rather than only setting the target file size when reflink would extend the file. CC: Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3e94f83897e9..f703204fb185 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2038,6 +2038,7 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; + struct inode *inode; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -2054,10 +2055,21 @@ smb2_duplicate_extents(const unsigned int xid, cifs_dbg(FYI, "Duplicate extents: src off %lld dst off %lld len %lld\n", src_off, dest_off, len); - rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); - if (rc) - goto duplicate_extents_out; + inode = d_inode(trgtfile->dentry); + if (inode->i_size < dest_off + len) { + rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); + if (rc) + goto duplicate_extents_out; + /* + * Although also could set plausible allocation size (i_blocks) + * here in addition to setting the file size, in reflink + * it is likely that the target file is sparse. Its allocation + * size will be queried on next revalidate, but it is important + * to make sure that file's cached size is updated immediately + */ + cifs_setsize(inode, dest_off + len); + } rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, From e82fc7855749aa197740a60ef22c492c41ea5d5f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 27 Mar 2021 15:13:09 +0800 Subject: [PATCH 39/51] block: don't create too many partitions Commit a33df75c6328 ("block: use an xarray for disk->part_tbl") drops the check on max supported number of partitionsr, and allows partition with bigger partition numbers to be added. However, ->bd_partno is defined as u8, so partition index of xarray table may not match with ->bd_partno. Then delete_partition() may delete one unmatched partition, and caused use-after-free. Reviewed-by: Bart Van Assche Reported-by: syzbot+8fede7e30c7cee0de139@syzkaller.appspotmail.com Fixes: a33df75c6328 ("block: use an xarray for disk->part_tbl") Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/partitions/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/partitions/core.c b/block/partitions/core.c index 1a7558917c47..46f055bc7ecb 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -322,6 +322,13 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, const char *dname; int err; + /* + * disk_max_parts() won't be zero, either GENHD_FL_EXT_DEVT is set + * or 'minors' is passed to alloc_disk(). + */ + if (partno >= disk_max_parts(disk)) + return ERR_PTR(-EINVAL); + /* * Partitions are not supported on zoned block devices that are used as * such. From dbe1bdbb39db7dfe80a903f0d267f62cf3f093d2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:16:06 -0600 Subject: [PATCH 40/51] io_uring: handle signals for IO threads like a normal thread We go through various hoops to disallow signals for the IO threads, but there's really no reason why we cannot just allow them. The IO threads never return to userspace like a normal thread, and hence don't go through normal signal processing. Instead, just check for a pending signal as part of the work loop, and call get_signal() to handle it for us if anything is pending. With that, we can support receiving signals, including special ones like SIGSTOP. Acked-by: "Eric W. Biederman" Signed-off-by: Jens Axboe --- fs/io-wq.c | 20 ++++++++++++++------ fs/io_uring.c | 9 ++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index b7c1fa932cb3..7434eb40ca8c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "../kernel/sched/sched.h" #include "io-wq.h" @@ -503,10 +502,15 @@ loop: if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); - if (try_to_freeze() || ret) - continue; - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; break; + } + if (ret) + continue; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -714,9 +718,13 @@ static int io_wq_manager(void *data) set_current_state(TASK_INTERRUPTIBLE); io_wq_check_workers(wq); schedule_timeout(HZ); - try_to_freeze(); - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; set_bit(IO_WQ_BIT_EXIT, &wq->state); + } } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); io_wq_check_workers(wq); diff --git a/fs/io_uring.c b/fs/io_uring.c index 54ea561db4a5..2d43f7b87083 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,7 +78,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -6765,8 +6764,13 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; continue; } - if (fatal_signal_pending(current)) + if (signal_pending(current)) { + struct ksignal ksig; + + if (!get_signal(&ksig)) + continue; break; + } sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { @@ -6809,7 +6813,6 @@ static int io_sq_thread(void *data) mutex_unlock(&sqd->lock); schedule(); - try_to_freeze(); mutex_lock(&sqd->lock); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_clear_wakeup_flag(ctx); From b16b3855d89fba640996fefdd3a113c0aa0e380d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Mar 2021 09:05:22 -0600 Subject: [PATCH 41/51] kernel: stop masking signals in create_io_thread() This is racy - move the blocking into when the task is created and we're marking it as PF_IO_WORKER anyway. The IO threads are now prepared to handle signals like SIGSTOP as well, so clear that from the mask to allow proper stopping of IO threads. Acked-by: "Eric W. Biederman" Reported-by: Oleg Nesterov Signed-off-by: Jens Axboe --- kernel/fork.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index d3171e8e88e5..ddaa15227071 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1940,8 +1940,14 @@ static __latent_entropy struct task_struct *copy_process( p = dup_task_struct(current, node); if (!p) goto fork_out; - if (args->io_thread) + if (args->io_thread) { + /* + * Mark us an IO worker, and block any signal that isn't + * fatal or STOP + */ p->flags |= PF_IO_WORKER; + siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); + } /* * This _must_ happen before we call free_task(), i.e. before we jump @@ -2430,14 +2436,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .stack_size = (unsigned long)arg, .io_thread = 1, }; - struct task_struct *tsk; - tsk = copy_process(NULL, 0, node, &args); - if (!IS_ERR(tsk)) { - sigfillset(&tsk->blocked); - sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); - } - return tsk; + return copy_process(NULL, 0, node, &args); } /* From 5a842a7448bbfa9bda0a74ca4f239c1b02bb98d8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:18:15 -0600 Subject: [PATCH 42/51] Revert "signal: don't allow sending any signals to PF_IO_WORKER threads" This reverts commit 5be28c8f85ce99ed2d329d2ad8bdd18ea19473a5. IO threads now take signals just fine, so there's no reason to limit them specifically. Revert the change that prevented that from happening. Signed-off-by: Jens Axboe --- kernel/signal.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index d22177d37b21..9e172b9341f4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -834,9 +834,6 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, if (!valid_signal(sig)) return -EINVAL; - /* PF_IO_WORKER threads don't take any signals */ - if (t->flags & PF_IO_WORKER) - return -ESRCH; if (!si_fromuser(info)) return 0; From e8b33b8cfafcfcef287ae4c0f23a173bfcf617f3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:18:59 -0600 Subject: [PATCH 43/51] Revert "kernel: treat PF_IO_WORKER like PF_KTHREAD for ptrace/signals" This reverts commit 6fb8f43cede0e4bd3ead847de78d531424a96be9. The IO threads do allow signals now, including SIGSTOP, and we can allow ptrace attach. Attaching won't reveal anything interesting for the IO threads, but it will allow eg gdb to attach to a task with io_urings and IO threads without complaining. And once attached, it will allow the usual introspection into regular threads. Signed-off-by: Jens Axboe --- kernel/ptrace.c | 2 +- kernel/signal.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 821cf1723814..61db50f7ca86 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request, audit_ptrace(task); retval = -EPERM; - if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER))) + if (unlikely(task->flags & PF_KTHREAD)) goto out; if (same_thread_group(task, current)) goto out; diff --git a/kernel/signal.c b/kernel/signal.c index 9e172b9341f4..dd86841cce94 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) return true; /* Only allow kernel generated signals to this kthread */ - if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) && + if (unlikely((t->flags & PF_KTHREAD) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; @@ -1097,7 +1097,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ - if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER))) + if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) goto out_set; /* From d3dc04cd81e0eaf50b2d09ab051a13300e587439 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:22:11 -0600 Subject: [PATCH 44/51] Revert "kernel: freezer should treat PF_IO_WORKER like PF_KTHREAD for freezing" This reverts commit 15b2219facadec583c24523eed40fa45865f859f. Before IO threads accepted signals, the freezer using take signals to wake up an IO thread would cause them to loop without any way to clear the pending signal. That is no longer the case, so stop special casing PF_IO_WORKER in the freezer. Signed-off-by: Jens Axboe --- kernel/freezer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/freezer.c b/kernel/freezer.c index 1a2d57d1327c..dc520f01f99d 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -134,7 +134,7 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) + if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else wake_up_state(p, TASK_INTERRUPTIBLE); From 1e4cf0d3d072173ee70757ee4aec11b2839705f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Mar 2021 18:23:44 -0600 Subject: [PATCH 45/51] Revert "signal: don't allow STOP on PF_IO_WORKER threads" This reverts commit 4db4b1a0d1779dc159f7b87feb97030ec0b12597. The IO threads allow and handle SIGSTOP now, so don't special case them anymore in task_set_jobctl_pending(). Signed-off-by: Jens Axboe --- kernel/signal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index dd86841cce94..f2718350bf4b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -288,8 +288,7 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask) JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); - if (unlikely(fatal_signal_pending(task) || - (task->flags & (PF_EXITING | PF_IO_WORKER)))) + if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) return false; if (mask & JOBCTL_STOP_SIGMASK) From 1ee4160c73b2102a52bc97a4128a89c34821414f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:42 +0000 Subject: [PATCH 46/51] io_uring: fix timeout cancel return code When we cancel a timeout we should emit a sensible return code, like -ECANCELED but not 0, otherwise it may trick users. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7b0ad1065e3bd1994722702bd0ba9e7bc9b0683b.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2d43f7b87083..229ab9bfb45b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1247,7 +1247,7 @@ static void io_queue_async_work(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req) +static void io_kill_timeout(struct io_kiocb *req, int status) { struct io_timeout_data *io = req->async_data; int ret; @@ -1257,7 +1257,7 @@ static void io_kill_timeout(struct io_kiocb *req) atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); - io_cqring_fill_event(req, 0); + io_cqring_fill_event(req, status); io_put_req_deferred(req, 1); } } @@ -1274,7 +1274,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, files)) { - io_kill_timeout(req); + io_kill_timeout(req, -ECANCELED); canceled++; } } @@ -1326,7 +1326,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) break; list_del_init(&req->timeout.list); - io_kill_timeout(req); + io_kill_timeout(req, 0); } while (!list_empty(&ctx->timeout_list)); ctx->cq_last_tm_flush = seq; From 80c4cbdb5ee604712e59fe304d7bf084b562f705 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:43 +0000 Subject: [PATCH 47/51] io_uring: do post-completion chore on t-out cancel Don't forget about io_commit_cqring() + io_cqring_ev_posted() after exit/exec cancelling timeouts. Both functions declared only after io_kill_timeouts(), so to avoid tons of forward declarations move it down. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/72ace588772c0f14834a6a4185d56c445a366fb4.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 229ab9bfb45b..8498f74595f3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1262,26 +1262,6 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -/* - * Returns true if we found and killed one or more timeouts - */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - struct files_struct *files) -{ - struct io_kiocb *req, *tmp; - int canceled = 0; - - spin_lock_irq(&ctx->completion_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { - if (io_match_task(req, tsk, files)) { - io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->completion_lock); - return canceled != 0; -} - static void __io_queue_deferred(struct io_ring_ctx *ctx) { do { @@ -8611,6 +8591,28 @@ static void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } +/* Returns true if we found and killed one or more timeouts */ +static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + struct files_struct *files) +{ + struct io_kiocb *req, *tmp; + int canceled = 0; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + if (io_match_task(req, tsk, files)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + io_commit_cqring(ctx); + spin_unlock_irq(&ctx->completion_lock); + + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} + static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; From 2482b58ffbdc80cfaae969ad19cb32803056505b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:44 +0000 Subject: [PATCH 48/51] io_uring: don't cancel-track common timeouts Don't account usual timeouts (i.e. not linked) as REQ_F_INFLIGHT but keep behaviour prior to dd59a3d595cc1 ("io_uring: reliably cancel linked timeouts"). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/104441ef5d97e3932113d44501fda0df88656b83.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8498f74595f3..87d198d223a2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -5563,7 +5563,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); - io_req_track_inflight(req); + if (is_timeout_link) + io_req_track_inflight(req); return 0; } From 78d9d7c2a331fb7a68a86e53ef7e12966459e0c5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 25 Mar 2021 18:32:45 +0000 Subject: [PATCH 49/51] io_uring: don't cancel extra on files match As tasks always wait and kill their io-wq on exec/exit, files are of no more concern to us, so we don't need to specifically cancel them by hand in those cases. Moreover we should not, because io_match_task() looks at req->task->files now, which is always true and so leads to extra cancellations, that wasn't a case before per-task io-wq. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0566c1de9b9dd417f5de345c817ca953580e0e2e.1616696997.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 87d198d223a2..880abd8b6d31 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1094,8 +1094,6 @@ static bool io_match_task(struct io_kiocb *head, io_for_each_link(req, head) { if (req->flags & REQ_F_INFLIGHT) return true; - if (req->task->files == files) - return true; } return false; } From 2b8ed1c94182dbbd0163d0eb443a934cbf6b0d85 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 26 Mar 2021 19:52:51 +0000 Subject: [PATCH 50/51] io_uring: remove unsued assignment to pointer io There is an assignment to io that is never read after the assignment, the assignment is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 880abd8b6d31..1949b80677e7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4798,7 +4798,6 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = -ENOMEM; goto out; } - io = req->async_data; memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } From a5e13c6df0e41702d2b2c77c8ad41677ebb065b3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Mar 2021 15:48:16 -0700 Subject: [PATCH 51/51] Linux 5.12-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d4784d181123..73add16f9898 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Frozen Wasteland # *DOCUMENTATION*