From d257cc8cb8d5355ffc43a96bab94db7b5a324803 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Nov 2021 20:29:12 -0500 Subject: [PATCH 01/21] locking/rwsem: Make handoff bit handling more consistent There is some inconsistency in the way that the handoff bit is being handled in readers and writers that leads to a race condition. Firstly, when a queue head writer sets the handoff bit, it will clear it when the writer is being killed or interrupted on its way out without acquiring the lock. That is not the case for a queue head reader. The handoff bit will simply be inherited by the next waiter. Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both the waiter and handoff bits are cleared if the wait queue becomes empty. For rwsem_down_write_slowpath(), however, the handoff bit is not checked and cleared if the wait queue is empty. This can potentially leave the handoff bit set with an empty wait queue. Worse, the situation in rwsem_down_write_slowpath() relies on wstate, a variable set outside of the critical section containing the ->count manipulation; this leads to a race condition where RWSEM_FLAG_HANDOFF can be subtracted twice, corrupting ->count. To make the handoff bit handling more consistent and robust, extract out handoff bit clearing code into the new rwsem_del_waiter() helper function. Also, completely eradicate wstate; always evaluate everything inside the same critical section. The common function will only use atomic_long_andnot() to clear bits when the wait queue is empty to avoid a possible race condition. If the first waiter with handoff bit set is killed or interrupted to exit the slowpath without acquiring the lock, the next waiter will inherit the handoff bit. While at it, simplify the trylock for loop in rwsem_down_write_slowpath() to make it easier to read. 
Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") Reported-by: Zhenhua Ma Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com --- kernel/locking/rwsem.c | 175 ++++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 88 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..e039cf1605af 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,9 +105,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -334,6 +334,9 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + + /* Writer only, not initialized in reader */ + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -344,12 +347,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. 
So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -365,6 +362,31 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + */ +static inline void +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -376,6 +398,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -490,17 +514,24 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_FLAG_WAITERS; - } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. 
- */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; + oldcount = atomic_long_read(&sem->count); + if (list_empty(&sem->wait_list)) { + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ + adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. 
*/ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + bool first = rwsem_first_waiter(sem) == waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + if (!first) + return false; + + /* First waiter inherits a previously set handoff bit */ + waiter->handoff_set = true; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * We have either acquired the lock with handoff bit cleared or * set the handoff bit. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + waiter->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. 
+ */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } @@ -956,7 +1001,7 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1002,11 +1047,7 @@ queue: return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } + rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -1020,9 +1061,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? 
WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { + if (rwsem_first_waiter(sem) != &waiter) { count = atomic_long_read(&sem->count); /* @@ -1083,13 +1119,16 @@ wait: /* wait until we successfully acquire the lock */ set_current_state(state); for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1098,7 +1137,7 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF) { + if (waiter.handoff_set) { enum owner_state owner_state; preempt_disable(); @@ -1109,66 +1148,26 @@ wait: goto trylock_again; } - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; - - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. 
- */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } - } + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else + rwsem_del_waiter(sem, &waiter); + if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); - return ERR_PTR(-EINTR); } From 14c24048841151548a3f4d9e218510c844c1b737 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Nov 2021 17:44:55 +0800 Subject: [PATCH 02/21] locking/rwsem: Optimize down_read_trylock() under highly contended case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found that a process with 10 thousand threads encountered a regression when moving from Linux-v4.14 to Linux-v5.4. It is a kind of workload which will sometimes concurrently allocate lots of memory in different threads. In this case, we see down_read_trylock() as a major hotspot. Therefore, we suppose that rwsem has had a regression at least since Linux-v5.4. In order to easily debug this problem, we wrote a simple benchmark to create a similar situation, like the following. 
```c++ #include <sys/mman.h> #include <sys/time.h> #include <sys/resource.h> #include <sched.h> #include <cstdio> #include <cassert> #include <thread> #include <vector> #include <chrono> volatile int mutex; void trigger(int cpu, char* ptr, std::size_t sz) { cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); assert(pthread_setaffinity_np(pthread_self(), sizeof(set), &set) == 0); while (mutex); for (std::size_t i = 0; i < sz; i += 4096) { *ptr = '\0'; ptr += 4096; } } int main(int argc, char* argv[]) { std::size_t sz = 100; if (argc > 1) sz = atoi(argv[1]); auto nproc = std::thread::hardware_concurrency(); std::vector<std::thread> thr; sz <<= 30; auto* ptr = mmap(nullptr, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); assert(ptr != MAP_FAILED); char* cptr = static_cast<char*>(ptr); auto run = sz / nproc; run = (run >> 12) << 12; mutex = 1; for (auto i = 0U; i < nproc; ++i) { thr.emplace_back(std::thread([i, cptr, run]() { trigger(i, cptr, run); })); cptr += run; } rusage usage_start; getrusage(RUSAGE_SELF, &usage_start); auto start = std::chrono::system_clock::now(); mutex = 0; for (auto& t : thr) t.join(); rusage usage_end; getrusage(RUSAGE_SELF, &usage_end); auto end = std::chrono::system_clock::now(); timeval utime; timeval stime; timersub(&usage_end.ru_utime, &usage_start.ru_utime, &utime); timersub(&usage_end.ru_stime, &usage_start.ru_stime, &stime); printf("usr: %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("sys: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("real: %lu\n", std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()); return 0; } ``` The above program simply creates `nproc` threads, each of which touches memory (triggering page faults) on a different CPU. Then we see a similar profile in `perf top`. 25.55% [kernel] [k] down_read_trylock 14.78% [kernel] [k] handle_mm_fault 13.45% [kernel] [k] up_read 8.61% [kernel] [k] clear_page_erms 3.89% [kernel] [k] __do_page_fault The hottest instruction in down_read_trylock(), which accounts for about 92%, is the cmpxchg shown in the following. 
91.89 │ lock cmpxchg %rdx,(%rdi) Since the problem was found by migrating from Linux-v4.14 to Linux-v5.4, we easily found that the commit ddb20d1d3aed ("locking/rwsem: Optimize down_read_trylock()") caused the regression. The reason is that the commit assumes the rwsem is not contended at all. But that is not always true for the mmap lock, which could be contended by thousands of threads. So most threads need to execute "cmpxchg" at least twice to acquire the lock. The overhead of an atomic operation is higher than that of non-atomic instructions, which caused the regression. By using the above benchmark, the real execution times on an x86-64 system before and after the patch were: Before Patch After Patch # of Threads real real reduced by ------------ ------ ------ ---------- 1 65,373 65,206 ~0.0% 4 15,467 15,378 ~0.5% 40 6,214 5,528 ~11.0% For the uncontended case, the new down_read_trylock() is the same as before. For the contended cases, the new down_read_trylock() is faster than before. The more contended, the faster. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211118094455.9068-1-songmuchun@bytedance.com --- kernel/locking/rwsem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e039cf1605af..04a74d040a6d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1248,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - /* - * Optimize for the case when the rwsem is not locked at all. 
- */ - tmp = RWSEM_UNLOCKED_VALUE; - do { + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); + } return 0; } From 73743c3b092277febbf69b250ce8ebbca0525aa2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 9 Nov 2021 13:22:32 +0100 Subject: [PATCH 03/21] perf: Ignore sigtrap for tracepoints destined for other tasks syzbot reported that the warning in perf_sigtrap() fires, saying that the event's task does not match current: | WARNING: CPU: 0 PID: 9090 at kernel/events/core.c:6446 perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | Modules linked in: | CPU: 0 PID: 9090 Comm: syz-executor.1 Not tainted 5.15.0-syzkaller #0 | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 | RIP: 0010:perf_sigtrap kernel/events/core.c:6446 [inline] | RIP: 0010:perf_pending_event_disable kernel/events/core.c:6470 [inline] | RIP: 0010:perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | ... | Call Trace: | | irq_work_single+0x106/0x220 kernel/irq_work.c:211 | irq_work_run_list+0x6a/0x90 kernel/irq_work.c:242 | irq_work_run+0x4f/0xd0 kernel/irq_work.c:251 | __sysvec_irq_work+0x95/0x3d0 arch/x86/kernel/irq_work.c:22 | sysvec_irq_work+0x8e/0xc0 arch/x86/kernel/irq_work.c:17 | | | asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:664 | RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] | RIP: 0010:_raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:194 | ... 
| coredump_task_exit kernel/exit.c:371 [inline] | do_exit+0x1865/0x25c0 kernel/exit.c:771 | do_group_exit+0xe7/0x290 kernel/exit.c:929 | get_signal+0x3b0/0x1ce0 kernel/signal.c:2820 | arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 | handle_signal_work kernel/entry/common.c:148 [inline] | exit_to_user_mode_loop kernel/entry/common.c:172 [inline] | exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 | __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] | syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 | do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 | entry_SYSCALL_64_after_hwframe+0x44/0xae On x86 this shouldn't happen, which has arch_irq_work_raise(). The test program sets up a perf event with sigtrap set to fire on the 'sched_wakeup' tracepoint, which fired in ttwu_do_wakeup(). This happened because the 'sched_wakeup' tracepoint also takes a task argument passed on to perf_tp_event(), which is used to deliver the event to that other task. Since we cannot deliver synchronous signals to other tasks, skip an event if perf_tp_event() is targeted at another task and perf_event_attr::sigtrap is set, which will avoid ever entering perf_sigtrap() for such events. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: syzbot+663359e32ce6f1a305ad@syzkaller.appspotmail.com Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YYpoCOBmC/kJWfmI@elver.google.com --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..30d94f68c5bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9759,6 +9759,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, continue; if (event->attr.config != entry->type) continue; + /* Cannot deliver synchronous signal to other task. 
*/ + if (event->attr.sigtrap) + continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } From c0f2077baa4113f38f008b8e912b9fb3ff8d43df Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 23 Nov 2021 08:04:34 +0100 Subject: [PATCH 04/21] x86/boot: Mark prepare_command_line() __init Fix: WARNING: modpost: vmlinux.o(.text.unlikely+0x64d0): Section mismatch in reference \ from the function prepare_command_line() to the variable .init.data:command_line The function prepare_command_line() references the variable __initdata command_line. This is often because prepare_command_line lacks a __initdata annotation or the annotation of command_line is wrong. Apparently some toolchains make different inlining decisions. Reported-by: Stephen Rothwell Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YZySgpmBcNNM2qca@zn.tnic --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c410be738ae7..6a190c7f4d71 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -742,7 +742,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static char *prepare_command_line(void) +static char * __init prepare_command_line(void) { #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE From dce1ca0525bfdc8a69a9343bc714fbc19a2f04b3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 23 Nov 2021 11:40:47 +0000 Subject: [PATCH 05/21] sched/scs: Reset task stack state in bringup_cpu() To hot unplug a CPU, the idle task on that CPU calls a few layers of C code before finally leaving the kernel. When KASAN is in use, poisoned shadow is left around for each of the active stack frames. 
When shadow call stacks (SCS) are in use, the task's saved SCS SP is left pointing at an arbitrary point within the task's shadow call stack. When a CPU is offlined then onlined back into the kernel, this stale state can adversely affect execution. Stale KASAN shadow can alias new stackframes and result in bogus KASAN warnings. A stale SCS SP is effectively a memory leak, and prevents a portion of the shadow call stack being used. Across a number of hotplug cycles the idle task's entire shadow call stack can become unusable. We previously fixed the KASAN issue in commit: e1b77c92981a5222 ("sched/kasan: remove stale KASAN poison after hotplug") ... by removing any stale KASAN stack poison immediately prior to onlining a CPU. Subsequently in commit: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") ... the refactoring left the KASAN and SCS cleanup in one-time idle thread initialization code rather than something invoked prior to each CPU being onlined, breaking both as above. We fixed SCS (but not KASAN) in commit: 63acd42c0d4942f7 ("sched/scs: Reset the shadow stack when idle_task_exit") ... but as this runs in the context of the idle task being offlined it's potentially fragile. To fix these consistently and more robustly, reset the SCS SP and KASAN shadow of a CPU's idle task immediately before we online that CPU in bringup_cpu(). This ensures the idle task always has a consistent state when it is running, and removes the need to do so when exiting an idle task. Whenever any thread is created, dup_task_struct() will give the task a stack which is free of KASAN shadow, and initialize the task's SCS SP, so there's no need to specially initialize either for the idle thread within init_idle(), as this was only necessary to handle hotplug cycles. I've tested this on arm64 with: * gcc 11.1.0, defconfig +KASAN_INLINE, KASAN_STACK * clang 12.0.0, defconfig +KASAN_INLINE, KASAN_STACK, SHADOW_CALL_STACK ... 
offlining and onlining CPUS with: | while true; do | for C in /sys/devices/system/cpu/cpu*/online; do | echo 0 > $C; | echo 1 > $C; | done | done Fixes: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") Reported-by: Qian Cai Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Qian Cai Link: https://lore.kernel.org/lkml/20211115113310.35693-1-mark.rutland@arm.com/ --- kernel/cpu.c | 7 +++++++ kernel/sched/core.c | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 192e43a87407..407a2568f35e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,12 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..76f9deeaa942 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8619,9 +8619,6 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - #ifdef CONFIG_SMP /* * It's possible that init_idle() gets called multiple times on a task, @@ -8777,7 +8774,6 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } - scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } From 6318cb887548c70778d10c0fcb7134b4454ab8a6 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Wed, 24 Nov 2021 18:44:02 -0500 Subject: [PATCH 06/21] Revert "virtio-scsi: don't let virtio core to validate used buffer length" This reverts commit c57911ebfbfe745cb95da2bcf547c5bae000590f. Attempts to validate length in the core did not work out. We'll drop them for now, so revert the dependent changes in drivers. Signed-off-by: Michael S. Tsirkin --- drivers/scsi/virtio_scsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 19f7d7b90625..28e1d98ae102 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -977,7 +977,6 @@ static unsigned int features[] = { static struct virtio_driver virtio_scsi_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), - .suppress_used_validation = true, .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, From 2b17d9f84884a37f1324be5526c6600e97a47fbe Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:09 -0500 Subject: [PATCH 07/21] Revert "virtio-blk: don't let virtio core to validate used length" This reverts commit a40392edf1b2c7822bc0ce68413106661a9d4232. Attempts to validate length in the core did not work out. We'll drop them, so revert the dependent changes in drivers. Signed-off-by: Michael S. 
Tsirkin --- drivers/block/virtio_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 97bf051a50ce..1a1b1189225d 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1049,7 +1049,6 @@ static struct virtio_driver virtio_blk = { .feature_table_size = ARRAY_SIZE(features), .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), - .suppress_used_validation = true, .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, From fcfb65f8a922c7dd25a2e9913601dae979ce6560 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:13 -0500 Subject: [PATCH 08/21] Revert "virtio-net: don't let virtio core to validate used length" This reverts commit 816625c13652cef5b2c49082d652875da6f2ad7a. Attempts to validate length in the core did not work out. We'll drop them, so revert the dependent changes in drivers. Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1771d6e5224f..55db6a336f7e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3423,7 +3423,6 @@ static struct virtio_driver virtio_net_driver = { .feature_table_size = ARRAY_SIZE(features), .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), - .suppress_used_validation = true, .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, From f124034faa911ed534bf8c4881ad98dbbde2a966 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:17 -0500 Subject: [PATCH 09/21] Revert "virtio_ring: validate used buffer length" This reverts commit 939779f5152d161b34f612af29e7dc1ac4472fcf. 
Attempts to validate length in the core did not work out: there turn out to exist multiple broken devices, and in particular legacy devices are known to be broken in this respect. We have ideas for handling this better in the next version but for now let's revert to a known good state to make sure drivers work for people. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 60 ------------------------------------ include/linux/virtio.h | 2 -- 2 files changed, 62 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 00f64f2f8b72..6d2614e34470 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -14,9 +14,6 @@ #include #include -static bool force_used_validation = false; -module_param(force_used_validation, bool, 0444); - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -185,9 +182,6 @@ struct vring_virtqueue { } packed; }; - /* Per-descriptor in buffer length */ - u32 *buflen; - /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); @@ -496,7 +490,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; - u32 buflen = 0; START_USE(vq); @@ -578,7 +571,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, indirect); - buflen += sg->length; } } /* Last one doesn't continue. */ @@ -618,10 +610,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, else vq->split.desc_state[head].indir_desc = ctx; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[head] = buflen; - /* Put entry in available array (but don't update avail->idx until they * do sync). 
*/ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); @@ -796,11 +784,6 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } - if (vq->buflen && unlikely(*len > vq->buflen[i])) { - BAD_RING(vq, "used len %d is larger than in buflen %u\n", - *len, vq->buflen[i]); - return NULL; - } /* detach_buf_split clears data, so grab it now. */ ret = vq->split.desc_state[i].data; @@ -1079,7 +1062,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, unsigned int i, n, err_idx; u16 head, id; dma_addr_t addr; - u32 buflen = 0; head = vq->packed.next_avail_idx; desc = alloc_indirect_packed(total_sg, gfp); @@ -1109,8 +1091,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(sg->length); i++; - if (n >= out_sgs) - buflen += sg->length; } } @@ -1164,10 +1144,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[id] = buflen; - vq->num_added += 1; pr_debug("Added buffer head %i to %p\n", head, vq); @@ -1203,7 +1179,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, __le16 head_flags, flags; u16 head, id, prev, curr, avail_used_flags; int err; - u32 buflen = 0; START_USE(vq); @@ -1283,8 +1258,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } - if (n >= out_sgs) - buflen += sg->length; } } @@ -1304,10 +1277,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, vq->packed.desc_state[id].indir_desc = ctx; vq->packed.desc_state[id].last = prev; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[id] = buflen; - /* * A driver MUST NOT make the first descriptor in the list * available before all 
subsequent descriptors comprising @@ -1494,11 +1463,6 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } - if (vq->buflen && unlikely(*len > vq->buflen[id])) { - BAD_RING(vq, "used len %d is larger than in buflen %u\n", - *len, vq->buflen[id]); - return NULL; - } /* detach_buf_packed clears data, so grab it now. */ ret = vq->packed.desc_state[id].data; @@ -1704,7 +1668,6 @@ static struct virtqueue *vring_create_virtqueue_packed( struct vring_virtqueue *vq; struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; - struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; @@ -1794,15 +1757,6 @@ static struct virtqueue *vring_create_virtqueue_packed( if (!vq->packed.desc_extra) goto err_desc_extra; - if (!drv->suppress_used_validation || force_used_validation) { - vq->buflen = kmalloc_array(num, sizeof(*vq->buflen), - GFP_KERNEL); - if (!vq->buflen) - goto err_buflen; - } else { - vq->buflen = NULL; - } - /* No callback? Tell other side not to bother us. 
*/ if (!callback) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; @@ -1815,8 +1769,6 @@ static struct virtqueue *vring_create_virtqueue_packed( spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_buflen: - kfree(vq->packed.desc_extra); err_desc_extra: kfree(vq->packed.desc_state); err_desc_state: @@ -2224,7 +2176,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, void (*callback)(struct virtqueue *), const char *name) { - struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); struct vring_virtqueue *vq; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) @@ -2284,15 +2235,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, if (!vq->split.desc_extra) goto err_extra; - if (!drv->suppress_used_validation || force_used_validation) { - vq->buflen = kmalloc_array(vring.num, sizeof(*vq->buflen), - GFP_KERNEL); - if (!vq->buflen) - goto err_buflen; - } else { - vq->buflen = NULL; - } - /* Put everything in free lists. */ vq->free_head = 0; memset(vq->split.desc_state, 0, vring.num * @@ -2303,8 +2245,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_buflen: - kfree(vq->split.desc_extra); err_extra: kfree(vq->split.desc_state); err_state: diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 44d0e09da2d9..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -152,7 +152,6 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); * @feature_table_size: number of entries in the feature table array. * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. - * @suppress_used_validation: set to not have core validate used length * @probe: the function to call when a device is found. Returns 0 or -errno. * @scan: optional function to call after successful probe; intended * for virtio-scsi to invoke a scan. 
@@ -169,7 +168,6 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; - bool suppress_used_validation; int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); From 49d8c5ffad07ca014cfae72a1b9b8c52b6ad9cb8 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 22 Nov 2021 17:35:24 +0100 Subject: [PATCH 10/21] vhost/vsock: fix incorrect used length reported to the guest The "used length" reported by calling vhost_add_used() must be the number of bytes written by the device (using "in" buffers). In vhost_vsock_handle_tx_kick() the device only reads the guest buffers (they are all "out" buffers), without writing anything, so we must pass 0 as "used length" to comply virtio spec. Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") Cc: stable@vger.kernel.org Reported-by: Halil Pasic Suggested-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211122163525.294024-2-sgarzare@redhat.com Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Halil Pasic --- drivers/vhost/vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 938aefbc75ec..4e3b95af7ee4 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -554,7 +554,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) virtio_transport_free_pkt(pkt); len += sizeof(pkt->hdr); - vhost_add_used(vq, head, len); + vhost_add_used(vq, head, 0); total_len += len; added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); From 11708ff92c1dba9aaa59168c46c5317677595942 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 22 Nov 2021 17:35:25 +0100 Subject: [PATCH 11/21] vhost/vsock: cleanup removing `len` variable We can increment `total_len` directly and remove `len` since it is no longer used for vhost_add_used(). Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211122163525.294024-3-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi --- drivers/vhost/vsock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 4e3b95af7ee4..d6ca1c7ad513 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -511,8 +511,6 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) vhost_disable_notify(&vsock->dev, vq); do { - u32 len; - if (!vhost_vsock_more_replies(vsock)) { /* Stop tx until the device processes already * pending replies. 
Leave tx virtqueue @@ -540,7 +538,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) continue; } - len = pkt->len; + total_len += sizeof(pkt->hdr) + pkt->len; /* Deliver to monitoring devices all received packets */ virtio_transport_deliver_tap_pkt(pkt); @@ -553,9 +551,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) else virtio_transport_free_pkt(pkt); - len += sizeof(pkt->hdr); vhost_add_used(vq, head, 0); - total_len += len; added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); From 0466a39bd0b6c462338f10d18076703d14a552de Mon Sep 17 00:00:00 2001 From: Ye Guojin Date: Wed, 17 Nov 2021 06:39:55 +0000 Subject: [PATCH 12/21] virtio-blk: modify the value type of num in virtio_queue_rq() This was found by coccicheck: ./drivers/block/virtio_blk.c, 334, 14-17, WARNING Unsigned expression compared with zero num < 0 Reported-by: Zeal Robot Signed-off-by: Ye Guojin Link: https://lore.kernel.org/r/20211117063955.160777-1-ye.guojin@zte.com.cn Signed-off-by: Michael S. 
Tsirkin Fixes: 02746e26c39e ("virtio-blk: avoid preallocating big SGL for data") Reviewed-by: Stefano Garzarella Reviewed-by: Max Gurtovoy Reviewed-by: Stefan Hajnoczi --- drivers/block/virtio_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1a1b1189225d..6ae38776e30e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -316,7 +316,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; - unsigned int num; + int num; int qid = hctx->queue_num; bool notify = false; blk_status_t status; From ea8f17e44fa7d54fae287ccbe30ce269afb5ee42 Mon Sep 17 00:00:00 2001 From: Wu Zongyong Date: Mon, 15 Nov 2021 11:16:42 +0800 Subject: [PATCH 13/21] vhost-vdpa: clean irqs before reseting vdpa device Vdpa devices should be reset after unseting irqs of virtqueues, or we will get errors when killing qemu process: >> pi_update_irte: failed to update PI IRTE >> irq bypass consumer (token 0000000065102a43) unregistration fails: -22 Signed-off-by: Wu Zongyong Link: https://lore.kernel.org/r/a2cb60cf73be9da5c4e6399242117d8818f975ae.1636946171.git.wuzongyong@linux.alibaba.com Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 01c59ce7e250..29cced1cd277 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1014,12 +1014,12 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) mutex_lock(&d->mutex); filep->private_data = NULL; + vhost_vdpa_clean_irq(v); vhost_vdpa_reset(v); vhost_dev_stop(&v->vdev); vhost_vdpa_iotlb_free(v); vhost_vdpa_free_domain(v); vhost_vdpa_config_put(v); - vhost_vdpa_clean_irq(v); vhost_dev_cleanup(&v->vdev); kfree(v->vdev.vqs); mutex_unlock(&d->mutex); From bb93ce4b150dde79f58e34103cbd1fe829796649 Mon Sep 17 00:00:00 2001 From: Longpeng Date: Wed, 24 Nov 2021 09:52:15 +0800 Subject: [PATCH 14/21] vdpa_sim: avoid putting an uninitialized iova_domain The system will crash if we put an uninitialized iova_domain, this could happen when an error occurs before initializing the iova_domain in vdpasim_create(). BUG: kernel NULL pointer dereference, address: 0000000000000000 ... RIP: 0010:__cpuhp_state_remove_instance+0x96/0x1c0 ... Call Trace: put_iova_domain+0x29/0x220 vdpasim_free+0xd1/0x120 [vdpa_sim] vdpa_release_dev+0x21/0x40 [vdpa] device_release+0x33/0x90 kobject_release+0x63/0x160 vdpasim_create+0x127/0x2a0 [vdpa_sim] vdpasim_net_dev_add+0x7d/0xfe [vdpa_sim_net] vdpa_nl_cmd_dev_add_set_doit+0xe1/0x1a0 [vdpa] genl_family_rcv_msg_doit+0x112/0x140 genl_rcv_msg+0xdf/0x1d0 ... So we must make sure the iova_domain is already initialized before put it. In addition, we may get the following warning in this case: WARNING: ... drivers/iommu/iova.c:344 iova_cache_put+0x58/0x70 So we must make sure the iova_cache_put() is invoked only if the iova_cache_get() is already invoked. Let's fix it together. 
Cc: stable@vger.kernel.org Fixes: 4080fc106750 ("vdpa_sim: use iova module to allocate IOVA addresses") Signed-off-by: Longpeng Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211124015215.119-1-longpeng2@huawei.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 5f484fff8dbe..41b0cd17fcba 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -591,8 +591,11 @@ static void vdpasim_free(struct vdpa_device *vdpa) vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); } - put_iova_domain(&vdpasim->iova); - iova_cache_put(); + if (vdpa_get_dma_dev(vdpa)) { + put_iova_domain(&vdpasim->iova); + iova_cache_put(); + } + kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); From 21e96a2035db43fc72f7023c4577a63ca606de86 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 23 Nov 2021 11:55:06 +0100 Subject: [PATCH 15/21] iommu/vt-d: Remove unused PASID_DISABLED The macro is unused after commit 00ecd5401349a so it can be removed. Reported-by: Linus Torvalds Fixes: 00ecd5401349a ("iommu/vt-d: Clean up unused PASID updating functions") Signed-off-by: Joerg Roedel Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20211123105507.7654-2-joro@8bytes.org --- arch/x86/include/asm/fpu/api.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 6053674f9132..c2767a6a387e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -102,12 +102,6 @@ extern void switch_fpu_return(void); */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); -/* - * Tasks that are not using SVA have mm->pasid set to zero to note that they - * will not have the valid bit set in MSR_IA32_PASID while they are running. 
- */ -#define PASID_DISABLED 0 - /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); From 717e88aad37befedfd531378b632e794e24e9afb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 23 Nov 2021 11:55:07 +0100 Subject: [PATCH 16/21] iommu/amd: Clarify AMD IOMMUv2 initialization messages The messages printed on the initialization of the AMD IOMMUv2 driver have caused some confusion in the past. Clarify the messages to lower the confusion in the future. Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20211123105507.7654-3-joro@8bytes.org --- drivers/iommu/amd/iommu_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 13cbeb997cc1..58da08cc3d01 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -929,10 +929,8 @@ static int __init amd_iommu_v2_init(void) { int ret; - pr_info("AMD IOMMUv2 driver by Joerg Roedel \n"); - if (!amd_iommu_v2_supported()) { - pr_info("AMD IOMMUv2 functionality not available on this system\n"); + pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n"); /* * Load anyway to provide the symbols to other modules * which may use AMD IOMMUv2 optionally. @@ -947,6 +945,8 @@ static int __init amd_iommu_v2_init(void) amd_iommu_register_ppr_notifier(&ppr_nb); + pr_info("AMD IOMMUv2 loaded and initialized\n"); + return 0; out: From f7ff3cff3527ff1e70cad8d2fe7c0c7b6f83120a Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Wed, 24 Nov 2021 03:13:25 +0100 Subject: [PATCH 17/21] iommu/rockchip: Fix PAGE_DESC_HI_MASKs for RK3568 With the submission of iommu driver for RK3568 a subtle bug was introduced: PAGE_DESC_HI_MASK1 and PAGE_DESC_HI_MASK2 have to be the other way arround - that leads to random errors, especially when addresses beyond 32 bit are used. Fix it. 
Fixes: c55356c534aa ("iommu: rockchip: Add support for iommu v2") Signed-off-by: Alex Bee Tested-by: Peter Geis Reviewed-by: Heiko Stuebner Tested-by: Dan Johansen Reviewed-by: Benjamin Gaignard Link: https://lore.kernel.org/r/20211124021325.858139-1-knaerzche@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 5cb260820eda..7f23ad61c094 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -200,8 +200,8 @@ static inline phys_addr_t rk_dte_pt_address(u32 dte) #define DTE_HI_MASK2 GENMASK(7, 4) #define DTE_HI_SHIFT1 24 /* shift bit 8 to bit 32 */ #define DTE_HI_SHIFT2 32 /* shift bit 4 to bit 36 */ -#define PAGE_DESC_HI_MASK1 GENMASK_ULL(39, 36) -#define PAGE_DESC_HI_MASK2 GENMASK_ULL(35, 32) +#define PAGE_DESC_HI_MASK1 GENMASK_ULL(35, 32) +#define PAGE_DESC_HI_MASK2 GENMASK_ULL(39, 36) static inline phys_addr_t rk_dte_pt_address_v2(u32 dte) { From 4e5973dd2725bb30c3db622f7d73f7a5864ce718 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 26 Nov 2021 21:55:55 +0800 Subject: [PATCH 18/21] iommu/vt-d: Fix an unbalanced rcu_read_lock/rcu_read_unlock() If we return -EOPNOTSUPP, the rcu lock remains lock. This is spurious. Go through the end of the function instead. This way, the missing 'rcu_read_unlock()' is called. 
Fixes: 7afd7f6aa21a ("iommu/vt-d: Check FL and SL capability sanity in scalable mode") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/40cc077ca5f543614eab2a10e84d29dd190273f6.1636217517.git.christophe.jaillet@wanadoo.fr Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211126135556.397932-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/cap_audit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c index b39d223926a4..71596fc62822 100644 --- a/drivers/iommu/intel/cap_audit.c +++ b/drivers/iommu/intel/cap_audit.c @@ -144,6 +144,7 @@ static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) { struct dmar_drhd_unit *d; struct intel_iommu *i; + int rc = 0; rcu_read_lock(); if (list_empty(&dmar_drhd_units)) @@ -169,11 +170,11 @@ static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) */ if (intel_cap_smts_sanity() && !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) - return -EOPNOTSUPP; + rc = -EOPNOTSUPP; out: rcu_read_unlock(); - return 0; + return rc; } int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) From 86dc40c7ea9c22f64571e0e45f695de73a0e2644 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 26 Nov 2021 21:55:56 +0800 Subject: [PATCH 19/21] iommu/vt-d: Fix unmap_pages support When supporting only the .map and .unmap callbacks of iommu_ops, the IOMMU driver can make assumptions about the size and alignment used for mappings based on the driver provided pgsize_bitmap. VT-d previously used essentially PAGE_MASK for this bitmap as any power of two mapping was acceptably filled by native page sizes. However, with the .map_pages and .unmap_pages interface we're now getting page-size and count arguments. 
If we simply combine these as (page-size * count) and make use of the previous map/unmap functions internally, any size and alignment assumptions are very different. As an example, a given vfio device assignment VM will often create a 4MB mapping at IOVA pfn [0x3fe00 - 0x401ff]. On a system that does not support IOMMU super pages, the unmap_pages interface will ask to unmap 1024 4KB pages at the base IOVA. dma_pte_clear_level() will recurse down to level 2 of the page table where the first half of the pfn range exactly matches the entire pte level. We clear the pte, increment the pfn by the level size, but (oops) the next pte is on a new page, so we exit the loop and pop back up a level. When we then update the pfn based on that higher level, we seem to assume that the previous pfn value was at the start of the level. In this case the level size is 256K pfns, which we add to the base pfn and get a result of 0x7fe00, which is clearly greater than 0x401ff, so we're done. Meanwhile we never cleared the ptes for the remainder of the range. When the VM remaps this range, we're overwriting valid ptes and the VT-d driver complains loudly, as reported by the user report linked below. The fix for this seems relatively simple: if each iteration of the loop in dma_pte_clear_level() is assumed to clear to the end of the level pte page, then our next pfn should be calculated from level_pfn rather than our working pfn.
Fixes: 3f34f1259776 ("iommu/vt-d: Implement map/unmap_pages() iommu_ops callback") Reported-by: Ajay Garg Signed-off-by: Alex Williamson Tested-by: Giovanni Cabiddu Link: https://lore.kernel.org/all/20211002124012.18186-1-ajaygargnsit@gmail.com/ Link: https://lore.kernel.org/r/163659074748.1617923.12716161410774184024.stgit@omen Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211126135556.397932-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0bde0c8b4126..b6a8f3282411 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1339,13 +1339,11 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, pte = &pte[pfn_level_offset(pfn, level)]; do { - unsigned long level_pfn; + unsigned long level_pfn = pfn & level_mask(level); if (!dma_pte_present(pte)) goto next; - level_pfn = pfn & level_mask(level); - /* If range covers entire pagetable, free it */ if (start_pfn <= level_pfn && last_pfn >= level_pfn + level_size(level) - 1) { @@ -1366,7 +1364,7 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, freelist); } next: - pfn += level_size(level); + pfn = level_pfn + level_size(level); } while (!first_pte_in_page(++pte) && pfn <= last_pfn); if (first_pte) From 27ff768fa21ca3286fcc87c3f38ac67d1a2cbe2d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 27 Nov 2021 16:45:26 -0500 Subject: [PATCH 20/21] tracing: Test the 'Do not trace this pid' case in create event When creating a new event (via a module, kprobe, eprobe, etc), the descriptors that are created must add flags for pid filtering if an instance has pid filtering enabled, as the flags are used at the time the event is executed to know if pid filtering should be done or not. 
The "Only trace this pid" case was added, but a cut and paste error made that case checked twice, instead of checking the "Trace all but this pid" case. Link: https://lore.kernel.org/all/202111280401.qC0z99JB-lkp@intel.com/ Fixes: 6cb206508b62 ("tracing: Check pid filtering when creating events") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f8965fd50d3b..92be9cb1d7d4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2693,7 +2693,7 @@ trace_create_new_event(struct trace_event_call *call, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || - !trace_pid_list_first(pid_list, &first)) + !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; From d58071a8a76d779eedab38033ae4c821c30295a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Nov 2021 14:09:19 -0800 Subject: [PATCH 21/21] Linux 5.16-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 72b0c3d5cbad..0a6ecc8bb2d2 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Gobble Gobble # *DOCUMENTATION*