From d171011e6adad135eaced630dce26cac9a174037 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Mon, 25 Jul 2022 09:59:03 +0800 Subject: [PATCH 001/145] selftests: futex: Fix 'the the' typo in comment Replace 'the the' with 'the' in the comment. Signed-off-by: Slark Xiao Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220725015903.5449-1-slark_xiao@163.com --- .../futex/functional/futex_requeue_pi_signal_restart.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index f8c43ce8fe66..c6b8f32990c8 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) /* * If res is non-zero, we either requeued the waiter or hit an * error, break out and handle it. If it is zero, then the - * signal may have hit before the the waiter was blocked on f1. + * signal may have hit before the waiter was blocked on f1. * Try again. */ if (res > 0) { From 9aeaf5bc4e30ec968ae660b865ed491a28daf500 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 19 Jul 2022 13:05:48 +0200 Subject: [PATCH 002/145] locking/spinlocks: Mark spinlocks noinline when inline spinlocks are disabled Otherwise LTO will inline them anyways and cause a large kernel text increase. Since the explicit intention here is to not inline them marking them noinline is good documentation even for the non-LTO case. Signed-off-by: Andi Kleen Signed-off-by: Martin Liska Signed-off-by: Jiri Slaby Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20220719110548.1544-1-jslaby@suse.cz --- kernel/locking/spinlock.c | 56 +++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 7f49baaa4979..8475a0794f8c 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -133,7 +133,7 @@ BUILD_LOCK_OPS(write, rwlock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { return __raw_spin_trylock(lock); } @@ -141,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { return __raw_spin_trylock_bh(lock); } @@ -149,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { __raw_spin_lock(lock); } @@ -157,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { return __raw_spin_lock_irqsave(lock); } @@ -165,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { __raw_spin_lock_irq(lock); } @@ -173,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { __raw_spin_lock_bh(lock); } @@ -181,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); } @@ -189,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_unlock_irqrestore(lock, flags); } @@ -197,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { __raw_spin_unlock_irq(lock); } @@ -205,7 +205,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { __raw_spin_unlock_bh(lock); } @@ -215,7 +215,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_bh); #ifndef CONFIG_PREEMPT_RT #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_read_trylock(rwlock_t *lock) { return __raw_read_trylock(lock); } @@ -223,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock(rwlock_t *lock) { __raw_read_lock(lock); } @@ -231,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { return __raw_read_lock_irqsave(lock); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { __raw_read_lock_irq(lock); } @@ -247,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { __raw_read_lock_bh(lock); } @@ -255,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock(rwlock_t *lock) { __raw_read_unlock(lock); } @@ -263,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_read_unlock_irqrestore(lock, flags); } @@ -271,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { __raw_read_unlock_irq(lock); } @@ -279,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { __raw_read_unlock_bh(lock); } @@ -287,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_write_trylock(rwlock_t *lock) { return __raw_write_trylock(lock); } @@ -295,7 +295,7 @@ EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock(rwlock_t *lock) { __raw_write_lock(lock); } @@ -313,7 +313,7 @@ EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { return __raw_write_lock_irqsave(lock); } @@ -321,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { __raw_write_lock_irq(lock); } @@ -329,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { __raw_write_lock_bh(lock); } @@ -337,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock(rwlock_t *lock) { __raw_write_unlock(lock); } @@ -345,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_write_unlock_irqrestore(lock, flags); } @@ -353,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { __raw_write_unlock_irq(lock); } @@ -361,7 +361,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { __raw_write_unlock_bh(lock); } From b830774c7e4168997ba42deb186f72e6a845742b Mon Sep 17 00:00:00 2001 From: Johnson Wang Date: Thu, 18 Aug 2022 20:46:02 +0800 Subject: [PATCH 003/145] dt-bindings: timer: Add compatible for MediaTek MT8188 Add dt-binding documentation of timer for MediaTek MT8188 SoC platform. Signed-off-by: Johnson Wang Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220818124602.20394-1-johnson.wang@mediatek.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt index f1c848af91d3..8bbb6e94508b 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt +++ b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt @@ -25,6 +25,7 @@ Required properties: For those SoCs that use SYST * "mediatek,mt8183-timer" for MT8183 compatible timers (SYST) * "mediatek,mt8186-timer" for MT8186 compatible timers (SYST) + * "mediatek,mt8188-timer" for MT8188 compatible timers (SYST) * "mediatek,mt8192-timer" for MT8192 compatible timers (SYST) * "mediatek,mt8195-timer" for MT8195 compatible timers (SYST) * "mediatek,mt7629-timer" for MT7629 compatible timers (SYST) From fa7fc5243f9e7d64ea7e73c247218f22499c3479 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 26 Jul 2022 22:58:57 +0200 Subject: [PATCH 004/145] dt-bindings: timer: renesas,tmu: Add r8a779f0 support Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring Link: https://lore.kernel.org/r/20220726205858.1199-1-wsa+renesas@sang-engineering.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/renesas,tmu.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml index c57169118b68..60f4c059bcff 100644 --- a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml @@ -37,6 +37,7 @@ properties: - renesas,tmu-r8a77990 # R-Car E3 - renesas,tmu-r8a77995 # R-Car D3 - renesas,tmu-r8a779a0 # R-Car V3U + - renesas,tmu-r8a779f0 # R-Car S4-8 - const: renesas,tmu reg: From 501f7f69bca195da266de83eb2c26c30813fba97 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 10 Aug 2022 15:03:46 -0700 Subject: [PATCH 005/145] locking: Add __lockfunc to slow path functions So that we can skip the functions in the perf lock contention and other places like /proc/PID/wchan. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20220810220346.1919485-1-namhyung@kernel.org --- arch/x86/include/asm/qspinlock_paravirt.h | 13 +++++++------ kernel/locking/qrwlock.c | 4 ++-- kernel/locking/qspinlock.c | 2 +- kernel/locking/qspinlock_paravirt.h | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 892fd8c3a6f7..60ece592b220 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_64BIT -PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock #define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" #define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" @@ -20,9 +20,10 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock * which combines the registers saving trunk and the body of the following - * C code: + * C code. Note that it puts the code in the .spinlock.text section which + * is equivalent to adding __lockfunc in the C code: * - * void __pv_queued_spin_unlock(struct qspinlock *lock) + * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); * @@ -36,7 +37,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .text;" +asm (".pushsection .spinlock.text;" ".globl " PV_UNLOCK ";" ".type " PV_UNLOCK ", @function;" ".align 4,0x90;" @@ -65,8 +66,8 @@ asm (".pushsection .text;" #else /* CONFIG_64BIT */ -extern void __pv_queued_spin_unlock(struct qspinlock *lock); -PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); +extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock); +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text"); #endif /* CONFIG_64BIT */ #endif diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2e1600906c9f..d2ef312a8611 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -18,7 +18,7 @@ * queued_read_lock_slowpath - acquire read lock of a queued rwlock * @lock: Pointer to queued rwlock structure */ -void queued_read_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -63,7 +63,7 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); * queued_write_lock_slowpath - acquire write lock of a queued rwlock * @lock : Pointer to queued rwlock structure */ -void queued_write_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 65a9a10caa6f..2b23378775fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -313,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ -void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; u32 old, tail; diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e84d21aa0722..6afc249ce697 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -489,7 +489,7 @@ gotlock: * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ -__visible void +__visible __lockfunc void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct pv_node *node; @@ -544,7 +544,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #include #ifndef __pv_queued_spin_unlock -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { u8 locked; From 7b3e31869081771c63c3d006347ad06738f843b5 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Thu, 18 Aug 2022 09:45:53 +0800 Subject: [PATCH 006/145] objtool: Use arch_jump_destination() in read_intra_function_calls() Use arch_jump_destiation() instead of the open-coded 'offset + len + immediate' that is x86 specific. Avoids future trouble with other architectures. Signed-off-by: Chen Zhongjin Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220818014553.220261-1-chenzhongjin@huawei.com --- tools/objtool/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 0cec74da7ffe..b012d987a658 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2233,7 +2233,7 @@ static int read_intra_function_calls(struct objtool_file *file) */ insn->type = INSN_JUMP_UNCONDITIONAL; - dest_off = insn->offset + insn->len + insn->immediate; + dest_off = arch_jump_destination(insn); insn->jump_dest = find_insn(file, insn->sec, dest_off); if (!insn->jump_dest) { WARN_FUNC("can't find call dest at %s+0x%lx", From 3c6f3900808c483b0bbb2c351f995c7b880dae14 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2022 09:26:57 -0700 Subject: [PATCH 007/145] objtool: Remove "ANNOTATE_NOENDBR on ENDBR" warning This warning isn't very useful: why would you put ANNOTATE_NOENDBR on ENDBR, and if you did, what's the harm? And thus far it's only found one non-bug, where the '__end_entry_SYSENTER_compat' label happens to land on the ENDBR from entry_SYSCALL_compat: vmlinux.o: warning: objtool: entry_SYSCALL_compat+0x0: ANNOTATE_NOENDBR on ENDBR .. which is fine. Just remove the warning. Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/142341a5dafdfc788e4c95b9e226a6eefc9b626e.1660839773.git.jpoimboe@kernel.org --- tools/objtool/check.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b012d987a658..8b8c8f74a775 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2102,9 +2102,6 @@ static int read_noendbr_hints(struct objtool_file *file) return -1; } - if (insn->type == INSN_ENDBR) - WARN_FUNC("ANNOTATE_NOENDBR on ENDBR", insn->sec, insn->offset); - insn->noendbr = 1; } From 610f9c00ce6ed894caf9bb4feb3c026339150233 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Tue, 23 Aug 2022 07:52:41 +0000 Subject: [PATCH 008/145] mm/slab_common: Remove the unneeded result variable Return the value from __kmem_cache_shrink() directly instead of storing it in another redundant variable. Reported-by: Zeal Robot Signed-off-by: ye xingchen Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 17996649cfe3..0dfa3cfb6be5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -495,13 +495,9 @@ EXPORT_SYMBOL(kmem_cache_destroy); */ int kmem_cache_shrink(struct kmem_cache *cachep) { - int ret; - - kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep); - return ret; + return __kmem_cache_shrink(cachep); } EXPORT_SYMBOL(kmem_cache_shrink); From 2bfbb0271a5b48c01c711d1509a422a7244c9eb8 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 22 Aug 2022 01:38:30 +0000 Subject: [PATCH 009/145] mm/slub: Remove the unneeded result variable Return the value from attribute->store(s, buf, len) and attribute->show(s, buf) directly instead of storing it in another redundant variable. Reported-by: Zeal Robot Acked-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: ye xingchen Signed-off-by: Vlastimil Babka --- mm/slub.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 862dbd9af4f5..dc59b9e8c66f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5826,7 +5826,6 @@ static ssize_t slab_attr_show(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5834,9 +5833,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - err = attribute->show(s, buf); - - return err; + return attribute->show(s, buf); } static ssize_t slab_attr_store(struct kobject *kobj, @@ -5845,7 +5842,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5853,8 +5849,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - err = attribute->store(s, buf, len); - return err; + return attribute->store(s, buf, len); } static void kmem_cache_release(struct kobject *k) From c31a910c74ed558461dc7eecf6168ccf805775ec Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:10 +0900 Subject: [PATCH 010/145] mm/slab: move NUMA-related code to __do_cache_alloc() To implement slab_alloc_node() independent of NUMA configuration, move NUMA fallback/alternate allocation code into __do_cache_alloc(). One functional change here is not to check availability of node when allocating from local node. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slab.c | 68 +++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 10e96137b44f..1656393f55cb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3180,13 +3180,14 @@ must_grow: return obj ? obj : fallback_alloc(cachep, flags); } +static void *__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid); + static __always_inline void * slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *ptr; - int slab_node = numa_mem_id(); struct obj_cgroup *objcg = NULL; bool init = false; @@ -3200,30 +3201,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ goto out_hooks; local_irq_save(save_flags); - - if (nodeid == NUMA_NO_NODE) - nodeid = slab_node; - - if (unlikely(!get_node(cachep, nodeid))) { - /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); - goto out; - } - - if (nodeid == slab_node) { - /* - * Use the locally cached objects if possible. - * However ____cache_alloc does not allow fallback - * to other nodes. It may fail while we still have - * objects on other nodes available. - */ - ptr = ____cache_alloc(cachep, flags); - if (ptr) - goto out; - } - /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); -out: + ptr = __do_cache_alloc(cachep, flags, nodeid); local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); init = slab_want_init_on_alloc(flags, cachep); @@ -3234,31 +3212,46 @@ out_hooks: } static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *objp; + void *objp = NULL; + int slab_node = numa_mem_id(); - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cache, flags); - if (objp) - goto out; + if (nodeid == NUMA_NO_NODE) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cachep, flags); + if (objp) + goto out; + } + /* + * Use the locally cached objects if possible. + * However ____cache_alloc does not allow fallback + * to other nodes. It may fail while we still have + * objects on other nodes available. + */ + objp = ____cache_alloc(cachep, flags); + nodeid = slab_node; + } else if (nodeid == slab_node) { + objp = ____cache_alloc(cachep, flags); + } else if (!get_node(cachep, nodeid)) { + /* Node not bootstrapped yet */ + objp = fallback_alloc(cachep, flags); + goto out; } - objp = ____cache_alloc(cache, flags); /* * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_mem_id()); - + objp = ____cache_alloc_node(cachep, flags, nodeid); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) { return ____cache_alloc(cachep, flags); } @@ -3284,7 +3277,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, goto out; local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); + objp = __do_cache_alloc(cachep, flags, NUMA_NO_NODE); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3521,7 +3514,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: + __do_cache_alloc(s, flags, NUMA_NO_NODE); if (unlikely(!objp)) goto error; From 07588d726f8d320215dcf6c79a28fe6b1bab6255 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:11 +0900 Subject: [PATCH 011/145] mm/slab: cleanup slab_alloc() and slab_alloc_node() Make slab_alloc_node() available even when CONFIG_NUMA=n and make slab_alloc() wrapper of slab_alloc_node(). This is necessary for further cleanup. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slab.c | 49 +++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 1656393f55cb..748dd085f38e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3180,37 +3180,6 @@ must_grow: return obj ? obj : fallback_alloc(cachep, flags); } -static void *__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid); - -static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, - unsigned long caller) -{ - unsigned long save_flags; - void *ptr; - struct obj_cgroup *objcg = NULL; - bool init = false; - - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - ptr = kfence_alloc(cachep, orig_size, flags); - if (unlikely(ptr)) - goto out_hooks; - - local_irq_save(save_flags); - ptr = __do_cache_alloc(cachep, flags, nodeid); - local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - init = slab_want_init_on_alloc(flags, cachep); - -out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); - return ptr; -} - static __always_inline void * __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) { @@ -3259,8 +3228,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unus #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) +slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3277,7 +3246,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, goto out; local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags, NUMA_NO_NODE); + objp = __do_cache_alloc(cachep, flags, nodeid); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3288,6 +3257,14 @@ out: return objp; } +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) +{ + return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, + caller); +} + /* * Caller needs to acquire correct kmem_cache_node's list_lock * @list: List of detached free slabs should be freed by caller @@ -3574,7 +3551,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, cachep->object_size, cachep->size, @@ -3592,7 +3569,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, { void *ret; - ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); + ret = slab_alloc_node(cachep, NULL, flags, nodeid, size, _RET_IP_); ret = kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, cachep, From f78a03f6e28be0283f73d3c18b54837b638a8ccf Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:12 +0900 Subject: [PATCH 012/145] mm/slab_common: remove CONFIG_NUMA ifdefs for common kmalloc functions Now that slab_alloc_node() is available for SLAB when CONFIG_NUMA=n, remove CONFIG_NUMA ifdefs for common kmalloc functions. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 28 ---------------------------- mm/slab.c | 2 -- mm/slob.c | 5 +---- mm/slub.c | 6 ------ 4 files changed, 1 insertion(+), 40 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..4754c834b0e3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -456,38 +456,18 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; -#else -static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __kmalloc(size, flags); -} - -static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) -{ - return kmem_cache_alloc(s, flags); -} -#endif #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) __assume_slab_alignment __alloc_size(3); -#ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) __assume_slab_alignment __alloc_size(4); -#else -static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, size_t size) -{ - return kmem_cache_alloc_trace(s, gfpflags, size); -} -#endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, @@ -701,20 +681,12 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t } -#ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) -#else /* CONFIG_NUMA */ - -#define kmalloc_node_track_caller(size, flags, node) \ - kmalloc_track_caller(size, flags) - -#endif /* CONFIG_NUMA */ - /* * Shortcuts */ diff --git a/mm/slab.c b/mm/slab.c index 748dd085f38e..0acd65358c83 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3535,7 +3535,6 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif -#ifdef CONFIG_NUMA /** * kmem_cache_alloc_node - Allocate an object on the specified node * @cachep: The cache to allocate from. @@ -3609,7 +3608,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, return __do_kmalloc_node(size, flags, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) diff --git a/mm/slob.c b/mm/slob.c index 2bd4f476c340..74d850967213 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -536,14 +536,12 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) } EXPORT_SYMBOL(__kmalloc_track_caller); -#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { return __do_kmalloc_node(size, gfp, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif void kfree(const void *block) { @@ -647,7 +645,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_ return slob_alloc_node(cachep, flags, NUMA_NO_NODE); } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_NUMA + void *__kmalloc_node(size_t size, gfp_t gfp, int node) { return __do_kmalloc_node(size, gfp, node, _RET_IP_); @@ -659,7 +657,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) return slob_alloc_node(cachep, gfp, node); } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif static void __kmem_cache_free(void *b, int size) { diff --git a/mm/slub.c b/mm/slub.c index 862dbd9af4f5..b29b3c9d3175 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3287,7 +3287,6 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif -#ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3314,7 +3313,6 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif -#endif /* CONFIG_NUMA */ /* * Slow path handling. This may still be called frequently since objects @@ -4427,7 +4425,6 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); -#ifdef CONFIG_NUMA static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; @@ -4474,7 +4471,6 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_NUMA */ #ifdef CONFIG_HARDENED_USERCOPY /* @@ -4930,7 +4926,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) } EXPORT_SYMBOL(__kmalloc_track_caller); -#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { @@ -4960,7 +4955,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif #ifdef CONFIG_SYSFS static int count_inuse(struct slab *slab) From c45248db04f8e3aca4798d67a394fb9cc2168118 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:13 +0900 Subject: [PATCH 013/145] mm/slab_common: cleanup kmalloc_track_caller() Make kmalloc_track_caller() wrapper of kmalloc_node_track_caller(). Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 17 ++++++++--------- mm/slab.c | 6 ------ mm/slob.c | 6 ------ mm/slub.c | 22 ---------------------- 4 files changed, 8 insertions(+), 43 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 4754c834b0e3..a0e57df3d5a4 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -651,6 +651,12 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag return kmalloc_array(n, size, flags | __GFP_ZERO); } +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +#define kmalloc_node_track_caller(size, flags, node) \ + __kmalloc_node_track_caller(size, flags, node, \ + _RET_IP_) + /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead @@ -659,9 +665,9 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, _RET_IP_) + __kmalloc_node_track_caller(size, flags, \ + NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, int node) @@ -680,13 +686,6 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } - -extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node_track_caller(size, flags, node, \ - _RET_IP_) - /* * Shortcuts */ diff --git a/mm/slab.c b/mm/slab.c index 0acd65358c83..611e630ff860 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3665,12 +3665,6 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) -{ - return __do_kmalloc(size, flags, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); - /** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. diff --git a/mm/slob.c b/mm/slob.c index 74d850967213..96b08acd72ce 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -530,12 +530,6 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) -{ - return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); - void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { diff --git a/mm/slub.c b/mm/slub.c index b29b3c9d3175..c82a4062f730 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4904,28 +4904,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, gfpflags, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, s, size, s->size, gfpflags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_track_caller); - void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { From 0f853b2e6dd9580103484a098e9c973a67d127ac Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:14 +0900 Subject: [PATCH 014/145] mm/sl[au]b: factor out __do_kmalloc_node() __kmalloc(), __kmalloc_node(), __kmalloc_node_track_caller() mostly do same job. Factor out common code into __do_kmalloc_node(). Note that this patch also fixes missing kasan_kmalloc() in SLUB's __kmalloc_node_track_caller(). Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slab.c | 30 +---------------------- mm/slub.c | 71 +++++++++++++++---------------------------------------- 2 files changed, 20 insertions(+), 81 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 611e630ff860..8c08d7f3dead 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3631,37 +3631,9 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) } #endif -/** - * __do_kmalloc - allocate memory - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). - * @caller: function caller for debug tracking of the caller - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - unsigned long caller) -{ - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = slab_alloc(cachep, NULL, flags, size, caller); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(caller, ret, cachep, - size, cachep->size, flags); - - return ret; -} - void *__kmalloc(size_t size, gfp_t flags) { - return __do_kmalloc(size, flags, _RET_IP_); + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); } EXPORT_SYMBOL(__kmalloc); diff --git a/mm/slub.c b/mm/slub.c index c82a4062f730..f9929ba858ec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4402,29 +4402,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -void *__kmalloc(size_t size, gfp_t flags) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, flags, _RET_IP_, size); - - trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc); - static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; @@ -4442,7 +4419,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) return kmalloc_large_node_hook(ptr, size, flags); } -void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *s; void *ret; @@ -4450,7 +4428,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, flags, node); - trace_kmalloc_node(_RET_IP_, ret, NULL, + trace_kmalloc_node(caller, ret, NULL, size, PAGE_SIZE << get_order(size), flags, node); @@ -4462,16 +4440,28 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); + ret = slab_alloc_node(s, NULL, flags, node, caller, size); - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node); + trace_kmalloc_node(caller, ret, s, size, s->size, flags, node); ret = kasan_kmalloc(s, ret, size, flags); return ret; } + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} EXPORT_SYMBOL(__kmalloc_node); +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + + #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -4905,32 +4895,9 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) } void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) + int node, unsigned long caller) { - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); - - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - gfpflags, node); - - return ret; - } - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node); - - return ret; + return __do_kmalloc_node(size, gfpflags, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); From e4c98d68959e51646c379e157bad36ef0d7bf467 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:15 +0900 Subject: [PATCH 015/145] mm/slab_common: fold kmalloc_order_trace() into kmalloc_large() There is no caller of kmalloc_order_trace() except kmalloc_large(). Fold it into kmalloc_large() and remove kmalloc_order{,_trace}(). Also add tracepoint in kmalloc_large() that was previously in kmalloc_order_trace(). Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 22 ++-------------------- mm/slab_common.c | 17 ++++------------- 2 files changed, 6 insertions(+), 33 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index a0e57df3d5a4..15a4c59da59e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -489,26 +489,8 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __alloc_size(1); - -#ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __alloc_size(1); -#else -static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, - unsigned int order) -{ - return kmalloc_order(size, flags, order); -} -#endif - -static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) -{ - unsigned int order = get_order(size); - return kmalloc_order_trace(size, flags, order); -} - +void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment + __alloc_size(1); /** * kmalloc - allocate memory * @size: how many bytes of memory are required. diff --git a/mm/slab_common.c b/mm/slab_common.c index 17996649cfe3..8b1988544b89 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -905,16 +905,16 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +void *kmalloc_large(size_t size, gfp_t flags) { void *ret = NULL; struct page *page; + unsigned int order = get_order(size); if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); - flags |= __GFP_COMP; - page = alloc_pages(flags, order); + page = alloc_pages(flags | __GFP_COMP, order); if (likely(page)) { ret = page_address(page); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, @@ -923,19 +923,10 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) ret = kasan_kmalloc_large(ret, size, flags); /* As ret might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ret, size, 1, flags); - return ret; -} -EXPORT_SYMBOL(kmalloc_order); - -#ifdef CONFIG_TRACING -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) -{ - void *ret = kmalloc_order(size, flags, order); trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); return ret; } -EXPORT_SYMBOL(kmalloc_order_trace); -#endif +EXPORT_SYMBOL(kmalloc_large); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ From a0c3b940023eef3fa005b2bc37d9312712331dcb Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:16 +0900 Subject: [PATCH 016/145] mm/slub: move kmalloc_large_node() to slab_common.c In later patch SLAB will also pass requests larger than order-1 page to page allocator. Move kmalloc_large_node() to slab_common.c. Fold kmalloc_large_node_hook() into kmalloc_large_node() as there is no other caller. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 4 ++++ mm/slab_common.c | 22 ++++++++++++++++++++++ mm/slub.c | 25 ------------------------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 15a4c59da59e..082499306098 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -491,6 +491,10 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment __alloc_size(1); + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment + __alloc_size(1); + /** * kmalloc - allocate memory * @size: how many bytes of memory are required. diff --git a/mm/slab_common.c b/mm/slab_common.c index 8b1988544b89..1b9101f9cb21 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -928,6 +928,28 @@ void *kmalloc_large(size_t size, gfp_t flags) } EXPORT_SYMBOL(kmalloc_large); +void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); + + flags |= __GFP_COMP; + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + } + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + + return ptr; +} +EXPORT_SYMBOL(kmalloc_large_node); + #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ static void freelist_randomize(struct rnd_state *state, unsigned int *list, diff --git a/mm/slub.c b/mm/slub.c index f9929ba858ec..5e7819ade2c4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1704,14 +1704,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - return ptr; -} - static __always_inline void kfree_hook(void *x) { kmemleak_free(x); @@ -4402,23 +4394,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - return kmalloc_large_node_hook(ptr, size, flags); -} - static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { From bf37d791022ecfb1279ac88c5448a53f1ae40a59 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:17 +0900 Subject: [PATCH 017/145] mm/slab_common: kmalloc_node: pass large requests to page allocator Now that kmalloc_large_node() is in common code, pass large requests to page allocator in kmalloc_node() using kmalloc_large_node(). One problem is that currently there is no tracepoint in kmalloc_large_node(). Instead of simply putting tracepoint in it, use kmalloc_large_node{,_notrace} depending on its caller to show useful address for both inlined kmalloc_node() and __kmalloc_node_track_caller() when large objects are allocated. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 26 +++++++++++++++++++------- mm/slab.h | 2 ++ mm/slab_common.c | 11 ++++++++++- mm/slub.c | 2 +- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 082499306098..fd2e129fc813 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -571,23 +571,35 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } +#ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { -#ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE) { - unsigned int i = kmalloc_index(size); + if (__builtin_constant_p(size)) { + unsigned int index; - if (!i) + if (size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + index = kmalloc_index(size); + + if (!index) return ZERO_SIZE_PTR; return kmem_cache_alloc_node_trace( - kmalloc_caches[kmalloc_type(flags)][i], + kmalloc_caches[kmalloc_type(flags)][index], flags, node, size); } -#endif return __kmalloc_node(size, flags, node); } +#else +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + return __kmalloc_node(size, flags, node); +} +#endif /** * kmalloc_array - allocate memory for an array. diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ec..801a207a5cd7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -275,6 +275,8 @@ void create_kmalloc_caches(slab_flags_t); struct kmem_cache *kmalloc_slab(size_t, gfp_t); #endif +void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node); + gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 1b9101f9cb21..7a0942d54424 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -928,7 +928,7 @@ void *kmalloc_large(size_t size, gfp_t flags) } EXPORT_SYMBOL(kmalloc_large); -void *kmalloc_large_node(size_t size, gfp_t flags, int node) +void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node) { struct page *page; void *ptr = NULL; @@ -948,6 +948,15 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + void *ret = kmalloc_large_node_notrace(size, flags, node); + + trace_kmalloc_node(_RET_IP_, ret, NULL, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; +} EXPORT_SYMBOL(kmalloc_large_node); #ifdef CONFIG_SLAB_FREELIST_RANDOM diff --git a/mm/slub.c b/mm/slub.c index 5e7819ade2c4..165fe87af204 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4401,7 +4401,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller void *ret; if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); + ret = kmalloc_large_node_notrace(size, flags, node); trace_kmalloc_node(caller, ret, NULL, size, PAGE_SIZE << get_order(size), From c4cab557521a73bd803e5c6f613b4e00bd3c4662 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:18 +0900 Subject: [PATCH 018/145] mm/slab_common: cleanup kmalloc_large() Now that kmalloc_large() and kmalloc_large_node() do mostly same job, make kmalloc_large() wrapper of kmalloc_large_node_notrace(). In the meantime, add missing flag fix code in kmalloc_large_node_notrace(). Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 7a0942d54424..51ccd0545816 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -905,28 +905,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_large(size_t size, gfp_t flags) -{ - void *ret = NULL; - struct page *page; - unsigned int order = get_order(size); - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) - flags = kmalloc_fix_flags(flags); - - page = alloc_pages(flags | __GFP_COMP, order); - if (likely(page)) { - ret = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - ret = kasan_kmalloc_large(ret, size, flags); - /* As ret might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ret, size, 1, flags); - trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); - return ret; -} -EXPORT_SYMBOL(kmalloc_large); void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node) { @@ -934,6 +912,9 @@ void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node) void *ptr = NULL; unsigned int order = get_order(size); + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + flags |= __GFP_COMP; page = alloc_pages_node(node, flags, order); if (page) { @@ -949,6 +930,16 @@ void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node) return ptr; } +void *kmalloc_large(size_t size, gfp_t flags) +{ + void *ret = kmalloc_large_node_notrace(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, NULL, size, + PAGE_SIZE << get_order(size), flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_large); + void *kmalloc_large_node(size_t size, gfp_t flags, int node) { void *ret = kmalloc_large_node_notrace(size, flags, node); From d6a71648dbc0ca5520cba16a8fdce8d37ae74218 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:19 +0900 Subject: [PATCH 019/145] mm/slab: kmalloc: pass requests larger than order-1 page to page allocator There is not much benefit for serving large objects in kmalloc(). Let's pass large requests to page allocator like SLUB for better maintenance of common code. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 23 ++++------------- mm/slab.c | 60 +++++++++++++++++++++++++++++++------------- mm/slab.h | 3 +++ mm/slab_common.c | 25 ++++++++++++------ mm/slub.c | 19 -------------- 5 files changed, 68 insertions(+), 62 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index fd2e129fc813..4ee5b2fed164 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,27 +243,17 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLAB /* - * The largest kmalloc size supported by the SLAB allocators is - * 32 megabyte (2^25) or the maximum allocatable page order if that is - * less than 32 MB. - * - * WARNING: Its not easy to increase this value since the allocators have - * to do various tricks to work around compiler limitations in order to - * ensure proper constant folding. + * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ - (MAX_ORDER + PAGE_SHIFT - 1) : 25) -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif #endif #ifdef CONFIG_SLUB -/* - * SLUB directly allocates requests fitting in to an order-1 page - * (PAGE_SIZE*2). Larger requests are passed to the page allocator. - */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW @@ -415,10 +405,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size, if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; - if (size <= 4 * 1024 * 1024) return 22; - if (size <= 8 * 1024 * 1024) return 23; - if (size <= 16 * 1024 * 1024) return 24; - if (size <= 32 * 1024 * 1024) return 25; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); @@ -428,6 +414,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, /* Will never be reached. Needed because the compiler may complain */ return -1; } +static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ diff --git a/mm/slab.c b/mm/slab.c index 8c08d7f3dead..10c9af904410 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3585,11 +3585,19 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) struct kmem_cache *cachep; void *ret; - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = kmalloc_large_node_notrace(size, flags, node); + + trace_kmalloc_node(caller, ret, NULL, size, + PAGE_SIZE << get_order(size), + flags, node); + return ret; + } + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); ret = kasan_kmalloc(cachep, ret, size, flags); @@ -3664,17 +3672,27 @@ EXPORT_SYMBOL(kmem_cache_free); void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { - struct kmem_cache *s; - size_t i; local_irq_disable(); - for (i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { void *objp = p[i]; + struct kmem_cache *s; - if (!orig_s) /* called via kfree_bulk */ - s = virt_to_cache(objp); - else + if (!orig_s) { + struct folio *folio = virt_to_folio(objp); + + /* called via kfree_bulk */ + if (!folio_test_slab(folio)) { + local_irq_enable(); + free_large_kmalloc(folio, objp); + local_irq_disable(); + continue; + } + s = folio_slab(folio)->slab_cache; + } else { s = cache_from_obj(orig_s, objp); + } + if (!s) continue; @@ -3703,20 +3721,24 @@ void kfree(const void *objp) { struct kmem_cache *c; unsigned long flags; + struct folio *folio; trace_kfree(_RET_IP_, objp); if (unlikely(ZERO_OR_NULL_PTR(objp))) return; - local_irq_save(flags); - kfree_debugcheck(objp); - c = virt_to_cache(objp); - if (!c) { - local_irq_restore(flags); + + folio = virt_to_folio(objp); + if (!folio_test_slab(folio)) { + free_large_kmalloc(folio, (void *)objp); return; } - debug_check_no_locks_freed(objp, c->object_size); + c = folio_slab(folio)->slab_cache; + + local_irq_save(flags); + kfree_debugcheck(objp); + debug_check_no_locks_freed(objp, c->object_size); debug_check_no_obj_freed(objp, c->object_size); __cache_free(c, (void *)objp, _RET_IP_); local_irq_restore(flags); @@ -4138,15 +4160,17 @@ void __check_heap_object(const void *ptr, unsigned long n, size_t __ksize(const void *objp) { struct kmem_cache *c; - size_t size; + struct folio *folio; BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - c = virt_to_cache(objp); - size = c ? c->object_size : 0; + folio = virt_to_folio(objp); + if (!folio_test_slab(folio)) + return folio_size(folio); - return size; + c = folio_slab(folio)->slab_cache; + return c->object_size; } EXPORT_SYMBOL(__ksize); diff --git a/mm/slab.h b/mm/slab.h index 801a207a5cd7..9808d537f6ba 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -660,6 +660,9 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) print_tracking(cachep, x); return cachep; } + +void free_large_kmalloc(struct folio *folio, void *object); + #endif /* CONFIG_SLOB */ static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index 51ccd0545816..5a2e81f42ee9 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -744,8 +744,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. - * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is - * kmalloc-32M. + * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is + * kmalloc-2M. */ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(0, 0), @@ -769,11 +769,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(262144, 256k), INIT_KMALLOC_INFO(524288, 512k), INIT_KMALLOC_INFO(1048576, 1M), - INIT_KMALLOC_INFO(2097152, 2M), - INIT_KMALLOC_INFO(4194304, 4M), - INIT_KMALLOC_INFO(8388608, 8M), - INIT_KMALLOC_INFO(16777216, 16M), - INIT_KMALLOC_INFO(33554432, 32M) + INIT_KMALLOC_INFO(2097152, 2M) }; /* @@ -886,6 +882,21 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; } + +void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) diff --git a/mm/slub.c b/mm/slub.c index 165fe87af204..a659874c5d44 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1704,12 +1704,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static __always_inline void kfree_hook(void *x) -{ - kmemleak_free(x); - kasan_kfree_large(x); -} - static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { @@ -3550,19 +3544,6 @@ struct detached_freelist { struct kmem_cache *s; }; -static inline void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kfree_hook(object); - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same From ed4cd17eb26d7f0c6a762608a3f30870929fbcdd Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:20 +0900 Subject: [PATCH 020/145] mm/sl[au]b: introduce common alloc/free functions without tracepoint To unify kmalloc functions in later patch, introduce common alloc/free functions that does not have tracepoint. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slab.c | 36 +++++++++++++++++++++++++++++------- mm/slab.h | 5 +++++ mm/slub.c | 13 +++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 10c9af904410..aa61851b0a07 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3560,6 +3560,14 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) } EXPORT_SYMBOL(kmem_cache_alloc_node); +void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, + unsigned long caller) +{ + return slab_alloc_node(cachep, NULL, flags, nodeid, + orig_size, caller); +} + #ifdef CONFIG_TRACING void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, gfp_t flags, @@ -3645,6 +3653,26 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); +static __always_inline +void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + unsigned long flags; + + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, caller); + local_irq_restore(flags); +} + +void __kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + __do_kmem_cache_free(cachep, objp, caller); +} + /** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. @@ -3655,18 +3683,12 @@ EXPORT_SYMBOL(__kmalloc); */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { - unsigned long flags; cachep = cache_from_obj(cachep, objp); if (!cachep) return; trace_kmem_cache_free(_RET_IP_, objp, cachep->name); - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + __do_kmem_cache_free(cachep, objp, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slab.h b/mm/slab.h index 9808d537f6ba..8eefeed95407 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -273,6 +273,11 @@ void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); #endif void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node); diff --git a/mm/slub.c b/mm/slub.c index a659874c5d44..a11f78c2647c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3262,6 +3262,14 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller) +{ + return slab_alloc_node(s, NULL, gfpflags, node, + caller, orig_size); +} + #ifdef CONFIG_TRACING void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { @@ -3526,6 +3534,11 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) +{ + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); From a579b0560cd74e9edacbc5d6a021bae90159fb91 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 23 Aug 2022 19:03:56 +0200 Subject: [PATCH 021/145] mm/slub: move free_debug_processing() further In the following patch, the function free_debug_processing() will be calling add_partial(), remove_partial() and discard_slab(), se move it below their definitions to avoid forward declarations. To make review easier, separate the move from functional changes. Signed-off-by: Vlastimil Babka Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes --- mm/slub.c | 114 +++++++++++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 862dbd9af4f5..87e794ab101a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1385,63 +1385,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track_update(s, object, TRACK_FREE, addr, handle); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* * Parse a block of slub_debug options. Blocks are delimited by ';' * @@ -2788,6 +2731,63 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + void *object = head; + int cnt = 0; + unsigned long flags, flags2; + int ret = 0; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(slab, &flags2); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + +next_object: + cnt++; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + ret = 1; + +out: + if (cnt != bulk_cnt) + slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", + bulk_cnt, cnt); + + slab_unlock(slab, &flags2); + spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; +} #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) From b40d0156f560932d14e3957579b6508f8d065260 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:49 +0530 Subject: [PATCH 022/145] perf/x86/amd/brs: Move feature-specific functions Move some of the Branch Sampling (BRS) specific functions out of the Core events sources and into the BRS sources in preparation for adding other mechanisms to record branches. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b60283b57179475d18ee242d117c335c16733693.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/brs.c | 69 ++++++++++++++++++++++++++++++++++- arch/x86/events/amd/core.c | 70 ++---------------------------------- arch/x86/events/perf_event.h | 7 ++-- 3 files changed, 76 insertions(+), 70 deletions(-) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index bee8765a1e9b..f1bff153d945 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -81,7 +81,7 @@ static bool __init amd_brs_detect(void) * a br_sel_map. Software filtering is not supported because it would not correlate well * with a sampling period. */ -int amd_brs_setup_filter(struct perf_event *event) +static int amd_brs_setup_filter(struct perf_event *event) { u64 type = event->attr.branch_sample_type; @@ -96,6 +96,73 @@ int amd_brs_setup_filter(struct perf_event *event) return 0; } +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + +int amd_brs_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + + return ret; +} + /* tos = top of stack, i.e., last valid entry written */ static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) { diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9ac3718410ce..e32a27899e11 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,16 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } -#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ -static inline int amd_is_brs_event(struct perf_event *e) -{ - return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; -} - static int amd_core_hw_config(struct perf_event *event) { - int ret = 0; - if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -356,66 +348,10 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - /* - * if branch stack is requested - */ - if (has_branch_stack(event)) { - /* - * Due to interrupt holding, BRS is not recommended in - * counting mode. - */ - if (!is_sampling_event(event)) - return -EINVAL; + if (has_branch_stack(event)) + return amd_brs_hw_config(event); - /* - * Due to the way BRS operates by holding the interrupt until - * lbr_nr entries have been captured, it does not make sense - * to allow sampling on BRS with an event that does not match - * what BRS is capturing, i.e., retired taken branches. - * Otherwise the correlation with the event's period is even - * more loose: - * - * With retired taken branch: - * Effective P = P + 16 + X - * With any other event: - * Effective P = P + Y + X - * - * Where X is the number of taken branches due to interrupt - * skid. Skid is large. - * - * Where Y is the occurences of the event while BRS is - * capturing the lbr_nr entries. - * - * By using retired taken branches, we limit the impact on the - * Y variable. We know it cannot be more than the depth of - * BRS. - */ - if (!amd_is_brs_event(event)) - return -EINVAL; - - /* - * BRS implementation does not work with frequency mode - * reprogramming of the period. - */ - if (event->attr.freq) - return -EINVAL; - /* - * The kernel subtracts BRS depth from period, so it must - * be big enough. - */ - if (event->attr.sample_period <= x86_pmu.lbr_nr) - return -EINVAL; - - /* - * Check if we can allow PERF_SAMPLE_BRANCH_STACK - */ - ret = amd_brs_setup_filter(event); - - /* only set in case of success */ - if (!ret) - event->hw.flags |= PERF_X86_EVENT_AMD_BRS; - } - return ret; + return 0; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba3d24a6a4ec..5deb34e42bbd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1233,6 +1233,9 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); #ifdef CONFIG_PERF_EVENTS_AMD_BRS + +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ + int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1241,7 +1244,7 @@ void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); -int amd_brs_setup_filter(struct perf_event *event); +int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); static inline void amd_pmu_brs_add(struct perf_event *event) @@ -1277,7 +1280,7 @@ static inline void amd_brs_enable(void) {} static inline void amd_brs_drain(void) {} static inline void amd_brs_lopwr_init(void) {} static inline void amd_brs_disable_all(void) {} -static inline int amd_brs_setup_filter(struct perf_event *event) +static inline int amd_brs_hw_config(struct perf_event *event) { return 0; } From 9603aa79e851c652f6da873205c92213af36b24f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:50 +0530 Subject: [PATCH 023/145] perf/x86/amd/core: Refactor branch attributes AMD processors that are capable of recording branches support either Branch Sampling (BRS) or Last Branch Record (LBR). In preparation for adding Last Branch Record Extension Version 2 (LbrExtV2) support, reuse the "branches" capability to advertise information about both BRS and LBR but make the "branch-brs" event exclusive to Family 19h processors that support BRS. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ba4a4cde6db79b1c65c49834027bbdb8a915546b.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e32a27899e11..2f524cf84528 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1247,23 +1247,25 @@ static ssize_t branches_show(struct device *cdev, static DEVICE_ATTR_RO(branches); -static struct attribute *amd_pmu_brs_attrs[] = { +static struct attribute *amd_pmu_branches_attrs[] = { &dev_attr_branches.attr, NULL, }; static umode_t -amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.lbr_nr ? attr->mode : 0; } -static struct attribute_group group_caps_amd_brs = { +static struct attribute_group group_caps_amd_branches = { .name = "caps", - .attrs = amd_pmu_brs_attrs, - .is_visible = amd_brs_is_visible, + .attrs = amd_pmu_branches_attrs, + .is_visible = amd_branches_is_visible, }; +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + EVENT_ATTR_STR(branch-brs, amd_branch_brs, "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); @@ -1272,15 +1274,26 @@ static struct attribute *amd_brs_events_attrs[] = { NULL, }; +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? + attr->mode : 0; +} + static struct attribute_group group_events_amd_brs = { .name = "events", .attrs = amd_brs_events_attrs, .is_visible = amd_brs_is_visible, }; +#endif /* CONFIG_PERF_EVENTS_AMD_BRS */ + static const struct attribute_group *amd_attr_update[] = { - &group_caps_amd_brs, + &group_caps_amd_branches, +#ifdef CONFIG_PERF_EVENTS_AMD_BRS &group_events_amd_brs, +#endif NULL, }; From 706460a96fc654e80b6bed1f562b00d2ce9f2f4d Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:51 +0530 Subject: [PATCH 024/145] perf/x86/amd/core: Add generic branch record interfaces AMD processors that are capable of recording branches support either Branch Sampling (BRS) or Last Branch Record (LBR). In preparation for adding Last Branch Record Extension Version 2 (LbrExtV2) support, introduce new static calls which act as gateways to call into the feature-dependent functions based on what is available on the processor. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b75dbc32663cb395f0d701167e952c6a6b0445a3.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2f524cf84528..ef3520731a20 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,6 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); + static int amd_core_hw_config(struct perf_event *event) { if (event->attr.exclude_host && event->attr.exclude_guest) @@ -349,7 +351,7 @@ static int amd_core_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PAIR; if (has_branch_stack(event)) - return amd_brs_hw_config(event); + return static_call(amd_pmu_branch_hw_config)(event); return 0; } @@ -518,8 +520,14 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +typedef void (amd_pmu_branch_reset_t)(void); +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); + static void amd_pmu_cpu_reset(int cpu) { + if (x86_pmu.lbr_nr) + static_call(amd_pmu_branch_reset)(); + if (x86_pmu.version < 2) return; @@ -576,7 +584,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - amd_brs_reset(); amd_pmu_cpu_reset(cpu); } @@ -771,16 +778,20 @@ static void amd_pmu_v2_disable_all(void) amd_pmu_check_overflow(); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) - amd_pmu_brs_add(event); + static_call(amd_pmu_branch_add)(event); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); + static void amd_pmu_del_event(struct perf_event *event) { if (needs_branch_stack(event)) - amd_pmu_brs_del(event); + static_call(amd_pmu_branch_del)(event); } /* @@ -1184,13 +1195,6 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static void amd_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (sched_in && x86_pmu.lbr_nr) - amd_pmu_brs_sched_task(ctx, sched_in); -} - static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) { /* @@ -1375,8 +1379,14 @@ static int __init amd_core_pmu_init(void) */ if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; - x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; + + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); + static_call_update(amd_pmu_branch_reset, amd_brs_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); + /* * put_event_constraints callback same as Fam17h, set above */ From 257449c6a50298bd21dcabd644f66a0296b78532 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:52 +0530 Subject: [PATCH 025/145] x86/cpufeatures: Add LbrExtV2 feature bit CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 1 of EAX indicates support for Last Branch Record Extension Version 2 (LbrExtV2) features. If found to be set during PMU initialization, the EBX bits of the same leaf can be used to determine the number of available LBR entries. For better utilization of feature words, LbrExtV2 is added as a scattered feature bit. [peterz: Rename to AMD_LBR_V2] Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Acked-by: Borislav Petkov Link: https://lore.kernel.org/r/172d2b0df39306ed77221c45ee1aa62e8ae0548d.1660211399.git.sandipan.das@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 235dc85c91c3..52bdd9465f19 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* FREE! ( 3*32+17) */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fd44b54c90d5..fc01f81f6e2a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; From 703fb765f48897214e3eb110f35dddec80682f60 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:53 +0530 Subject: [PATCH 026/145] perf/x86/amd/lbr: Detect LbrExtV2 support AMD Last Branch Record Extension Version 2 (LbrExtV2) is driven by Core PMC overflows. It records recently taken branches up to the moment when the PMC overflow occurs. Detect the feature during PMU initialization and set the branch stack depth using CPUID leaf 0x80000022 EBX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/fc6e45378ada258f1bab79b0de6e05c393a8f1dd.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/core.c | 9 +++++---- arch/x86/events/amd/lbr.c | 21 +++++++++++++++++++++ arch/x86/events/perf_event.h | 2 ++ arch/x86/include/asm/perf_event.h | 3 ++- 5 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 arch/x86/events/amd/lbr.c diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index b9f5d4610256..527d947eb76b 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index ef3520731a20..a3aa67b4fd50 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1374,10 +1374,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } - /* - * BRS requires special event constraints and flushing on ctxsw. - */ - if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + /* LBR and BRS are mutually exclusive features */ + if (amd_pmu_lbr_init() && !amd_brs_init()) { + /* + * BRS requires special event constraints and flushing on ctxsw. + */ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c new file mode 100644 index 000000000000..4e5b5d35f35a --- /dev/null +++ b/arch/x86/events/amd/lbr.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include "../perf_event.h" + +__init int amd_pmu_lbr_init(void) +{ + union cpuid_0x80000022_ebx ebx; + + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) + return -EOPNOTSUPP; + + /* Set number of entries */ + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz; + + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + + return 0; +} diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5deb34e42bbd..82e8a6d87ade 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1232,6 +1232,8 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); +int amd_pmu_lbr_init(void); + #ifdef CONFIG_PERF_EVENTS_AMD_BRS #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f6fc8dd51ef4..9ac46dbe57d4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -207,7 +207,8 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; - unsigned int reserved:6; + /* Number of available LBR Stack Entries */ + unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; } split; From ca5b7c0d9621702e107c83216316a6d722878b64 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:54 +0530 Subject: [PATCH 027/145] perf/x86/amd/lbr: Add LbrExtV2 branch record support If AMD Last Branch Record Extension Version 2 (LbrExtV2) is detected, enable it alongside LBR Freeze on PMI when an event requests branch stack i.e. PERF_SAMPLE_BRANCH_STACK. Each branch record is represented by a pair of registers, LBR From and LBR To. The freeze feature prevents any updates to these registers once a PMC overflows. The contents remain unchanged until the freeze bit is cleared by the PMI handler. The branch records are read and copied to sample data before unfreezing. However, only valid entries are copied. There is no additional register to denote which of the register pairs represent the top of the stack (TOS) since internal register renaming always ensures that the first pair (i.e. index 0) is the one representing the most recent branch and so on. The LBR registers are per-thread resources and are cleared explicitly whenever a new task is scheduled in. There are no special implications on the contents of these registers when transitioning to deep C-states. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/d3b8500a3627a0d4d0259b005891ee248f248d91.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 47 +++++-- arch/x86/events/amd/lbr.c | 203 +++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 8 ++ arch/x86/include/asm/msr-index.h | 5 + 4 files changed, 252 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a3aa67b4fd50..d799628016c8 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -620,7 +620,7 @@ static inline u64 amd_pmu_get_global_status(void) /* PerfCntrGlobalStatus is read-only */ rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); - return status & amd_pmu_global_cntr_mask; + return status; } static inline void amd_pmu_ack_global_status(u64 status) @@ -631,8 +631,6 @@ static inline void amd_pmu_ack_global_status(u64 status) * clears the same bit in PerfCntrGlobalStatus */ - /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ - status &= amd_pmu_global_cntr_mask; wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); } @@ -742,11 +740,17 @@ static void amd_pmu_v2_enable_event(struct perf_event *event) __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } -static void amd_pmu_v2_enable_all(int added) +static __always_inline void amd_pmu_core_enable_all(void) { amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); } +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_lbr_enable_all(); + amd_pmu_core_enable_all(); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -771,10 +775,15 @@ static void amd_pmu_disable_all(void) amd_pmu_check_overflow(); } +static __always_inline void amd_pmu_core_disable_all(void) +{ + amd_pmu_set_global_ctl(0); +} + static void amd_pmu_v2_disable_all(void) { - /* Disable all PMCs */ - amd_pmu_set_global_ctl(0); + amd_pmu_core_disable_all(); + amd_pmu_lbr_disable_all(); amd_pmu_check_overflow(); } @@ -877,8 +886,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) pmu_enabled = cpuc->enabled; cpuc->enabled = 0; - /* Stop counting */ - amd_pmu_v2_disable_all(); + /* Stop counting but do not disable LBR */ + amd_pmu_core_disable_all(); status = amd_pmu_get_global_status(); @@ -886,6 +895,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!status) goto done; + /* Read branch records before unfreezing */ + if (status & GLOBAL_STATUS_LBRS_FROZEN) { + amd_pmu_lbr_read(); + status &= ~GLOBAL_STATUS_LBRS_FROZEN; + } + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -905,6 +920,9 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -918,7 +936,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) */ WARN_ON(status > 0); - /* Clear overflow bits */ + /* Clear overflow and freeze bits */ amd_pmu_ack_global_status(~status); /* @@ -932,7 +950,7 @@ done: /* Resume counting only if PMU is active */ if (pmu_enabled) - amd_pmu_v2_enable_all(0); + amd_pmu_core_enable_all(); return amd_pmu_adjust_nmi_window(handled); } @@ -1375,7 +1393,14 @@ static int __init amd_core_pmu_init(void) } /* LBR and BRS are mutually exclusive features */ - if (amd_pmu_lbr_init() && !amd_brs_init()) { + if (!amd_pmu_lbr_init()) { + /* LBR requires flushing on context switch */ + x86_pmu.sched_task = amd_pmu_lbr_sched_task; + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + } else if (!amd_brs_init()) { /* * BRS requires special event constraints and flushing on ctxsw. */ diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 4e5b5d35f35a..1dea66f332ae 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -4,6 +4,209 @@ #include "../perf_event.h" +struct branch_entry { + union { + struct { + u64 ip:58; + u64 ip_sign_ext:5; + u64 mispredict:1; + } split; + u64 full; + } from; + + union { + struct { + u64 ip:58; + u64 ip_sign_ext:3; + u64 reserved:1; + u64 spec:1; + u64 valid:1; + } split; + u64 full; + } to; +}; + +static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); +} + +static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); +} + +static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + + return val; +} + +static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + + return val; +} + +static __always_inline u64 sign_ext_branch_ip(u64 ip) +{ + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + return (u64)(((s64)ip << shift) >> shift); +} + +void amd_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_branch_entry *br = cpuc->lbr_entries; + struct branch_entry entry; + int out = 0, i; + + if (!cpuc->lbr_users) + return; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + entry.from.full = amd_pmu_lbr_get_from(i); + entry.to.full = amd_pmu_lbr_get_to(i); + + /* Check if a branch has been logged */ + if (!entry.to.split.valid) + continue; + + perf_clear_branch_entry_bitfields(br + out); + + br[out].from = sign_ext_branch_ip(entry.from.split.ip); + br[out].to = sign_ext_branch_ip(entry.to.split.ip); + br[out].mispred = entry.from.split.mispredict; + br[out].predicted = !br[out].mispred; + out++; + } + + cpuc->lbr_stack.nr = out; + + /* + * Internal register renaming always ensures that LBR From[0] and + * LBR To[0] always represent the TOS + */ + cpuc->lbr_stack.hw_idx = 0; +} + +static int amd_pmu_lbr_setup_filter(struct perf_event *event) +{ + /* No LBR support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + return 0; +} + +int amd_pmu_lbr_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* LBR is not recommended in counting mode */ + if (!is_sampling_event(event)) + return -EINVAL; + + ret = amd_pmu_lbr_setup_filter(event); + if (!ret) + event->attach_state |= PERF_ATTACH_SCHED_CB; + + return ret; +} + +void amd_pmu_lbr_reset(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + if (!x86_pmu.lbr_nr) + return; + + /* Reset all branch records individually */ + for (i = 0; i < x86_pmu.lbr_nr; i++) { + amd_pmu_lbr_set_from(i, 0); + amd_pmu_lbr_set_to(i, 0); + } + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; +} + +void amd_pmu_lbr_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + perf_sched_cb_inc(event->ctx->pmu); + + if (!cpuc->lbr_users++ && !event->total_time_running) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * A context switch can flip the address space and LBR entries are + * not tagged with an identifier. Hence, branches cannot be resolved + * from the old address space and the LBR records should be wiped. + */ + if (cpuc->lbr_users && sched_in) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); +} + +void amd_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); +} + __init int amd_pmu_lbr_init(void) { union cpuid_0x80000022_ebx ebx; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 82e8a6d87ade..e8930414cba5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1233,6 +1233,14 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); int amd_pmu_lbr_init(void); +void amd_pmu_lbr_reset(void); +void amd_pmu_lbr_read(void); +void amd_pmu_lbr_add(struct perf_event *event); +void amd_pmu_lbr_del(struct perf_event *event); +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_enable_all(void); +void amd_pmu_lbr_disable_all(void); +int amd_pmu_lbr_hw_config(struct perf_event *event); #ifdef CONFIG_PERF_EVENTS_AMD_BRS diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6674bdb096f3..109c404e6137 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -585,6 +585,9 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Last Branch Record MSRs */ +#define MSR_AMD64_LBR_SELECT 0xc000010e + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -756,6 +759,8 @@ #define MSR_AMD_DBG_EXTN_CFG 0xc000010f #define MSR_AMD_SAMP_BR_FROM 0xc0010300 +#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 From f4f925dae7419fc7a10af539c073871927ce3a24 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:55 +0530 Subject: [PATCH 028/145] perf/x86/amd/lbr: Add LbrExtV2 hardware branch filter support If AMD Last Branch Record Extension Version 2 (LbrExtV2) is detected, convert the requested branch filter (PERF_SAMPLE_BRANCH_* flags) to the corresponding hardware filter value and stash it in the event data when a branch stack is requested. The hardware filter value is also saved in per-CPU areas for use during event scheduling. Hardware filtering is provided by the LBR Branch Select register. It has bits which when set, suppress recording of the following types of branches: * CPL = 0 (Kernel only) * CPL > 0 (Userspace only) * Conditional Branches * Near Relative Calls * Near Indirect Calls * Near Returns * Near Indirect Jumps (excluding Near Indirect Calls and Near Returns) * Near Relative Jumps (excluding Near Relative Calls) * Far Branches Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/9336af5c9785b8e14c62220fc0e6cfb10ab97de3.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 21 ++++++--- arch/x86/events/amd/lbr.c | 94 +++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d799628016c8..36bede1d7b1e 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -542,16 +542,24 @@ static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, + cpu_to_node(cpu)); + if (!cpuc->lbr_sel) + return -ENOMEM; + WARN_ON_ONCE(cpuc->amd_nb); if (!x86_pmu.amd_nb_constraints) return 0; cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return -ENOMEM; + if (cpuc->amd_nb) + return 0; - return 0; + kfree(cpuc->lbr_sel); + cpuc->lbr_sel = NULL; + + return -ENOMEM; } static void amd_pmu_cpu_starting(int cpu) @@ -589,13 +597,14 @@ static void amd_pmu_cpu_starting(int cpu) static void amd_pmu_cpu_dead(int cpu) { - struct cpu_hw_events *cpuhw; + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + kfree(cpuhw->lbr_sel); + cpuhw->lbr_sel = NULL; if (!x86_pmu.amd_nb_constraints) return; - cpuhw = &per_cpu(cpu_hw_events, cpu); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 1dea66f332ae..bb79b43b7cd8 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -4,6 +4,39 @@ #include "../perf_event.h" +/* LBR Branch Select valid bits */ +#define LBR_SELECT_MASK 0x1ff + +/* + * LBR Branch Select filter bits which when set, ensures that the + * corresponding type of branches are not recorded + */ +#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ +#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ +#define LBR_SELECT_JCC 2 /* Conditional branches */ +#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ +#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ +#define LBR_SELECT_RET_NEAR 5 /* Near returns */ +#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ +#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */ +#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ + +#define LBR_KERNEL BIT(LBR_SELECT_KERNEL) +#define LBR_USER BIT(LBR_SELECT_USER) +#define LBR_JCC BIT(LBR_SELECT_JCC) +#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) +#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) +#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) +#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) +#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) +#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) +#define LBR_NOT_SUPP -1 /* unsupported filter */ +#define LBR_IGNORE 0 + +#define LBR_ANY \ + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) + struct branch_entry { union { struct { @@ -97,12 +130,56 @@ void amd_pmu_lbr_read(void) cpuc->lbr_stack.hw_idx = 0; } +static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, + + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, + + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, + + [PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT] = LBR_NOT_SUPP, +}; + static int amd_pmu_lbr_setup_filter(struct perf_event *event) { + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + /* No LBR support */ if (!x86_pmu.lbr_nr) return -EOPNOTSUPP; + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & BIT_ULL(i))) + continue; + + v = lbr_select_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGNORE) + mask |= v; + } + + /* Filter bits operate in suppress mode */ + reg->config = mask ^ LBR_SELECT_MASK; + return 0; } @@ -137,6 +214,7 @@ void amd_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; + wrmsrl(MSR_AMD64_LBR_SELECT, 0); } void amd_pmu_lbr_add(struct perf_event *event) @@ -146,6 +224,11 @@ void amd_pmu_lbr_add(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (has_branch_stack(event)) { + cpuc->lbr_select = 1; + cpuc->lbr_sel->config = event->hw.branch_reg.config; + } + perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) @@ -159,6 +242,9 @@ void amd_pmu_lbr_del(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (has_branch_stack(event)) + cpuc->lbr_select = 0; + cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); perf_sched_cb_dec(event->ctx->pmu); @@ -180,11 +266,17 @@ void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) void amd_pmu_lbr_enable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 dbg_ctl, dbg_extn_cfg; + u64 lbr_select, dbg_ctl, dbg_extn_cfg; if (!cpuc->lbr_users || !x86_pmu.lbr_nr) return; + /* Set hardware branch filter */ + if (cpuc->lbr_select) { + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + } + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); From 4462fbfe6ec1bfe2196b977010f6ce7b43a32f2c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:56 +0530 Subject: [PATCH 029/145] perf/x86: Move branch classifier Commit 3e702ff6d1ea ("perf/x86: Add LBR software filter support for Intel CPUs") introduces a software branch filter which complements the hardware branch filter and adds an x86 branch classifier. Move the branch classifier to arch/x86/events/ so that it can be utilized by other vendors for branch record filtering. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/bae5b95470d6bd49f40954bd379f414f5afcb965.1660211399.git.sandipan.das@amd.com --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/lbr.c | 273 ----------------------------------- arch/x86/events/perf_event.h | 62 ++++++++ arch/x86/events/utils.c | 216 +++++++++++++++++++++++++++ 4 files changed, 279 insertions(+), 274 deletions(-) create mode 100644 arch/x86/events/utils.c diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9933c0e8e97a..86a76efa8bb6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o probe.o +obj-y += core.o probe.o utils.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4f70fb6c2c1e..7dffc0c731a6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -4,7 +4,6 @@ #include #include -#include #include "../perf_event.h" @@ -65,65 +64,6 @@ #define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ - - X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ - -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - /* * Intel LBR_CTL bits * @@ -1143,219 +1083,6 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return ret; } -/* - * return the type of control flow change at address "from" - * instruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindly read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); -#endif - insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -#define X86_BR_TYPE_MAP_MAX 16 - -static int branch_map[X86_BR_TYPE_MAP_MAX] = { - PERF_BR_CALL, /* X86_BR_CALL */ - PERF_BR_RET, /* X86_BR_RET */ - PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ - PERF_BR_SYSRET, /* X86_BR_SYSRET */ - PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_ERET, /* X86_BR_IRET */ - PERF_BR_COND, /* X86_BR_JCC */ - PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_IRQ, /* X86_BR_IRQ */ - PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_ABORT */ - PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ - PERF_BR_CALL, /* X86_BR_ZERO_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ - PERF_BR_IND, /* X86_BR_IND_JMP */ -}; - -static int -common_branch_type(int type) -{ - int i; - - type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ - - if (type) { - i = __ffs(type); - if (i < X86_BR_TYPE_MAP_MAX) - return branch_map[i]; - } - - return PERF_BR_UNKNOWN; -} - enum { ARCH_LBR_BR_TYPE_JCC = 0, ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e8930414cba5..2de9cd66347e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1210,6 +1210,68 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +int common_branch_type(int type); +int branch_type(unsigned long from, unsigned long to, int abort); + ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c new file mode 100644 index 000000000000..a32368945462 --- /dev/null +++ b/arch/x86/events/utils.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "perf_event.h" + +/* + * return the type of control flow change at address "from" + * instruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindly read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); +#endif + insn_init(&insn, addr, bytes_read, is64); + if (insn_get_opcode(&insn)) + return X86_BR_ABORT; + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } + fallthrough; + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(&insn)) + return X86_BR_ABORT; + + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_IND_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_ERET, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_IRQ, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +int common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} From f9c732249b110fae9ebf4ce33db4cb3a12c6eae3 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:57 +0530 Subject: [PATCH 030/145] perf/x86/amd/lbr: Add LbrExtV2 software branch filter support With AMD Last Branch Record Extension Version 2 (LbrExtV2), it is necessary to process the branch records further as hardware filtering is not granular enough for identifying certain types of branches. E.g. to record system calls, one should record far branches. The filter captures both far calls and far returns but the irrelevant records are filtered out based on the branch type as seen by the branch classifier. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/e51de057517f77788abd393c832e8dea616d489c.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 92 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index bb79b43b7cd8..1a8d27e0c145 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -94,6 +94,50 @@ static __always_inline u64 sign_ext_branch_ip(u64 ip) return (u64)(((s64)ip << shift) >> shift); } +static void amd_pmu_lbr_filter(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int br_sel = cpuc->br_sel, type, i, j; + bool compress = false; + u64 from, to; + + /* If sampling all branches, there is nothing to filter */ + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + type = branch_type(from, to, 0); + + /* If type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; /* mark invalid */ + compress = true; + } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); + } + + if (!compress) + return; + + /* Remove all invalid entries */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + void amd_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -128,6 +172,9 @@ void amd_pmu_lbr_read(void) * LBR To[0] always represent the TOS */ cpuc->lbr_stack.hw_idx = 0; + + /* Perform further software filtering */ + amd_pmu_lbr_filter(); } static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -136,8 +183,8 @@ static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, @@ -150,8 +197,6 @@ static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, - - [PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT] = LBR_NOT_SUPP, }; static int amd_pmu_lbr_setup_filter(struct perf_event *event) @@ -165,6 +210,41 @@ static int amd_pmu_lbr_setup_filter(struct perf_event *event) if (!x86_pmu.lbr_nr) return -EOPNOTSUPP; + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* Ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + + reg->reg = mask; + mask = 0; + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { if (!(br_type & BIT_ULL(i))) continue; @@ -220,13 +300,15 @@ void amd_pmu_lbr_reset(void) void amd_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event_extra *reg = &event->hw.branch_reg; if (!x86_pmu.lbr_nr) return; if (has_branch_stack(event)) { cpuc->lbr_select = 1; - cpuc->lbr_sel->config = event->hw.branch_reg.config; + cpuc->lbr_sel->config = reg->config; + cpuc->br_sel = reg->reg; } perf_sched_cb_inc(event->ctx->pmu); From df3e9612f758fb5f9c251cbe262e3c68ffe67b2c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:58 +0530 Subject: [PATCH 031/145] perf/x86: Make branch classifier fusion-aware With branch fusion and other optimizations, branch sampling hardware in some processors can report a branch from address that points to an instruction preceding the actual branch by several bytes. In such cases, the classifier cannot determine the branch type which leads to failures such as with the recently added test from commit b55878c90ab9 ("perf test: Add test for branch stack sampling"). Branch information is also easier to consume and annotate if branch from addresses always point to branch instructions. Add a new variant of the branch classifier that can account for instruction fusion. If fusion is expected and the current branch from address does not point to a branch instruction, it attempts to find the first branch within the next (MAX_INSN_SIZE - 1) bytes and if found, additionally provides the offset between the reported branch from address and the address of the expected branch instruction. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b6bb0abaa8a54c0b6d716344700ee11a1793d709.1660211399.git.sandipan.das@amd.com --- arch/x86/events/perf_event.h | 2 + arch/x86/events/utils.c | 169 +++++++++++++++++++++-------------- 2 files changed, 102 insertions(+), 69 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2de9cd66347e..93263b98cd6e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1271,6 +1271,8 @@ enum { int common_branch_type(int type); int branch_type(unsigned long from, unsigned long to, int abort); +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset); ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index a32368945462..e013243f360c 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -3,6 +3,68 @@ #include "perf_event.h" +static int decode_branch_type(struct insn *insn) +{ + int ext; + + if (insn_get_opcode(insn)) + return X86_BR_ABORT; + + switch (insn->opcode.bytes[0]) { + case 0xf: + switch (insn->opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + return X86_BR_SYSCALL; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + return X86_BR_SYSRET; + case 0x80 ... 0x8f: /* conditional */ + return X86_BR_JCC; + } + return X86_BR_NONE; + case 0x70 ... 0x7f: /* conditional */ + return X86_BR_JCC; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + return X86_BR_RET; + case 0xcf: /* iret */ + return X86_BR_IRET; + case 0xcc ... 0xce: /* int */ + return X86_BR_INT; + case 0xe8: /* call near rel */ + if (insn_get_immediate(insn) || insn->immediate1.value == 0) { + /* zero length call */ + return X86_BR_ZERO_CALL; + } + fallthrough; + case 0x9a: /* call far absolute */ + return X86_BR_CALL; + case 0xe0 ... 0xe3: /* loop jmp */ + return X86_BR_JCC; + case 0xe9 ... 0xeb: /* jmp */ + return X86_BR_JMP; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(insn)) + return X86_BR_ABORT; + + ext = (insn->modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + return X86_BR_IND_CALL; + case 4: + case 5: + return X86_BR_IND_JMP; + } + return X86_BR_NONE; + } + + return X86_BR_NONE; +} + /* * return the type of control flow change at address "from" * instruction is not necessarily a branch (in case of interrupt). @@ -13,14 +75,22 @@ * If a branch type is unknown OR the instruction cannot be * decoded (e.g., text page not present), then X86_BR_NONE is * returned. + * + * While recording branches, some processors can report the "from" + * address to be that of an instruction preceding the actual branch + * when instruction fusion occurs. If fusion is expected, attempt to + * find the type of the first branch instruction within the next + * MAX_INSN_SIZE bytes and if found, provide the offset between the + * reported "from" address and the actual branch instruction address. */ -int branch_type(unsigned long from, unsigned long to, int abort) +static int get_branch_type(unsigned long from, unsigned long to, int abort, + bool fused, int *offset) { struct insn insn; void *addr; - int bytes_read, bytes_left; + int bytes_read, bytes_left, insn_offset; int ret = X86_BR_NONE; - int ext, to_plm, from_plm; + int to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; int is64 = 0; @@ -83,77 +153,27 @@ int branch_type(unsigned long from, unsigned long to, int abort) is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); #endif insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; + ret = decode_branch_type(&insn); + insn_offset = 0; - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; + /* Check for the possibility of branch fusion */ + while (fused && ret == X86_BR_NONE) { + /* Check for decoding errors */ + if (insn_get_length(&insn) || !insn.length) break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; + insn_offset += insn.length; + bytes_read -= insn.length; + if (bytes_read < 0) break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; + + insn_init(&insn, addr + insn_offset, bytes_read, is64); + ret = decode_branch_type(&insn); } + + if (offset) + *offset = insn_offset; + /* * interrupts, traps, faults (and thus ring transition) may * occur on any instructions. Thus, to classify them correctly, @@ -179,6 +199,17 @@ int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +int branch_type(unsigned long from, unsigned long to, int abort) +{ + return get_branch_type(from, to, abort, false, NULL); +} + +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset) +{ + return get_branch_type(from, to, abort, true, offset); +} + #define X86_BR_TYPE_MAP_MAX 16 static int branch_map[X86_BR_TYPE_MAP_MAX] = { From 245268c19f701c7222dedcb6a383bc73d63925d4 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:59 +0530 Subject: [PATCH 032/145] perf/x86/amd/lbr: Use fusion-aware branch classifier AMD Last Branch Record Extension Version 2 (LbrExtV2) can report a branch from address that points to an instruction preceding the actual branch by several bytes due to branch fusion and further optimizations in Zen4 processors. In such cases, software should move forward sequentially in the instruction stream from the reported address and the address of the first branch encountered should be used instead. Hence, use the fusion-aware branch classifier to determine the correct branch type and get the offset for adjusting the branch from address. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/c324d2d0a9c3976da30b9563d09e50bfee0f264d.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 1a8d27e0c145..eb84f196b2ca 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -97,7 +97,7 @@ static __always_inline u64 sign_ext_branch_ip(u64 ip) static void amd_pmu_lbr_filter(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int br_sel = cpuc->br_sel, type, i, j; + int br_sel = cpuc->br_sel, offset, type, i, j; bool compress = false; u64 from, to; @@ -109,7 +109,15 @@ static void amd_pmu_lbr_filter(void) for (i = 0; i < cpuc->lbr_stack.nr; i++) { from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; - type = branch_type(from, to, 0); + type = branch_type_fused(from, to, 0, &offset); + + /* + * Adjust the branch from address in case of instruction + * fusion where it points to an instruction preceding the + * actual branch + */ + if (offset) + cpuc->lbr_entries[i].from += offset; /* If type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { From 93315e46b000fc80fff5d53c3f444417fb3df6de Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:00 +0530 Subject: [PATCH 033/145] perf/core: Add speculation info to branch entries Add a new "spec" bitfield to branch entries for providing speculation information. This will be populated using hints provided by branch sampling features on supported hardware. The following cases are covered: * No branch speculation information is available * Branch is speculative but taken on the wrong path * Branch is non-speculative but taken on the correct path * Branch is speculative and taken on the correct path Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/834088c302faf21c7b665031dd111f424e509a64.1660211399.git.sandipan.das@amd.com --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee8b9ecdc03b..ae30c61957d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1078,6 +1078,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 03b370062741..30a4723aefd4 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -256,6 +256,17 @@ enum { PERF_BR_MAX, }; +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1363,6 +1374,7 @@ union perf_mem_data_src { * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) * type: branch type + * spec: branch speculation info (or 0 if not supported) */ struct perf_branch_entry { __u64 from; @@ -1373,7 +1385,8 @@ struct perf_branch_entry { abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ type:4, /* branch type */ - reserved:40; + spec:2, /* branch speculation info */ + reserved:38; }; union perf_sample_weight { From 0bc3be5b4bfd5b75086c26d63584a6f7aaea87d5 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:01 +0530 Subject: [PATCH 034/145] perf/x86/amd/lbr: Add LbrExtV2 branch speculation info support Provide branch speculation information captured via AMD Last Branch Record Extension Version 2 (LbrExtV2) by setting the speculation info in branch records. The info is based on the "valid" and "spec" bits in the Branch To registers. Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ddc02f6320464cad0e3ff5bdb2314531568a91bc.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index eb84f196b2ca..2e1c1573efe7 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -146,12 +146,19 @@ static void amd_pmu_lbr_filter(void) } } +static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { + PERF_BR_SPEC_NA, + PERF_BR_SPEC_WRONG_PATH, + PERF_BR_NON_SPEC_CORRECT_PATH, + PERF_BR_SPEC_CORRECT_PATH, +}; + void amd_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_branch_entry *br = cpuc->lbr_entries; struct branch_entry entry; - int out = 0, i; + int out = 0, idx, i; if (!cpuc->lbr_users) return; @@ -160,8 +167,11 @@ void amd_pmu_lbr_read(void) entry.from.full = amd_pmu_lbr_get_from(i); entry.to.full = amd_pmu_lbr_get_to(i); - /* Check if a branch has been logged */ - if (!entry.to.split.valid) + /* + * Check if a branch has been logged; if valid = 0, spec = 0 + * then no branch was recorded + */ + if (!entry.to.split.valid && !entry.to.split.spec) continue; perf_clear_branch_entry_bitfields(br + out); @@ -170,6 +180,25 @@ void amd_pmu_lbr_read(void) br[out].to = sign_ext_branch_ip(entry.to.split.ip); br[out].mispred = entry.from.split.mispredict; br[out].predicted = !br[out].mispred; + + /* + * Set branch speculation information using the status of + * the valid and spec bits. + * + * When valid = 0, spec = 0, no branch was recorded and the + * entry is discarded as seen above. + * + * When valid = 0, spec = 1, the recorded branch was + * speculative but took the wrong path. + * + * When valid = 1, spec = 0, the recorded branch was + * non-speculative but took the correct path. + * + * When valid = 1, spec = 1, the recorded branch was + * speculative and took the correct path + */ + idx = (entry.to.split.valid << 1) | entry.to.split.spec; + br[out].spec = lbr_spec_map[idx]; out++; } From a724ec82966d57e4b5d36341d3e3dc1a3c011564 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:15 +0530 Subject: [PATCH 035/145] perf: Add system error and not in transaction branch types This expands generic branch type classification by adding two more entries there in i.e system error and not in transaction. This also updates the x86 implementation to process X86_BR_NO_TX records as appropriate. This changes branch types reported to user space on x86 platform but it should not be a problem. The possible scenarios and impacts are enumerated here. -------------------------------------------------------------------------- | kernel | perf tool | Impact | -------------------------------------------------------------------------- | old | old | Works as before | -------------------------------------------------------------------------- | old | new | PERF_BR_UNKNOWN is processed | -------------------------------------------------------------------------- | new | old | PERF_BR_NO_TX is blocked via old PERF_BR_MAX | -------------------------------------------------------------------------- | new | new | PERF_BR_NO_TX is recognized | -------------------------------------------------------------------------- When PERF_BR_NO_TX is blocked via old PERF_BR_MAX (new kernel with old perf tool) the user space might throw up an warning complaining about an unrecognized branch types being reported, but it's expected. PERF_BR_SERROR & PERF_BR_NO_TX branch types will be used for BRBE implementation on arm64 platform. PERF_BR_NO_TX complements 'abort' and 'in_tx' elements in perf_branch_entry which represent other transaction states for a given branch record. Because this completes the transaction state classification. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-2-anshuman.khandual@arm.com --- arch/x86/events/utils.c | 2 +- include/uapi/linux/perf_event.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index e013243f360c..5f5617afde79 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -225,7 +225,7 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = { PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ PERF_BR_UNKNOWN, /* X86_BR_ABORT */ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_NO_TX, /* X86_BR_NO_TX */ PERF_BR_CALL, /* X86_BR_ZERO_CALL */ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ PERF_BR_IND, /* X86_BR_IND_JMP */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 30a4723aefd4..a79cc0eb4de7 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -253,6 +253,8 @@ enum { PERF_BR_COND_RET = 10, /* conditional function return */ PERF_BR_ERET = 11, /* exception return */ PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ PERF_BR_MAX, }; From b190bc4ac9e6d9763b61654c5a0c085ff77d7a09 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:16 +0530 Subject: [PATCH 036/145] perf: Extend branch type classification branch_entry.type now has ran out of space to accommodate more branch types classification. This will prevent perf branch stack implementation on arm64 (via BRBE) to capture all available branch types. Extending this bit field i.e branch_entry.type [4 bits] is not an option as it will break user space ABI both for little and big endian perf tools. Extend branch classification with a new field branch_entry.new_type via a new branch type PERF_BR_EXTEND_ABI in branch_entry.type. Perf tools which could decode PERF_BR_EXTEND_ABI, will then parse branch_entry.new_type as well. branch_entry.new_type is a 4 bit field which can hold upto 16 branch types. The first three branch types will hold various generic page faults followed by five architecture specific branch types, which can be overridden by the platform for specific use cases. These architecture specific branch types gets overridden on arm64 platform for BRBE implementation. New generic branch types - PERF_BR_NEW_FAULT_ALGN - PERF_BR_NEW_FAULT_DATA - PERF_BR_NEW_FAULT_INST New arch specific branch types - PERF_BR_NEW_ARCH_1 - PERF_BR_NEW_ARCH_2 - PERF_BR_NEW_ARCH_3 - PERF_BR_NEW_ARCH_4 - PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-3-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index a79cc0eb4de7..fed60e6b10e5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -255,6 +255,7 @@ enum { PERF_BR_IRQ = 12, /* irq */ PERF_BR_SERROR = 13, /* system error */ PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ PERF_BR_MAX, }; @@ -269,6 +270,18 @@ enum { PERF_BR_SPEC_MAX, }; +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1388,7 +1401,8 @@ struct perf_branch_entry { cycles:16, /* cycle count to last branch */ type:4, /* branch type */ spec:2, /* branch speculation info */ - reserved:38; + new_type:4, /* additional branch type */ + reserved:34; }; union perf_sample_weight { From 5402d25aa5710d240040f73fb13d7d5c303ef071 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:17 +0530 Subject: [PATCH 037/145] perf: Capture branch privilege information Platforms like arm64 could capture privilege level information for all the branch records. Hence this adds a new element in the struct branch_entry to record the privilege level information, which could be requested through a new event.attr.branch_sample_type based flag PERF_SAMPLE_BRANCH_PRIV_SAVE. This flag helps user choose whether privilege information is captured. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-4-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fed60e6b10e5..1a258d45a3fa 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -233,6 +235,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -282,6 +286,13 @@ enum { PERF_BR_NEW_MAX, }; +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1402,7 +1413,8 @@ struct perf_branch_entry { type:4, /* branch type */ spec:2, /* branch speculation info */ new_type:4, /* additional branch type */ - reserved:34; + priv:3, /* privilege level */ + reserved:31; }; union perf_sample_weight { From f4054e522531038354bea5c924f286fdd8ae77b5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:18 +0530 Subject: [PATCH 038/145] perf: Add PERF_BR_NEW_ARCH_[N] map for BRBE on arm64 platform BRBE captured branch types will overflow perf_branch_entry.type and generic branch types in perf_branch_entry.new_type. So override each available arch specific branch type in the following manner to comprehensively process all reported branch types in BRBE. PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-5-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1a258d45a3fa..dca16582885f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -293,6 +293,12 @@ enum { PERF_BR_PRIV_HV = 3, }; +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ From 724c299c6a0e412b5679d7ebb9b3f4e00bd2aa78 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:06 +0200 Subject: [PATCH 039/145] perf/hw_breakpoint: Add KUnit test for constraints accounting Add KUnit test for hw_breakpoint constraints accounting, with various interesting mixes of breakpoint targets (some care was taken to catch interesting corner cases via bug-injection). The test cannot be built as a module because it requires access to hw_breakpoint_slots(), which is not inlinable or exported on all architectures. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-2-elver@google.com --- kernel/events/Makefile | 1 + kernel/events/hw_breakpoint_test.c | 323 +++++++++++++++++++++++++++++ lib/Kconfig.debug | 10 + 3 files changed, 334 insertions(+) create mode 100644 kernel/events/hw_breakpoint_test.c diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 8591c180b52b..91a62f566743 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c new file mode 100644 index 000000000000..433c5c45e2a5 --- /dev/null +++ b/kernel/events/hw_breakpoint_test.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for hw_breakpoint constraints accounting logic. + * + * Copyright (C) 2022, Google LLC. + */ + +#include +#include +#include +#include +#include +#include + +#define TEST_REQUIRES_BP_SLOTS(test, slots) \ + do { \ + if ((slots) > get_test_bp_slots()) { \ + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ + get_test_bp_slots()); \ + } \ + } while (0) + +#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) + +#define MAX_TEST_BREAKPOINTS 512 + +static char break_vars[MAX_TEST_BREAKPOINTS]; +static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; +static struct task_struct *__other_task; + +static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) +{ + struct perf_event_attr attr = {}; + + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) + return NULL; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)&break_vars[idx]; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_RW; + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); +} + +static void unregister_test_bp(struct perf_event **bp) +{ + if (WARN_ON(IS_ERR(*bp))) + return; + if (WARN_ON(!*bp)) + return; + unregister_hw_breakpoint(*bp); + *bp = NULL; +} + +static int get_test_bp_slots(void) +{ + static int slots; + + if (!slots) + slots = hw_breakpoint_slots(TYPE_DATA); + + return slots; +} + +static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) +{ + struct perf_event *bp = register_test_bp(cpu, tsk, *id); + + KUNIT_ASSERT_NOT_NULL(test, bp); + KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); + KUNIT_ASSERT_NULL(test, test_bps[*id]); + test_bps[(*id)++] = bp; +} + +/* + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. + * + * Returns true if this can be called again, continuing at @id. + */ +static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) +{ + for (int i = 0; i < get_test_bp_slots() - skip; ++i) + fill_one_bp_slot(test, id, cpu, tsk); + + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; +} + +static int dummy_kthread(void *arg) +{ + return 0; +} + +static struct task_struct *get_other_task(struct kunit *test) +{ + struct task_struct *tsk; + + if (__other_task) + return __other_task; + + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); + __other_task = tsk; + return __other_task; +} + +static int get_test_cpu(int num) +{ + int cpu; + + WARN_ON(num < 0); + + for_each_online_cpu(cpu) { + if (num-- <= 0) + break; + } + + return cpu; +} + +/* ===== Test cases ===== */ + +static void test_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_many_cpus(struct kunit *test) +{ + int idx = 0; + int cpu; + + /* Test that CPUs are independent. */ + for_each_online_cpu(cpu) { + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); + + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); + if (!do_continue) + break; + } +} + +static void test_one_task_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, -1, current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one and adding back CPU-target should work. */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_two_tasks_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + /* Test that tasks are independent. */ + fill_bp_slots(test, &idx, -1, current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one from first task and adding back CPU-target should not work. */ + unregister_test_bp(&test_bps[0]); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_one_task_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* + * Remove one and adding back CPU-target should work; this case is + * special vs. above because the task's constraints are CPU-dependent. + */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_one_task_mixed(struct kunit *test) +{ + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_bp_slots(test, &idx, -1, current, 1); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* Transition from CPU-dependent pinned count to CPU-independent. */ + unregister_test_bp(&test_bps[0]); + unregister_test_bp(&test_bps[1]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_two_tasks_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Can still create breakpoints on some other CPU. */ + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); +} + +static void test_two_tasks_on_one_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Cannot create breakpoints on some other CPU either. */ + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static void test_task_on_all_and_one_cpu(struct kunit *test) +{ + int tsk_on_cpu_idx, cpu_idx; + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_bp_slots(test, &idx, -1, current, 2); + /* Transitioning from only all CPU breakpoints to mixed. */ + tsk_on_cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* We should still be able to use up another CPU's slots. */ + cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); + + /* Transitioning back to task target on all CPUs. */ + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); + /* Still have a CPU target breakpoint in get_test_cpu(1). */ + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + /* Remove it and try again. */ + unregister_test_bp(&test_bps[cpu_idx]); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static struct kunit_case hw_breakpoint_test_cases[] = { + KUNIT_CASE(test_one_cpu), + KUNIT_CASE(test_many_cpus), + KUNIT_CASE(test_one_task_on_all_cpus), + KUNIT_CASE(test_two_tasks_on_all_cpus), + KUNIT_CASE(test_one_task_on_one_cpu), + KUNIT_CASE(test_one_task_mixed), + KUNIT_CASE(test_two_tasks_on_one_cpu), + KUNIT_CASE(test_two_tasks_on_one_all_cpus), + KUNIT_CASE(test_task_on_all_and_one_cpu), + {}, +}; + +static int test_init(struct kunit *test) +{ + /* Most test cases want 2 distinct CPUs. */ + return num_online_cpus() < 2 ? -EINVAL : 0; +} + +static void test_exit(struct kunit *test) +{ + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { + if (test_bps[i]) + unregister_test_bp(&test_bps[i]); + } + + if (__other_task) { + kthread_stop(__other_task); + __other_task = NULL; + } +} + +static struct kunit_suite hw_breakpoint_test_suite = { + .name = "hw_breakpoint", + .test_cases = hw_breakpoint_test_cases, + .init = test_init, + .exit = test_exit, +}; + +kunit_test_suites(&hw_breakpoint_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Elver "); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 072e4b289c13..863b51751566 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2530,6 +2530,16 @@ config STACKINIT_KUNIT_TEST CONFIG_GCC_PLUGIN_STRUCTLEAK, CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF, or CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL. +config HW_BREAKPOINT_KUNIT_TEST + bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS + depends on HAVE_HW_BREAKPOINT + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Tests for hw_breakpoint constraints accounting. + + If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" help From c5b81449f915a28bb9c7725e53aebab3ba39b4a2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:07 +0200 Subject: [PATCH 040/145] perf/hw_breakpoint: Provide hw_breakpoint_is_used() and use in test Provide hw_breakpoint_is_used() to check if breakpoints are in use on the system. Use it in the KUnit test to verify the global state before and after a test case. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-3-elver@google.com --- include/linux/hw_breakpoint.h | 3 +++ kernel/events/hw_breakpoint.c | 29 +++++++++++++++++++++++++++++ kernel/events/hw_breakpoint_test.c | 12 +++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 78dd7035d1e5..a3fb846705eb 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -74,6 +74,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); +extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -121,6 +122,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } +static inline bool hw_breakpoint_is_used(void) { return false; } + static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..fd5cd1f9e7fc 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -604,6 +604,35 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. + */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < nr_slots[type]; ++slot) { + if (info->tsk_pinned[slot]) + return true; + } + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c index 433c5c45e2a5..5ced822df788 100644 --- a/kernel/events/hw_breakpoint_test.c +++ b/kernel/events/hw_breakpoint_test.c @@ -294,7 +294,14 @@ static struct kunit_case hw_breakpoint_test_cases[] = { static int test_init(struct kunit *test) { /* Most test cases want 2 distinct CPUs. */ - return num_online_cpus() < 2 ? -EINVAL : 0; + if (num_online_cpus() < 2) + return -EINVAL; + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + return -EBUSY; + + return 0; } static void test_exit(struct kunit *test) @@ -308,6 +315,9 @@ static void test_exit(struct kunit *test) kthread_stop(__other_task); __other_task = NULL; } + + /* Verify that internal state agrees that no breakpoints are in use. */ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); } static struct kunit_suite hw_breakpoint_test_suite = { From 089cdcb0cd1c25343fa56d3eabbe878df31a7c0e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:08 +0200 Subject: [PATCH 041/145] perf/hw_breakpoint: Clean up headers Clean up headers: - Remove unused - Remove unused - Remove unused - Remove unused - Add for EXPORT_SYMBOL_GPL(). - Add for mutex. - Sort alphabetically. - Move to top to test it compiles on its own. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-4-elver@google.com --- kernel/events/hw_breakpoint.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fd5cd1f9e7fc..6076c6346291 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -17,23 +17,22 @@ * This file contains the arch-independent routines. */ +#include + +#include +#include +#include +#include #include -#include -#include -#include #include #include -#include +#include +#include +#include #include #include -#include #include -#include -#include -#include -#include -#include /* * Constraints data */ From 0370dc314df35579b751d1b77c9169f071444962 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:09 +0200 Subject: [PATCH 042/145] perf/hw_breakpoint: Optimize list of per-task breakpoints On a machine with 256 CPUs, running the recently added perf breakpoint benchmark results in: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 236.418 [sec] | | 123134.794271 usecs/op | 7880626.833333 usecs/op/cpu The benchmark tests inherited breakpoint perf events across many threads. Looking at a perf profile, we can see that the majority of the time is spent in various hw_breakpoint.c functions, which execute within the 'nr_bp_mutex' critical sections which then results in contention on that mutex as well: 37.27% [kernel] [k] osq_lock 34.92% [kernel] [k] mutex_spin_on_owner 12.15% [kernel] [k] toggle_bp_slot 11.90% [kernel] [k] __reserve_bp_slot The culprit here is task_bp_pinned(), which has a runtime complexity of O(#tasks) due to storing all task breakpoints in the same list and iterating through that list looking for a matching task. Clearly, this does not scale to thousands of tasks. Instead, make use of the "rhashtable" variant "rhltable" which stores multiple items with the same key in a list. This results in average runtime complexity of O(1) for task_bp_pinned(). With the optimization, the benchmark shows: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.208 [sec] | | 108.422396 usecs/op | 6939.033333 usecs/op/cpu On this particular setup that's a speedup of ~1135x. While one option would be to make task_struct a breakpoint list node, this would only further bloat task_struct for infrequently used data. Furthermore, after all optimizations in this series, there's no evidence it would result in better performance: later optimizations make the time spent looking up entries in the hash table negligible (we'll reach the theoretical ideal performance i.e. no constraints). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com --- include/linux/perf_event.h | 3 +- kernel/events/hw_breakpoint.c | 56 ++++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae30c61957d2..1999408a9cbb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks { }; #ifdef CONFIG_HAVE_HW_BREAKPOINT +#include #include #endif @@ -178,7 +179,7 @@ struct hw_perf_event { * creation and event initalization. */ struct arch_hw_breakpoint info; - struct list_head bp_list; + struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6076c6346291..6d09edc80d19 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -26,10 +26,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) } /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, +}; static int constraints_initialized; @@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, /* * Add/remove the given breakpoint in our constraint table */ -static void +static int toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { @@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + return 0; } /* Pinned counter task profiling */ @@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, toggle_bp_task_slot(bp, cpu, type, weight); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); else - list_del(&bp->hw.bp_list); + return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (ret) return ret; - toggle_bp_slot(bp, true, type, weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) @@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) @@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { int cpu, err_cpu; - int i; + int i, ret; for (i = 0; i < TYPE_MAX; i++) nr_slots[i] = hw_breakpoint_slots(i); @@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void) info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; + if (!info->tsk_pinned) { + ret = -ENOMEM; + goto err; + } } } + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + goto err; + constraints_initialized = 1; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - err_alloc: +err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) kfree(get_bp_info(err_cpu, i)->tsk_pinned); @@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void) break; } - return -ENOMEM; + return ret; } - - From db5f6f853194c5e02d8551425b5e86b7e0b81806 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:10 +0200 Subject: [PATCH 043/145] perf/hw_breakpoint: Mark data __ro_after_init Mark read-only data after initialization as __ro_after_init. While we are here, turn 'constraints_initialized' into a bool. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-6-elver@google.com --- kernel/events/hw_breakpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6d09edc80d19..7df46b276452 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -46,7 +46,7 @@ struct bp_cpuinfo { }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX]; +static int nr_slots[TYPE_MAX] __ro_after_init; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { @@ -62,7 +62,7 @@ static const struct rhashtable_params task_bps_ht_params = { .automatic_shrinking = true, }; -static int constraints_initialized; +static bool constraints_initialized __ro_after_init; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -739,7 +739,7 @@ int __init init_hw_breakpoint(void) if (ret) goto err; - constraints_initialized = 1; + constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); From be3f152568cc7f5f573d21d5f86a2c4f3cc047ab Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:11 +0200 Subject: [PATCH 044/145] perf/hw_breakpoint: Optimize constant number of breakpoint slots Optimize internal hw_breakpoint state if the architecture's number of breakpoint slots is constant. This avoids several kmalloc() calls and potentially unnecessary failures if the allocations fail, as well as subtly improves code generation and cache locality. The protocol is that if an architecture defines hw_breakpoint_slots via the preprocessor, it must be constant and the same for all types. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-7-elver@google.com --- arch/sh/include/asm/hw_breakpoint.h | 5 +- arch/x86/include/asm/hw_breakpoint.h | 5 +- kernel/events/hw_breakpoint.c | 94 ++++++++++++++++++---------- 3 files changed, 63 insertions(+), 41 deletions(-) diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 199d17b765f2..361a0f57bdeb 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -48,10 +48,7 @@ struct pmu; /* Maximum number of UBC channels */ #define HBP_NUM 2 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) /* arch/sh/kernel/hw_breakpoint.c */ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index a1f0e90d0818..0bc931cd0698 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -44,10 +44,7 @@ struct arch_hw_breakpoint { /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) struct perf_event_attr; struct perf_event; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 7df46b276452..9fb66d358d81 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -40,13 +40,16 @@ struct bp_cpuinfo { /* Number of pinned cpu breakpoints in a cpu */ unsigned int cpu_pinned; /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ +#ifdef hw_breakpoint_slots + unsigned int tsk_pinned[hw_breakpoint_slots(0)]; +#else unsigned int *tsk_pinned; +#endif /* Number of non-pinned cpu/task breakpoints in a cpu */ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX] __ro_after_init; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { @@ -73,6 +76,54 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. + */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL); + if (!info->tsk_pinned) + goto err; + } + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + kfree(get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + + return -ENOMEM; +} +#endif + __weak int hw_breakpoint_weight(struct perf_event *bp) { return 1; @@ -95,7 +146,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; - for (i = nr_slots[type] - 1; i >= 0; i--) { + for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -312,7 +363,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); @@ -632,7 +683,7 @@ bool hw_breakpoint_is_used(void) if (info->cpu_pinned) return true; - for (int slot = 0; slot < nr_slots[type]; ++slot) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { if (info->tsk_pinned[slot]) return true; } @@ -716,42 +767,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i, ret; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); - - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); - - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) { - ret = -ENOMEM; - goto err; - } - } - } + int ret; ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); if (ret) - goto err; + return ret; + + ret = init_breakpoint_slots(); + if (ret) + return ret; constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - -err: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return ret; } From 9caf87be118f4639537404eeb67dd444a3716e9a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:12 +0200 Subject: [PATCH 045/145] perf/hw_breakpoint: Make hw_breakpoint_weight() inlinable Due to being a __weak function, hw_breakpoint_weight() will cause the compiler to always emit a call to it. This generates unnecessarily bad code (register spills etc.) for no good reason; in fact it appears in profiles of `perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512`: ... 0.70% [kernel] [k] hw_breakpoint_weight ... While a small percentage, no architecture defines its own hw_breakpoint_weight() nor are there users outside hw_breakpoint.c, which makes the fact it is currently __weak a poor choice. Change hw_breakpoint_weight()'s definition to follow a similar protocol to hw_breakpoint_slots(), such that if defines hw_breakpoint_weight(), we'll use it instead. The result is that it is inlined and no longer shows up in profiles. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-8-elver@google.com --- include/linux/hw_breakpoint.h | 1 - kernel/events/hw_breakpoint.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a3fb846705eb..f319bd26b030 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -80,7 +80,6 @@ extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); -int hw_breakpoint_weight(struct perf_event *bp); int arch_reserve_bp_slot(struct perf_event *bp); void arch_release_bp_slot(struct perf_event *bp); void arch_unregister_hw_breakpoint(struct perf_event *bp); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9fb66d358d81..9c9bf17666a5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -124,10 +124,12 @@ err: } #endif -__weak int hw_breakpoint_weight(struct perf_event *bp) +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { From 24198ad373ad1e30b638aa147142dc21ab5757e7 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:13 +0200 Subject: [PATCH 046/145] perf/hw_breakpoint: Remove useless code related to flexible breakpoints Flexible breakpoints have never been implemented, with bp_cpuinfo::flexible always being 0. Unfortunately, they still occupy 4 bytes in each bp_cpuinfo and bp_busy_slots, as well as computing the max flexible count in fetch_bp_busy_slots(). This again causes suboptimal code generation, when we always know that `!!slots.flexible` will be 0. Just get rid of the flexible "placeholder" and remove all real code related to it. Make a note in the comment related to the constraints algorithm but don't remove them from the algorithm, so that if in future flexible breakpoints need supporting, it should be trivial to revive them (along with reverting this change). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-9-elver@google.com --- kernel/events/hw_breakpoint.c | 57 +++++++++++------------------------ 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9c9bf17666a5..8b40fca1a063 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -45,8 +45,6 @@ struct bp_cpuinfo { #else unsigned int *tsk_pinned; #endif - /* Number of non-pinned cpu/task breakpoints in a cpu */ - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); @@ -67,12 +65,6 @@ static const struct rhashtable_params task_bps_ht_params = { static bool constraints_initialized __ro_after_init; -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; - /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); @@ -190,14 +182,14 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp) } /* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). + * Returns the max pinned breakpoint slots in a given + * CPU (cpu > -1) or across all of them (cpu = -1). */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) +static int +max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); + int pinned_slots = 0; int cpu; for_each_cpu(cpu, cpumask) { @@ -210,24 +202,10 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, else nr += task_bp_pinned(cpu, bp, type); - if (nr > slots->pinned) - slots->pinned = nr; - - nr = info->flexible; - if (nr > slots->flexible) - slots->flexible = nr; + pinned_slots = max(nr, pinned_slots); } -} -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; + return pinned_slots; } /* @@ -298,7 +276,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Constraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter. + * + * Note: Flexible breakpoints are currently unimplemented, but outlined in the + * below algorithm for completeness. The implementation treats flexible as + * pinned due to no guarantee that we currently always schedule flexible events + * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * @@ -340,8 +323,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { - struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int max_pinned_slots; int weight; int ret; @@ -357,15 +340,9 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. - */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type)) + /* Check if this new breakpoint can be satisfied across all CPUs. */ + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); From f95e5a3d59011eec1257d0e76de1e1f8969d426f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:14 +0200 Subject: [PATCH 047/145] powerpc/hw_breakpoint: Avoid relying on caller synchronization Internal data structures (cpu_bps, task_bps) of powerpc's hw_breakpoint implementation have relied on nr_bp_mutex serializing access to them. Before overhauling synchronization of kernel/events/hw_breakpoint.c, introduce 2 spinlocks to synchronize cpu_bps and task_bps respectively, thus avoiding reliance on callers synchronizing powerpc's hw_breakpoint. Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-10-elver@google.com --- arch/powerpc/kernel/hw_breakpoint.c | 53 ++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 2669f80b3a49..8db1a15d7acb 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -129,7 +130,14 @@ struct breakpoint { bool ptrace_bp; }; +/* + * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot + * rely on it safely synchronizing internals here; however, we can rely on it + * not requesting more breakpoints than available. + */ +static DEFINE_SPINLOCK(cpu_bps_lock); static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]); +static DEFINE_SPINLOCK(task_bps_lock); static LIST_HEAD(task_bps); static struct breakpoint *alloc_breakpoint(struct perf_event *bp) @@ -174,7 +182,9 @@ static int task_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&task_bps_lock); list_add(&tmp->list, &task_bps); + spin_unlock(&task_bps_lock); return 0; } @@ -182,6 +192,7 @@ static void task_bps_remove(struct perf_event *bp) { struct list_head *pos, *q; + spin_lock(&task_bps_lock); list_for_each_safe(pos, q, &task_bps) { struct breakpoint *tmp = list_entry(pos, struct breakpoint, list); @@ -191,6 +202,7 @@ static void task_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&task_bps_lock); } /* @@ -200,12 +212,17 @@ static void task_bps_remove(struct perf_event *bp) static bool all_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { - if (!can_co_exist(tmp, bp)) - return true; + if (!can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } /* @@ -215,13 +232,18 @@ static bool all_task_bps_check(struct perf_event *bp) static bool same_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { if (tmp->bp->hw.target == bp->hw.target && - !can_co_exist(tmp, bp)) - return true; + !can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } static int cpu_bps_add(struct perf_event *bp) @@ -234,6 +256,7 @@ static int cpu_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) { @@ -241,6 +264,7 @@ static int cpu_bps_add(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); return 0; } @@ -249,6 +273,7 @@ static void cpu_bps_remove(struct perf_event *bp) struct breakpoint **cpu_bp; int i = 0; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) @@ -260,19 +285,25 @@ static void cpu_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); } static bool cpu_bps_check(int cpu, struct perf_event *bp) { struct breakpoint **cpu_bp; + bool ret = false; int i; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, cpu); for (i = 0; i < nr_wp_slots(); i++) { - if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) - return true; + if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&cpu_bps_lock); + return ret; } static bool all_cpu_bps_check(struct perf_event *bp) @@ -286,10 +317,6 @@ static bool all_cpu_bps_check(struct perf_event *bp) return false; } -/* - * We don't use any locks to serialize accesses to cpu_bps or task_bps - * because are already inside nr_bp_mutex. - */ int arch_reserve_bp_slot(struct perf_event *bp) { int ret; From 01fe8a3f818e1074a9a95d624be4549ee7ea2b2b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:15 +0200 Subject: [PATCH 048/145] locking/percpu-rwsem: Add percpu_is_write_locked() and percpu_is_read_locked() Implement simple accessors to probe percpu-rwsem's locked state: percpu_is_write_locked(), percpu_is_read_locked(). Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-11-elver@google.com --- include/linux/percpu-rwsem.h | 6 ++++++ kernel/locking/percpu-rwsem.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5fda40f97fe9..36b942b67b7d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) +{ + return atomic_read(&sem->block); +} + extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 5fe4c5495ba3..185bd1c906b0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any From 0912037fec1136d4e4796a3481f4a4ee09a2c325 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:16 +0200 Subject: [PATCH 049/145] perf/hw_breakpoint: Reduce contention with large number of tasks While optimizing task_bp_pinned()'s runtime complexity to O(1) on average helps reduce time spent in the critical section, we still suffer due to serializing everything via 'nr_bp_mutex'. Indeed, a profile shows that now contention is the biggest issue: 95.93% [kernel] [k] osq_lock 0.70% [kernel] [k] mutex_spin_on_owner 0.22% [kernel] [k] smp_cfm_core_cond 0.18% [kernel] [k] task_bp_pinned 0.18% [kernel] [k] rhashtable_jhash2 0.15% [kernel] [k] queued_spin_lock_slowpath when running the breakpoint benchmark with (system with 256 CPUs): | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.207 [sec] | | 108.267188 usecs/op | 6929.100000 usecs/op/cpu The main concern for synchronizing the breakpoint constraints data is that a consistent snapshot of the per-CPU and per-task data is observed. The access pattern is as follows: 1. If the target is a task: the task's pinned breakpoints are counted, checked for space, and then appended to; only bp_cpuinfo::cpu_pinned is used to check for conflicts with CPU-only breakpoints; bp_cpuinfo::tsk_pinned are incremented/decremented, but otherwise unused. 2. If the target is a CPU: bp_cpuinfo::cpu_pinned are counted, along with bp_cpuinfo::tsk_pinned; after a successful check, cpu_pinned is incremented. No per-task breakpoints are checked. Since rhltable safely synchronizes insertions/deletions, we can allow concurrency as follows: 1. If the target is a task: independent tasks may update and check the constraints concurrently, but same-task target calls need to be serialized; since bp_cpuinfo::tsk_pinned is only updated, but not checked, these modifications can happen concurrently by switching tsk_pinned to atomic_t. 2. If the target is a CPU: access to the per-CPU constraints needs to be serialized with other CPU-target and task-target callers (to stabilize the bp_cpuinfo::tsk_pinned snapshot). We can allow the above concurrency by introducing a per-CPU constraints data reader-writer lock (bp_cpuinfo_sem), and per-task mutexes (reuses task_struct::perf_event_mutex): 1. If the target is a task: acquires perf_event_mutex, and acquires bp_cpuinfo_sem as a reader. The choice of percpu-rwsem minimizes contention in the presence of many read-lock but few write-lock acquisitions: we assume many orders of magnitude more task target breakpoints creations/destructions than CPU target breakpoints. 2. If the target is a CPU: acquires bp_cpuinfo_sem as a writer. With these changes, contention with thousands of tasks is reduced to the point where waiting on locking no longer dominates the profile: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.077 [sec] | | 40.201563 usecs/op | 2572.900000 usecs/op/cpu 21.54% [kernel] [k] task_bp_pinned 20.18% [kernel] [k] rhashtable_jhash2 6.81% [kernel] [k] toggle_bp_slot 5.47% [kernel] [k] queued_spin_lock_slowpath 3.75% [kernel] [k] smp_cfm_core_cond 3.48% [kernel] [k] bcmp On this particular setup that's a speedup of 2.7x. We're also getting closer to the theoretical ideal performance through optimizations in hw_breakpoint.c -- constraints accounting disabled: | perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.067 [sec] | | 35.286458 usecs/op | 2258.333333 usecs/op/cpu Which means the current implementation is ~12% slower than the theoretical ideal. For reference, performance without any breakpoints: | $> bench -r 30 breakpoint thread -b 0 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 0 breakpoints and 64 parallelism | Total time: 0.060 [sec] | | 31.365625 usecs/op | 2007.400000 usecs/op/cpu On a system with 256 CPUs, the theoretical ideal is only ~12% slower than no breakpoints at all; the current implementation is ~28% slower. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-12-elver@google.com --- kernel/events/hw_breakpoint.c | 161 ++++++++++++++++++++++++++++------ 1 file changed, 133 insertions(+), 28 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 8b40fca1a063..229c6f4fae75 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -41,9 +43,9 @@ struct bp_cpuinfo { unsigned int cpu_pinned; /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ #ifdef hw_breakpoint_slots - unsigned int tsk_pinned[hw_breakpoint_slots(0)]; + atomic_t tsk_pinned[hw_breakpoint_slots(0)]; #else - unsigned int *tsk_pinned; + atomic_t *tsk_pinned; #endif }; @@ -65,8 +67,79 @@ static const struct rhashtable_params task_bps_ht_params = { static bool constraints_initialized __ro_after_init; -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); +/* + * Synchronizes accesses to the per-CPU constraints; the locking rules are: + * + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock + * (due to bp_slots_histogram::count being atomic, no update are lost). + * + * 2. Holding a write-lock is required for computations that require a + * stable snapshot of all bp_cpuinfo::tsk_pinned. + * + * 3. In all other cases, non-atomic accesses require the appropriately held + * lock (read-lock for read-only accesses; write-lock for reads/writes). + */ +DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); + +/* + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since + * rhltable synchronizes concurrent insertions/deletions, independent tasks may + * insert/delete concurrently; therefore, a mutex per task is sufficient. + * + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is + * that hw_breakpoint may contend with per-task perf event list management. The + * assumption is that perf usecases involving hw_breakpoints are very unlikely + * to result in unnecessary contention. + */ +static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; + + return tsk ? &tsk->perf_event_mutex : NULL; +} + +static struct mutex *bp_constraints_lock(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) { + mutex_lock(tsk_mtx); + percpu_down_read(&bp_cpuinfo_sem); + } else { + percpu_down_write(&bp_cpuinfo_sem); + } + + return tsk_mtx; +} + +static void bp_constraints_unlock(struct mutex *tsk_mtx) +{ + if (tsk_mtx) { + percpu_up_read(&bp_cpuinfo_sem); + mutex_unlock(tsk_mtx); + } else { + percpu_up_write(&bp_cpuinfo_sem); + } +} + +static bool bp_constraints_is_locked(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + return percpu_is_write_locked(&bp_cpuinfo_sem) || + (tsk_mtx ? mutex_is_locked(tsk_mtx) : + percpu_is_read_locked(&bp_cpuinfo_sem)); +} + +static inline void assert_bp_constraints_lock_held(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) + lockdep_assert_held(tsk_mtx); + lockdep_assert_held(&bp_cpuinfo_sem); +} #ifdef hw_breakpoint_slots /* @@ -97,7 +170,7 @@ static __init int init_breakpoint_slots(void) for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); - info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL); + info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL); if (!info->tsk_pinned) goto err; } @@ -137,11 +210,19 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; + atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; + /* + * At this point we want to have acquired the bp_cpuinfo_sem as a + * writer to ensure that there are no concurrent writers in + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. + */ + lockdep_assert_held_write(&bp_cpuinfo_sem); + for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) + ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */ + if (atomic_read(&tsk_pinned[i]) > 0) return i + 1; } @@ -158,6 +239,11 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) struct perf_event *iter; int count = 0; + /* + * We need a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + rcu_read_lock(); head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); if (!head) @@ -214,16 +300,25 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; + atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int old_idx, new_idx; + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + old_idx = task_bp_pinned(cpu, bp, type) - 1; new_idx = old_idx + weight; if (old_idx >= 0) - tsk_pinned[old_idx]--; + atomic_dec(&tsk_pinned[old_idx]); if (new_idx >= 0) - tsk_pinned[new_idx]++; + atomic_inc(&tsk_pinned[new_idx]); } /* @@ -241,6 +336,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { + lockdep_assert_held_write(&bp_cpuinfo_sem); get_bp_info(bp->cpu, type)->cpu_pinned += weight; return 0; } @@ -249,6 +345,11 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, for_each_cpu(cpu, cpumask) toggle_bp_task_slot(bp, cpu, type, weight); + /* + * Readers want a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + if (enable) return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); else @@ -354,14 +455,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) int reserve_bp_slot(struct perf_event *bp) { - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); + bp_constraints_unlock(mtx); return ret; } @@ -379,12 +476,11 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) void release_bp_slot(struct perf_event *bp) { - mutex_lock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) @@ -411,11 +507,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { - int ret; + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __modify_bp_slot(bp, old_type, new_type); - mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type, new_type); - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); return ret; } @@ -426,18 +521,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) */ int dbg_reserve_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + int ret; + + if (bp_constraints_is_locked(bp)) return -1; - return __reserve_bp_slot(bp, bp->attr.bp_type); + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); + + return ret; } int dbg_release_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + if (bp_constraints_is_locked(bp)) return -1; + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); return 0; } @@ -663,7 +768,7 @@ bool hw_breakpoint_is_used(void) return true; for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { - if (info->tsk_pinned[slot]) + if (atomic_read(&info->tsk_pinned[slot])) return true; } } From 16db2839a5a59c242df77308cf57342ce0c3768e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:17 +0200 Subject: [PATCH 050/145] perf/hw_breakpoint: Introduce bp_slots_histogram Factor out the existing `atomic_t count[N]` into its own struct called 'bp_slots_histogram', to generalize and make its intent clearer in preparation of reusing elsewhere. The basic idea of bucketing "total uses of N slots" resembles a histogram, so calling it such seems most intuitive. No functional change. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-13-elver@google.com --- kernel/events/hw_breakpoint.c | 100 ++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 229c6f4fae75..03ebecf048c0 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -36,17 +36,25 @@ #include /* - * Constraints data + * Datastructure to track the total uses of N slots across tasks or CPUs; + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. + */ +struct bp_slots_histogram { +#ifdef hw_breakpoint_slots + atomic_t count[hw_breakpoint_slots(0)]; +#else + atomic_t *count; +#endif +}; + +/* + * Per-CPU constraints data. */ struct bp_cpuinfo { - /* Number of pinned cpu breakpoints in a cpu */ - unsigned int cpu_pinned; - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ -#ifdef hw_breakpoint_slots - atomic_t tsk_pinned[hw_breakpoint_slots(0)]; -#else - atomic_t *tsk_pinned; -#endif + /* Number of pinned CPU breakpoints in a CPU. */ + unsigned int cpu_pinned; + /* Histogram of pinned task breakpoints in a CPU. */ + struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); @@ -159,6 +167,18 @@ static inline int hw_breakpoint_slots_cached(int type) return __nr_bp_slots[type]; } +static __init bool +bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); + return hist->count; +} + +static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) +{ + kfree(hist->count); +} + static __init int init_breakpoint_slots(void) { int i, cpu, err_cpu; @@ -170,8 +190,7 @@ static __init int init_breakpoint_slots(void) for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); - info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL); - if (!info->tsk_pinned) + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) goto err; } } @@ -180,7 +199,7 @@ static __init int init_breakpoint_slots(void) err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } @@ -189,6 +208,34 @@ err: } #endif +static inline void +bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) +{ + const int old_idx = old - 1; + const int new_idx = old_idx + val; + + if (old_idx >= 0) + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); + if (new_idx >= 0) + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); +} + +static int +bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count = atomic_read(&hist->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); + if (count > 0) + return i + 1; + WARN(count < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + #ifndef hw_breakpoint_weight static inline int hw_breakpoint_weight(struct perf_event *bp) { @@ -205,13 +252,11 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) } /* - * Report the maximum number of pinned breakpoints a task - * have in this cpu + * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int i; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; /* * At this point we want to have acquired the bp_cpuinfo_sem as a @@ -219,14 +264,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. */ lockdep_assert_held_write(&bp_cpuinfo_sem); - - for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { - ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */ - if (atomic_read(&tsk_pinned[i]) > 0) - return i + 1; - } - - return 0; + return bp_slots_histogram_max(tsk_pinned, type); } /* @@ -300,8 +338,7 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int old_idx, new_idx; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; /* * If bp->hw.target, tsk_pinned is only modified, but not used @@ -311,14 +348,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. */ lockdep_assert_held_read(&bp_cpuinfo_sem); - - old_idx = task_bp_pinned(cpu, bp, type) - 1; - new_idx = old_idx + weight; - - if (old_idx >= 0) - atomic_dec(&tsk_pinned[old_idx]); - if (new_idx >= 0) - atomic_inc(&tsk_pinned[new_idx]); + bp_slots_histogram_add(tsk_pinned, task_bp_pinned(cpu, bp, type), weight); } /* @@ -768,7 +798,7 @@ bool hw_breakpoint_is_used(void) return true; for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { - if (atomic_read(&info->tsk_pinned[slot])) + if (atomic_read(&info->tsk_pinned.count[slot])) return true; } } From 9b1933b864a10e7f66b06d10c39217142baed28b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:18 +0200 Subject: [PATCH 051/145] perf/hw_breakpoint: Optimize max_bp_pinned_slots() for CPU-independent task targets Running the perf benchmark with (note: more aggressive parameters vs. preceding changes, but same 256 CPUs host): | $> perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512 | # Running 'breakpoint/thread' benchmark: | # Created/joined 100 threads with 4 breakpoints and 128 parallelism | Total time: 1.989 [sec] | | 38.854160 usecs/op | 4973.332500 usecs/op/cpu 20.43% [kernel] [k] queued_spin_lock_slowpath 18.75% [kernel] [k] osq_lock 16.98% [kernel] [k] rhashtable_jhash2 8.34% [kernel] [k] task_bp_pinned 4.23% [kernel] [k] smp_cfm_core_cond 3.65% [kernel] [k] bcmp 2.83% [kernel] [k] toggle_bp_slot 1.87% [kernel] [k] find_next_bit 1.49% [kernel] [k] __reserve_bp_slot We can see that a majority of the time is now spent hashing task pointers to index into task_bps_ht in task_bp_pinned(). Obtaining the max_bp_pinned_slots() for CPU-independent task targets currently is O(#cpus), and calls task_bp_pinned() for each CPU, even if the result of task_bp_pinned() is CPU-independent. The loop in max_bp_pinned_slots() wants to compute the maximum slots across all CPUs. If task_bp_pinned() is CPU-independent, we can do so by obtaining the max slots across all CPUs and adding task_bp_pinned(). To do so in O(1), use a bp_slots_histogram for CPU-pinned slots. After this optimization: | $> perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512 | # Running 'breakpoint/thread' benchmark: | # Created/joined 100 threads with 4 breakpoints and 128 parallelism | Total time: 1.930 [sec] | | 37.697832 usecs/op | 4825.322500 usecs/op/cpu 19.13% [kernel] [k] queued_spin_lock_slowpath 18.21% [kernel] [k] rhashtable_jhash2 15.46% [kernel] [k] osq_lock 6.27% [kernel] [k] toggle_bp_slot 5.91% [kernel] [k] task_bp_pinned 5.05% [kernel] [k] smp_cfm_core_cond 1.78% [kernel] [k] update_sg_lb_stats 1.36% [kernel] [k] llist_reverse_order 1.34% [kernel] [k] find_next_bit 1.19% [kernel] [k] bcmp Suggesting that time spent in task_bp_pinned() has been reduced. However, we're still hashing too much, which will be addressed in the subsequent change. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-14-elver@google.com --- kernel/events/hw_breakpoint.c | 57 ++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 03ebecf048c0..a489f31fe147 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -64,6 +64,9 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) return per_cpu_ptr(bp_cpuinfo + type, cpu); } +/* Number of pinned CPU breakpoints globally. */ +static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; + /* Keep track of the breakpoints attached to tasks */ static struct rhltable task_bps_ht; static const struct rhashtable_params task_bps_ht_params = { @@ -194,6 +197,10 @@ static __init int init_breakpoint_slots(void) goto err; } } + for (i = 0; i < TYPE_MAX; i++) { + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) + goto err; + } return 0; err: @@ -203,6 +210,8 @@ err: if (err_cpu == cpu) break; } + for (i = 0; i < TYPE_MAX; i++) + bp_slots_histogram_free(&cpu_pinned[i]); return -ENOMEM; } @@ -270,6 +279,9 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. + * + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, + * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { @@ -288,9 +300,18 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) goto out; rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { - if (find_slot_idx(iter->attr.bp_type) == type && - (iter->cpu < 0 || cpu == iter->cpu)) - count += hw_breakpoint_weight(iter); + if (find_slot_idx(iter->attr.bp_type) != type) + continue; + + if (iter->cpu >= 0) { + if (cpu == -1) { + count = -1; + goto out; + } else if (cpu != iter->cpu) + continue; + } + + count += hw_breakpoint_weight(iter); } out: @@ -316,6 +337,19 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) int pinned_slots = 0; int cpu; + if (bp->hw.target && bp->cpu < 0) { + int max_pinned = task_bp_pinned(-1, bp, type); + + if (max_pinned >= 0) { + /* + * Fast path: task_bp_pinned() is CPU-independent and + * returns the same value for any CPU. + */ + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); + return max_pinned; + } + } + for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; @@ -366,8 +400,11 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + lockdep_assert_held_write(&bp_cpuinfo_sem); - get_bp_info(bp->cpu, type)->cpu_pinned += weight; + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; return 0; } @@ -804,6 +841,18 @@ bool hw_breakpoint_is_used(void) } } + for (int type = 0; type < TYPE_MAX; ++type) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + /* + * Warn, because if there are CPU pinned counters, + * should never get here; bp_cpuinfo::cpu_pinned should + * be consistent with the global cpu_pinned histogram. + */ + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) + return true; + } + } + return false; } From ecdfb8896f2ad733097e6309d64f94db4cd1020c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:19 +0200 Subject: [PATCH 052/145] perf/hw_breakpoint: Optimize toggle_bp_slot() for CPU-independent task targets We can still see that a majority of the time is spent hashing task pointers: ... 16.98% [kernel] [k] rhashtable_jhash2 ... Doing the bookkeeping in toggle_bp_slots() is currently O(#cpus), calling task_bp_pinned() for each CPU, even if task_bp_pinned() is CPU-independent. The reason for this is to update the per-CPU 'tsk_pinned' histogram. To optimize the CPU-independent case to O(1), keep a separate CPU-independent 'tsk_pinned_all' histogram. The major source of complexity are transitions between "all CPU-independent task breakpoints" and "mixed CPU-independent and CPU-dependent task breakpoints". The code comments list all cases that require handling. After this optimization: | $> perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512 | # Running 'breakpoint/thread' benchmark: | # Created/joined 100 threads with 4 breakpoints and 128 parallelism | Total time: 1.758 [sec] | | 34.336621 usecs/op | 4395.087500 usecs/op/cpu 38.08% [kernel] [k] queued_spin_lock_slowpath 10.81% [kernel] [k] smp_cfm_core_cond 3.01% [kernel] [k] update_sg_lb_stats 2.58% [kernel] [k] osq_lock 2.57% [kernel] [k] llist_reverse_order 1.45% [kernel] [k] find_next_bit 1.21% [kernel] [k] flush_tlb_func_common 1.01% [kernel] [k] arch_install_hw_breakpoint Showing that the time spent hashing keys has become insignificant. With the given benchmark parameters, that's an improvement of 12% compared with the old O(#cpus) version. And finally, using the less aggressive parameters from the preceding changes, we now observe: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.067 [sec] | | 35.292187 usecs/op | 2258.700000 usecs/op/cpu Which is an improvement of 12% compared to without the histogram optimizations (baseline is 40 usecs/op). This is now on par with the theoretical ideal (constraints disabled), and only 12% slower than no breakpoints at all. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-15-elver@google.com --- kernel/events/hw_breakpoint.c | 157 +++++++++++++++++++++++++++------- 1 file changed, 125 insertions(+), 32 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a489f31fe147..7ef0e98d31e2 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -66,6 +66,8 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) /* Number of pinned CPU breakpoints globally. */ static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; +/* Number of pinned CPU-independent task breakpoints. */ +static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; /* Keep track of the breakpoints attached to tasks */ static struct rhltable task_bps_ht; @@ -200,6 +202,8 @@ static __init int init_breakpoint_slots(void) for (i = 0; i < TYPE_MAX; i++) { if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) goto err; + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) + goto err; } return 0; @@ -210,8 +214,10 @@ err: if (err_cpu == cpu) break; } - for (i = 0; i < TYPE_MAX; i++) + for (i = 0; i < TYPE_MAX; i++) { bp_slots_histogram_free(&cpu_pinned[i]); + bp_slots_histogram_free(&tsk_pinned_all[i]); + } return -ENOMEM; } @@ -245,6 +251,26 @@ bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) return 0; } +static int +bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, + enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count1 = atomic_read(&hist1->count[i]); + const int count2 = atomic_read(&hist2->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); + if (count1 + count2 > 0) + return i + 1; + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + #ifndef hw_breakpoint_weight static inline int hw_breakpoint_weight(struct perf_event *bp) { @@ -273,7 +299,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. */ lockdep_assert_held_write(&bp_cpuinfo_sem); - return bp_slots_histogram_max(tsk_pinned, type); + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* @@ -367,12 +393,28 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) } /* - * Add a pinned breakpoint for the given task in our constraint table + * Add/remove the given breakpoint in our constraint table */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, - enum bp_type_idx type, int weight) +static int +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; + int cpu, next_tsk_pinned; + + if (!enable) + weight = -weight; + + if (!bp->hw.target) { + /* + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the + * global histogram. + */ + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + + lockdep_assert_held_write(&bp_cpuinfo_sem); + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; + return 0; + } /* * If bp->hw.target, tsk_pinned is only modified, but not used @@ -382,35 +424,83 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. */ lockdep_assert_held_read(&bp_cpuinfo_sem); - bp_slots_histogram_add(tsk_pinned, task_bp_pinned(cpu, bp, type), weight); -} -/* - * Add/remove the given breakpoint in our constraint table - */ -static int -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) -{ - const struct cpumask *cpumask = cpumask_of_bp(bp); - int cpu; + /* + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global + * histogram. We need to take care of 4 cases: + * + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only + * exist other task breakpoints targeting all CPUs. In this case we + * can simply update the global slots histogram. + * + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may + * only exist other task breakpoints targeting all CPUs. + * + * a. On enable: remove the existing breakpoints from the global + * slots histogram and use the per-CPU histogram. + * + * b. On disable: re-insert the existing breakpoints into the global + * slots histogram and remove from per-CPU histogram. + * + * 3. Some other existing task breakpoints target specific CPUs. Only + * update the per-CPU slots histogram. + */ - if (!enable) - weight = -weight; + if (!enable) { + /* + * Remove before updating histograms so we can determine if this + * was the last task breakpoint for a specific CPU. + */ + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); - /* Pinned counter cpu profiling */ - if (!bp->hw.target) { - struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); - - lockdep_assert_held_write(&bp_cpuinfo_sem); - bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); - info->cpu_pinned += weight; - return 0; + if (ret) + return ret; } + /* + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. + */ + next_tsk_pinned = task_bp_pinned(-1, bp, type); - /* Pinned counter task profiling */ - for_each_cpu(cpu, cpumask) - toggle_bp_task_slot(bp, cpu, type, weight); + if (next_tsk_pinned >= 0) { + if (bp->cpu < 0) { /* Case 1: fast path */ + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); + } else if (enable) { /* Case 2.a: slow path */ + /* Add existing to per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + 0, next_tsk_pinned); + } + /* Add this first CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, + -next_tsk_pinned); + } else { /* Case 2.b: slow path */ + /* Remove this last CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned + hw_breakpoint_weight(bp), weight); + /* Remove all from per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, -next_tsk_pinned); + } + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); + } + } else { /* Case 3: slow path */ + const struct cpumask *cpumask = cpumask_of_bp(bp); + + for_each_cpu(cpu, cpumask) { + next_tsk_pinned = task_bp_pinned(cpu, bp, type); + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + } + } /* * Readers want a stable snapshot of the per-task breakpoint list. @@ -419,8 +509,8 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, if (enable) return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); - else - return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + return 0; } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -850,6 +940,9 @@ bool hw_breakpoint_is_used(void) */ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) return true; + + if (atomic_read(&tsk_pinned_all[type].count[slot])) + return true; } } From b14051352465a24b3c9ceaccac4e39b3521bb370 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:21 +0900 Subject: [PATCH 053/145] mm/sl[au]b: generalize kmalloc subsystem Now everything in kmalloc subsystem can be generalized. Let's do it! Generalize __do_kmalloc_node(), __kmalloc_node_track_caller(), kfree(), __ksize(), __kmalloc(), __kmalloc_node() and move them to slab_common.c. In the meantime, rename kmalloc_large_node_notrace() to __kmalloc_large_node() and make it static as it's now only called in slab_common.c. [ feng.tang@intel.com: adjust kfence skip list to include __kmem_cache_free so that kfence kunit tests do not fail ] Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/kfence/report.c | 1 + mm/slab.c | 108 -------------------------------------------- mm/slab.h | 2 - mm/slab_common.c | 109 +++++++++++++++++++++++++++++++++++++++++++-- mm/slub.c | 87 ------------------------------------ 5 files changed, 107 insertions(+), 200 deletions(-) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f5a6d8ba3e21..7e496856c2eb 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -86,6 +86,7 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries /* Also the *_bulk() variants by only checking prefixes. */ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; diff --git a/mm/slab.c b/mm/slab.c index aa61851b0a07..5b234e3ab165 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3587,44 +3587,6 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif -static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node_notrace(size, flags, node); - - trace_kmalloc_node(caller, ret, NULL, size, - PAGE_SIZE << get_order(size), - flags, node); - return ret; - } - - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - - ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - ret = kasan_kmalloc(cachep, ret, size, flags); - - return ret; -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); - #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { @@ -3647,12 +3609,6 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) } #endif -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc); - static __always_inline void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) @@ -3730,43 +3686,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/** - * kfree - free previously allocated memory - * @objp: pointer returned by kmalloc. - * - * If @objp is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. - */ -void kfree(const void *objp) -{ - struct kmem_cache *c; - unsigned long flags; - struct folio *folio; - - trace_kfree(_RET_IP_, objp); - - if (unlikely(ZERO_OR_NULL_PTR(objp))) - return; - - folio = virt_to_folio(objp); - if (!folio_test_slab(folio)) { - free_large_kmalloc(folio, (void *)objp); - return; - } - - c = folio_slab(folio)->slab_cache; - - local_irq_save(flags); - kfree_debugcheck(objp); - debug_check_no_locks_freed(objp, c->object_size); - debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); -} -EXPORT_SYMBOL(kfree); - /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ @@ -4169,30 +4088,3 @@ void __check_heap_object(const void *ptr, unsigned long n, usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ - -/** - * __ksize -- Uninstrumented ksize. - * @objp: pointer to the object - * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. - * - * Return: size of the actual memory used by @objp in bytes - */ -size_t __ksize(const void *objp) -{ - struct kmem_cache *c; - struct folio *folio; - - BUG_ON(!objp); - if (unlikely(objp == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(objp); - if (!folio_test_slab(folio)) - return folio_size(folio); - - c = folio_slab(folio)->slab_cache; - return c->object_size; -} -EXPORT_SYMBOL(__ksize); diff --git a/mm/slab.h b/mm/slab.h index 8eefeed95407..4d8330d57573 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -280,8 +280,6 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); #endif -void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node); - gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 5a2e81f42ee9..6a744fc3d7f2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -897,6 +897,109 @@ void free_large_kmalloc(struct folio *folio, void *object) -(PAGE_SIZE << order)); __free_pages(folio_page(folio, 0), order); } + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc_node(caller, ret, NULL, + size, PAGE_SIZE << get_order(size), + flags, node); + return ret; + } + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = __kmem_cache_alloc_node(s, flags, node, size, caller); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc_node(caller, ret, s, size, + s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc. + * + * If @object is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + __kmem_cache_free(s, (void *)object, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/** + * __ksize -- Uninstrumented ksize. + * @object: pointer to the object + * + * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same + * safety checks as ksize() with KASAN instrumentation enabled. + * + * Return: size of the actual memory used by @object in bytes + */ +size_t __ksize(const void *object) +{ + struct folio *folio; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + folio = virt_to_folio(object); + + if (unlikely(!folio_test_slab(folio))) + return folio_size(folio); + + return slab_ksize(folio_slab(folio)->slab_cache); +} +EXPORT_SYMBOL(__ksize); #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -917,7 +1020,7 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node) +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; void *ptr = NULL; @@ -943,7 +1046,7 @@ void *kmalloc_large_node_notrace(size_t size, gfp_t flags, int node) void *kmalloc_large(size_t size, gfp_t flags) { - void *ret = kmalloc_large_node_notrace(size, flags, NUMA_NO_NODE); + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << get_order(size), flags); @@ -953,7 +1056,7 @@ EXPORT_SYMBOL(kmalloc_large); void *kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = kmalloc_large_node_notrace(size, flags, node); + void *ret = __kmalloc_large_node(size, flags, node); trace_kmalloc_node(_RET_IP_, ret, NULL, size, PAGE_SIZE << get_order(size), flags, node); diff --git a/mm/slub.c b/mm/slub.c index a11f78c2647c..cd49785d59e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4388,49 +4388,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -static __always_inline -void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node_notrace(size, flags, node); - - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - flags, node); - - return ret; - } - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, flags, node, caller, size); - - trace_kmalloc_node(caller, ret, s, size, s->size, flags, node); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc(size_t size, gfp_t flags) -{ - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc); - - #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -4481,43 +4438,6 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif /* CONFIG_HARDENED_USERCOPY */ -size_t __ksize(const void *object) -{ - struct folio *folio; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(object); - - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - return slab_ksize(folio_slab(folio)->slab_cache); -} -EXPORT_SYMBOL(__ksize); - -void kfree(const void *x) -{ - struct folio *folio; - struct slab *slab; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return; - - folio = virt_to_folio(x); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - return; - } - slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - #define SHRINK_PROMOTE_MAX 32 /* @@ -4863,13 +4783,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, gfpflags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); - #ifdef CONFIG_SYSFS static int count_inuse(struct slab *slab) { From 26a40990ba052e6f553256f9d0f112452b992a38 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:22 +0900 Subject: [PATCH 054/145] mm/sl[au]b: cleanup kmem_cache_alloc[_node]_trace() Despite its name, kmem_cache_alloc[_node]_trace() is hook for inlined kmalloc. So rename it to kmalloc[_node]_trace(). Move its implementation to slab_common.c by using __kmem_cache_alloc_node(), but keep CONFIG_TRACING=n varients to save a function call when CONFIG_TRACING=n. Use __assume_kmalloc_alignment for kmalloc[_node]_trace instead of __assume_slab_alignement. Generally kmalloc has larger alignment requirements. Suggested-by: Vlastimil Babka Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 27 ++++++++++++++------------- mm/slab.c | 35 ----------------------------------- mm/slab_common.c | 27 +++++++++++++++++++++++++++ mm/slub.c | 27 --------------------------- 4 files changed, 41 insertions(+), 75 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 4ee5b2fed164..c8e485ce8815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -449,16 +449,16 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assum __malloc; #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __alloc_size(3); - -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment - __alloc_size(4); +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_kmalloc_alignment + __alloc_size(4); #else /* CONFIG_TRACING */ -static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +/* Save a function call when CONFIG_TRACING=n */ +static __always_inline __alloc_size(3) +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -466,8 +466,9 @@ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_ return ret; } -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +static __always_inline __alloc_size(4) +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -550,7 +551,7 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace( + return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); #endif @@ -572,9 +573,9 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace( + return kmalloc_node_trace( kmalloc_caches[kmalloc_type(flags)][index], - flags, node, size); + flags, node, size); } return __kmalloc_node(size, flags, node); } diff --git a/mm/slab.c b/mm/slab.c index 5b234e3ab165..8d9d0fbf9792 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3519,22 +3519,6 @@ error: } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -#ifdef CONFIG_TRACING -void * -kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) -{ - void *ret; - - ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, cachep, - size, cachep->size, flags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif - /** * kmem_cache_alloc_node - Allocate an object on the specified node * @cachep: The cache to allocate from. @@ -3568,25 +3552,6 @@ void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, orig_size, caller); } -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid, - size_t size) -{ - void *ret; - - ret = slab_alloc_node(cachep, NULL, flags, nodeid, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc_node(_RET_IP_, ret, cachep, - size, cachep->size, - flags, nodeid); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif - #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 6a744fc3d7f2..2eab19043940 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1000,6 +1000,33 @@ size_t __ksize(const void *object) return slab_ksize(folio_slab(folio)->slab_cache); } EXPORT_SYMBOL(__ksize); + +#ifdef CONFIG_TRACING +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, + size, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, + gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); +#endif /* !CONFIG_TRACING */ #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) diff --git a/mm/slub.c b/mm/slub.c index cd49785d59e1..7d7fd9d4e8fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3270,17 +3270,6 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, caller, orig_size); } -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) -{ - void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags); - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif - void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3292,22 +3281,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) -{ - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, - size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif - /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. From 11e9734bcb6a7361943f993eba4e97f5812120d8 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:23 +0900 Subject: [PATCH 055/145] mm/slab_common: unify NUMA and UMA version of tracepoints Drop kmem_alloc event class, rename kmem_alloc_node to kmem_alloc, and remove _node postfix for NUMA version of tracepoints. This will break some tools that depend on {kmem_cache_alloc,kmalloc}_node, but at this point maintaining both kmem_alloc and kmem_alloc_node event classes does not makes sense at all. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/trace/events/kmem.h | 60 ++----------------------------------- mm/slab.c | 9 +++--- mm/slab_common.c | 21 +++++-------- mm/slob.c | 20 ++++++------- mm/slub.c | 6 ++-- 5 files changed, 27 insertions(+), 89 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 4cb51ace600d..e078ebcdc4b1 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -11,62 +11,6 @@ DECLARE_EVENT_CLASS(kmem_alloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - struct kmem_cache *s, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( unsigned long, gfp_flags ) - __field( bool, accounted ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = (__force unsigned long)gfp_flags; - __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? - ((gfp_flags & __GFP_ACCOUNT) || - (s && s->flags & SLAB_ACCOUNT)) : false; - ), - - TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s accounted=%s", - (void *)__entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - show_gfp_flags(__entry->gfp_flags), - __entry->accounted ? "true" : "false") -); - -DEFINE_EVENT(kmem_alloc, kmalloc, - - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags) -); - -DEFINE_EVENT(kmem_alloc, kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags) -); - -DECLARE_EVENT_CLASS(kmem_alloc_node, - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, @@ -109,7 +53,7 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __entry->accounted ? "true" : "false") ); -DEFINE_EVENT(kmem_alloc_node, kmalloc_node, +DEFINE_EVENT(kmem_alloc, kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, @@ -118,7 +62,7 @@ DEFINE_EVENT(kmem_alloc_node, kmalloc_node, TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) ); -DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node, +DEFINE_EVENT(kmem_alloc, kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, diff --git a/mm/slab.c b/mm/slab.c index 8d9d0fbf9792..2fd400203ac2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3440,8 +3440,8 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, cachep->object_size, + cachep->size, flags, NUMA_NO_NODE); return ret; } @@ -3536,9 +3536,8 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, - flags, nodeid); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, cachep->object_size, + cachep->size, flags, nodeid); return ret; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 2eab19043940..3d7ad992ece1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -907,9 +907,8 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - flags, node); + trace_kmalloc(_RET_IP_, ret, NULL, size, + PAGE_SIZE << get_order(size), flags, node); return ret; } @@ -920,8 +919,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller ret = __kmem_cache_alloc_node(s, flags, node, size, caller); ret = kasan_kmalloc(s, ret, size, flags); - trace_kmalloc_node(caller, ret, s, size, - s->size, flags, node); + trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags, node); return ret; } @@ -1007,8 +1005,7 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, size, _RET_IP_); - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, - gfpflags, NUMA_NO_NODE); + trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags, NUMA_NO_NODE); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -1020,7 +1017,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, { void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, gfpflags, node); + trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags, node); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -1076,7 +1073,7 @@ void *kmalloc_large(size_t size, gfp_t flags) void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); trace_kmalloc(_RET_IP_, ret, NULL, size, - PAGE_SIZE << get_order(size), flags); + PAGE_SIZE << get_order(size), flags, NUMA_NO_NODE); return ret; } EXPORT_SYMBOL(kmalloc_large); @@ -1085,8 +1082,8 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) { void *ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc_node(_RET_IP_, ret, NULL, size, - PAGE_SIZE << get_order(size), flags, node); + trace_kmalloc(_RET_IP_, ret, NULL, size, + PAGE_SIZE << get_order(size), flags, node); return ret; } EXPORT_SYMBOL(kmalloc_large_node); @@ -1421,8 +1418,6 @@ EXPORT_SYMBOL(ksize); /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 96b08acd72ce..3208c56d8f82 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,8 +507,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc_node(caller, ret, NULL, - size, size + minalign, gfp, node); + trace_kmalloc(caller, ret, NULL, size, + size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,8 +516,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc(caller, ret, NULL, size, + PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); @@ -608,14 +608,14 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, NULL, c->object_size, + SLOB_UNITS(c->size) * SLOB_UNIT, + flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, NULL, c->object_size, + PAGE_SIZE << get_order(c->size), + flags, node); } if (b && c->ctor) { diff --git a/mm/slub.c b/mm/slub.c index 7d7fd9d4e8fa..22e4ccf06638 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3244,7 +3244,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags); + s->size, gfpflags, NUMA_NO_NODE); return ret; } @@ -3274,8 +3274,8 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, s, - s->object_size, s->size, gfpflags, node); + trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, + s->size, gfpflags, node); return ret; } From 2c1d697fb8ba6d2d44f914d4268ae1ccdf025f1b Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:24 +0900 Subject: [PATCH 056/145] mm/slab_common: drop kmem_alloc & avoid dereferencing fields when not using Drop kmem_alloc event class, and define kmalloc and kmem_cache_alloc using TRACE_EVENT() macro. And then this patch does: - Do not pass pointer to struct kmem_cache to trace_kmalloc. gfp flag is enough to know if it's accounted or not. - Avoid dereferencing s->object_size and s->size when not using kmem_cache_alloc event. - Avoid dereferencing s->name in when not using kmem_cache_free event. - Adjust s->size to SLOB_UNITS(s->size) * SLOB_UNIT in SLOB Cc: Vasily Averin Suggested-by: Vlastimil Babka Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/trace/events/kmem.h | 64 ++++++++++++++++++++++++------------- mm/slab.c | 8 ++--- mm/slab_common.c | 16 +++++----- mm/slob.c | 19 +++++------ mm/slub.c | 8 ++--- 5 files changed, 64 insertions(+), 51 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index e078ebcdc4b1..243073cfc29d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -9,17 +9,15 @@ #include #include -DECLARE_EVENT_CLASS(kmem_alloc, +TRACE_EVENT(kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, - size_t bytes_alloc, gfp_t gfp_flags, int node), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node), + TP_ARGS(call_site, ptr, s, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) @@ -34,13 +32,13 @@ DECLARE_EVENT_CLASS(kmem_alloc, TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; + __entry->bytes_req = s->object_size; + __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? ((gfp_flags & __GFP_ACCOUNT) || - (s && s->flags & SLAB_ACCOUNT)) : false; + (s->flags & SLAB_ACCOUNT)) : false; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", @@ -53,22 +51,44 @@ DECLARE_EVENT_CLASS(kmem_alloc, __entry->accounted ? "true" : "false") ); -DEFINE_EVENT(kmem_alloc, kmalloc, +TRACE_EVENT(kmalloc, - TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, - gfp_t gfp_flags, int node), + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) -); + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), -DEFINE_EVENT(kmem_alloc, kmem_cache_alloc, + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( unsigned long, gfp_flags ) + __field( int, node ) + ), - TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, - gfp_t gfp_flags, int node), + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = (__force unsigned long)gfp_flags; + __entry->node = node; + ), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) + TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", + (void *)__entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + show_gfp_flags(__entry->gfp_flags), + __entry->node, + (IS_ENABLED(CONFIG_MEMCG_KMEM) && + (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); TRACE_EVENT(kfree, @@ -93,20 +113,20 @@ TRACE_EVENT(kfree, TRACE_EVENT(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr, const char *name), + TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s), - TP_ARGS(call_site, ptr, name), + TP_ARGS(call_site, ptr, s), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) - __string( name, name ) + __string( name, s->name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __assign_str(name, name); + __assign_str(name, s->name); ), TP_printk("call_site=%pS ptr=%p name=%s", diff --git a/mm/slab.c b/mm/slab.c index 2fd400203ac2..a5486ff8362a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3440,8 +3440,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, cachep->object_size, - cachep->size, flags, NUMA_NO_NODE); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); return ret; } @@ -3536,8 +3535,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, cachep->object_size, - cachep->size, flags, nodeid); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); return ret; } @@ -3607,7 +3605,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) if (!cachep) return; - trace_kmem_cache_free(_RET_IP_, objp, cachep->name); + trace_kmem_cache_free(_RET_IP_, objp, cachep); __do_kmem_cache_free(cachep, objp, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slab_common.c b/mm/slab_common.c index 3d7ad992ece1..ad4c36fb697c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -907,7 +907,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc(_RET_IP_, ret, NULL, size, + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; } @@ -919,7 +919,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller ret = __kmem_cache_alloc_node(s, flags, node, size, caller); ret = kasan_kmalloc(s, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags, node); + trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node); return ret; } @@ -1005,7 +1005,7 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, size, _RET_IP_); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags, NUMA_NO_NODE); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -1017,7 +1017,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, { void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags, node); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; @@ -1072,8 +1072,8 @@ void *kmalloc_large(size_t size, gfp_t flags) { void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); - trace_kmalloc(_RET_IP_, ret, NULL, size, - PAGE_SIZE << get_order(size), flags, NUMA_NO_NODE); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); return ret; } EXPORT_SYMBOL(kmalloc_large); @@ -1082,8 +1082,8 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) { void *ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc(_RET_IP_, ret, NULL, size, - PAGE_SIZE << get_order(size), flags, node); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); return ret; } EXPORT_SYMBOL(kmalloc_large_node); diff --git a/mm/slob.c b/mm/slob.c index 3208c56d8f82..771af84576bf 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc(caller, ret, NULL, size, - size + minalign, gfp, node); + trace_kmalloc(caller, ret, size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc(caller, ret, NULL, size, - PAGE_SIZE << order, gfp, node); + trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); @@ -594,6 +592,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } + + /* Actual size allocated */ + c->size = SLOB_UNITS(c->size) * SLOB_UNIT; c->flags = flags; return 0; } @@ -608,14 +609,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc(_RET_IP_, b, NULL, c->object_size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc(_RET_IP_, b, NULL, c->object_size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } if (b && c->ctor) { @@ -671,7 +668,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - trace_kmem_cache_free(_RET_IP_, b, c->name); + trace_kmem_cache_free(_RET_IP_, b, c); if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); diff --git a/mm/slub.c b/mm/slub.c index 22e4ccf06638..8083a6ee5f15 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3243,8 +3243,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, { void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags, NUMA_NO_NODE); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } @@ -3274,8 +3273,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags, node); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); return ret; } @@ -3517,7 +3515,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) s = cache_from_obj(s, x); if (!s) return; - trace_kmem_cache_free(_RET_IP_, x, s->name); + trace_kmem_cache_free(_RET_IP_, x, s); slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); From 8dfa9d554061873f96335730fb1d403698b2b1b4 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:25 +0900 Subject: [PATCH 057/145] mm/slab_common: move declaration of __ksize() to mm/slab.h __ksize() is only called by KASAN. Remove export symbol and move declaration to mm/slab.h as we don't want to grow its callers. Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 1 - mm/slab.h | 2 ++ mm/slab_common.c | 11 +---------- mm/slob.c | 1 - 4 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index c8e485ce8815..9b592e611cb1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -187,7 +187,6 @@ int kmem_cache_shrink(struct kmem_cache *s); void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); -size_t __ksize(const void *objp); size_t ksize(const void *objp); #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); diff --git a/mm/slab.h b/mm/slab.h index 4d8330d57573..65023f000d42 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -668,6 +668,8 @@ void free_large_kmalloc(struct folio *folio, void *object); #endif /* CONFIG_SLOB */ +size_t __ksize(const void *objp); + static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB diff --git a/mm/slab_common.c b/mm/slab_common.c index ad4c36fb697c..500eb777faca 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -974,15 +974,7 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); -/** - * __ksize -- Uninstrumented ksize. - * @object: pointer to the object - * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. - * - * Return: size of the actual memory used by @object in bytes - */ +/* Uninstrumented ksize. Only called by KASAN. */ size_t __ksize(const void *object) { struct folio *folio; @@ -997,7 +989,6 @@ size_t __ksize(const void *object) return slab_ksize(folio_slab(folio)->slab_cache); } -EXPORT_SYMBOL(__ksize); #ifdef CONFIG_TRACING void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) diff --git a/mm/slob.c b/mm/slob.c index 771af84576bf..45a061b8ba38 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -584,7 +584,6 @@ size_t __ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { From d5eff736902d5565a24f1b571b5987b3e5ee9a5b Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Wed, 17 Aug 2022 19:18:26 +0900 Subject: [PATCH 058/145] mm/sl[au]b: check if large object is valid in __ksize() If address of large object is not beginning of folio or size of the folio is too small, it must be invalid. WARN() and return 0 in such cases. Cc: Marco Elver Suggested-by: Vlastimil Babka Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 500eb777faca..7972ec4b9ca4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -984,8 +984,13 @@ size_t __ksize(const void *object) folio = virt_to_folio(object); - if (unlikely(!folio_test_slab(folio))) + if (unlikely(!folio_test_slab(folio))) { + if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) + return 0; + if (WARN_ON(object != folio_address(folio))) + return 0; return folio_size(folio); + } return slab_ksize(folio_slab(folio)->slab_cache); } From 3aac580d5cc3001ca1627725b3b61edb529f341d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:54 -0700 Subject: [PATCH 059/145] perf: Add sample_flags to indicate the PMU-filled sample data On some platforms, some data e.g., timestamps, can be retrieved from the PMU driver. Usually, the data from the PMU driver is more accurate. The current perf kernel should output the PMU-filled sample data if it's available. To check the availability of the PMU-filled sample data, the current perf kernel initializes the related fields in the perf_sample_data_init(). When outputting a sample, the perf checks whether the field is updated by the PMU driver. If yes, the updated value will be output. If not, the perf uses an SW way to calculate the value or just outputs the initialized value if an SW way is unavailable either. With more and more data being provided by the PMU driver, more fields has to be initialized in the perf_sample_data_init(). That will increase the number of cache lines touched in perf_sample_data_init() and be harmful to the performance. Add new "sample_flags" to indicate the PMU-filled sample data. The PMU driver should set the corresponding PERF_SAMPLE_ flag when the field is updated. The initialization of the corresponding field is not required anymore. The following patches will make use of it and remove the corresponding fields from the perf_sample_data_init(), which will further minimize the number of cache lines touched. Only clear the sample flags that have already been done by the PMU driver in the perf_prepare_sample() for the PERF_RECORD_SAMPLE. For the other PERF_RECORD_ event type, the sample data is not available. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 17 +++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1999408a9cbb..0978165a2d87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1008,6 +1008,7 @@ struct perf_sample_data { * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ + u64 sample_flags; u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -1057,6 +1058,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ + data->sample_flags = 0; data->addr = addr; data->raw = NULL; data->br_stack = NULL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..c9b9cb79231a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6794,11 +6794,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6827,7 +6826,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7303,6 +7302,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7310,7 +7310,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. + */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); From 47a3aeb39e8dc099ae431cd8b46bdf218f5511b2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:55 -0700 Subject: [PATCH 060/145] perf/x86/intel/pebs: Fix PEBS timestamps overwritten The PEBS TSC-based timestamps do not appear correctly in the final perf.data output file from perf record. The data->time field setup by PEBS in the setup_pebs_fixed_sample_data() is later overwritten by perf_events generic code in perf_prepare_sample(). There is an ordering problem. Set the sample flags when the data->time is updated by PEBS. The data->time field will not be overwritten anymore. Reported-by: Andreas Kogler Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-3-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba60427caa6d..cdd857bd2dd6 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1635,8 +1635,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * We can only do this for the default trace clock. */ if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) + event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(pebs->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } if (has_branch_stack(event)) data->br_stack = &cpuc->lbr_stack; @@ -1697,8 +1699,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; - if (event->attr.use_clockid == 0) + if (event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(basic->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } /* * We must however always use iregs for the unwinder to stay sane; the From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: [PATCH 061/145] perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/amd/core.c | 4 +++- arch/x86/events/core.c | 4 +++- arch/x86/events/intel/core.c | 4 +++- arch/x86/events/intel/ds.c | 5 ++++- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 4 ++-- 7 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 13919eb96931..1ad1efdb33f9 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2297,6 +2297,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 36bede1d7b1e..bd99d2ae14c3 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -929,8 +929,10 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f969410d0c90..bb34a28fa71b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1714,8 +1714,10 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..ba101c28dcc9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2995,8 +2995,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cdd857bd2dd6..0489f750baa0 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1640,8 +1640,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_TIME; } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1791,6 +1793,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 addr; struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; diff --git a/kernel/events/core.c b/kernel/events/core.c index c9b9cb79231a..104c0c9f4e6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7052,7 +7052,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr @@ -7358,7 +7358,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { if (perf_sample_save_hw_index(event)) size += sizeof(u64); From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: [PATCH 062/145] perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 5 +++-- arch/x86/events/intel/ds.c | 10 +++++++--- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 1ad1efdb33f9..a5c95a2006ea 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2305,9 +2305,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && - ppmu->get_mem_weight) + ppmu->get_mem_weight) { ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); - + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } else if (period) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 0489f750baa0..4c51118e4add 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1527,8 +1527,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source @@ -1620,9 +1622,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); - + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); @@ -1764,6 +1767,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 104c0c9f4e6f..f0af45db02b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7408,6 +7408,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: [PATCH 063/145] perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a5c95a2006ea..6ec7069e6482 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2301,8 +2301,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && - ppmu->get_mem_data_src) + ppmu->get_mem_data_src) { ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + data.sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && ppmu->get_mem_weight) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4c51118e4add..bde73d492889 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1535,8 +1535,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1770,8 +1772,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f0af45db02b3..163e2f478e61 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7411,6 +7411,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) data->weight.full = 0; + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: [PATCH 064/145] perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bde73d492889..a5275c235c2a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1628,9 +1628,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1780,9 +1782,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 163e2f478e61..15d27b14c827 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = PERF_MEM_NA; + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); From eaa1a955094b20ca4b1c5064200d140b27f8d96f Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 7 Sep 2022 09:00:56 +0100 Subject: [PATCH 065/145] clocksource/drivers/renesas-ostm: Add support for RZ/V2L SoC The OSTM block is identical on Renesas RZ/G2L and RZ/V2L SoC's, so instead of adding dependency for each SoC's add dependency on ARCH_RZG2L. The ARCH_RZG2L config option is already selected by ARCH_R9A07G044 and ARCH_R9A07G054. With the above change OSTM will be enabled on RZ/V2L SoC. Signed-off-by: Lad Prabhakar Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220907080056.3460-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/renesas-ostm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/renesas-ostm.c b/drivers/clocksource/renesas-ostm.c index 21d1392637b8..8da972dc1713 100644 --- a/drivers/clocksource/renesas-ostm.c +++ b/drivers/clocksource/renesas-ostm.c @@ -224,7 +224,7 @@ err_free: TIMER_OF_DECLARE(ostm, "renesas,ostm", ostm_init); -#ifdef CONFIG_ARCH_R9A07G044 +#ifdef CONFIG_ARCH_RZG2L static int __init ostm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; From 03b02db93be407103c385814033633364674a6f6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Sep 2022 14:14:14 +0530 Subject: [PATCH 066/145] perf: Consolidate branch sample filter helpers Besides the branch type filtering requests, 'event.attr.branch_sample_type' also contains various flags indicating which additional information should be captured, along with the base branch record. These flags help configure the underlying hardware, and capture the branch records appropriately when required e.g after PMU interrupt. But first, this moves an existing helper perf_sample_save_hw_index() into the header before adding some more helpers for other branch sample filter flags. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220906084414.396220-1-anshuman.khandual@arm.com --- include/linux/perf_event.h | 26 ++++++++++++++++++++++++++ kernel/events/core.c | 9 ++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 581880ddb9ef..a627528e5f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1685,4 +1685,30 @@ static inline void perf_lopwr_cb(bool mode) } #endif +#ifdef CONFIG_PERF_EVENTS +static inline bool branch_sample_no_flags(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; +} + +static inline bool branch_sample_no_cycles(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; +} + +static inline bool branch_sample_type(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; +} + +static inline bool branch_sample_hw_index(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + +static inline bool branch_sample_priv(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; +} +#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 15d27b14c827..00389d5f9241 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6966,11 +6966,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7059,7 +7054,7 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7359,7 +7354,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr From 7517f08b9a5eef0fa683b976c97d6178d00e6a3d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:21 +0530 Subject: [PATCH 067/145] perf/core: Expand PERF_EVENT_FLAG_ARCH Two hardware event flags on x86 platform has overshot PERF_EVENT_FLAG_ARCH (0x0000ffff). These flags are PERF_X86_EVENT_PEBS_LAT_HYBRID (0x20000) and PERF_X86_EVENT_AMD_BRS (0x10000). Lets expand PERF_EVENT_FLAG_ARCH mask to accommodate those flags, and also create room for two more in the future. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-2-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a627528e5f3a..3e3c07512b75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -138,7 +138,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 /** From f67dd218fafd9de9a13d095e775b621db76a058f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:22 +0530 Subject: [PATCH 068/145] perf/core: Assert PERF_EVENT_FLAG_ARCH does not overlap with generic flags This just ensures that PERF_EVENT_FLAG_ARCH does not overlap with generic hardware event flags. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-3-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e3c07512b75..f88cb31eaf75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -141,6 +141,8 @@ struct hw_perf_event_extra { #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); + /** * struct hw_perf_event - performance event hardware details: */ From 91207f62616f9f51b52436364e6d064f002e9112 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:23 +0530 Subject: [PATCH 069/145] arm64/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-4-anshuman.khandual@arm.com --- drivers/perf/arm_spe_pmu.c | 4 +++- include/linux/perf/arm_pmu.h | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index b65a7d9640e1..db8a0a841062 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -44,7 +44,9 @@ * This allows us to perform the check, i.e, perfmon_capable(), * in the context of the event owner, once, during the event_init(). */ -#define SPE_PMU_HW_FLAGS_CX BIT(0) +#define SPE_PMU_HW_FLAGS_CX 0x00001 + +static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); static void set_spe_event_has_cx(struct perf_event *event) { diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0407a38b470a..0356cb6a215d 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,11 @@ /* * ARM PMU hw_event flags */ -/* Event uses a 64bit counter */ -#define ARMPMU_EVT_64BIT 1 -/* Event uses a 47bit counter */ -#define ARMPMU_EVT_47BIT 2 +#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ +#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ + +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) PERF_COUNT_HW_CACHE_##_x From 88081cfb699ce2568e5309c145eb9f9e9497b53f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:24 +0530 Subject: [PATCH 070/145] x86/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-5-anshuman.khandual@arm.com --- arch/x86/events/perf_event.h | 32 ++++++++++++++---------------- arch/x86/events/perf_event_flags.h | 22 ++++++++++++++++++++ 2 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 arch/x86/events/perf_event_flags.h diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 93263b98cd6e..4a3dde24a1ae 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -64,27 +64,25 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) return ((ecode & c->cmask) - c->code) <= (u64)c->size; } +#define PERF_ARCH(name, val) \ + PERF_X86_EVENT_##name = val, + /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ +enum { +#include "perf_event_flags.h" +}; -#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ -#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ -#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ +#undef PERF_ARCH + +#define PERF_ARCH(name, val) \ + static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \ + PERF_X86_EVENT_##name); + +#include "perf_event_flags.h" + +#undef PERF_ARCH static inline bool is_topdown_count(struct perf_event *event) { diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h new file mode 100644 index 000000000000..1dc19b9b4426 --- /dev/null +++ b/arch/x86/events/perf_event_flags.h @@ -0,0 +1,22 @@ + +/* + * struct hw_perf_event.flags flags + */ +PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ + /* 0x00080 */ +PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ From f3c0eba287049237b23d1300376768293eb89e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2022 18:48:55 +0200 Subject: [PATCH 071/145] perf: Add a few assertions While auditing 6b959ba22d34 ("perf/core: Fix reentry problem in perf_output_read_group()") a few spots were found that wanted assertions. Notable for_each_sibling_event() relies on exclusion from modification. This would normally be holding either ctx->lock or ctx->mutex, however due to how things are constructed disabling IRQs is a valid and sufficient substitute for ctx->lock. Another possible site to add assertions would be the various pmu::{add,del,read,..}() methods, but that's not trivially expressable in C -- the best option is wrappers, but those are easy enough to forget. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 17 +++++++++++++++++ kernel/events/core.c | 2 ++ 2 files changed, 19 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f88cb31eaf75..368bdc4f563f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -634,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex + * as such iteration must hold either lock. However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. + */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) || \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) diff --git a/kernel/events/core.c b/kernel/events/core.c index 00389d5f9241..3e90e454b995 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; From 73759c346341d39dfde39701476c0376dea0a98b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:27:22 +0200 Subject: [PATCH 072/145] perf/x86: Add two more x86_pmu methods In order to clean up x86_perf_event_{set_period,update)() start by adding them as x86_pmu methods. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.440196408@infradead.org --- arch/x86/events/core.c | 22 +++++++++++++++++----- arch/x86/events/perf_event.h | 5 +++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bb34a28fa71b..bb559b79ff23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,6 +72,9 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); + DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); @@ -1518,7 +1521,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); } event->hw.state = 0; @@ -1610,7 +1613,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -1700,7 +1703,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; - val = x86_perf_event_update(event); + val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; @@ -1709,7 +1712,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) */ handled++; - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); @@ -2025,6 +2028,9 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); + static_call_update(x86_pmu_set_period, x86_pmu.set_period); + static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); @@ -2044,7 +2050,7 @@ static void x86_pmu_static_call_update(void) static void _x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, @@ -2151,6 +2157,12 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; + if (!x86_pmu.set_period) + x86_pmu.set_period = x86_perf_event_set_period; + + if (!x86_pmu.update) + x86_pmu.update = x86_perf_event_update; + x86_pmu_static_call_update(); /* diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4a3dde24a1ae..7ae1a6c5368c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -743,6 +743,8 @@ struct x86_pmu { void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); + int (*set_period)(struct perf_event *event); + u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -1042,6 +1044,9 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; +DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); +DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); + static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) From e577bb17a1eaa35b86ee873a786e603be768d668 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:06 +0200 Subject: [PATCH 073/145] perf/x86/intel: Move the topdown stuff into the intel driver Use the new x86_pmu::{set_period,update}() methods to push the topdown stuff into the Intel driver, where it belongs. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.505933457@infradead.org --- arch/x86/events/core.c | 7 ------- arch/x86/events/intel/core.c | 26 +++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bb559b79ff23..b074e71bab21 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -119,9 +119,6 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); - /* * Careful: an NMI might modify the previous event value. * @@ -1373,10 +1370,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); - /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ba101c28dcc9..feed732fdf57 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2302,7 +2302,7 @@ static void intel_pmu_nhm_workaround(void) for (i = 0; i < 4; i++) { event = cpuc->events[i]; if (event) - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } for (i = 0; i < 4; i++) { @@ -2317,7 +2317,7 @@ static void intel_pmu_nhm_workaround(void) event = cpuc->events[i]; if (event) { - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } else @@ -2794,7 +2794,7 @@ static void intel_pmu_add_event(struct perf_event *event) */ int intel_pmu_save_and_restart(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); /* * For a checkpointed counter always reset back to 0. This * avoids a situation where the counter overflows, aborts the @@ -2806,9 +2806,27 @@ int intel_pmu_save_and_restart(struct perf_event *event) wrmsrl(event->hw.event_base, 0); local64_set(&event->hw.prev_count, 0); } + return static_call(x86_pmu_set_period)(event); +} + +static int intel_pmu_set_period(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event)) && + x86_pmu.set_topdown_event_period) + return x86_pmu.set_topdown_event_period(event); + return x86_perf_event_set_period(event); } +static u64 intel_pmu_update(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event)) && + x86_pmu.update_topdown_event) + return x86_pmu.update_topdown_event(event); + + return x86_perf_event_update(event); +} + static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); @@ -4786,6 +4804,8 @@ static __initconst const struct x86_pmu intel_pmu = { .add = intel_pmu_add_event, .del = intel_pmu_del_event, .read = intel_pmu_read_event, + .set_period = intel_pmu_set_period, + .update = intel_pmu_update, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, From 28f0f3c44b5c35be657a4f922dcdfb48285f4373 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:25 +0200 Subject: [PATCH 074/145] perf/x86: Change x86_pmu::limit_period signature In preparation for making it a static_call, change the signature. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.573713839@infradead.org --- arch/x86/events/amd/core.c | 8 +++----- arch/x86/events/core.c | 13 ++++++++----- arch/x86/events/intel/core.c | 19 ++++++++----------- arch/x86/events/perf_event.h | 2 +- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index bd99d2ae14c3..8b70237c33f7 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1224,16 +1224,14 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +static void amd_pmu_limit_period(struct perf_event *event, s64 *left) { /* * Decrease period by the depth of the BRS feature to get the last N * taken branches and approximate the desired period */ - if (has_branch_stack(event) && left > x86_pmu.lbr_nr) - left -= x86_pmu.lbr_nr; - - return left; + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) + *left -= x86_pmu.lbr_nr; } static __initconst const struct x86_pmu amd_pmu = { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b074e71bab21..1e90bc7ca36f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -621,8 +621,9 @@ int x86_pmu_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) + s64 left = event->attr.sample_period; + x86_pmu.limit_period(event, &left); + if (left > event->attr.sample_period) return -EINVAL; } @@ -1396,9 +1397,9 @@ int x86_perf_event_set_period(struct perf_event *event) left = x86_pmu.max_period; if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); + x86_pmu.limit_period(event, &left); - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, @@ -2677,7 +2678,9 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return -EINVAL; if (value && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, value) > value) + s64 left = value; + x86_pmu.limit_period(event, &left); + if (left > value) return -EINVAL; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index feed732fdf57..92cc390590d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4344,28 +4344,25 @@ static u8 adl_get_hybrid_cpu_type(void) * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. */ -static u64 bdw_limit_period(struct perf_event *event, u64 left) +static void bdw_limit_period(struct perf_event *event, s64 *left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fULL; + if (*left < 128) + *left = 128; + *left &= ~0x3fULL; } - return left; } -static u64 nhm_limit_period(struct perf_event *event, u64 left) +static void nhm_limit_period(struct perf_event *event, s64 *left) { - return max(left, 32ULL); + *left = max(*left, 32LL); } -static u64 spr_limit_period(struct perf_event *event, u64 left) +static void spr_limit_period(struct perf_event *event, s64 *left) { if (event->attr.precise_ip == 3) - return max(left, 128ULL); - - return left; + *left = max(*left, 128LL); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7ae1a6c5368c..e82d2d212534 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -781,7 +781,7 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; - u64 (*limit_period)(struct perf_event *event, u64 l); + void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ unsigned int late_ack :1, From 08b3068fab207e3c7d79799d434e1d648524cac6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:18 +0200 Subject: [PATCH 075/145] perf/x86: Add a x86_pmu::limit_period static_call Avoid a branch and indirect call. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.640658334@infradead.org --- arch/x86/events/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1e90bc7ca36f..05830bb77d40 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,8 +72,9 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); -DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); -DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); @@ -1396,8 +1397,7 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; - if (x86_pmu.limit_period) - x86_pmu.limit_period(event, &left); + static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); @@ -2024,6 +2024,7 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); From 2368516731901c391826f7cf23516173193652fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 May 2022 16:41:25 +0200 Subject: [PATCH 076/145] perf/x86/intel: Remove x86_pmu::set_topdown_event_period Now that it is all internal to the intel driver, remove x86_pmu::set_topdown_event_period. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.706354189@infradead.org --- arch/x86/events/intel/core.c | 16 ++++++++++------ arch/x86/events/perf_event.h | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 92cc390590d1..75400ed0eac3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2521,6 +2521,8 @@ static int adl_set_topdown_event_period(struct perf_event *event) return icl_set_topdown_event_period(event); } +DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); + static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) { u32 val; @@ -2811,9 +2813,8 @@ int intel_pmu_save_and_restart(struct perf_event *event) static int intel_pmu_set_period(struct perf_event *event) { - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_set_topdown_event_period)(event); return x86_perf_event_set_period(event); } @@ -6292,7 +6293,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Icelake events, "); name = "icelake"; break; @@ -6330,7 +6332,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); name = "sapphire_rapids"; break; @@ -6367,7 +6370,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = adl_update_topdown_event; - x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &adl_set_topdown_event_period); x86_pmu.filter_match = intel_pmu_filter_match; x86_pmu.get_event_constraints = adl_get_event_constraints; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e82d2d212534..be27eadc40a3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -890,7 +890,6 @@ struct x86_pmu { */ int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); - int (*set_topdown_event_period)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) From 1acab2e01c9c5df00f2fddf3473014dea89dcb5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 May 2022 17:02:05 +0200 Subject: [PATCH 077/145] perf/x86/intel: Remove x86_pmu::update_topdown_event Now that it is all internal to the intel driver, remove x86_pmu::update_topdown_event. Assumes that is_topdown_count(event) can only be true when the hardware has topdown stuff and the function is set. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.771635301@infradead.org --- arch/x86/events/intel/core.c | 22 ++++++++++++---------- arch/x86/events/perf_event.h | 1 - 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 75400ed0eac3..1e429e86a7f4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2673,6 +2673,7 @@ static u64 adl_update_topdown_event(struct perf_event *event) return icl_update_topdown_event(event); } +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); static void intel_pmu_read_topdown_event(struct perf_event *event) { @@ -2684,7 +2685,7 @@ static void intel_pmu_read_topdown_event(struct perf_event *event) return; perf_pmu_disable(event->pmu); - x86_pmu.update_topdown_event(event); + static_call(intel_pmu_update_topdown_event)(event); perf_pmu_enable(event->pmu); } @@ -2692,7 +2693,7 @@ static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event) && x86_pmu.update_topdown_event) + else if (is_topdown_count(event)) intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); @@ -2821,9 +2822,8 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { - if (unlikely(is_topdown_count(event)) && - x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_update_topdown_event)(event); return x86_perf_event_update(event); } @@ -2990,8 +2990,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - if (x86_pmu.update_topdown_event) - x86_pmu.update_topdown_event(NULL); + static_call(intel_pmu_update_topdown_event)(NULL); } /* @@ -6292,7 +6291,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; - x86_pmu.update_topdown_event = icl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &icl_set_topdown_event_period); pr_cont("Icelake events, "); @@ -6331,7 +6331,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = icl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); @@ -6369,7 +6370,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_adl(); x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = adl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &adl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &adl_set_topdown_event_period); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index be27eadc40a3..386ebfa416da 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -889,7 +889,6 @@ struct x86_pmu { * Intel perf metrics */ int num_topdown_events; - u64 (*update_topdown_event)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) From dbf4e792beadafc684ef455453c613ff182c7723 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2022 15:38:43 +0200 Subject: [PATCH 078/145] perf/x86/p4: Remove perfctr_second_write quirk Now that we have a x86_pmu::set_period() method, use it to remove the perfctr_second_write quirk from the generic code. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.839502514@infradead.org --- arch/x86/events/core.c | 12 +----------- arch/x86/events/intel/p4.c | 37 ++++++++++++++++++++++++++---------- arch/x86/events/perf_event.h | 2 +- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 05830bb77d40..b30b8bbcd1e2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1356,7 +1356,7 @@ static void x86_pmu_enable(struct pmu *pmu) static_call(x86_pmu_enable_all)(added); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); +DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -1416,16 +1416,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - perf_event_update_userpage(event); return ret; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 7951a5dc73b6..03bbcc2fa2ff 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -1006,6 +1006,29 @@ static void p4_pmu_enable_all(int added) } } +static int p4_pmu_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = this_cpu_read(pmc_prev_left[hwc->idx]); + int ret; + + ret = x86_perf_event_set_period(event); + + if (hwc->event_base) { + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + return ret; +} + static int p4_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -1044,7 +1067,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) /* event overflow for sure */ perf_sample_data_init(&data, 0, hwc->last_period); - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; @@ -1316,6 +1339,9 @@ static __initconst const struct x86_pmu p4_pmu = { .enable_all = p4_pmu_enable_all, .enable = p4_pmu_enable_event, .disable = p4_pmu_disable_event, + + .set_period = p4_pmu_set_period, + .eventsel = MSR_P4_BPU_CCCR0, .perfctr = MSR_P4_BPU_PERFCTR0, .event_map = p4_pmu_event_map, @@ -1334,15 +1360,6 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken from OProfile code - */ - .perfctr_second_write = 1, .format_attrs = intel_p4_formats_attr, }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 386ebfa416da..20c2ee2128fe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -780,7 +780,6 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; - int perfctr_second_write; void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ @@ -1060,6 +1059,7 @@ static inline bool x86_pmu_has_lbr_callstack(void) } DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); +DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); int x86_perf_event_set_period(struct perf_event *event); From fae9ebde9696385fa2e993e752cf68d9781f3ea0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 4 Aug 2022 07:07:29 -0700 Subject: [PATCH 079/145] perf/x86/intel: Optimize FIXED_CTR_CTRL access All the fixed counters share a fixed control register. The current perf reads and re-writes the fixed control register for each fixed counter disable/enable, which is unnecessary. When changing the fixed control register, the entire PMU must be disabled via the global control register. The changing cannot be taken effect until the entire PMU is re-enabled. Only updating the fixed control register once right before the entire PMU re-enabling is enough. The read of the fixed control register is not necessary either. The value can be cached in the per CPU cpu_hw_events. Test results: Counting all the fixed counters with the perf bench sched pipe as below on a SPR machine. $perf stat -e cycles,instructions,ref-cycles,slots --no-inherit -- taskset -c 1 perf bench sched pipe The Total elapsed time reduces from 5.36s (without the patch) to 4.99s (with the patch), which is ~6.9% improvement. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220804140729.2951259-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 22 +++++++++++++--------- arch/x86/events/perf_event.h | 4 ++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1e429e86a7f4..7f4e7e6b45f0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2190,6 +2190,12 @@ static void __intel_pmu_enable_all(int added, bool pmi) u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); intel_pmu_lbr_enable_all(pmi); + + if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); + cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; + } + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -2407,9 +2413,10 @@ static inline void intel_clear_masks(struct perf_event *event, int idx) static void intel_pmu_disable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask; int idx = hwc->idx; + u64 mask; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2426,9 +2433,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event) intel_clear_masks(event, idx); mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; } static void intel_pmu_disable_event(struct perf_event *event) @@ -2701,8 +2706,9 @@ static void intel_pmu_read_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask, bits = 0; + u64 mask, bits = 0; int idx = hwc->idx; if (is_topdown_idx(idx)) { @@ -2746,10 +2752,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); } - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val |= bits; } static void intel_pmu_enable_event(struct perf_event *event) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 20c2ee2128fe..371967054871 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -270,6 +270,10 @@ struct cpu_hw_events { u64 active_pebs_data_cfg; int pebs_record_size; + /* Intel Fixed counter configuration */ + u64 fixed_ctrl_val; + u64 active_fixed_ctrl_val; + /* * Intel LBR bits */ From 3749d33e510c3dc695b3a5886b706310890d7ebd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:02 -0700 Subject: [PATCH 080/145] perf: Use sample_flags for callchain So that it can call perf_callchain() only if needed. Historically it used __PERF_SAMPLE_CALLCHAIN_EARLY but we can do that with sample_flags in the struct perf_sample_data. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-1-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- kernel/events/core.c | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c251bc44c088..dab094166693 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -798,8 +798,10 @@ fail: * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; + } throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a5275c235c2a..4ba6ab6d0d92 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1546,8 +1546,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } /* * We use the interrupt regs as a base because the PEBS record does not @@ -1719,8 +1721,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } *regs = *iregs; /* The ip in basic is EventingIP */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e90e454b995..c98ecf3e09ba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7320,7 +7320,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; From 16817ad7e8b31728b44ff9f17d8d894ed8a450d0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:03 -0700 Subject: [PATCH 081/145] perf/bpf: Always use perf callchains if exist If the perf_event has PERF_SAMPLE_CALLCHAIN, BPF can use it for stack trace. The problematic cases like PEBS and IBS already handled in the PMU driver and they filled the callchain info in the sample data. For others, we can call perf_callchain() before the BPF handler. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-2-namhyung@kernel.org --- kernel/bpf/stackmap.c | 4 ++-- kernel/events/core.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1adbe67cdb95..aecea7451b61 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); @@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, int err = -EINVAL; __u64 nr_kernel; - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | diff --git a/kernel/events/core.c b/kernel/events/core.c index c98ecf3e09ba..7da5515082f9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10000,8 +10000,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10027,7 +10035,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* From b4e12b2d70fd9eccdb3cef8015dc1788ca38e3fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:04 -0700 Subject: [PATCH 082/145] perf: Kill __PERF_SAMPLE_CALLCHAIN_EARLY There's no in-tree user anymore. Let's get rid of it. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-3-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 10 ---------- arch/x86/events/intel/core.c | 3 --- include/uapi/linux/perf_event.h | 2 -- 3 files changed, 15 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index dab094166693..ce5720bfb350 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; - /* - * rip recorded by IbsOpRip will not be consistent with rsp and rbp - * recorded as part of interrupt regs. Thus we need to use rip from - * interrupt regs while unwinding call stack. Setting _EARLY flag - * makes sure we unwind call-stack before perf sample rip is set to - * IbsOpRip. - */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; - return 0; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7f4e7e6b45f0..b16c91ac9219 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3868,9 +3868,6 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dca16582885f..e639c74cf5fb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -164,8 +164,6 @@ enum perf_event_sample_format { PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ - - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) From 7a7621dfa417aa3715d2a3bd1bdd6cf5018274d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 7 Sep 2022 11:01:20 +0200 Subject: [PATCH 083/145] objtool,x86: Teach decode about LOOP* instructions When 'discussing' control flow Masami mentioned the LOOP* instructions and I realized objtool doesn't decode them properly. As it turns out, these instructions are somewhat inefficient and as such unlikely to be emitted by the compiler (a few vmlinux.o checks can't find a single one) so this isn't critical, but still, best to decode them properly. Reported-by: Masami Hiramatsu (Google) Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Yxhd4EMKyoFoH9y4@hirez.programming.kicks-ass.net --- tools/objtool/arch/x86/decode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index c260006106be..1c253b4b7ce0 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -635,6 +635,12 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec *type = INSN_CONTEXT_SWITCH; break; + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + *type = INSN_JUMP_CONDITIONAL; + break; + case 0xe8: *type = INSN_CALL; /* From 2747b93ebbede2af2d7bb088b9ddae3193ceede8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 18:08:56 +0200 Subject: [PATCH 084/145] locking: Detect includes rwlock.h outside of spinlock.h From: Michael S. Tsirkin The check for __LINUX_SPINLOCK_H within rwlock.h (and other files) detects the direct include of the header file if it is at the very beginning of the include section. If it is listed later then chances are high that spinlock.h was already included (including rwlock.h) and the additional listing of rwlock.h will not cause any failure. On PREEMPT_RT this additional rwlock.h will lead to compile failures since it uses a different rwlock implementation. Add __LINUX_INSIDE_SPINLOCK_H to spinlock.h and check for this instead of __LINUX_SPINLOCK_H to detect wrong includes. This will help detect direct includes of rwlock.h with without PREEMPT_RT enabled. [ bigeasy: add remaining __LINUX_SPINLOCK_H user and rewrite commit description. ] Signed-off-by: Michael S. Tsirkin Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YweemHxJx7O8rjBx@linutronix.de --- include/linux/rwlock.h | 2 +- include/linux/spinlock.h | 2 ++ include/linux/spinlock_api_smp.h | 2 +- include/linux/spinlock_api_up.h | 2 +- include/linux/spinlock_rt.h | 2 +- include/linux/spinlock_up.h | 2 +- 6 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 8f416c5e929e..c0ef596f340b 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 5c0c5174155d..1341f7d62da4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H +#define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations @@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, void free_bucket_spinlocks(spinlock_t *locks); +#undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51fa0dab68c4..89eb6f4c659c 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index b8ba00ccccde..819aeba1c87e 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 835aedaf68ac..61c49b16f69a 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H #error Do not include directly. Use spinlock.h #endif diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 16521074b6f7..c87204247592 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif From 48dfb5d2560d36fb16c7d430c229d1604ea7d185 Mon Sep 17 00:00:00 2001 From: Gokul krishna Krishnakumar Date: Thu, 8 Sep 2022 23:54:27 +0530 Subject: [PATCH 085/145] locking/rwsem: Disable preemption while trying for rwsem lock Make the region inside the rwsem_write_trylock non preemptible. We observe RT task is hogging CPU when trying to acquire rwsem lock which was acquired by a kworker task but before the rwsem owner was set. Here is the scenario: 1. CFS task (affined to a particular CPU) takes rwsem lock. 2. CFS task gets preempted by a RT task before setting owner. 3. RT task (FIFO) is trying to acquire the lock, but spinning until RT throttling happens for the lock as the lock was taken by CFS task. This patch attempts to fix the above issue by disabling preemption until owner is set for the lock. While at it also fix the issues at the places where rwsem_{set,clear}_owner() are called. This also adds lockdep annotation of preemption disable in rwsem_{set,clear}_owner() on Peter Z. suggestion. Signed-off-by: Gokul krishna Krishnakumar Signed-off-by: Mukesh Ojha Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Link: https://lore.kernel.org/r/1662661467-24203-1-git-send-email-quic_mojha@quicinc.com --- kernel/locking/rwsem.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 65f0262f635e..44873594de03 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -133,14 +133,19 @@ * the owner value concurrently without lock. Read from owner, however, * may not need READ_ONCE() as long as the pointer value is only used * for comparison and isn't being dereferenced. + * + * Both rwsem_{set,clear}_owner() functions should be in the same + * preempt disable section as the atomic op that changes sem->count. */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, 0); } @@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) static inline bool rwsem_write_trylock(struct rw_semaphore *sem) { long tmp = RWSEM_UNLOCKED_VALUE; + bool ret = false; + preempt_disable(); if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - return true; + ret = true; } - return false; + preempt_enable(); + return ret; } /* @@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + preempt_enable(); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } From 0d97db026509c1a13f732b22670ab1f0ac9d8d87 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 17:08:03 -0700 Subject: [PATCH 086/145] locking: Add __sched to semaphore functions The internal functions are marked with __sched already, let's do the same for external functions too so that we can skip them in the stack trace. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220909000803.4181857-1-namhyung@kernel.org --- kernel/locking/semaphore.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index f2654d2fe43a..34bfae72f295 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -51,7 +51,7 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { unsigned long flags; @@ -74,7 +74,7 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. */ -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -101,7 +101,7 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int __sched down_killable(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -131,7 +131,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int __sched down_trylock(struct semaphore *sem) { unsigned long flags; int count; @@ -156,7 +156,7 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long timeout) +int __sched down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -180,7 +180,7 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). */ -void up(struct semaphore *sem) +void __sched up(struct semaphore *sem) { unsigned long flags; From c7323a5ad0786371f61dca49fc26f7ab3a68e0da Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 23 Aug 2022 19:03:57 +0200 Subject: [PATCH 087/145] mm/slub: restrict sysfs validation to debug caches and make it safe Rongwei Wang reports [1] that cache validation triggered by writing to /sys/kernel/slab//validate is racy against normal cache operations (e.g. freeing) in a way that can cause false positive inconsistency reports for caches with debugging enabled. The problem is that debugging actions that mark object free or active and actual freelist operations are not atomic, and the validation can see an inconsistent state. For caches that do or don't have debugging enabled, additional races involving n->nr_slabs are possible that result in false reports of wrong slab counts. This patch attempts to solve these issues while not adding overhead to normal (especially fastpath) operations for caches that do not have debugging enabled. Such overhead would not be justified to make possible userspace-triggered validation safe. Instead, disable the validation for caches that don't have debugging enabled and make their sysfs validate handler return -EINVAL. For caches that do have debugging enabled, we can instead extend the existing approach of not using percpu freelists to force all alloc/free operations to the slow paths where debugging flags is checked and acted upon. There can adjust the debug-specific paths to increase n->list_lock coverage against concurrent validation as necessary. The processing on free in free_debug_processing() already happens under n->list_lock so we can extend it to actually do the freeing as well and thus make it atomic against concurrent validation. As observed by Hyeonggon Yoo, we do not really need to take slab_lock() anymore here because all paths we could race with are protected by n->list_lock under the new scheme, so drop its usage here. The processing on alloc in alloc_debug_processing() currently doesn't take any locks, but we have to first allocate the object from a slab on the partial list (as debugging caches have no percpu slabs) and thus take the n->list_lock anyway. Add a function alloc_single_from_partial() that grabs just the allocated object instead of the whole freelist, and does the debug processing. The n->list_lock coverage again makes it atomic against validation and it is also ultimately more efficient than the current grabbing of freelist immediately followed by slab deactivation. To prevent races on n->nr_slabs updates, make sure that for caches with debugging enabled, inc_slabs_node() or dec_slabs_node() is called under n->list_lock. When allocating a new slab for a debug cache, handle the allocation by a new function alloc_single_from_new_slab() instead of the current forced deactivation path. Neither of these changes affect the fast paths at all. The changes in slow paths are negligible for non-debug caches. [1] https://lore.kernel.org/all/20220529081535.69275-1-rongwei.wang@linux.alibaba.com/ Reported-by: Rongwei Wang Signed-off-by: Vlastimil Babka Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> --- mm/slub.c | 232 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 180 insertions(+), 52 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 87e794ab101a..a18a81a52307 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1324,17 +1324,14 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) + struct slab *slab, void *object) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1604,16 +1601,18 @@ static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } + struct slab *slab, void *object) { return 0; } -static inline int free_debug_processing( +static inline void free_debug_processing( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, - unsigned long addr) { return 0; } + unsigned long addr) {} static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1919,11 +1918,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) - goto out; + return NULL; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; account_slab(slab, oo_order(oo), s, flags); @@ -1950,15 +1951,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - slab->inuse = slab->objects; - slab->frozen = 1; - -out: - if (!slab) - return NULL; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - return slab; } @@ -2045,6 +2037,75 @@ static inline void remove_partial(struct kmem_cache_node *n, n->nr_partial--; } +/* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. + */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object)) { + remove_partial(n, slab); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, + struct slab *slab) +{ + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object)) + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + */ + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + /* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. @@ -2125,6 +2186,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(slab, gfpflags)) continue; + if (kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab); + if (object) + break; + continue; + } + t = acquire_slab(s, n, slab, object == NULL); if (!t) break; @@ -2733,31 +2801,39 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) } /* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( +static noinline void free_debug_processing( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, unsigned long addr) { struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; void *object = head; int cnt = 0; - unsigned long flags, flags2; - int ret = 0; + unsigned long flags; + bool checks_ok = false; depot_stack_handle_t handle = 0; if (s->flags & SLAB_STORE_USER) handle = set_track_prepare(); spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, slab)) goto out; } + if (slab->inuse < bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, bulk_cnt); + goto out; + } + next_object: - cnt++; + + if (++cnt > bulk_cnt) + goto out_cnt; if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!free_consistency_checks(s, slab, object, addr)) @@ -2775,18 +2851,57 @@ next_object: object = get_freepointer(s, object); goto next_object; } - ret = 1; + checks_ok = true; -out: +out_cnt: if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", bulk_cnt, cnt); - slab_unlock(slab, &flags2); +out: + if (checks_ok) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* Do we need to remove the slab from full or partial list? */ + if (!prior) { + remove_full(s, n, slab); + } else if (slab->inuse == 0 && + n->nr_partial >= s->min_partial) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + + /* Do we need to discard the slab or add to partial list? */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) { + slab_free = slab; + } else if (!prior) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) + + if (!checks_ok) slab_fix(s, "Object at 0x%p not freed", object); - return ret; + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } } #endif /* CONFIG_SLUB_DEBUG */ @@ -3036,36 +3151,52 @@ new_objects: return NULL; } + stat(s, ALLOC_SLAB); + + if (kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab); + + if (unlikely(!freelist)) + goto new_objects; + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + /* * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ freelist = slab->freelist; slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; - stat(s, ALLOC_SLAB); + inc_slabs_node(s, slab_nid(slab), slab->objects); check_new_slab: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; - } + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + return freelist; } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. */ - goto return_single; + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } retry_load_slab: @@ -3089,11 +3220,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; - -return_single: - - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; } /* @@ -3341,9 +3467,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + if (kmem_cache_debug(s)) { + free_debug_processing(s, slab, head, tail, cnt, addr); return; + } do { if (unlikely(n)) { @@ -3936,6 +4063,7 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); + inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); @@ -3950,7 +4078,6 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - slab->frozen = 0; kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); @@ -4611,6 +4738,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } @@ -4626,7 +4754,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) - discard_slab(s, slab); + free_slab(s, slab); if (slabs_node(s, node)) ret = 1; @@ -5601,7 +5729,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; From 41bec7c33f37aaae6e3737615e2dfa17a30ea985 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 23 Aug 2022 19:03:58 +0200 Subject: [PATCH 088/145] mm/slub: remove slab_lock() usage for debug operations All alloc and free operations on debug caches are now serialized by n->list_lock, so we can remove slab_lock() usage in validate_slab() and list_slab_objects() as those also happen under n->list_lock. Note the usage in list_slab_objects() could happen even on non-debug caches, but only during cache shutdown time, so there should not be any parallel freeing activity anymore. Except for buggy slab users, but in that case the slab_lock() would not help against the common cmpxchg based fast paths (in non-debug caches) anyway. Also adjust documentation comments accordingly. Suggested-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: David Rientjes --- mm/slub.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a18a81a52307..37234e60591c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -64,8 +64,9 @@ * The slab_lock is a wrapper around the page lock, thus it is a bit * spinlock. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * * A. slab->freelist -> List of free objects in a slab * B. slab->inuse -> Number of objects in use * C. slab->objects -> Number of objects in slab @@ -94,6 +95,9 @@ * allocating a long series of objects that fill up slabs does not require * the list lock. * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * * cpu_slab->lock local lock * * This locks protect slowpath manipulation of all kmem_cache_cpu fields @@ -4369,7 +4373,6 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, void *p; slab_err(s, slab, text, s->name); - slab_lock(slab, &flags); map = get_map(s, slab); for_each_object(p, s, addr, slab->objects) { @@ -4380,7 +4383,6 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, } } put_map(map); - slab_unlock(slab, &flags); #endif } @@ -5108,12 +5110,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, { void *p; void *addr = slab_address(slab); - unsigned long flags; - - slab_lock(slab, &flags); if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) - goto unlock; + return; /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); @@ -5124,8 +5123,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, if (!check_object(s, slab, p, val)) break; } -unlock: - slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, From 4ef3f5a32051def596b2d445462a1dccda7af600 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 23 Aug 2022 19:03:59 +0200 Subject: [PATCH 089/145] mm/slub: convert object_map_lock to non-raw spinlock The only remaining user of object_map_lock is list_slab_objects(). Obtaining the lock there used to happen under slab_lock() which implied disabling irqs on PREEMPT_RT, thus it's a raw_spinlock. With the slab_lock() removed, we can convert it to a normal spinlock. Also remove the get_map()/put_map() wrappers as list_slab_objects() became their only remaining user. Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Sebastian Andrzej Siewior --- mm/slub.c | 36 ++++++------------------------------ 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 37234e60591c..e0759b5fe506 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -565,7 +565,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_RAW_SPINLOCK(object_map_lock); +static DEFINE_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct slab *slab) @@ -599,30 +599,6 @@ static bool slab_add_kunit_errors(void) static inline bool slab_add_kunit_errors(void) { return false; } #endif -/* - * Determine a map of objects in use in a slab. - * - * Node listlock must be held to guarantee that the slab does - * not vanish from under us. - */ -static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) - __acquires(&object_map_lock) -{ - VM_BUG_ON(!irqs_disabled()); - - raw_spin_lock(&object_map_lock); - - __fill_map(object_map, s, slab); - - return object_map; -} - -static void put_map(unsigned long *map) __releases(&object_map_lock) -{ - VM_BUG_ON(map != object_map); - raw_spin_unlock(&object_map_lock); -} - static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) @@ -4368,21 +4344,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); - unsigned long flags; - unsigned long *map; void *p; slab_err(s, slab, text, s->name); - map = get_map(s, slab); + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + for_each_object(p, s, addr, slab->objects) { - if (!test_bit(__obj_to_index(s, addr, p), map)) { + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - put_map(map); + spin_unlock(&object_map_lock); #endif } From 5875e59828a026e47f37c5e343f4fe8e0ba023b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 23 Aug 2022 19:04:00 +0200 Subject: [PATCH 090/145] mm/slub: simplify __cmpxchg_double_slab() and slab_[un]lock() The PREEMPT_RT specific disabling of irqs in __cmpxchg_double_slab() (through slab_[un]lock()) is unnecessary as bit_spin_lock() disables preemption and that's sufficient on PREEMPT_RT where no allocation/free operation is performed in hardirq context and so can't interrupt the current operation. That means we no longer need the slab_[un]lock() wrappers, so delete them and rename the current __slab_[un]lock() to slab_[un]lock(). Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Reviewed-by: Sebastian Andrzej Siewior --- mm/slub.c | 39 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 27 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e0759b5fe506..d58f19502a92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -446,7 +446,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) /* * Per slab locking using the pagelock */ -static __always_inline void __slab_lock(struct slab *slab) +static __always_inline void slab_lock(struct slab *slab) { struct page *page = slab_page(slab); @@ -454,7 +454,7 @@ static __always_inline void __slab_lock(struct slab *slab) bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void __slab_unlock(struct slab *slab) +static __always_inline void slab_unlock(struct slab *slab) { struct page *page = slab_page(slab); @@ -462,24 +462,12 @@ static __always_inline void __slab_unlock(struct slab *slab) __bit_spin_unlock(PG_locked, &page->flags); } -static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) -{ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(*flags); - __slab_lock(slab); -} - -static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) -{ - __slab_unlock(slab); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(*flags); -} - /* * Interrupts must be disabled (for the fallback code to work right), typically - * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different - * so we disable interrupts as part of slab_[un]lock(). + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, @@ -498,18 +486,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab } else #endif { - /* init to 0 to prevent spurious warnings */ - unsigned long flags = 0; - - slab_lock(slab, &flags); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - slab_unlock(slab, &flags); + slab_unlock(slab); return true; } - slab_unlock(slab, &flags); + slab_unlock(slab); } cpu_relax(); @@ -540,16 +525,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, unsigned long flags; local_irq_save(flags); - __slab_lock(slab); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); return true; } - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); } From 1f04b07d976da0666dbc2b170634c5531974dfa1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 09:51:36 +0200 Subject: [PATCH 091/145] slub: Make PREEMPT_RT support less convoluted The slub code already has a few helpers depending on PREEMPT_RT. Add a few more and get rid of the CONFIG_PREEMPT_RT conditionals all over the place. No functional change. Signed-off-by: Thomas Gleixner Cc: Andrew Morton Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Vlastimil Babka Cc: linux-mm@kvack.org Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Vlastimil Babka --- mm/slub.c | 56 ++++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d58f19502a92..d9650f2ca776 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -104,9 +104,11 @@ * except the stat counters. This is a percpu structure manipulated only by * the local cpu, so the lock protects against being preempted or interrupted * by an irq. Fast path operations rely on lockless operations instead. - * On PREEMPT_RT, the local lock does not actually disable irqs (and thus - * prevent the lockless operations), so fastpath operations also need to take - * the lock and are no longer lockless. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. * * lockless fastpaths * @@ -167,8 +169,9 @@ * function call even on !PREEMPT_RT, use inline preempt_disable() there. */ #ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) #else #define slub_get_cpu_ptr(var) \ ({ \ @@ -180,6 +183,7 @@ do { \ (void)(var); \ migrate_enable(); \ } while (0) +#define USE_LOCKLESS_FAST_PATH() (false) #endif #ifdef CONFIG_SLUB_DEBUG @@ -474,7 +478,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab void *freelist_new, unsigned long counters_new, const char *n) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -3288,14 +3292,8 @@ redo: object = c->freelist; slab = c->slab; - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if a - * slowpath has taken the local_lock_irqsave(), it is not protected - * against a fast path operation in an irq handler. So we need to take - * the slow path which uses local_lock. It is still relatively fast if - * there is a suitable cpu freelist. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || + + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { @@ -3555,6 +3553,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; + void **freelist; redo: /* @@ -3569,9 +3568,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); - if (likely(slab == c->slab)) { -#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); + if (unlikely(slab != c->slab)) { + __slab_free(s, slab, head, tail_obj, cnt, addr); + return; + } + + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3583,16 +3586,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } -#else /* CONFIG_PREEMPT_RT */ - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if - * a slowpath has taken the local_lock_irqsave(), it is not - * protected against a fast path operation in an irq handler. So - * we need to take the local_lock. We shouldn't simply defer to - * __slab_free() as that wouldn't use the cpu freelist at all. - */ - void **freelist; - + } else { + /* Update the free list under the local lock */ local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { @@ -3607,11 +3602,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); -#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, slab, head, tail_obj, cnt, addr); - + } + stat(s, FREE_FASTPATH); } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, From 555bb4ccd1dd78d0263eae31629fe1fdd65c1fb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:24 +0200 Subject: [PATCH 092/145] preempt: Provide preempt_[dis|en]able_nested() On PREEMPT_RT enabled kernels, spinlocks and rwlocks are neither disabling preemption nor interrupts. Though there are a few places which depend on the implicit preemption/interrupt disable of those locks, e.g. seqcount write sections, per CPU statistics updates etc. To avoid sprinkling CONFIG_PREEMPT_RT conditionals all over the place, add preempt_disable_nested() and preempt_enable_nested() which should be descriptive enough. Add a lockdep assertion for the !PREEMPT_RT case to catch callers which do not have preemption disabled. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-2-bigeasy@linutronix.de --- include/linux/preempt.h | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b4381f255a5c..0df425bf9bd7 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -421,4 +421,46 @@ static inline void migrate_enable(void) { } #endif /* CONFIG_SMP */ +/** + * preempt_disable_nested - Disable preemption inside a normally preempt disabled section + * + * Use for code which requires preemption protection inside a critical + * section which has preemption disabled implicitly on non-PREEMPT_RT + * enabled kernels, by e.g.: + * - holding a spinlock/rwlock + * - soft interrupt context + * - regular interrupt handlers + * + * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft + * interrupt context and regular interrupt handlers are preemptible and + * only prevent migration. preempt_disable_nested() ensures that preemption + * is disabled for cases which require CPU local serialization even on + * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. + * + * The use cases are code sequences which are not serialized by a + * particular lock instance, e.g.: + * - seqcount write side critical sections where the seqcount is not + * associated to a particular lock and therefore the automatic + * protection mechanism does not work. This prevents a live lock + * against a preempting high priority reader. + * - RMW per CPU variable updates like vmstat. + */ +/* Macro to avoid header recursion hell vs. lockdep */ +#define preempt_disable_nested() \ +do { \ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ + preempt_disable(); \ + else \ + lockdep_assert_preemption_disabled(); \ +} while (0) + +/** + * preempt_enable_nested - Undo the effect of preempt_disable_nested() + */ +static __always_inline void preempt_enable_nested(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} + #endif /* __LINUX_PREEMPT_H */ From 93f6d4e1893657f07ba3c9e2bfa74b355a0b32f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:25 +0200 Subject: [PATCH 093/145] dentry: Use preempt_[dis|en]able_nested() Replace the open coded CONFIG_PREEMPT_RT conditional preempt_disable/enable() with the new helper. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220825164131.402717-3-bigeasy@linutronix.de --- fs/dcache.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index bb0c4d0038db..2ee8636016ee 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - /* - * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT - * kernels spin_lock() implicitly disables preemption, but not on - * PREEMPT_RT. So for RT it has to be done explicitly to protect - * the sequence count write side critical section against a reader - * or another writer preempting, which would result in a live lock. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); wake_up_all(d_wait); } From 7a025e91abd23effe869a05d037b26770ffa0309 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:26 +0200 Subject: [PATCH 094/145] mm/vmstat: Use preempt_[dis|en]able_nested() Replace the open coded CONFIG_PREEMPT_RT conditional preempt_enable/disable() pairs with the new helper functions which hide the underlying implementation details. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Link: https://lore.kernel.org/r/20220825164131.402717-4-bigeasy@linutronix.de --- mm/vmstat.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 90af9a8572f5..7a2d73f15230 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) From a738e9bad6030d4fd33bfd7db3399a250b7e94d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:27 +0200 Subject: [PATCH 095/145] mm/debug: Provide VM_WARN_ON_IRQS_ENABLED() Some places in the VM code expect interrupts disabled, which is a valid expectation on non-PREEMPT_RT kernels, but does not hold on RT kernels in some places because the RT spinlock substitution does not disable interrupts. To avoid sprinkling CONFIG_PREEMPT_RT conditionals into those places, provide VM_WARN_ON_IRQS_ENABLED() which is only enabled when VM_DEBUG=y and PREEMPT_RT=n. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Link: https://lore.kernel.org/r/20220825164131.402717-5-bigeasy@linutronix.de --- include/linux/mmdebug.h | 6 ++++++ lib/Kconfig.debug | 3 +++ 2 files changed, 9 insertions(+) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 15ae78cd2853..b8728d11c949 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif +#ifdef CONFIG_DEBUG_VM_IRQSOFF +#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) +#else +#define VM_WARN_ON_IRQS_ENABLED() do { } while (0) +#endif + #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index bcbe60d6c80c..cdb4b27ef1a0 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -803,6 +803,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. +config DEBUG_VM_IRQSOFF + def_bool DEBUG_VM && !PREEMPT_RT + config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL From e575d401583273a7ac5dfb27520e41c821e81816 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:28 +0200 Subject: [PATCH 096/145] mm/memcontrol: Replace the PREEMPT_RT conditionals Use VM_WARN_ON_IRQS_ENABLED() and preempt_disable/enable_nested() to replace the CONFIG_PREEMPT_RT #ifdeffery. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Link: https://lore.kernel.org/r/20220825164131.402717-6-bigeasy@linutronix.de --- mm/memcontrol.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..d35b6fa560f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -597,25 +597,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#else - VM_BUG_ON(!irqs_disabled()); -#endif + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); } static void __memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#endif + preempt_disable_nested(); } static void memcg_stats_unlock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_enable(); -#endif + preempt_enable_nested(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) @@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (IS_ENABLED(CONFIG_DEBUG_VM)) { switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: @@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, WARN_ON_ONCE(!in_task()); break; default: - WARN_ON_ONCE(!irqs_disabled()); + VM_WARN_ON_IRQS_ENABLED(); } } From c7e0b3d088717d148707cd6fcb12f97c6fd961c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:29 +0200 Subject: [PATCH 097/145] mm/compaction: Get rid of RT ifdeffery Move the RT dependency for the initial value of sysctl_compact_unevictable_allowed into Kconfig. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-7-bigeasy@linutronix.de --- mm/Kconfig | 6 ++++++ mm/compaction.c | 6 +----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 0331f1461f81..3897e924e40f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -579,6 +579,12 @@ config COMPACTION it and then we would be really interested to hear about that at linux-mm@kvack.org. +config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION + default 0 if PREEMPT_RT + default 1 + # # support for free page reporting config PAGE_REPORTING diff --git a/mm/compaction.c b/mm/compaction.c index 640fa76228dd..10561cb1aaad 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1727,11 +1727,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. */ -#ifdef CONFIG_PREEMPT_RT -int sysctl_compact_unevictable_allowed __read_mostly = 0; -#else -int sysctl_compact_unevictable_allowed __read_mostly = 1; -#endif +int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) From 9458e0a78c45bc6537ce11eb9f03489eab92f9c2 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Aug 2022 18:41:30 +0200 Subject: [PATCH 098/145] flex_proportions: Disable preemption entering the write section. The seqcount fprop_global::sequence is not associated with a lock. The write section (fprop_new_period()) is invoked from a timer and since the softirq is preemptible on PREEMPT_RT it is possible to preempt the write section which is not desited. Disable preemption around the write section on PREEMPT_RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220825164131.402717-8-bigeasy@linutronix.de --- lib/flex_proportions.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 05cccbcf1661..83332fefa6f4 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) */ if (events <= 1) return false; + preempt_disable_nested(); write_seqcount_begin(&p->sequence); if (periods < 64) events -= events >> periods; @@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) percpu_counter_add(&p->events, -events); p->period += periods; write_seqcount_end(&p->sequence); + preempt_enable_nested(); return true; } From 44b0c2957adc62b86fcd51adeaf8e993171bc319 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 25 Aug 2022 18:41:31 +0200 Subject: [PATCH 099/145] u64_stats: Streamline the implementation The u64 stats code handles 3 different cases: - 32bit UP - 32bit SMP - 64bit with an unreadable #ifdef maze, which was recently expanded with PREEMPT_RT conditionals. Reduce it to two cases (32bit and 64bit) and drop the optimization for 32bit UP as suggested by Linus. Use the new preempt_disable/enable_nested() helpers to get rid of the CONFIG_PREEMPT_RT conditionals. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220825164131.402717-9-bigeasy@linutronix.de --- include/linux/u64_stats_sync.h | 147 +++++++++++++++------------------ 1 file changed, 65 insertions(+), 82 deletions(-) diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 6ad4e9032d53..46040d66334a 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -8,7 +8,7 @@ * * Key points : * - * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. + * - Use a seqcount on 32-bit * - The whole thing is a no-op on 64-bit architectures. * * Usage constraints: @@ -20,7 +20,8 @@ * writer and also spin forever. * * 3) Write side must use the _irqsave() variant if other writers, or a reader, - * can be invoked from an IRQ context. + * can be invoked from an IRQ context. On 64bit systems this variant does not + * disable interrupts. * * 4) If reader fetches several counters, there is no guarantee the whole values * are consistent w.r.t. each other (remember point #2: seqcounts are not @@ -29,11 +30,6 @@ * 5) Readers are allowed to sleep or be preempted/interrupted: they perform * pure reads. * - * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats - * might be updated from a hardirq or softirq context (remember point #1: - * seqcounts are not used for UP kernels). 32-bit UP stat readers could read - * corrupted 64-bit values otherwise. - * * Usage : * * Stats producer (writer) should use following template granted it already got @@ -66,7 +62,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +#if BITS_PER_LONG == 32 seqcount_t seq; #endif }; @@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) local64_inc(&p->v); } -#else +static inline void u64_stats_init(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { } +static inline unsigned long __u64_stats_irqsave(void) { return 0; } +static inline void __u64_stats_irqrestore(unsigned long flags) { } +static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ + return 0; +} +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return false; +} + +#else /* 64 bit */ typedef struct { u64 v; @@ -123,123 +134,95 @@ static inline void u64_stats_inc(u64_stats_t *p) { p->v++; } -#endif -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) -#else static inline void u64_stats_init(struct u64_stats_sync *syncp) { + seqcount_init(&syncp->seq); } -#endif -static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); write_seqcount_begin(&syncp->seq); -#endif } -static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif + preempt_enable_nested(); } -static inline unsigned long -u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +static inline unsigned long __u64_stats_irqsave(void) { - unsigned long flags = 0; + unsigned long flags; -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); - else - local_irq_save(flags); - write_seqcount_begin(&syncp->seq); -#endif + local_irq_save(flags); return flags; } -static inline void -u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, - unsigned long flags) +static inline void __u64_stats_irqrestore(unsigned long flags) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); - else - local_irq_restore(flags); -#endif + local_irq_restore(flags); } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); -#else - return 0; -#endif +} + +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return read_seqcount_retry(&syncp->seq, start); +} +#endif /* !64 bit */ + +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) +{ + __u64_stats_update_begin(syncp); +} + +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +{ + __u64_stats_update_end(syncp); +} + +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); + + __u64_stats_update_begin(syncp); + return flags; +} + +static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); } static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_disable(); -#endif return __u64_stats_fetch_begin(syncp); } -static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) -{ -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - return read_seqcount_retry(&syncp->seq, start); -#else - return false; -#endif -} - static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif return __u64_stats_fetch_retry(syncp, start); } -/* - * In case irq handlers can update u64 counters, readers can use following helpers - * - SMP 32bit arches use seqcount protection, irq safe. - * - UP 32bit must disable irqs. - * - 64bit have no problem atomically reading u64 values, irq safe. - */ +/* Obsolete interfaces */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif - return __u64_stats_fetch_begin(syncp); + return u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_enable(); -#endif - return __u64_stats_fetch_retry(syncp, start); + return u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */ From 4364044c326cbf2ad09e9fa99a9a28a342fecce6 Mon Sep 17 00:00:00 2001 From: Victor Hassan Date: Tue, 6 Sep 2022 13:20:56 +0800 Subject: [PATCH 100/145] clocksource/drivers/sun4i: Add definition of clear interrupt To prevent misunderstanding, use TIMER_IRQ_CLEAR instead of TIMER_IRQ_EN in function sun4i_timer_clear_interrupt. Signed-off-by: Victor Hassan Acked-by: Jernej Skrabec Link: https://lore.kernel.org/r/20220906052056.43404-1-victor@allwinnertech.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-sun4i.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-sun4i.c b/drivers/clocksource/timer-sun4i.c index 94dc6e42e983..e5a70aa1deb4 100644 --- a/drivers/clocksource/timer-sun4i.c +++ b/drivers/clocksource/timer-sun4i.c @@ -26,6 +26,7 @@ #define TIMER_IRQ_EN_REG 0x00 #define TIMER_IRQ_EN(val) BIT(val) #define TIMER_IRQ_ST_REG 0x04 +#define TIMER_IRQ_CLEAR(val) BIT(val) #define TIMER_CTL_REG(val) (0x10 * val + 0x10) #define TIMER_CTL_ENABLE BIT(0) #define TIMER_CTL_RELOAD BIT(1) @@ -123,7 +124,7 @@ static int sun4i_clkevt_next_event(unsigned long evt, static void sun4i_timer_clear_interrupt(void __iomem *base) { - writel(TIMER_IRQ_EN(0), base + TIMER_IRQ_ST_REG); + writel(TIMER_IRQ_CLEAR(0), base + TIMER_IRQ_ST_REG); } static irqreturn_t sun4i_timer_interrupt(int irq, void *dev_id) From 99b701fd2758d046d9e6ecdef1a3320d29b8b1d9 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 9 Jun 2022 13:27:35 +0200 Subject: [PATCH 101/145] dt-bindings: timer: exynos4210-mct: Add ARTPEC-8 MCT support The ARTPEC-8 has an MCT with 4 global and 8 local timer interrupts. The SoC has a quad-core Cortex-A53 and a single-core Cortex-A5 which share one MCT with one global and eight local timers. The Cortex-A53 and Cortex-A5 do not have cache-coherency between them, and therefore run two separate kernels. The Cortex-A53 boots first and starts the global free-running counter and also registers a clock events device using the global timer. (This global timer clock events is usually replaced by arch timer clock events for each of the cores.) When the A5 boots (via the A53), it should not use the global timer interrupts or write to the global timer registers. This is because even if there are four global comparators, the control bits for all four are in the same registers, and we would need to synchronize between the cpus. Instead, the global timer FRC (already started by the A53) should be used as the clock source, and one of the local timers which are not used by the A53 can be used for clock events on the A5. To support this hardware, add a compatible for the MCT as well as two new properties to describe the hardware-mandated sharing of the FRC and dedicating local timers to specific processors. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20220609112738.359385-2-vincent.whitchurch@axis.com Signed-off-by: Daniel Lezcano --- .../timer/samsung,exynos4210-mct.yaml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml index 9c81d00b12e0..829bd2227f7c 100644 --- a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml +++ b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml @@ -25,6 +25,7 @@ properties: - samsung,exynos4412-mct - items: - enum: + - axis,artpec8-mct - samsung,exynos3250-mct - samsung,exynos5250-mct - samsung,exynos5260-mct @@ -45,6 +46,19 @@ properties: reg: maxItems: 1 + samsung,frc-shared: + type: boolean + description: | + Indicates that the hardware requires that this processor share the + free-running counter with a different (main) processor. + + samsung,local-timers: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 16 + description: | + List of indices of local timers usable from this processor. + interrupts: description: | Interrupts should be put in specific order. This is, the local timer @@ -74,6 +88,17 @@ required: - reg allOf: + - if: + not: + properties: + compatible: + contains: + enum: + - axis,artpec8-mct + then: + properties: + samsung,local-timers: false + samsung,frc-shared: false - if: properties: compatible: @@ -101,6 +126,7 @@ allOf: compatible: contains: enum: + - axis,artpec8-mct - samsung,exynos5260-mct - samsung,exynos5420-mct - samsung,exynos5433-mct From e8550f0e7bde9bd31697e3c534d386f7f3b5787b Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 9 Jun 2022 13:27:36 +0200 Subject: [PATCH 102/145] clocksource/drivers/exynos_mct: Support frc-shared property When the FRC is shared with another main processor, the other processor is assumed to have started it and this processor should not write to the global registers. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20220609112738.359385-3-vincent.whitchurch@axis.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/exynos_mct.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index f29c812b70c9..12023831dedf 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -233,9 +233,16 @@ static cycles_t exynos4_read_current_timer(void) } #endif -static int __init exynos4_clocksource_init(void) +static int __init exynos4_clocksource_init(bool frc_shared) { - exynos4_mct_frc_start(); + /* + * When the frc is shared, the main processer should have already + * turned it on and we shouldn't be writing to TCON. + */ + if (frc_shared) + mct_frc.resume = NULL; + else + exynos4_mct_frc_start(); #if defined(CONFIG_ARM) exynos4_delay_timer.read_current_timer = &exynos4_read_current_timer; @@ -605,6 +612,7 @@ out_irq: static int __init mct_init_dt(struct device_node *np, unsigned int int_type) { + bool frc_shared = of_property_read_bool(np, "samsung,frc-shared"); int ret; ret = exynos4_timer_resources(np); @@ -615,10 +623,17 @@ static int __init mct_init_dt(struct device_node *np, unsigned int int_type) if (ret) return ret; - ret = exynos4_clocksource_init(); + ret = exynos4_clocksource_init(frc_shared); if (ret) return ret; + /* + * When the FRC is shared with a main processor, this secondary + * processor cannot use the global comparator. + */ + if (frc_shared) + return ret; + return exynos4_clockevent_init(); } From 47dbe4eb9822208be2b7ec901c7e0c15536f9c92 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 9 Jun 2022 13:27:37 +0200 Subject: [PATCH 103/145] clocksource/drivers/exynos_mct: Support local-timers property If the device tree indicates that the hardware requires that the processor only use certain local timers, respect that. Signed-off-by: Vincent Whitchurch Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220609112738.359385-4-vincent.whitchurch@axis.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/exynos_mct.c | 62 ++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 12023831dedf..bfd60093ee1c 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -33,7 +33,7 @@ #define EXYNOS4_MCT_G_INT_ENB EXYNOS4_MCTREG(0x248) #define EXYNOS4_MCT_G_WSTAT EXYNOS4_MCTREG(0x24C) #define _EXYNOS4_MCT_L_BASE EXYNOS4_MCTREG(0x300) -#define EXYNOS4_MCT_L_BASE(x) (_EXYNOS4_MCT_L_BASE + (0x100 * x)) +#define EXYNOS4_MCT_L_BASE(x) (_EXYNOS4_MCT_L_BASE + (0x100 * (x))) #define EXYNOS4_MCT_L_MASK (0xffffff00) #define MCT_L_TCNTB_OFFSET (0x00) @@ -66,6 +66,8 @@ #define MCT_L0_IRQ 4 /* Max number of IRQ as per DT binding document */ #define MCT_NR_IRQS 20 +/* Max number of local timers */ +#define MCT_NR_LOCAL (MCT_NR_IRQS - MCT_L0_IRQ) enum { MCT_INT_SPI, @@ -456,7 +458,6 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) per_cpu_ptr(&percpu_mct_tick, cpu); struct clock_event_device *evt = &mevt->evt; - mevt->base = EXYNOS4_MCT_L_BASE(cpu); snprintf(mevt->name, sizeof(mevt->name), "mct_tick%d", cpu); evt->name = mevt->name; @@ -527,8 +528,17 @@ static int __init exynos4_timer_resources(struct device_node *np) return 0; } +/** + * exynos4_timer_interrupts - initialize MCT interrupts + * @np: device node for MCT + * @int_type: interrupt type, MCT_INT_PPI or MCT_INT_SPI + * @local_idx: array mapping CPU numbers to local timer indices + * @nr_local: size of @local_idx array + */ static int __init exynos4_timer_interrupts(struct device_node *np, - unsigned int int_type) + unsigned int int_type, + const u32 *local_idx, + size_t nr_local) { int nr_irqs, i, err, cpu; @@ -561,13 +571,21 @@ static int __init exynos4_timer_interrupts(struct device_node *np, } else { for_each_possible_cpu(cpu) { int mct_irq; + unsigned int irq_idx; struct mct_clock_event_device *pcpu_mevt = per_cpu_ptr(&percpu_mct_tick, cpu); + if (cpu >= nr_local) { + err = -EINVAL; + goto out_irq; + } + + irq_idx = MCT_L0_IRQ + local_idx[cpu]; + pcpu_mevt->evt.irq = -1; - if (MCT_L0_IRQ + cpu >= ARRAY_SIZE(mct_irqs)) + if (irq_idx >= ARRAY_SIZE(mct_irqs)) break; - mct_irq = mct_irqs[MCT_L0_IRQ + cpu]; + mct_irq = mct_irqs[irq_idx]; irq_set_status_flags(mct_irq, IRQ_NOAUTOEN); if (request_irq(mct_irq, @@ -583,6 +601,17 @@ static int __init exynos4_timer_interrupts(struct device_node *np, } } + for_each_possible_cpu(cpu) { + struct mct_clock_event_device *mevt = per_cpu_ptr(&percpu_mct_tick, cpu); + + if (cpu >= nr_local) { + err = -EINVAL; + goto out_irq; + } + + mevt->base = EXYNOS4_MCT_L_BASE(local_idx[cpu]); + } + /* Install hotplug callbacks which configure the timer on this CPU */ err = cpuhp_setup_state(CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING, "clockevents/exynos4/mct_timer:starting", @@ -613,13 +642,34 @@ out_irq: static int __init mct_init_dt(struct device_node *np, unsigned int int_type) { bool frc_shared = of_property_read_bool(np, "samsung,frc-shared"); + u32 local_idx[MCT_NR_LOCAL] = {0}; + int nr_local; int ret; + nr_local = of_property_count_u32_elems(np, "samsung,local-timers"); + if (nr_local == 0) + return -EINVAL; + if (nr_local > 0) { + if (nr_local > ARRAY_SIZE(local_idx)) + return -EINVAL; + + ret = of_property_read_u32_array(np, "samsung,local-timers", + local_idx, nr_local); + if (ret) + return ret; + } else { + int i; + + nr_local = ARRAY_SIZE(local_idx); + for (i = 0; i < nr_local; i++) + local_idx[i] = i; + } + ret = exynos4_timer_resources(np); if (ret) return ret; - ret = exynos4_timer_interrupts(np, int_type); + ret = exynos4_timer_interrupts(np, int_type, local_idx, nr_local); if (ret) return ret; From 55ccdab79524ce9a46965af6e1908b90fa92b303 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 9 Jun 2022 13:27:38 +0200 Subject: [PATCH 104/145] clocksource/drivers/exynos_mct: Enable building on ARTPEC This timer block is used on ARTPEC-8. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20220609112738.359385-5-vincent.whitchurch@axis.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 4f2bb7315b67..4469e7f555e9 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -434,7 +434,7 @@ config ATMEL_TCB_CLKSRC config CLKSRC_EXYNOS_MCT bool "Exynos multi core timer driver" if COMPILE_TEST depends on ARM || ARM64 - depends on ARCH_EXYNOS || COMPILE_TEST + depends on ARCH_ARTPEC || ARCH_EXYNOS || COMPILE_TEST help Support for Multi Core Timer controller on Exynos SoCs. From 6c3b62d93e195f78c1437c8fa7581e9b2f00886e Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Wed, 14 Sep 2022 14:14:24 +0800 Subject: [PATCH 105/145] clocksource/drivers/arm_arch_timer: Fix handling of ARM erratum 858921 The commit a38b71b0833e ("clocksource/drivers/arm_arch_timer: Move system register timer programming over to CVAL") moves the programming of the timers from the countdown timer (TVAL) over to the comparator (CVAL). This makes it necessary to read the counter when programming next event. However, the workaround of Cortex-A73 erratum 858921 does not set the corresponding set_next_event_phys and set_next_event_virt. Add the appropriate hooks to apply the erratum mitigation when programming the next timer event. Fixes: a38b71b0833e ("clocksource/drivers/arm_arch_timer: Move system register timer programming over to CVAL") Signed-off-by: Kunkun Jiang Acked-by: Marc Zyngier Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220914061424.1260-1-jiangkunkun@huawei.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 9ab8221ee3c6..ff935efb6a88 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -473,6 +473,8 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = { .desc = "ARM erratum 858921", .read_cntpct_el0 = arm64_858921_read_cntpct_el0, .read_cntvct_el0 = arm64_858921_read_cntvct_el0, + .set_next_event_phys = erratum_set_next_event_phys, + .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1 From 0e2c8e6d769bcdc4f6634a02c545356282275e68 Mon Sep 17 00:00:00 2001 From: Lin Yujun Date: Wed, 14 Sep 2022 11:30:18 +0800 Subject: [PATCH 106/145] clocksource/drivers/timer-gxp: Add missing error handling in gxp_timer_probe Add platform_device_put() to make sure to free the platform device in the event platform_device_add() fails. Fixes: 5184f4bf151b ("clocksource/drivers/timer-gxp: Add HPE GXP Timer") Signed-off-by: Lin Yujun Link: https://lore.kernel.org/r/20220914033018.97484-1-linyujun809@huawei.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-gxp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-gxp.c b/drivers/clocksource/timer-gxp.c index 8b38b3212388..fe4fa8d7b3f1 100644 --- a/drivers/clocksource/timer-gxp.c +++ b/drivers/clocksource/timer-gxp.c @@ -171,6 +171,7 @@ static int gxp_timer_probe(struct platform_device *pdev) { struct platform_device *gxp_watchdog_device; struct device *dev = &pdev->dev; + int ret; if (!gxp_timer) { pr_err("Gxp Timer not initialized, cannot create watchdog"); @@ -187,7 +188,11 @@ static int gxp_timer_probe(struct platform_device *pdev) gxp_watchdog_device->dev.platform_data = gxp_timer->counter; gxp_watchdog_device->dev.parent = dev; - return platform_device_add(gxp_watchdog_device); + ret = platform_device_add(gxp_watchdog_device); + if (ret) + platform_device_put(gxp_watchdog_device); + + return ret; } static const struct of_device_id gxp_timer_of_match[] = { From caa590067efd659d8811ad8904489536912ebc53 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:42 +0300 Subject: [PATCH 107/145] clocksource/drivers/timer-ti-dm: Drop unused functions We still have some unused functions left, let's drop them. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-2-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 51 ------------------------------- include/clocksource/timer-ti-dm.h | 7 ----- 2 files changed, 58 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 469f7c91564b..b8f8c1138e31 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -388,16 +388,6 @@ static inline u32 omap_dm_timer_reserved_systimer(int id) return (omap_reserved_systimers & (1 << (id - 1))) ? 1 : 0; } -int omap_dm_timer_reserve_systimer(int id) -{ - if (omap_dm_timer_reserved_systimer(id)) - return -ENODEV; - - omap_reserved_systimers |= (1 << (id - 1)); - - return 0; -} - static struct omap_dm_timer *_omap_dm_timer_request(int req_type, void *data) { struct omap_dm_timer *timer = NULL, *t; @@ -499,20 +489,6 @@ static struct omap_dm_timer *omap_dm_timer_request_specific(int id) return _omap_dm_timer_request(REQUEST_BY_ID, &id); } -/** - * omap_dm_timer_request_by_cap - Request a timer by capability - * @cap: Bit mask of capabilities to match - * - * Find a timer based upon capabilities bit mask. Callers of this function - * should use the definitions found in the plat/dmtimer.h file under the - * comment "timer capabilities used in hwmod database". Returns pointer to - * timer handle on success and a NULL pointer on failure. - */ -struct omap_dm_timer *omap_dm_timer_request_by_cap(u32 cap) -{ - return _omap_dm_timer_request(REQUEST_BY_CAP, &cap); -} - /** * omap_dm_timer_request_by_node - Request a timer by device-tree node * @np: Pointer to device-tree timer node @@ -606,17 +582,6 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) #endif -int omap_dm_timer_trigger(struct omap_dm_timer *timer) -{ - if (unlikely(!timer || !atomic_read(&timer->enabled))) { - pr_err("%s: timer not available or enabled.\n", __func__); - return -EINVAL; - } - - omap_dm_timer_write_reg(timer, OMAP_TIMER_TRIGGER_REG, 0); - return 0; -} - static int omap_dm_timer_start(struct omap_dm_timer *timer) { u32 l; @@ -833,22 +798,6 @@ static int omap_dm_timer_write_counter(struct omap_dm_timer *timer, unsigned int return 0; } -int omap_dm_timers_active(void) -{ - struct omap_dm_timer *timer; - - list_for_each_entry(timer, &omap_timer_list, node) { - if (!timer->reserved) - continue; - - if (omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG) & - OMAP_TIMER_CTRL_ST) { - return 1; - } - } - return 0; -} - static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { struct omap_dm_timer *timer = dev_get_drvdata(dev); diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index b0f80cfd2a26..4142bf77be1a 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -119,17 +119,10 @@ struct omap_dm_timer { struct notifier_block nb; }; -int omap_dm_timer_reserve_systimer(int id); -struct omap_dm_timer *omap_dm_timer_request_by_cap(u32 cap); - int omap_dm_timer_get_irq(struct omap_dm_timer *timer); u32 omap_dm_timer_modify_idlect_mask(u32 inputmask); -int omap_dm_timer_trigger(struct omap_dm_timer *timer); - -int omap_dm_timers_active(void); - /* * Do not use the defines below, they are not needed. They should be only * used by dmtimer.c and sys_timer related code. From 90c9aada19606ae78259cef78a46646a97ec8f67 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:43 +0300 Subject: [PATCH 108/145] clocksource/drivers/timer-ti-dm: Simplify register reads with dmtimer_read() We can simplify register read access by checking for the register write posted mode in the read function. This way we can combine the functions for __omap_dm_timer_read() and omap_dm_timer_read_reg() into a single function dmtimer_read(). We update the shared register access first, the timer revision specific register access will be updated in a later patch. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-3-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 86 ++++++++++++++----------------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index b8f8c1138e31..070d47d6a88c 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -44,14 +44,28 @@ enum { REQUEST_BY_NODE, }; -static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg, - int posted) +/** + * dmtimer_read - read timer registers in posted and non-posted mode + * @timer: timer pointer over which read operation to perform + * @reg: lowest byte holds the register offset + * + * The posted mode bit is encoded in reg. Note that in posted mode, write + * pending bit must be checked. Otherwise a read of a non completed write + * will produce an error. + */ +static inline u32 dmtimer_read(struct omap_dm_timer *timer, u32 reg) { - if (posted) - while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) + u16 wp, offset; + + wp = reg >> WPSHIFT; + offset = reg & 0xff; + + /* Wait for a possible write pending bit in posted mode */ + if (wp && timer->posted) + while (readl_relaxed(timer->pend) & wp) cpu_relax(); - return readl_relaxed(timer->func_base + (reg & 0xff)); + return readl_relaxed(timer->func_base + offset); } static inline void __omap_dm_timer_write(struct omap_dm_timer *timer, @@ -121,13 +135,13 @@ static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, { u32 l; - l = __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (l & OMAP_TIMER_CTRL_ST) { l &= ~0x1; __omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted); #ifdef CONFIG_ARCH_OMAP2PLUS /* Readback to make sure write has completed */ - __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); + dmtimer_read(timer, OMAP_TIMER_CTRL_REG); /* * Wait for functional clock period x 3.5 to make sure that * timer is stopped @@ -148,9 +162,9 @@ static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, } static inline unsigned int -__omap_dm_timer_read_counter(struct omap_dm_timer *timer, int posted) +__omap_dm_timer_read_counter(struct omap_dm_timer *timer) { - return __omap_dm_timer_read(timer, OMAP_TIMER_COUNTER_REG, posted); + return dmtimer_read(timer, OMAP_TIMER_COUNTER_REG); } static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, @@ -159,21 +173,6 @@ static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, writel_relaxed(value, timer->irq_stat); } -/** - * omap_dm_timer_read_reg - read timer registers in posted and non-posted mode - * @timer: timer pointer over which read operation to perform - * @reg: lowest byte holds the register offset - * - * The posted mode bit is encoded in reg. Note that in posted mode write - * pending bit must be checked. Otherwise a read of a non completed write - * will produce an error. - */ -static inline u32 omap_dm_timer_read_reg(struct omap_dm_timer *timer, u32 reg) -{ - WARN_ON((reg & 0xff) < _OMAP_TIMER_WAKEUP_EN_OFFSET); - return __omap_dm_timer_read(timer, reg, timer->posted); -} - /** * omap_dm_timer_write_reg - write timer registers in posted and non-posted mode * @timer: timer pointer over which write operation is to perform @@ -213,20 +212,14 @@ static void omap_timer_restore_context(struct omap_dm_timer *timer) static void omap_timer_save_context(struct omap_dm_timer *timer) { - timer->context.ocp_cfg = - __omap_dm_timer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET, 0); + timer->context.ocp_cfg = dmtimer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET); - timer->context.tclr = - omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); - timer->context.twer = - omap_dm_timer_read_reg(timer, OMAP_TIMER_WAKEUP_EN_REG); - timer->context.tldr = - omap_dm_timer_read_reg(timer, OMAP_TIMER_LOAD_REG); - timer->context.tmar = - omap_dm_timer_read_reg(timer, OMAP_TIMER_MATCH_REG); + timer->context.tclr = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + timer->context.twer = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG); + timer->context.tldr = dmtimer_read(timer, OMAP_TIMER_LOAD_REG); + timer->context.tmar = dmtimer_read(timer, OMAP_TIMER_MATCH_REG); timer->context.tier = readl_relaxed(timer->irq_ena); - timer->context.tsicr = - omap_dm_timer_read_reg(timer, OMAP_TIMER_IF_CTRL_REG); + timer->context.tsicr = dmtimer_read(timer, OMAP_TIMER_IF_CTRL_REG); } static int omap_timer_context_notifier(struct notifier_block *nb, @@ -266,8 +259,7 @@ static int omap_dm_timer_reset(struct omap_dm_timer *timer) omap_dm_timer_write_reg(timer, OMAP_TIMER_IF_CTRL_REG, 0x06); do { - l = __omap_dm_timer_read(timer, - OMAP_TIMER_V1_SYS_STAT_OFFSET, 0); + l = dmtimer_read(timer, OMAP_TIMER_V1_SYS_STAT_OFFSET); } while (!l && timeout--); if (!timeout) { @@ -276,7 +268,7 @@ static int omap_dm_timer_reset(struct omap_dm_timer *timer) } /* Configure timer for smart-idle mode */ - l = __omap_dm_timer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET, 0); + l = dmtimer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET); l |= 0x2 << 0x3; __omap_dm_timer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, l, 0); @@ -550,7 +542,7 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) list_for_each_entry(timer, &omap_timer_list, node) { u32 l; - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (l & OMAP_TIMER_CTRL_ST) { if (((omap_readl(MOD_CONF_CTRL_1) >> (i * 2)) & 0x03) == 0) inputmask &= ~(1 << 1); @@ -591,7 +583,7 @@ static int omap_dm_timer_start(struct omap_dm_timer *timer) omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (!(l & OMAP_TIMER_CTRL_ST)) { l |= OMAP_TIMER_CTRL_ST; omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); @@ -638,7 +630,7 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, return -EINVAL; omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (enable) l |= OMAP_TIMER_CTRL_CE; else @@ -659,7 +651,7 @@ static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, return -EINVAL; omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); l &= ~(OMAP_TIMER_CTRL_GPOCFG | OMAP_TIMER_CTRL_SCPWM | OMAP_TIMER_CTRL_PT | (0x03 << 10) | OMAP_TIMER_CTRL_AR); if (def_on) @@ -683,7 +675,7 @@ static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *timer) return -EINVAL; omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); omap_dm_timer_disable(timer); return l; @@ -698,7 +690,7 @@ static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, return -EINVAL; omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); l &= ~(OMAP_TIMER_CTRL_PRE | (0x07 << 2)); if (prescaler >= 0) { l |= OMAP_TIMER_CTRL_PRE; @@ -743,7 +735,7 @@ static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) l = readl_relaxed(timer->irq_ena) & ~mask; writel_relaxed(l, timer->irq_dis); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; + l = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG, l); omap_dm_timer_disable(timer); @@ -781,7 +773,7 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *timer) return 0; } - return __omap_dm_timer_read_counter(timer, timer->posted); + return __omap_dm_timer_read_counter(timer); } static int omap_dm_timer_write_counter(struct omap_dm_timer *timer, unsigned int value) From 49cd16bb573e43dc2ee64d734b9b545475dbb35f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:44 +0300 Subject: [PATCH 109/145] clocksource/drivers/timer-ti-dm: Simplify register writes with dmtimer_write() We can simplify register write access by checking for the register write posted mode in the write function. This way we can combine the functions for __omap_dm_timer_write() and omap_dm_timer_write_reg() into a single function dmtimer_write(). We update the shared register access first, the timer revision specific register access will be updated in a later patch. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-4-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 96 ++++++++++++++----------------- 1 file changed, 43 insertions(+), 53 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 070d47d6a88c..61c3c7684314 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -68,14 +68,29 @@ static inline u32 dmtimer_read(struct omap_dm_timer *timer, u32 reg) return readl_relaxed(timer->func_base + offset); } -static inline void __omap_dm_timer_write(struct omap_dm_timer *timer, - u32 reg, u32 val, int posted) +/** + * dmtimer_write - write timer registers in posted and non-posted mode + * @timer: timer pointer over which write operation is to perform + * @reg: lowest byte holds the register offset + * @value: data to write into the register + * + * The posted mode bit is encoded in reg. Note that in posted mode, the write + * pending bit must be checked. Otherwise a write on a register which has a + * pending write will be lost. + */ +static inline void dmtimer_write(struct omap_dm_timer *timer, u32 reg, u32 val) { - if (posted) - while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) + u16 wp, offset; + + wp = reg >> WPSHIFT; + offset = reg & 0xff; + + /* Wait for a possible write pending bit in posted mode */ + if (wp && timer->posted) + while (readl_relaxed(timer->pend) & wp) cpu_relax(); - writel_relaxed(val, timer->func_base + (reg & 0xff)); + writel_relaxed(val, timer->func_base + offset); } static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) @@ -120,25 +135,24 @@ static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer) if (timer->errata & OMAP_TIMER_ERRATA_I103_I767) { timer->posted = OMAP_TIMER_NONPOSTED; - __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0, 0); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0); return; } - __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, - OMAP_TIMER_CTRL_POSTED, 0); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, OMAP_TIMER_CTRL_POSTED); timer->context.tsicr = OMAP_TIMER_CTRL_POSTED; timer->posted = OMAP_TIMER_POSTED; } static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, - int posted, unsigned long rate) + unsigned long rate) { u32 l; l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (l & OMAP_TIMER_CTRL_ST) { l &= ~0x1; - __omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); #ifdef CONFIG_ARCH_OMAP2PLUS /* Readback to make sure write has completed */ dmtimer_read(timer, OMAP_TIMER_CTRL_REG); @@ -158,7 +172,7 @@ static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, unsigned int value) { writel_relaxed(value, timer->irq_ena); - __omap_dm_timer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value, 0); + dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value); } static inline unsigned int @@ -173,41 +187,17 @@ static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, writel_relaxed(value, timer->irq_stat); } -/** - * omap_dm_timer_write_reg - write timer registers in posted and non-posted mode - * @timer: timer pointer over which write operation is to perform - * @reg: lowest byte holds the register offset - * @value: data to write into the register - * - * The posted mode bit is encoded in reg. Note that in posted mode the write - * pending bit must be checked. Otherwise a write on a register which has a - * pending write will be lost. - */ -static void omap_dm_timer_write_reg(struct omap_dm_timer *timer, u32 reg, - u32 value) -{ - WARN_ON((reg & 0xff) < _OMAP_TIMER_WAKEUP_EN_OFFSET); - __omap_dm_timer_write(timer, reg, value, timer->posted); -} - static void omap_timer_restore_context(struct omap_dm_timer *timer) { - __omap_dm_timer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, - timer->context.ocp_cfg, 0); + dmtimer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, timer->context.ocp_cfg); - omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG, - timer->context.twer); - omap_dm_timer_write_reg(timer, OMAP_TIMER_COUNTER_REG, - timer->context.tcrr); - omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, - timer->context.tldr); - omap_dm_timer_write_reg(timer, OMAP_TIMER_MATCH_REG, - timer->context.tmar); - omap_dm_timer_write_reg(timer, OMAP_TIMER_IF_CTRL_REG, - timer->context.tsicr); + dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, timer->context.twer); + dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, timer->context.tcrr); + dmtimer_write(timer, OMAP_TIMER_LOAD_REG, timer->context.tldr); + dmtimer_write(timer, OMAP_TIMER_MATCH_REG, timer->context.tmar); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, timer->context.tsicr); writel_relaxed(timer->context.tier, timer->irq_ena); - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, - timer->context.tclr); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, timer->context.tclr); } static void omap_timer_save_context(struct omap_dm_timer *timer) @@ -256,7 +246,7 @@ static int omap_dm_timer_reset(struct omap_dm_timer *timer) if (timer->revision != 1) return -EINVAL; - omap_dm_timer_write_reg(timer, OMAP_TIMER_IF_CTRL_REG, 0x06); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0x06); do { l = dmtimer_read(timer, OMAP_TIMER_V1_SYS_STAT_OFFSET); @@ -270,7 +260,7 @@ static int omap_dm_timer_reset(struct omap_dm_timer *timer) /* Configure timer for smart-idle mode */ l = dmtimer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET); l |= 0x2 << 0x3; - __omap_dm_timer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, l, 0); + dmtimer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, l); timer->posted = 0; @@ -586,7 +576,7 @@ static int omap_dm_timer_start(struct omap_dm_timer *timer) l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (!(l & OMAP_TIMER_CTRL_ST)) { l |= OMAP_TIMER_CTRL_ST; - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); } return 0; @@ -602,7 +592,7 @@ static int omap_dm_timer_stop(struct omap_dm_timer *timer) if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) rate = clk_get_rate(timer->fclk); - __omap_dm_timer_stop(timer, timer->posted, rate); + __omap_dm_timer_stop(timer, rate); omap_dm_timer_disable(timer); return 0; @@ -615,7 +605,7 @@ static int omap_dm_timer_set_load(struct omap_dm_timer *timer, return -EINVAL; omap_dm_timer_enable(timer); - omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, load); + dmtimer_write(timer, OMAP_TIMER_LOAD_REG, load); omap_dm_timer_disable(timer); return 0; @@ -635,8 +625,8 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, l |= OMAP_TIMER_CTRL_CE; else l &= ~OMAP_TIMER_CTRL_CE; - omap_dm_timer_write_reg(timer, OMAP_TIMER_MATCH_REG, match); - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_MATCH_REG, match); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); omap_dm_timer_disable(timer); return 0; @@ -661,7 +651,7 @@ static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, l |= trigger << 10; if (autoreload) l |= OMAP_TIMER_CTRL_AR; - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); omap_dm_timer_disable(timer); return 0; @@ -696,7 +686,7 @@ static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, l |= OMAP_TIMER_CTRL_PRE; l |= prescaler << 2; } - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); omap_dm_timer_disable(timer); return 0; @@ -736,7 +726,7 @@ static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) writel_relaxed(l, timer->irq_dis); l = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; - omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG, l); + dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, l); omap_dm_timer_disable(timer); return 0; @@ -783,7 +773,7 @@ static int omap_dm_timer_write_counter(struct omap_dm_timer *timer, unsigned int return -EINVAL; } - omap_dm_timer_write_reg(timer, OMAP_TIMER_COUNTER_REG, value); + dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value); /* Save the context */ timer->context.tcrr = value; From f32bdac10cb5f461f8c71c3b9703617c42322d2f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:45 +0300 Subject: [PATCH 110/145] clocksource/drivers/timer-ti-dm: Simplify register access further Let's unify register access and use dmtimer_read() and dmtimer_write() also for the timer revision specific registers like we now do for the shread registers. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-5-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 28 ++++++++++++++-------------- include/clocksource/timer-ti-dm.h | 6 +++--- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 61c3c7684314..fe8ba0fad869 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -101,16 +101,16 @@ static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) tidr = readl_relaxed(timer->io_base); if (!(tidr >> 16)) { timer->revision = 1; - timer->irq_stat = timer->io_base + OMAP_TIMER_V1_STAT_OFFSET; - timer->irq_ena = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; - timer->irq_dis = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; + timer->irq_stat = OMAP_TIMER_V1_STAT_OFFSET; + timer->irq_ena = OMAP_TIMER_V1_INT_EN_OFFSET; + timer->irq_dis = OMAP_TIMER_V1_INT_EN_OFFSET; timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET; timer->func_base = timer->io_base; } else { timer->revision = 2; - timer->irq_stat = timer->io_base + OMAP_TIMER_V2_IRQSTATUS; - timer->irq_ena = timer->io_base + OMAP_TIMER_V2_IRQENABLE_SET; - timer->irq_dis = timer->io_base + OMAP_TIMER_V2_IRQENABLE_CLR; + timer->irq_stat = OMAP_TIMER_V2_IRQSTATUS - OMAP_TIMER_V2_FUNC_OFFSET; + timer->irq_ena = OMAP_TIMER_V2_IRQENABLE_SET - OMAP_TIMER_V2_FUNC_OFFSET; + timer->irq_dis = OMAP_TIMER_V2_IRQENABLE_CLR - OMAP_TIMER_V2_FUNC_OFFSET; timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET + OMAP_TIMER_V2_FUNC_OFFSET; @@ -165,13 +165,13 @@ static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, } /* Ack possibly pending interrupt */ - writel_relaxed(OMAP_TIMER_INT_OVERFLOW, timer->irq_stat); + dmtimer_write(timer, timer->irq_stat, OMAP_TIMER_INT_OVERFLOW); } static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, unsigned int value) { - writel_relaxed(value, timer->irq_ena); + dmtimer_write(timer, timer->irq_ena, value); dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value); } @@ -184,7 +184,7 @@ __omap_dm_timer_read_counter(struct omap_dm_timer *timer) static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, unsigned int value) { - writel_relaxed(value, timer->irq_stat); + dmtimer_write(timer, timer->irq_stat, value); } static void omap_timer_restore_context(struct omap_dm_timer *timer) @@ -196,7 +196,7 @@ static void omap_timer_restore_context(struct omap_dm_timer *timer) dmtimer_write(timer, OMAP_TIMER_LOAD_REG, timer->context.tldr); dmtimer_write(timer, OMAP_TIMER_MATCH_REG, timer->context.tmar); dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, timer->context.tsicr); - writel_relaxed(timer->context.tier, timer->irq_ena); + dmtimer_write(timer, timer->irq_ena, timer->context.tier); dmtimer_write(timer, OMAP_TIMER_CTRL_REG, timer->context.tclr); } @@ -208,7 +208,7 @@ static void omap_timer_save_context(struct omap_dm_timer *timer) timer->context.twer = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG); timer->context.tldr = dmtimer_read(timer, OMAP_TIMER_LOAD_REG); timer->context.tmar = dmtimer_read(timer, OMAP_TIMER_MATCH_REG); - timer->context.tier = readl_relaxed(timer->irq_ena); + timer->context.tier = dmtimer_read(timer, timer->irq_ena); timer->context.tsicr = dmtimer_read(timer, OMAP_TIMER_IF_CTRL_REG); } @@ -722,9 +722,9 @@ static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) omap_dm_timer_enable(timer); if (timer->revision == 1) - l = readl_relaxed(timer->irq_ena) & ~mask; + l = dmtimer_read(timer, timer->irq_ena) & ~mask; - writel_relaxed(l, timer->irq_dis); + dmtimer_write(timer, timer->irq_dis, l); l = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, l); @@ -741,7 +741,7 @@ static unsigned int omap_dm_timer_read_status(struct omap_dm_timer *timer) return 0; } - l = readl_relaxed(timer->irq_stat); + l = dmtimer_read(timer, timer->irq_stat); return l; } diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index 4142bf77be1a..e874eed2fa46 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -100,9 +100,9 @@ struct omap_dm_timer { struct clk *fclk; void __iomem *io_base; - void __iomem *irq_stat; /* TISR/IRQSTATUS interrupt status */ - void __iomem *irq_ena; /* irq enable */ - void __iomem *irq_dis; /* irq disable, only on v2 ip */ + int irq_stat; /* TISR/IRQSTATUS interrupt status */ + int irq_ena; /* irq enable */ + int irq_dis; /* irq disable, only on v2 ip */ void __iomem *pend; /* write pending */ void __iomem *func_base; /* function register base */ From 1d513f439d7930363adce4588030af7c8fa71cc9 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:46 +0300 Subject: [PATCH 111/145] clocksource/drivers/timer-ti-dm: Move private defines to the driver These defines are only used by timer-ti-dm driver. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-6-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 62 +++++++++++++++++++++++++++++++ include/clocksource/timer-ti-dm.h | 62 ------------------------------- 2 files changed, 62 insertions(+), 62 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index fe8ba0fad869..e82ec5626849 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -33,6 +33,68 @@ #include +/* + * timer errata flags + * + * Errata i103/i767 impacts all OMAP3/4/5 devices including AM33xx. This + * errata prevents us from using posted mode on these devices, unless the + * timer counter register is never read. For more details please refer to + * the OMAP3/4/5 errata documents. + */ +#define OMAP_TIMER_ERRATA_I103_I767 0x80000000 + +/* posted mode types */ +#define OMAP_TIMER_NONPOSTED 0x00 +#define OMAP_TIMER_POSTED 0x01 + +/* register offsets with the write pending bit encoded */ +#define WPSHIFT 16 + +#define OMAP_TIMER_WAKEUP_EN_REG (_OMAP_TIMER_WAKEUP_EN_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_CTRL_REG (_OMAP_TIMER_CTRL_OFFSET \ + | (WP_TCLR << WPSHIFT)) + +#define OMAP_TIMER_COUNTER_REG (_OMAP_TIMER_COUNTER_OFFSET \ + | (WP_TCRR << WPSHIFT)) + +#define OMAP_TIMER_LOAD_REG (_OMAP_TIMER_LOAD_OFFSET \ + | (WP_TLDR << WPSHIFT)) + +#define OMAP_TIMER_TRIGGER_REG (_OMAP_TIMER_TRIGGER_OFFSET \ + | (WP_TTGR << WPSHIFT)) + +#define OMAP_TIMER_WRITE_PEND_REG (_OMAP_TIMER_WRITE_PEND_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_MATCH_REG (_OMAP_TIMER_MATCH_OFFSET \ + | (WP_TMAR << WPSHIFT)) + +#define OMAP_TIMER_CAPTURE_REG (_OMAP_TIMER_CAPTURE_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_IF_CTRL_REG (_OMAP_TIMER_IF_CTRL_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_CAPTURE2_REG (_OMAP_TIMER_CAPTURE2_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_TICK_POS_REG (_OMAP_TIMER_TICK_POS_OFFSET \ + | (WP_TPIR << WPSHIFT)) + +#define OMAP_TIMER_TICK_NEG_REG (_OMAP_TIMER_TICK_NEG_OFFSET \ + | (WP_TNIR << WPSHIFT)) + +#define OMAP_TIMER_TICK_COUNT_REG (_OMAP_TIMER_TICK_COUNT_OFFSET \ + | (WP_TCVR << WPSHIFT)) + +#define OMAP_TIMER_TICK_INT_MASK_SET_REG \ + (_OMAP_TIMER_TICK_INT_MASK_SET_OFFSET | (WP_TOCR << WPSHIFT)) + +#define OMAP_TIMER_TICK_INT_MASK_COUNT_REG \ + (_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT)) + static u32 omap_reserved_systimers; static LIST_HEAD(omap_timer_list); static DEFINE_SPINLOCK(dm_timer_lock); diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index e874eed2fa46..df3c6c88264f 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -52,10 +52,6 @@ #define OMAP_TIMER_TRIGGER_OVERFLOW 0x01 #define OMAP_TIMER_TRIGGER_OVERFLOW_AND_COMPARE 0x02 -/* posted mode types */ -#define OMAP_TIMER_NONPOSTED 0x00 -#define OMAP_TIMER_POSTED 0x01 - /* timer capabilities used in hwmod database */ #define OMAP_TIMER_SECURE 0x80000000 #define OMAP_TIMER_ALWON 0x40000000 @@ -63,16 +59,6 @@ #define OMAP_TIMER_NEEDS_RESET 0x10000000 #define OMAP_TIMER_HAS_DSP_IRQ 0x08000000 -/* - * timer errata flags - * - * Errata i103/i767 impacts all OMAP3/4/5 devices including AM33xx. This - * errata prevents us from using posted mode on these devices, unless the - * timer counter register is never read. For more details please refer to - * the OMAP3/4/5 errata documents. - */ -#define OMAP_TIMER_ERRATA_I103_I767 0x80000000 - struct timer_regs { u32 ocp_cfg; u32 tidr; @@ -192,52 +178,4 @@ u32 omap_dm_timer_modify_idlect_mask(u32 inputmask); #define _OMAP_TIMER_TICK_INT_MASK_SET_OFFSET 0x54 /* TOCR, 34xx only */ #define _OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET 0x58 /* TOWR, 34xx only */ -/* register offsets with the write pending bit encoded */ -#define WPSHIFT 16 - -#define OMAP_TIMER_WAKEUP_EN_REG (_OMAP_TIMER_WAKEUP_EN_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_CTRL_REG (_OMAP_TIMER_CTRL_OFFSET \ - | (WP_TCLR << WPSHIFT)) - -#define OMAP_TIMER_COUNTER_REG (_OMAP_TIMER_COUNTER_OFFSET \ - | (WP_TCRR << WPSHIFT)) - -#define OMAP_TIMER_LOAD_REG (_OMAP_TIMER_LOAD_OFFSET \ - | (WP_TLDR << WPSHIFT)) - -#define OMAP_TIMER_TRIGGER_REG (_OMAP_TIMER_TRIGGER_OFFSET \ - | (WP_TTGR << WPSHIFT)) - -#define OMAP_TIMER_WRITE_PEND_REG (_OMAP_TIMER_WRITE_PEND_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_MATCH_REG (_OMAP_TIMER_MATCH_OFFSET \ - | (WP_TMAR << WPSHIFT)) - -#define OMAP_TIMER_CAPTURE_REG (_OMAP_TIMER_CAPTURE_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_IF_CTRL_REG (_OMAP_TIMER_IF_CTRL_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_CAPTURE2_REG (_OMAP_TIMER_CAPTURE2_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_TICK_POS_REG (_OMAP_TIMER_TICK_POS_OFFSET \ - | (WP_TPIR << WPSHIFT)) - -#define OMAP_TIMER_TICK_NEG_REG (_OMAP_TIMER_TICK_NEG_OFFSET \ - | (WP_TNIR << WPSHIFT)) - -#define OMAP_TIMER_TICK_COUNT_REG (_OMAP_TIMER_TICK_COUNT_OFFSET \ - | (WP_TCVR << WPSHIFT)) - -#define OMAP_TIMER_TICK_INT_MASK_SET_REG \ - (_OMAP_TIMER_TICK_INT_MASK_SET_OFFSET | (WP_TOCR << WPSHIFT)) - -#define OMAP_TIMER_TICK_INT_MASK_COUNT_REG \ - (_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT)) - #endif /* __CLOCKSOURCE_DMTIMER_H */ From bd351f1aee21ca667b39658550b5f3c61e8bb77f Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:47 +0300 Subject: [PATCH 112/145] clocksource/drivers/timer-ti-dm: Use runtime PM directly and check errors Use pm_runtime_resume_and_get() and check for a possible error returned. We want to do this as omap_dm_timer_enable() and omap_dm_timer_disable() are exposed to the pwm and remoteproc drivers, and in the following patch we turn struct omap_dm_timer into a cookie used by the exposed functions only. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-7-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 117 +++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 26 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index e82ec5626849..4967a91a114a 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -387,16 +387,24 @@ static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source) static void omap_dm_timer_enable(struct omap_dm_timer *timer) { - pm_runtime_get_sync(&timer->pdev->dev); + struct device *dev = &timer->pdev->dev; + int rc; + + rc = pm_runtime_resume_and_get(dev); + if (rc) + dev_err(dev, "could not enable timer\n"); } static void omap_dm_timer_disable(struct omap_dm_timer *timer) { - pm_runtime_put_sync(&timer->pdev->dev); + struct device *dev = &timer->pdev->dev; + + pm_runtime_put_sync(dev); } static int omap_dm_timer_prepare(struct omap_dm_timer *timer) { + struct device *dev = &timer->pdev->dev; int rc; /* @@ -411,18 +419,20 @@ static int omap_dm_timer_prepare(struct omap_dm_timer *timer) } } - omap_dm_timer_enable(timer); + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; if (timer->capability & OMAP_TIMER_NEEDS_RESET) { rc = omap_dm_timer_reset(timer); if (rc) { - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); return rc; } } __omap_dm_timer_enable_posted(timer); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); return 0; } @@ -628,12 +638,16 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) static int omap_dm_timer_start(struct omap_dm_timer *timer) { + struct device *dev = &timer->pdev->dev; + int rc; u32 l; if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (!(l & OMAP_TIMER_CTRL_ST)) { @@ -646,6 +660,7 @@ static int omap_dm_timer_start(struct omap_dm_timer *timer) static int omap_dm_timer_stop(struct omap_dm_timer *timer) { + struct device *dev = &timer->pdev->dev; unsigned long rate = 0; if (unlikely(!timer)) @@ -656,32 +671,47 @@ static int omap_dm_timer_stop(struct omap_dm_timer *timer) __omap_dm_timer_stop(timer, rate); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } static int omap_dm_timer_set_load(struct omap_dm_timer *timer, unsigned int load) { + struct device *dev; + int rc; + if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + dmtimer_write(timer, OMAP_TIMER_LOAD_REG, load); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, unsigned int match) { + struct device *dev; + int rc; u32 l; if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (enable) l |= OMAP_TIMER_CTRL_CE; @@ -690,19 +720,26 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, dmtimer_write(timer, OMAP_TIMER_MATCH_REG, match); dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, int toggle, int trigger, int autoreload) { + struct device *dev; + int rc; u32 l; if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); l &= ~(OMAP_TIMER_CTRL_GPOCFG | OMAP_TIMER_CTRL_SCPWM | OMAP_TIMER_CTRL_PT | (0x03 << 10) | OMAP_TIMER_CTRL_AR); @@ -715,20 +752,28 @@ static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, l |= OMAP_TIMER_CTRL_AR; dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *timer) { + struct device *dev; + int rc; u32 l; if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); - omap_dm_timer_disable(timer); + + pm_runtime_put_sync(dev); return l; } @@ -736,12 +781,18 @@ static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *timer) static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, int prescaler) { + struct device *dev; + int rc; u32 l; if (unlikely(!timer) || prescaler < -1 || prescaler > 7) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); l &= ~(OMAP_TIMER_CTRL_PRE | (0x07 << 2)); if (prescaler >= 0) { @@ -750,20 +801,29 @@ static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, } dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } static int omap_dm_timer_set_int_enable(struct omap_dm_timer *timer, unsigned int value) { + struct device *dev; + int rc; + if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + __omap_dm_timer_int_enable(timer, value); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } @@ -776,12 +836,17 @@ static int omap_dm_timer_set_int_enable(struct omap_dm_timer *timer, */ static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) { + struct device *dev; u32 l = mask; + int rc; if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; if (timer->revision == 1) l = dmtimer_read(timer, timer->irq_ena) & ~mask; @@ -790,7 +855,8 @@ static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) l = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, l); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } @@ -943,11 +1009,11 @@ static int omap_dm_timer_probe(struct platform_device *pdev) pm_runtime_enable(dev); if (!timer->reserved) { - ret = pm_runtime_get_sync(dev); - if (ret < 0) { + ret = pm_runtime_resume_and_get(dev); + if (ret) { dev_err(dev, "%s: pm_runtime_get_sync failed!\n", __func__); - goto err_get_sync; + goto err_disable; } __omap_dm_timer_init_regs(timer); pm_runtime_put(dev); @@ -962,8 +1028,7 @@ static int omap_dm_timer_probe(struct platform_device *pdev) return 0; -err_get_sync: - pm_runtime_put_noidle(dev); +err_disable: pm_runtime_disable(dev); return ret; } From a6e543f61531b63bfc8d43053c6ec6f65117f627 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:48 +0300 Subject: [PATCH 113/145] clocksource/drivers/timer-ti-dm: Move struct omap_dm_timer fields to driver There is no longer any need to expose the elements of struct omap_dm_timer outside the driver. The pwm and remoteproc drivers just use struct omap_dm_timer as a cookie. Let's move the elements of struct omap_dm_timer into struct dmtimer that is private to the driver. To do this, we mostly rename omap_dm_timer to dmtimer in the driver. We keep omap_dm_timer only for the exposed functions in the platform_data for the pwm and remoteproc drivers. Let's also add a note about not using the exposed functions internally as those will get deprecated eventually in favor of Linux generic frameworks. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-8-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 218 +++++++++++++++++++++++------- include/clocksource/timer-ti-dm.h | 43 ------ 2 files changed, 170 insertions(+), 91 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 4967a91a114a..0d0130ec4d1c 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -95,6 +95,53 @@ #define OMAP_TIMER_TICK_INT_MASK_COUNT_REG \ (_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT)) +struct timer_regs { + u32 ocp_cfg; + u32 tidr; + u32 tier; + u32 twer; + u32 tclr; + u32 tcrr; + u32 tldr; + u32 ttrg; + u32 twps; + u32 tmar; + u32 tcar1; + u32 tsicr; + u32 tcar2; + u32 tpir; + u32 tnir; + u32 tcvr; + u32 tocr; + u32 towr; +}; + +struct dmtimer { + struct omap_dm_timer cookie; + int id; + int irq; + struct clk *fclk; + + void __iomem *io_base; + int irq_stat; /* TISR/IRQSTATUS interrupt status */ + int irq_ena; /* irq enable */ + int irq_dis; /* irq disable, only on v2 ip */ + void __iomem *pend; /* write pending */ + void __iomem *func_base; /* function register base */ + + atomic_t enabled; + unsigned long rate; + unsigned reserved:1; + unsigned posted:1; + struct timer_regs context; + int revision; + u32 capability; + u32 errata; + struct platform_device *pdev; + struct list_head node; + struct notifier_block nb; +}; + static u32 omap_reserved_systimers; static LIST_HEAD(omap_timer_list); static DEFINE_SPINLOCK(dm_timer_lock); @@ -115,7 +162,7 @@ enum { * pending bit must be checked. Otherwise a read of a non completed write * will produce an error. */ -static inline u32 dmtimer_read(struct omap_dm_timer *timer, u32 reg) +static inline u32 dmtimer_read(struct dmtimer *timer, u32 reg) { u16 wp, offset; @@ -140,7 +187,7 @@ static inline u32 dmtimer_read(struct omap_dm_timer *timer, u32 reg) * pending bit must be checked. Otherwise a write on a register which has a * pending write will be lost. */ -static inline void dmtimer_write(struct omap_dm_timer *timer, u32 reg, u32 val) +static inline void dmtimer_write(struct dmtimer *timer, u32 reg, u32 val) { u16 wp, offset; @@ -155,7 +202,7 @@ static inline void dmtimer_write(struct omap_dm_timer *timer, u32 reg, u32 val) writel_relaxed(val, timer->func_base + offset); } -static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) +static inline void __omap_dm_timer_init_regs(struct dmtimer *timer) { u32 tidr; @@ -190,7 +237,7 @@ static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) * complete. Enabling this feature can improve performance for writing to the * timer registers. */ -static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer) +static inline void __omap_dm_timer_enable_posted(struct dmtimer *timer) { if (timer->posted) return; @@ -206,7 +253,7 @@ static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer) timer->posted = OMAP_TIMER_POSTED; } -static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, +static inline void __omap_dm_timer_stop(struct dmtimer *timer, unsigned long rate) { u32 l; @@ -230,26 +277,26 @@ static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, dmtimer_write(timer, timer->irq_stat, OMAP_TIMER_INT_OVERFLOW); } -static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, - unsigned int value) +static inline void __omap_dm_timer_int_enable(struct dmtimer *timer, + unsigned int value) { dmtimer_write(timer, timer->irq_ena, value); dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value); } static inline unsigned int -__omap_dm_timer_read_counter(struct omap_dm_timer *timer) +__omap_dm_timer_read_counter(struct dmtimer *timer) { return dmtimer_read(timer, OMAP_TIMER_COUNTER_REG); } -static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, +static inline void __omap_dm_timer_write_status(struct dmtimer *timer, unsigned int value) { dmtimer_write(timer, timer->irq_stat, value); } -static void omap_timer_restore_context(struct omap_dm_timer *timer) +static void omap_timer_restore_context(struct dmtimer *timer) { dmtimer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, timer->context.ocp_cfg); @@ -262,7 +309,7 @@ static void omap_timer_restore_context(struct omap_dm_timer *timer) dmtimer_write(timer, OMAP_TIMER_CTRL_REG, timer->context.tclr); } -static void omap_timer_save_context(struct omap_dm_timer *timer) +static void omap_timer_save_context(struct dmtimer *timer) { timer->context.ocp_cfg = dmtimer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET); @@ -277,9 +324,9 @@ static void omap_timer_save_context(struct omap_dm_timer *timer) static int omap_timer_context_notifier(struct notifier_block *nb, unsigned long cmd, void *v) { - struct omap_dm_timer *timer; + struct dmtimer *timer; - timer = container_of(nb, struct omap_dm_timer, nb); + timer = container_of(nb, struct dmtimer, nb); switch (cmd) { case CPU_CLUSTER_PM_ENTER: @@ -301,7 +348,7 @@ static int omap_timer_context_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static int omap_dm_timer_reset(struct omap_dm_timer *timer) +static int omap_dm_timer_reset(struct dmtimer *timer) { u32 l, timeout = 100000; @@ -329,13 +376,29 @@ static int omap_dm_timer_reset(struct omap_dm_timer *timer) return 0; } -static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source) +/* + * Functions exposed to PWM and remoteproc drivers via platform_data. + * Do not use these in the driver, these will get deprecated and will + * will be replaced by Linux generic framework functions such as + * chained interrupts and clock framework. + */ +static struct dmtimer *to_dmtimer(struct omap_dm_timer *cookie) +{ + if (!cookie) + return NULL; + + return container_of(cookie, struct dmtimer, cookie); +} + +static int omap_dm_timer_set_source(struct omap_dm_timer *cookie, int source) { int ret; const char *parent_name; struct clk *parent; struct dmtimer_platform_data *pdata; + struct dmtimer *timer; + timer = to_dmtimer(cookie); if (unlikely(!timer) || IS_ERR(timer->fclk)) return -EINVAL; @@ -385,8 +448,9 @@ static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source) return ret; } -static void omap_dm_timer_enable(struct omap_dm_timer *timer) +static void omap_dm_timer_enable(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); struct device *dev = &timer->pdev->dev; int rc; @@ -395,14 +459,15 @@ static void omap_dm_timer_enable(struct omap_dm_timer *timer) dev_err(dev, "could not enable timer\n"); } -static void omap_dm_timer_disable(struct omap_dm_timer *timer) +static void omap_dm_timer_disable(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); struct device *dev = &timer->pdev->dev; pm_runtime_put_sync(dev); } -static int omap_dm_timer_prepare(struct omap_dm_timer *timer) +static int omap_dm_timer_prepare(struct dmtimer *timer) { struct device *dev = &timer->pdev->dev; int rc; @@ -442,9 +507,9 @@ static inline u32 omap_dm_timer_reserved_systimer(int id) return (omap_reserved_systimers & (1 << (id - 1))) ? 1 : 0; } -static struct omap_dm_timer *_omap_dm_timer_request(int req_type, void *data) +static struct dmtimer *_omap_dm_timer_request(int req_type, void *data) { - struct omap_dm_timer *timer = NULL, *t; + struct dmtimer *timer = NULL, *t; struct device_node *np = NULL; unsigned long flags; u32 cap = 0; @@ -528,11 +593,19 @@ found: static struct omap_dm_timer *omap_dm_timer_request(void) { - return _omap_dm_timer_request(REQUEST_ANY, NULL); + struct dmtimer *timer; + + timer = _omap_dm_timer_request(REQUEST_ANY, NULL); + if (!timer) + return NULL; + + return &timer->cookie; } static struct omap_dm_timer *omap_dm_timer_request_specific(int id) { + struct dmtimer *timer; + /* Requesting timer by ID is not supported when device tree is used */ if (of_have_populated_dt()) { pr_warn("%s: Please use omap_dm_timer_request_by_node()\n", @@ -540,7 +613,11 @@ static struct omap_dm_timer *omap_dm_timer_request_specific(int id) return NULL; } - return _omap_dm_timer_request(REQUEST_BY_ID, &id); + timer = _omap_dm_timer_request(REQUEST_BY_ID, &id); + if (!timer) + return NULL; + + return &timer->cookie; } /** @@ -552,14 +629,23 @@ static struct omap_dm_timer *omap_dm_timer_request_specific(int id) */ static struct omap_dm_timer *omap_dm_timer_request_by_node(struct device_node *np) { + struct dmtimer *timer; + if (!np) return NULL; - return _omap_dm_timer_request(REQUEST_BY_NODE, np); + timer = _omap_dm_timer_request(REQUEST_BY_NODE, np); + if (!timer) + return NULL; + + return &timer->cookie; } -static int omap_dm_timer_free(struct omap_dm_timer *timer) +static int omap_dm_timer_free(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -570,8 +656,9 @@ static int omap_dm_timer_free(struct omap_dm_timer *timer) return 0; } -int omap_dm_timer_get_irq(struct omap_dm_timer *timer) +int omap_dm_timer_get_irq(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); if (timer) return timer->irq; return -EINVAL; @@ -580,7 +667,7 @@ int omap_dm_timer_get_irq(struct omap_dm_timer *timer) #if defined(CONFIG_ARCH_OMAP1) #include -static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *timer) +static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *cookie) { return NULL; } @@ -592,7 +679,7 @@ static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *timer) __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) { int i = 0; - struct omap_dm_timer *timer = NULL; + struct dmtimer *timer = NULL; unsigned long flags; /* If ARMXOR cannot be idled this function call is unnecessary */ @@ -620,8 +707,10 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) #else -static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *timer) +static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); + if (timer && !IS_ERR(timer->fclk)) return timer->fclk; return NULL; @@ -636,15 +725,19 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) #endif -static int omap_dm_timer_start(struct omap_dm_timer *timer) +static int omap_dm_timer_start(struct omap_dm_timer *cookie) { - struct device *dev = &timer->pdev->dev; + struct dmtimer *timer; + struct device *dev; int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); if (rc) return rc; @@ -658,14 +751,18 @@ static int omap_dm_timer_start(struct omap_dm_timer *timer) return 0; } -static int omap_dm_timer_stop(struct omap_dm_timer *timer) +static int omap_dm_timer_stop(struct omap_dm_timer *cookie) { - struct device *dev = &timer->pdev->dev; + struct dmtimer *timer; + struct device *dev; unsigned long rate = 0; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; + dev = &timer->pdev->dev; + if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) rate = clk_get_rate(timer->fclk); @@ -676,12 +773,14 @@ static int omap_dm_timer_stop(struct omap_dm_timer *timer) return 0; } -static int omap_dm_timer_set_load(struct omap_dm_timer *timer, +static int omap_dm_timer_set_load(struct omap_dm_timer *cookie, unsigned int load) { + struct dmtimer *timer; struct device *dev; int rc; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -697,13 +796,15 @@ static int omap_dm_timer_set_load(struct omap_dm_timer *timer, return 0; } -static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, +static int omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable, unsigned int match) { + struct dmtimer *timer; struct device *dev; int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -725,13 +826,15 @@ static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, return 0; } -static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, +static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on, int toggle, int trigger, int autoreload) { + struct dmtimer *timer; struct device *dev; int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -757,12 +860,14 @@ static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, return 0; } -static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *timer) +static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *cookie) { + struct dmtimer *timer; struct device *dev; int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -778,13 +883,15 @@ static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *timer) return l; } -static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, - int prescaler) +static int omap_dm_timer_set_prescaler(struct omap_dm_timer *cookie, + int prescaler) { + struct dmtimer *timer; struct device *dev; int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer) || prescaler < -1 || prescaler > 7) return -EINVAL; @@ -806,12 +913,14 @@ static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, return 0; } -static int omap_dm_timer_set_int_enable(struct omap_dm_timer *timer, +static int omap_dm_timer_set_int_enable(struct omap_dm_timer *cookie, unsigned int value) { + struct dmtimer *timer; struct device *dev; int rc; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -834,12 +943,14 @@ static int omap_dm_timer_set_int_enable(struct omap_dm_timer *timer, * * Disables the specified timer interrupts for a timer. */ -static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) +static int omap_dm_timer_set_int_disable(struct omap_dm_timer *cookie, u32 mask) { + struct dmtimer *timer; struct device *dev; u32 l = mask; int rc; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; @@ -860,10 +971,12 @@ static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) return 0; } -static unsigned int omap_dm_timer_read_status(struct omap_dm_timer *timer) +static unsigned int omap_dm_timer_read_status(struct omap_dm_timer *cookie) { + struct dmtimer *timer; unsigned int l; + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) { pr_err("%s: timer not available or enabled.\n", __func__); return 0; @@ -874,8 +987,11 @@ static unsigned int omap_dm_timer_read_status(struct omap_dm_timer *timer) return l; } -static int omap_dm_timer_write_status(struct omap_dm_timer *timer, unsigned int value) +static int omap_dm_timer_write_status(struct omap_dm_timer *cookie, unsigned int value) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) return -EINVAL; @@ -884,8 +1000,11 @@ static int omap_dm_timer_write_status(struct omap_dm_timer *timer, unsigned int return 0; } -static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *timer) +static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) { pr_err("%s: timer not iavailable or enabled.\n", __func__); return 0; @@ -894,8 +1013,11 @@ static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *timer) return __omap_dm_timer_read_counter(timer); } -static int omap_dm_timer_write_counter(struct omap_dm_timer *timer, unsigned int value) +static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) { pr_err("%s: timer not available or enabled.\n", __func__); return -EINVAL; @@ -910,7 +1032,7 @@ static int omap_dm_timer_write_counter(struct omap_dm_timer *timer, unsigned int static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { - struct omap_dm_timer *timer = dev_get_drvdata(dev); + struct dmtimer *timer = dev_get_drvdata(dev); atomic_set(&timer->enabled, 0); @@ -924,7 +1046,7 @@ static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) static int __maybe_unused omap_dm_timer_runtime_resume(struct device *dev) { - struct omap_dm_timer *timer = dev_get_drvdata(dev); + struct dmtimer *timer = dev_get_drvdata(dev); if (!(timer->capability & OMAP_TIMER_ALWON) && timer->func_base) omap_timer_restore_context(timer); @@ -951,7 +1073,7 @@ static const struct of_device_id omap_timer_match[]; static int omap_dm_timer_probe(struct platform_device *pdev) { unsigned long flags; - struct omap_dm_timer *timer; + struct dmtimer *timer; struct device *dev = &pdev->dev; const struct dmtimer_platform_data *pdata; int ret; @@ -1043,7 +1165,7 @@ err_disable: */ static int omap_dm_timer_remove(struct platform_device *pdev) { - struct omap_dm_timer *timer; + struct dmtimer *timer; unsigned long flags; int ret = -EINVAL; diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index df3c6c88264f..77eceeae708c 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -59,50 +59,7 @@ #define OMAP_TIMER_NEEDS_RESET 0x10000000 #define OMAP_TIMER_HAS_DSP_IRQ 0x08000000 -struct timer_regs { - u32 ocp_cfg; - u32 tidr; - u32 tier; - u32 twer; - u32 tclr; - u32 tcrr; - u32 tldr; - u32 ttrg; - u32 twps; - u32 tmar; - u32 tcar1; - u32 tsicr; - u32 tcar2; - u32 tpir; - u32 tnir; - u32 tcvr; - u32 tocr; - u32 towr; -}; - struct omap_dm_timer { - int id; - int irq; - struct clk *fclk; - - void __iomem *io_base; - int irq_stat; /* TISR/IRQSTATUS interrupt status */ - int irq_ena; /* irq enable */ - int irq_dis; /* irq disable, only on v2 ip */ - void __iomem *pend; /* write pending */ - void __iomem *func_base; /* function register base */ - - atomic_t enabled; - unsigned long rate; - unsigned reserved:1; - unsigned posted:1; - struct timer_regs context; - int revision; - u32 capability; - u32 errata; - struct platform_device *pdev; - struct list_head node; - struct notifier_block nb; }; int omap_dm_timer_get_irq(struct omap_dm_timer *timer); From 664ad59da11687ef9b518fc2519af6a71a1db9f1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:49 +0300 Subject: [PATCH 114/145] clocksource/drivers/timer-ti-dm: Add flag to detect omap1 Let's make it clear that some features need to be tested currently on omap1. Only omap1 still uses platform_data. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-9-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 0d0130ec4d1c..2310f973fec6 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -133,6 +133,7 @@ struct dmtimer { unsigned long rate; unsigned reserved:1; unsigned posted:1; + unsigned omap1:1; struct timer_regs context; int revision; u32 capability; @@ -423,7 +424,7 @@ static int omap_dm_timer_set_source(struct omap_dm_timer *cookie, int source) * use the clock framework to set the parent clock. To be removed * once OMAP1 migrated to using clock framework for dmtimers */ - if (pdata && pdata->set_timer_src) + if (timer->omap1 && pdata && pdata->set_timer_src) return pdata->set_timer_src(timer->pdev, source); #if defined(CONFIG_COMMON_CLK) @@ -476,7 +477,7 @@ static int omap_dm_timer_prepare(struct dmtimer *timer) * FIXME: OMAP1 devices do not use the clock framework for dmtimers so * do not call clk_get() for these devices. */ - if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) { + if (!timer->omap1) { timer->fclk = clk_get(&timer->pdev->dev, "fck"); if (WARN_ON_ONCE(IS_ERR(timer->fclk))) { dev_err(&timer->pdev->dev, ": No fclk handle.\n"); @@ -763,7 +764,7 @@ static int omap_dm_timer_stop(struct omap_dm_timer *cookie) dev = &timer->pdev->dev; - if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) + if (!timer->omap1) rate = clk_get_rate(timer->fclk); __omap_dm_timer_stop(timer, rate); @@ -1119,6 +1120,8 @@ static int omap_dm_timer_probe(struct platform_device *pdev) timer->reserved = omap_dm_timer_reserved_systimer(timer->id); } + timer->omap1 = timer->capability & OMAP_TIMER_NEEDS_RESET; + if (!(timer->capability & OMAP_TIMER_ALWON)) { timer->nb.notifier_call = omap_timer_context_notifier; cpu_pm_register_notifier(&timer->nb); From 789d4b1070261fa98b06384d2067f23c080dc9f1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 15 Aug 2022 16:12:50 +0300 Subject: [PATCH 115/145] clocksource/drivers/timer-ti-dm: Get clock in probe with devm_clk_get() We can simplify the code a bit by getting the clock in probe, and using devm_clk_get(). This will also make further changes easier as the clock is available in probe instead of prepare. Signed-off-by: Tony Lindgren Reviewed-by: Janusz Krzysztofik Link: https://lore.kernel.org/r/20220815131250.34603-10-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 2310f973fec6..cad29ded3a48 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -473,18 +473,6 @@ static int omap_dm_timer_prepare(struct dmtimer *timer) struct device *dev = &timer->pdev->dev; int rc; - /* - * FIXME: OMAP1 devices do not use the clock framework for dmtimers so - * do not call clk_get() for these devices. - */ - if (!timer->omap1) { - timer->fclk = clk_get(&timer->pdev->dev, "fck"); - if (WARN_ON_ONCE(IS_ERR(timer->fclk))) { - dev_err(&timer->pdev->dev, ": No fclk handle.\n"); - return -EINVAL; - } - } - rc = pm_runtime_resume_and_get(dev); if (rc) return rc; @@ -650,8 +638,6 @@ static int omap_dm_timer_free(struct omap_dm_timer *cookie) if (unlikely(!timer)) return -EINVAL; - clk_put(timer->fclk); - WARN_ON(!timer->reserved); timer->reserved = 0; return 0; @@ -1098,7 +1084,6 @@ static int omap_dm_timer_probe(struct platform_device *pdev) if (timer->irq < 0) return timer->irq; - timer->fclk = ERR_PTR(-ENODEV); timer->io_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(timer->io_base)) return PTR_ERR(timer->io_base); @@ -1122,6 +1107,15 @@ static int omap_dm_timer_probe(struct platform_device *pdev) timer->omap1 = timer->capability & OMAP_TIMER_NEEDS_RESET; + /* OMAP1 devices do not yet use the clock framework for dmtimers */ + if (!timer->omap1) { + timer->fclk = devm_clk_get(dev, "fck"); + if (IS_ERR(timer->fclk)) + return PTR_ERR(timer->fclk); + } else { + timer->fclk = ERR_PTR(-ENODEV); + } + if (!(timer->capability & OMAP_TIMER_ALWON)) { timer->nb.notifier_call = omap_timer_context_notifier; cpu_pm_register_notifier(&timer->nb); From 061f42748d0639647636b7a48d71951e8d92720a Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 2 Sep 2022 19:12:06 +0800 Subject: [PATCH 116/145] dt-bindings: timer: nxp,sysctr-timer: add nxp,no-divider property The base clock input to system counter is internally divided by 3 in previous design, but there is change that no divider now. So add a property to indicate that. Signed-off-by: Peng Fan Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220902111207.2902493-2-peng.fan@oss.nxp.com Signed-off-by: Daniel Lezcano --- Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml index 830211c55b4a..2b9653dafab8 100644 --- a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml +++ b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml @@ -32,6 +32,10 @@ properties: clock-names: const: per + nxp,no-divider: + description: if present, means there is no internal base clk divider. + type: boolean + required: - compatible - reg From 27b30995b75d1e79360c164ba179bca86ab76ba6 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Fri, 2 Sep 2022 19:12:07 +0800 Subject: [PATCH 117/145] clocksource/drivers/imx-sysctr: handle nxp,no-divider property The previous hardware design embedds a internal divider for base clock. New design not has that divider, so check the nxp,no-divider property, if true, directly use base clock input, otherwise divide by 3 as before. Signed-off-by: Peng Fan Link: https://lore.kernel.org/r/20220902111207.2902493-3-peng.fan@oss.nxp.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-imx-sysctr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/timer-imx-sysctr.c b/drivers/clocksource/timer-imx-sysctr.c index 523e37662a6e..5a7a951c4efc 100644 --- a/drivers/clocksource/timer-imx-sysctr.c +++ b/drivers/clocksource/timer-imx-sysctr.c @@ -134,8 +134,10 @@ static int __init sysctr_timer_init(struct device_node *np) if (ret) return ret; - /* system counter clock is divided by 3 internally */ - to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV; + if (!of_property_read_bool(np, "nxp,no-divider")) { + /* system counter clock is divided by 3 internally */ + to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV; + } sys_ctr_base = timer_of_base(&to_sysctr); cmpcr = readl(sys_ctr_base + CMPCR); From 76e64c73db9542ff4bae8a60f4f32e38f3799b95 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 19 Sep 2022 09:52:13 +0900 Subject: [PATCH 118/145] locking/lockdep: Print more debug information - report name and key when look_up_lock_class() got confused Printing this information will be helpful: ------------[ cut here ]------------ Looking for class "l2tp_sock" with key l2tp_socket_class, but found a different class "slock-AF_INET6" with the same key WARNING: CPU: 1 PID: 14195 at kernel/locking/lockdep.c:940 look_up_lock_class+0xcc/0x140 Modules linked in: CPU: 1 PID: 14195 Comm: a.out Not tainted 6.0.0-rc6-dirty #863 Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 RIP: 0010:look_up_lock_class+0xcc/0x140 Signed-off-by: Tetsuo Handa Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/bd99391e-f787-efe9-5ec6-3c6dc4c587b0@I-love.SAKURA.ne.jp --- kernel/locking/lockdep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 64a13eb56078..e3375bc40dad 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -934,8 +934,10 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name && - lock->key != &__lockdep_no_validate__); + WARN_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__, + "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n", + lock->name, lock->key, class->name); return class; } } From dca6344d7a77dd0501a73745f4a9fb1ee2bc9d7c Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sun, 18 Sep 2022 00:41:08 +0100 Subject: [PATCH 119/145] perf/core: Convert snprintf() to scnprintf() Coccinelle reports a warning: WARNING: use scnprintf or sprintf This LWN article explains the rationale for this change: https: //lwn.net/Articles/69419/ Ie. snprintf() returns what *would* be the resulting length, while scnprintf() returns the actual length. Adding to that, there has also been some slow migration from snprintf to scnprintf, here's the shift in usage in the past 3.5 years, in all fs/ files: v5.0 v6.0-rc6 -------------------------------------- snprintf() uses: 63 213 scnprintf() uses: 374 186 No intended change in behavior. [ mingo: Improved the changelog & reviewed the usage sites. ] Signed-off-by: Jules Irenge Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 7da5515082f9..c07e9a3ea94c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10952,7 +10952,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10963,7 +10963,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10974,7 +10974,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); From 6edf2576a6cc46460c164831517a36064eb8109c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 13 Sep 2022 14:54:20 +0800 Subject: [PATCH 120/145] mm/slub: enable debugging memory wasting of kmalloc kmalloc's API family is critical for mm, with one nature that it will round up the request size to a fixed one (mostly power of 2). Say when user requests memory for '2^n + 1' bytes, actually 2^(n+1) bytes could be allocated, so in worst case, there is around 50% memory space waste. The wastage is not a big issue for requests that get allocated/freed quickly, but may cause problems with objects that have longer life time. We've met a kernel boot OOM panic (v5.10), and from the dumped slab info: [ 26.062145] kmalloc-2k 814056KB 814056KB From debug we found there are huge number of 'struct iova_magazine', whose size is 1032 bytes (1024 + 8), so each allocation will waste 1016 bytes. Though the issue was solved by giving the right (bigger) size of RAM, it is still nice to optimize the size (either use a kmalloc friendly size or create a dedicated slab for it). And from lkml archive, there was another crash kernel OOM case [1] back in 2019, which seems to be related with the similar slab waste situation, as the log is similar: [ 4.332648] iommu: Adding device 0000:20:02.0 to group 16 [ 4.338946] swapper/0 invoked oom-killer: gfp_mask=0x6040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null), order=0, oom_score_adj=0 ... [ 4.857565] kmalloc-2048 59164KB 59164KB The crash kernel only has 256M memory, and 59M is pretty big here. (Note: the related code has been changed and optimised in recent kernel [2], these logs are just picked to demo the problem, also a patch changing its size to 1024 bytes has been merged) So add an way to track each kmalloc's memory waste info, and leverage the existing SLUB debug framework (specifically SLUB_STORE_USER) to show its call stack of original allocation, so that user can evaluate the waste situation, identify some hot spots and optimize accordingly, for a better utilization of memory. The waste info is integrated into existing interface: '/sys/kernel/debug/slab/kmalloc-xx/alloc_traces', one example of 'kmalloc-4k' after boot is: 126 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] waste=233856/1856 age=280763/281414/282065 pid=1330 cpus=32 nodes=1 __kmem_cache_alloc_node+0x11f/0x4e0 __kmalloc_node+0x4e/0x140 ixgbe_alloc_q_vector+0xbe/0x830 [ixgbe] ixgbe_init_interrupt_scheme+0x2ae/0xc90 [ixgbe] ixgbe_probe+0x165f/0x1d20 [ixgbe] local_pci_probe+0x78/0xc0 work_for_cpu_fn+0x26/0x40 ... which means in 'kmalloc-4k' slab, there are 126 requests of 2240 bytes which got a 4KB space (wasting 1856 bytes each and 233856 bytes in total), from ixgbe_alloc_q_vector(). And when system starts some real workload like multiple docker instances, there could are more severe waste. [1]. https://lkml.org/lkml/2019/8/12/266 [2]. https://lore.kernel.org/lkml/2920df89-9975-5785-f79b-257d3052dfaf@huawei.com/ [Thanks Hyeonggon for pointing out several bugs about sorting/format] [Thanks Vlastimil for suggesting way to reduce memory usage of orig_size and keep it only for kmalloc objects] Signed-off-by: Feng Tang Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Robin Murphy Cc: John Garry Cc: Kefeng Wang Signed-off-by: Vlastimil Babka --- Documentation/mm/slub.rst | 33 +++++--- include/linux/slab.h | 2 + mm/slab_common.c | 3 +- mm/slub.c | 154 +++++++++++++++++++++++++++++--------- 4 files changed, 142 insertions(+), 50 deletions(-) diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 43063ade737a..4e1578186b4f 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -400,21 +400,30 @@ information: allocated objects. The output is sorted by frequency of each trace. Information in the output: - Number of objects, allocating function, minimal/average/maximal jiffies since alloc, - pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. + Number of objects, allocating function, possible memory wastage of + kmalloc objects(total/per-object), minimal/average/maximal jiffies + since alloc, pid range of the allocating processes, cpu mask of + allocating cpus, numa node mask of origins of memory, and stack trace. Example::: - 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: - __slab_alloc+0x6d/0x90 - kmem_cache_alloc_trace+0x2eb/0x300 - populate_error_injection_list+0x97/0x110 - init_error_injection+0x1b/0x71 - do_one_initcall+0x5f/0x2d0 - kernel_init_freeable+0x26f/0x2d7 - kernel_init+0xe/0x118 - ret_from_fork+0x22/0x30 - + 338 pci_alloc_dev+0x2c/0xa0 waste=521872/1544 age=290837/291891/293509 pid=1 cpus=106 nodes=0-1 + __kmem_cache_alloc_node+0x11f/0x4e0 + kmalloc_trace+0x26/0xa0 + pci_alloc_dev+0x2c/0xa0 + pci_scan_single_device+0xd2/0x150 + pci_scan_slot+0xf7/0x2d0 + pci_scan_child_bus_extend+0x4e/0x360 + acpi_pci_root_create+0x32e/0x3b0 + pci_acpi_scan_root+0x2b9/0x2d0 + acpi_pci_root_add.cold.11+0x110/0xb0a + acpi_bus_attach+0x262/0x3f0 + device_for_each_child+0xb7/0x110 + acpi_dev_for_each_child+0x77/0xa0 + acpi_bus_attach+0x108/0x3f0 + device_for_each_child+0xb7/0x110 + acpi_dev_for_each_child+0x77/0xa0 + acpi_bus_attach+0x108/0x3f0 2. free_traces:: diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..a713b0e5bbcd 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -29,6 +29,8 @@ #define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ #define SLAB_POISON ((slab_flags_t __force)0x00000800U) +/* Indicate a kmalloc slab */ +#define SLAB_KMALLOC ((slab_flags_t __force)0x00001000U) /* Align objs on cache lines */ #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 17996649cfe3..a2c8b937b14e 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -649,7 +649,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, if (!s) panic("Out of memory when creating slab %s\n", name); - create_boot_cache(s, name, size, flags, useroffset, usersize); + create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, + usersize); kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; diff --git a/mm/slub.c b/mm/slub.c index d9650f2ca776..a8a3e7d6d6aa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -194,11 +194,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +/* Structure holding parameters for get_partial() call chain */ +struct partial_context { + struct slab **slab; + gfp_t flags; + unsigned int orig_size; +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } +static inline bool slub_debug_orig_size(struct kmem_cache *s) +{ + return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && + (s->flags & SLAB_KMALLOC)); +} + void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -785,6 +798,39 @@ static void print_slab_info(const struct slab *slab) folio_flags(folio, 0)); } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) { struct va_format vaf; @@ -844,6 +890,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); + if (slub_debug_orig_size(s)) + off += sizeof(unsigned int); + off += kasan_metadata_size(s); if (off != size_from_object(s)) @@ -977,7 +1026,8 @@ skip_bug_print: * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at minimum + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -995,10 +1045,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ - if (s->flags & SLAB_STORE_USER) + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + off += kasan_metadata_size(s); if (size_from_object(s) == off) @@ -1293,7 +1347,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object) + struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) @@ -1302,6 +1356,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1570,7 +1625,7 @@ static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object) { return 0; } + struct slab *slab, void *object, int orig_size) { return 0; } static inline void free_debug_processing( struct kmem_cache *s, struct slab *slab, @@ -2013,7 +2068,7 @@ static inline void remove_partial(struct kmem_cache_node *n, * it to full list if it was the last free object. */ static void *alloc_single_from_partial(struct kmem_cache *s, - struct kmem_cache_node *n, struct slab *slab) + struct kmem_cache_node *n, struct slab *slab, int orig_size) { void *object; @@ -2023,7 +2078,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s, slab->freelist = get_freepointer(s, object); slab->inuse++; - if (!alloc_debug_processing(s, slab, object)) { + if (!alloc_debug_processing(s, slab, object, orig_size)) { remove_partial(n, slab); return NULL; } @@ -2042,7 +2097,7 @@ static void *alloc_single_from_partial(struct kmem_cache *s, * and put the slab to the partial (or full) list. */ static void *alloc_single_from_new_slab(struct kmem_cache *s, - struct slab *slab) + struct slab *slab, int orig_size) { int nid = slab_nid(slab); struct kmem_cache_node *n = get_node(s, nid); @@ -2054,7 +2109,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, slab->freelist = get_freepointer(s, object); slab->inuse = 1; - if (!alloc_debug_processing(s, slab, object)) + if (!alloc_debug_processing(s, slab, object, orig_size)) /* * It's not really expected that this would fail on a * freshly allocated slab, but a concurrent memory @@ -2132,7 +2187,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct slab **ret_slab, gfp_t gfpflags) + struct partial_context *pc) { struct slab *slab, *slab2; void *object = NULL; @@ -2152,11 +2207,12 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(slab, gfpflags)) + if (!pfmemalloc_match(slab, pc->flags)) continue; if (kmem_cache_debug(s)) { - object = alloc_single_from_partial(s, n, slab); + object = alloc_single_from_partial(s, n, slab, + pc->orig_size); if (object) break; continue; @@ -2167,7 +2223,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; if (!object) { - *ret_slab = slab; + *pc->slab = slab; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { @@ -2191,14 +2247,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct slab **ret_slab) +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(pc->flags); void *object; unsigned int cpuset_mems_cookie; @@ -2226,15 +2281,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed(zone, flags) && + if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, ret_slab, flags); + object = get_partial_node(s, n, pc); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2255,8 +2310,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct slab **ret_slab) +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) { void *object; int searchnode = node; @@ -2264,11 +2318,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags); + object = get_partial_node(s, get_node(s, searchnode), pc); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, ret_slab); + return get_any_partial(s, pc); } #ifdef CONFIG_PREEMPTION @@ -2989,11 +3043,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *freelist; struct slab *slab; unsigned long flags; + struct partial_context pc; stat(s, ALLOC_SLOWPATH); @@ -3107,7 +3162,10 @@ new_slab: new_objects: - freelist = get_partial(s, gfpflags, node, &slab); + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + freelist = get_partial(s, node, &pc); if (freelist) goto check_new_slab; @@ -3123,7 +3181,7 @@ new_objects: stat(s, ALLOC_SLAB); if (kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab); + freelist = alloc_single_from_new_slab(s, slab, orig_size); if (unlikely(!freelist)) goto new_objects; @@ -3155,6 +3213,7 @@ check_new_slab: */ if (s->flags & SLAB_STORE_USER) set_track(s, freelist, TRACK_ALLOC, addr); + return freelist; } @@ -3197,7 +3256,7 @@ retry_load_slab: * pointer. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *p; @@ -3210,7 +3269,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = slub_get_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c); + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -3295,7 +3354,7 @@ redo: if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); } else { void *next_object = get_freepointer_safe(s, object); @@ -3793,7 +3852,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); + _RET_IP_, c, s->object_size); if (unlikely(!p[i])) goto error; @@ -4196,12 +4255,17 @@ static int calculate_sizes(struct kmem_cache *s) } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) { /* * Need to store information about allocs and frees after * the object. */ size += 2 * sizeof(struct track); + + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } #endif kasan_cache_create(s, &size, &s->flags); @@ -5146,6 +5210,7 @@ struct location { depot_stack_handle_t handle; unsigned long count; unsigned long addr; + unsigned long waste; long long sum_time; long min_time; long max_time; @@ -5192,13 +5257,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) } static int add_location(struct loc_track *t, struct kmem_cache *s, - const struct track *track) + const struct track *track, + unsigned int orig_size) { long start, end, pos; struct location *l; - unsigned long caddr, chandle; + unsigned long caddr, chandle, cwaste; unsigned long age = jiffies - track->when; depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; #ifdef CONFIG_STACKDEPOT handle = READ_ONCE(track->handle); @@ -5216,11 +5283,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (pos == end) break; - caddr = t->loc[pos].addr; - chandle = t->loc[pos].handle; - if ((track->addr == caddr) && (handle == chandle)) { + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { - l = &t->loc[pos]; l->count++; if (track->when) { l->sum_time += age; @@ -5245,6 +5314,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, end = pos; else if (track->addr == caddr && handle < chandle) end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; else start = pos; } @@ -5268,6 +5340,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->min_pid = track->pid; l->max_pid = track->pid; l->handle = handle; + l->waste = waste; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -5280,13 +5353,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, unsigned long *obj_map) { void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); void *p; __fill_map(obj_map, s, slab); for_each_object(p, s, addr, slab->objects) if (!test_bit(__obj_to_index(s, addr, p), obj_map)) - add_location(t, s, get_track(s, p, alloc)); + add_location(t, s, get_track(s, p, alloc), + is_alloc ? get_orig_size(s, p) : + s->object_size); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ @@ -6156,6 +6232,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) else seq_puts(seq, ""); + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + if (l->sum_time != l->min_time) { seq_printf(seq, " age=%ld/%llu/%ld", l->min_time, div_u64(l->sum_time, l->count), From d65360f224069a6de56eb18e0425973914a10fe8 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 26 Sep 2022 22:20:41 +0800 Subject: [PATCH 121/145] mm/slub: clean up create_unique_id() As Christophe JAILLET suggested [1] In create_unique_id(), "looks that ID_STR_LENGTH could even be reduced to 32 or 16. The 2nd BUG_ON at the end of the function could certainly be just removed as well or remplaced by a: if (p > name + ID_STR_LENGTH - 1) { kfree(name); return -E; } " According to above suggestion, let's do below cleanups: 1. reduce ID_STR_LENGTH to 32, as the buffer size should be enough; 2. use WARN_ON instead of BUG_ON() and return error if check condition is true; 3. use snprintf instead of sprintf to avoid overflow. [1] https://lore.kernel.org/linux-mm/2025305d-16db-abdf-6cd3-1fb93371c2b4@wanadoo.fr/ Suggested-by: Christophe JAILLET Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Chao Yu Signed-off-by: Vlastimil Babka --- mm/slub.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index dc59b9e8c66f..8f80d9bc507f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5874,7 +5874,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) return slab_kset; } -#define ID_STR_LENGTH 64 +#define ID_STR_LENGTH 32 /* Create a unique string id for a slab cache: * @@ -5907,9 +5907,12 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07u", s->size); + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); - BUG_ON(p > name + ID_STR_LENGTH - 1); + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } return name; } From af246cc6d0ed11318223606128bb0b09866c4c08 Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Tue, 27 Sep 2022 11:32:21 +0800 Subject: [PATCH 122/145] clocksource/drivers/arm_arch_timer: Fix CNTPCT_LO and CNTVCT_LO value CNTPCT_LO and CNTVCT_LO are defined by mistake in commit '8b82c4f883a7', so fix them according to the Arm ARM DDI 0487I.a, Table I2-4 "CNTBaseN memory map" as follows: Offset Register Type Description 0x000 CNTPCT[31:0] RO Physical Count register. 0x004 CNTPCT[63:32] RO 0x008 CNTVCT[31:0] RO Virtual Count register. 0x00C CNTVCT[63:32] RO Fixes: 8b82c4f883a7 ("clocksource/drivers/arm_arch_timer: Move MMIO timer programming over to CVAL") Cc: stable@vger.kernel.org Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Marc Zyngier Cc: Mark Rutland Acked-by: Marc Zyngier Signed-off-by: Yang Guo Signed-off-by: Shaokun Zhang Link: https://lore.kernel.org/r/20220927033221.49589-1-zhangshaokun@hisilicon.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/arm_arch_timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index ff935efb6a88..a7ff77550e17 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -44,8 +44,8 @@ #define CNTACR_RWVT BIT(4) #define CNTACR_RWPT BIT(5) -#define CNTVCT_LO 0x00 -#define CNTPCT_LO 0x08 +#define CNTPCT_LO 0x00 +#define CNTVCT_LO 0x08 #define CNTFRQ 0x10 #define CNTP_CVAL_LO 0x20 #define CNTP_CTL 0x2c From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: [PATCH 123/145] perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 8 ++++++-- kernel/events/core.c | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4ba6ab6d0d92..d2e9ff16f6ed 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1621,8 +1621,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ @@ -1783,8 +1785,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_DATA_SRC; } - if (sample_type & PERF_SAMPLE_ADDR_TYPE) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index c07e9a3ea94c..a91f74db9fe9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,11 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) data->txn = 0; + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: [PATCH 124/145] perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could check with the NULL, follow the same rule with other fields. Remove the raw field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- arch/s390/kernel/perf_cpum_cf.c | 1 + arch/s390/kernel/perf_pai_crypto.c | 1 + arch/x86/events/amd/ibs.c | 1 + include/linux/perf_event.h | 5 ++--- kernel/events/core.c | 3 ++- 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..6826e2a69a21 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -366,6 +366,7 @@ static int paicrypt_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ce5720bfb350..c29a006954c7 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -781,6 +781,7 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index a91f74db9fe9..04e19a857d4b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7332,7 +7332,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7348,6 +7348,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; From 4674ffe2fcad45a9b164401cc0794115702326cf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 27 Sep 2022 19:20:25 +0200 Subject: [PATCH 125/145] perf, hw_breakpoint: Fix use-after-free if perf_event_open() fails Local testing revealed that we can trigger a use-after-free during rhashtable lookup as follows: | BUG: KASAN: use-after-free in memcmp lib/string.c:757 | Read of size 8 at addr ffff888107544dc0 by task perf-rhltable-n/1293 | | CPU: 0 PID: 1293 Comm: perf-rhltable-n Not tainted 6.0.0-rc3-00014-g85260862789c #46 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014 | Call Trace: | | memcmp lib/string.c:757 | rhashtable_compare include/linux/rhashtable.h:577 [inline] | __rhashtable_lookup include/linux/rhashtable.h:602 [inline] | rhltable_lookup include/linux/rhashtable.h:688 [inline] | task_bp_pinned kernel/events/hw_breakpoint.c:324 | toggle_bp_slot kernel/events/hw_breakpoint.c:462 | __release_bp_slot kernel/events/hw_breakpoint.c:631 [inline] | release_bp_slot kernel/events/hw_breakpoint.c:639 | register_perf_hw_breakpoint kernel/events/hw_breakpoint.c:742 | hw_breakpoint_event_init kernel/events/hw_breakpoint.c:976 | perf_try_init_event kernel/events/core.c:11261 | perf_init_event kernel/events/core.c:11325 [inline] | perf_event_alloc kernel/events/core.c:11619 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | | Allocated by task 1292: | perf_event_alloc kernel/events/core.c:11505 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | Freed by task 1292: | perf_event_alloc kernel/events/core.c:11716 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | The buggy address belongs to the object at ffff888107544c00 | which belongs to the cache perf_event of size 1352 | The buggy address is located 448 bytes inside of | 1352-byte region [ffff888107544c00, ffff888107545148) This happens because the first perf_event_open() managed to reserve a HW breakpoint slot, however, later fails for other reasons and returns. The second perf_event_open() runs concurrently, and during rhltable_lookup() looks up an entry which is being freed: since rhltable_lookup() may run concurrently (under the RCU read lock) with rhltable_remove(), we may end up with a stale entry, for which memory may also have already been freed when being accessed. To fix, only free the failed perf_event after an RCU grace period. This allows subsystems that store references to an event to always access it concurrently under the RCU read lock, even if initialization will fail. Given failure is unlikely and a slow-path, turning the immediate free into a call_rcu()-wrapped free does not affect performance elsewhere. Fixes: 0370dc314df3 ("perf/hw_breakpoint: Optimize list of per-task breakpoints") Reported-by: syzkaller Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220927172025.1636995-1-elver@google.com --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 04e19a857d4b..e1ffdb861b53 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11734,11 +11734,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } From cce6a2d7e0e494c453ad73e1e78bd50684f20cca Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 27 Sep 2022 22:32:59 +0200 Subject: [PATCH 126/145] bpf: Check flags for branch stack in bpf_read_branch_records helper Recent commit [1] changed branch stack data indication from br_stack pointer to sample_flags in perf_sample_data struct. We need to check sample_flags for PERF_SAMPLE_BRANCH_STACK bit for valid branch stack data. [1] a9a931e26668 ("perf: Use sample_flags for branch stack") Fixes: a9a931e26668 ("perf: Use sample_flags for branch stack") Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20220927203259.590950-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68e5cdd24cef..1fcd1234607e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1507,6 +1507,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + if (unlikely(!br_stack)) return -ENOENT; From 9ed9cac1850a2a55674b4a17100c50b46f645921 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:07 -0700 Subject: [PATCH 127/145] slab: Remove __malloc attribute from realloc functions The __malloc attribute should not be applied to "realloc" functions, as the returned pointer may alias the storage of the prior pointer. Instead of splitting __malloc from __alloc_size, which would be a huge amount of churn, just create __realloc_size for the few cases where it is needed. Thanks to Geert Uytterhoeven for reporting build failures with gcc-8 in earlier version which tried to remove the #ifdef. While the "alloc_size" attribute is available on all GCC versions, I forgot that it gets disabled explicitly by the kernel in GCC < 9.1 due to misbehaviors. Add a note to the compiler_attributes.h entry for it. Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Roman Gushchin Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Marco Elver Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/compiler_attributes.h | 3 ++- include/linux/compiler_types.h | 8 +++++--- include/linux/slab.h | 12 ++++++------ mm/slab_common.c | 4 ++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..96a4ed11b4be 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -35,7 +35,8 @@ /* * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally - * available and includes other attributes. + * available and includes other attributes. For GCC < 9.1, __alloc_size__ gets undefined + * in compiler-gcc.h, due to misbehaviors. * * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..0717534f8364 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -271,14 +271,16 @@ struct ftrace_likely_data { /* * Any place that could be marked with the "alloc_size" attribute is also - * a place to be marked with the "malloc" attribute. Do this as part of the - * __alloc_size macro to avoid redundant attributes and to avoid missing a - * __malloc marking. + * a place to be marked with the "malloc" attribute, except those that may + * be performing a _reallocation_, as that may alias the existing pointer. + * For these, use __realloc_size(). */ #ifdef __alloc_size__ # define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +# define __realloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) #else # define __alloc_size(x, ...) __malloc +# define __realloc_size(x, ...) #endif #ifndef asm_volatile_goto diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..41bd036e7551 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,7 +184,7 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -647,10 +647,10 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, - size_t new_n, - size_t new_size, - gfp_t flags) +static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -774,7 +774,7 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla } extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __alloc_size(3); + __realloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); diff --git a/mm/slab_common.c b/mm/slab_common.c index ccc02573588f..023dc0451d16 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1150,8 +1150,8 @@ module_init(slab_proc_init); #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks; From 05a940656e1eb2026d9ee31019d5b47e9545124d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 23 Sep 2022 13:28:08 -0700 Subject: [PATCH 128/145] slab: Introduce kmalloc_size_roundup() In the effort to help the compiler reason about buffer sizes, the __alloc_size attribute was added to allocators. This improves the scope of the compiler's ability to apply CONFIG_UBSAN_BOUNDS and (in the near future) CONFIG_FORTIFY_SOURCE. For most allocations, this works well, as the vast majority of callers are not expecting to use more memory than what they asked for. There is, however, one common exception to this: anticipatory resizing of kmalloc allocations. These cases all use ksize() to determine the actual bucket size of a given allocation (e.g. 128 when 126 was asked for). This comes in two styles in the kernel: 1) An allocation has been determined to be too small, and needs to be resized. Instead of the caller choosing its own next best size, it wants to minimize the number of calls to krealloc(), so it just uses ksize() plus some additional bytes, forcing the realloc into the next bucket size, from which it can learn how large it is now. For example: data = krealloc(data, ksize(data) + 1, gfp); data_len = ksize(data); 2) The minimum size of an allocation is calculated, but since it may grow in the future, just use all the space available in the chosen bucket immediately, to avoid needing to reallocate later. A good example of this is skbuff's allocators: data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); ... /* kmalloc(size) might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ osize = ksize(data); size = SKB_WITH_OVERHEAD(osize); In both cases, the "how much was actually allocated?" question is answered _after_ the allocation, where the compiler hinting is not in an easy place to make the association any more. This mismatch between the compiler's view of the buffer length and the code's intention about how much it is going to actually use has already caused problems[1]. It is possible to fix this by reordering the use of the "actual size" information. We can serve the needs of users of ksize() and still have accurate buffer length hinting for the compiler by doing the bucket size calculation _before_ the allocation. Code can instead ask "how large an allocation would I get for a given size?". Introduce kmalloc_size_roundup(), to serve this function so we can start replacing the "anticipatory resizing" uses of ksize(). [1] https://github.com/ClangBuiltLinux/linux/issues/1599 https://github.com/KSPP/linux/issues/183 [ vbabka@suse.cz: add SLOB version ] Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: linux-mm@kvack.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 31 +++++++++++++++++++++++++++++++ mm/slab.c | 9 ++++++--- mm/slab_common.c | 20 ++++++++++++++++++++ mm/slob.c | 14 ++++++++++++++ 4 files changed, 71 insertions(+), 3 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 41bd036e7551..727640173568 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -188,7 +188,21 @@ void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __r void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); + +/** + * ksize - Report actual allocation size of associated object + * + * @objp: Pointer returned from a prior kmalloc()-family allocation. + * + * This should not be used for writing beyond the originally requested + * allocation size. Either use krealloc() or round up the allocation size + * with kmalloc_size_roundup() prior to allocation. If this is used to + * access beyond the originally requested allocation size, UBSAN_BOUNDS + * and/or FORTIFY_SOURCE may trip, since they only know about the + * originally allocated size via the __alloc_size attribute. + */ size_t ksize(const void *objp); + #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -779,6 +793,23 @@ extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); + +/** + * kmalloc_size_roundup - Report allocation bucket size for the given size + * + * @size: Number of bytes to round up from. + * + * This returns the number of bytes that would be available in a kmalloc() + * allocation of @size bytes. For example, a 126 byte request would be + * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly + * for the general-purpose kmalloc()-based allocations, and is not for the + * pre-sized kmem_cache_alloc()-based allocations.) + * + * Use this to kmalloc() the full bucket size ahead of time instead of using + * ksize() to query the size after an allocation. + */ +size_t kmalloc_size_roundup(size_t size); + void __init kmem_cache_init_late(void); #if defined(CONFIG_SMP) && defined(CONFIG_SLAB) diff --git a/mm/slab.c b/mm/slab.c index 10e96137b44f..2da862bf6226 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4192,11 +4192,14 @@ void __check_heap_object(const void *ptr, unsigned long n, #endif /* CONFIG_HARDENED_USERCOPY */ /** - * __ksize -- Uninstrumented ksize. + * __ksize -- Report full size of underlying allocation * @objp: pointer to the object * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. + * This should only be used internally to query the true size of allocations. + * It is not meant to be a way to discover the usable size of an allocation + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, + * and/or FORTIFY_SOURCE. * * Return: size of the actual memory used by @objp in bytes */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 023dc0451d16..78c0dcb0221b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -737,6 +737,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[kmalloc_type(flags)][index]; } +size_t kmalloc_size_roundup(size_t size) +{ + struct kmem_cache *c; + + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + /* Above the smaller buckets, size is a multiple of page size. */ + if (size > KMALLOC_MAX_CACHE_SIZE) + return PAGE_SIZE << get_order(size); + + /* The flags don't matter since size_index is common to all. */ + c = kmalloc_slab(size, GFP_KERNEL); + return c ? c->object_size : 0; +} +EXPORT_SYMBOL(kmalloc_size_roundup); + #ifdef CONFIG_ZONE_DMA #define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, #else diff --git a/mm/slob.c b/mm/slob.c index 2bd4f476c340..5dbdf6ad8bcc 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -574,6 +574,20 @@ void kfree(const void *block) } EXPORT_SYMBOL(kfree); +size_t kmalloc_size_roundup(size_t size) +{ + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + + return ALIGN(size, ARCH_KMALLOC_MINALIGN); +} + +EXPORT_SYMBOL(kmalloc_size_roundup); + /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t __ksize(const void *block) { From 50b0c97bf00e4815aee09cace28b940ebb060e69 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:28 -0700 Subject: [PATCH 129/145] perf/x86: Add new Raptor Lake S support From PMU's perspective, the new Raptor Lake S is the same as the other of hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b2d8def4d03c..3939debb27e7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6344,6 +6344,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: /* * Alder Lake has 2 types of CPU, core and atom. * From 193c888b7ffe4da97346950c0e98dd77cc629f24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:29 -0700 Subject: [PATCH 130/145] perf/x86/msr: Add new Raptor Lake S support The same as the other hybrid {ALDER,RAPTOP}LAKE, the new Raptor Lake S also support PPERF and SMI_COUNT MSRs. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-2-kan.liang@linux.intel.com --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ac542f98c070..ecced3a52668 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -106,6 +106,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; From d12940d2ead51c6978e7d38b2abf12b833270b2a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:30 -0700 Subject: [PATCH 131/145] perf/x86/cstate: Add new Raptor Lake S support From the perspective of Intel cstate residency counters, the new Raptor Lake S is the same as the other hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 8ec23f47fee9..a2834bc93149 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -685,6 +685,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); From e04a1607c9c3c0e2dc48715aeb570f2581f514bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:31 -0700 Subject: [PATCH 132/145] perf/x86/uncore: Add new Raptor Lake S support From the perspective of the uncore PMU, the new Raptor Lake S is the same as the other hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index db6c31bca809..6f1ccc57a692 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1831,6 +1831,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, From ee3e88dfec23153d0675b5d00522297b9adf657c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:51 +0530 Subject: [PATCH 133/145] perf/mem: Introduce PERF_MEM_LVLNUM_{EXTN_MEM|IO} PERF_MEM_LVLNUM_EXTN_MEM which can be used to indicate accesses to extension memory like CXL etc. PERF_MEM_LVL_IO can be used for IO accesses but it can not distinguish between local and remote IO. Introduce new field PERF_MEM_LVLNUM_IO which can be clubbed with PERF_MEM_REMOTE_REMOTE to indicate Remote IO accesses. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-2-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e639c74cf5fb..4ae3c249f675 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1336,7 +1336,9 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0xa available */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ From 610c238041fbc682936d34132362a54a802600fe Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:52 +0530 Subject: [PATCH 134/145] perf/x86/amd: Add IBS OP_DATA2 DataSrc bit definitions IBS_OP_DATA2 DataSrc provides detail about location of the data being accessed from by load ops. Define macros for legacy and extended DataSrc values. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-3-ravi.bangoria@amd.com --- arch/x86/include/asm/amd-ibs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index f3eb098d63d4..cb2a5e113daa 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -6,6 +6,22 @@ #include +/* IBS_OP_DATA2 DataSrc */ +#define IBS_DATA_SRC_LOC_CACHE 2 +#define IBS_DATA_SRC_DRAM 3 +#define IBS_DATA_SRC_REM_CACHE 4 +#define IBS_DATA_SRC_IO 7 + +/* IBS_OP_DATA2 DataSrc Extension */ +#define IBS_DATA_SRC_EXT_LOC_CACHE 1 +#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 +#define IBS_DATA_SRC_EXT_DRAM 3 +#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 +#define IBS_DATA_SRC_EXT_PMEM 6 +#define IBS_DATA_SRC_EXT_IO 7 +#define IBS_DATA_SRC_EXT_EXT_MEM 8 +#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 + /* * IBS Hardware MSRs */ From 7c10dd0a88b1cc6ae4637fffb494c5e080027eb6 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:53 +0530 Subject: [PATCH 135/145] perf/x86/amd: Support PERF_SAMPLE_DATA_SRC struct perf_mem_data_src is used to pass arch specific memory access details into generic form. These details gets consumed by tools like perf mem and c2c. IBS tagged load/store sample provides most of the information needed for these tools. Add a logic to convert IBS specific raw data into perf_mem_data_src. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 318 +++++++++++++++++++++++++++++++++++++- 1 file changed, 312 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c29a006954c7..e20caa5cf02f 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -678,6 +678,312 @@ static struct perf_ibs perf_ibs_op = { .get_count = get_ibs_op_count, }; +static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_op = PERF_MEM_OP_NA; + + if (op_data3->ld_op) + data_src->mem_op = PERF_MEM_OP_LOAD; + else if (op_data3->st_op) + data_src->mem_op = PERF_MEM_OP_STORE; +} + +/* + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has + * more fine granular DataSrc encodings. Others have coarse. + */ +static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) +{ + if (ibs_caps & IBS_CAPS_ZEN4) + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; + + return op_data2->data_src_lo; +} + +static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src = perf_ibs_data_src(op_data2); + + data_src->mem_lvl = 0; + + /* + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached + * memory accesses. So, check DcUcMemAcc bit early. + */ + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; + return; + } + + /* L1 Hit */ + if (op_data3->dc_miss == 0) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + return; + } + + /* L2 Hit */ + if (op_data3->l2_miss == 0) { + /* Erratum #1293 */ + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* + * OP_DATA2 is valid only for load ops. Skip all checks which + * uses OP_DATA2[DataSrc]. + */ + if (data_src->mem_op != PERF_MEM_OP_LOAD) + goto check_mab; + + /* L3 Hit */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | + PERF_MEM_LVL_HIT; + return; + } + } + + /* A peer cache in a near CCX */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + return; + } + + /* A peer cache in a far CCX */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* DRAM */ + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { + if (op_data2->rmt_node == 0) + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + else + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + return; + } + + /* PMEM */ + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* Extension Memory */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* IO */ + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_IO; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + +check_mab: + /* + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding + * DC misses. However, such data may come from any level in mem + * hierarchy. IBS provides detail about both MAB as well as actual + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set + * MAB only when IBS fails to provide DataSrc. + */ + if (op_data3->dc_miss_no_mab_alloc) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + return; + } + + data_src->mem_lvl = PERF_MEM_LVL_NA; +} + +static bool perf_ibs_cache_hit_st_valid(void) +{ + /* 0: Uninitialized, 1: Valid, -1: Invalid */ + static int cache_hit_st_valid; + + if (unlikely(!cache_hit_st_valid)) { + if (boot_cpu_data.x86 == 0x19 && + (boot_cpu_data.x86_model <= 0xF || + (boot_cpu_data.x86_model >= 0x20 && + boot_cpu_data.x86_model <= 0x5F))) { + cache_hit_st_valid = -1; + } else { + cache_hit_st_valid = 1; + } + } + + return cache_hit_st_valid == 1; +} + +static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src; + + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + + if (!perf_ibs_cache_hit_st_valid() || + data_src->mem_op != PERF_MEM_OP_LOAD || + data_src->mem_lvl & PERF_MEM_LVL_L1 || + data_src->mem_lvl & PERF_MEM_LVL_L2 || + op_data2->cache_hit_st) + return; + + ibs_data_src = perf_ibs_data_src(op_data2); + + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } +} + +static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_dtlb = PERF_MEM_TLB_NA; + + if (!op_data3->dc_lin_addr_valid) + return; + + if (!op_data3->dc_l1tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; + return; + } + + if (!op_data3->dc_l2tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; + return; + } + + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; +} + +static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_lock = PERF_MEM_LOCK_NA; + + if (op_data3->dc_locked_op) + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; +} + +#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) + +static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, + struct perf_sample_data *data, + union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3) +{ + perf_ibs_get_mem_lvl(op_data2, op_data3, data); + perf_ibs_get_mem_snoop(op_data2, data); + perf_ibs_get_tlb_lvl(op_data3, data); + perf_ibs_get_mem_lock(op_data3, data); +} + +static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, + union ibs_op_data3 *op_data3) +{ + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; + + /* Erratum #1293 */ + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + /* + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. + * DataSrc=0 is 'No valid status' and RmtNode is invalid when + * DataSrc=0. + */ + val = 0; + } + return val; +} + +static void perf_ibs_parse_ld_st_data(__u64 sample_type, + struct perf_ibs_data *ibs_data, + struct perf_sample_data *data) +{ + union ibs_op_data3 op_data3; + union ibs_op_data2 op_data2; + + data->data_src.val = PERF_MEM_NA; + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + + perf_ibs_get_mem_op(&op_data3, data); + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && + data->data_src.mem_op != PERF_MEM_OP_STORE) + return; + + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, + int check_rip) +{ + if (sample_type & PERF_SAMPLE_RAW || + (perf_ibs == &perf_ibs_op && + sample_type & PERF_SAMPLE_DATA_SRC)) + return perf_ibs->offset_max; + else if (check_rip) + return 3; + return 1; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -725,12 +1031,9 @@ fail: size = 1; offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 3; - else - offset_max = 1; + + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + do { rdmsrl(msr + offset, *buf++); size++; @@ -784,6 +1087,9 @@ fail: data.sample_flags |= PERF_SAMPLE_RAW; } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from From 6b2ae4952ef8ac23b467bc10776404092b581143 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:54 +0530 Subject: [PATCH 136/145] perf/x86/amd: Support PERF_SAMPLE_{WEIGHT|WEIGHT_STRUCT} IbsDcMissLat indicates the number of clock cycles from when a miss is detected in the data cache to when the data was delivered to the core. Similarly, IbsTagToRetCtr provides number of cycles from when the op was tagged to when the op was retired. Consider these fields for sample->weight. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-5-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e20caa5cf02f..d883694e0fd4 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -955,6 +955,7 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, { union ibs_op_data3 op_data3; union ibs_op_data2 op_data2; + union ibs_op_data op_data; data->data_src.val = PERF_MEM_NA; op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; @@ -970,6 +971,19 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); data->sample_flags |= PERF_SAMPLE_DATA_SRC; } + + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && + data->data_src.mem_op == PERF_MEM_OP_LOAD) { + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { + data->weight.var1_dw = op_data3.dc_miss_lat; + data->weight.var2_w = op_data.tag_to_ret_ctr; + } else if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = op_data3.dc_miss_lat; + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -977,7 +991,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, { if (sample_type & PERF_SAMPLE_RAW || (perf_ibs == &perf_ibs_op && - sample_type & PERF_SAMPLE_DATA_SRC)) + (sample_type & PERF_SAMPLE_DATA_SRC || + sample_type & PERF_SAMPLE_WEIGHT_TYPE))) return perf_ibs->offset_max; else if (check_rip) return 3; From cb2bb85f7ed8740ab5fc06bbec386faa39ba44ef Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:55 +0530 Subject: [PATCH 137/145] perf/x86/amd: Support PERF_SAMPLE_ADDR IBS_DC_LINADDR provides the linear data address for the tagged load/ store operation. Populate perf sample address using it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-6-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index d883694e0fd4..0ad49105c154 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -984,6 +984,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } + + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -992,7 +997,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, if (sample_type & PERF_SAMPLE_RAW || (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE))) + sample_type & PERF_SAMPLE_WEIGHT_TYPE || + sample_type & PERF_SAMPLE_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; From 5b26af6d2b7854639ddf893366bbca7e74fa7c54 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:56 +0530 Subject: [PATCH 138/145] perf/x86/amd: Support PERF_SAMPLE_PHY_ADDR IBS_DC_PHYSADDR provides the physical data address for the tagged load/ store operation. Populate perf sample physical address using it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-7-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- kernel/events/core.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0ad49105c154..3271735f0070 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -989,6 +989,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; data->sample_flags |= PERF_SAMPLE_ADDR; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -998,7 +1003,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR))) + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; diff --git a/kernel/events/core.c b/kernel/events/core.c index e1ffdb861b53..49bc3b5e6c8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7435,7 +7435,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF From cfef80bad4cf79cdc964a53c98254dfa462be83f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:57 +0530 Subject: [PATCH 139/145] perf/uapi: Define PERF_MEM_SNOOPX_PEER in kernel header file PERF_MEM_SNOOPX_PEER is defined only in tools uapi header. Although it's used only by perf tool, not defining it in kernel header can create problems in future. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-8-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4ae3c249f675..85be78e0e7f6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1356,7 +1356,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_SHIFT 19 #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -/* 1 free */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ #define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ From 117ceeb1f4f87331e45a77e71f18303d15ec882e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Sep 2022 11:40:42 -0700 Subject: [PATCH 140/145] perf/x86/utils: Fix uninitialized var in get_branch_type() offset is passed as a pointer and on certain call path is not set by the function. If the caller does not re-initialize offset between calls, value could be inherited between calls. Prevent this by initializing offset on each call. This impacts the code in amd_pmu_lbr_filter() which does: for(i=0; ...) { ret = get_branch_type_fused(..., &offset); if (offset) lbr_entries[i].from += offset; } Fixes: df3e9612f758 ("perf/x86: Make branch classifier fusion-aware") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20220928184043.408364-2-eranian@google.com --- arch/x86/events/utils.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index 5f5617afde79..76b1f8bb0fd5 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -94,6 +94,10 @@ static int get_branch_type(unsigned long from, unsigned long to, int abort, u8 buf[MAX_INSN_SIZE]; int is64 = 0; + /* make sure we initialize offset */ + if (offset) + *offset = 0; + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; From 3f9a1b3591003b122a6ea2d69f89a0fd96ec58b9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Sep 2022 11:40:43 -0700 Subject: [PATCH 141/145] perf/x86/amd/lbr: Adjust LBR regardless of filtering In case of fused compare and taken branch instructions, the AMD LBR points to the compare instruction instead of the branch. Users of LBR usually expects the from address to point to a branch instruction. The kernel has code to adjust the from address via get_branch_type_fused(). However this correction is only applied when a branch filter is applied. That means that if no filter is present, the quality of the data is lower. Fix the problem by applying the adjustment regardless of the filter setting, bringing the AMD LBR to the same level as other LBR implementations. Fixes: 245268c19f70 ("perf/x86/amd/lbr: Use fusion-aware branch classifier") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20220928184043.408364-3-eranian@google.com --- arch/x86/events/amd/lbr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 2e1c1573efe7..38a75216c12c 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -99,12 +99,13 @@ static void amd_pmu_lbr_filter(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int br_sel = cpuc->br_sel, offset, type, i, j; bool compress = false; + bool fused_only = false; u64 from, to; /* If sampling all branches, there is nothing to filter */ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) - return; + fused_only = true; for (i = 0; i < cpuc->lbr_stack.nr; i++) { from = cpuc->lbr_entries[i].from; @@ -116,8 +117,11 @@ static void amd_pmu_lbr_filter(void) * fusion where it points to an instruction preceding the * actual branch */ - if (offset) + if (offset) { cpuc->lbr_entries[i].from += offset; + if (fused_only) + continue; + } /* If type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { From b731e3575f7a45a46512708f9fdf953b40c46a53 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 30 Sep 2022 18:07:30 +0800 Subject: [PATCH 142/145] mm/slub: fix a slab missed to be freed problem When enable kasan and kfence's in-kernel kunit test with slub_debug on, it caught a problem (in linux-next tree): ------------[ cut here ]------------ kmem_cache_destroy test: Slab cache still has objects when called from test_exit+0x1a/0x30 WARNING: CPU: 3 PID: 240 at mm/slab_common.c:492 kmem_cache_destroy+0x16c/0x170 Modules linked in: CPU: 3 PID: 240 Comm: kunit_try_catch Tainted: G B N 6.0.0-rc7-next-20220929 #52 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:kmem_cache_destroy+0x16c/0x170 Code: 41 5c 41 5d e9 a5 04 0b 00 c3 cc cc cc cc 48 8b 55 60 48 8b 4c 24 20 48 c7 c6 40 37 d2 82 48 c7 c7 e8 a0 33 83 e8 4e d7 14 01 <0f> 0b eb a7 41 56 41 89 d6 41 55 49 89 f5 41 54 49 89 fc 55 48 89 RSP: 0000:ffff88800775fea0 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff83bdec48 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 1ffff11000eebf9e RDI: ffffed1000eebfc6 RBP: ffff88804362fa00 R08: ffffffff81182e58 R09: ffff88800775fbdf R10: ffffed1000eebf7b R11: 0000000000000001 R12: 000000008c800d00 R13: ffff888005e78040 R14: 0000000000000000 R15: ffff888005cdfad0 FS: 0000000000000000(0000) GS:ffff88807ed00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000000360e001 CR4: 0000000000370ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: test_exit+0x1a/0x30 kunit_try_run_case+0xad/0xc0 kunit_generic_run_threadfn_adapter+0x26/0x50 kthread+0x17b/0x1b0 It was biscted to commit c7323a5ad078 ("mm/slub: restrict sysfs validation to debug caches and make it safe") The problem is inside free_debug_processing(), under certain circumstances the slab can be removed from the partial list but not freed by discard_slab() and thus n->nr_slabs is not decreased accordingly. During shutdown, this non-zero n->nr_slabs is detected and reported. Specifically, the problem is that there are two checks for detecting a full partial list by comparing n->nr_partial >= s->min_partial where the latter check is affected by remove_partial() decreasing n->nr_partial between the checks. Reoganize the code so there is a single check upfront. Link: https://lore.kernel.org/all/20220930100730.250248-1-feng.tang@intel.com/ Fixes: c7323a5ad078 ("mm/slub: restrict sysfs validation to debug caches and make it safe") Signed-off-by: Feng Tang Signed-off-by: Vlastimil Babka --- mm/slub.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d9650f2ca776..90010407d17a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2836,22 +2836,25 @@ out: set_freepointer(s, tail, prior); slab->freelist = head; - /* Do we need to remove the slab from full or partial list? */ + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + if (!prior) { + /* was on full list */ remove_full(s, n, slab); - } else if (slab->inuse == 0 && - n->nr_partial >= s->min_partial) { + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); } - - /* Do we need to discard the slab or add to partial list? */ - if (slab->inuse == 0 && n->nr_partial >= s->min_partial) { - slab_free = slab; - } else if (!prior) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } } if (slab_free) { From 0ce38047e82a02017839b6cae837f13a1383a3a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 10:46:58 +0200 Subject: [PATCH 143/145] perf: Fix lockdep_assert_event_ctx() I'm a flamin' moron; because even after Mark told me it should be '&&' I still got it wrong in the final commit. Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Borislav Petkov Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov Link: https://lkml.kernel.org/r/YvvIWmDBWdIUCMZj@FVFF77S0Q05N --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e9b151cde491..853f64b6c8c2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -644,7 +644,7 @@ struct pmu_event_list { #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ - (this_cpu_read(hardirqs_enabled) || \ + (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) From 7be51cc1c68dfa180ef84e71bcb4204237bb5620 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 11:03:47 +0200 Subject: [PATCH 144/145] perf: Fix pmu_filter_match() Mark reported that the new for_each_sibling_event() assertion triggers in pmu_filter_match() -- which isn't always called with IRQs disabled or ctx->mutex held. Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YvvJq2f/7eFVcnNy@FVFF77S0Q05N --- kernel/events/core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 49bc3b5e6c8a..b981b879bcd8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2226,16 +2226,22 @@ static inline int __pmu_filter_match(struct perf_event *event) static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; + unsigned long flags; + int ret = 1; if (!__pmu_filter_match(event)) return 0; + local_irq_save(flags); for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; + if (!__pmu_filter_match(sibling)) { + ret = 0; + break; + } } + local_irq_restore(flags); - return 1; + return ret; } static inline int From 82aad7ff7ac25c8cf09d491ae23b9823f1901486 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 12:20:39 +0200 Subject: [PATCH 145/145] perf/hw_breakpoint: Annotate tsk->perf_event_mutex vs ctx->mutex Perf fuzzer gifted a lockdep splat: perf_event_init_context() mutex_lock(parent_ctx->mutex); (B) inherit_task_group() inherit_group() inherit_event() perf_event_alloc() perf_try_init_event() := hw_breakpoint_event_init() register_perf_hw_breakpoint() mutex_lock(child->perf_event_mutex); (A) Which is against the normal (documented) order. Now, this is a false positive in that child is not published yet, but also inherited events never end up on ->perf_event_list. Annotate this one away. Fixes: 0912037fec11 ("perf/hw_breakpoint: Reduce contention with large number of tasks") Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/hw_breakpoint.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 7ef0e98d31e2..c3797701339c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -117,7 +117,17 @@ static struct mutex *bp_constraints_lock(struct perf_event *bp) struct mutex *tsk_mtx = get_task_bps_mutex(bp); if (tsk_mtx) { - mutex_lock(tsk_mtx); + /* + * Fully analogous to the perf_try_init_event() nesting + * argument in the comment near perf_event_ctx_lock_nested(); + * this child->perf_event_mutex cannot ever deadlock against + * the parent->perf_event_mutex usage from + * perf_event_task_{en,dis}able(). + * + * Specifically, inherited events will never occur on + * ->perf_event_list. + */ + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); percpu_down_read(&bp_cpuinfo_sem); } else { percpu_down_write(&bp_cpuinfo_sem);