diff --git a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt index f1c848af91d3..8bbb6e94508b 100644 --- a/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt +++ b/Documentation/devicetree/bindings/timer/mediatek,mtk-timer.txt @@ -25,6 +25,7 @@ Required properties: For those SoCs that use SYST * "mediatek,mt8183-timer" for MT8183 compatible timers (SYST) * "mediatek,mt8186-timer" for MT8186 compatible timers (SYST) + * "mediatek,mt8188-timer" for MT8188 compatible timers (SYST) * "mediatek,mt8192-timer" for MT8192 compatible timers (SYST) * "mediatek,mt8195-timer" for MT8195 compatible timers (SYST) * "mediatek,mt7629-timer" for MT7629 compatible timers (SYST) diff --git a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml index 830211c55b4a..2b9653dafab8 100644 --- a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml +++ b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.yaml @@ -32,6 +32,10 @@ properties: clock-names: const: per + nxp,no-divider: + description: if present, means there is no internal base clk divider. + type: boolean + required: - compatible - reg diff --git a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml index c57169118b68..60f4c059bcff 100644 --- a/Documentation/devicetree/bindings/timer/renesas,tmu.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,tmu.yaml @@ -37,6 +37,7 @@ properties: - renesas,tmu-r8a77990 # R-Car E3 - renesas,tmu-r8a77995 # R-Car D3 - renesas,tmu-r8a779a0 # R-Car V3U + - renesas,tmu-r8a779f0 # R-Car S4-8 - const: renesas,tmu reg: diff --git a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml index 9c81d00b12e0..829bd2227f7c 100644 --- a/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml +++ b/Documentation/devicetree/bindings/timer/samsung,exynos4210-mct.yaml @@ -25,6 +25,7 @@ properties: - samsung,exynos4412-mct - items: - enum: + - axis,artpec8-mct - samsung,exynos3250-mct - samsung,exynos5250-mct - samsung,exynos5260-mct @@ -45,6 +46,19 @@ properties: reg: maxItems: 1 + samsung,frc-shared: + type: boolean + description: | + Indicates that the hardware requires that this processor share the + free-running counter with a different (main) processor. + + samsung,local-timers: + $ref: /schemas/types.yaml#/definitions/uint32-array + minItems: 1 + maxItems: 16 + description: | + List of indices of local timers usable from this processor. + interrupts: description: | Interrupts should be put in specific order. This is, the local timer @@ -74,6 +88,17 @@ required: - reg allOf: + - if: + not: + properties: + compatible: + contains: + enum: + - axis,artpec8-mct + then: + properties: + samsung,local-timers: false + samsung,frc-shared: false - if: properties: compatible: @@ -101,6 +126,7 @@ allOf: compatible: contains: enum: + - axis,artpec8-mct - samsung,exynos5260-mct - samsung,exynos5420-mct - samsung,exynos5433-mct diff --git a/Documentation/mm/slub.rst b/Documentation/mm/slub.rst index 43063ade737a..4e1578186b4f 100644 --- a/Documentation/mm/slub.rst +++ b/Documentation/mm/slub.rst @@ -400,21 +400,30 @@ information: allocated objects. The output is sorted by frequency of each trace. Information in the output: - Number of objects, allocating function, minimal/average/maximal jiffies since alloc, - pid range of the allocating processes, cpu mask of allocating cpus, and stack trace. + Number of objects, allocating function, possible memory wastage of + kmalloc objects(total/per-object), minimal/average/maximal jiffies + since alloc, pid range of the allocating processes, cpu mask of + allocating cpus, numa node mask of origins of memory, and stack trace. Example::: - 1085 populate_error_injection_list+0x97/0x110 age=166678/166680/166682 pid=1 cpus=1:: - __slab_alloc+0x6d/0x90 - kmem_cache_alloc_trace+0x2eb/0x300 - populate_error_injection_list+0x97/0x110 - init_error_injection+0x1b/0x71 - do_one_initcall+0x5f/0x2d0 - kernel_init_freeable+0x26f/0x2d7 - kernel_init+0xe/0x118 - ret_from_fork+0x22/0x30 - + 338 pci_alloc_dev+0x2c/0xa0 waste=521872/1544 age=290837/291891/293509 pid=1 cpus=106 nodes=0-1 + __kmem_cache_alloc_node+0x11f/0x4e0 + kmalloc_trace+0x26/0xa0 + pci_alloc_dev+0x2c/0xa0 + pci_scan_single_device+0xd2/0x150 + pci_scan_slot+0xf7/0x2d0 + pci_scan_child_bus_extend+0x4e/0x360 + acpi_pci_root_create+0x32e/0x3b0 + pci_acpi_scan_root+0x2b9/0x2d0 + acpi_pci_root_add.cold.11+0x110/0xb0a + acpi_bus_attach+0x262/0x3f0 + device_for_each_child+0xb7/0x110 + acpi_dev_for_each_child+0x77/0xa0 + acpi_bus_attach+0x108/0x3f0 + device_for_each_child+0xb7/0x110 + acpi_dev_for_each_child+0x77/0xa0 + acpi_bus_attach+0x108/0x3f0 2. free_traces:: diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 2669f80b3a49..8db1a15d7acb 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -129,7 +130,14 @@ struct breakpoint { bool ptrace_bp; }; +/* + * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot + * rely on it safely synchronizing internals here; however, we can rely on it + * not requesting more breakpoints than available. + */ +static DEFINE_SPINLOCK(cpu_bps_lock); static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]); +static DEFINE_SPINLOCK(task_bps_lock); static LIST_HEAD(task_bps); static struct breakpoint *alloc_breakpoint(struct perf_event *bp) @@ -174,7 +182,9 @@ static int task_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&task_bps_lock); list_add(&tmp->list, &task_bps); + spin_unlock(&task_bps_lock); return 0; } @@ -182,6 +192,7 @@ static void task_bps_remove(struct perf_event *bp) { struct list_head *pos, *q; + spin_lock(&task_bps_lock); list_for_each_safe(pos, q, &task_bps) { struct breakpoint *tmp = list_entry(pos, struct breakpoint, list); @@ -191,6 +202,7 @@ static void task_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&task_bps_lock); } /* @@ -200,12 +212,17 @@ static void task_bps_remove(struct perf_event *bp) static bool all_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { - if (!can_co_exist(tmp, bp)) - return true; + if (!can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } /* @@ -215,13 +232,18 @@ static bool all_task_bps_check(struct perf_event *bp) static bool same_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { if (tmp->bp->hw.target == bp->hw.target && - !can_co_exist(tmp, bp)) - return true; + !can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } static int cpu_bps_add(struct perf_event *bp) @@ -234,6 +256,7 @@ static int cpu_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) { @@ -241,6 +264,7 @@ static int cpu_bps_add(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); return 0; } @@ -249,6 +273,7 @@ static void cpu_bps_remove(struct perf_event *bp) struct breakpoint **cpu_bp; int i = 0; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) @@ -260,19 +285,25 @@ static void cpu_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); } static bool cpu_bps_check(int cpu, struct perf_event *bp) { struct breakpoint **cpu_bp; + bool ret = false; int i; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, cpu); for (i = 0; i < nr_wp_slots(); i++) { - if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) - return true; + if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&cpu_bps_lock); + return ret; } static bool all_cpu_bps_check(struct perf_event *bp) @@ -286,10 +317,6 @@ static bool all_cpu_bps_check(struct perf_event *bp) return false; } -/* - * We don't use any locks to serialize accesses to cpu_bps or task_bps - * because are already inside nr_bp_mutex. - */ int arch_reserve_bp_slot(struct perf_event *bp) { int ret; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 03e31ae97741..942aa830e110 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2314,16 +2314,20 @@ static void record_and_restart(struct perf_event *event, unsigned long val, cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && - ppmu->get_mem_data_src) + ppmu->get_mem_data_src) { ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + data.sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && - ppmu->get_mem_weight) + ppmu->get_mem_weight) { ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); - + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } else if (period) { diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..6826e2a69a21 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -366,6 +366,7 @@ static int paicrypt_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 199d17b765f2..361a0f57bdeb 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -48,10 +48,7 @@ struct pmu; /* Maximum number of UBC channels */ #define HBP_NUM 2 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) /* arch/sh/kernel/hw_breakpoint.c */ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9933c0e8e97a..86a76efa8bb6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o probe.o +obj-y += core.o probe.o utils.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index b9f5d4610256..527d947eb76b 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index bee8765a1e9b..f1bff153d945 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -81,7 +81,7 @@ static bool __init amd_brs_detect(void) * a br_sel_map. Software filtering is not supported because it would not correlate well * with a sampling period. */ -int amd_brs_setup_filter(struct perf_event *event) +static int amd_brs_setup_filter(struct perf_event *event) { u64 type = event->attr.branch_sample_type; @@ -96,6 +96,73 @@ int amd_brs_setup_filter(struct perf_event *event) return 0; } +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + +int amd_brs_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + + return ret; +} + /* tos = top of stack, i.e., last valid entry written */ static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) { diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9ac3718410ce..8b70237c33f7 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,16 +330,10 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } -#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ -static inline int amd_is_brs_event(struct perf_event *e) -{ - return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; -} +DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); static int amd_core_hw_config(struct perf_event *event) { - int ret = 0; - if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -356,66 +350,10 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - /* - * if branch stack is requested - */ - if (has_branch_stack(event)) { - /* - * Due to interrupt holding, BRS is not recommended in - * counting mode. - */ - if (!is_sampling_event(event)) - return -EINVAL; + if (has_branch_stack(event)) + return static_call(amd_pmu_branch_hw_config)(event); - /* - * Due to the way BRS operates by holding the interrupt until - * lbr_nr entries have been captured, it does not make sense - * to allow sampling on BRS with an event that does not match - * what BRS is capturing, i.e., retired taken branches. - * Otherwise the correlation with the event's period is even - * more loose: - * - * With retired taken branch: - * Effective P = P + 16 + X - * With any other event: - * Effective P = P + Y + X - * - * Where X is the number of taken branches due to interrupt - * skid. Skid is large. - * - * Where Y is the occurences of the event while BRS is - * capturing the lbr_nr entries. - * - * By using retired taken branches, we limit the impact on the - * Y variable. We know it cannot be more than the depth of - * BRS. - */ - if (!amd_is_brs_event(event)) - return -EINVAL; - - /* - * BRS implementation does not work with frequency mode - * reprogramming of the period. - */ - if (event->attr.freq) - return -EINVAL; - /* - * The kernel subtracts BRS depth from period, so it must - * be big enough. - */ - if (event->attr.sample_period <= x86_pmu.lbr_nr) - return -EINVAL; - - /* - * Check if we can allow PERF_SAMPLE_BRANCH_STACK - */ - ret = amd_brs_setup_filter(event); - - /* only set in case of success */ - if (!ret) - event->hw.flags |= PERF_X86_EVENT_AMD_BRS; - } - return ret; + return 0; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -582,8 +520,14 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +typedef void (amd_pmu_branch_reset_t)(void); +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); + static void amd_pmu_cpu_reset(int cpu) { + if (x86_pmu.lbr_nr) + static_call(amd_pmu_branch_reset)(); + if (x86_pmu.version < 2) return; @@ -598,16 +542,24 @@ static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, + cpu_to_node(cpu)); + if (!cpuc->lbr_sel) + return -ENOMEM; + WARN_ON_ONCE(cpuc->amd_nb); if (!x86_pmu.amd_nb_constraints) return 0; cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return -ENOMEM; + if (cpuc->amd_nb) + return 0; - return 0; + kfree(cpuc->lbr_sel); + cpuc->lbr_sel = NULL; + + return -ENOMEM; } static void amd_pmu_cpu_starting(int cpu) @@ -640,19 +592,19 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - amd_brs_reset(); amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) { - struct cpu_hw_events *cpuhw; + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + kfree(cpuhw->lbr_sel); + cpuhw->lbr_sel = NULL; if (!x86_pmu.amd_nb_constraints) return; - cpuhw = &per_cpu(cpu_hw_events, cpu); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -677,7 +629,7 @@ static inline u64 amd_pmu_get_global_status(void) /* PerfCntrGlobalStatus is read-only */ rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); - return status & amd_pmu_global_cntr_mask; + return status; } static inline void amd_pmu_ack_global_status(u64 status) @@ -688,8 +640,6 @@ static inline void amd_pmu_ack_global_status(u64 status) * clears the same bit in PerfCntrGlobalStatus */ - /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ - status &= amd_pmu_global_cntr_mask; wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); } @@ -799,11 +749,17 @@ static void amd_pmu_v2_enable_event(struct perf_event *event) __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } -static void amd_pmu_v2_enable_all(int added) +static __always_inline void amd_pmu_core_enable_all(void) { amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); } +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_lbr_enable_all(); + amd_pmu_core_enable_all(); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -828,23 +784,32 @@ static void amd_pmu_disable_all(void) amd_pmu_check_overflow(); } +static __always_inline void amd_pmu_core_disable_all(void) +{ + amd_pmu_set_global_ctl(0); +} + static void amd_pmu_v2_disable_all(void) { - /* Disable all PMCs */ - amd_pmu_set_global_ctl(0); + amd_pmu_core_disable_all(); + amd_pmu_lbr_disable_all(); amd_pmu_check_overflow(); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) - amd_pmu_brs_add(event); + static_call(amd_pmu_branch_add)(event); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); + static void amd_pmu_del_event(struct perf_event *event) { if (needs_branch_stack(event)) - amd_pmu_brs_del(event); + static_call(amd_pmu_branch_del)(event); } /* @@ -930,8 +895,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) pmu_enabled = cpuc->enabled; cpuc->enabled = 0; - /* Stop counting */ - amd_pmu_v2_disable_all(); + /* Stop counting but do not disable LBR */ + amd_pmu_core_disable_all(); status = amd_pmu_get_global_status(); @@ -939,6 +904,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!status) goto done; + /* Read branch records before unfreezing */ + if (status & GLOBAL_STATUS_LBRS_FROZEN) { + amd_pmu_lbr_read(); + status &= ~GLOBAL_STATUS_LBRS_FROZEN; + } + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -958,6 +929,11 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; + if (has_branch_stack(event)) { + data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -971,7 +947,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) */ WARN_ON(status > 0); - /* Clear overflow bits */ + /* Clear overflow and freeze bits */ amd_pmu_ack_global_status(~status); /* @@ -985,7 +961,7 @@ done: /* Resume counting only if PMU is active */ if (pmu_enabled) - amd_pmu_v2_enable_all(0); + amd_pmu_core_enable_all(); return amd_pmu_adjust_nmi_window(handled); } @@ -1248,23 +1224,14 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static void amd_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (sched_in && x86_pmu.lbr_nr) - amd_pmu_brs_sched_task(ctx, sched_in); -} - -static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +static void amd_pmu_limit_period(struct perf_event *event, s64 *left) { /* * Decrease period by the depth of the BRS feature to get the last N * taken branches and approximate the desired period */ - if (has_branch_stack(event) && left > x86_pmu.lbr_nr) - left -= x86_pmu.lbr_nr; - - return left; + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) + *left -= x86_pmu.lbr_nr; } static __initconst const struct x86_pmu amd_pmu = { @@ -1311,23 +1278,25 @@ static ssize_t branches_show(struct device *cdev, static DEVICE_ATTR_RO(branches); -static struct attribute *amd_pmu_brs_attrs[] = { +static struct attribute *amd_pmu_branches_attrs[] = { &dev_attr_branches.attr, NULL, }; static umode_t -amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.lbr_nr ? attr->mode : 0; } -static struct attribute_group group_caps_amd_brs = { +static struct attribute_group group_caps_amd_branches = { .name = "caps", - .attrs = amd_pmu_brs_attrs, - .is_visible = amd_brs_is_visible, + .attrs = amd_pmu_branches_attrs, + .is_visible = amd_branches_is_visible, }; +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + EVENT_ATTR_STR(branch-brs, amd_branch_brs, "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); @@ -1336,15 +1305,26 @@ static struct attribute *amd_brs_events_attrs[] = { NULL, }; +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? + attr->mode : 0; +} + static struct attribute_group group_events_amd_brs = { .name = "events", .attrs = amd_brs_events_attrs, .is_visible = amd_brs_is_visible, }; +#endif /* CONFIG_PERF_EVENTS_AMD_BRS */ + static const struct attribute_group *amd_attr_update[] = { - &group_caps_amd_brs, + &group_caps_amd_branches, +#ifdef CONFIG_PERF_EVENTS_AMD_BRS &group_events_amd_brs, +#endif NULL, }; @@ -1421,13 +1401,27 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } - /* - * BRS requires special event constraints and flushing on ctxsw. - */ - if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + /* LBR and BRS are mutually exclusive features */ + if (!amd_pmu_lbr_init()) { + /* LBR requires flushing on context switch */ + x86_pmu.sched_task = amd_pmu_lbr_sched_task; + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + } else if (!amd_brs_init()) { + /* + * BRS requires special event constraints and flushing on ctxsw. + */ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; - x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; + + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); + static_call_update(amd_pmu_branch_reset, amd_brs_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); + /* * put_event_constraints callback same as Fam17h, set above */ diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c251bc44c088..3271735f0070 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; - /* - * rip recorded by IbsOpRip will not be consistent with rsp and rbp - * recorded as part of interrupt regs. Thus we need to use rip from - * interrupt regs while unwinding call stack. Setting _EARLY flag - * makes sure we unwind call-stack before perf sample rip is set to - * IbsOpRip. - */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; - return 0; } @@ -688,6 +678,339 @@ static struct perf_ibs perf_ibs_op = { .get_count = get_ibs_op_count, }; +static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_op = PERF_MEM_OP_NA; + + if (op_data3->ld_op) + data_src->mem_op = PERF_MEM_OP_LOAD; + else if (op_data3->st_op) + data_src->mem_op = PERF_MEM_OP_STORE; +} + +/* + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has + * more fine granular DataSrc encodings. Others have coarse. + */ +static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) +{ + if (ibs_caps & IBS_CAPS_ZEN4) + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; + + return op_data2->data_src_lo; +} + +static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src = perf_ibs_data_src(op_data2); + + data_src->mem_lvl = 0; + + /* + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached + * memory accesses. So, check DcUcMemAcc bit early. + */ + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; + return; + } + + /* L1 Hit */ + if (op_data3->dc_miss == 0) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + return; + } + + /* L2 Hit */ + if (op_data3->l2_miss == 0) { + /* Erratum #1293 */ + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* + * OP_DATA2 is valid only for load ops. Skip all checks which + * uses OP_DATA2[DataSrc]. + */ + if (data_src->mem_op != PERF_MEM_OP_LOAD) + goto check_mab; + + /* L3 Hit */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | + PERF_MEM_LVL_HIT; + return; + } + } + + /* A peer cache in a near CCX */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + return; + } + + /* A peer cache in a far CCX */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* DRAM */ + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { + if (op_data2->rmt_node == 0) + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + else + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + return; + } + + /* PMEM */ + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* Extension Memory */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* IO */ + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_IO; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + +check_mab: + /* + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding + * DC misses. However, such data may come from any level in mem + * hierarchy. IBS provides detail about both MAB as well as actual + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set + * MAB only when IBS fails to provide DataSrc. + */ + if (op_data3->dc_miss_no_mab_alloc) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + return; + } + + data_src->mem_lvl = PERF_MEM_LVL_NA; +} + +static bool perf_ibs_cache_hit_st_valid(void) +{ + /* 0: Uninitialized, 1: Valid, -1: Invalid */ + static int cache_hit_st_valid; + + if (unlikely(!cache_hit_st_valid)) { + if (boot_cpu_data.x86 == 0x19 && + (boot_cpu_data.x86_model <= 0xF || + (boot_cpu_data.x86_model >= 0x20 && + boot_cpu_data.x86_model <= 0x5F))) { + cache_hit_st_valid = -1; + } else { + cache_hit_st_valid = 1; + } + } + + return cache_hit_st_valid == 1; +} + +static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src; + + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + + if (!perf_ibs_cache_hit_st_valid() || + data_src->mem_op != PERF_MEM_OP_LOAD || + data_src->mem_lvl & PERF_MEM_LVL_L1 || + data_src->mem_lvl & PERF_MEM_LVL_L2 || + op_data2->cache_hit_st) + return; + + ibs_data_src = perf_ibs_data_src(op_data2); + + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } +} + +static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_dtlb = PERF_MEM_TLB_NA; + + if (!op_data3->dc_lin_addr_valid) + return; + + if (!op_data3->dc_l1tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; + return; + } + + if (!op_data3->dc_l2tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; + return; + } + + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; +} + +static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_lock = PERF_MEM_LOCK_NA; + + if (op_data3->dc_locked_op) + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; +} + +#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) + +static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, + struct perf_sample_data *data, + union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3) +{ + perf_ibs_get_mem_lvl(op_data2, op_data3, data); + perf_ibs_get_mem_snoop(op_data2, data); + perf_ibs_get_tlb_lvl(op_data3, data); + perf_ibs_get_mem_lock(op_data3, data); +} + +static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, + union ibs_op_data3 *op_data3) +{ + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; + + /* Erratum #1293 */ + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + /* + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. + * DataSrc=0 is 'No valid status' and RmtNode is invalid when + * DataSrc=0. + */ + val = 0; + } + return val; +} + +static void perf_ibs_parse_ld_st_data(__u64 sample_type, + struct perf_ibs_data *ibs_data, + struct perf_sample_data *data) +{ + union ibs_op_data3 op_data3; + union ibs_op_data2 op_data2; + union ibs_op_data op_data; + + data->data_src.val = PERF_MEM_NA; + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + + perf_ibs_get_mem_op(&op_data3, data); + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && + data->data_src.mem_op != PERF_MEM_OP_STORE) + return; + + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && + data->data_src.mem_op == PERF_MEM_OP_LOAD) { + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { + data->weight.var1_dw = op_data3.dc_miss_lat; + data->weight.var2_w = op_data.tag_to_ret_ctr; + } else if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = op_data3.dc_miss_lat; + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, + int check_rip) +{ + if (sample_type & PERF_SAMPLE_RAW || + (perf_ibs == &perf_ibs_op && + (sample_type & PERF_SAMPLE_DATA_SRC || + sample_type & PERF_SAMPLE_WEIGHT_TYPE || + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) + return perf_ibs->offset_max; + else if (check_rip) + return 3; + return 1; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -735,12 +1058,9 @@ fail: size = 1; offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 3; - else - offset_max = 1; + + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + do { rdmsrl(msr + offset, *buf++); size++; @@ -791,15 +1111,21 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; + } throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c new file mode 100644 index 000000000000..38a75216c12c --- /dev/null +++ b/arch/x86/events/amd/lbr.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include "../perf_event.h" + +/* LBR Branch Select valid bits */ +#define LBR_SELECT_MASK 0x1ff + +/* + * LBR Branch Select filter bits which when set, ensures that the + * corresponding type of branches are not recorded + */ +#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ +#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ +#define LBR_SELECT_JCC 2 /* Conditional branches */ +#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ +#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ +#define LBR_SELECT_RET_NEAR 5 /* Near returns */ +#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ +#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */ +#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ + +#define LBR_KERNEL BIT(LBR_SELECT_KERNEL) +#define LBR_USER BIT(LBR_SELECT_USER) +#define LBR_JCC BIT(LBR_SELECT_JCC) +#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) +#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) +#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) +#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) +#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) +#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) +#define LBR_NOT_SUPP -1 /* unsupported filter */ +#define LBR_IGNORE 0 + +#define LBR_ANY \ + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) + +struct branch_entry { + union { + struct { + u64 ip:58; + u64 ip_sign_ext:5; + u64 mispredict:1; + } split; + u64 full; + } from; + + union { + struct { + u64 ip:58; + u64 ip_sign_ext:3; + u64 reserved:1; + u64 spec:1; + u64 valid:1; + } split; + u64 full; + } to; +}; + +static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); +} + +static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); +} + +static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + + return val; +} + +static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + + return val; +} + +static __always_inline u64 sign_ext_branch_ip(u64 ip) +{ + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + return (u64)(((s64)ip << shift) >> shift); +} + +static void amd_pmu_lbr_filter(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int br_sel = cpuc->br_sel, offset, type, i, j; + bool compress = false; + bool fused_only = false; + u64 from, to; + + /* If sampling all branches, there is nothing to filter */ + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) + fused_only = true; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + type = branch_type_fused(from, to, 0, &offset); + + /* + * Adjust the branch from address in case of instruction + * fusion where it points to an instruction preceding the + * actual branch + */ + if (offset) { + cpuc->lbr_entries[i].from += offset; + if (fused_only) + continue; + } + + /* If type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; /* mark invalid */ + compress = true; + } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); + } + + if (!compress) + return; + + /* Remove all invalid entries */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { + PERF_BR_SPEC_NA, + PERF_BR_SPEC_WRONG_PATH, + PERF_BR_NON_SPEC_CORRECT_PATH, + PERF_BR_SPEC_CORRECT_PATH, +}; + +void amd_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_branch_entry *br = cpuc->lbr_entries; + struct branch_entry entry; + int out = 0, idx, i; + + if (!cpuc->lbr_users) + return; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + entry.from.full = amd_pmu_lbr_get_from(i); + entry.to.full = amd_pmu_lbr_get_to(i); + + /* + * Check if a branch has been logged; if valid = 0, spec = 0 + * then no branch was recorded + */ + if (!entry.to.split.valid && !entry.to.split.spec) + continue; + + perf_clear_branch_entry_bitfields(br + out); + + br[out].from = sign_ext_branch_ip(entry.from.split.ip); + br[out].to = sign_ext_branch_ip(entry.to.split.ip); + br[out].mispred = entry.from.split.mispredict; + br[out].predicted = !br[out].mispred; + + /* + * Set branch speculation information using the status of + * the valid and spec bits. + * + * When valid = 0, spec = 0, no branch was recorded and the + * entry is discarded as seen above. + * + * When valid = 0, spec = 1, the recorded branch was + * speculative but took the wrong path. + * + * When valid = 1, spec = 0, the recorded branch was + * non-speculative but took the correct path. + * + * When valid = 1, spec = 1, the recorded branch was + * speculative and took the correct path + */ + idx = (entry.to.split.valid << 1) | entry.to.split.spec; + br[out].spec = lbr_spec_map[idx]; + out++; + } + + cpuc->lbr_stack.nr = out; + + /* + * Internal register renaming always ensures that LBR From[0] and + * LBR To[0] always represent the TOS + */ + cpuc->lbr_stack.hw_idx = 0; + + /* Perform further software filtering */ + amd_pmu_lbr_filter(); +} + +static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, + + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, + + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, +}; + +static int amd_pmu_lbr_setup_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + + /* No LBR support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* Ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + + reg->reg = mask; + mask = 0; + + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & BIT_ULL(i))) + continue; + + v = lbr_select_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGNORE) + mask |= v; + } + + /* Filter bits operate in suppress mode */ + reg->config = mask ^ LBR_SELECT_MASK; + + return 0; +} + +int amd_pmu_lbr_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* LBR is not recommended in counting mode */ + if (!is_sampling_event(event)) + return -EINVAL; + + ret = amd_pmu_lbr_setup_filter(event); + if (!ret) + event->attach_state |= PERF_ATTACH_SCHED_CB; + + return ret; +} + +void amd_pmu_lbr_reset(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + if (!x86_pmu.lbr_nr) + return; + + /* Reset all branch records individually */ + for (i = 0; i < x86_pmu.lbr_nr; i++) { + amd_pmu_lbr_set_from(i, 0); + amd_pmu_lbr_set_to(i, 0); + } + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; + wrmsrl(MSR_AMD64_LBR_SELECT, 0); +} + +void amd_pmu_lbr_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + + if (!x86_pmu.lbr_nr) + return; + + if (has_branch_stack(event)) { + cpuc->lbr_select = 1; + cpuc->lbr_sel->config = reg->config; + cpuc->br_sel = reg->reg; + } + + perf_sched_cb_inc(event->ctx->pmu); + + if (!cpuc->lbr_users++ && !event->total_time_running) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + if (has_branch_stack(event)) + cpuc->lbr_select = 0; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * A context switch can flip the address space and LBR entries are + * not tagged with an identifier. Hence, branches cannot be resolved + * from the old address space and the LBR records should be wiped. + */ + if (cpuc->lbr_users && sched_in) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 lbr_select, dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + /* Set hardware branch filter */ + if (cpuc->lbr_select) { + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + } + + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); +} + +void amd_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); +} + +__init int amd_pmu_lbr_init(void) +{ + union cpuid_0x80000022_ebx ebx; + + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) + return -EOPNOTSUPP; + + /* Set number of entries */ + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz; + + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + + return 0; +} diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f969410d0c90..b30b8bbcd1e2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,6 +72,10 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); + DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); @@ -116,9 +120,6 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); - /* * Careful: an NMI might modify the previous event value. * @@ -621,8 +622,9 @@ int x86_pmu_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) + s64 left = event->attr.sample_period; + x86_pmu.limit_period(event, &left); + if (left > event->attr.sample_period) return -EINVAL; } @@ -1354,7 +1356,7 @@ static void x86_pmu_enable(struct pmu *pmu) static_call(x86_pmu_enable_all)(added); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); +DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -1370,10 +1372,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); - /* * If we are way outside a reasonable range then just skip forward: */ @@ -1399,10 +1397,9 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; - if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); + static_call_cond(x86_pmu_limit_period)(event, &left); - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, @@ -1419,16 +1416,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - perf_event_update_userpage(event); return ret; @@ -1518,7 +1505,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); } event->hw.state = 0; @@ -1610,7 +1597,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -1700,7 +1687,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; - val = x86_perf_event_update(event); + val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; @@ -1709,13 +1696,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) */ handled++; - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -2023,6 +2012,10 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); + static_call_update(x86_pmu_set_period, x86_pmu.set_period); + static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); + static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); @@ -2042,7 +2035,7 @@ static void x86_pmu_static_call_update(void) static void _x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, @@ -2149,6 +2142,12 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; + if (!x86_pmu.set_period) + x86_pmu.set_period = x86_perf_event_set_period; + + if (!x86_pmu.update) + x86_pmu.update = x86_perf_event_update; + x86_pmu_static_call_update(); /* @@ -2670,7 +2669,9 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return -EINVAL; if (value && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, value) > value) + s64 left = value; + x86_pmu.limit_period(event, &left); + if (left > value) return -EINVAL; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c20d8cd47c48..a646a5f9a235 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2199,6 +2199,12 @@ static void __intel_pmu_enable_all(int added, bool pmi) u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); intel_pmu_lbr_enable_all(pmi); + + if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); + cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; + } + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -2311,7 +2317,7 @@ static void intel_pmu_nhm_workaround(void) for (i = 0; i < 4; i++) { event = cpuc->events[i]; if (event) - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } for (i = 0; i < 4; i++) { @@ -2326,7 +2332,7 @@ static void intel_pmu_nhm_workaround(void) event = cpuc->events[i]; if (event) { - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } else @@ -2416,9 +2422,10 @@ static inline void intel_clear_masks(struct perf_event *event, int idx) static void intel_pmu_disable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask; int idx = hwc->idx; + u64 mask; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2435,9 +2442,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event) intel_clear_masks(event, idx); mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; } static void intel_pmu_disable_event(struct perf_event *event) @@ -2530,6 +2535,8 @@ static int adl_set_topdown_event_period(struct perf_event *event) return icl_set_topdown_event_period(event); } +DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); + static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) { u32 val; @@ -2680,6 +2687,7 @@ static u64 adl_update_topdown_event(struct perf_event *event) return icl_update_topdown_event(event); } +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); static void intel_pmu_read_topdown_event(struct perf_event *event) { @@ -2691,7 +2699,7 @@ static void intel_pmu_read_topdown_event(struct perf_event *event) return; perf_pmu_disable(event->pmu); - x86_pmu.update_topdown_event(event); + static_call(intel_pmu_update_topdown_event)(event); perf_pmu_enable(event->pmu); } @@ -2699,7 +2707,7 @@ static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event) && x86_pmu.update_topdown_event) + else if (is_topdown_count(event)) intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); @@ -2707,8 +2715,9 @@ static void intel_pmu_read_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask, bits = 0; + u64 mask, bits = 0; int idx = hwc->idx; if (is_topdown_idx(idx)) { @@ -2752,10 +2761,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); } - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val |= bits; } static void intel_pmu_enable_event(struct perf_event *event) @@ -2803,7 +2810,7 @@ static void intel_pmu_add_event(struct perf_event *event) */ int intel_pmu_save_and_restart(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); /* * For a checkpointed counter always reset back to 0. This * avoids a situation where the counter overflows, aborts the @@ -2815,9 +2822,25 @@ int intel_pmu_save_and_restart(struct perf_event *event) wrmsrl(event->hw.event_base, 0); local64_set(&event->hw.prev_count, 0); } + return static_call(x86_pmu_set_period)(event); +} + +static int intel_pmu_set_period(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_set_topdown_event_period)(event); + return x86_perf_event_set_period(event); } +static u64 intel_pmu_update(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_update_topdown_event)(event); + + return x86_perf_event_update(event); +} + static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); @@ -2980,8 +3003,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - if (x86_pmu.update_topdown_event) - x86_pmu.update_topdown_event(NULL); + static_call(intel_pmu_update_topdown_event)(NULL); } /* @@ -3004,8 +3026,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -3853,9 +3877,6 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { @@ -4334,28 +4355,25 @@ static u8 adl_get_hybrid_cpu_type(void) * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. */ -static u64 bdw_limit_period(struct perf_event *event, u64 left) +static void bdw_limit_period(struct perf_event *event, s64 *left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fULL; + if (*left < 128) + *left = 128; + *left &= ~0x3fULL; } - return left; } -static u64 nhm_limit_period(struct perf_event *event, u64 left) +static void nhm_limit_period(struct perf_event *event, s64 *left) { - return max(left, 32ULL); + *left = max(*left, 32LL); } -static u64 spr_limit_period(struct perf_event *event, u64 left) +static void spr_limit_period(struct perf_event *event, s64 *left) { if (event->attr.precise_ip == 3) - return max(left, 128ULL); - - return left; + *left = max(*left, 128LL); } PMU_FORMAT_ATTR(event, "config:0-7" ); @@ -4794,6 +4812,8 @@ static __initconst const struct x86_pmu intel_pmu = { .add = intel_pmu_add_event, .del = intel_pmu_del_event, .read = intel_pmu_read_event, + .set_period = intel_pmu_set_period, + .update = intel_pmu_update, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -6312,8 +6332,10 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; - x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Icelake events, "); name = "icelake"; break; @@ -6348,8 +6370,10 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); name = "sapphire_rapids"; break; @@ -6358,6 +6382,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE_L: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: /* * Alder Lake has 2 types of CPU, core and atom. * @@ -6382,8 +6407,10 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_adl(); x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = adl_update_topdown_event; - x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; + static_call_update(intel_pmu_update_topdown_event, + &adl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &adl_set_topdown_event_period); x86_pmu.filter_match = intel_pmu_filter_match; x86_pmu.get_event_constraints = adl_get_event_constraints; diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 8ec23f47fee9..a2834bc93149 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -685,6 +685,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ac973c6f82ad..7839507b3844 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1540,14 +1540,18 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1555,8 +1559,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } /* * We use the interrupt regs as a base because the PEBS record does not @@ -1628,17 +1634,22 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); - - if (sample_type & PERF_SAMPLE_TRANSACTION) + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1648,11 +1659,15 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * We can only do this for the default trace clock. */ if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) + event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(pebs->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1710,8 +1725,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; - if (event->attr.use_clockid == 0) + if (event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(basic->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1719,8 +1736,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } *regs = *iregs; /* The ip in basic is EventingIP */ @@ -1771,17 +1790,24 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } - if (sample_type & PERF_SAMPLE_ADDR_TYPE) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { @@ -1800,6 +1826,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 47fca6a7a8bc..4fce1a4226e3 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -4,7 +4,6 @@ #include #include -#include #include "../perf_event.h" @@ -65,65 +64,6 @@ #define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ - - X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ - -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - /* * Intel LBR_CTL bits * @@ -1151,219 +1091,6 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return ret; } -/* - * return the type of control flow change at address "from" - * instruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindly read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); -#endif - insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -#define X86_BR_TYPE_MAP_MAX 16 - -static int branch_map[X86_BR_TYPE_MAP_MAX] = { - PERF_BR_CALL, /* X86_BR_CALL */ - PERF_BR_RET, /* X86_BR_RET */ - PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ - PERF_BR_SYSRET, /* X86_BR_SYSRET */ - PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_ERET, /* X86_BR_IRET */ - PERF_BR_COND, /* X86_BR_JCC */ - PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_IRQ, /* X86_BR_IRQ */ - PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_ABORT */ - PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ - PERF_BR_CALL, /* X86_BR_ZERO_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ - PERF_BR_IND, /* X86_BR_IND_JMP */ -}; - -static int -common_branch_type(int type) -{ - int i; - - type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ - - if (type) { - i = __ffs(type); - if (i < X86_BR_TYPE_MAP_MAX) - return branch_map[i]; - } - - return PERF_BR_UNKNOWN; -} - enum { ARCH_LBR_BR_TYPE_JCC = 0, ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 7951a5dc73b6..03bbcc2fa2ff 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -1006,6 +1006,29 @@ static void p4_pmu_enable_all(int added) } } +static int p4_pmu_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = this_cpu_read(pmc_prev_left[hwc->idx]); + int ret; + + ret = x86_perf_event_set_period(event); + + if (hwc->event_base) { + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + return ret; +} + static int p4_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -1044,7 +1067,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) /* event overflow for sure */ perf_sample_data_init(&data, 0, hwc->last_period); - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; @@ -1316,6 +1339,9 @@ static __initconst const struct x86_pmu p4_pmu = { .enable_all = p4_pmu_enable_all, .enable = p4_pmu_enable_event, .disable = p4_pmu_disable_event, + + .set_period = p4_pmu_set_period, + .eventsel = MSR_P4_BPU_CCCR0, .perfctr = MSR_P4_BPU_PERFCTR0, .event_map = p4_pmu_event_map, @@ -1334,15 +1360,6 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken from OProfile code - */ - .perfctr_second_write = 1, .format_attrs = intel_p4_formats_attr, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index db6c31bca809..6f1ccc57a692 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1831,6 +1831,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ac542f98c070..ecced3a52668 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -106,6 +106,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 266143abcbd8..332d2e6d8ae4 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -64,27 +64,25 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) return ((ecode & c->cmask) - c->code) <= (u64)c->size; } +#define PERF_ARCH(name, val) \ + PERF_X86_EVENT_##name = val, + /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ +enum { +#include "perf_event_flags.h" +}; -#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ -#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ -#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ +#undef PERF_ARCH + +#define PERF_ARCH(name, val) \ + static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \ + PERF_X86_EVENT_##name); + +#include "perf_event_flags.h" + +#undef PERF_ARCH static inline bool is_topdown_count(struct perf_event *event) { @@ -272,6 +270,10 @@ struct cpu_hw_events { u64 active_pebs_data_cfg; int pebs_record_size; + /* Intel Fixed counter configuration */ + u64 fixed_ctrl_val; + u64 active_fixed_ctrl_val; + /* * Intel LBR bits */ @@ -745,6 +747,8 @@ struct x86_pmu { void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); + int (*set_period)(struct perf_event *event); + u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -780,8 +784,7 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; - int perfctr_second_write; - u64 (*limit_period)(struct perf_event *event, u64 l); + void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ unsigned int late_ack :1, @@ -889,8 +892,6 @@ struct x86_pmu { * Intel perf metrics */ int num_topdown_events; - u64 (*update_topdown_event)(struct perf_event *event); - int (*set_topdown_event_period)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) @@ -1044,6 +1045,9 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; +DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); +DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); + static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) @@ -1059,6 +1063,7 @@ static inline bool x86_pmu_has_lbr_callstack(void) } DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); +DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); int x86_perf_event_set_period(struct perf_event *event); @@ -1210,6 +1215,70 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +int common_branch_type(int type); +int branch_type(unsigned long from, unsigned long to, int abort); +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset); + ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); @@ -1232,7 +1301,20 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); +int amd_pmu_lbr_init(void); +void amd_pmu_lbr_reset(void); +void amd_pmu_lbr_read(void); +void amd_pmu_lbr_add(struct perf_event *event); +void amd_pmu_lbr_del(struct perf_event *event); +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_enable_all(void); +void amd_pmu_lbr_disable_all(void); +int amd_pmu_lbr_hw_config(struct perf_event *event); + #ifdef CONFIG_PERF_EVENTS_AMD_BRS + +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ + int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1241,7 +1323,7 @@ void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); -int amd_brs_setup_filter(struct perf_event *event); +int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); static inline void amd_pmu_brs_add(struct perf_event *event) @@ -1277,7 +1359,7 @@ static inline void amd_brs_enable(void) {} static inline void amd_brs_drain(void) {} static inline void amd_brs_lopwr_init(void) {} static inline void amd_brs_disable_all(void) {} -static inline int amd_brs_setup_filter(struct perf_event *event) +static inline int amd_brs_hw_config(struct perf_event *event) { return 0; } diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h new file mode 100644 index 000000000000..1dc19b9b4426 --- /dev/null +++ b/arch/x86/events/perf_event_flags.h @@ -0,0 +1,22 @@ + +/* + * struct hw_perf_event.flags flags + */ +PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ + /* 0x00080 */ +PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c new file mode 100644 index 000000000000..76b1f8bb0fd5 --- /dev/null +++ b/arch/x86/events/utils.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "perf_event.h" + +static int decode_branch_type(struct insn *insn) +{ + int ext; + + if (insn_get_opcode(insn)) + return X86_BR_ABORT; + + switch (insn->opcode.bytes[0]) { + case 0xf: + switch (insn->opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + return X86_BR_SYSCALL; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + return X86_BR_SYSRET; + case 0x80 ... 0x8f: /* conditional */ + return X86_BR_JCC; + } + return X86_BR_NONE; + case 0x70 ... 0x7f: /* conditional */ + return X86_BR_JCC; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + return X86_BR_RET; + case 0xcf: /* iret */ + return X86_BR_IRET; + case 0xcc ... 0xce: /* int */ + return X86_BR_INT; + case 0xe8: /* call near rel */ + if (insn_get_immediate(insn) || insn->immediate1.value == 0) { + /* zero length call */ + return X86_BR_ZERO_CALL; + } + fallthrough; + case 0x9a: /* call far absolute */ + return X86_BR_CALL; + case 0xe0 ... 0xe3: /* loop jmp */ + return X86_BR_JCC; + case 0xe9 ... 0xeb: /* jmp */ + return X86_BR_JMP; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(insn)) + return X86_BR_ABORT; + + ext = (insn->modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + return X86_BR_IND_CALL; + case 4: + case 5: + return X86_BR_IND_JMP; + } + return X86_BR_NONE; + } + + return X86_BR_NONE; +} + +/* + * return the type of control flow change at address "from" + * instruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + * + * While recording branches, some processors can report the "from" + * address to be that of an instruction preceding the actual branch + * when instruction fusion occurs. If fusion is expected, attempt to + * find the type of the first branch instruction within the next + * MAX_INSN_SIZE bytes and if found, provide the offset between the + * reported "from" address and the actual branch instruction address. + */ +static int get_branch_type(unsigned long from, unsigned long to, int abort, + bool fused, int *offset) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left, insn_offset; + int ret = X86_BR_NONE; + int to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + /* make sure we initialize offset */ + if (offset) + *offset = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindly read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. + */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); +#endif + insn_init(&insn, addr, bytes_read, is64); + ret = decode_branch_type(&insn); + insn_offset = 0; + + /* Check for the possibility of branch fusion */ + while (fused && ret == X86_BR_NONE) { + /* Check for decoding errors */ + if (insn_get_length(&insn) || !insn.length) + break; + + insn_offset += insn.length; + bytes_read -= insn.length; + if (bytes_read < 0) + break; + + insn_init(&insn, addr + insn_offset, bytes_read, is64); + ret = decode_branch_type(&insn); + } + + if (offset) + *offset = insn_offset; + + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +int branch_type(unsigned long from, unsigned long to, int abort) +{ + return get_branch_type(from, to, abort, false, NULL); +} + +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset) +{ + return get_branch_type(from, to, abort, true, offset); +} + +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_ERET, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_IRQ, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_NO_TX, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +int common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index f3eb098d63d4..cb2a5e113daa 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -6,6 +6,22 @@ #include +/* IBS_OP_DATA2 DataSrc */ +#define IBS_DATA_SRC_LOC_CACHE 2 +#define IBS_DATA_SRC_DRAM 3 +#define IBS_DATA_SRC_REM_CACHE 4 +#define IBS_DATA_SRC_IO 7 + +/* IBS_OP_DATA2 DataSrc Extension */ +#define IBS_DATA_SRC_EXT_LOC_CACHE 1 +#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 +#define IBS_DATA_SRC_EXT_DRAM 3 +#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 +#define IBS_DATA_SRC_EXT_PMEM 6 +#define IBS_DATA_SRC_EXT_IO 7 +#define IBS_DATA_SRC_EXT_EXT_MEM 8 +#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 + /* * IBS Hardware MSRs */ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index ef4775c6db01..b71f4f2ecdd5 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* FREE! ( 3*32+17) */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index a1f0e90d0818..0bc931cd0698 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -44,10 +44,7 @@ struct arch_hw_breakpoint { /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) struct perf_event_attr; struct perf_event; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1e086b37a307..10ac52705892 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -590,6 +590,9 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Last Branch Record MSRs */ +#define MSR_AMD64_LBR_SELECT 0xc000010e + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -761,6 +764,8 @@ #define MSR_AMD_DBG_EXTN_CFG 0xc000010f #define MSR_AMD_SAMP_BR_FROM 0xc0010300 +#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f6fc8dd51ef4..9ac46dbe57d4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -207,7 +207,8 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; - unsigned int reserved:6; + /* Number of available LBR Stack Entries */ + unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; } split; diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h index 892fd8c3a6f7..60ece592b220 100644 --- a/arch/x86/include/asm/qspinlock_paravirt.h +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -12,7 +12,7 @@ */ #ifdef CONFIG_64BIT -PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath, ".spinlock.text"); #define __pv_queued_spin_unlock __pv_queued_spin_unlock #define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock" #define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath" @@ -20,9 +20,10 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); /* * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock * which combines the registers saving trunk and the body of the following - * C code: + * C code. Note that it puts the code in the .spinlock.text section which + * is equivalent to adding __lockfunc in the C code: * - * void __pv_queued_spin_unlock(struct qspinlock *lock) + * void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock) * { * u8 lockval = cmpxchg(&lock->locked, _Q_LOCKED_VAL, 0); * @@ -36,7 +37,7 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath); * rsi = lockval (second argument) * rdx = internal variable (set to 0) */ -asm (".pushsection .text;" +asm (".pushsection .spinlock.text;" ".globl " PV_UNLOCK ";" ".type " PV_UNLOCK ", @function;" ".align 4,0x90;" @@ -65,8 +66,8 @@ asm (".pushsection .text;" #else /* CONFIG_64BIT */ -extern void __pv_queued_spin_unlock(struct qspinlock *lock); -PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); +extern void __lockfunc __pv_queued_spin_unlock(struct qspinlock *lock); +__PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock, ".spinlock.text"); #endif /* CONFIG_64BIT */ #endif diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fd44b54c90d5..fc01f81f6e2a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 4d290c8b290c..aa859ca174f4 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -434,7 +434,7 @@ config ATMEL_TCB_CLKSRC config CLKSRC_EXYNOS_MCT bool "Exynos multi core timer driver" if COMPILE_TEST depends on ARM || ARM64 - depends on ARCH_EXYNOS || COMPILE_TEST + depends on ARCH_ARTPEC || ARCH_EXYNOS || COMPILE_TEST help Support for Multi Core Timer controller on Exynos SoCs. diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c index 9ab8221ee3c6..a7ff77550e17 100644 --- a/drivers/clocksource/arm_arch_timer.c +++ b/drivers/clocksource/arm_arch_timer.c @@ -44,8 +44,8 @@ #define CNTACR_RWVT BIT(4) #define CNTACR_RWPT BIT(5) -#define CNTVCT_LO 0x00 -#define CNTPCT_LO 0x08 +#define CNTPCT_LO 0x00 +#define CNTVCT_LO 0x08 #define CNTFRQ 0x10 #define CNTP_CVAL_LO 0x20 #define CNTP_CTL 0x2c @@ -473,6 +473,8 @@ static const struct arch_timer_erratum_workaround ool_workarounds[] = { .desc = "ARM erratum 858921", .read_cntpct_el0 = arm64_858921_read_cntpct_el0, .read_cntvct_el0 = arm64_858921_read_cntvct_el0, + .set_next_event_phys = erratum_set_next_event_phys, + .set_next_event_virt = erratum_set_next_event_virt, }, #endif #ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1 diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index f29c812b70c9..bfd60093ee1c 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -33,7 +33,7 @@ #define EXYNOS4_MCT_G_INT_ENB EXYNOS4_MCTREG(0x248) #define EXYNOS4_MCT_G_WSTAT EXYNOS4_MCTREG(0x24C) #define _EXYNOS4_MCT_L_BASE EXYNOS4_MCTREG(0x300) -#define EXYNOS4_MCT_L_BASE(x) (_EXYNOS4_MCT_L_BASE + (0x100 * x)) +#define EXYNOS4_MCT_L_BASE(x) (_EXYNOS4_MCT_L_BASE + (0x100 * (x))) #define EXYNOS4_MCT_L_MASK (0xffffff00) #define MCT_L_TCNTB_OFFSET (0x00) @@ -66,6 +66,8 @@ #define MCT_L0_IRQ 4 /* Max number of IRQ as per DT binding document */ #define MCT_NR_IRQS 20 +/* Max number of local timers */ +#define MCT_NR_LOCAL (MCT_NR_IRQS - MCT_L0_IRQ) enum { MCT_INT_SPI, @@ -233,9 +235,16 @@ static cycles_t exynos4_read_current_timer(void) } #endif -static int __init exynos4_clocksource_init(void) +static int __init exynos4_clocksource_init(bool frc_shared) { - exynos4_mct_frc_start(); + /* + * When the frc is shared, the main processer should have already + * turned it on and we shouldn't be writing to TCON. + */ + if (frc_shared) + mct_frc.resume = NULL; + else + exynos4_mct_frc_start(); #if defined(CONFIG_ARM) exynos4_delay_timer.read_current_timer = &exynos4_read_current_timer; @@ -449,7 +458,6 @@ static int exynos4_mct_starting_cpu(unsigned int cpu) per_cpu_ptr(&percpu_mct_tick, cpu); struct clock_event_device *evt = &mevt->evt; - mevt->base = EXYNOS4_MCT_L_BASE(cpu); snprintf(mevt->name, sizeof(mevt->name), "mct_tick%d", cpu); evt->name = mevt->name; @@ -520,8 +528,17 @@ static int __init exynos4_timer_resources(struct device_node *np) return 0; } +/** + * exynos4_timer_interrupts - initialize MCT interrupts + * @np: device node for MCT + * @int_type: interrupt type, MCT_INT_PPI or MCT_INT_SPI + * @local_idx: array mapping CPU numbers to local timer indices + * @nr_local: size of @local_idx array + */ static int __init exynos4_timer_interrupts(struct device_node *np, - unsigned int int_type) + unsigned int int_type, + const u32 *local_idx, + size_t nr_local) { int nr_irqs, i, err, cpu; @@ -554,13 +571,21 @@ static int __init exynos4_timer_interrupts(struct device_node *np, } else { for_each_possible_cpu(cpu) { int mct_irq; + unsigned int irq_idx; struct mct_clock_event_device *pcpu_mevt = per_cpu_ptr(&percpu_mct_tick, cpu); + if (cpu >= nr_local) { + err = -EINVAL; + goto out_irq; + } + + irq_idx = MCT_L0_IRQ + local_idx[cpu]; + pcpu_mevt->evt.irq = -1; - if (MCT_L0_IRQ + cpu >= ARRAY_SIZE(mct_irqs)) + if (irq_idx >= ARRAY_SIZE(mct_irqs)) break; - mct_irq = mct_irqs[MCT_L0_IRQ + cpu]; + mct_irq = mct_irqs[irq_idx]; irq_set_status_flags(mct_irq, IRQ_NOAUTOEN); if (request_irq(mct_irq, @@ -576,6 +601,17 @@ static int __init exynos4_timer_interrupts(struct device_node *np, } } + for_each_possible_cpu(cpu) { + struct mct_clock_event_device *mevt = per_cpu_ptr(&percpu_mct_tick, cpu); + + if (cpu >= nr_local) { + err = -EINVAL; + goto out_irq; + } + + mevt->base = EXYNOS4_MCT_L_BASE(local_idx[cpu]); + } + /* Install hotplug callbacks which configure the timer on this CPU */ err = cpuhp_setup_state(CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING, "clockevents/exynos4/mct_timer:starting", @@ -605,20 +641,49 @@ out_irq: static int __init mct_init_dt(struct device_node *np, unsigned int int_type) { + bool frc_shared = of_property_read_bool(np, "samsung,frc-shared"); + u32 local_idx[MCT_NR_LOCAL] = {0}; + int nr_local; int ret; + nr_local = of_property_count_u32_elems(np, "samsung,local-timers"); + if (nr_local == 0) + return -EINVAL; + if (nr_local > 0) { + if (nr_local > ARRAY_SIZE(local_idx)) + return -EINVAL; + + ret = of_property_read_u32_array(np, "samsung,local-timers", + local_idx, nr_local); + if (ret) + return ret; + } else { + int i; + + nr_local = ARRAY_SIZE(local_idx); + for (i = 0; i < nr_local; i++) + local_idx[i] = i; + } + ret = exynos4_timer_resources(np); if (ret) return ret; - ret = exynos4_timer_interrupts(np, int_type); + ret = exynos4_timer_interrupts(np, int_type, local_idx, nr_local); if (ret) return ret; - ret = exynos4_clocksource_init(); + ret = exynos4_clocksource_init(frc_shared); if (ret) return ret; + /* + * When the FRC is shared with a main processor, this secondary + * processor cannot use the global comparator. + */ + if (frc_shared) + return ret; + return exynos4_clockevent_init(); } diff --git a/drivers/clocksource/renesas-ostm.c b/drivers/clocksource/renesas-ostm.c index 21d1392637b8..8da972dc1713 100644 --- a/drivers/clocksource/renesas-ostm.c +++ b/drivers/clocksource/renesas-ostm.c @@ -224,7 +224,7 @@ err_free: TIMER_OF_DECLARE(ostm, "renesas,ostm", ostm_init); -#ifdef CONFIG_ARCH_R9A07G044 +#ifdef CONFIG_ARCH_RZG2L static int __init ostm_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; diff --git a/drivers/clocksource/timer-gxp.c b/drivers/clocksource/timer-gxp.c index 8b38b3212388..fe4fa8d7b3f1 100644 --- a/drivers/clocksource/timer-gxp.c +++ b/drivers/clocksource/timer-gxp.c @@ -171,6 +171,7 @@ static int gxp_timer_probe(struct platform_device *pdev) { struct platform_device *gxp_watchdog_device; struct device *dev = &pdev->dev; + int ret; if (!gxp_timer) { pr_err("Gxp Timer not initialized, cannot create watchdog"); @@ -187,7 +188,11 @@ static int gxp_timer_probe(struct platform_device *pdev) gxp_watchdog_device->dev.platform_data = gxp_timer->counter; gxp_watchdog_device->dev.parent = dev; - return platform_device_add(gxp_watchdog_device); + ret = platform_device_add(gxp_watchdog_device); + if (ret) + platform_device_put(gxp_watchdog_device); + + return ret; } static const struct of_device_id gxp_timer_of_match[] = { diff --git a/drivers/clocksource/timer-imx-sysctr.c b/drivers/clocksource/timer-imx-sysctr.c index 523e37662a6e..5a7a951c4efc 100644 --- a/drivers/clocksource/timer-imx-sysctr.c +++ b/drivers/clocksource/timer-imx-sysctr.c @@ -134,8 +134,10 @@ static int __init sysctr_timer_init(struct device_node *np) if (ret) return ret; - /* system counter clock is divided by 3 internally */ - to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV; + if (!of_property_read_bool(np, "nxp,no-divider")) { + /* system counter clock is divided by 3 internally */ + to_sysctr.of_clk.rate /= SYS_CTR_CLK_DIV; + } sys_ctr_base = timer_of_base(&to_sysctr); cmpcr = readl(sys_ctr_base + CMPCR); diff --git a/drivers/clocksource/timer-sun4i.c b/drivers/clocksource/timer-sun4i.c index 94dc6e42e983..e5a70aa1deb4 100644 --- a/drivers/clocksource/timer-sun4i.c +++ b/drivers/clocksource/timer-sun4i.c @@ -26,6 +26,7 @@ #define TIMER_IRQ_EN_REG 0x00 #define TIMER_IRQ_EN(val) BIT(val) #define TIMER_IRQ_ST_REG 0x04 +#define TIMER_IRQ_CLEAR(val) BIT(val) #define TIMER_CTL_REG(val) (0x10 * val + 0x10) #define TIMER_CTL_ENABLE BIT(0) #define TIMER_CTL_RELOAD BIT(1) @@ -123,7 +124,7 @@ static int sun4i_clkevt_next_event(unsigned long evt, static void sun4i_timer_clear_interrupt(void __iomem *base) { - writel(TIMER_IRQ_EN(0), base + TIMER_IRQ_ST_REG); + writel(TIMER_IRQ_CLEAR(0), base + TIMER_IRQ_ST_REG); } static irqreturn_t sun4i_timer_interrupt(int irq, void *dev_id) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 469f7c91564b..cad29ded3a48 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -33,6 +33,116 @@ #include +/* + * timer errata flags + * + * Errata i103/i767 impacts all OMAP3/4/5 devices including AM33xx. This + * errata prevents us from using posted mode on these devices, unless the + * timer counter register is never read. For more details please refer to + * the OMAP3/4/5 errata documents. + */ +#define OMAP_TIMER_ERRATA_I103_I767 0x80000000 + +/* posted mode types */ +#define OMAP_TIMER_NONPOSTED 0x00 +#define OMAP_TIMER_POSTED 0x01 + +/* register offsets with the write pending bit encoded */ +#define WPSHIFT 16 + +#define OMAP_TIMER_WAKEUP_EN_REG (_OMAP_TIMER_WAKEUP_EN_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_CTRL_REG (_OMAP_TIMER_CTRL_OFFSET \ + | (WP_TCLR << WPSHIFT)) + +#define OMAP_TIMER_COUNTER_REG (_OMAP_TIMER_COUNTER_OFFSET \ + | (WP_TCRR << WPSHIFT)) + +#define OMAP_TIMER_LOAD_REG (_OMAP_TIMER_LOAD_OFFSET \ + | (WP_TLDR << WPSHIFT)) + +#define OMAP_TIMER_TRIGGER_REG (_OMAP_TIMER_TRIGGER_OFFSET \ + | (WP_TTGR << WPSHIFT)) + +#define OMAP_TIMER_WRITE_PEND_REG (_OMAP_TIMER_WRITE_PEND_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_MATCH_REG (_OMAP_TIMER_MATCH_OFFSET \ + | (WP_TMAR << WPSHIFT)) + +#define OMAP_TIMER_CAPTURE_REG (_OMAP_TIMER_CAPTURE_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_IF_CTRL_REG (_OMAP_TIMER_IF_CTRL_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_CAPTURE2_REG (_OMAP_TIMER_CAPTURE2_OFFSET \ + | (WP_NONE << WPSHIFT)) + +#define OMAP_TIMER_TICK_POS_REG (_OMAP_TIMER_TICK_POS_OFFSET \ + | (WP_TPIR << WPSHIFT)) + +#define OMAP_TIMER_TICK_NEG_REG (_OMAP_TIMER_TICK_NEG_OFFSET \ + | (WP_TNIR << WPSHIFT)) + +#define OMAP_TIMER_TICK_COUNT_REG (_OMAP_TIMER_TICK_COUNT_OFFSET \ + | (WP_TCVR << WPSHIFT)) + +#define OMAP_TIMER_TICK_INT_MASK_SET_REG \ + (_OMAP_TIMER_TICK_INT_MASK_SET_OFFSET | (WP_TOCR << WPSHIFT)) + +#define OMAP_TIMER_TICK_INT_MASK_COUNT_REG \ + (_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT)) + +struct timer_regs { + u32 ocp_cfg; + u32 tidr; + u32 tier; + u32 twer; + u32 tclr; + u32 tcrr; + u32 tldr; + u32 ttrg; + u32 twps; + u32 tmar; + u32 tcar1; + u32 tsicr; + u32 tcar2; + u32 tpir; + u32 tnir; + u32 tcvr; + u32 tocr; + u32 towr; +}; + +struct dmtimer { + struct omap_dm_timer cookie; + int id; + int irq; + struct clk *fclk; + + void __iomem *io_base; + int irq_stat; /* TISR/IRQSTATUS interrupt status */ + int irq_ena; /* irq enable */ + int irq_dis; /* irq disable, only on v2 ip */ + void __iomem *pend; /* write pending */ + void __iomem *func_base; /* function register base */ + + atomic_t enabled; + unsigned long rate; + unsigned reserved:1; + unsigned posted:1; + unsigned omap1:1; + struct timer_regs context; + int revision; + u32 capability; + u32 errata; + struct platform_device *pdev; + struct list_head node; + struct notifier_block nb; +}; + static u32 omap_reserved_systimers; static LIST_HEAD(omap_timer_list); static DEFINE_SPINLOCK(dm_timer_lock); @@ -44,27 +154,56 @@ enum { REQUEST_BY_NODE, }; -static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg, - int posted) +/** + * dmtimer_read - read timer registers in posted and non-posted mode + * @timer: timer pointer over which read operation to perform + * @reg: lowest byte holds the register offset + * + * The posted mode bit is encoded in reg. Note that in posted mode, write + * pending bit must be checked. Otherwise a read of a non completed write + * will produce an error. + */ +static inline u32 dmtimer_read(struct dmtimer *timer, u32 reg) { - if (posted) - while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) + u16 wp, offset; + + wp = reg >> WPSHIFT; + offset = reg & 0xff; + + /* Wait for a possible write pending bit in posted mode */ + if (wp && timer->posted) + while (readl_relaxed(timer->pend) & wp) cpu_relax(); - return readl_relaxed(timer->func_base + (reg & 0xff)); + return readl_relaxed(timer->func_base + offset); } -static inline void __omap_dm_timer_write(struct omap_dm_timer *timer, - u32 reg, u32 val, int posted) +/** + * dmtimer_write - write timer registers in posted and non-posted mode + * @timer: timer pointer over which write operation is to perform + * @reg: lowest byte holds the register offset + * @value: data to write into the register + * + * The posted mode bit is encoded in reg. Note that in posted mode, the write + * pending bit must be checked. Otherwise a write on a register which has a + * pending write will be lost. + */ +static inline void dmtimer_write(struct dmtimer *timer, u32 reg, u32 val) { - if (posted) - while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) + u16 wp, offset; + + wp = reg >> WPSHIFT; + offset = reg & 0xff; + + /* Wait for a possible write pending bit in posted mode */ + if (wp && timer->posted) + while (readl_relaxed(timer->pend) & wp) cpu_relax(); - writel_relaxed(val, timer->func_base + (reg & 0xff)); + writel_relaxed(val, timer->func_base + offset); } -static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) +static inline void __omap_dm_timer_init_regs(struct dmtimer *timer) { u32 tidr; @@ -72,16 +211,16 @@ static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) tidr = readl_relaxed(timer->io_base); if (!(tidr >> 16)) { timer->revision = 1; - timer->irq_stat = timer->io_base + OMAP_TIMER_V1_STAT_OFFSET; - timer->irq_ena = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; - timer->irq_dis = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; + timer->irq_stat = OMAP_TIMER_V1_STAT_OFFSET; + timer->irq_ena = OMAP_TIMER_V1_INT_EN_OFFSET; + timer->irq_dis = OMAP_TIMER_V1_INT_EN_OFFSET; timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET; timer->func_base = timer->io_base; } else { timer->revision = 2; - timer->irq_stat = timer->io_base + OMAP_TIMER_V2_IRQSTATUS; - timer->irq_ena = timer->io_base + OMAP_TIMER_V2_IRQENABLE_SET; - timer->irq_dis = timer->io_base + OMAP_TIMER_V2_IRQENABLE_CLR; + timer->irq_stat = OMAP_TIMER_V2_IRQSTATUS - OMAP_TIMER_V2_FUNC_OFFSET; + timer->irq_ena = OMAP_TIMER_V2_IRQENABLE_SET - OMAP_TIMER_V2_FUNC_OFFSET; + timer->irq_dis = OMAP_TIMER_V2_IRQENABLE_CLR - OMAP_TIMER_V2_FUNC_OFFSET; timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET + OMAP_TIMER_V2_FUNC_OFFSET; @@ -99,35 +238,34 @@ static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) * complete. Enabling this feature can improve performance for writing to the * timer registers. */ -static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer) +static inline void __omap_dm_timer_enable_posted(struct dmtimer *timer) { if (timer->posted) return; if (timer->errata & OMAP_TIMER_ERRATA_I103_I767) { timer->posted = OMAP_TIMER_NONPOSTED; - __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0, 0); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0); return; } - __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, - OMAP_TIMER_CTRL_POSTED, 0); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, OMAP_TIMER_CTRL_POSTED); timer->context.tsicr = OMAP_TIMER_CTRL_POSTED; timer->posted = OMAP_TIMER_POSTED; } -static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, - int posted, unsigned long rate) +static inline void __omap_dm_timer_stop(struct dmtimer *timer, + unsigned long rate) { u32 l; - l = __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (l & OMAP_TIMER_CTRL_ST) { l &= ~0x1; - __omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); #ifdef CONFIG_ARCH_OMAP2PLUS /* Readback to make sure write has completed */ - __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); + dmtimer_read(timer, OMAP_TIMER_CTRL_REG); /* * Wait for functional clock period x 3.5 to make sure that * timer is stopped @@ -137,104 +275,59 @@ static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, } /* Ack possibly pending interrupt */ - writel_relaxed(OMAP_TIMER_INT_OVERFLOW, timer->irq_stat); + dmtimer_write(timer, timer->irq_stat, OMAP_TIMER_INT_OVERFLOW); } -static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, - unsigned int value) +static inline void __omap_dm_timer_int_enable(struct dmtimer *timer, + unsigned int value) { - writel_relaxed(value, timer->irq_ena); - __omap_dm_timer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value, 0); + dmtimer_write(timer, timer->irq_ena, value); + dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value); } static inline unsigned int -__omap_dm_timer_read_counter(struct omap_dm_timer *timer, int posted) +__omap_dm_timer_read_counter(struct dmtimer *timer) { - return __omap_dm_timer_read(timer, OMAP_TIMER_COUNTER_REG, posted); + return dmtimer_read(timer, OMAP_TIMER_COUNTER_REG); } -static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, +static inline void __omap_dm_timer_write_status(struct dmtimer *timer, unsigned int value) { - writel_relaxed(value, timer->irq_stat); + dmtimer_write(timer, timer->irq_stat, value); } -/** - * omap_dm_timer_read_reg - read timer registers in posted and non-posted mode - * @timer: timer pointer over which read operation to perform - * @reg: lowest byte holds the register offset - * - * The posted mode bit is encoded in reg. Note that in posted mode write - * pending bit must be checked. Otherwise a read of a non completed write - * will produce an error. - */ -static inline u32 omap_dm_timer_read_reg(struct omap_dm_timer *timer, u32 reg) +static void omap_timer_restore_context(struct dmtimer *timer) { - WARN_ON((reg & 0xff) < _OMAP_TIMER_WAKEUP_EN_OFFSET); - return __omap_dm_timer_read(timer, reg, timer->posted); + dmtimer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, timer->context.ocp_cfg); + + dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, timer->context.twer); + dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, timer->context.tcrr); + dmtimer_write(timer, OMAP_TIMER_LOAD_REG, timer->context.tldr); + dmtimer_write(timer, OMAP_TIMER_MATCH_REG, timer->context.tmar); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, timer->context.tsicr); + dmtimer_write(timer, timer->irq_ena, timer->context.tier); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, timer->context.tclr); } -/** - * omap_dm_timer_write_reg - write timer registers in posted and non-posted mode - * @timer: timer pointer over which write operation is to perform - * @reg: lowest byte holds the register offset - * @value: data to write into the register - * - * The posted mode bit is encoded in reg. Note that in posted mode the write - * pending bit must be checked. Otherwise a write on a register which has a - * pending write will be lost. - */ -static void omap_dm_timer_write_reg(struct omap_dm_timer *timer, u32 reg, - u32 value) +static void omap_timer_save_context(struct dmtimer *timer) { - WARN_ON((reg & 0xff) < _OMAP_TIMER_WAKEUP_EN_OFFSET); - __omap_dm_timer_write(timer, reg, value, timer->posted); -} + timer->context.ocp_cfg = dmtimer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET); -static void omap_timer_restore_context(struct omap_dm_timer *timer) -{ - __omap_dm_timer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, - timer->context.ocp_cfg, 0); - - omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG, - timer->context.twer); - omap_dm_timer_write_reg(timer, OMAP_TIMER_COUNTER_REG, - timer->context.tcrr); - omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, - timer->context.tldr); - omap_dm_timer_write_reg(timer, OMAP_TIMER_MATCH_REG, - timer->context.tmar); - omap_dm_timer_write_reg(timer, OMAP_TIMER_IF_CTRL_REG, - timer->context.tsicr); - writel_relaxed(timer->context.tier, timer->irq_ena); - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, - timer->context.tclr); -} - -static void omap_timer_save_context(struct omap_dm_timer *timer) -{ - timer->context.ocp_cfg = - __omap_dm_timer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET, 0); - - timer->context.tclr = - omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); - timer->context.twer = - omap_dm_timer_read_reg(timer, OMAP_TIMER_WAKEUP_EN_REG); - timer->context.tldr = - omap_dm_timer_read_reg(timer, OMAP_TIMER_LOAD_REG); - timer->context.tmar = - omap_dm_timer_read_reg(timer, OMAP_TIMER_MATCH_REG); - timer->context.tier = readl_relaxed(timer->irq_ena); - timer->context.tsicr = - omap_dm_timer_read_reg(timer, OMAP_TIMER_IF_CTRL_REG); + timer->context.tclr = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + timer->context.twer = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG); + timer->context.tldr = dmtimer_read(timer, OMAP_TIMER_LOAD_REG); + timer->context.tmar = dmtimer_read(timer, OMAP_TIMER_MATCH_REG); + timer->context.tier = dmtimer_read(timer, timer->irq_ena); + timer->context.tsicr = dmtimer_read(timer, OMAP_TIMER_IF_CTRL_REG); } static int omap_timer_context_notifier(struct notifier_block *nb, unsigned long cmd, void *v) { - struct omap_dm_timer *timer; + struct dmtimer *timer; - timer = container_of(nb, struct omap_dm_timer, nb); + timer = container_of(nb, struct dmtimer, nb); switch (cmd) { case CPU_CLUSTER_PM_ENTER: @@ -256,18 +349,17 @@ static int omap_timer_context_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static int omap_dm_timer_reset(struct omap_dm_timer *timer) +static int omap_dm_timer_reset(struct dmtimer *timer) { u32 l, timeout = 100000; if (timer->revision != 1) return -EINVAL; - omap_dm_timer_write_reg(timer, OMAP_TIMER_IF_CTRL_REG, 0x06); + dmtimer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0x06); do { - l = __omap_dm_timer_read(timer, - OMAP_TIMER_V1_SYS_STAT_OFFSET, 0); + l = dmtimer_read(timer, OMAP_TIMER_V1_SYS_STAT_OFFSET); } while (!l && timeout--); if (!timeout) { @@ -276,22 +368,38 @@ static int omap_dm_timer_reset(struct omap_dm_timer *timer) } /* Configure timer for smart-idle mode */ - l = __omap_dm_timer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET, 0); + l = dmtimer_read(timer, OMAP_TIMER_OCP_CFG_OFFSET); l |= 0x2 << 0x3; - __omap_dm_timer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, l, 0); + dmtimer_write(timer, OMAP_TIMER_OCP_CFG_OFFSET, l); timer->posted = 0; return 0; } -static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source) +/* + * Functions exposed to PWM and remoteproc drivers via platform_data. + * Do not use these in the driver, these will get deprecated and will + * will be replaced by Linux generic framework functions such as + * chained interrupts and clock framework. + */ +static struct dmtimer *to_dmtimer(struct omap_dm_timer *cookie) +{ + if (!cookie) + return NULL; + + return container_of(cookie, struct dmtimer, cookie); +} + +static int omap_dm_timer_set_source(struct omap_dm_timer *cookie, int source) { int ret; const char *parent_name; struct clk *parent; struct dmtimer_platform_data *pdata; + struct dmtimer *timer; + timer = to_dmtimer(cookie); if (unlikely(!timer) || IS_ERR(timer->fclk)) return -EINVAL; @@ -316,7 +424,7 @@ static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source) * use the clock framework to set the parent clock. To be removed * once OMAP1 migrated to using clock framework for dmtimers */ - if (pdata && pdata->set_timer_src) + if (timer->omap1 && pdata && pdata->set_timer_src) return pdata->set_timer_src(timer->pdev, source); #if defined(CONFIG_COMMON_CLK) @@ -341,44 +449,44 @@ static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source) return ret; } -static void omap_dm_timer_enable(struct omap_dm_timer *timer) -{ - pm_runtime_get_sync(&timer->pdev->dev); -} - -static void omap_dm_timer_disable(struct omap_dm_timer *timer) -{ - pm_runtime_put_sync(&timer->pdev->dev); -} - -static int omap_dm_timer_prepare(struct omap_dm_timer *timer) +static void omap_dm_timer_enable(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); + struct device *dev = &timer->pdev->dev; int rc; - /* - * FIXME: OMAP1 devices do not use the clock framework for dmtimers so - * do not call clk_get() for these devices. - */ - if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) { - timer->fclk = clk_get(&timer->pdev->dev, "fck"); - if (WARN_ON_ONCE(IS_ERR(timer->fclk))) { - dev_err(&timer->pdev->dev, ": No fclk handle.\n"); - return -EINVAL; - } - } + rc = pm_runtime_resume_and_get(dev); + if (rc) + dev_err(dev, "could not enable timer\n"); +} - omap_dm_timer_enable(timer); +static void omap_dm_timer_disable(struct omap_dm_timer *cookie) +{ + struct dmtimer *timer = to_dmtimer(cookie); + struct device *dev = &timer->pdev->dev; + + pm_runtime_put_sync(dev); +} + +static int omap_dm_timer_prepare(struct dmtimer *timer) +{ + struct device *dev = &timer->pdev->dev; + int rc; + + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; if (timer->capability & OMAP_TIMER_NEEDS_RESET) { rc = omap_dm_timer_reset(timer); if (rc) { - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); return rc; } } __omap_dm_timer_enable_posted(timer); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); return 0; } @@ -388,19 +496,9 @@ static inline u32 omap_dm_timer_reserved_systimer(int id) return (omap_reserved_systimers & (1 << (id - 1))) ? 1 : 0; } -int omap_dm_timer_reserve_systimer(int id) +static struct dmtimer *_omap_dm_timer_request(int req_type, void *data) { - if (omap_dm_timer_reserved_systimer(id)) - return -ENODEV; - - omap_reserved_systimers |= (1 << (id - 1)); - - return 0; -} - -static struct omap_dm_timer *_omap_dm_timer_request(int req_type, void *data) -{ - struct omap_dm_timer *timer = NULL, *t; + struct dmtimer *timer = NULL, *t; struct device_node *np = NULL; unsigned long flags; u32 cap = 0; @@ -484,11 +582,19 @@ found: static struct omap_dm_timer *omap_dm_timer_request(void) { - return _omap_dm_timer_request(REQUEST_ANY, NULL); + struct dmtimer *timer; + + timer = _omap_dm_timer_request(REQUEST_ANY, NULL); + if (!timer) + return NULL; + + return &timer->cookie; } static struct omap_dm_timer *omap_dm_timer_request_specific(int id) { + struct dmtimer *timer; + /* Requesting timer by ID is not supported when device tree is used */ if (of_have_populated_dt()) { pr_warn("%s: Please use omap_dm_timer_request_by_node()\n", @@ -496,21 +602,11 @@ static struct omap_dm_timer *omap_dm_timer_request_specific(int id) return NULL; } - return _omap_dm_timer_request(REQUEST_BY_ID, &id); -} + timer = _omap_dm_timer_request(REQUEST_BY_ID, &id); + if (!timer) + return NULL; -/** - * omap_dm_timer_request_by_cap - Request a timer by capability - * @cap: Bit mask of capabilities to match - * - * Find a timer based upon capabilities bit mask. Callers of this function - * should use the definitions found in the plat/dmtimer.h file under the - * comment "timer capabilities used in hwmod database". Returns pointer to - * timer handle on success and a NULL pointer on failure. - */ -struct omap_dm_timer *omap_dm_timer_request_by_cap(u32 cap) -{ - return _omap_dm_timer_request(REQUEST_BY_CAP, &cap); + return &timer->cookie; } /** @@ -522,26 +618,34 @@ struct omap_dm_timer *omap_dm_timer_request_by_cap(u32 cap) */ static struct omap_dm_timer *omap_dm_timer_request_by_node(struct device_node *np) { + struct dmtimer *timer; + if (!np) return NULL; - return _omap_dm_timer_request(REQUEST_BY_NODE, np); + timer = _omap_dm_timer_request(REQUEST_BY_NODE, np); + if (!timer) + return NULL; + + return &timer->cookie; } -static int omap_dm_timer_free(struct omap_dm_timer *timer) +static int omap_dm_timer_free(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - clk_put(timer->fclk); - WARN_ON(!timer->reserved); timer->reserved = 0; return 0; } -int omap_dm_timer_get_irq(struct omap_dm_timer *timer) +int omap_dm_timer_get_irq(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); if (timer) return timer->irq; return -EINVAL; @@ -550,7 +654,7 @@ int omap_dm_timer_get_irq(struct omap_dm_timer *timer) #if defined(CONFIG_ARCH_OMAP1) #include -static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *timer) +static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *cookie) { return NULL; } @@ -562,7 +666,7 @@ static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *timer) __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) { int i = 0; - struct omap_dm_timer *timer = NULL; + struct dmtimer *timer = NULL; unsigned long flags; /* If ARMXOR cannot be idled this function call is unnecessary */ @@ -574,7 +678,7 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) list_for_each_entry(timer, &omap_timer_list, node) { u32 l; - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (l & OMAP_TIMER_CTRL_ST) { if (((omap_readl(MOD_CONF_CTRL_1) >> (i * 2)) & 0x03) == 0) inputmask &= ~(1 << 1); @@ -590,8 +694,10 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) #else -static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *timer) +static struct clk *omap_dm_timer_get_fclk(struct omap_dm_timer *cookie) { + struct dmtimer *timer = to_dmtimer(cookie); + if (timer && !IS_ERR(timer->fclk)) return timer->fclk; return NULL; @@ -606,95 +712,125 @@ __u32 omap_dm_timer_modify_idlect_mask(__u32 inputmask) #endif -int omap_dm_timer_trigger(struct omap_dm_timer *timer) -{ - if (unlikely(!timer || !atomic_read(&timer->enabled))) { - pr_err("%s: timer not available or enabled.\n", __func__); - return -EINVAL; - } - - omap_dm_timer_write_reg(timer, OMAP_TIMER_TRIGGER_REG, 0); - return 0; -} - -static int omap_dm_timer_start(struct omap_dm_timer *timer) +static int omap_dm_timer_start(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + struct device *dev; + int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (!(l & OMAP_TIMER_CTRL_ST)) { l |= OMAP_TIMER_CTRL_ST; - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); } return 0; } -static int omap_dm_timer_stop(struct omap_dm_timer *timer) +static int omap_dm_timer_stop(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + struct device *dev; unsigned long rate = 0; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) + dev = &timer->pdev->dev; + + if (!timer->omap1) rate = clk_get_rate(timer->fclk); - __omap_dm_timer_stop(timer, timer->posted, rate); + __omap_dm_timer_stop(timer, rate); + + pm_runtime_put_sync(dev); - omap_dm_timer_disable(timer); return 0; } -static int omap_dm_timer_set_load(struct omap_dm_timer *timer, +static int omap_dm_timer_set_load(struct omap_dm_timer *cookie, unsigned int load) { + struct dmtimer *timer; + struct device *dev; + int rc; + + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); - omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, load); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + + dmtimer_write(timer, OMAP_TIMER_LOAD_REG, load); + + pm_runtime_put_sync(dev); - omap_dm_timer_disable(timer); return 0; } -static int omap_dm_timer_set_match(struct omap_dm_timer *timer, int enable, +static int omap_dm_timer_set_match(struct omap_dm_timer *cookie, int enable, unsigned int match) { + struct dmtimer *timer; + struct device *dev; + int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); if (enable) l |= OMAP_TIMER_CTRL_CE; else l &= ~OMAP_TIMER_CTRL_CE; - omap_dm_timer_write_reg(timer, OMAP_TIMER_MATCH_REG, match); - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_MATCH_REG, match); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); - omap_dm_timer_disable(timer); return 0; } -static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, +static int omap_dm_timer_set_pwm(struct omap_dm_timer *cookie, int def_on, int toggle, int trigger, int autoreload) { + struct dmtimer *timer; + struct device *dev; + int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); l &= ~(OMAP_TIMER_CTRL_GPOCFG | OMAP_TIMER_CTRL_SCPWM | OMAP_TIMER_CTRL_PT | (0x03 << 10) | OMAP_TIMER_CTRL_AR); if (def_on) @@ -704,57 +840,86 @@ static int omap_dm_timer_set_pwm(struct omap_dm_timer *timer, int def_on, l |= trigger << 10; if (autoreload) l |= OMAP_TIMER_CTRL_AR; - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); - omap_dm_timer_disable(timer); return 0; } -static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *timer) +static int omap_dm_timer_get_pwm_status(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + struct device *dev; + int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); - omap_dm_timer_disable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); + + pm_runtime_put_sync(dev); return l; } -static int omap_dm_timer_set_prescaler(struct omap_dm_timer *timer, - int prescaler) +static int omap_dm_timer_set_prescaler(struct omap_dm_timer *cookie, + int prescaler) { + struct dmtimer *timer; + struct device *dev; + int rc; u32 l; + timer = to_dmtimer(cookie); if (unlikely(!timer) || prescaler < -1 || prescaler > 7) return -EINVAL; - omap_dm_timer_enable(timer); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + + l = dmtimer_read(timer, OMAP_TIMER_CTRL_REG); l &= ~(OMAP_TIMER_CTRL_PRE | (0x07 << 2)); if (prescaler >= 0) { l |= OMAP_TIMER_CTRL_PRE; l |= prescaler << 2; } - omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l); + dmtimer_write(timer, OMAP_TIMER_CTRL_REG, l); + + pm_runtime_put_sync(dev); - omap_dm_timer_disable(timer); return 0; } -static int omap_dm_timer_set_int_enable(struct omap_dm_timer *timer, +static int omap_dm_timer_set_int_enable(struct omap_dm_timer *cookie, unsigned int value) { + struct dmtimer *timer; + struct device *dev; + int rc; + + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; + __omap_dm_timer_int_enable(timer, value); - omap_dm_timer_disable(timer); + pm_runtime_put_sync(dev); + return 0; } @@ -765,42 +930,55 @@ static int omap_dm_timer_set_int_enable(struct omap_dm_timer *timer, * * Disables the specified timer interrupts for a timer. */ -static int omap_dm_timer_set_int_disable(struct omap_dm_timer *timer, u32 mask) +static int omap_dm_timer_set_int_disable(struct omap_dm_timer *cookie, u32 mask) { + struct dmtimer *timer; + struct device *dev; u32 l = mask; + int rc; + timer = to_dmtimer(cookie); if (unlikely(!timer)) return -EINVAL; - omap_dm_timer_enable(timer); + dev = &timer->pdev->dev; + rc = pm_runtime_resume_and_get(dev); + if (rc) + return rc; if (timer->revision == 1) - l = readl_relaxed(timer->irq_ena) & ~mask; + l = dmtimer_read(timer, timer->irq_ena) & ~mask; - writel_relaxed(l, timer->irq_dis); - l = omap_dm_timer_read_reg(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; - omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG, l); + dmtimer_write(timer, timer->irq_dis, l); + l = dmtimer_read(timer, OMAP_TIMER_WAKEUP_EN_REG) & ~mask; + dmtimer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, l); + + pm_runtime_put_sync(dev); - omap_dm_timer_disable(timer); return 0; } -static unsigned int omap_dm_timer_read_status(struct omap_dm_timer *timer) +static unsigned int omap_dm_timer_read_status(struct omap_dm_timer *cookie) { + struct dmtimer *timer; unsigned int l; + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) { pr_err("%s: timer not available or enabled.\n", __func__); return 0; } - l = readl_relaxed(timer->irq_stat); + l = dmtimer_read(timer, timer->irq_stat); return l; } -static int omap_dm_timer_write_status(struct omap_dm_timer *timer, unsigned int value) +static int omap_dm_timer_write_status(struct omap_dm_timer *cookie, unsigned int value) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) return -EINVAL; @@ -809,49 +987,39 @@ static int omap_dm_timer_write_status(struct omap_dm_timer *timer, unsigned int return 0; } -static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *timer) +static unsigned int omap_dm_timer_read_counter(struct omap_dm_timer *cookie) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) { pr_err("%s: timer not iavailable or enabled.\n", __func__); return 0; } - return __omap_dm_timer_read_counter(timer, timer->posted); + return __omap_dm_timer_read_counter(timer); } -static int omap_dm_timer_write_counter(struct omap_dm_timer *timer, unsigned int value) +static int omap_dm_timer_write_counter(struct omap_dm_timer *cookie, unsigned int value) { + struct dmtimer *timer; + + timer = to_dmtimer(cookie); if (unlikely(!timer || !atomic_read(&timer->enabled))) { pr_err("%s: timer not available or enabled.\n", __func__); return -EINVAL; } - omap_dm_timer_write_reg(timer, OMAP_TIMER_COUNTER_REG, value); + dmtimer_write(timer, OMAP_TIMER_COUNTER_REG, value); /* Save the context */ timer->context.tcrr = value; return 0; } -int omap_dm_timers_active(void) -{ - struct omap_dm_timer *timer; - - list_for_each_entry(timer, &omap_timer_list, node) { - if (!timer->reserved) - continue; - - if (omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG) & - OMAP_TIMER_CTRL_ST) { - return 1; - } - } - return 0; -} - static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) { - struct omap_dm_timer *timer = dev_get_drvdata(dev); + struct dmtimer *timer = dev_get_drvdata(dev); atomic_set(&timer->enabled, 0); @@ -865,7 +1033,7 @@ static int __maybe_unused omap_dm_timer_runtime_suspend(struct device *dev) static int __maybe_unused omap_dm_timer_runtime_resume(struct device *dev) { - struct omap_dm_timer *timer = dev_get_drvdata(dev); + struct dmtimer *timer = dev_get_drvdata(dev); if (!(timer->capability & OMAP_TIMER_ALWON) && timer->func_base) omap_timer_restore_context(timer); @@ -892,7 +1060,7 @@ static const struct of_device_id omap_timer_match[]; static int omap_dm_timer_probe(struct platform_device *pdev) { unsigned long flags; - struct omap_dm_timer *timer; + struct dmtimer *timer; struct device *dev = &pdev->dev; const struct dmtimer_platform_data *pdata; int ret; @@ -916,7 +1084,6 @@ static int omap_dm_timer_probe(struct platform_device *pdev) if (timer->irq < 0) return timer->irq; - timer->fclk = ERR_PTR(-ENODEV); timer->io_base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(timer->io_base)) return PTR_ERR(timer->io_base); @@ -938,6 +1105,17 @@ static int omap_dm_timer_probe(struct platform_device *pdev) timer->reserved = omap_dm_timer_reserved_systimer(timer->id); } + timer->omap1 = timer->capability & OMAP_TIMER_NEEDS_RESET; + + /* OMAP1 devices do not yet use the clock framework for dmtimers */ + if (!timer->omap1) { + timer->fclk = devm_clk_get(dev, "fck"); + if (IS_ERR(timer->fclk)) + return PTR_ERR(timer->fclk); + } else { + timer->fclk = ERR_PTR(-ENODEV); + } + if (!(timer->capability & OMAP_TIMER_ALWON)) { timer->nb.notifier_call = omap_timer_context_notifier; cpu_pm_register_notifier(&timer->nb); @@ -950,11 +1128,11 @@ static int omap_dm_timer_probe(struct platform_device *pdev) pm_runtime_enable(dev); if (!timer->reserved) { - ret = pm_runtime_get_sync(dev); - if (ret < 0) { + ret = pm_runtime_resume_and_get(dev); + if (ret) { dev_err(dev, "%s: pm_runtime_get_sync failed!\n", __func__); - goto err_get_sync; + goto err_disable; } __omap_dm_timer_init_regs(timer); pm_runtime_put(dev); @@ -969,8 +1147,7 @@ static int omap_dm_timer_probe(struct platform_device *pdev) return 0; -err_get_sync: - pm_runtime_put_noidle(dev); +err_disable: pm_runtime_disable(dev); return ret; } @@ -985,7 +1162,7 @@ err_get_sync: */ static int omap_dm_timer_remove(struct platform_device *pdev) { - struct omap_dm_timer *timer; + struct dmtimer *timer; unsigned long flags; int ret = -EINVAL; diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 6ce05ef4844d..00e3a637f7b6 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -44,7 +44,9 @@ * This allows us to perform the check, i.e, perfmon_capable(), * in the context of the event owner, once, during the event_init(). */ -#define SPE_PMU_HW_FLAGS_CX BIT(0) +#define SPE_PMU_HW_FLAGS_CX 0x00001 + +static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); static void set_spe_event_has_cx(struct perf_event *event) { diff --git a/fs/dcache.c b/fs/dcache.c index bb0c4d0038db..2ee8636016ee 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2597,15 +2597,7 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { - /* - * The caller holds a spinlock (dentry::d_lock). On !PREEMPT_RT - * kernels spin_lock() implicitly disables preemption, but not on - * PREEMPT_RT. So for RT it has to be done explicitly to protect - * the sequence count write side critical section against a reader - * or another writer preempting, which would result in a live lock. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) @@ -2618,8 +2610,7 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, wait_queue_head_t *d_wait) { smp_store_release(&dir->i_dir_seq, n + 2); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); wake_up_all(d_wait); } diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index b0f80cfd2a26..77eceeae708c 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -52,10 +52,6 @@ #define OMAP_TIMER_TRIGGER_OVERFLOW 0x01 #define OMAP_TIMER_TRIGGER_OVERFLOW_AND_COMPARE 0x02 -/* posted mode types */ -#define OMAP_TIMER_NONPOSTED 0x00 -#define OMAP_TIMER_POSTED 0x01 - /* timer capabilities used in hwmod database */ #define OMAP_TIMER_SECURE 0x80000000 #define OMAP_TIMER_ALWON 0x40000000 @@ -63,73 +59,13 @@ #define OMAP_TIMER_NEEDS_RESET 0x10000000 #define OMAP_TIMER_HAS_DSP_IRQ 0x08000000 -/* - * timer errata flags - * - * Errata i103/i767 impacts all OMAP3/4/5 devices including AM33xx. This - * errata prevents us from using posted mode on these devices, unless the - * timer counter register is never read. For more details please refer to - * the OMAP3/4/5 errata documents. - */ -#define OMAP_TIMER_ERRATA_I103_I767 0x80000000 - -struct timer_regs { - u32 ocp_cfg; - u32 tidr; - u32 tier; - u32 twer; - u32 tclr; - u32 tcrr; - u32 tldr; - u32 ttrg; - u32 twps; - u32 tmar; - u32 tcar1; - u32 tsicr; - u32 tcar2; - u32 tpir; - u32 tnir; - u32 tcvr; - u32 tocr; - u32 towr; -}; - struct omap_dm_timer { - int id; - int irq; - struct clk *fclk; - - void __iomem *io_base; - void __iomem *irq_stat; /* TISR/IRQSTATUS interrupt status */ - void __iomem *irq_ena; /* irq enable */ - void __iomem *irq_dis; /* irq disable, only on v2 ip */ - void __iomem *pend; /* write pending */ - void __iomem *func_base; /* function register base */ - - atomic_t enabled; - unsigned long rate; - unsigned reserved:1; - unsigned posted:1; - struct timer_regs context; - int revision; - u32 capability; - u32 errata; - struct platform_device *pdev; - struct list_head node; - struct notifier_block nb; }; -int omap_dm_timer_reserve_systimer(int id); -struct omap_dm_timer *omap_dm_timer_request_by_cap(u32 cap); - int omap_dm_timer_get_irq(struct omap_dm_timer *timer); u32 omap_dm_timer_modify_idlect_mask(u32 inputmask); -int omap_dm_timer_trigger(struct omap_dm_timer *timer); - -int omap_dm_timers_active(void); - /* * Do not use the defines below, they are not needed. They should be only * used by dmtimer.c and sys_timer related code. @@ -199,52 +135,4 @@ int omap_dm_timers_active(void); #define _OMAP_TIMER_TICK_INT_MASK_SET_OFFSET 0x54 /* TOCR, 34xx only */ #define _OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET 0x58 /* TOWR, 34xx only */ -/* register offsets with the write pending bit encoded */ -#define WPSHIFT 16 - -#define OMAP_TIMER_WAKEUP_EN_REG (_OMAP_TIMER_WAKEUP_EN_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_CTRL_REG (_OMAP_TIMER_CTRL_OFFSET \ - | (WP_TCLR << WPSHIFT)) - -#define OMAP_TIMER_COUNTER_REG (_OMAP_TIMER_COUNTER_OFFSET \ - | (WP_TCRR << WPSHIFT)) - -#define OMAP_TIMER_LOAD_REG (_OMAP_TIMER_LOAD_OFFSET \ - | (WP_TLDR << WPSHIFT)) - -#define OMAP_TIMER_TRIGGER_REG (_OMAP_TIMER_TRIGGER_OFFSET \ - | (WP_TTGR << WPSHIFT)) - -#define OMAP_TIMER_WRITE_PEND_REG (_OMAP_TIMER_WRITE_PEND_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_MATCH_REG (_OMAP_TIMER_MATCH_OFFSET \ - | (WP_TMAR << WPSHIFT)) - -#define OMAP_TIMER_CAPTURE_REG (_OMAP_TIMER_CAPTURE_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_IF_CTRL_REG (_OMAP_TIMER_IF_CTRL_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_CAPTURE2_REG (_OMAP_TIMER_CAPTURE2_OFFSET \ - | (WP_NONE << WPSHIFT)) - -#define OMAP_TIMER_TICK_POS_REG (_OMAP_TIMER_TICK_POS_OFFSET \ - | (WP_TPIR << WPSHIFT)) - -#define OMAP_TIMER_TICK_NEG_REG (_OMAP_TIMER_TICK_NEG_OFFSET \ - | (WP_TNIR << WPSHIFT)) - -#define OMAP_TIMER_TICK_COUNT_REG (_OMAP_TIMER_TICK_COUNT_OFFSET \ - | (WP_TCVR << WPSHIFT)) - -#define OMAP_TIMER_TICK_INT_MASK_SET_REG \ - (_OMAP_TIMER_TICK_INT_MASK_SET_OFFSET | (WP_TOCR << WPSHIFT)) - -#define OMAP_TIMER_TICK_INT_MASK_COUNT_REG \ - (_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT)) - #endif /* __CLOCKSOURCE_DMTIMER_H */ diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index fc93c9488c76..898b3458b24a 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -35,7 +35,8 @@ /* * Note: do not use this directly. Instead, use __alloc_size() since it is conditionally - * available and includes other attributes. + * available and includes other attributes. For GCC < 9.1, __alloc_size__ gets undefined + * in compiler-gcc.h, due to misbehaviors. * * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-alloc_005fsize-function-attribute * clang: https://clang.llvm.org/docs/AttributeReference.html#alloc-size diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index d9d98e8a9a3b..74e04ecd4c89 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -271,14 +271,16 @@ struct ftrace_likely_data { /* * Any place that could be marked with the "alloc_size" attribute is also - * a place to be marked with the "malloc" attribute. Do this as part of the - * __alloc_size macro to avoid redundant attributes and to avoid missing a - * __malloc marking. + * a place to be marked with the "malloc" attribute, except those that may + * be performing a _reallocation_, as that may alias the existing pointer. + * For these, use __realloc_size(). */ #ifdef __alloc_size__ # define __alloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) __malloc +# define __realloc_size(x, ...) __alloc_size__(x, ## __VA_ARGS__) #else # define __alloc_size(x, ...) __malloc +# define __realloc_size(x, ...) #endif #ifndef asm_volatile_goto diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 78dd7035d1e5..f319bd26b030 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -74,12 +74,12 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); +extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); -int hw_breakpoint_weight(struct perf_event *bp); int arch_reserve_bp_slot(struct perf_event *bp); void arch_release_bp_slot(struct perf_event *bp); void arch_unregister_hw_breakpoint(struct perf_event *bp); @@ -121,6 +121,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } +static inline bool hw_breakpoint_is_used(void) { return false; } + static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 15ae78cd2853..b8728d11c949 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -94,6 +94,12 @@ void dump_mm(const struct mm_struct *mm); #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif +#ifdef CONFIG_DEBUG_VM_IRQSOFF +#define VM_WARN_ON_IRQS_ENABLED() WARN_ON_ONCE(!irqs_disabled()) +#else +#define VM_WARN_ON_IRQS_ENABLED() do { } while (0) +#endif + #ifdef CONFIG_DEBUG_VIRTUAL #define VIRTUAL_BUG_ON(cond) BUG_ON(cond) #else diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5fda40f97fe9..36b942b67b7d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) +{ + return atomic_read(&sem->block); +} + extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0407a38b470a..0356cb6a215d 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,11 @@ /* * ARM PMU hw_event flags */ -/* Event uses a 64bit counter */ -#define ARMPMU_EVT_64BIT 1 -/* Event uses a 47bit counter */ -#define ARMPMU_EVT_47BIT 2 +#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ +#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ + +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) PERF_COUNT_HW_CACHE_##_x diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index df74e387f703..0fd8cec3ebbb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks { }; #ifdef CONFIG_HAVE_HW_BREAKPOINT +#include #include #endif @@ -60,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -137,9 +139,11 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); + /** * struct hw_perf_event - performance event hardware details: */ @@ -178,7 +182,7 @@ struct hw_perf_event { * creation and event initalization. */ struct arch_hw_breakpoint info; - struct list_head bp_list; + struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ @@ -631,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex + * as such iteration must hold either lock. However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. + */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) && \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) @@ -1007,18 +1027,20 @@ struct perf_sample_data { * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ - u64 addr; - struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; + u64 sample_flags; u64 period; - union perf_sample_weight weight; - u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + union perf_sample_weight weight; + union perf_mem_data_src data_src; + u64 txn; + u64 addr; + struct perf_raw_record *raw; + u64 type; u64 ip; struct { @@ -1056,13 +1078,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->addr = addr; - data->raw = NULL; - data->br_stack = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; - data->weight.full = 0; - data->data_src.val = PERF_MEM_NA; - data->txn = 0; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* @@ -1078,6 +1100,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } @@ -1702,4 +1725,30 @@ static inline void perf_lopwr_cb(bool mode) } #endif +#ifdef CONFIG_PERF_EVENTS +static inline bool branch_sample_no_flags(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; +} + +static inline bool branch_sample_no_cycles(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; +} + +static inline bool branch_sample_type(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; +} + +static inline bool branch_sample_hw_index(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + +static inline bool branch_sample_priv(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; +} +#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_PERF_EVENT_H */ diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b4381f255a5c..0df425bf9bd7 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -421,4 +421,46 @@ static inline void migrate_enable(void) { } #endif /* CONFIG_SMP */ +/** + * preempt_disable_nested - Disable preemption inside a normally preempt disabled section + * + * Use for code which requires preemption protection inside a critical + * section which has preemption disabled implicitly on non-PREEMPT_RT + * enabled kernels, by e.g.: + * - holding a spinlock/rwlock + * - soft interrupt context + * - regular interrupt handlers + * + * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft + * interrupt context and regular interrupt handlers are preemptible and + * only prevent migration. preempt_disable_nested() ensures that preemption + * is disabled for cases which require CPU local serialization even on + * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. + * + * The use cases are code sequences which are not serialized by a + * particular lock instance, e.g.: + * - seqcount write side critical sections where the seqcount is not + * associated to a particular lock and therefore the automatic + * protection mechanism does not work. This prevents a live lock + * against a preempting high priority reader. + * - RMW per CPU variable updates like vmstat. + */ +/* Macro to avoid header recursion hell vs. lockdep */ +#define preempt_disable_nested() \ +do { \ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ + preempt_disable(); \ + else \ + lockdep_assert_preemption_disabled(); \ +} while (0) + +/** + * preempt_enable_nested - Undo the effect of preempt_disable_nested() + */ +static __always_inline void preempt_enable_nested(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); +} + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 8f416c5e929e..c0ef596f340b 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -1,7 +1,7 @@ #ifndef __LINUX_RWLOCK_H #define __LINUX_RWLOCK_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/slab.h b/include/linux/slab.h index 0fefdf528e0d..6a613e65e78d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -29,6 +29,8 @@ #define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ #define SLAB_POISON ((slab_flags_t __force)0x00000800U) +/* Indicate a kmalloc slab */ +#define SLAB_KMALLOC ((slab_flags_t __force)0x00001000U) /* Align objs on cache lines */ #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ @@ -184,11 +186,25 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); + +/** + * ksize - Report actual allocation size of associated object + * + * @objp: Pointer returned from a prior kmalloc()-family allocation. + * + * This should not be used for writing beyond the originally requested + * allocation size. Either use krealloc() or round up the allocation size + * with kmalloc_size_roundup() prior to allocation. If this is used to + * access beyond the originally requested allocation size, UBSAN_BOUNDS + * and/or FORTIFY_SOURCE may trip, since they only know about the + * originally allocated size via the __alloc_size attribute. + */ size_t ksize(const void *objp); + #ifdef CONFIG_PRINTK bool kmem_valid_obj(void *object); void kmem_dump_obj(void *object); @@ -243,27 +259,17 @@ static inline unsigned int arch_slab_minalign(void) #ifdef CONFIG_SLAB /* - * The largest kmalloc size supported by the SLAB allocators is - * 32 megabyte (2^25) or the maximum allocatable page order if that is - * less than 32 MB. - * - * WARNING: Its not easy to increase this value since the allocators have - * to do various tricks to work around compiler limitations in order to - * ensure proper constant folding. + * SLAB and SLUB directly allocates requests fitting in to an order-1 page + * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ -#define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ - (MAX_ORDER + PAGE_SHIFT - 1) : 25) -#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif #endif #ifdef CONFIG_SLUB -/* - * SLUB directly allocates requests fitting in to an order-1 page - * (PAGE_SIZE*2). Larger requests are passed to the page allocator. - */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW @@ -415,10 +421,6 @@ static __always_inline unsigned int __kmalloc_index(size_t size, if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; - if (size <= 4 * 1024 * 1024) return 22; - if (size <= 8 * 1024 * 1024) return 23; - if (size <= 16 * 1024 * 1024) return 24; - if (size <= 32 * 1024 * 1024) return 25; if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant) BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()"); @@ -428,6 +430,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, /* Will never be reached. Needed because the compiler may complain */ return -1; } +static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ @@ -456,42 +459,22 @@ static __always_inline void kfree_bulk(size_t size, void **p) kmem_cache_free_bulk(NULL, size, p); } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; -#else -static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __kmalloc(size, flags); -} - -static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) -{ - return kmem_cache_alloc(s, flags); -} -#endif #ifdef CONFIG_TRACING -extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __alloc_size(3); - -#ifdef CONFIG_NUMA -extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment - __alloc_size(4); -#else -static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, size_t size) -{ - return kmem_cache_alloc_trace(s, gfpflags, size); -} -#endif /* CONFIG_NUMA */ +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_kmalloc_alignment + __alloc_size(4); #else /* CONFIG_TRACING */ -static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, - gfp_t flags, size_t size) +/* Save a function call when CONFIG_TRACING=n */ +static __always_inline __alloc_size(3) +void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -499,8 +482,9 @@ static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_ return ret; } -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +static __always_inline __alloc_size(4) +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); @@ -509,25 +493,11 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g } #endif /* CONFIG_TRACING */ -extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __alloc_size(1); +void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment + __alloc_size(1); -#ifdef CONFIG_TRACING -extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __alloc_size(1); -#else -static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, - unsigned int order) -{ - return kmalloc_order(size, flags, order); -} -#endif - -static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) -{ - unsigned int order = get_order(size); - return kmalloc_order_trace(size, flags, order); -} +void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment + __alloc_size(1); /** * kmalloc - allocate memory @@ -597,7 +567,7 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace( + return kmalloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); #endif @@ -605,23 +575,35 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } +#ifndef CONFIG_SLOB static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { -#ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE) { - unsigned int i = kmalloc_index(size); + if (__builtin_constant_p(size)) { + unsigned int index; - if (!i) + if (size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + index = kmalloc_index(size); + + if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace( - kmalloc_caches[kmalloc_type(flags)][i], - flags, node, size); + return kmalloc_node_trace( + kmalloc_caches[kmalloc_type(flags)][index], + flags, node, size); } -#endif return __kmalloc_node(size, flags, node); } +#else +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size) && size > KMALLOC_MAX_CACHE_SIZE) + return kmalloc_large_node(size, flags, node); + + return __kmalloc_node(size, flags, node); +} +#endif /** * kmalloc_array - allocate memory for an array. @@ -647,10 +629,10 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, - size_t new_n, - size_t new_size, - gfp_t flags) +static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -671,6 +653,12 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag return kmalloc_array(n, size, flags | __GFP_ZERO); } +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +#define kmalloc_node_track_caller(size, flags, node) \ + __kmalloc_node_track_caller(size, flags, node, \ + _RET_IP_) + /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead @@ -679,9 +667,9 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, _RET_IP_) + __kmalloc_node_track_caller(size, flags, \ + NUMA_NO_NODE, _RET_IP_) static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, int node) @@ -700,21 +688,6 @@ static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } - -#ifdef CONFIG_NUMA -extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node_track_caller(size, flags, node, \ - _RET_IP_) - -#else /* CONFIG_NUMA */ - -#define kmalloc_node_track_caller(size, flags, node) \ - kmalloc_track_caller(size, flags) - -#endif /* CONFIG_NUMA */ - /* * Shortcuts */ @@ -774,11 +747,28 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla } extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __alloc_size(3); + __realloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); + +/** + * kmalloc_size_roundup - Report allocation bucket size for the given size + * + * @size: Number of bytes to round up from. + * + * This returns the number of bytes that would be available in a kmalloc() + * allocation of @size bytes. For example, a 126 byte request would be + * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly + * for the general-purpose kmalloc()-based allocations, and is not for the + * pre-sized kmem_cache_alloc()-based allocations.) + * + * Use this to kmalloc() the full bucket size ahead of time instead of using + * ksize() to query the size after an allocation. + */ +size_t kmalloc_size_roundup(size_t size); + void __init kmem_cache_init_late(void); #if defined(CONFIG_SMP) && defined(CONFIG_SLAB) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 5c0c5174155d..1341f7d62da4 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SPINLOCK_H #define __LINUX_SPINLOCK_H +#define __LINUX_INSIDE_SPINLOCK_H /* * include/linux/spinlock.h - generic spinlock/rwlock declarations @@ -492,4 +493,5 @@ int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, void free_bucket_spinlocks(spinlock_t *locks); +#undef __LINUX_INSIDE_SPINLOCK_H #endif /* __LINUX_SPINLOCK_H */ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 51fa0dab68c4..89eb6f4c659c 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_SMP_H #define __LINUX_SPINLOCK_API_SMP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index b8ba00ccccde..819aeba1c87e 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_API_UP_H #define __LINUX_SPINLOCK_API_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h index 835aedaf68ac..61c49b16f69a 100644 --- a/include/linux/spinlock_rt.h +++ b/include/linux/spinlock_rt.h @@ -2,7 +2,7 @@ #ifndef __LINUX_SPINLOCK_RT_H #define __LINUX_SPINLOCK_RT_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H #error Do not include directly. Use spinlock.h #endif diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 16521074b6f7..c87204247592 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_UP_H #define __LINUX_SPINLOCK_UP_H -#ifndef __LINUX_SPINLOCK_H +#ifndef __LINUX_INSIDE_SPINLOCK_H # error "please don't include this file directly" #endif diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index 6ad4e9032d53..46040d66334a 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -8,7 +8,7 @@ * * Key points : * - * - Use a seqcount on 32-bit SMP, only disable preemption for 32-bit UP. + * - Use a seqcount on 32-bit * - The whole thing is a no-op on 64-bit architectures. * * Usage constraints: @@ -20,7 +20,8 @@ * writer and also spin forever. * * 3) Write side must use the _irqsave() variant if other writers, or a reader, - * can be invoked from an IRQ context. + * can be invoked from an IRQ context. On 64bit systems this variant does not + * disable interrupts. * * 4) If reader fetches several counters, there is no guarantee the whole values * are consistent w.r.t. each other (remember point #2: seqcounts are not @@ -29,11 +30,6 @@ * 5) Readers are allowed to sleep or be preempted/interrupted: they perform * pure reads. * - * 6) Readers must use both u64_stats_fetch_{begin,retry}_irq() if the stats - * might be updated from a hardirq or softirq context (remember point #1: - * seqcounts are not used for UP kernels). 32-bit UP stat readers could read - * corrupted 64-bit values otherwise. - * * Usage : * * Stats producer (writer) should use following template granted it already got @@ -66,7 +62,7 @@ #include struct u64_stats_sync { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) +#if BITS_PER_LONG == 32 seqcount_t seq; #endif }; @@ -98,7 +94,22 @@ static inline void u64_stats_inc(u64_stats_t *p) local64_inc(&p->v); } -#else +static inline void u64_stats_init(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { } +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { } +static inline unsigned long __u64_stats_irqsave(void) { return 0; } +static inline void __u64_stats_irqrestore(unsigned long flags) { } +static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ + return 0; +} +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return false; +} + +#else /* 64 bit */ typedef struct { u64 v; @@ -123,123 +134,95 @@ static inline void u64_stats_inc(u64_stats_t *p) { p->v++; } -#endif -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) -#define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) -#else static inline void u64_stats_init(struct u64_stats_sync *syncp) { + seqcount_init(&syncp->seq); } -#endif -static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); write_seqcount_begin(&syncp->seq); -#endif } -static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif + preempt_enable_nested(); } -static inline unsigned long -u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +static inline unsigned long __u64_stats_irqsave(void) { - unsigned long flags = 0; + unsigned long flags; -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); - else - local_irq_save(flags); - write_seqcount_begin(&syncp->seq); -#endif + local_irq_save(flags); return flags; } -static inline void -u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, - unsigned long flags) +static inline void __u64_stats_irqrestore(unsigned long flags) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - write_seqcount_end(&syncp->seq); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); - else - local_irq_restore(flags); -#endif + local_irq_restore(flags); } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); -#else - return 0; -#endif +} + +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ + return read_seqcount_retry(&syncp->seq, start); +} +#endif /* !64 bit */ + +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) +{ + __u64_stats_update_begin(syncp); +} + +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) +{ + __u64_stats_update_end(syncp); +} + +static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) +{ + unsigned long flags = __u64_stats_irqsave(); + + __u64_stats_update_begin(syncp); + return flags; +} + +static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) +{ + __u64_stats_update_end(syncp); + __u64_stats_irqrestore(flags); } static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_disable(); -#endif return __u64_stats_fetch_begin(syncp); } -static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) -{ -#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) - return read_seqcount_retry(&syncp->seq, start); -#else - return false; -#endif -} - static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) - preempt_enable(); -#endif return __u64_stats_fetch_retry(syncp, start); } -/* - * In case irq handlers can update u64 counters, readers can use following helpers - * - SMP 32bit arches use seqcount protection, irq safe. - * - UP 32bit must disable irqs. - * - 64bit have no problem atomically reading u64 values, irq safe. - */ +/* Obsolete interfaces */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_disable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_disable(); -#endif - return __u64_stats_fetch_begin(syncp); + return u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) - preempt_enable(); -#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) - local_irq_enable(); -#endif - return __u64_stats_fetch_retry(syncp, start); + return u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 4cb51ace600d..243073cfc29d 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -9,73 +9,15 @@ #include #include -DECLARE_EVENT_CLASS(kmem_alloc, +TRACE_EVENT(kmem_cache_alloc, TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( unsigned long, gfp_flags ) - __field( bool, accounted ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = (__force unsigned long)gfp_flags; - __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? - ((gfp_flags & __GFP_ACCOUNT) || - (s && s->flags & SLAB_ACCOUNT)) : false; - ), - - TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s accounted=%s", - (void *)__entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - show_gfp_flags(__entry->gfp_flags), - __entry->accounted ? "true" : "false") -); - -DEFINE_EVENT(kmem_alloc, kmalloc, - - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags) -); - -DEFINE_EVENT(kmem_alloc, kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s, - size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags) -); - -DECLARE_EVENT_CLASS(kmem_alloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - struct kmem_cache *s, - size_t bytes_req, - size_t bytes_alloc, gfp_t gfp_flags, int node), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node), + TP_ARGS(call_site, ptr, s, gfp_flags, node), TP_STRUCT__entry( __field( unsigned long, call_site ) @@ -90,13 +32,13 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; + __entry->bytes_req = s->object_size; + __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; __entry->node = node; __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ? ((gfp_flags & __GFP_ACCOUNT) || - (s && s->flags & SLAB_ACCOUNT)) : false; + (s->flags & SLAB_ACCOUNT)) : false; ), TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", @@ -109,22 +51,44 @@ DECLARE_EVENT_CLASS(kmem_alloc_node, __entry->accounted ? "true" : "false") ); -DEFINE_EVENT(kmem_alloc_node, kmalloc_node, +TRACE_EVENT(kmalloc, - TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, - gfp_t gfp_flags, int node), + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) -); + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), -DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node, + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( unsigned long, gfp_flags ) + __field( int, node ) + ), - TP_PROTO(unsigned long call_site, const void *ptr, - struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc, - gfp_t gfp_flags, int node), + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = (__force unsigned long)gfp_flags; + __entry->node = node; + ), - TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node) + TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", + (void *)__entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + show_gfp_flags(__entry->gfp_flags), + __entry->node, + (IS_ENABLED(CONFIG_MEMCG_KMEM) && + (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false") ); TRACE_EVENT(kfree, @@ -149,20 +113,20 @@ TRACE_EVENT(kfree, TRACE_EVENT(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr, const char *name), + TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s), - TP_ARGS(call_site, ptr, name), + TP_ARGS(call_site, ptr, s), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) - __string( name, name ) + __string( name, s->name ) ), TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; - __assign_str(name, name); + __assign_str(name, s->name); ), TP_printk("call_site=%pS ptr=%p name=%s", diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 03b370062741..85be78e0e7f6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -164,8 +164,6 @@ enum perf_event_sample_format { PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ - - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) @@ -204,6 +202,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -233,6 +233,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -253,9 +255,48 @@ enum { PERF_BR_COND_RET = 10, /* conditional function return */ PERF_BR_ERET = 11, /* exception return */ PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ PERF_BR_MAX, }; +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1295,7 +1336,9 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0xa available */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ @@ -1313,7 +1356,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_SHIFT 19 #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -/* 1 free */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ #define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ @@ -1363,6 +1406,7 @@ union perf_mem_data_src { * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) * type: branch type + * spec: branch speculation info (or 0 if not supported) */ struct perf_branch_entry { __u64 from; @@ -1373,7 +1417,10 @@ struct perf_branch_entry { abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ type:4, /* branch type */ - reserved:40; + spec:2, /* branch speculation info */ + new_type:4, /* additional branch type */ + priv:3, /* privilege level */ + reserved:31; }; union perf_sample_weight { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1adbe67cdb95..aecea7451b61 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); @@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, int err = -EINVAL; __u64 nr_kernel; - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 8591c180b52b..91a62f566743 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cbebca5a79f..6be9b3e9b238 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; @@ -2224,16 +2226,22 @@ static inline int __pmu_filter_match(struct perf_event *event) static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; + unsigned long flags; + int ret = 1; if (!__pmu_filter_match(event)) return 0; + local_irq_save(flags); for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; + if (!__pmu_filter_match(sibling)) { + ret = 0; + break; + } } + local_irq_restore(flags); - return 1; + return ret; } static inline int @@ -6780,11 +6788,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6813,7 +6820,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -6962,11 +6969,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7048,14 +7050,14 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7298,6 +7300,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7305,7 +7308,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. + */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); @@ -7313,7 +7321,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; @@ -7325,7 +7333,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7341,6 +7349,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; @@ -7348,8 +7357,8 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { - if (perf_sample_save_hw_index(event)) + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr @@ -7398,6 +7407,20 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7413,7 +7436,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF @@ -9984,8 +10008,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10011,7 +10043,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* @@ -10928,7 +10960,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10939,7 +10971,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10950,7 +10982,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11704,11 +11736,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..c3797701339c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -17,61 +17,276 @@ * This file contains the arch-independent routines. */ +#include + +#include +#include +#include +#include +#include #include -#include -#include -#include #include #include -#include +#include +#include +#include #include +#include #include -#include #include -#include -#include -#include -#include -#include /* - * Constraints data + * Datastructure to track the total uses of N slots across tasks or CPUs; + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. + */ +struct bp_slots_histogram { +#ifdef hw_breakpoint_slots + atomic_t count[hw_breakpoint_slots(0)]; +#else + atomic_t *count; +#endif +}; + +/* + * Per-CPU constraints data. */ struct bp_cpuinfo { - /* Number of pinned cpu breakpoints in a cpu */ - unsigned int cpu_pinned; - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ - unsigned int *tsk_pinned; - /* Number of non-pinned cpu/task breakpoints in a cpu */ - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ + /* Number of pinned CPU breakpoints in a CPU. */ + unsigned int cpu_pinned; + /* Histogram of pinned task breakpoints in a CPU. */ + struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX]; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } +/* Number of pinned CPU breakpoints globally. */ +static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; +/* Number of pinned CPU-independent task breakpoints. */ +static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; + /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); - -static int constraints_initialized; - -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, }; -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); +static bool constraints_initialized __ro_after_init; -__weak int hw_breakpoint_weight(struct perf_event *bp) +/* + * Synchronizes accesses to the per-CPU constraints; the locking rules are: + * + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock + * (due to bp_slots_histogram::count being atomic, no update are lost). + * + * 2. Holding a write-lock is required for computations that require a + * stable snapshot of all bp_cpuinfo::tsk_pinned. + * + * 3. In all other cases, non-atomic accesses require the appropriately held + * lock (read-lock for read-only accesses; write-lock for reads/writes). + */ +DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); + +/* + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since + * rhltable synchronizes concurrent insertions/deletions, independent tasks may + * insert/delete concurrently; therefore, a mutex per task is sufficient. + * + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is + * that hw_breakpoint may contend with per-task perf event list management. The + * assumption is that perf usecases involving hw_breakpoints are very unlikely + * to result in unnecessary contention. + */ +static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; + + return tsk ? &tsk->perf_event_mutex : NULL; +} + +static struct mutex *bp_constraints_lock(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) { + /* + * Fully analogous to the perf_try_init_event() nesting + * argument in the comment near perf_event_ctx_lock_nested(); + * this child->perf_event_mutex cannot ever deadlock against + * the parent->perf_event_mutex usage from + * perf_event_task_{en,dis}able(). + * + * Specifically, inherited events will never occur on + * ->perf_event_list. + */ + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); + percpu_down_read(&bp_cpuinfo_sem); + } else { + percpu_down_write(&bp_cpuinfo_sem); + } + + return tsk_mtx; +} + +static void bp_constraints_unlock(struct mutex *tsk_mtx) +{ + if (tsk_mtx) { + percpu_up_read(&bp_cpuinfo_sem); + mutex_unlock(tsk_mtx); + } else { + percpu_up_write(&bp_cpuinfo_sem); + } +} + +static bool bp_constraints_is_locked(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + return percpu_is_write_locked(&bp_cpuinfo_sem) || + (tsk_mtx ? mutex_is_locked(tsk_mtx) : + percpu_is_read_locked(&bp_cpuinfo_sem)); +} + +static inline void assert_bp_constraints_lock_held(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) + lockdep_assert_held(tsk_mtx); + lockdep_assert_held(&bp_cpuinfo_sem); +} + +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. + */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init bool +bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); + return hist->count; +} + +static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) +{ + kfree(hist->count); +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) + goto err; + } + } + for (i = 0; i < TYPE_MAX; i++) { + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) + goto err; + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) + goto err; + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + for (i = 0; i < TYPE_MAX; i++) { + bp_slots_histogram_free(&cpu_pinned[i]); + bp_slots_histogram_free(&tsk_pinned_all[i]); + } + + return -ENOMEM; +} +#endif + +static inline void +bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) +{ + const int old_idx = old - 1; + const int new_idx = old_idx + val; + + if (old_idx >= 0) + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); + if (new_idx >= 0) + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); +} + +static int +bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count = atomic_read(&hist->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); + if (count > 0) + return i + 1; + WARN(count < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +static int +bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, + enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count1 = atomic_read(&hist1->count[i]); + const int count2 = atomic_read(&hist2->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); + if (count1 + count2 > 0) + return i + 1; + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { @@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) } /* - * Report the maximum number of pinned breakpoints a task - * have in this cpu + * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int i; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; + /* + * At this point we want to have acquired the bp_cpuinfo_sem as a + * writer to ensure that there are no concurrent writers in + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. + */ + lockdep_assert_held_write(&bp_cpuinfo_sem); + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. + * + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, + * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && - (iter->cpu < 0 || cpu == iter->cpu)) - count += hw_breakpoint_weight(iter); + /* + * We need a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) != type) + continue; + + if (iter->cpu >= 0) { + if (cpu == -1) { + count = -1; + goto out; + } else if (cpu != iter->cpu) + continue; + } + + count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp) } /* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). + * Returns the max pinned breakpoint slots in a given + * CPU (cpu > -1) or across all of them (cpu = -1). */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) +static int +max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); + int pinned_slots = 0; int cpu; + if (bp->hw.target && bp->cpu < 0) { + int max_pinned = task_bp_pinned(-1, bp, type); + + if (max_pinned >= 0) { + /* + * Fast path: task_bp_pinned() is CPU-independent and + * returns the same value for any CPU. + */ + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); + return max_pinned; + } + } + for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; @@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, else nr += task_bp_pinned(cpu, bp, type); - if (nr > slots->pinned) - slots->pinned = nr; - - nr = info->flexible; - if (nr > slots->flexible) - slots->flexible = nr; + pinned_slots = max(nr, pinned_slots); } -} -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int old_idx, new_idx; - - old_idx = task_bp_pinned(cpu, bp, type) - 1; - new_idx = old_idx + weight; - - if (old_idx >= 0) - tsk_pinned[old_idx]--; - if (new_idx >= 0) - tsk_pinned[new_idx]++; + return pinned_slots; } /* * Add/remove the given breakpoint in our constraint table */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) +static int +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - const struct cpumask *cpumask = cpumask_of_bp(bp); - int cpu; + int cpu, next_tsk_pinned; if (!enable) weight = -weight; - /* Pinned counter cpu profiling */ if (!bp->hw.target) { - get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + /* + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the + * global histogram. + */ + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + + lockdep_assert_held_write(&bp_cpuinfo_sem); + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; + return 0; } - /* Pinned counter task profiling */ - for_each_cpu(cpu, cpumask) - toggle_bp_task_slot(bp, cpu, type, weight); + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + + /* + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global + * histogram. We need to take care of 4 cases: + * + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only + * exist other task breakpoints targeting all CPUs. In this case we + * can simply update the global slots histogram. + * + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may + * only exist other task breakpoints targeting all CPUs. + * + * a. On enable: remove the existing breakpoints from the global + * slots histogram and use the per-CPU histogram. + * + * b. On disable: re-insert the existing breakpoints into the global + * slots histogram and remove from per-CPU histogram. + * + * 3. Some other existing task breakpoints target specific CPUs. Only + * update the per-CPU slots histogram. + */ + + if (!enable) { + /* + * Remove before updating histograms so we can determine if this + * was the last task breakpoint for a specific CPU. + */ + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + if (ret) + return ret; + } + /* + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. + */ + next_tsk_pinned = task_bp_pinned(-1, bp, type); + + if (next_tsk_pinned >= 0) { + if (bp->cpu < 0) { /* Case 1: fast path */ + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); + } else if (enable) { /* Case 2.a: slow path */ + /* Add existing to per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + 0, next_tsk_pinned); + } + /* Add this first CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, + -next_tsk_pinned); + } else { /* Case 2.b: slow path */ + /* Remove this last CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned + hw_breakpoint_weight(bp), weight); + /* Remove all from per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, -next_tsk_pinned); + } + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); + } + } else { /* Case 3: slow path */ + const struct cpumask *cpumask = cpumask_of_bp(bp); + + for_each_cpu(cpu, cpumask) { + next_tsk_pinned = task_bp_pinned(cpu, bp, type); + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + } + } + + /* + * Readers want a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); - else - list_del(&bp->hw.bp_list); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + return 0; } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Constraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter. + * + * Note: Flexible breakpoints are currently unimplemented, but outlined in the + * below algorithm for completeness. The implementation treats flexible as + * pinned due to no guarantee that we currently always schedule flexible events + * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * @@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { - struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int max_pinned_slots; int weight; int ret; @@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. - */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + /* Check if this new breakpoint can be satisfied across all CPUs. */ + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); if (ret) return ret; - toggle_bp_slot(bp, true, type, weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) { - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); + bp_constraints_unlock(mtx); return ret; } @@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) { - mutex_lock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) @@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { - int ret; + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __modify_bp_slot(bp, old_type, new_type); - mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type, new_type); - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); return ret; } @@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) */ int dbg_reserve_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + int ret; + + if (bp_constraints_is_locked(bp)) return -1; - return __reserve_bp_slot(bp, bp->attr.bp_type); + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); + + return ret; } int dbg_release_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + if (bp_constraints_is_locked(bp)) return -1; + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); return 0; } @@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. + */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + if (atomic_read(&info->tsk_pinned.count[slot])) + return true; + } + } + } + + for (int type = 0; type < TYPE_MAX; ++type) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + /* + * Warn, because if there are CPU pinned counters, + * should never get here; bp_cpuinfo::cpu_pinned should + * be consistent with the global cpu_pinned histogram. + */ + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) + return true; + + if (atomic_read(&tsk_pinned_all[type].count[slot])) + return true; + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ @@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i; + int ret; - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + return ret; - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); + ret = init_breakpoint_slots(); + if (ret) + return ret; - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; - } - } - - constraints_initialized = 1; + constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return -ENOMEM; } - - diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c new file mode 100644 index 000000000000..5ced822df788 --- /dev/null +++ b/kernel/events/hw_breakpoint_test.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for hw_breakpoint constraints accounting logic. + * + * Copyright (C) 2022, Google LLC. + */ + +#include +#include +#include +#include +#include +#include + +#define TEST_REQUIRES_BP_SLOTS(test, slots) \ + do { \ + if ((slots) > get_test_bp_slots()) { \ + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ + get_test_bp_slots()); \ + } \ + } while (0) + +#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) + +#define MAX_TEST_BREAKPOINTS 512 + +static char break_vars[MAX_TEST_BREAKPOINTS]; +static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; +static struct task_struct *__other_task; + +static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) +{ + struct perf_event_attr attr = {}; + + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) + return NULL; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)&break_vars[idx]; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_RW; + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); +} + +static void unregister_test_bp(struct perf_event **bp) +{ + if (WARN_ON(IS_ERR(*bp))) + return; + if (WARN_ON(!*bp)) + return; + unregister_hw_breakpoint(*bp); + *bp = NULL; +} + +static int get_test_bp_slots(void) +{ + static int slots; + + if (!slots) + slots = hw_breakpoint_slots(TYPE_DATA); + + return slots; +} + +static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) +{ + struct perf_event *bp = register_test_bp(cpu, tsk, *id); + + KUNIT_ASSERT_NOT_NULL(test, bp); + KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); + KUNIT_ASSERT_NULL(test, test_bps[*id]); + test_bps[(*id)++] = bp; +} + +/* + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. + * + * Returns true if this can be called again, continuing at @id. + */ +static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) +{ + for (int i = 0; i < get_test_bp_slots() - skip; ++i) + fill_one_bp_slot(test, id, cpu, tsk); + + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; +} + +static int dummy_kthread(void *arg) +{ + return 0; +} + +static struct task_struct *get_other_task(struct kunit *test) +{ + struct task_struct *tsk; + + if (__other_task) + return __other_task; + + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); + __other_task = tsk; + return __other_task; +} + +static int get_test_cpu(int num) +{ + int cpu; + + WARN_ON(num < 0); + + for_each_online_cpu(cpu) { + if (num-- <= 0) + break; + } + + return cpu; +} + +/* ===== Test cases ===== */ + +static void test_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_many_cpus(struct kunit *test) +{ + int idx = 0; + int cpu; + + /* Test that CPUs are independent. */ + for_each_online_cpu(cpu) { + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); + + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); + if (!do_continue) + break; + } +} + +static void test_one_task_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, -1, current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one and adding back CPU-target should work. */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_two_tasks_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + /* Test that tasks are independent. */ + fill_bp_slots(test, &idx, -1, current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one from first task and adding back CPU-target should not work. */ + unregister_test_bp(&test_bps[0]); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_one_task_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* + * Remove one and adding back CPU-target should work; this case is + * special vs. above because the task's constraints are CPU-dependent. + */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_one_task_mixed(struct kunit *test) +{ + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_bp_slots(test, &idx, -1, current, 1); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* Transition from CPU-dependent pinned count to CPU-independent. */ + unregister_test_bp(&test_bps[0]); + unregister_test_bp(&test_bps[1]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_two_tasks_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Can still create breakpoints on some other CPU. */ + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); +} + +static void test_two_tasks_on_one_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Cannot create breakpoints on some other CPU either. */ + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static void test_task_on_all_and_one_cpu(struct kunit *test) +{ + int tsk_on_cpu_idx, cpu_idx; + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_bp_slots(test, &idx, -1, current, 2); + /* Transitioning from only all CPU breakpoints to mixed. */ + tsk_on_cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* We should still be able to use up another CPU's slots. */ + cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); + + /* Transitioning back to task target on all CPUs. */ + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); + /* Still have a CPU target breakpoint in get_test_cpu(1). */ + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + /* Remove it and try again. */ + unregister_test_bp(&test_bps[cpu_idx]); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static struct kunit_case hw_breakpoint_test_cases[] = { + KUNIT_CASE(test_one_cpu), + KUNIT_CASE(test_many_cpus), + KUNIT_CASE(test_one_task_on_all_cpus), + KUNIT_CASE(test_two_tasks_on_all_cpus), + KUNIT_CASE(test_one_task_on_one_cpu), + KUNIT_CASE(test_one_task_mixed), + KUNIT_CASE(test_two_tasks_on_one_cpu), + KUNIT_CASE(test_two_tasks_on_one_all_cpus), + KUNIT_CASE(test_task_on_all_and_one_cpu), + {}, +}; + +static int test_init(struct kunit *test) +{ + /* Most test cases want 2 distinct CPUs. */ + if (num_online_cpus() < 2) + return -EINVAL; + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + return -EBUSY; + + return 0; +} + +static void test_exit(struct kunit *test) +{ + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { + if (test_bps[i]) + unregister_test_bp(&test_bps[i]); + } + + if (__other_task) { + kthread_stop(__other_task); + __other_task = NULL; + } + + /* Verify that internal state agrees that no breakpoints are in use. */ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); +} + +static struct kunit_suite hw_breakpoint_test_suite = { + .name = "hw_breakpoint", + .test_cases = hw_breakpoint_test_cases, + .init = test_init, + .exit = test_exit, +}; + +kunit_test_suites(&hw_breakpoint_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Elver "); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 64a13eb56078..e3375bc40dad 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -934,8 +934,10 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name && - lock->key != &__lockdep_no_validate__); + WARN_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__, + "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n", + lock->name, lock->key, class->name); return class; } } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 5fe4c5495ba3..185bd1c906b0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2e1600906c9f..d2ef312a8611 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -18,7 +18,7 @@ * queued_read_lock_slowpath - acquire read lock of a queued rwlock * @lock: Pointer to queued rwlock structure */ -void queued_read_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -63,7 +63,7 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); * queued_write_lock_slowpath - acquire write lock of a queued rwlock * @lock : Pointer to queued rwlock structure */ -void queued_write_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 65a9a10caa6f..2b23378775fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -313,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ -void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; u32 old, tail; diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e84d21aa0722..6afc249ce697 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -489,7 +489,7 @@ gotlock: * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ -__visible void +__visible __lockfunc void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct pv_node *node; @@ -544,7 +544,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #include #ifndef __pv_queued_spin_unlock -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { u8 locked; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 65f0262f635e..44873594de03 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -133,14 +133,19 @@ * the owner value concurrently without lock. Read from owner, however, * may not need READ_ONCE() as long as the pointer value is only used * for comparison and isn't being dereferenced. + * + * Both rwsem_{set,clear}_owner() functions should be in the same + * preempt disable section as the atomic op that changes sem->count. */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, 0); } @@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) static inline bool rwsem_write_trylock(struct rw_semaphore *sem) { long tmp = RWSEM_UNLOCKED_VALUE; + bool ret = false; + preempt_disable(); if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - return true; + ret = true; } - return false; + preempt_enable(); + return ret; } /* @@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + preempt_enable(); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index f2654d2fe43a..34bfae72f295 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -51,7 +51,7 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { unsigned long flags; @@ -74,7 +74,7 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. */ -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -101,7 +101,7 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int __sched down_killable(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -131,7 +131,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int __sched down_trylock(struct semaphore *sem) { unsigned long flags; int count; @@ -156,7 +156,7 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long timeout) +int __sched down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -180,7 +180,7 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). */ -void up(struct semaphore *sem) +void __sched up(struct semaphore *sem) { unsigned long flags; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 7f49baaa4979..8475a0794f8c 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -133,7 +133,7 @@ BUILD_LOCK_OPS(write, rwlock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { return __raw_spin_trylock(lock); } @@ -141,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { return __raw_spin_trylock_bh(lock); } @@ -149,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { __raw_spin_lock(lock); } @@ -157,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { return __raw_spin_lock_irqsave(lock); } @@ -165,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { __raw_spin_lock_irq(lock); } @@ -173,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { __raw_spin_lock_bh(lock); } @@ -181,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); } @@ -189,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_unlock_irqrestore(lock, flags); } @@ -197,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { __raw_spin_unlock_irq(lock); } @@ -205,7 +205,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { __raw_spin_unlock_bh(lock); } @@ -215,7 +215,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_bh); #ifndef CONFIG_PREEMPT_RT #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_read_trylock(rwlock_t *lock) { return __raw_read_trylock(lock); } @@ -223,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock(rwlock_t *lock) { __raw_read_lock(lock); } @@ -231,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { return __raw_read_lock_irqsave(lock); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { __raw_read_lock_irq(lock); } @@ -247,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { __raw_read_lock_bh(lock); } @@ -255,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock(rwlock_t *lock) { __raw_read_unlock(lock); } @@ -263,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_read_unlock_irqrestore(lock, flags); } @@ -271,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { __raw_read_unlock_irq(lock); } @@ -279,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { __raw_read_unlock_bh(lock); } @@ -287,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_write_trylock(rwlock_t *lock) { return __raw_write_trylock(lock); } @@ -295,7 +295,7 @@ EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock(rwlock_t *lock) { __raw_write_lock(lock); } @@ -313,7 +313,7 @@ EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { return __raw_write_lock_irqsave(lock); } @@ -321,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { __raw_write_lock_irq(lock); } @@ -329,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { __raw_write_lock_bh(lock); } @@ -337,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock(rwlock_t *lock) { __raw_write_unlock(lock); } @@ -345,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_write_unlock_irqrestore(lock, flags); } @@ -353,7 +353,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { __raw_write_unlock_irq(lock); } @@ -361,7 +361,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { __raw_write_unlock_bh(lock); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 688552df95ca..49fb9ec8366d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1706,6 +1706,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + if (unlikely(!br_stack)) return -ENOENT; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a05cd9dc5f8b..3761118d1879 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -805,6 +805,9 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. +config DEBUG_VM_IRQSOFF + def_bool DEBUG_VM && !PREEMPT_RT + config DEBUG_VM bool "Debug VM" depends on DEBUG_KERNEL @@ -2556,6 +2559,16 @@ config FORTIFY_KUNIT_TEST by the str*() and mem*() family of functions. For testing runtime traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests. +config HW_BREAKPOINT_KUNIT_TEST + bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS + depends on HAVE_HW_BREAKPOINT + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Tests for hw_breakpoint constraints accounting. + + If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" help diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index 05cccbcf1661..83332fefa6f4 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -70,6 +70,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) */ if (events <= 1) return false; + preempt_disable_nested(); write_seqcount_begin(&p->sequence); if (periods < 64) events -= events >> periods; @@ -77,6 +78,7 @@ bool fprop_new_period(struct fprop_global *p, int periods) percpu_counter_add(&p->events, -events); p->period += periods; write_seqcount_end(&p->sequence); + preempt_enable_nested(); return true; } diff --git a/mm/Kconfig b/mm/Kconfig index 0331f1461f81..3897e924e40f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -579,6 +579,12 @@ config COMPACTION it and then we would be really interested to hear about that at linux-mm@kvack.org. +config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION + default 0 if PREEMPT_RT + default 1 + # # support for free page reporting config PAGE_REPORTING diff --git a/mm/compaction.c b/mm/compaction.c index 640fa76228dd..10561cb1aaad 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1727,11 +1727,7 @@ typedef enum { * Allow userspace to control policy on scanning the unevictable LRU for * compactable pages. */ -#ifdef CONFIG_PREEMPT_RT -int sysctl_compact_unevictable_allowed __read_mostly = 0; -#else -int sysctl_compact_unevictable_allowed __read_mostly = 1; -#endif +int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT; static inline void update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) diff --git a/mm/kfence/report.c b/mm/kfence/report.c index f5a6d8ba3e21..7e496856c2eb 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -86,6 +86,7 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries /* Also the *_bulk() variants by only checking prefixes. */ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9d054e3767ce..cf405510cd17 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -597,25 +597,18 @@ static u64 flush_next_time; */ static void memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#else - VM_BUG_ON(!irqs_disabled()); -#endif + preempt_disable_nested(); + VM_WARN_ON_IRQS_ENABLED(); } static void __memcg_stats_lock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_disable(); -#endif + preempt_disable_nested(); } static void memcg_stats_unlock(void) { -#ifdef CONFIG_PREEMPT_RT - preempt_enable(); -#endif + preempt_enable_nested(); } static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) @@ -715,7 +708,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, * interrupt context while other caller need to have disabled interrupt. */ __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { + if (IS_ENABLED(CONFIG_DEBUG_VM)) { switch (idx) { case NR_ANON_MAPPED: case NR_FILE_MAPPED: @@ -725,7 +718,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, WARN_ON_ONCE(!in_task()); break; default: - WARN_ON_ONCE(!irqs_disabled()); + VM_WARN_ON_IRQS_ENABLED(); } } diff --git a/mm/slab.c b/mm/slab.c index 10e96137b44f..a5486ff8362a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3181,84 +3181,46 @@ must_grow: } static __always_inline void * -slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, - unsigned long caller) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - unsigned long save_flags; - void *ptr; + void *objp = NULL; int slab_node = numa_mem_id(); - struct obj_cgroup *objcg = NULL; - bool init = false; - flags &= gfp_allowed_mask; - cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags); - if (unlikely(!cachep)) - return NULL; - - ptr = kfence_alloc(cachep, orig_size, flags); - if (unlikely(ptr)) - goto out_hooks; - - local_irq_save(save_flags); - - if (nodeid == NUMA_NO_NODE) - nodeid = slab_node; - - if (unlikely(!get_node(cachep, nodeid))) { - /* Node not bootstrapped yet */ - ptr = fallback_alloc(cachep, flags); - goto out; - } - - if (nodeid == slab_node) { + if (nodeid == NUMA_NO_NODE) { + if (current->mempolicy || cpuset_do_slab_mem_spread()) { + objp = alternate_node_alloc(cachep, flags); + if (objp) + goto out; + } /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback * to other nodes. It may fail while we still have * objects on other nodes available. */ - ptr = ____cache_alloc(cachep, flags); - if (ptr) - goto out; + objp = ____cache_alloc(cachep, flags); + nodeid = slab_node; + } else if (nodeid == slab_node) { + objp = ____cache_alloc(cachep, flags); + } else if (!get_node(cachep, nodeid)) { + /* Node not bootstrapped yet */ + objp = fallback_alloc(cachep, flags); + goto out; } - /* ___cache_alloc_node can fall back to other nodes */ - ptr = ____cache_alloc_node(cachep, flags, nodeid); -out: - local_irq_restore(save_flags); - ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - init = slab_want_init_on_alloc(flags, cachep); - -out_hooks: - slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init); - return ptr; -} - -static __always_inline void * -__do_cache_alloc(struct kmem_cache *cache, gfp_t flags) -{ - void *objp; - - if (current->mempolicy || cpuset_do_slab_mem_spread()) { - objp = alternate_node_alloc(cache, flags); - if (objp) - goto out; - } - objp = ____cache_alloc(cache, flags); /* * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_mem_id()); - + objp = ____cache_alloc_node(cachep, flags, nodeid); out: return objp; } #else static __always_inline void * -__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused) { return ____cache_alloc(cachep, flags); } @@ -3266,8 +3228,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) #endif /* CONFIG_NUMA */ static __always_inline void * -slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, - size_t orig_size, unsigned long caller) +slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + int nodeid, size_t orig_size, unsigned long caller) { unsigned long save_flags; void *objp; @@ -3284,7 +3246,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, goto out; local_irq_save(save_flags); - objp = __do_cache_alloc(cachep, flags); + objp = __do_cache_alloc(cachep, flags, nodeid); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); prefetchw(objp); @@ -3295,6 +3257,14 @@ out: return objp; } +static __always_inline void * +slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, + size_t orig_size, unsigned long caller) +{ + return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size, + caller); +} + /* * Caller needs to acquire correct kmem_cache_node's list_lock * @list: List of detached free slabs should be freed by caller @@ -3470,8 +3440,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, flags); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE); return ret; } @@ -3521,7 +3490,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_irq_disable(); for (i = 0; i < size; i++) { - void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); + void *objp = kfence_alloc(s, s->object_size, flags) ?: + __do_cache_alloc(s, flags, NUMA_NO_NODE); if (unlikely(!objp)) goto error; @@ -3548,23 +3518,6 @@ error: } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -#ifdef CONFIG_TRACING -void * -kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) -{ - void *ret; - - ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, cachep, - size, cachep->size, flags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif - -#ifdef CONFIG_NUMA /** * kmem_cache_alloc_node - Allocate an object on the specified node * @cachep: The cache to allocate from. @@ -3580,65 +3533,21 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_); + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep, - cachep->object_size, cachep->size, - flags, nodeid); + trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid, - size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, + unsigned long caller) { - void *ret; - - ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc_node(_RET_IP_, ret, cachep, - size, cachep->size, - flags, nodeid); - return ret; + return slab_alloc_node(cachep, NULL, flags, nodeid, + orig_size, caller); } -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif - -static __always_inline void * -__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) -{ - struct kmem_cache *cachep; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); - ret = kasan_kmalloc(cachep, ret, size, flags); - - return ret; -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - return __do_kmalloc_node(size, flags, node, _RET_IP_); -} -EXPORT_SYMBOL(__kmalloc_node); - -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) -{ - return __do_kmalloc_node(size, flags, node, caller); -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif /* CONFIG_NUMA */ #ifdef CONFIG_PRINTK void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) @@ -3662,45 +3571,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) } #endif -/** - * __do_kmalloc - allocate memory - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). - * @caller: function caller for debug tracking of the caller - * - * Return: pointer to the allocated memory or %NULL in case of error - */ -static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, - unsigned long caller) +static __always_inline +void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - struct kmem_cache *cachep; - void *ret; + unsigned long flags; - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return NULL; - cachep = kmalloc_slab(size, flags); - if (unlikely(ZERO_OR_NULL_PTR(cachep))) - return cachep; - ret = slab_alloc(cachep, NULL, flags, size, caller); - - ret = kasan_kmalloc(cachep, ret, size, flags); - trace_kmalloc(caller, ret, cachep, - size, cachep->size, flags); - - return ret; + local_irq_save(flags); + debug_check_no_locks_freed(objp, cachep->object_size); + if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, cachep->object_size); + __cache_free(cachep, objp, caller); + local_irq_restore(flags); } -void *__kmalloc(size_t size, gfp_t flags) +void __kmem_cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) { - return __do_kmalloc(size, flags, _RET_IP_); + __do_kmem_cache_free(cachep, objp, caller); } -EXPORT_SYMBOL(__kmalloc); - -void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) -{ - return __do_kmalloc(size, flags, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); /** * kmem_cache_free - Deallocate an object @@ -3712,34 +3601,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller); */ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { - unsigned long flags; cachep = cache_from_obj(cachep, objp); if (!cachep) return; - trace_kmem_cache_free(_RET_IP_, objp, cachep->name); - local_irq_save(flags); - debug_check_no_locks_freed(objp, cachep->object_size); - if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(objp, cachep->object_size); - __cache_free(cachep, objp, _RET_IP_); - local_irq_restore(flags); + trace_kmem_cache_free(_RET_IP_, objp, cachep); + __do_kmem_cache_free(cachep, objp, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) { - struct kmem_cache *s; - size_t i; local_irq_disable(); - for (i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { void *objp = p[i]; + struct kmem_cache *s; - if (!orig_s) /* called via kfree_bulk */ - s = virt_to_cache(objp); - else + if (!orig_s) { + struct folio *folio = virt_to_folio(objp); + + /* called via kfree_bulk */ + if (!folio_test_slab(folio)) { + local_irq_enable(); + free_large_kmalloc(folio, objp); + local_irq_disable(); + continue; + } + s = folio_slab(folio)->slab_cache; + } else { s = cache_from_obj(orig_s, objp); + } + if (!s) continue; @@ -3755,39 +3648,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/** - * kfree - free previously allocated memory - * @objp: pointer returned by kmalloc. - * - * If @objp is NULL, no operation is performed. - * - * Don't free memory not originally allocated by kmalloc() - * or you will run into trouble. - */ -void kfree(const void *objp) -{ - struct kmem_cache *c; - unsigned long flags; - - trace_kfree(_RET_IP_, objp); - - if (unlikely(ZERO_OR_NULL_PTR(objp))) - return; - local_irq_save(flags); - kfree_debugcheck(objp); - c = virt_to_cache(objp); - if (!c) { - local_irq_restore(flags); - return; - } - debug_check_no_locks_freed(objp, c->object_size); - - debug_check_no_obj_freed(objp, c->object_size); - __cache_free(c, (void *)objp, _RET_IP_); - local_irq_restore(flags); -} -EXPORT_SYMBOL(kfree); - /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ @@ -4190,28 +4050,3 @@ void __check_heap_object(const void *ptr, unsigned long n, usercopy_abort("SLAB object", cachep->name, to_user, offset, n); } #endif /* CONFIG_HARDENED_USERCOPY */ - -/** - * __ksize -- Uninstrumented ksize. - * @objp: pointer to the object - * - * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same - * safety checks as ksize() with KASAN instrumentation enabled. - * - * Return: size of the actual memory used by @objp in bytes - */ -size_t __ksize(const void *objp) -{ - struct kmem_cache *c; - size_t size; - - BUG_ON(!objp); - if (unlikely(objp == ZERO_SIZE_PTR)) - return 0; - - c = virt_to_cache(objp); - size = c ? c->object_size : 0; - - return size; -} -EXPORT_SYMBOL(__ksize); diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ec..65023f000d42 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -273,6 +273,11 @@ void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); + +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller); +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); #endif gfp_t kmalloc_fix_flags(gfp_t flags); @@ -658,8 +663,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) print_tracking(cachep, x); return cachep; } + +void free_large_kmalloc(struct folio *folio, void *object); + #endif /* CONFIG_SLOB */ +size_t __ksize(const void *objp); + static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB diff --git a/mm/slab_common.c b/mm/slab_common.c index ccc02573588f..9ad97ae73a0a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -511,13 +511,9 @@ EXPORT_SYMBOL(kmem_cache_destroy); */ int kmem_cache_shrink(struct kmem_cache *cachep) { - int ret; - - kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep); - return ret; + return __kmem_cache_shrink(cachep); } EXPORT_SYMBOL(kmem_cache_shrink); @@ -665,7 +661,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, if (!s) panic("Out of memory when creating slab %s\n", name); - create_boot_cache(s, name, size, flags, useroffset, usersize); + create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset, + usersize); kasan_cache_create_kmalloc(s); list_add(&s->list, &slab_caches); s->refcount = 1; @@ -737,6 +734,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[kmalloc_type(flags)][index]; } +size_t kmalloc_size_roundup(size_t size) +{ + struct kmem_cache *c; + + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + /* Above the smaller buckets, size is a multiple of page size. */ + if (size > KMALLOC_MAX_CACHE_SIZE) + return PAGE_SIZE << get_order(size); + + /* The flags don't matter since size_index is common to all. */ + c = kmalloc_slab(size, GFP_KERNEL); + return c ? c->object_size : 0; +} +EXPORT_SYMBOL(kmalloc_size_roundup); + #ifdef CONFIG_ZONE_DMA #define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz, #else @@ -760,8 +777,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. - * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is - * kmalloc-32M. + * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is + * kmalloc-2M. */ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(0, 0), @@ -785,11 +802,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { INIT_KMALLOC_INFO(262144, 256k), INIT_KMALLOC_INFO(524288, 512k), INIT_KMALLOC_INFO(1048576, 1M), - INIT_KMALLOC_INFO(2097152, 2M), - INIT_KMALLOC_INFO(4194304, 4M), - INIT_KMALLOC_INFO(8388608, 8M), - INIT_KMALLOC_INFO(16777216, 16M), - INIT_KMALLOC_INFO(33554432, 32M) + INIT_KMALLOC_INFO(2097152, 2M) }; /* @@ -902,6 +915,155 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; } + +void free_large_kmalloc(struct folio *folio, void *object) +{ + unsigned int order = folio_order(folio); + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + + mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __free_pages(folio_page(folio, 0), order); +} + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node); +static __always_inline +void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node(size, flags, node); + trace_kmalloc(_RET_IP_, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + s = kmalloc_slab(size, flags); + + if (unlikely(ZERO_OR_NULL_PTR(s))) + return s; + + ret = __kmem_cache_alloc_node(s, flags, node, size, caller); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node); + return ret; +} + +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return __do_kmalloc_node(size, flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node); + +void *__kmalloc(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc); + +void *__kmalloc_node_track_caller(size_t size, gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, flags, node, caller); +} +EXPORT_SYMBOL(__kmalloc_node_track_caller); + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc. + * + * If @object is NULL, no operation is performed. + * + * Don't free memory not originally allocated by kmalloc() + * or you will run into trouble. + */ +void kfree(const void *object) +{ + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + folio = virt_to_folio(object); + if (unlikely(!folio_test_slab(folio))) { + free_large_kmalloc(folio, (void *)object); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + __kmem_cache_free(s, (void *)object, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/** + * __ksize -- Report full size of underlying allocation + * @objp: pointer to the object + * + * This should only be used internally to query the true size of allocations. + * It is not meant to be a way to discover the usable size of an allocation + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, + * and/or FORTIFY_SOURCE. + * + * Return: size of the actual memory used by @objp in bytes + */ +size_t __ksize(const void *object) +{ + struct folio *folio; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + folio = virt_to_folio(object); + + if (unlikely(!folio_test_slab(folio))) { + if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE)) + return 0; + if (WARN_ON(object != folio_address(folio))) + return 0; + return folio_size(folio); + } + + return slab_ksize(folio_slab(folio)->slab_cache); +} + +#ifdef CONFIG_TRACING +void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, + size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_trace); + +void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmalloc_node_trace); +#endif /* !CONFIG_TRACING */ #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -921,37 +1083,50 @@ gfp_t kmalloc_fix_flags(gfp_t flags) * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) + +static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) { - void *ret = NULL; struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); if (unlikely(flags & GFP_SLAB_BUG_MASK)) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - page = alloc_pages(flags, order); - if (likely(page)) { - ret = page_address(page); + page = alloc_pages_node(node, flags, order); + if (page) { + ptr = page_address(page); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); } - ret = kasan_kmalloc_large(ret, size, flags); - /* As ret might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ret, size, 1, flags); - return ret; -} -EXPORT_SYMBOL(kmalloc_order); -#ifdef CONFIG_TRACING -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + + return ptr; +} + +void *kmalloc_large(size_t size, gfp_t flags) { - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags); + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_order_trace); -#endif +EXPORT_SYMBOL(kmalloc_large); + +void *kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + void *ret = __kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); + return ret; +} +EXPORT_SYMBOL(kmalloc_large_node); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ @@ -1150,8 +1325,8 @@ module_init(slab_proc_init); #endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks; @@ -1283,8 +1458,6 @@ EXPORT_SYMBOL(ksize); /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index 2bd4f476c340..fe567fcfa3a3 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) *m = size; ret = (void *)m + minalign; - trace_kmalloc_node(caller, ret, NULL, - size, size + minalign, gfp, node); + trace_kmalloc(caller, ret, size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp |= __GFP_COMP; ret = slob_new_pages(gfp, order, node); - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << order, gfp, node); + trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node); } kmemleak_alloc(ret, size, 1, gfp); @@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp) } EXPORT_SYMBOL(__kmalloc); -void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) -{ - return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, int node, unsigned long caller) { return __do_kmalloc_node(size, gfp, node, caller); } EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif void kfree(const void *block) { @@ -574,6 +564,20 @@ void kfree(const void *block) } EXPORT_SYMBOL(kfree); +size_t kmalloc_size_roundup(size_t size) +{ + /* Short-circuit the 0 size case. */ + if (unlikely(size == 0)) + return 0; + /* Short-circuit saturated "too-large" case. */ + if (unlikely(size == SIZE_MAX)) + return SIZE_MAX; + + return ALIGN(size, ARCH_KMALLOC_MINALIGN); +} + +EXPORT_SYMBOL(kmalloc_size_roundup); + /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ size_t __ksize(const void *block) { @@ -594,7 +598,6 @@ size_t __ksize(const void *block) m = (unsigned int *)(block - align); return SLOB_UNITS(*m) * SLOB_UNIT; } -EXPORT_SYMBOL(__ksize); int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { @@ -602,6 +605,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } + + /* Actual size allocated */ + c->size = SLOB_UNITS(c->size) * SLOB_UNIT; c->flags = flags; return 0; } @@ -616,14 +622,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - SLOB_UNITS(c->size) * SLOB_UNIT, - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } else { b = slob_new_pages(flags, get_order(c->size), node); - trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size, - PAGE_SIZE << get_order(c->size), - flags, node); + trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node); } if (b && c->ctor) { @@ -647,7 +649,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_ return slob_alloc_node(cachep, flags, NUMA_NO_NODE); } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_NUMA + void *__kmalloc_node(size_t size, gfp_t gfp, int node) { return __do_kmalloc_node(size, gfp, node, _RET_IP_); @@ -659,7 +661,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node) return slob_alloc_node(cachep, gfp, node); } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif static void __kmem_cache_free(void *b, int size) { @@ -680,7 +681,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - trace_kmem_cache_free(_RET_IP_, b, c->name); + trace_kmem_cache_free(_RET_IP_, b, c); if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); diff --git a/mm/slub.c b/mm/slub.c index 4b98dff9be8e..2a6b3f31ce7e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -50,7 +50,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -64,8 +64,9 @@ * The slab_lock is a wrapper around the page lock, thus it is a bit * spinlock. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * * A. slab->freelist -> List of free objects in a slab * B. slab->inuse -> Number of objects in use * C. slab->objects -> Number of objects in slab @@ -94,15 +95,20 @@ * allocating a long series of objects that fill up slabs does not require * the list lock. * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * * cpu_slab->lock local lock * * This locks protect slowpath manipulation of all kmem_cache_cpu fields * except the stat counters. This is a percpu structure manipulated only by * the local cpu, so the lock protects against being preempted or interrupted * by an irq. Fast path operations rely on lockless operations instead. - * On PREEMPT_RT, the local lock does not actually disable irqs (and thus - * prevent the lockless operations), so fastpath operations also need to take - * the lock and are no longer lockless. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. * * lockless fastpaths * @@ -163,8 +169,9 @@ * function call even on !PREEMPT_RT, use inline preempt_disable() there. */ #ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) #else #define slub_get_cpu_ptr(var) \ ({ \ @@ -176,6 +183,7 @@ do { \ (void)(var); \ migrate_enable(); \ } while (0) +#define USE_LOCKLESS_FAST_PATH() (false) #endif #ifdef CONFIG_SLUB_DEBUG @@ -186,11 +194,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +/* Structure holding parameters for get_partial() call chain */ +struct partial_context { + struct slab **slab; + gfp_t flags; + unsigned int orig_size; +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } +static inline bool slub_debug_orig_size(struct kmem_cache *s) +{ + return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && + (s->flags & SLAB_KMALLOC)); +} + void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -447,7 +468,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) /* * Per slab locking using the pagelock */ -static __always_inline void __slab_lock(struct slab *slab) +static __always_inline void slab_lock(struct slab *slab) { struct page *page = slab_page(slab); @@ -455,7 +476,7 @@ static __always_inline void __slab_lock(struct slab *slab) bit_spin_lock(PG_locked, &page->flags); } -static __always_inline void __slab_unlock(struct slab *slab) +static __always_inline void slab_unlock(struct slab *slab) { struct page *page = slab_page(slab); @@ -463,31 +484,19 @@ static __always_inline void __slab_unlock(struct slab *slab) __bit_spin_unlock(PG_locked, &page->flags); } -static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) -{ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(*flags); - __slab_lock(slab); -} - -static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) -{ - __slab_unlock(slab); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(*flags); -} - /* * Interrupts must be disabled (for the fallback code to work right), typically - * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different - * so we disable interrupts as part of slab_[un]lock(). + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, void *freelist_old, unsigned long counters_old, void *freelist_new, unsigned long counters_new, const char *n) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) @@ -499,18 +508,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab } else #endif { - /* init to 0 to prevent spurious warnings */ - unsigned long flags = 0; - - slab_lock(slab, &flags); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - slab_unlock(slab, &flags); + slab_unlock(slab); return true; } - slab_unlock(slab, &flags); + slab_unlock(slab); } cpu_relax(); @@ -541,16 +547,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, unsigned long flags; local_irq_save(flags); - __slab_lock(slab); + slab_lock(slab); if (slab->freelist == freelist_old && slab->counters == counters_old) { slab->freelist = freelist_new; slab->counters = counters_new; - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); return true; } - __slab_unlock(slab); + slab_unlock(slab); local_irq_restore(flags); } @@ -566,7 +572,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_RAW_SPINLOCK(object_map_lock); +static DEFINE_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct slab *slab) @@ -600,30 +606,6 @@ static bool slab_add_kunit_errors(void) static inline bool slab_add_kunit_errors(void) { return false; } #endif -/* - * Determine a map of objects in use in a slab. - * - * Node listlock must be held to guarantee that the slab does - * not vanish from under us. - */ -static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) - __acquires(&object_map_lock) -{ - VM_BUG_ON(!irqs_disabled()); - - raw_spin_lock(&object_map_lock); - - __fill_map(object_map, s, slab); - - return object_map; -} - -static void put_map(unsigned long *map) __releases(&object_map_lock) -{ - VM_BUG_ON(map != object_map); - raw_spin_unlock(&object_map_lock); -} - static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) @@ -821,6 +803,39 @@ static void print_slab_info(const struct slab *slab) folio_flags(folio, 0)); } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) { struct va_format vaf; @@ -880,6 +895,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); + if (slub_debug_orig_size(s)) + off += sizeof(unsigned int); + off += kasan_metadata_size(s); if (off != size_from_object(s)) @@ -1013,7 +1031,8 @@ skip_bug_print: * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at minimum + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -1031,10 +1050,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ - if (s->flags & SLAB_STORE_USER) + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + off += kasan_metadata_size(s); if (size_from_object(s) == off) @@ -1329,18 +1352,16 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, } static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) + struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1390,63 +1411,6 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track_update(s, object, TRACK_FREE, addr, handle); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* * Parse a block of slub_debug options. Blocks are delimited by ';' * @@ -1666,16 +1630,18 @@ static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } + struct slab *slab, void *object, int orig_size) { return 0; } -static inline int free_debug_processing( +static inline void free_debug_processing( struct kmem_cache *s, struct slab *slab, void *head, void *tail, int bulk_cnt, - unsigned long addr) { return 0; } + unsigned long addr) {} static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1709,20 +1675,6 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - return ptr; -} - -static __always_inline void kfree_hook(void *x) -{ - kmemleak_free(x); - kasan_kfree_large(x); -} - static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { @@ -1981,11 +1933,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) - goto out; + return NULL; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; account_slab(slab, oo_order(oo), s, flags); @@ -2012,15 +1966,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - slab->inuse = slab->objects; - slab->frozen = 1; - -out: - if (!slab) - return NULL; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - return slab; } @@ -2107,6 +2052,75 @@ static inline void remove_partial(struct kmem_cache_node *n, n->nr_partial--; } +/* + * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. + */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, int orig_size) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + remove_partial(n, slab); + return NULL; + } + + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, + struct slab *slab, int orig_size) +{ + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object, orig_size)) + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + */ + return NULL; + + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + /* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. @@ -2164,7 +2178,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); * Try to allocate a partial slab from a specific node. */ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct slab **ret_slab, gfp_t gfpflags) + struct partial_context *pc) { struct slab *slab, *slab2; void *object = NULL; @@ -2184,15 +2198,23 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { void *t; - if (!pfmemalloc_match(slab, gfpflags)) + if (!pfmemalloc_match(slab, pc->flags)) continue; + if (kmem_cache_debug(s)) { + object = alloc_single_from_partial(s, n, slab, + pc->orig_size); + if (object) + break; + continue; + } + t = acquire_slab(s, n, slab, object == NULL); if (!t) break; if (!object) { - *ret_slab = slab; + *pc->slab = slab; stat(s, ALLOC_FROM_PARTIAL); object = t; } else { @@ -2216,14 +2238,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, /* * Get a slab from somewhere. Search in increasing NUMA distances. */ -static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct slab **ret_slab) +static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); + enum zone_type highest_zoneidx = gfp_zone(pc->flags); void *object; unsigned int cpuset_mems_cookie; @@ -2251,15 +2272,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed(zone, flags) && + if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, ret_slab, flags); + object = get_partial_node(s, n, pc); if (object) { /* * Don't check read_mems_allowed_retry() @@ -2280,8 +2301,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct slab **ret_slab) +static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc) { void *object; int searchnode = node; @@ -2289,11 +2309,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags); + object = get_partial_node(s, get_node(s, searchnode), pc); if (object || node != NUMA_NO_NODE) return object; - return get_any_partial(s, flags, ret_slab); + return get_any_partial(s, pc); } #ifdef CONFIG_PREEMPTION @@ -2793,6 +2813,113 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static noinline void free_debug_processing( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + void *object = head; + int cnt = 0; + unsigned long flags; + bool checks_ok = false; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != bulk_cnt) + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + bulk_cnt, cnt); + +out: + if (checks_ok) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} #endif /* CONFIG_SLUB_DEBUG */ #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) @@ -2910,11 +3037,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *freelist; struct slab *slab; unsigned long flags; + struct partial_context pc; stat(s, ALLOC_SLOWPATH); @@ -3028,7 +3156,10 @@ new_slab: new_objects: - freelist = get_partial(s, gfpflags, node, &slab); + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + freelist = get_partial(s, node, &pc); if (freelist) goto check_new_slab; @@ -3041,36 +3172,53 @@ new_objects: return NULL; } + stat(s, ALLOC_SLAB); + + if (kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab, orig_size); + + if (unlikely(!freelist)) + goto new_objects; + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; + } + /* * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ freelist = slab->freelist; slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; - stat(s, ALLOC_SLAB); + inc_slabs_node(s, slab_nid(slab), slab->objects); check_new_slab: if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; - } + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the tracking info + * and return the object + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr); + + return freelist; } - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags))) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. */ - goto return_single; + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } retry_load_slab: @@ -3094,11 +3242,6 @@ retry_load_slab: c->slab = slab; goto load_freelist; - -return_single: - - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; } /* @@ -3107,7 +3250,7 @@ return_single: * pointer. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *p; @@ -3120,7 +3263,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = slub_get_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c); + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif @@ -3202,16 +3345,10 @@ redo: object = c->freelist; slab = c->slab; - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if a - * slowpath has taken the local_lock_irqsave(), it is not protected - * against a fast path operation in an irq handler. So we need to take - * the slow path which uses local_lock. It is still relatively fast if - * there is a suitable cpu freelist. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || + + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); } else { void *next_object = get_freepointer_safe(s, object); @@ -3262,8 +3399,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, { void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size, - s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } @@ -3281,46 +3417,24 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, } EXPORT_SYMBOL(kmem_cache_alloc_lru); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, + unsigned long caller) { - void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags); - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; + return slab_alloc_node(s, NULL, gfpflags, node, + caller, orig_size); } -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif -#ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, s, - s->object_size, s->size, gfpflags, node); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) -{ - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, - size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; -} -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif -#endif /* CONFIG_NUMA */ - /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -3346,9 +3460,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) + if (kmem_cache_debug(s)) { + free_debug_processing(s, slab, head, tail, cnt, addr); return; + } do { if (unlikely(n)) { @@ -3468,6 +3583,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s, void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; + void **freelist; redo: /* @@ -3482,9 +3598,13 @@ redo: /* Same with comment on barrier() in slab_alloc_node() */ barrier(); - if (likely(slab == c->slab)) { -#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); + if (unlikely(slab != c->slab)) { + __slab_free(s, slab, head, tail_obj, cnt, addr); + return; + } + + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); set_freepointer(s, tail_obj, freelist); @@ -3496,16 +3616,8 @@ redo: note_cmpxchg_failure("slab_free", s, tid); goto redo; } -#else /* CONFIG_PREEMPT_RT */ - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if - * a slowpath has taken the local_lock_irqsave(), it is not - * protected against a fast path operation in an irq handler. So - * we need to take the local_lock. We shouldn't simply defer to - * __slab_free() as that wouldn't use the cpu freelist at all. - */ - void **freelist; - + } else { + /* Update the free list under the local lock */ local_lock(&s->cpu_slab->lock); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { @@ -3520,11 +3632,8 @@ redo: c->tid = next_tid(tid); local_unlock(&s->cpu_slab->lock); -#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, slab, head, tail_obj, cnt, addr); - + } + stat(s, FREE_FASTPATH); } static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, @@ -3547,12 +3656,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif +void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller) +{ + slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller); +} + void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); if (!s) return; - trace_kmem_cache_free(_RET_IP_, x, s->name); + trace_kmem_cache_free(_RET_IP_, x, s); slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -3565,19 +3679,6 @@ struct detached_freelist { struct kmem_cache *s; }; -static inline void free_large_kmalloc(struct folio *folio, void *object) -{ - unsigned int order = folio_order(folio); - - if (WARN_ON_ONCE(order == 0)) - pr_warn_once("object pointer: 0x%p\n", object); - - kfree_hook(object); - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); -} - /* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same @@ -3714,7 +3815,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); + _RET_IP_, c, s->object_size); if (unlikely(!p[i])) goto error; @@ -3941,6 +4042,7 @@ static void early_kmem_cache_node_alloc(int node) slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!slab); + inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects); if (slab_nid(slab) != node) { pr_err("SLUB: Unable to allocate memory from node %d\n", node); pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); @@ -3955,7 +4057,6 @@ static void early_kmem_cache_node_alloc(int node) n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - slab->frozen = 0; kmem_cache_node->node[node] = n; init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, slab->objects); @@ -4117,12 +4218,17 @@ static int calculate_sizes(struct kmem_cache *s) } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) { /* * Need to store information about allocs and frees after * the object. */ size += 2 * sizeof(struct track); + + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } #endif kasan_cache_create(s, &size, &s->flags); @@ -4242,23 +4348,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); - unsigned long flags; - unsigned long *map; void *p; slab_err(s, slab, text, s->name); - slab_lock(slab, &flags); - map = get_map(s, slab); + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + for_each_object(p, s, addr, slab->objects) { - if (!test_bit(__obj_to_index(s, addr, p), map)) { + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - put_map(map); - slab_unlock(slab, &flags); + spin_unlock(&object_map_lock); #endif } @@ -4409,78 +4513,6 @@ static int __init setup_slub_min_objects(char *str) __setup("slub_min_objects=", setup_slub_min_objects); -void *__kmalloc(size_t size, gfp_t flags) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, flags, _RET_IP_, size); - - trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc); - -#ifdef CONFIG_NUMA -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); - } - - return kmalloc_large_node_hook(ptr, size, flags); -} - -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); - - trace_kmalloc_node(_RET_IP_, ret, NULL, - size, PAGE_SIZE << get_order(size), - flags, node); - - return ret; - } - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node); - - ret = kasan_kmalloc(s, ret, size, flags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -4531,43 +4563,6 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif /* CONFIG_HARDENED_USERCOPY */ -size_t __ksize(const void *object) -{ - struct folio *folio; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(object); - - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - return slab_ksize(folio_slab(folio)->slab_cache); -} -EXPORT_SYMBOL(__ksize); - -void kfree(const void *x) -{ - struct folio *folio; - struct slab *slab; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return; - - folio = virt_to_folio(x); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - return; - } - slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - #define SHRINK_PROMOTE_MAX 32 /* @@ -4616,6 +4611,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } @@ -4631,7 +4627,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) - discard_slab(s, slab); + free_slab(s, slab); if (slabs_node(s, node)) ret = 1; @@ -4915,64 +4911,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, NULL, gfpflags, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, s, size, s->size, gfpflags); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_track_caller); - -#ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); - - trace_kmalloc_node(caller, ret, NULL, - size, PAGE_SIZE << get_order(size), - gfpflags, node); - - return ret; - } - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size); - - /* Honor the call site pointer we received. */ - trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node); - - ret = kasan_kmalloc(s, ret, size, gfpflags); - - return ret; -} -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif - #ifdef CONFIG_SYSFS static int count_inuse(struct slab *slab) { @@ -4991,12 +4929,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, { void *p; void *addr = slab_address(slab); - unsigned long flags; - - slab_lock(slab, &flags); if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) - goto unlock; + return; /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); @@ -5007,8 +4942,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, if (!check_object(s, slab, p, val)) break; } -unlock: - slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, @@ -5079,6 +5012,7 @@ struct location { depot_stack_handle_t handle; unsigned long count; unsigned long addr; + unsigned long waste; long long sum_time; long min_time; long max_time; @@ -5125,13 +5059,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) } static int add_location(struct loc_track *t, struct kmem_cache *s, - const struct track *track) + const struct track *track, + unsigned int orig_size) { long start, end, pos; struct location *l; - unsigned long caddr, chandle; + unsigned long caddr, chandle, cwaste; unsigned long age = jiffies - track->when; depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; #ifdef CONFIG_STACKDEPOT handle = READ_ONCE(track->handle); @@ -5149,11 +5085,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (pos == end) break; - caddr = t->loc[pos].addr; - chandle = t->loc[pos].handle; - if ((track->addr == caddr) && (handle == chandle)) { + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { - l = &t->loc[pos]; l->count++; if (track->when) { l->sum_time += age; @@ -5178,6 +5116,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, end = pos; else if (track->addr == caddr && handle < chandle) end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; else start = pos; } @@ -5201,6 +5142,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->min_pid = track->pid; l->max_pid = track->pid; l->handle = handle; + l->waste = waste; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -5213,13 +5155,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, unsigned long *obj_map) { void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); void *p; __fill_map(obj_map, s, slab); for_each_object(p, s, addr, slab->objects) if (!test_bit(__obj_to_index(s, addr, p), obj_map)) - add_location(t, s, get_track(s, p, alloc)); + add_location(t, s, get_track(s, p, alloc), + is_alloc ? get_orig_size(s, p) : + s->object_size); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ @@ -5612,7 +5557,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; @@ -5837,7 +5782,6 @@ static ssize_t slab_attr_show(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5845,9 +5789,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - err = attribute->show(s, buf); - - return err; + return attribute->show(s, buf); } static ssize_t slab_attr_store(struct kobject *kobj, @@ -5856,7 +5798,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5864,8 +5805,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - err = attribute->store(s, buf, len); - return err; + return attribute->store(s, buf, len); } static void kmem_cache_release(struct kobject *k) @@ -5890,7 +5830,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) return slab_kset; } -#define ID_STR_LENGTH 64 +#define ID_STR_LENGTH 32 /* Create a unique string id for a slab cache: * @@ -5924,9 +5864,12 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07u", s->size); + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); - BUG_ON(p > name + ID_STR_LENGTH - 1); + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } return name; } @@ -6092,6 +6035,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) else seq_puts(seq, ""); + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + if (l->sum_time != l->min_time) { seq_printf(seq, " age=%ld/%llu/%ld", l->min_time, div_u64(l->sum_time, l->count), diff --git a/mm/vmstat.c b/mm/vmstat.c index ac798de3d55e..21b2d6e788e8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -355,8 +355,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, * CPU migrations and preemption potentially corrupts a counter so * disable preemption. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -368,8 +367,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_zone_page_state); @@ -393,8 +391,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); x = delta + __this_cpu_read(*p); @@ -406,8 +403,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, } __this_cpu_write(*p, x); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } EXPORT_SYMBOL(__mod_node_page_state); @@ -441,8 +437,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -453,8 +448,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -466,8 +460,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_inc_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -478,8 +471,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, -overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __inc_zone_page_state(struct page *page, enum zone_stat_item item) @@ -501,8 +493,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) s8 v, t; /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -513,8 +504,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) @@ -526,8 +516,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); /* See __mod_node_page_state */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_disable(); + preempt_disable_nested(); v = __this_cpu_dec_return(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -538,8 +527,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) __this_cpu_write(*p, overstep); } - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - preempt_enable(); + preempt_enable_nested(); } void __dec_zone_page_state(struct page *page, enum zone_stat_item item) diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index c260006106be..1c253b4b7ce0 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -635,6 +635,12 @@ int arch_decode_instruction(struct objtool_file *file, const struct section *sec *type = INSN_CONTEXT_SWITCH; break; + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + *type = INSN_JUMP_CONDITIONAL; + break; + case 0xe8: *type = INSN_CALL; /* diff --git a/tools/objtool/check.c b/tools/objtool/check.c index a8cf38639fe8..715f35a8cc00 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2107,9 +2107,6 @@ static int read_noendbr_hints(struct objtool_file *file) return -1; } - if (insn->type == INSN_ENDBR) - WARN_FUNC("ANNOTATE_NOENDBR on ENDBR", insn->sec, insn->offset); - insn->noendbr = 1; } @@ -2238,7 +2235,7 @@ static int read_intra_function_calls(struct objtool_file *file) */ insn->type = INSN_JUMP_UNCONDITIONAL; - dest_off = insn->offset + insn->len + insn->immediate; + dest_off = arch_jump_destination(insn); insn->jump_dest = find_insn(file, insn->sec, dest_off); if (!insn->jump_dest) { WARN_FUNC("can't find call dest at %s+0x%lx", diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c index f8c43ce8fe66..c6b8f32990c8 100644 --- a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c +++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c @@ -184,7 +184,7 @@ int main(int argc, char *argv[]) /* * If res is non-zero, we either requeued the waiter or hit an * error, break out and handle it. If it is zero, then the - * signal may have hit before the the waiter was blocked on f1. + * signal may have hit before the waiter was blocked on f1. * Try again. */ if (res > 0) {