From 26573a97746c7a99f394f9d398ce91a8853b3b89 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:01 +0100 Subject: [PATCH 01/77] x86/apic: Fix x2apic enablement without interrupt remapping Currently, Linux as a hypervisor guest will enable x2apic only if there are no CPUs present at boot time with an APIC ID above 255. Hotplugging a CPU later with a higher APIC ID would result in a CPU which cannot be targeted by external interrupts. Add a filter in x2apic_apic_id_valid() which can be used to prevent such CPUs from coming online, and allow x2apic to be enabled even if they are present at boot time. Fixes: ce69a784504 ("x86/apic: Enable x2APIC without interrupt remapping under KVM") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-2-dwmw2@infradead.org --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 14 ++++++++------ arch/x86/kernel/apic/x2apic_phys.c | 9 +++++++++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 4e3099d9ae62..57af25cb44f6 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -259,6 +259,7 @@ static inline u64 native_x2apic_icr_read(void) extern int x2apic_mode; extern int x2apic_phys; +extern void __init x2apic_set_max_apicid(u32 apicid); extern void __init check_x2apic(void); extern void x2apic_setup(void); static inline int x2apic_enabled(void) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b3eef1d5c903..113f6ca7b828 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1841,20 +1841,22 @@ static __init void try_to_enable_x2apic(int remap_mode) return; if (remap_mode != IRQ_REMAP_X2APIC_MODE) { - /* IR is required if there is APIC ID > 255 even when running - * under KVM + /* + * Using X2APIC without IR is not architecturally supported + * on bare metal but may be supported in guests. */ - if (max_physical_apicid > 255 || - !x86_init.hyper.x2apic_available()) { + if (!x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; } /* - * without IR all CPUs can be addressed by IOAPIC/MSI - * only in physical mode + * Without IR, all CPUs can be addressed by IOAPIC/MSI only + * in physical mode, and CPUs with an APIC ID that cannnot + * be addressed must not be brought online. */ + x2apic_set_max_apicid(255); x2apic_phys = 1; } x2apic_enable(); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index bc9693841353..e14eae6d6ea7 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -8,6 +8,12 @@ int x2apic_phys; static struct apic apic_x2apic_phys; +static u32 x2apic_max_apicid __ro_after_init; + +void __init x2apic_set_max_apicid(u32 apicid) +{ + x2apic_max_apicid = apicid; +} static int __init set_x2apic_phys_mode(char *arg) { @@ -98,6 +104,9 @@ static int x2apic_phys_probe(void) /* Common x2apic functions, also used by x2apic_cluster */ int x2apic_apic_id_valid(u32 apicid) { + if (x2apic_max_apicid && apicid > x2apic_max_apicid) + return 0; + return 1; } From 47bea873cf809f490cfac0c4e88533fd7fed6064 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:02 +0100 Subject: [PATCH 02/77] x86/msi: Only use high bits of MSI address for DMAR unit The Intel IOMMU has an MSI-like configuration for its interrupt, but it isn't really MSI. So it gets to abuse the high 32 bits of the address, and puts the high 24 bits of the extended APIC ID there. This isn't something that can be used in the general case for real MSIs, since external devices using the high bits of the address would be performing writes to actual memory space above 4GiB, not targeted at the APIC. Factor the hack out and allow it only to be used when appropriate, adding a WARN_ON_ONCE() if other MSIs are targeted at an unreachable APIC ID. That should never happen since the compatibility MSI messages are not used when Interrupt Remapping is enabled. The x2apic_enabled() check isn't needed because Linux won't bring up CPUs with higher APIC IDs unless IR and x2apic are enabled anyway. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-3-dwmw2@infradead.org --- arch/x86/kernel/apic/msi.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 6313f0a05db7..516df47bde73 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -23,13 +23,11 @@ struct irq_domain *x86_pci_msi_default_domain __ro_after_init; -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) +static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) { msg->address_hi = MSI_ADDR_BASE_HI; - if (x2apic_enabled()) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - msg->address_lo = MSI_ADDR_BASE_LO | ((apic->irq_dest_mode == 0) ? @@ -43,18 +41,29 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg) MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(cfg->vector); + + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address higher APIC IDs. + */ + if (dmar) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + else + WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); } void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { - __irq_msi_compose_msg(irqd_cfg(data), msg); + __irq_msi_compose_msg(irqd_cfg(data), msg, false); } static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; - __irq_msi_compose_msg(cfg, msg); + __irq_msi_compose_msg(cfg, msg, false); irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg); } @@ -276,6 +285,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent, #endif #ifdef CONFIG_DMAR_TABLE +/* + * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the + * high bits of the destination APIC ID. This can't be done in the general + * case for MSIs as it would be targeting real memory above 4GiB not the + * APIC. + */ +static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, true); +} + static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg) { dmar_msi_write(data->irq, msg); @@ -288,6 +308,7 @@ static struct irq_chip dmar_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, .flags = IRQCHIP_SKIP_SET_WAKE, }; From 93b7a3d6a1f0f159d390959de7a1b9ad863d6b26 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:03 +0100 Subject: [PATCH 03/77] x86/apic/uv: Fix inconsistent destination mode The UV x2apic is strictly using physical destination mode, but apic::dest_logical is initialized with APIC_DEST_LOGICAL. This does not matter much because UV does not use any of the generic functions which use apic::dest_logical, but is still inconsistent. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-4-dwmw2@infradead.org --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 714233cee0b5..9ade9e6a95ff 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -811,7 +811,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_dest_mode = 0, /* Physical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .dest_logical = APIC_DEST_PHYSICAL, .check_apicid_used = NULL, .init_apic_ldr = uv_init_apic_ldr, From 2e730cb56b2cd1626fecaf23ef1537fb24721ef2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:04 +0100 Subject: [PATCH 04/77] x86/devicetree: Fix the ioapic interrupt type table The ioapic interrupt type table is wrong as it assumes that polarity in IO/APIC context means active high when set. But the IO/APIC polarity is working the other way round. This works because the ordering of the entries is consistent with the device tree and the type information is not used by the IO/APIC interrupt chip. The whole trigger and polarity business of IO/APIC is misleading and the corresponding constants which are defined as 0/1 are not used consistently and are going to be removed. Rename the type table members to 'is_level' and 'active_low' and adjust the type information for consistency sake. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-5-dwmw2@infradead.org --- arch/x86/kernel/devicetree.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index ddffd80f5c52..6a4cb71c2498 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -184,31 +184,31 @@ static unsigned int ioapic_id; struct of_ioapic_type { u32 out_type; - u32 trigger; - u32 polarity; + u32 is_level; + u32 active_low; }; static struct of_ioapic_type of_ioapic_type[] = { { - .out_type = IRQ_TYPE_EDGE_RISING, - .trigger = IOAPIC_EDGE, - .polarity = 1, - }, - { - .out_type = IRQ_TYPE_LEVEL_LOW, - .trigger = IOAPIC_LEVEL, - .polarity = 0, + .out_type = IRQ_TYPE_EDGE_FALLING, + .is_level = 0, + .active_low = 1, }, { .out_type = IRQ_TYPE_LEVEL_HIGH, - .trigger = IOAPIC_LEVEL, - .polarity = 1, + .is_level = 1, + .active_low = 0, }, { - .out_type = IRQ_TYPE_EDGE_FALLING, - .trigger = IOAPIC_EDGE, - .polarity = 0, + .out_type = IRQ_TYPE_LEVEL_LOW, + .is_level = 1, + .active_low = 1, + }, + { + .out_type = IRQ_TYPE_EDGE_RISING, + .is_level = 0, + .active_low = 0, }, }; @@ -228,7 +228,7 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, return -EINVAL; it = &of_ioapic_type[type_index]; - ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); + ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low); tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); tmp.ioapic.pin = fwspec->param[0]; From 721612994f53ed600b39a80d912b10f51960e2e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:05 +0100 Subject: [PATCH 05/77] x86/apic: Cleanup delivery mode defines The enum ioapic_irq_destination_types and the enumerated constants starting with 'dest_' are gross misnomers because they describe the delivery mode. Rename then enum and the constants so they actually make sense. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-6-dwmw2@infradead.org --- arch/x86/include/asm/apic.h | 3 ++- arch/x86/include/asm/apicdef.h | 16 +++++++--------- arch/x86/kernel/apic/apic_flat_64.c | 4 ++-- arch/x86/kernel/apic/apic_noop.c | 2 +- arch/x86/kernel/apic/apic_numachip.c | 4 ++-- arch/x86/kernel/apic/bigsmp_32.c | 2 +- arch/x86/kernel/apic/io_apic.c | 11 ++++++----- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 6 +++--- arch/x86/platform/uv/uv_irq.c | 2 +- drivers/iommu/amd/iommu.c | 4 ++-- drivers/iommu/intel/irq_remapping.c | 2 +- drivers/pci/controller/pci-hyperv.c | 6 +++--- 15 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 57af25cb44f6..37a08f37a19d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -309,7 +309,8 @@ struct apic { /* dest_logical is used by the IPI functions */ u32 dest_logical; u32 disable_esr; - u32 irq_delivery_mode; + + enum apic_delivery_modes delivery_mode; u32 irq_dest_mode; u32 (*calc_dest_apicid)(unsigned int cpu); diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 05e694ed8386..5716f22f81ac 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -432,15 +432,13 @@ struct local_apic { #define BAD_APICID 0xFFFFu #endif -enum ioapic_irq_destination_types { - dest_Fixed = 0, - dest_LowestPrio = 1, - dest_SMI = 2, - dest__reserved_1 = 3, - dest_NMI = 4, - dest_INIT = 5, - dest__reserved_2 = 6, - dest_ExtINT = 7 +enum apic_delivery_modes { + APIC_DELIVERY_MODE_FIXED = 0, + APIC_DELIVERY_MODE_LOWESTPRIO = 1, + APIC_DELIVERY_MODE_SMI = 2, + APIC_DELIVERY_MODE_NMI = 4, + APIC_DELIVERY_MODE_INIT = 5, + APIC_DELIVERY_MODE_EXTINT = 7, }; #endif /* _ASM_X86_APICDEF_H */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 7862b152a052..fdd38a17f835 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -113,7 +113,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, @@ -206,7 +206,7 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = flat_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 780c702969b7..4fc934b11851 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -95,7 +95,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = noop_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 35edd57f064a..db715d082ec9 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -246,7 +246,7 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, @@ -295,7 +295,7 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_valid = numachip_apic_id_valid, .apic_id_registered = numachip_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 98d015a4405a..7f6461f5d349 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -127,7 +127,7 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = bigsmp_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, /* phys delivery to target CPU: */ .irq_dest_mode = 0, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7b3c7e0d4a09..cff6cbc3d183 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -535,7 +535,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* Check delivery_mode to be sure we're not clearing an SMI pin */ entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) + if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI) return; /* @@ -1368,7 +1368,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + if ((entry.mask == 0) && + (entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT)) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1416,7 +1417,7 @@ void native_restore_boot_irq_mode(void) entry.trigger = IOAPIC_EDGE; entry.polarity = IOAPIC_POL_HIGH; entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry.delivery_mode = dest_ExtINT; + entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; entry.dest = read_apic_id(); /* @@ -2047,7 +2048,7 @@ static inline void __init unlock_ExtINT_logic(void) entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; entry1.mask = IOAPIC_UNMASKED; entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; entry1.polarity = entry0.polarity; entry1.trigger = IOAPIC_EDGE; entry1.vector = 0; @@ -2948,7 +2949,7 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, struct IO_APIC_route_entry *entry) { memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->irq_delivery_mode; + entry->delivery_mode = apic->delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 67b6f7c049ec..77c6e2e04a1f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -69,7 +69,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_valid = default_apic_id_valid, .apic_id_registered = default_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, /* logical delivery broadcast to all CPUs: */ .irq_dest_mode = 1, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index b0889c48a2ac..82fb43d807f7 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 1, /* logical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index e14eae6d6ea7..437e8439db67 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -157,7 +157,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_valid = x2apic_apic_id_valid, .apic_id_registered = x2apic_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* physical */ .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 9ade9e6a95ff..49deefdded68 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -703,9 +703,9 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long dmode, val; if (vector == NMI_VECTOR) - dmode = dest_NMI; + dmode = APIC_DELIVERY_MODE_NMI; else - dmode = dest_Fixed; + dmode = APIC_DELIVERY_MODE_FIXED; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -807,7 +807,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_valid = uv_apic_id_valid, .apic_id_registered = uv_apic_id_registered, - .irq_delivery_mode = dest_Fixed, + .delivery_mode = APIC_DELIVERY_MODE_FIXED, .irq_dest_mode = 0, /* Physical */ .disable_esr = 0, diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 18ca2261cc9a..e7020d162949 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -35,7 +35,7 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) mmr_value = 0; entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; + entry->delivery_mode = apic->delivery_mode; entry->dest_mode = apic->irq_dest_mode; entry->polarity = 0; entry->trigger = 0; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b9cf59443843..bc81b91f89fe 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3671,7 +3671,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; - iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode, + iommu->irte_ops->prepare(data->entry, apic->delivery_mode, apic->irq_dest_mode, irq_cfg->vector, irq_cfg->dest_apicid, devid); @@ -3944,7 +3944,7 @@ int amd_iommu_deactivate_guest_mode(void *data) entry->lo.fields_remap.valid = valid; entry->lo.fields_remap.dm = apic->irq_dest_mode; - entry->lo.fields_remap.int_type = apic->irq_delivery_mode; + entry->lo.fields_remap.int_type = apic->delivery_mode; entry->hi.fields.vector = cfg->vector; entry->lo.fields_remap.destination = APICID_TO_IRTE_DEST_LO(cfg->dest_apicid); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 0cfce1d3b7bb..d44e719d1984 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1122,7 +1122,7 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) * irq migration in the presence of interrupt-remapping. */ irte->trigger_mode = 0; - irte->dlvry_mode = apic->irq_delivery_mode; + irte->dlvry_mode = apic->delivery_mode; irte->vector = vector; irte->dest_id = IRTE_DEST(dest); irte->redir_hint = 1; diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 03ed5cb1c4b2..6db8d96a78eb 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1226,7 +1226,7 @@ static void hv_irq_unmask(struct irq_data *data) params->int_target.vector = cfg->vector; /* - * Honoring apic->irq_delivery_mode set to dest_Fixed by + * Honoring apic->delivery_mode set to APIC_DELIVERY_MODE_FIXED by * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a * spurious interrupt storm. Not doing so does not seem to have a * negative effect (yet?). @@ -1324,7 +1324,7 @@ static u32 hv_compose_msi_req_v1( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = dest_Fixed; + int_pkt->int_desc.delivery_mode = APIC_DELIVERY_MODE_FIXED; /* * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in @@ -1345,7 +1345,7 @@ static u32 hv_compose_msi_req_v2( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; - int_pkt->int_desc.delivery_mode = dest_Fixed; + int_pkt->int_desc.delivery_mode = APIC_DELIVERY_MODE_FIXED; /* * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten From 22e0db42097b30a49771744e514350b7e9dd26f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:06 +0100 Subject: [PATCH 06/77] x86/apic: Replace pointless apic:: Dest_logical usage All these functions are only used for logical destination mode. So reading the destination mode mask from the apic structure is a pointless exercise. Just hand in the proper constant: APIC_DEST_LOGICAL. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-7-dwmw2@infradead.org --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- arch/x86/kernel/apic/ipi.c | 6 +++--- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index fdd38a17f835..6df837fd5081 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector) unsigned long flags; local_irq_save(flags); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 387154e39e08..d1fb874fbe64 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -260,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, for_each_cpu(query_cpu, mask) __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } @@ -279,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, continue; __default_send_IPI_dest_field( early_per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + vector, APIC_DEST_LOGICAL); } local_irq_restore(flags); } @@ -297,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector) local_irq_save(flags); WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]); - __default_send_IPI_dest_field(mask, vector, apic->dest_logical); + __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 82fb43d807f7..f77e9fb7aac1 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -61,7 +61,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) if (!dest) continue; - __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); } From e57d04e5fa00f7649d4c00796f8d12054799be4a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:07 +0100 Subject: [PATCH 07/77] x86/apic: Get rid of apic:: Dest_logical struct apic has two members which store information about the destination mode: dest_logical and irq_dest_mode. dest_logical contains a mask which was historically used to set the destination mode in IPI messages. Over time the usage was reduced and the logical/physical functions were seperated. There are only a few places which still use 'dest_logical' but they can use 'irq_dest_mode' instead. irq_dest_mode is actually a boolean where 0 means physical destination mode and 1 means logical destination mode. Of course the name does not reflect the functionality. This will be cleaned up in a subsequent change. Remove apic::dest_logical and fixup the remaining users. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-8-dwmw2@infradead.org --- arch/x86/include/asm/apic.h | 2 -- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/apic_flat_64.c | 8 ++------ arch/x86/kernel/apic/apic_noop.c | 4 +--- arch/x86/kernel/apic/apic_numachip.c | 8 ++------ arch/x86/kernel/apic/bigsmp_32.c | 4 +--- arch/x86/kernel/apic/probe_32.c | 4 +--- arch/x86/kernel/apic/x2apic_cluster.c | 4 +--- arch/x86/kernel/apic/x2apic_phys.c | 4 +--- arch/x86/kernel/apic/x2apic_uv_x.c | 4 +--- arch/x86/kernel/smpboot.c | 5 +++-- arch/x86/xen/apic.c | 4 +--- 12 files changed, 15 insertions(+), 38 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 37a08f37a19d..e230ed2d88e2 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,8 +306,6 @@ struct apic { void (*send_IPI_all)(int vector); void (*send_IPI_self)(int vector); - /* dest_logical is used by the IPI functions */ - u32 dest_logical; u32 disable_esr; enum apic_delivery_modes delivery_mode; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 113f6ca7b828..29d28b34cb2f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1591,7 +1591,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->dest_logical) { + if (apic->irq_dest_mode == 1) { int logical_apicid, ldr_apicid; /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 6df837fd5081..bbb1b89fe711 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -117,11 +117,9 @@ static struct apic apic_flat __ro_after_init = { .irq_dest_mode = 1, /* logical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -210,11 +208,9 @@ static struct apic apic_physflat __ro_after_init = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, + .check_apicid_used = NULL, - .init_apic_ldr = physflat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 4fc934b11851..38f167ce5031 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,11 +100,9 @@ struct apic apic_noop __ro_after_init = { .irq_dest_mode = 1, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = default_check_apicid_used, - .init_apic_ldr = noop_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index db715d082ec9..4ebf9fe2c95d 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -250,11 +250,9 @@ static const struct apic apic_numachip1 __refconst = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, + .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, @@ -299,11 +297,9 @@ static const struct apic apic_numachip2 __refconst = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, + .check_apicid_used = NULL, - .init_apic_ldr = flat_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 7f6461f5d349..64c375b8c54e 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -132,11 +132,9 @@ static struct apic apic_bigsmp __ro_after_init = { .irq_dest_mode = 0, .disable_esr = 1, - .dest_logical = 0, + .check_apicid_used = bigsmp_check_apicid_used, - .init_apic_ldr = bigsmp_init_apic_ldr, - .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, .setup_apic_routing = bigsmp_setup_apic_routing, .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 77c6e2e04a1f..97652aacf3e1 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -74,11 +74,9 @@ static struct apic apic_default __ro_after_init = { .irq_dest_mode = 1, .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = default_check_apicid_used, - .init_apic_ldr = default_init_apic_ldr, - .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = setup_apic_flat_routing, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index f77e9fb7aac1..53390fc9f51e 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -188,11 +188,9 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .irq_dest_mode = 1, /* logical */ .disable_esr = 0, - .dest_logical = APIC_DEST_LOGICAL, + .check_apicid_used = NULL, - .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 437e8439db67..ee0c4d08092c 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -161,11 +161,9 @@ static struct apic apic_x2apic_phys __ro_after_init = { .irq_dest_mode = 0, /* physical */ .disable_esr = 0, - .dest_logical = 0, + .check_apicid_used = NULL, - .init_apic_ldr = init_x2apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 49deefdded68..d21a6853afee 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -811,11 +811,9 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .irq_dest_mode = 0, /* Physical */ .disable_esr = 0, - .dest_logical = APIC_DEST_PHYSICAL, + .check_apicid_used = NULL, - .init_apic_ldr = uv_init_apic_ldr, - .ioapic_phys_id_map = NULL, .setup_apic_routing = NULL, .cpu_present_to_apicid = default_cpu_present_to_apicid, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index de776b2e6046..6c14f1091f60 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -747,13 +747,14 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { + u32 dm = apic->irq_dest_mode ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; /* Target chip */ /* Boot on the stack */ /* Kick the second */ - apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); + apic_icr_write(APIC_DM_NMI | dm, apicid); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -980,7 +981,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->dest_logical == APIC_DEST_LOGICAL) + if (apic->irq_dest_mode) id = cpu0_logical_apicid; else id = apicid; diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index e82fd1910dae..c35c24b5bc01 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -152,11 +152,9 @@ static struct apic xen_pv_apic = { /* .irq_dest_mode - used in native_compose_msi_msg only */ .disable_esr = 0, - /* .dest_logical - default_send_IPI_ use it but we use our own. */ + .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ - .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, From 8c44963b603db76e3e5f57d90d027657ba43c1fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:08 +0100 Subject: [PATCH 08/77] x86/apic: Cleanup destination mode apic::irq_dest_mode is actually a boolean, but defined as u32 and named in a way which does not explain what it means. Make it a boolean and rename it to 'dest_mode_logical' Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-9-dwmw2@infradead.org --- arch/x86/include/asm/apic.h | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/apic_flat_64.c | 4 ++-- arch/x86/kernel/apic/apic_noop.c | 4 +--- arch/x86/kernel/apic/apic_numachip.c | 4 ++-- arch/x86/kernel/apic/bigsmp_32.c | 3 +-- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/msi.c | 6 +++--- arch/x86/kernel/apic/probe_32.c | 3 +-- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- arch/x86/kernel/apic/x2apic_phys.c | 2 +- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- arch/x86/kernel/smpboot.c | 7 ++----- arch/x86/platform/uv/uv_irq.c | 2 +- arch/x86/xen/apic.c | 3 +-- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/iommu.c | 8 ++++---- drivers/iommu/intel/irq_remapping.c | 2 +- 18 files changed, 26 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index e230ed2d88e2..c1f64c6fa357 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -309,7 +309,7 @@ struct apic { u32 disable_esr; enum apic_delivery_modes delivery_mode; - u32 irq_dest_mode; + bool dest_mode_logical; u32 (*calc_dest_apicid)(unsigned int cpu); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 29d28b34cb2f..54f04355aaa2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1591,7 +1591,7 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - if (apic->irq_dest_mode == 1) { + if (apic->dest_mode_logical) { int logical_apicid, ldr_apicid; /* diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index bbb1b89fe711..8f72b4351c9f 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -114,7 +114,7 @@ static struct apic apic_flat __ro_after_init = { .apic_id_registered = flat_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 1, /* logical */ + .dest_mode_logical = true, .disable_esr = 0, @@ -205,7 +205,7 @@ static struct apic apic_physflat __ro_after_init = { .apic_id_registered = flat_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 38f167ce5031..fe78319e0f7a 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -96,8 +96,7 @@ struct apic apic_noop __ro_after_init = { .apic_id_registered = noop_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .dest_mode_logical = true, .disable_esr = 0, @@ -105,7 +104,6 @@ struct apic apic_noop __ro_after_init = { .init_apic_ldr = noop_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, .setup_apic_routing = NULL, - .cpu_present_to_apicid = default_cpu_present_to_apicid, .apicid_to_cpu_present = physid_set_mask_of_physid, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 4ebf9fe2c95d..a54d817eb4b6 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -247,7 +247,7 @@ static const struct apic apic_numachip1 __refconst = { .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, @@ -294,7 +294,7 @@ static const struct apic apic_numachip2 __refconst = { .apic_id_registered = numachip_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 64c375b8c54e..77555f66c14d 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -128,8 +128,7 @@ static struct apic apic_bigsmp __ro_after_init = { .apic_id_registered = bigsmp_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - /* phys delivery to target CPU: */ - .irq_dest_mode = 0, + .dest_mode_logical = false, .disable_esr = 1, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index cff6cbc3d183..c6d92d2570d0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2950,7 +2950,7 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, { memset(entry, 0, sizeof(*entry)); entry->delivery_mode = apic->delivery_mode; - entry->dest_mode = apic->irq_dest_mode; + entry->dest_mode = apic->dest_mode_logical; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; entry->trigger = data->trigger; diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 516df47bde73..46ffd41a4238 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -30,9 +30,9 @@ static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, msg->address_lo = MSI_ADDR_BASE_LO | - ((apic->irq_dest_mode == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL : - MSI_ADDR_DEST_MODE_LOGICAL) | + (apic->dest_mode_logical ? + MSI_ADDR_DEST_MODE_LOGICAL : + MSI_ADDR_DEST_MODE_PHYSICAL) | MSI_ADDR_REDIRECTION_CPU | MSI_ADDR_DEST_ID(cfg->dest_apicid); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 97652aacf3e1..a61f642b1b90 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -70,8 +70,7 @@ static struct apic apic_default __ro_after_init = { .apic_id_registered = default_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - /* logical delivery broadcast to all CPUs: */ - .irq_dest_mode = 1, + .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 53390fc9f51e..df6adc5674c9 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -185,7 +185,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = { .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 1, /* logical */ + .dest_mode_logical = true, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index ee0c4d08092c..0e4e81971567 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -158,7 +158,7 @@ static struct apic apic_x2apic_phys __ro_after_init = { .apic_id_registered = x2apic_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index d21a6853afee..de94181f4d0c 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -808,7 +808,7 @@ static struct apic apic_x2apic_uv_x __ro_after_init = { .apic_id_registered = uv_apic_id_registered, .delivery_mode = APIC_DELIVERY_MODE_FIXED, - .irq_dest_mode = 0, /* Physical */ + .dest_mode_logical = false, .disable_esr = 0, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6c14f1091f60..d133d6580f41 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -747,7 +747,7 @@ static void __init smp_quirk_init_udelay(void) int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) { - u32 dm = apic->irq_dest_mode ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; + u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; unsigned long send_status, accept_status = 0; int maxlvt; @@ -981,10 +981,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid, if (!boot_error) { enable_start_cpu0 = 1; *cpu0_nmi_registered = 1; - if (apic->irq_dest_mode) - id = cpu0_logical_apicid; - else - id = apicid; + id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid; boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip); } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index e7020d162949..1a536a187d74 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -36,7 +36,7 @@ static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info) entry = (struct uv_IO_APIC_route_entry *)&mmr_value; entry->vector = cfg->vector; entry->delivery_mode = apic->delivery_mode; - entry->dest_mode = apic->irq_dest_mode; + entry->dest_mode = apic->dest_mode_logical; entry->polarity = 0; entry->trigger = 0; entry->mask = 0; diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index c35c24b5bc01..0d46cc283cf5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -148,8 +148,7 @@ static struct apic xen_pv_apic = { .apic_id_valid = xen_id_always_valid, .apic_id_registered = xen_id_always_registered, - /* .irq_delivery_mode - used in native_compose_msi_msg only */ - /* .irq_dest_mode - used in native_compose_msi_msg only */ + /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index f696ac7c5f89..ba74a722a400 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -893,7 +893,7 @@ struct amd_ir_data { }; struct amd_irte_ops { - void (*prepare)(void *, u32, u32, u8, u32, int); + void (*prepare)(void *, u32, bool, u8, u32, int); void (*activate)(void *, u16, u16); void (*deactivate)(void *, u16, u16); void (*set_affinity)(void *, u16, u16, u8, u32); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index bc81b91f89fe..d7f0c8908602 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3466,7 +3466,7 @@ static void free_irte(u16 devid, int index) } static void irte_prepare(void *entry, - u32 delivery_mode, u32 dest_mode, + u32 delivery_mode, bool dest_mode, u8 vector, u32 dest_apicid, int devid) { union irte *irte = (union irte *) entry; @@ -3480,7 +3480,7 @@ static void irte_prepare(void *entry, } static void irte_ga_prepare(void *entry, - u32 delivery_mode, u32 dest_mode, + u32 delivery_mode, bool dest_mode, u8 vector, u32 dest_apicid, int devid) { struct irte_ga *irte = (struct irte_ga *) entry; @@ -3672,7 +3672,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; iommu->irte_ops->prepare(data->entry, apic->delivery_mode, - apic->irq_dest_mode, irq_cfg->vector, + apic->dest_mode_logical, irq_cfg->vector, irq_cfg->dest_apicid, devid); switch (info->type) { @@ -3943,7 +3943,7 @@ int amd_iommu_deactivate_guest_mode(void *data) entry->hi.val = 0; entry->lo.fields_remap.valid = valid; - entry->lo.fields_remap.dm = apic->irq_dest_mode; + entry->lo.fields_remap.dm = apic->dest_mode_logical; entry->lo.fields_remap.int_type = apic->delivery_mode; entry->hi.fields.vector = cfg->vector; entry->lo.fields_remap.destination = diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index d44e719d1984..5628d43b795e 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1113,7 +1113,7 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) memset(irte, 0, sizeof(*irte)); irte->present = 1; - irte->dst_mode = apic->irq_dest_mode; + irte->dst_mode = apic->dest_mode_logical; /* * Trigger mode in the IRTE will always be edge, and for IO-APIC, the * actual level or edge trigger will be setup in the IO-APIC From f598181acfb36f67e1de138cbe80a7db497f7d8c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:09 +0100 Subject: [PATCH 09/77] x86/apic: Always provide irq_compose_msi_msg() method for vector domain This shouldn't be dependent on PCI_MSI. HPET and I/O-APIC can deliver interrupts through MSI without having any PCI in the system at all. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-10-dwmw2@infradead.org --- arch/x86/include/asm/apic.h | 8 +++----- arch/x86/kernel/apic/apic.c | 32 ++++++++++++++++++++++++++++++ arch/x86/kernel/apic/msi.c | 37 ----------------------------------- arch/x86/kernel/apic/vector.c | 6 ++++++ 4 files changed, 41 insertions(+), 42 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index c1f64c6fa357..34cb3c159481 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -520,12 +520,10 @@ static inline void apic_smt_update(void) { } #endif struct msi_msg; +struct irq_cfg; -#ifdef CONFIG_PCI_MSI -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg); -#else -# define x86_vector_msi_compose_msg NULL -#endif +extern void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar); extern void ioapic_zap_locks(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 54f04355aaa2..4c15bf29ea2c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -2480,6 +2481,37 @@ int hard_smp_processor_id(void) return read_apic_id(); } +void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, + bool dmar) +{ + msg->address_hi = MSI_ADDR_BASE_HI; + + msg->address_lo = + MSI_ADDR_BASE_LO | + (apic->dest_mode_logical ? + MSI_ADDR_DEST_MODE_LOGICAL : + MSI_ADDR_DEST_MODE_PHYSICAL) | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID(cfg->dest_apicid); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + MSI_DATA_DELIVERY_FIXED | + MSI_DATA_VECTOR(cfg->vector); + + /* + * Only the IOMMU itself can use the trick of putting destination + * APIC ID into the high bits of the address. Anything else would + * just be writing to memory if it tried that, and needs IR to + * address higher APIC IDs. + */ + if (dmar) + msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + else + WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); +} + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 46ffd41a4238..4eda617eda1e 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -23,42 +22,6 @@ struct irq_domain *x86_pci_msi_default_domain __ro_after_init; -static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, - bool dmar) -{ - msg->address_hi = MSI_ADDR_BASE_HI; - - msg->address_lo = - MSI_ADDR_BASE_LO | - (apic->dest_mode_logical ? - MSI_ADDR_DEST_MODE_LOGICAL : - MSI_ADDR_DEST_MODE_PHYSICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); - - /* - * Only the IOMMU itself can use the trick of putting destination - * APIC ID into the high bits of the address. Anything else would - * just be writing to memory if it tried that, and needs IR to - * address higher APIC IDs. - */ - if (dmar) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); - else - WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); -} - -void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) -{ - __irq_msi_compose_msg(irqd_cfg(data), msg, false); -} - static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg) { struct msi_msg msg[2] = { [1] = { }, }; diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1eac53632786..bb2e2a2488a5 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -818,6 +818,12 @@ void apic_ack_edge(struct irq_data *irqd) apic_ack_irq(irqd); } +static void x86_vector_msi_compose_msg(struct irq_data *data, + struct msi_msg *msg) +{ + __irq_msi_compose_msg(irqd_cfg(data), msg, false); +} + static struct irq_chip lapic_controller = { .name = "APIC", .irq_ack = apic_ack_edge, From 3d7295eb3003aea9f89de35304b3a88ae4d5036b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:10 +0100 Subject: [PATCH 10/77] x86/hpet: Move MSI support into hpet.c This isn't really dependent on PCI MSI; it's just generic MSI which is now supported by the generic x86_vector_domain. Move the HPET MSI support back into hpet.c with the rest of the HPET support. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-11-dwmw2@infradead.org --- arch/x86/include/asm/hpet.h | 11 ---- arch/x86/kernel/apic/msi.c | 111 --------------------------------- arch/x86/kernel/hpet.c | 118 ++++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 128 deletions(-) diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 6352dee37cda..ab9f3dd87c80 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -74,17 +74,6 @@ extern void hpet_disable(void); extern unsigned int hpet_readl(unsigned int a); extern void force_hpet_resume(void); -struct irq_data; -struct hpet_channel; -struct irq_domain; - -extern void hpet_msi_unmask(struct irq_data *data); -extern void hpet_msi_mask(struct irq_data *data); -extern void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg); -extern struct irq_domain *hpet_create_irq_domain(int hpet_id); -extern int hpet_assign_irq(struct irq_domain *domain, - struct hpet_channel *hc, int dev_num); - #ifdef CONFIG_HPET_EMULATE_RTC #include diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 4eda617eda1e..44ebe25e7703 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -340,114 +340,3 @@ void dmar_free_hwirq(int irq) irq_domain_free_irqs(irq, 1); } #endif - -/* - * MSI message composition - */ -#ifdef CONFIG_HPET_TIMER -static inline int hpet_dev_id(struct irq_domain *domain) -{ - struct msi_domain_info *info = msi_get_domain_info(domain); - - return (int)(long)info->data; -} - -static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) -{ - hpet_msi_write(irq_data_get_irq_handler_data(data), msg); -} - -static struct irq_chip hpet_msi_controller __ro_after_init = { - .name = "HPET-MSI", - .irq_unmask = hpet_msi_unmask, - .irq_mask = hpet_msi_mask, - .irq_ack = irq_chip_ack_parent, - .irq_set_affinity = msi_domain_set_affinity, - .irq_retrigger = irq_chip_retrigger_hierarchy, - .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, -}; - -static int hpet_msi_init(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq, - irq_hw_number_t hwirq, msi_alloc_info_t *arg) -{ - irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); - irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, - handle_edge_irq, arg->data, "edge"); - - return 0; -} - -static void hpet_msi_free(struct irq_domain *domain, - struct msi_domain_info *info, unsigned int virq) -{ - irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); -} - -static struct msi_domain_ops hpet_msi_domain_ops = { - .msi_init = hpet_msi_init, - .msi_free = hpet_msi_free, -}; - -static struct msi_domain_info hpet_msi_domain_info = { - .ops = &hpet_msi_domain_ops, - .chip = &hpet_msi_controller, - .flags = MSI_FLAG_USE_DEF_DOM_OPS, -}; - -struct irq_domain *hpet_create_irq_domain(int hpet_id) -{ - struct msi_domain_info *domain_info; - struct irq_domain *parent, *d; - struct irq_alloc_info info; - struct fwnode_handle *fn; - - if (x86_vector_domain == NULL) - return NULL; - - domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); - if (!domain_info) - return NULL; - - *domain_info = hpet_msi_domain_info; - domain_info->data = (void *)(long)hpet_id; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.devid = hpet_id; - parent = irq_remapping_get_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, - hpet_id); - if (!fn) { - kfree(domain_info); - return NULL; - } - - d = msi_create_irq_domain(fn, domain_info, parent); - if (!d) { - irq_domain_free_fwnode(fn); - kfree(domain_info); - } - return d; -} - -int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, - int dev_num) -{ - struct irq_alloc_info info; - - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET; - info.data = hc; - info.devid = hpet_dev_id(domain); - info.hwirq = dev_num; - - return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); -} -#endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7a50f0b62a70..3b8b12769f3b 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -50,7 +51,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ bool hpet_msi_disable; -#ifdef CONFIG_PCI_MSI +#ifdef CONFIG_GENERIC_MSI_IRQ static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel); static struct irq_domain *hpet_domain; #endif @@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc) /* * HPET MSI Support */ -#ifdef CONFIG_PCI_MSI - -void hpet_msi_unmask(struct irq_data *data) +#ifdef CONFIG_GENERIC_MSI_IRQ +static void hpet_msi_unmask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_mask(struct irq_data *data) +static void hpet_msi_mask(struct irq_data *data) { struct hpet_channel *hc = irq_data_get_irq_handler_data(data); unsigned int cfg; @@ -489,12 +489,118 @@ void hpet_msi_mask(struct irq_data *data) hpet_writel(cfg, HPET_Tn_CFG(hc->num)); } -void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) +static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg) { hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num)); hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4); } +static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) +{ + hpet_msi_write(irq_data_get_irq_handler_data(data), msg); +} + +static struct irq_chip hpet_msi_controller __ro_after_init = { + .name = "HPET-MSI", + .irq_unmask = hpet_msi_unmask, + .irq_mask = hpet_msi_mask, + .irq_ack = irq_chip_ack_parent, + .irq_set_affinity = msi_domain_set_affinity, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_write_msi_msg = hpet_msi_write_msg, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static int hpet_msi_init(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq, + irq_hw_number_t hwirq, msi_alloc_info_t *arg) +{ + irq_set_status_flags(virq, IRQ_MOVE_PCNTXT); + irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL, + handle_edge_irq, arg->data, "edge"); + + return 0; +} + +static void hpet_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, unsigned int virq) +{ + irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT); +} + +static struct msi_domain_ops hpet_msi_domain_ops = { + .msi_init = hpet_msi_init, + .msi_free = hpet_msi_free, +}; + +static struct msi_domain_info hpet_msi_domain_info = { + .ops = &hpet_msi_domain_ops, + .chip = &hpet_msi_controller, + .flags = MSI_FLAG_USE_DEF_DOM_OPS, +}; + +static struct irq_domain *hpet_create_irq_domain(int hpet_id) +{ + struct msi_domain_info *domain_info; + struct irq_domain *parent, *d; + struct irq_alloc_info info; + struct fwnode_handle *fn; + + if (x86_vector_domain == NULL) + return NULL; + + domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL); + if (!domain_info) + return NULL; + + *domain_info = hpet_msi_domain_info; + domain_info->data = (void *)(long)hpet_id; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; + info.devid = hpet_id; + parent = irq_remapping_get_irq_domain(&info); + if (parent == NULL) + parent = x86_vector_domain; + else + hpet_msi_controller.name = "IR-HPET-MSI"; + + fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, + hpet_id); + if (!fn) { + kfree(domain_info); + return NULL; + } + + d = msi_create_irq_domain(fn, domain_info, parent); + if (!d) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + } + return d; +} + +static inline int hpet_dev_id(struct irq_domain *domain) +{ + struct msi_domain_info *info = msi_get_domain_info(domain); + + return (int)(long)info->data; +} + +static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc, + int dev_num) +{ + struct irq_alloc_info info; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_HPET; + info.data = hc; + info.devid = hpet_dev_id(domain); + info.hwirq = dev_num; + + return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); +} + static int hpet_clkevt_msi_resume(struct clock_event_device *evt) { struct hpet_channel *hc = clockevent_to_channel(evt); From 8073c1ac82c12aaf1b475a3ce5328d43b3eaa4ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:11 +0100 Subject: [PATCH 11/77] genirq/msi: Allow shadow declarations of msi_msg:: $member Architectures like x86 have their MSI messages in various bits of the data, address_lo and address_hi field. Composing or decomposing these messages with bitmasks and shifts is possible, but unreadable gunk. Allow architectures to provide an architecture specific representation for each member of msi_msg. Provide empty defaults for each and stick them into an union. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-12-dwmw2@infradead.org --- include/asm-generic/msi.h | 4 ++++ include/linux/msi.h | 46 +++++++++++++++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/msi.h b/include/asm-generic/msi.h index e6795f088bdd..25344de0e8f9 100644 --- a/include/asm-generic/msi.h +++ b/include/asm-generic/msi.h @@ -4,6 +4,8 @@ #include +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN + #ifndef NUM_MSI_ALLOC_SCRATCHPAD_REGS # define NUM_MSI_ALLOC_SCRATCHPAD_REGS 2 #endif @@ -30,4 +32,6 @@ typedef struct msi_alloc_info { #define GENERIC_MSI_DOMAIN_OPS 1 +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ + #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index 6b584cc4757c..360a0a7e7341 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -4,11 +4,50 @@ #include #include +#include +/* Dummy shadow structures if an architecture does not define them */ +#ifndef arch_msi_msg_addr_lo +typedef struct arch_msi_msg_addr_lo { + u32 address_lo; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#endif + +#ifndef arch_msi_msg_addr_hi +typedef struct arch_msi_msg_addr_hi { + u32 address_hi; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#endif + +#ifndef arch_msi_msg_data +typedef struct arch_msi_msg_data { + u32 data; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#endif + +/** + * msi_msg - Representation of a MSI message + * @address_lo: Low 32 bits of msi message address + * @arch_addrlo: Architecture specific shadow of @address_lo + * @address_hi: High 32 bits of msi message address + * (only used when device supports it) + * @arch_addrhi: Architecture specific shadow of @address_hi + * @data: MSI message data (usually 16 bits) + * @arch_data: Architecture specific shadow of @data + */ struct msi_msg { - u32 address_lo; /* low 32 bits of msi message address */ - u32 address_hi; /* high 32 bits of msi message address */ - u32 data; /* 16 bits of msi message data */ + union { + u32 address_lo; + arch_msi_msg_addr_lo_t arch_addr_lo; + }; + union { + u32 address_hi; + arch_msi_msg_addr_hi_t arch_addr_hi; + }; + union { + u32 data; + arch_msi_msg_data_t arch_data; + }; }; extern int pci_msi_ignore_mask; @@ -243,7 +282,6 @@ struct msi_controller { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN #include -#include struct irq_domain; struct irq_domain_ops; From 6285aa507366729c618d5295fb540b24a956088a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:12 +0100 Subject: [PATCH 12/77] x86/msi: Provide msi message shadow structs Create shadow structs with named bitfields for msi_msg data, address_lo and address_hi and use them in the MSI message composer. Provide a function to retrieve the destination ID. This could be inline, but that'd create a circular header dependency. [dwmw2: fix bitfields not all to be a union] Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-13-dwmw2@infradead.org --- arch/x86/include/asm/msi.h | 49 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/apic/apic.c | 35 ++++++++++++++------------ 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index cd30013d15d3..322fd905da9c 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -9,4 +9,53 @@ typedef struct irq_alloc_info msi_alloc_info_t; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); +/* Structs and defines for the X86 specific MSI message format */ + +typedef struct x86_msi_data { + u32 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + reserved : 2, + active_low : 1, + is_level : 1; + + u32 dmar_subhandle; +} __attribute__ ((packed)) arch_msi_msg_data_t; +#define arch_msi_msg_data x86_msi_data + +typedef struct x86_msi_addr_lo { + union { + struct { + u32 reserved_0 : 2, + dest_mode_logical : 1, + redirect_hint : 1, + reserved_1 : 8, + destid_0_7 : 8, + base_address : 12; + }; + struct { + u32 dmar_reserved_0 : 2, + dmar_index_15 : 1, + dmar_subhandle_valid : 1, + dmar_format : 1, + dmar_index_0_14 : 15, + dmar_base_address : 12; + }; + }; +} __attribute__ ((packed)) arch_msi_msg_addr_lo_t; +#define arch_msi_msg_addr_lo x86_msi_addr_lo + +#define X86_MSI_BASE_ADDRESS_LOW (0xfee00000 >> 20) + +typedef struct x86_msi_addr_hi { + u32 reserved : 8, + destid_8_31 : 24; +} __attribute__ ((packed)) arch_msi_msg_addr_hi_t; +#define arch_msi_msg_addr_hi x86_msi_addr_hi + +#define X86_MSI_BASE_ADDRESS_HIGH (0) + +struct msi_msg; +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid); + #endif /* _ASM_X86_MSI_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 4c15bf29ea2c..f7196ee0f005 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -2484,22 +2483,16 @@ int hard_smp_processor_id(void) void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, bool dmar) { - msg->address_hi = MSI_ADDR_BASE_HI; + memset(msg, 0, sizeof(*msg)); - msg->address_lo = - MSI_ADDR_BASE_LO | - (apic->dest_mode_logical ? - MSI_ADDR_DEST_MODE_LOGICAL : - MSI_ADDR_DEST_MODE_PHYSICAL) | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(cfg->dest_apicid); + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical; + msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF; - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - MSI_DATA_DELIVERY_FIXED | - MSI_DATA_VECTOR(cfg->vector); + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED; + msg->arch_data.vector = cfg->vector; + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; /* * Only the IOMMU itself can use the trick of putting destination * APIC ID into the high bits of the address. Anything else would @@ -2507,11 +2500,21 @@ void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, * address higher APIC IDs. */ if (dmar) - msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid); + msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; else - WARN_ON_ONCE(MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid)); + WARN_ON_ONCE(cfg->dest_apicid > 0xFF); } +u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) +{ + u32 dest = msg->arch_addr_lo.destid_0_7; + + if (extid) + dest |= msg->arch_addr_hi.destid_8_31 << 8; + return dest; +} +EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with From 5c0d0e2cc6e0e7a96c25351fd67c775e7b1f11f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:13 +0100 Subject: [PATCH 13/77] iommu/intel: Use msi_msg shadow structs Use the bitfields in the x86 shadow struct to compose the MSI message. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-14-dwmw2@infradead.org --- drivers/iommu/intel/irq_remapping.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 5628d43b795e..30269b738fa5 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -20,7 +20,6 @@ #include #include #include -#include #include "../irq_remapping.h" @@ -1260,6 +1259,21 @@ static struct irq_chip intel_ir_chip = { .irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity, }; +static void fill_msi_msg(struct msi_msg *msg, u32 index, u32 subhandle) +{ + memset(msg, 0, sizeof(*msg)); + + msg->arch_addr_lo.dmar_base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.dmar_subhandle_valid = true; + msg->arch_addr_lo.dmar_format = true; + msg->arch_addr_lo.dmar_index_0_14 = index & 0x7FFF; + msg->arch_addr_lo.dmar_index_15 = !!(index & 0x8000); + + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + + msg->arch_data.dmar_subhandle = subhandle; +} + static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, struct irq_cfg *irq_cfg, struct irq_alloc_info *info, @@ -1267,7 +1281,6 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, { struct IR_IO_APIC_route_entry *entry; struct irte *irte = &data->irte_entry; - struct msi_msg *msg = &data->msi_entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); switch (info->type) { @@ -1308,12 +1321,7 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, else set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(index) | - MSI_ADDR_IR_INDEX2(index); + fill_msi_msg(&data->msi_entry, index, sub_handle); break; default: From b5c3786ee3704bd8cd5b29ae168526f2b1af4557 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:14 +0100 Subject: [PATCH 14/77] iommu/amd: Use msi_msg shadow structs Get rid of the macro mess and use the shadow structs for the x86 specific MSI message format. Convert the intcapxt setup to use named bitfields as well while touching it anyway. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-15-dwmw2@infradead.org --- drivers/iommu/amd/init.c | 46 ++++++++++++++++++++++----------------- drivers/iommu/amd/iommu.c | 14 +++++++----- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 82e4af8f09bb..263670d36fed 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -1966,10 +1965,16 @@ static int iommu_setup_msi(struct amd_iommu *iommu) return 0; } -#define XT_INT_DEST_MODE(x) (((x) & 0x1ULL) << 2) -#define XT_INT_DEST_LO(x) (((x) & 0xFFFFFFULL) << 8) -#define XT_INT_VEC(x) (((x) & 0xFFULL) << 32) -#define XT_INT_DEST_HI(x) ((((x) >> 24) & 0xFFULL) << 56) +union intcapxt { + u64 capxt; + u64 reserved_0 : 2, + dest_mode_logical : 1, + reserved_1 : 5, + destid_0_23 : 24, + vector : 8, + reserved_2 : 16, + destid_24_31 : 8; +} __attribute__ ((packed)); /* * Setup the IntCapXT registers with interrupt routing information @@ -1978,28 +1983,29 @@ static int iommu_setup_msi(struct amd_iommu *iommu) */ static void iommu_update_intcapxt(struct amd_iommu *iommu) { - u64 val; - u32 addr_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); - u32 addr_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); - u32 data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - bool dm = (addr_lo >> MSI_ADDR_DEST_MODE_SHIFT) & 0x1; - u32 dest = ((addr_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xFF); + struct msi_msg msg; + union intcapxt xt; + u32 destid; - if (x2apic_enabled()) - dest |= MSI_ADDR_EXT_DEST_ID(addr_hi); + msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); + msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); + msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - val = XT_INT_VEC(data & 0xFF) | - XT_INT_DEST_MODE(dm) | - XT_INT_DEST_LO(dest) | - XT_INT_DEST_HI(dest); + destid = x86_msi_msg_get_destid(&msg, x2apic_enabled()); + + xt.capxt = 0ULL; + xt.dest_mode_logical = msg.arch_data.dest_mode_logical; + xt.vector = msg.arch_data.vector; + xt.destid_0_23 = destid & GENMASK(23, 0); + xt.destid_24_31 = destid >> 24; /** * Current IOMMU implemtation uses the same IRQ for all * 3 IOMMU interrupts. */ - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); - writeq(val, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); + writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); } static void _irq_notifier_notify(struct irq_affinity_notify *notify, diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index d7f0c8908602..473de0920b64 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -3656,13 +3655,20 @@ struct irq_remap_ops amd_iommu_irq_ops = { .get_irq_domain = get_irq_domain, }; +static void fill_msi_msg(struct msi_msg *msg, u32 index) +{ + msg->data = index; + msg->address_lo = 0; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; +} + static void irq_remapping_prepare_irte(struct amd_ir_data *data, struct irq_cfg *irq_cfg, struct irq_alloc_info *info, int devid, int index, int sub_handle) { struct irq_2_irte *irte_info = &data->irq_2_irte; - struct msi_msg *msg = &data->msi_entry; struct IO_APIC_route_entry *entry; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -3693,9 +3699,7 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, case X86_IRQ_ALLOC_TYPE_HPET: case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO; - msg->data = irte_info->index; + fill_msi_msg(&data->msi_entry, irte_info->index); break; default: From e16c8058a10ba8e38d0d1ad0b64e444b245ffdbd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:15 +0100 Subject: [PATCH 15/77] PCI: vmd: Use msi_msg shadow structs Use the x86 shadow structs in msi_msg instead of the macros. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-16-dwmw2@infradead.org --- drivers/pci/controller/vmd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index f375c21ceeb1..6f8795454e5a 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -18,7 +18,6 @@ #include #include #include -#include #define VMD_CFGBAR 0 #define VMD_MEMBAR1 2 @@ -131,10 +130,10 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct vmd_irq_list *irq = vmdirq->irq; struct vmd_dev *vmd = irq_data_get_irq_handler_data(data); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = MSI_ADDR_BASE_LO | - MSI_ADDR_DEST_ID(index_from_irqs(vmd, irq)); - msg->data = 0; + memset(msg, 0, sizeof(*msg)); + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_addr_lo.destid_0_7 = index_from_irqs(vmd, irq); } /* From 485940e0e691d6d7874fe1fe3b9453c5af41aace Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:16 +0100 Subject: [PATCH 16/77] x86/kvm: Use msi_msg shadow structs Use the bitfields in the x86 shadow structs instead of decomposing the 32bit value with macros. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-17-dwmw2@infradead.org --- arch/x86/kvm/irq_comm.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 4aa1c2e00e2a..8a4de3f12820 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -16,8 +16,6 @@ #include -#include - #include "irq.h" #include "ioapic.h" @@ -104,22 +102,19 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq) { - trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ? - (u64)e->msi.address_hi << 32 : 0), - e->msi.data); + struct msi_msg msg = { .address_lo = e->msi.address_lo, + .address_hi = e->msi.address_hi, + .data = e->msi.data }; - irq->dest_id = (e->msi.address_lo & - MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; - if (kvm->arch.x2apic_format) - irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi); - irq->vector = (e->msi.data & - MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; - irq->dest_mode = kvm_lapic_irq_dest_mode( - !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo)); - irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; - irq->delivery_mode = e->msi.data & 0x700; - irq->msi_redir_hint = ((e->msi.address_lo - & MSI_ADDR_REDIRECTION_LOWPRI) > 0); + trace_kvm_msi_set_irq(msg.address_lo | (kvm->arch.x2apic_format ? + (u64)msg.address_hi << 32 : 0), msg.data); + + irq->dest_id = x86_msi_msg_get_destid(&msg, kvm->arch.x2apic_format); + irq->vector = msg.arch_data.vector; + irq->dest_mode = kvm_lapic_irq_dest_mode(msg.arch_addr_lo.dest_mode_logical); + irq->trig_mode = msg.arch_data.is_level; + irq->delivery_mode = msg.arch_data.delivery_mode << 8; + irq->msi_redir_hint = msg.arch_addr_lo.redirect_hint; irq->level = 1; irq->shorthand = APIC_DEST_NOSHORT; } From 41bb2115beec5e318095a89f5ad4a9c343cb21ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:17 +0100 Subject: [PATCH 17/77] x86/pci/xen: Use msi_msg shadow structs Use the msi_msg shadow structs and compose the message with named bitfields instead of the unreadable macro maze. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-18-dwmw2@infradead.org --- arch/x86/pci/xen.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index c552cd2d0632..3d41a09c2c14 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -152,7 +152,6 @@ static int acpi_register_gsi_xen(struct device *dev, u32 gsi, #if defined(CONFIG_PCI_MSI) #include -#include struct xen_pci_frontend_ops *xen_pci_frontend; EXPORT_SYMBOL_GPL(xen_pci_frontend); @@ -210,23 +209,20 @@ free: return ret; } -#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ - MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) - static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, struct msi_msg *msg) { - /* We set vector == 0 to tell the hypervisor we don't care about it, - * but we want a pirq setup instead. - * We use the dest_id field to pass the pirq that we want. */ - msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq); - msg->address_lo = - MSI_ADDR_BASE_LO | - MSI_ADDR_DEST_MODE_PHYSICAL | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DEST_ID(pirq); - - msg->data = XEN_PIRQ_MSI_DATA; + /* + * We set vector == 0 to tell the hypervisor we don't care about + * it, but we want a pirq setup instead. We use the dest_id fields + * to pass the pirq that we want. + */ + memset(msg, 0, sizeof(*msg)); + msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH; + msg->arch_addr_hi.destid_8_31 = pirq >> 8; + msg->arch_addr_lo.destid_0_7 = pirq & 0xFF; + msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW; + msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_EXTINT; } static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) From 0c1883c1eb9dfa3c72af6e00425eeb1eb171a03e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:18 +0100 Subject: [PATCH 18/77] x86/msi: Remove msidef.h Nothing uses the macro maze anymore. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-19-dwmw2@infradead.org --- arch/x86/include/asm/msidef.h | 57 ----------------------------------- 1 file changed, 57 deletions(-) delete mode 100644 arch/x86/include/asm/msidef.h diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h deleted file mode 100644 index ee2f8ccc32d0..000000000000 --- a/arch/x86/include/asm/msidef.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_MSIDEF_H -#define _ASM_X86_MSIDEF_H - -/* - * Constants for Intel APIC based MSI messages. - */ - -/* - * Shifts for MSI data - */ - -#define MSI_DATA_VECTOR_SHIFT 0 -#define MSI_DATA_VECTOR_MASK 0x000000ff -#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \ - MSI_DATA_VECTOR_MASK) - -#define MSI_DATA_DELIVERY_MODE_SHIFT 8 -#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT) -#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT) - -#define MSI_DATA_LEVEL_SHIFT 14 -#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT) -#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT) - -#define MSI_DATA_TRIGGER_SHIFT 15 -#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT) -#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT) - -/* - * Shift/mask fields for msi address - */ - -#define MSI_ADDR_BASE_HI 0 -#define MSI_ADDR_BASE_LO 0xfee00000 - -#define MSI_ADDR_DEST_MODE_SHIFT 2 -#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT) -#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT) - -#define MSI_ADDR_REDIRECTION_SHIFT 3 -#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT) - /* dedicated cpu */ -#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT) - /* lowest priority */ - -#define MSI_ADDR_DEST_ID_SHIFT 12 -#define MSI_ADDR_DEST_ID_MASK 0x00ffff0 -#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \ - MSI_ADDR_DEST_ID_MASK) -#define MSI_ADDR_EXT_DEST_ID(dest) ((dest) & 0xffffff00) - -#define MSI_ADDR_IR_EXT_INT (1 << 4) -#define MSI_ADDR_IR_SHV (1 << 3) -#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13) -#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5) -#endif /* _ASM_X86_MSIDEF_H */ From a27dca645d2c0f31abb7858aa0e10b2fa0f2f659 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:19 +0100 Subject: [PATCH 19/77] x86/io_apic: Cleanup trigger/polarity helpers 'trigger' and 'polarity' are used throughout the I/O-APIC code for handling the trigger type (edge/level) and the active low/high configuration. While there are defines for initializing these variables and struct members, they are not used consequently and the meaning of 'trigger' and 'polarity' is opaque and confusing at best. Rename them to 'is_level' and 'active_low' and make them boolean in various structs so it's entirely clear what the meaning is. Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-20-dwmw2@infradead.org --- arch/x86/include/asm/hw_irq.h | 6 +- arch/x86/kernel/apic/io_apic.c | 244 +++++++++++++--------------- arch/x86/pci/intel_mid_pci.c | 8 +- drivers/iommu/amd/iommu.c | 10 +- drivers/iommu/intel/irq_remapping.c | 9 +- 5 files changed, 130 insertions(+), 147 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index a4aeeaace040..517847a94dbe 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -47,9 +47,9 @@ enum irq_alloc_type { struct ioapic_alloc_info { int pin; int node; - u32 trigger : 1; - u32 polarity : 1; - u32 valid : 1; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; struct IO_APIC_route_entry *entry; }; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c6d92d2570d0..24a7bba7cbf4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -89,12 +89,12 @@ struct irq_pin_list { }; struct mp_chip_data { - struct list_head irq_2_pin; - struct IO_APIC_route_entry entry; - int trigger; - int polarity; + struct list_head irq_2_pin; + struct IO_APIC_route_entry entry; + bool is_level; + bool active_low; + bool isa_irq; u32 count; - bool isa_irq; }; struct mp_ioapic_gsi { @@ -745,44 +745,7 @@ static int __init find_isa_irq_apic(int irq, int type) return -1; } -#ifdef CONFIG_EISA -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < nr_legacy_irqs()) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always active high edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (IOAPIC_EDGE) -#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always active low level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (IOAPIC_LEVEL) -#define default_PCI_polarity(idx) (IOAPIC_POL_LOW) - -static int irq_polarity(int idx) +static bool irq_active_low(int idx) { int bus = mp_irqs[idx].srcbus; @@ -791,90 +754,139 @@ static int irq_polarity(int idx) */ switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) { case MP_IRQPOL_DEFAULT: - /* conforms to spec, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - return default_ISA_polarity(idx); - else - return default_PCI_polarity(idx); + /* + * Conforms to spec, ie. bus-type dependent polarity. PCI + * defaults to low active. [E]ISA defaults to high active. + */ + return !test_bit(bus, mp_bus_not_pci); case MP_IRQPOL_ACTIVE_HIGH: - return IOAPIC_POL_HIGH; + return false; case MP_IRQPOL_RESERVED: pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n"); fallthrough; case MP_IRQPOL_ACTIVE_LOW: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_POL_LOW; + return true; } } #ifdef CONFIG_EISA -static int eisa_irq_trigger(int idx, int bus, int trigger) +/* + * EISA Edge/Level control register, ELCR + */ +static bool EISA_ELCR(unsigned int irq) +{ + if (irq < nr_legacy_irqs()) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return false; +} + +/* + * EISA interrupts are always active high and can be edge or level + * triggered depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must be + * read in from the ELCR. + */ +static bool eisa_irq_is_level(int idx, int bus, bool level) { switch (mp_bus_id_to_type[bus]) { case MP_BUS_PCI: case MP_BUS_ISA: - return trigger; + return level; case MP_BUS_EISA: - return default_EISA_trigger(idx); + return EISA_ELCR(mp_irqs[idx].srcbusirq); } pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus); - return IOAPIC_LEVEL; + return true; } #else -static inline int eisa_irq_trigger(int idx, int bus, int trigger) +static inline int eisa_irq_is_level(int idx, int bus, bool level) { - return trigger; + return level; } #endif -static int irq_trigger(int idx) +static bool irq_is_level(int idx) { int bus = mp_irqs[idx].srcbus; - int trigger; + bool level; /* * Determine IRQ trigger mode (edge or level sensitive): */ switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) { case MP_IRQTRIG_DEFAULT: - /* conforms to spec, ie. bus-type dependent trigger mode */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); + /* + * Conforms to spec, ie. bus-type dependent trigger + * mode. PCI defaults to egde, ISA to level. + */ + level = test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ - return eisa_irq_trigger(idx, bus, trigger); + return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: - return IOAPIC_EDGE; + return false; case MP_IRQTRIG_RESERVED: pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n"); fallthrough; case MP_IRQTRIG_LEVEL: default: /* Pointless default required due to do gcc stupidity */ - return IOAPIC_LEVEL; + return true; } } +static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity) +{ + int ioapic, pin, idx; + + if (skip_ioapic_setup) + return -1; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return -1; + + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_is_level(idx); + *polarity = irq_active_low(idx); + return 0; +} + +#ifdef CONFIG_ACPI +int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low) +{ + *is_level = *active_low = 0; + return __acpi_get_override_irq(gsi, (bool *)is_level, + (bool *)active_low); +} +#endif + void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node, int trigger, int polarity) { init_irq_alloc_info(info, NULL); info->type = X86_IRQ_ALLOC_TYPE_IOAPIC; info->ioapic.node = node; - info->ioapic.trigger = trigger; - info->ioapic.polarity = polarity; + info->ioapic.is_level = trigger; + info->ioapic.active_low = polarity; info->ioapic.valid = 1; } -#ifndef CONFIG_ACPI -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); -#endif - static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, struct irq_alloc_info *src, u32 gsi, int ioapic_idx, int pin) { - int trigger, polarity; + bool level, pol_low; copy_irq_alloc_info(dst, src); dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC; @@ -883,20 +895,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst, dst->ioapic.valid = 1; if (src && src->ioapic.valid) { dst->ioapic.node = src->ioapic.node; - dst->ioapic.trigger = src->ioapic.trigger; - dst->ioapic.polarity = src->ioapic.polarity; + dst->ioapic.is_level = src->ioapic.is_level; + dst->ioapic.active_low = src->ioapic.active_low; } else { dst->ioapic.node = NUMA_NO_NODE; - if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) { - dst->ioapic.trigger = trigger; - dst->ioapic.polarity = polarity; + if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) { + dst->ioapic.is_level = level; + dst->ioapic.active_low = pol_low; } else { /* * PCI interrupts are always active low level * triggered. */ - dst->ioapic.trigger = IOAPIC_LEVEL; - dst->ioapic.polarity = IOAPIC_POL_LOW; + dst->ioapic.is_level = true; + dst->ioapic.active_low = true; } } } @@ -906,12 +918,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info) return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE; } -static void mp_register_handler(unsigned int irq, unsigned long trigger) +static void mp_register_handler(unsigned int irq, bool level) { irq_flow_handler_t hdl; bool fasteoi; - if (trigger) { + if (level) { irq_set_status_flags(irq, IRQ_LEVEL); fasteoi = true; } else { @@ -933,14 +945,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) * pin with real trigger and polarity attributes. */ if (irq < nr_legacy_irqs() && data->count == 1) { - if (info->ioapic.trigger != data->trigger) - mp_register_handler(irq, info->ioapic.trigger); - data->entry.trigger = data->trigger = info->ioapic.trigger; - data->entry.polarity = data->polarity = info->ioapic.polarity; + if (info->ioapic.is_level != data->is_level) + mp_register_handler(irq, info->ioapic.is_level); + data->entry.trigger = data->is_level = info->ioapic.is_level; + data->entry.polarity = data->active_low = info->ioapic.active_low; } - return data->trigger == info->ioapic.trigger && - data->polarity == info->ioapic.polarity; + return data->is_level == info->ioapic.is_level && + data->active_low == info->ioapic.active_low; } static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, @@ -2179,9 +2191,9 @@ static inline void __init check_timer(void) * so only need to unmask if it is level-trigger * do we really have level trigger timer? */ - int idx; - idx = find_irq_entry(apic1, pin1, mp_INT); - if (idx != -1 && irq_trigger(idx)) + int idx = find_irq_entry(apic1, pin1, mp_INT); + + if (idx != -1 && irq_is_level(idx)) unmask_ioapic_irq(irq_get_irq_data(0)); } irq_domain_deactivate_irq(irq_data); @@ -2588,30 +2600,6 @@ static int io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) -{ - int ioapic, pin, idx; - - if (skip_ioapic_setup) - return -1; - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return -1; - - pin = mp_find_ioapic_pin(ioapic, gsi); - if (pin < 0) - return -1; - - idx = find_irq_entry(ioapic, pin, mp_INT); - if (idx < 0) - return -1; - - *trigger = irq_trigger(idx); - *polarity = irq_polarity(idx); - return 0; -} - /* * This function updates target affinity of IOAPIC interrupts to include * the CPUs which came online during SMP bringup. @@ -2935,13 +2923,13 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, struct irq_alloc_info *info) { if (info && info->ioapic.valid) { - data->trigger = info->ioapic.trigger; - data->polarity = info->ioapic.polarity; - } else if (acpi_get_override_irq(gsi, &data->trigger, - &data->polarity) < 0) { + data->is_level = info->ioapic.is_level; + data->active_low = info->ioapic.active_low; + } else if (__acpi_get_override_irq(gsi, &data->is_level, + &data->active_low) < 0) { /* PCI interrupts are always active low level triggered. */ - data->trigger = IOAPIC_LEVEL; - data->polarity = IOAPIC_POL_LOW; + data->is_level = true; + data->active_low = true; } } @@ -2953,16 +2941,13 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, entry->dest_mode = apic->dest_mode_logical; entry->dest = cfg->dest_apicid; entry->vector = cfg->vector; - entry->trigger = data->trigger; - entry->polarity = data->polarity; + entry->trigger = data->is_level; + entry->polarity = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. */ - if (data->trigger == IOAPIC_LEVEL) - entry->mask = IOAPIC_MASKED; - else - entry->mask = IOAPIC_UNMASKED; + entry->mask = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, @@ -3010,7 +2995,7 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, local_irq_save(flags); if (info->ioapic.entry) mp_setup_entry(cfg, data, info->ioapic.entry); - mp_register_handler(virq, data->trigger); + mp_register_handler(virq, data->is_level); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); @@ -3018,7 +3003,8 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->trigger, data->polarity, cfg->dest_apicid); + virq, data->is_level, data->active_low, + cfg->dest_apicid); return 0; } diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c index 24ca4ee2802f..95e2e6bd8d8c 100644 --- a/arch/x86/pci/intel_mid_pci.c +++ b/arch/x86/pci/intel_mid_pci.c @@ -215,7 +215,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, static int intel_mid_pci_irq_enable(struct pci_dev *dev) { struct irq_alloc_info info; - int polarity; + bool polarity_low; int ret; u8 gsi; @@ -230,7 +230,7 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) switch (intel_mid_identify_cpu()) { case INTEL_MID_CPU_CHIP_TANGIER: - polarity = IOAPIC_POL_HIGH; + polarity_low = false; /* Special treatment for IRQ0 */ if (gsi == 0) { @@ -252,11 +252,11 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev) } break; default: - polarity = IOAPIC_POL_LOW; + polarity_low = true; break; } - ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity); + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); /* * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 473de0920b64..b0e5210e53b2 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3687,13 +3687,11 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, entry = info->ioapic.entry; info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->mask = 0; - entry->trigger = info->ioapic.trigger; - entry->polarity = info->ioapic.polarity; + entry->vector = index; + entry->trigger = info->ioapic.is_level; + entry->polarity = info->ioapic.active_low; /* Mask level triggered irqs. */ - if (info->ioapic.trigger) - entry->mask = 1; + entry->mask = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 30269b738fa5..54ca69333445 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1306,11 +1306,10 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, * irq handler will do the explicit EOI to the io-apic. */ entry->vector = info->ioapic.pin; - entry->mask = 0; /* enable IRQ */ - entry->trigger = info->ioapic.trigger; - entry->polarity = info->ioapic.polarity; - if (info->ioapic.trigger) - entry->mask = 1; /* Mask level triggered irqs. */ + entry->trigger = info->ioapic.is_level; + entry->polarity = info->ioapic.active_low; + /* Mask level triggered irqs. */ + entry->mask = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: From 341b4a7211b6ba3a7089e1dc09ac4bd576dfb05f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Oct 2020 22:35:20 +0100 Subject: [PATCH 20/77] x86/ioapic: Cleanup IO/APIC route entry structs Having two seperate structs for the I/O-APIC RTE entries (non-remapped and DMAR remapped) requires type casts and makes it hard to map. Combine them in IO_APIC_routing_entry by defining a union of two 64bit bitfields. Use naming which reflects which bits are shared and which bits are actually different for the operating modes. [dwmw2: Fix it up and finish the job, pulling the 32-bit w1,w2 words for register access into the same union and eliminating a few more places where bits were accessed through masks and shifts.] Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-21-dwmw2@infradead.org --- arch/x86/include/asm/io_apic.h | 78 ++++++--------- arch/x86/kernel/apic/io_apic.c | 144 +++++++++++++--------------- drivers/iommu/amd/iommu.c | 8 +- drivers/iommu/hyperv-iommu.c | 4 +- drivers/iommu/intel/irq_remapping.c | 19 ++-- 5 files changed, 108 insertions(+), 145 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a1a26f6d3aa4..73da644b2f0d 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -13,15 +13,6 @@ * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar */ -/* I/O Unit Redirection Table */ -#define IO_APIC_REDIR_VECTOR_MASK 0x000FF -#define IO_APIC_REDIR_DEST_LOGICAL 0x00800 -#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000 -#define IO_APIC_REDIR_SEND_PENDING (1 << 12) -#define IO_APIC_REDIR_REMOTE_IRR (1 << 14) -#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15) -#define IO_APIC_REDIR_MASKED (1 << 16) - /* * The structure of the IO-APIC: */ @@ -65,52 +56,39 @@ union IO_APIC_reg_03 { }; struct IO_APIC_route_entry { - __u32 vector : 8, - delivery_mode : 3, /* 000: FIXED - * 001: lowest prio - * 111: ExtINT - */ - dest_mode : 1, /* 0: physical, 1: logical */ - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, /* 0: edge, 1: level */ - mask : 1, /* 0: enabled, 1: disabled */ - __reserved_2 : 15; - - __u32 __reserved_3 : 24, - dest : 8; -} __attribute__ ((packed)); - -struct IR_IO_APIC_route_entry { - __u64 vector : 8, - zero : 3, - index2 : 1, - delivery_status : 1, - polarity : 1, - irr : 1, - trigger : 1, - mask : 1, - reserved : 31, - format : 1, - index : 15; + union { + struct { + u64 vector : 8, + delivery_mode : 3, + dest_mode_logical : 1, + delivery_status : 1, + active_low : 1, + irr : 1, + is_level : 1, + masked : 1, + reserved_0 : 15, + reserved_1 : 24, + destid_0_7 : 8; + }; + struct { + u64 ir_shared_0 : 8, + ir_zero : 3, + ir_index_15 : 1, + ir_shared_1 : 5, + ir_reserved_0 : 31, + ir_format : 1, + ir_index_0_14 : 15; + }; + struct { + u64 w1 : 32, + w2 : 32; + }; + }; } __attribute__ ((packed)); struct irq_alloc_info; struct ioapic_domain_cfg; -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#define IOAPIC_MASKED 1 -#define IOAPIC_UNMASKED 0 - -#define IOAPIC_POL_HIGH 0 -#define IOAPIC_POL_LOW 1 - -#define IOAPIC_DEST_MODE_PHYSICAL 0 -#define IOAPIC_DEST_MODE_LOGICAL 1 - #define IOAPIC_MAP_ALLOC 0x1 #define IOAPIC_MAP_CHECK 0x2 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 24a7bba7cbf4..07e754131854 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -286,31 +286,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg, writel(value, &io_apic->data); } -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + entry.w1 = io_apic_read(apic, 0x10 + 2 * pin); + entry.w2 = io_apic_read(apic, 0x11 + 2 * pin); - return eu.entry; + return entry; } static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { - union entry_union eu; + struct IO_APIC_route_entry entry; unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - eu.entry = __ioapic_read_entry(apic, pin); + entry = __ioapic_read_entry(apic, pin); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; + return entry; } /* @@ -321,11 +316,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) */ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - union entry_union eu = {{0, 0}}; - - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); } static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) @@ -344,12 +336,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) */ static void ioapic_mask_entry(int apic, int pin) { + struct IO_APIC_route_entry e = { .masked = true }; unsigned long flags; - union entry_union eu = { .entry.mask = IOAPIC_MASKED }; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, e.w1); + io_apic_write(apic, 0x11 + 2*pin, e.w2); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -422,20 +414,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node, add_pin_to_irq_node(data, node, newapic, newpin); } -static void io_apic_modify_irq(struct mp_chip_data *data, - int mask_and, int mask_or, +static void io_apic_modify_irq(struct mp_chip_data *data, bool masked, void (*final)(struct irq_pin_list *entry)) { - union entry_union eu; struct irq_pin_list *entry; - eu.entry = data->entry; - eu.w1 &= mask_and; - eu.w1 |= mask_or; - data->entry = eu.entry; + data->entry.masked = masked; for_each_irq_pin(entry, data->irq_2_pin) { - io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1); + io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1); if (final) final(entry); } @@ -459,13 +446,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data) unsigned long flags; raw_spin_lock_irqsave(&ioapic_lock, flags); - io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); + io_apic_modify_irq(data, true, &io_apic_sync); raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void __unmask_ioapic(struct mp_chip_data *data) { - io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL); + io_apic_modify_irq(data, false, NULL); } static void unmask_ioapic_irq(struct irq_data *irq_data) @@ -506,8 +493,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector) /* * Mask the entry and change the trigger mode to edge. */ - entry1.mask = IOAPIC_MASKED; - entry1.trigger = IOAPIC_EDGE; + entry1.masked = true; + entry1.is_level = false; __ioapic_write_entry(apic, pin, entry1); @@ -542,8 +529,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * Make sure the entry is masked and re-read the contents to check * if it is a level triggered pin and if the remote-IRR is set. */ - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); entry = ioapic_read_entry(apic, pin); } @@ -556,8 +543,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) * doesn't clear the remote-IRR if the trigger mode is not * set to level. */ - if (entry.trigger == IOAPIC_EDGE) { - entry.trigger = IOAPIC_LEVEL; + if (!entry.is_level) { + entry.is_level = true; ioapic_write_entry(apic, pin, entry); } raw_spin_lock_irqsave(&ioapic_lock, flags); @@ -659,8 +646,8 @@ void mask_ioapic_entries(void) struct IO_APIC_route_entry entry; entry = ioapics[apic].saved_registers[pin]; - if (entry.mask == IOAPIC_UNMASKED) { - entry.mask = IOAPIC_MASKED; + if (!entry.masked) { + entry.masked = true; ioapic_write_entry(apic, pin, entry); } } @@ -947,8 +934,8 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info) if (irq < nr_legacy_irqs() && data->count == 1) { if (info->ioapic.is_level != data->is_level) mp_register_handler(irq, info->ioapic.is_level); - data->entry.trigger = data->is_level = info->ioapic.is_level; - data->entry.polarity = data->active_low = info->ioapic.active_low; + data->entry.is_level = data->is_level = info->ioapic.is_level; + data->entry.active_low = data->active_low = info->ioapic.active_low; } return data->is_level == info->ioapic.is_level && @@ -1231,10 +1218,9 @@ void ioapic_zap_locks(void) static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) { - int i; - char buf[256]; struct IO_APIC_route_entry entry; - struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry; + char buf[256]; + int i; printk(KERN_DEBUG "IOAPIC %d:\n", apic); for (i = 0; i <= nr_entries; i++) { @@ -1242,20 +1228,20 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) snprintf(buf, sizeof(buf), " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)", i, - entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ", - entry.trigger == IOAPIC_LEVEL ? "level" : "edge ", - entry.polarity == IOAPIC_POL_LOW ? "low " : "high", + entry.masked ? "disabled" : "enabled ", + entry.is_level ? "level" : "edge ", + entry.active_low ? "low " : "high", entry.vector, entry.irr, entry.delivery_status); - if (ir_entry->format) + if (entry.ir_format) { printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n", - buf, (ir_entry->index2 << 15) | ir_entry->index, - ir_entry->zero); - else - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, - entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ? - "logical " : "physical", - entry.dest, entry.delivery_mode); + (entry.ir_index_15 << 15) | entry.ir_index_0_14, + entry.ir_zero); + } else { + printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, + entry.dest_mode_logical ? "logical " : "physical", + entry.destid_0_7, entry.delivery_mode); + } } } @@ -1380,8 +1366,8 @@ void __init enable_IO_APIC(void) /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ - if ((entry.mask == 0) && - (entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT)) { + if (!entry.masked && + entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) { ioapic_i8259.apic = apic; ioapic_i8259.pin = pin; goto found_i8259; @@ -1425,12 +1411,12 @@ void native_restore_boot_irq_mode(void) struct IO_APIC_route_entry entry; memset(&entry, 0, sizeof(entry)); - entry.mask = IOAPIC_UNMASKED; - entry.trigger = IOAPIC_EDGE; - entry.polarity = IOAPIC_POL_HIGH; - entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; + entry.masked = false; + entry.is_level = false; + entry.active_low = false; + entry.dest_mode_logical = false; entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; - entry.dest = read_apic_id(); + entry.destid_0_7 = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -1709,13 +1695,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, data->irq_2_pin) { - unsigned int reg; + struct IO_APIC_route_entry e; int pin; pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); + e.w1 = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { + if (e.irr) { raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } @@ -1874,7 +1860,7 @@ static void ioapic_configure_entry(struct irq_data *irqd) * ioapic chip to verify that. */ if (irqd->chip == &ioapic_chip) { - mpd->entry.dest = cfg->dest_apicid; + mpd->entry.destid_0_7 = cfg->dest_apicid; mpd->entry.vector = cfg->vector; } for_each_irq_pin(entry, mpd->irq_2_pin) @@ -1932,7 +1918,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd, * irrelevant because the IO-APIC treats them as fire and * forget. */ - if (rentry.irr && rentry.trigger) { + if (rentry.irr && rentry.is_level) { *state = true; break; } @@ -2057,12 +2043,12 @@ static inline void __init unlock_ExtINT_logic(void) memset(&entry1, 0, sizeof(entry1)); - entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL; - entry1.mask = IOAPIC_UNMASKED; - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; - entry1.polarity = entry0.polarity; - entry1.trigger = IOAPIC_EDGE; + entry1.dest_mode_logical = true; + entry1.masked = false; + entry1.destid_0_7 = hard_smp_processor_id(); + entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; + entry1.active_low = entry0.active_low; + entry1.is_level = false; entry1.vector = 0; ioapic_write_entry(apic, pin, entry1); @@ -2937,17 +2923,17 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, struct IO_APIC_route_entry *entry) { memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->delivery_mode; - entry->dest_mode = apic->dest_mode_logical; - entry->dest = cfg->dest_apicid; - entry->vector = cfg->vector; - entry->trigger = data->is_level; - entry->polarity = data->active_low; + entry->delivery_mode = apic->delivery_mode; + entry->dest_mode_logical = apic->dest_mode_logical; + entry->destid_0_7 = cfg->dest_apicid; + entry->vector = cfg->vector; + entry->is_level = data->is_level; + entry->active_low = data->active_low; /* * Mask level triggered irqs. Edge triggered irqs are masked * by the irq core code in case they fire. */ - entry->mask = data->is_level; + entry->masked = data->is_level; } int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b0e5210e53b2..3d72ec7bbbf8 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3687,11 +3687,11 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, entry = info->ioapic.entry; info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->trigger = info->ioapic.is_level; - entry->polarity = info->ioapic.active_low; + entry->vector = index; + entry->is_level = info->ioapic.is_level; + entry->active_low = info->ioapic.active_low; /* Mask level triggered irqs. */ - entry->mask = info->ioapic.is_level; + entry->masked = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index e09e2d734c57..1ab7eb918a5c 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -52,7 +52,7 @@ static int hyperv_ir_set_affinity(struct irq_data *data, return ret; entry = data->chip_data; - entry->dest = cfg->dest_apicid; + entry->destid_0_7 = cfg->dest_apicid; entry->vector = cfg->vector; send_cleanup_vector(cfg); @@ -125,7 +125,7 @@ static int hyperv_irq_remapping_activate(struct irq_domain *domain, struct irq_cfg *cfg = irqd_cfg(irq_data); struct IO_APIC_route_entry *entry = irq_data->chip_data; - entry->dest = cfg->dest_apicid; + entry->destid_0_7 = cfg->dest_apicid; entry->vector = cfg->vector; return 0; diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 54ca69333445..625bdb9f1627 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1279,8 +1279,8 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, struct irq_alloc_info *info, int index, int sub_handle) { - struct IR_IO_APIC_route_entry *entry; struct irte *irte = &data->irte_entry; + struct IO_APIC_route_entry *entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); switch (info->type) { @@ -1294,22 +1294,21 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, irte->avail, irte->vector, irte->dest_id, irte->sid, irte->sq, irte->svt); - entry = (struct IR_IO_APIC_route_entry *)info->ioapic.entry; + entry = info->ioapic.entry; info->ioapic.entry = NULL; memset(entry, 0, sizeof(*entry)); - entry->index2 = (index >> 15) & 0x1; - entry->zero = 0; - entry->format = 1; - entry->index = (index & 0x7fff); + entry->ir_index_15 = !!(index & 0x8000); + entry->ir_format = true; + entry->ir_index_0_14 = index & 0x7fff; /* * IO-APIC RTE will be configured with virtual vector. * irq handler will do the explicit EOI to the io-apic. */ - entry->vector = info->ioapic.pin; - entry->trigger = info->ioapic.is_level; - entry->polarity = info->ioapic.active_low; + entry->vector = info->ioapic.pin; + entry->is_level = info->ioapic.is_level; + entry->active_low = info->ioapic.active_low; /* Mask level triggered irqs. */ - entry->mask = info->ioapic.is_level; + entry->masked = info->ioapic.is_level; break; case X86_IRQ_ALLOC_TYPE_HPET: From 5d5a97133887b2dfd8e2ad0347c3a02cc7aaa0cb Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:21 +0100 Subject: [PATCH 21/77] x86/ioapic: Generate RTE directly from parent irqchip's MSI message The I/O-APIC generates an MSI cycle with address/data bits taken from its Redirection Table Entry in some combination which used to make sense, but now is just a bunch of bits which get passed through in some seemingly arbitrary order. Instead of making IRQ remapping drivers directly frob the I/OA-PIC RTE, let them just do their job and generate an MSI message. The bit swizzling to turn that MSI message into the I/O-APIC's RTE is the same in all cases, since it's a function of the I/O-APIC hardware. The IRQ remappers have no real need to get involved with that. The only slight caveat is that the I/OAPIC is interpreting some of those fields too, and it does want the 'vector' field to be unique to make EOI work. The AMD IOMMU happens to put its IRTE index in the bits that the I/O-APIC thinks are the vector field, and accommodates this requirement by reserving the first 32 indices for the I/O-APIC. The Intel IOMMU doesn't actually use the bits that the I/O-APIC thinks are the vector field, so it fills in the 'pin' value there instead. [ tglx: Replaced the unreadably macro maze with the cleaned up RTE/msi_msg bitfields and added commentry to explain the mapping magic ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-22-dwmw2@infradead.org --- arch/x86/include/asm/hw_irq.h | 11 ++- arch/x86/kernel/apic/io_apic.c | 103 +++++++++++++++++++--------- drivers/iommu/amd/iommu.c | 12 ---- drivers/iommu/hyperv-iommu.c | 31 --------- drivers/iommu/intel/irq_remapping.c | 31 ++------- 5 files changed, 83 insertions(+), 105 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 517847a94dbe..83a69f62637e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -45,12 +45,11 @@ enum irq_alloc_type { }; struct ioapic_alloc_info { - int pin; - int node; - u32 is_level : 1; - u32 active_low : 1; - u32 valid : 1; - struct IO_APIC_route_entry *entry; + int pin; + int node; + u32 is_level : 1; + u32 active_low : 1; + u32 valid : 1; }; struct uv_alloc_info { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 07e754131854..ea983da1a57f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,6 +48,7 @@ #include /* time_after() */ #include #include +#include #include #include @@ -63,7 +64,6 @@ #include #include #include - #include #define for_each_ioapic(idx) \ @@ -1848,21 +1848,58 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(data->entry.vector, data); } +/* + * The I/OAPIC is just a device for generating MSI messages from legacy + * interrupt pins. Various fields of the RTE translate into bits of the + * resulting MSI which had a historical meaning. + * + * With interrupt remapping, many of those bits have different meanings + * in the underlying MSI, but the way that the I/OAPIC transforms them + * from its RTE to the MSI message is the same. This function allows + * the parent IRQ domain to compose the MSI message, then takes the + * relevant bits to put them in the appropriate places in the RTE in + * order to generate that message when the IRQ happens. + * + * The setup here relies on a preconfigured route entry (is_level, + * active_low, masked) because the parent domain is merely composing the + * generic message routing information which is used for the MSI. + */ +static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, + struct IO_APIC_route_entry *entry) +{ + struct msi_msg msg; + + /* Let the parent domain compose the MSI message */ + irq_chip_compose_msi_msg(irq_data, &msg); + + /* + * - Real vector + * - DMAR/IR: 8bit subhandle (ioapic.pin) + * - AMD/IR: 8bit IRTE index + */ + entry->vector = msg.arch_data.vector; + /* Delivery mode (for DMAR/IR all 0) */ + entry->delivery_mode = msg.arch_data.delivery_mode; + /* Destination mode or DMAR/IR index bit 15 */ + entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical; + /* DMAR/IR: 1, 0 for all other modes */ + entry->ir_format = msg.arch_addr_lo.dmar_format; + /* + * DMAR/IR: index bit 0-14. + * + * All other modes have bit 0-6 of dmar_index_0_14 cleared and the + * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). + */ + entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14; +} + static void ioapic_configure_entry(struct irq_data *irqd) { struct mp_chip_data *mpd = irqd->chip_data; - struct irq_cfg *cfg = irqd_cfg(irqd); struct irq_pin_list *entry; - /* - * Only update when the parent is the vector domain, don't touch it - * if the parent is the remapping domain. Check the installed - * ioapic chip to verify that. - */ - if (irqd->chip == &ioapic_chip) { - mpd->entry.destid_0_7 = cfg->dest_apicid; - mpd->entry.vector = cfg->vector; - } + ioapic_setup_msg_from_msi(irqd, &mpd->entry); + for_each_irq_pin(entry, mpd->irq_2_pin) __ioapic_write_entry(entry->apic, entry->pin, mpd->entry); } @@ -2919,14 +2956,23 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data, } } -static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, - struct IO_APIC_route_entry *entry) +/* + * Configure the I/O-APIC specific fields in the routing entry. + * + * This is important to setup the I/O-APIC specific bits (is_level, + * active_low, masked) because the underlying parent domain will only + * provide the routing information and is oblivious of the I/O-APIC + * specific bits. + * + * The entry is just preconfigured at this point and not written into the + * RTE. This happens later during activation which will fill in the actual + * routing information. + */ +static void mp_preconfigure_entry(struct mp_chip_data *data) { + struct IO_APIC_route_entry *entry = &data->entry; + memset(entry, 0, sizeof(*entry)); - entry->delivery_mode = apic->delivery_mode; - entry->dest_mode_logical = apic->dest_mode_logical; - entry->destid_0_7 = cfg->dest_apicid; - entry->vector = cfg->vector; entry->is_level = data->is_level; entry->active_low = data->active_low; /* @@ -2939,11 +2985,10 @@ static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data, int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - int ret, ioapic, pin; - struct irq_cfg *cfg; - struct irq_data *irq_data; - struct mp_chip_data *data; struct irq_alloc_info *info = arg; + struct mp_chip_data *data; + struct irq_data *irq_data; + int ret, ioapic, pin; unsigned long flags; if (!info || nr_irqs > 1) @@ -2961,7 +3006,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, if (!data) return -ENOMEM; - info->ioapic.entry = &data->entry; ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info); if (ret < 0) { kfree(data); @@ -2975,23 +3019,20 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info); - cfg = irqd_cfg(irq_data); add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin); - local_irq_save(flags); - if (info->ioapic.entry) - mp_setup_entry(cfg, data, info->ioapic.entry); + mp_preconfigure_entry(data); mp_register_handler(virq, data->is_level); + + local_irq_save(flags); if (virq < nr_legacy_irqs()) legacy_pic->mask(virq); local_irq_restore(flags); apic_printk(APIC_VERBOSE, KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n", - ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector, - virq, data->is_level, data->active_low, - cfg->dest_apicid); - + "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n", + ioapic, mpc_ioapic_id(ioapic), pin, virq, + data->is_level, data->active_low); return 0; } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 3d72ec7bbbf8..9744cdbefd88 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3669,7 +3669,6 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, int devid, int index, int sub_handle) { struct irq_2_irte *irte_info = &data->irq_2_irte; - struct IO_APIC_route_entry *entry; struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; if (!iommu) @@ -3683,17 +3682,6 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - /* Setup IOAPIC entry */ - entry = info->ioapic.entry; - info->ioapic.entry = NULL; - memset(entry, 0, sizeof(*entry)); - entry->vector = index; - entry->is_level = info->ioapic.is_level; - entry->active_low = info->ioapic.active_low; - /* Mask level triggered irqs. */ - entry->masked = info->ioapic.is_level; - break; - case X86_IRQ_ALLOC_TYPE_HPET: case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 1ab7eb918a5c..37dd485a5640 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -40,7 +40,6 @@ static int hyperv_ir_set_affinity(struct irq_data *data, { struct irq_data *parent = data->parent_data; struct irq_cfg *cfg = irqd_cfg(data); - struct IO_APIC_route_entry *entry; int ret; /* Return error If new irq affinity is out of ioapic_max_cpumask. */ @@ -51,9 +50,6 @@ static int hyperv_ir_set_affinity(struct irq_data *data, if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; - entry = data->chip_data; - entry->destid_0_7 = cfg->dest_apicid; - entry->vector = cfg->vector; send_cleanup_vector(cfg); return 0; @@ -89,20 +85,6 @@ static int hyperv_irq_remapping_alloc(struct irq_domain *domain, irq_data->chip = &hyperv_ir_chip; - /* - * If there is interrupt remapping function of IOMMU, setting irq - * affinity only needs to change IRTE of IOMMU. But Hyper-V doesn't - * support interrupt remapping function, setting irq affinity of IO-APIC - * interrupts still needs to change IO-APIC registers. But ioapic_ - * configure_entry() will ignore value of cfg->vector and cfg-> - * dest_apicid when IO-APIC's parent irq domain is not the vector - * domain.(See ioapic_configure_entry()) In order to setting vector - * and dest_apicid to IO-APIC register, IO-APIC entry pointer is saved - * in the chip_data and hyperv_irq_remapping_activate()/hyperv_ir_set_ - * affinity() set vector and dest_apicid directly into IO-APIC entry. - */ - irq_data->chip_data = info->ioapic.entry; - /* * Hypver-V IO APIC irq affinity should be in the scope of * ioapic_max_cpumask because no irq remapping support. @@ -119,22 +101,9 @@ static void hyperv_irq_remapping_free(struct irq_domain *domain, irq_domain_free_irqs_common(domain, virq, nr_irqs); } -static int hyperv_irq_remapping_activate(struct irq_domain *domain, - struct irq_data *irq_data, bool reserve) -{ - struct irq_cfg *cfg = irqd_cfg(irq_data); - struct IO_APIC_route_entry *entry = irq_data->chip_data; - - entry->destid_0_7 = cfg->dest_apicid; - entry->vector = cfg->vector; - - return 0; -} - static const struct irq_domain_ops hyperv_ir_domain_ops = { .alloc = hyperv_irq_remapping_alloc, .free = hyperv_irq_remapping_free, - .activate = hyperv_irq_remapping_activate, }; static int __init hyperv_prepare_irq_remapping(void) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 625bdb9f1627..96c84b19940e 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1280,9 +1280,9 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, int index, int sub_handle) { struct irte *irte = &data->irte_entry; - struct IO_APIC_route_entry *entry; prepare_irte(irte, irq_cfg->vector, irq_cfg->dest_apicid); + switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ @@ -1293,39 +1293,20 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, irte->trigger_mode, irte->dlvry_mode, irte->avail, irte->vector, irte->dest_id, irte->sid, irte->sq, irte->svt); - - entry = info->ioapic.entry; - info->ioapic.entry = NULL; - memset(entry, 0, sizeof(*entry)); - entry->ir_index_15 = !!(index & 0x8000); - entry->ir_format = true; - entry->ir_index_0_14 = index & 0x7fff; - /* - * IO-APIC RTE will be configured with virtual vector. - * irq handler will do the explicit EOI to the io-apic. - */ - entry->vector = info->ioapic.pin; - entry->is_level = info->ioapic.is_level; - entry->active_low = info->ioapic.active_low; - /* Mask level triggered irqs. */ - entry->masked = info->ioapic.is_level; + sub_handle = info->ioapic.pin; break; - case X86_IRQ_ALLOC_TYPE_HPET: + set_hpet_sid(irte, info->devid); + break; case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: - if (info->type == X86_IRQ_ALLOC_TYPE_HPET) - set_hpet_sid(irte, info->devid); - else - set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); - - fill_msi_msg(&data->msi_entry, index, sub_handle); + set_msi_sid(irte, msi_desc_to_pci_dev(info->desc)); break; - default: BUG_ON(1); break; } + fill_msi_msg(&data->msi_entry, index, sub_handle); } static void intel_free_irq_resources(struct irq_domain *domain, From 2cbd5a45e5296b28d64224ffbbd33d427704ba1b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:22 +0100 Subject: [PATCH 22/77] genirq/irqdomain: Implement get_name() method on irqchip fwnodes Prerequisite to make x86 more irqdomain compliant. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20201024213535.443185-23-dwmw2@infradead.org --- kernel/irq/irqdomain.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..673fa64c1c44 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -42,7 +42,16 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif -const struct fwnode_operations irqchip_fwnode_ops; +static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + return fwid->name; +} + +const struct fwnode_operations irqchip_fwnode_ops = { + .get_name = irqchip_fwnode_get_name, +}; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** From 6452ea2a323b80868ce5e6d3030e4ccbeab9dc30 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:23 +0100 Subject: [PATCH 23/77] x86/apic: Add select() method on vector irqdomain This will be used to select the irqdomain for I/O-APIC and HPET. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-24-dwmw2@infradead.org --- arch/x86/include/asm/irqdomain.h | 3 +++ arch/x86/kernel/apic/vector.c | 43 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h index cd684d45cb5f..125c23b7bad3 100644 --- a/arch/x86/include/asm/irqdomain.h +++ b/arch/x86/include/asm/irqdomain.h @@ -12,6 +12,9 @@ enum { X86_IRQ_ALLOC_LEGACY = 0x2, }; +extern int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec); +extern int x86_fwspec_is_hpet(struct irq_fwspec *fwspec); + extern struct irq_domain *x86_vector_domain; extern void init_irq_alloc_info(struct irq_alloc_info *info, diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index bb2e2a2488a5..b9b05caa28a4 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -636,7 +636,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "IO-APIC-", 8) && + simple_strtol(fwname+8, NULL, 10) == fwspec->param[0]; + } + return to_of_node(fwspec->fwnode) && + of_device_is_compatible(to_of_node(fwspec->fwnode), + "intel,ce4100-ioapic"); +} + +int x86_fwspec_is_hpet(struct irq_fwspec *fwspec) +{ + if (fwspec->param_count != 1) + return 0; + + if (is_fwnode_irqchip(fwspec->fwnode)) { + const char *fwname = fwnode_get_name(fwspec->fwnode); + return fwname && !strncmp(fwname, "HPET-MSI-", 9) && + simple_strtol(fwname+9, NULL, 10) == fwspec->param[0]; + } + return 0; +} + +static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* + * HPET and I/OAPIC cannot be parented in the vector domain + * if IRQ remapping is enabled. APIC IDs above 15 bits are + * only permitted if IRQ remapping is enabled, so check that. + */ + if (apic->apic_id_valid(32768)) + return 0; + + return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec); +} + static const struct irq_domain_ops x86_vector_domain_ops = { + .select = x86_vector_select, .alloc = x86_vector_alloc_irqs, .free = x86_vector_free_irqs, .activate = x86_vector_activate, From a1a785b572425ab3ca5494a4be02ab59a796df51 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:24 +0100 Subject: [PATCH 24/77] iommu/amd: Implement select() method on remapping irqdomain Preparatory change to remove irq_remapping_get_irq_domain(). Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-25-dwmw2@infradead.org --- drivers/iommu/amd/iommu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 9744cdbefd88..31b22244e9c2 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3882,7 +3882,26 @@ static void irq_remapping_deactivate(struct irq_domain *domain, irte_info->index); } +static int irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + struct amd_iommu *iommu; + int devid = -1; + + if (x86_fwspec_is_ioapic(fwspec)) + devid = get_ioapic_devid(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + devid = get_hpet_devid(fwspec->param[0]); + + if (devid < 0) + return 0; + + iommu = amd_iommu_rlookup_table[devid]; + return iommu && iommu->ir_domain == d; +} + static const struct irq_domain_ops amd_ir_domain_ops = { + .select = irq_remapping_select, .alloc = irq_remapping_alloc, .free = irq_remapping_free, .activate = irq_remapping_activate, From a87fb465ffe8eacd0d69032da33455e4f6fd8b41 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:25 +0100 Subject: [PATCH 25/77] iommu/vt-d: Implement select() method on remapping irqdomain Preparatory for removing irq_remapping_get_irq_domain() Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-26-dwmw2@infradead.org --- drivers/iommu/intel/irq_remapping.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 96c84b19940e..b3b079c0b51e 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1431,7 +1431,20 @@ static void intel_irq_remapping_deactivate(struct irq_domain *domain, modify_irte(&data->irq_2_iommu, &entry); } +static int intel_irq_remapping_select(struct irq_domain *d, + struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + if (x86_fwspec_is_ioapic(fwspec)) + return d == map_ioapic_to_ir(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + return d == map_hpet_to_ir(fwspec->param[0]); + + return 0; +} + static const struct irq_domain_ops intel_ir_domain_ops = { + .select = intel_irq_remapping_select, .alloc = intel_irq_remapping_alloc, .free = intel_irq_remapping_free, .activate = intel_irq_remapping_activate, From a491bb19f728cdb8cc1f4734ecc57c0afa099fac Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:26 +0100 Subject: [PATCH 26/77] iommu/hyper-v: Implement select() method on remapping irqdomain Preparatory for removing irq_remapping_get_irq_domain() Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-27-dwmw2@infradead.org --- drivers/iommu/hyperv-iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 37dd485a5640..78a264ad9405 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -101,7 +101,16 @@ static void hyperv_irq_remapping_free(struct irq_domain *domain, irq_domain_free_irqs_common(domain, virq, nr_irqs); } +static int hyperv_irq_remapping_select(struct irq_domain *d, + struct irq_fwspec *fwspec, + enum irq_domain_bus_token bus_token) +{ + /* Claim only the first (and only) I/OAPIC */ + return x86_fwspec_is_ioapic(fwspec) && fwspec->param[0] == 0; +} + static const struct irq_domain_ops hyperv_ir_domain_ops = { + .select = hyperv_irq_remapping_select, .alloc = hyperv_irq_remapping_alloc, .free = hyperv_irq_remapping_free, }; From c2a5881c28e5bb4cb901029423a1c7068c0afa2d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:27 +0100 Subject: [PATCH 27/77] x86/hpet: Use irq_find_matching_fwspec() to find remapping irqdomain All possible parent domains have a select method now. Make use of it. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-28-dwmw2@infradead.org --- arch/x86/kernel/hpet.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3b8b12769f3b..08651a4e6aa0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -543,8 +543,8 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) { struct msi_domain_info *domain_info; struct irq_domain *parent, *d; - struct irq_alloc_info info; struct fwnode_handle *fn; + struct irq_fwspec fwspec; if (x86_vector_domain == NULL) return NULL; @@ -556,15 +556,6 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) *domain_info = hpet_msi_domain_info; domain_info->data = (void *)(long)hpet_id; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT; - info.devid = hpet_id; - parent = irq_remapping_get_irq_domain(&info); - if (parent == NULL) - parent = x86_vector_domain; - else - hpet_msi_controller.name = "IR-HPET-MSI"; - fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name, hpet_id); if (!fn) { @@ -572,6 +563,19 @@ static struct irq_domain *hpet_create_irq_domain(int hpet_id) return NULL; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = hpet_id; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + irq_domain_free_fwnode(fn); + kfree(domain_info); + return NULL; + } + if (parent != x86_vector_domain) + hpet_msi_controller.name = "IR-HPET-MSI"; + d = msi_create_irq_domain(fn, domain_info, parent); if (!d) { irq_domain_free_fwnode(fn); From b643128b917ca8f1c8b1e14af64ebdc81147b2d1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:28 +0100 Subject: [PATCH 28/77] x86/ioapic: Use irq_find_matching_fwspec() to find remapping irqdomain All possible parent domains have a select method now. Make use of it. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-29-dwmw2@infradead.org --- arch/x86/kernel/apic/io_apic.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index ea983da1a57f..443d2c9086b9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2320,36 +2320,37 @@ out: static int mp_irqdomain_create(int ioapic) { - struct irq_alloc_info info; struct irq_domain *parent; int hwirqs = mp_ioapic_pin_count(ioapic); struct ioapic *ip = &ioapics[ioapic]; struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg; struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic); struct fwnode_handle *fn; - char *name = "IO-APIC"; + struct irq_fwspec fwspec; if (cfg->type == IOAPIC_DOMAIN_INVALID) return 0; - init_irq_alloc_info(&info, NULL); - info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT; - info.devid = mpc_ioapic_id(ioapic); - parent = irq_remapping_get_irq_domain(&info); - if (!parent) - parent = x86_vector_domain; - else - name = "IO-APIC-IR"; - /* Handle device tree enumerated APICs proper */ if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode(name, ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", ioapic); if (!fn) return -ENOMEM; } + fwspec.fwnode = fn; + fwspec.param_count = 1; + fwspec.param[0] = ioapic; + + parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); + if (!parent) { + if (!cfg->dev) + irq_domain_free_fwnode(fn); + return -ENODEV; + } + ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops, (void *)(long)ioapic); From ed381fca47122f0787ee53b97e5f9d562eec7237 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:29 +0100 Subject: [PATCH 29/77] x86: Kill all traces of irq_remapping_get_irq_domain() All users are converted to use the fwspec based parent domain lookup. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-30-dwmw2@infradead.org --- arch/x86/include/asm/hw_irq.h | 2 -- arch/x86/include/asm/irq_remapping.h | 9 -------- drivers/iommu/amd/iommu.c | 34 ---------------------------- drivers/iommu/hyperv-iommu.c | 9 -------- drivers/iommu/intel/irq_remapping.c | 17 -------------- drivers/iommu/irq_remapping.c | 14 ------------ drivers/iommu/irq_remapping.h | 3 --- 7 files changed, 88 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 83a69f62637e..458f5a676402 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -40,8 +40,6 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, X86_IRQ_ALLOC_TYPE_UV, - X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT, - X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT, }; struct ioapic_alloc_info { diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index af4a151d70b3..7cc49432187f 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -44,9 +44,6 @@ extern int irq_remapping_reenable(int); extern int irq_remap_enable_fault_handling(void); extern void panic_if_irq_remap(const char *msg); -extern struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info); - /* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */ extern struct irq_domain * arch_create_remap_msi_irq_domain(struct irq_domain *par, const char *n, int id); @@ -71,11 +68,5 @@ static inline void panic_if_irq_remap(const char *msg) { } -static inline struct irq_domain * -irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - return NULL; -} - #endif /* CONFIG_IRQ_REMAP */ #endif /* __X86_IRQ_REMAPPING_H */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 31b22244e9c2..463d322a4f3b 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3601,10 +3601,8 @@ static int get_devid(struct irq_alloc_info *info) { switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: - case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: return get_ioapic_devid(info->devid); case X86_IRQ_ALLOC_TYPE_HPET: - case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: return get_hpet_devid(info->devid); case X86_IRQ_ALLOC_TYPE_PCI_MSI: case X86_IRQ_ALLOC_TYPE_PCI_MSIX: @@ -3615,44 +3613,12 @@ static int get_devid(struct irq_alloc_info *info) } } -static struct irq_domain *get_irq_domain_for_devid(struct irq_alloc_info *info, - int devid) -{ - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; - - if (!iommu) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - return iommu->ir_domain; - default: - WARN_ON_ONCE(1); - return NULL; - } -} - -static struct irq_domain *get_irq_domain(struct irq_alloc_info *info) -{ - int devid; - - if (!info) - return NULL; - - devid = get_devid(info); - if (devid < 0) - return NULL; - return get_irq_domain_for_devid(info, devid); -} - struct irq_remap_ops amd_iommu_irq_ops = { .prepare = amd_iommu_prepare, .enable = amd_iommu_enable, .disable = amd_iommu_disable, .reenable = amd_iommu_reenable, .enable_faulting = amd_iommu_enable_faulting, - .get_irq_domain = get_irq_domain, }; static void fill_msi_msg(struct msi_msg *msg, u32 index) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 78a264ad9405..a629a6be65c7 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -160,18 +160,9 @@ static int __init hyperv_enable_irq_remapping(void) return IRQ_REMAP_X2APIC_MODE; } -static struct irq_domain *hyperv_get_irq_domain(struct irq_alloc_info *info) -{ - if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT) - return ioapic_ir_domain; - else - return NULL; -} - struct irq_remap_ops hyperv_irq_remap_ops = { .prepare = hyperv_prepare_irq_remapping, .enable = hyperv_enable_irq_remapping, - .get_irq_domain = hyperv_get_irq_domain, }; #endif diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index b3b079c0b51e..bca44015bc1d 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1127,29 +1127,12 @@ static void prepare_irte(struct irte *irte, int vector, unsigned int dest) irte->redir_hint = 1; } -static struct irq_domain *intel_get_irq_domain(struct irq_alloc_info *info) -{ - if (!info) - return NULL; - - switch (info->type) { - case X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT: - return map_ioapic_to_ir(info->devid); - case X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT: - return map_hpet_to_ir(info->devid); - default: - WARN_ON_ONCE(1); - return NULL; - } -} - struct irq_remap_ops intel_irq_remap_ops = { .prepare = intel_prepare_irq_remapping, .enable = intel_enable_irq_remapping, .disable = disable_irq_remapping, .reenable = reenable_irq_remapping, .enable_faulting = enable_drhd_fault_handling, - .get_irq_domain = intel_get_irq_domain, }; static void intel_ir_reconfigure_irte(struct irq_data *irqd, bool force) diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c index 2d84b1ed205e..83314b9d8f38 100644 --- a/drivers/iommu/irq_remapping.c +++ b/drivers/iommu/irq_remapping.c @@ -158,17 +158,3 @@ void panic_if_irq_remap(const char *msg) if (irq_remapping_enabled) panic(msg); } - -/** - * irq_remapping_get_irq_domain - Get the irqdomain serving the request @info - * @info: interrupt allocation information, used to identify the IOMMU device - * - * Returns pointer to IRQ domain, or NULL on failure. - */ -struct irq_domain *irq_remapping_get_irq_domain(struct irq_alloc_info *info) -{ - if (!remap_ops || !remap_ops->get_irq_domain) - return NULL; - - return remap_ops->get_irq_domain(info); -} diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h index 1661b3d75920..8c89cb947cdb 100644 --- a/drivers/iommu/irq_remapping.h +++ b/drivers/iommu/irq_remapping.h @@ -42,9 +42,6 @@ struct irq_remap_ops { /* Enable fault handling */ int (*enable_faulting)(void); - - /* Get the irqdomain associated to IOMMU device */ - struct irq_domain *(*get_irq_domain)(struct irq_alloc_info *); }; extern struct irq_remap_ops intel_irq_remap_ops; From 79eb3581bcaae9b5677629d945e14da212aa76e2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:30 +0100 Subject: [PATCH 30/77] iommu/vt-d: Simplify intel_irq_remapping_select() Now that the old get_irq_domain() method has gone, consolidate on just the map_XXX_to_iommu() functions. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-31-dwmw2@infradead.org --- drivers/iommu/intel/irq_remapping.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index bca44015bc1d..aeffda92b10b 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -203,13 +203,13 @@ static int modify_irte(struct irq_2_iommu *irq_iommu, return rc; } -static struct irq_domain *map_hpet_to_ir(u8 hpet_id) +static struct intel_iommu *map_hpet_to_iommu(u8 hpet_id) { int i; for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].id == hpet_id && ir_hpet[i].iommu) - return ir_hpet[i].iommu->ir_domain; + return ir_hpet[i].iommu; } return NULL; } @@ -225,13 +225,6 @@ static struct intel_iommu *map_ioapic_to_iommu(int apic) return NULL; } -static struct irq_domain *map_ioapic_to_ir(int apic) -{ - struct intel_iommu *iommu = map_ioapic_to_iommu(apic); - - return iommu ? iommu->ir_domain : NULL; -} - static struct irq_domain *map_dev_to_ir(struct pci_dev *dev) { struct dmar_drhd_unit *drhd = dmar_find_matched_drhd_unit(dev); @@ -1418,12 +1411,14 @@ static int intel_irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { - if (x86_fwspec_is_ioapic(fwspec)) - return d == map_ioapic_to_ir(fwspec->param[0]); - else if (x86_fwspec_is_hpet(fwspec)) - return d == map_hpet_to_ir(fwspec->param[0]); + struct intel_iommu *iommu = NULL; - return 0; + if (x86_fwspec_is_ioapic(fwspec)) + iommu = map_ioapic_to_iommu(fwspec->param[0]); + else if (x86_fwspec_is_hpet(fwspec)) + iommu = map_hpet_to_iommu(fwspec->param[0]); + + return iommu && d == iommu->ir_domain; } static const struct irq_domain_ops intel_ir_domain_ops = { From 51130d21881d435fad5fa7f25bea77aa0ffc9a4e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:31 +0100 Subject: [PATCH 31/77] x86/ioapic: Handle Extended Destination ID field in RTE Bits 63-48 of the I/OAPIC Redirection Table Entry map directly to bits 19-4 of the address used in the resulting MSI cycle. Historically, the x86 MSI format only used the top 8 of those 16 bits as the destination APIC ID, and the "Extended Destination ID" in the lower 8 bits was unused. With interrupt remapping, the lowest bit of the Extended Destination ID (bit 48 of RTE, bit 4 of MSI address) is now used to indicate a remappable format MSI. A hypervisor can use the other 7 bits of the Extended Destination ID to permit guests to address up to 15 bits of APIC IDs, thus allowing 32768 vCPUs before having to expose a vIOMMU and interrupt remapping to the guest. No behavioural change in this patch, since nothing yet permits APIC IDs above 255 to be used with the non-IR I/OAPIC domain. [ tglx: Converted it to the cleaned up entry/msi_msg format and added commentry ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-32-dwmw2@infradead.org --- arch/x86/include/asm/io_apic.h | 3 ++- arch/x86/kernel/apic/io_apic.c | 20 +++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 73da644b2f0d..437aa8d00e53 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -67,7 +67,8 @@ struct IO_APIC_route_entry { is_level : 1, masked : 1, reserved_0 : 15, - reserved_1 : 24, + reserved_1 : 17, + virt_destid_8_14 : 7, destid_0_7 : 8; }; struct { diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 443d2c9086b9..1cfd65ef295b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1238,9 +1238,10 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries) (entry.ir_index_15 << 15) | entry.ir_index_0_14, entry.ir_zero); } else { - printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n", buf, + printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf, entry.dest_mode_logical ? "logical " : "physical", - entry.destid_0_7, entry.delivery_mode); + entry.virt_destid_8_14, entry.destid_0_7, + entry.delivery_mode); } } } @@ -1409,6 +1410,7 @@ void native_restore_boot_irq_mode(void) */ if (ioapic_i8259.pin != -1) { struct IO_APIC_route_entry entry; + u32 apic_id = read_apic_id(); memset(&entry, 0, sizeof(entry)); entry.masked = false; @@ -1416,7 +1418,8 @@ void native_restore_boot_irq_mode(void) entry.active_low = false; entry.dest_mode_logical = false; entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT; - entry.destid_0_7 = read_apic_id(); + entry.destid_0_7 = apic_id & 0xFF; + entry.virt_destid_8_14 = apic_id >> 8; /* * Add it to the IO-APIC irq-routing table: @@ -1885,7 +1888,11 @@ static void ioapic_setup_msg_from_msi(struct irq_data *irq_data, /* DMAR/IR: 1, 0 for all other modes */ entry->ir_format = msg.arch_addr_lo.dmar_format; /* - * DMAR/IR: index bit 0-14. + * - DMAR/IR: index bit 0-14. + * + * - Virt: If the host supports x2apic without a virtualized IR + * unit then bit 0-6 of dmar_index_0_14 are providing bit + * 8-14 of the destination id. * * All other modes have bit 0-6 of dmar_index_0_14 cleared and the * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7). @@ -2063,6 +2070,7 @@ static inline void __init unlock_ExtINT_logic(void) int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; unsigned char save_control, save_freq_select; + u32 apic_id; pin = find_isa_irq_pin(8, mp_INT); if (pin == -1) { @@ -2078,11 +2086,13 @@ static inline void __init unlock_ExtINT_logic(void) entry0 = ioapic_read_entry(apic, pin); clear_IO_APIC_pin(apic, pin); + apic_id = hard_smp_processor_id(); memset(&entry1, 0, sizeof(entry1)); entry1.dest_mode_logical = true; entry1.masked = false; - entry1.destid_0_7 = hard_smp_processor_id(); + entry1.destid_0_7 = apic_id & 0xFF; + entry1.virt_destid_8_14 = apic_id >> 8; entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT; entry1.active_low = entry0.active_low; entry1.is_level = false; From ab0f59c6f135289c7ea90b0e2471674bf289d884 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:32 +0100 Subject: [PATCH 32/77] x86/apic: Support 15 bits of APIC ID in MSI where available Some hypervisors can allow the guest to use the Extended Destination ID field in the MSI address to address up to 32768 CPUs. This applies to all downstream devices which generate MSI cycles, including HPET, I/O-APIC and PCI MSI. HPET and PCI MSI use the same __irq_msi_compose_msg() function, while I/O-APIC generates its own and had support for the extended bits added in a previous commit. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-33-dwmw2@infradead.org --- arch/x86/include/asm/msi.h | 3 ++- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/apic.c | 26 ++++++++++++++++++++++++-- arch/x86/kernel/x86_init.c | 1 + 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h index 322fd905da9c..b85147d75626 100644 --- a/arch/x86/include/asm/msi.h +++ b/arch/x86/include/asm/msi.h @@ -29,7 +29,8 @@ typedef struct x86_msi_addr_lo { u32 reserved_0 : 2, dest_mode_logical : 1, redirect_hint : 1, - reserved_1 : 8, + reserved_1 : 1, + virt_destid_8_14 : 7, destid_0_7 : 8, base_address : 12; }; diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index dde5b3f1e7cd..5c69f7eb5d47 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -116,6 +116,7 @@ struct x86_init_pci { * @init_platform: platform setup * @guest_late_init: guest late init * @x2apic_available: X2APIC detection + * @msi_ext_dest_id: MSI supports 15-bit APIC IDs * @init_mem_mapping: setup early mappings during init_mem_mapping() * @init_after_bootmem: guest init after boot allocator is finished */ @@ -123,6 +124,7 @@ struct x86_hyper_init { void (*init_platform)(void); void (*guest_late_init)(void); bool (*x2apic_available)(void); + bool (*msi_ext_dest_id)(void); void (*init_mem_mapping)(void); void (*init_after_bootmem)(void); }; diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f7196ee0f005..6bd20c0de8bc 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -93,6 +93,11 @@ static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID; */ static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP; +/* + * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID + */ +static bool virt_ext_dest_id __ro_after_init; + /* * Map cpu index to physical APIC ID */ @@ -1841,6 +1846,8 @@ static __init void try_to_enable_x2apic(int remap_mode) return; if (remap_mode != IRQ_REMAP_X2APIC_MODE) { + u32 apic_limit = 255; + /* * Using X2APIC without IR is not architecturally supported * on bare metal but may be supported in guests. @@ -1851,12 +1858,22 @@ static __init void try_to_enable_x2apic(int remap_mode) return; } + /* + * If the hypervisor supports extended destination ID in + * MSI, that increases the maximum APIC ID that can be + * used for non-remapped IRQ domains. + */ + if (x86_init.hyper.msi_ext_dest_id()) { + virt_ext_dest_id = 1; + apic_limit = 32767; + } + /* * Without IR, all CPUs can be addressed by IOAPIC/MSI only * in physical mode, and CPUs with an APIC ID that cannnot * be addressed must not be brought online. */ - x2apic_set_max_apicid(255); + x2apic_set_max_apicid(apic_limit); x2apic_phys = 1; } x2apic_enable(); @@ -2497,10 +2514,15 @@ void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg, * Only the IOMMU itself can use the trick of putting destination * APIC ID into the high bits of the address. Anything else would * just be writing to memory if it tried that, and needs IR to - * address higher APIC IDs. + * address APICs which can't be addressed in the normal 32-bit + * address range at 0xFFExxxxx. That is typically just 8 bits, but + * some hypervisors allow the extended destination ID field in bits + * 5-11 to be used, giving support for 15 bits of APIC IDs in total. */ if (dmar) msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8; + else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000) + msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8; else WARN_ON_ONCE(cfg->dest_apicid > 0xFF); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index a3038d8deb6a..8b395821cb8d 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -110,6 +110,7 @@ struct x86_init_ops x86_init __initdata = { .init_platform = x86_init_noop, .guest_late_init = x86_init_noop, .x2apic_available = bool_x86_init_noop, + .msi_ext_dest_id = bool_x86_init_noop, .init_mem_mapping = x86_init_noop, .init_after_bootmem = x86_init_noop, }, From bf27ef8a77d8da38c9f35f8f6aab013a2dcf175f Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:33 +0100 Subject: [PATCH 33/77] iommu/hyper-v: Disable IRQ pseudo-remapping if 15 bit APIC IDs are available If the 15-bit APIC ID support is present in emulated MSI then there's no need for the pseudo-remapping support. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201024213535.443185-34-dwmw2@infradead.org --- drivers/iommu/hyperv-iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index a629a6be65c7..9438daa24fdb 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -121,6 +121,7 @@ static int __init hyperv_prepare_irq_remapping(void) int i; if (!hypervisor_is_type(X86_HYPER_MS_HYPERV) || + x86_init.hyper.msi_ext_dest_id() || !x2apic_supported()) return -ENODEV; From 2e008ffe426f927b1697adb4ed10c1e419927ae4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:35 +0100 Subject: [PATCH 34/77] x86/kvm: Enable 15-bit extension when KVM_FEATURE_MSI_EXT_DEST_ID detected This allows the host to indicate that MSI emulation supports 15-bit destination IDs, allowing up to 32768 CPUs without interrupt remapping. cf. https://patchwork.kernel.org/patch/11816693/ for qemu Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Link: https://lore.kernel.org/r/20201024213535.443185-36-dwmw2@infradead.org --- arch/x86/kernel/kvm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7f57ede3cb8e..5e78e01ca3b4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -740,6 +740,11 @@ static void __init kvm_apic_init(void) #endif } +static bool __init kvm_msi_ext_dest_id(void) +{ + return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); +} + static void __init kvm_init_platform(void) { kvmclock_init(); @@ -769,6 +774,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .type = X86_HYPER_KVM, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, + .init.msi_ext_dest_id = kvm_msi_ext_dest_id, .init.init_platform = kvm_init_platform, #if defined(CONFIG_AMD_MEM_ENCRYPT) .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, From d981059e13ffa9ed03a73472e932d070323bd057 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 2 Nov 2020 17:11:36 -0800 Subject: [PATCH 35/77] x86/hyperv: Enable 15-bit APIC ID if the hypervisor supports it When a Linux VM runs on Hyper-V, if the VM has CPUs with >255 APIC IDs, the CPUs can't be the destination of IOAPIC interrupts, because the IOAPIC RTE's Dest Field has only 8 bits. Currently the hackery driver drivers/iommu/hyperv-iommu.c is used to ensure IOAPIC interrupts are only routed to CPUs that don't have >255 APIC IDs. However, there is an issue with kdump, because the kdump kernel can run on any CPU, and hence IOAPIC interrupts can't work if the kdump kernel run on a CPU with a >255 APIC ID. The kdump issue can be fixed by the Extended Dest ID, which is introduced recently by David Woodhouse (for IOAPIC, see the field virt_destid_8_14 in struct IO_APIC_route_entry). Of course, the Extended Dest ID needs the support of the underlying hypervisor. The latest Hyper-V has added the support recently: with this commit, on such a Hyper-V host, Linux VM does not use hyperv-iommu.c because hyperv_prepare_irq_remapping() returns -ENODEV; instead, Linux kernel's generic support of Extended Dest ID from David is used, meaning that Linux VM is able to support up to 32K CPUs, and IOAPIC interrupts can be routed to all the CPUs. On an old Hyper-V host that doesn't support the Extended Dest ID, nothing changes with this commit: Linux VM is still able to bring up the CPUs with > 255 APIC IDs with the help of hyperv-iommu.c, but IOAPIC interrupts still can not go to such CPUs, and the kdump kernel still can not work properly on such CPUs. [ tglx: Updated comment as suggested by David ] Signed-off-by: Dexuan Cui Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Link: https://lore.kernel.org/r/20201103011136.59108-1-decui@microsoft.com --- arch/x86/include/asm/hyperv-tlfs.h | 7 +++++++ arch/x86/kernel/cpu/mshyperv.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 0ed20e8bba9e..6bf42aed387e 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -23,6 +23,13 @@ #define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 #define HYPERV_CPUID_NESTED_FEATURES 0x4000000A +#define HYPERV_CPUID_VIRT_STACK_INTERFACE 0x40000081 +#define HYPERV_VS_INTERFACE_EAX_SIGNATURE 0x31235356 /* "VS#1" */ + +#define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 +/* Support for the extended IOAPIC RTE format */ +#define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) + #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 #define HYPERV_CPUID_MAX 0x4000ffff diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 05ef1f4550cb..f628e3dc150f 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -366,9 +366,38 @@ static void __init ms_hyperv_init_platform(void) #endif } +static bool __init ms_hyperv_x2apic_available(void) +{ + return x2apic_supported(); +} + +/* + * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping() + * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the + * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg(). + * + * Note: for a VM on Hyper-V, the I/O-APIC is the only device which + * (logically) generates MSIs directly to the system APIC irq domain. + * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the + * pci-hyperv host bridge. + */ +static bool __init ms_hyperv_msi_ext_dest_id(void) +{ + u32 eax; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE); + if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE) + return false; + + eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES); + return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; +} + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .type = X86_HYPER_MS_HYPERV, + .init.x2apic_available = ms_hyperv_x2apic_available, + .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, }; From f36a74b9345aebaf5d325380df87a54720229d18 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 3 Nov 2020 16:36:22 +0000 Subject: [PATCH 36/77] x86/ioapic: Use I/O-APIC ID for finding irqdomain, not index In commit b643128b917 ("x86/ioapic: Use irq_find_matching_fwspec() to find remapping irqdomain") the I/O-APIC code was changed to find its parent irqdomain using irq_find_matching_fwspec(), but the key used for the lookup was wrong. It shouldn't use 'ioapic' which is the index into its own ioapics[] array. It should use the actual arbitration ID of the I/O-APIC in question, which is mpc_ioapic_id(ioapic). Fixes: b643128b917 ("x86/ioapic: Use irq_find_matching_fwspec() to find remapping irqdomain") Reported-by: lkp Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/57adf2c305cd0c5e9d860b2f3007a7e676fd0f9f.camel@infradead.org --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1cfd65ef295b..0602c9533d17 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2345,14 +2345,14 @@ static int mp_irqdomain_create(int ioapic) if (cfg->dev) { fn = of_node_to_fwnode(cfg->dev); } else { - fn = irq_domain_alloc_named_id_fwnode("IO-APIC", ioapic); + fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) return -ENOMEM; } fwspec.fwnode = fn; fwspec.param_count = 1; - fwspec.param[0] = ioapic; + fwspec.param[0] = mpc_ioapic_id(ioapic); parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY); if (!parent) { From 16675dda9355505245b89dd50723a2754819594b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:13 +0100 Subject: [PATCH 37/77] mm/highmem: Un-EXPORT __kmap_atomic_idx() Nothing in modules can use that. Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095856.595767588@linutronix.de --- mm/highmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 1352a27951e3..6abfd762eee7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -108,8 +108,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); - unsigned int nr_free_highpages (void) { struct zone *zone; From b819fd9da38508e0504624b87d9983fcc4237f3c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:14 +0100 Subject: [PATCH 38/77] highmem: Remove unused functions Nothing uses totalhigh_pages_dec() and totalhigh_pages_set(). Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095856.732891880@linutronix.de --- include/linux/highmem.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 14e6202ce47f..f5c31338f0a3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -104,21 +104,11 @@ static inline void totalhigh_pages_inc(void) atomic_long_inc(&_totalhigh_pages); } -static inline void totalhigh_pages_dec(void) -{ - atomic_long_dec(&_totalhigh_pages); -} - static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } -static inline void totalhigh_pages_set(long val) -{ - atomic_long_set(&_totalhigh_pages, val); -} - void kmap_flush_unused(void); struct page *kmap_to_page(void *addr); From e8f147dc3f1f6b4c27b2eeaf82df4f469d80d469 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:15 +0100 Subject: [PATCH 39/77] fs: Remove asm/kmap_types.h includes Historical leftovers from the time where kmap() had fixed slots. Signed-off-by: Thomas Gleixner Acked-by: David Sterba Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095856.870272797@linutronix.de --- fs/aio.c | 1 - fs/btrfs/ctree.h | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index c45c20d87538..0247daf55987 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -43,7 +43,6 @@ #include #include -#include #include #include diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0378933d163c..01947f603300 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include From 673afbace459ae6fd8d03bda410e0a9f10438c99 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:16 +0100 Subject: [PATCH 40/77] sh/highmem: Remove all traces of unused cruft For whatever reasons SH has highmem bits all over the place but does not enable it via Kconfig. Remove the bitrot. Signed-off-by: Thomas Gleixner Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095856.979798613@linutronix.de Cc: Yoshinori Sato Cc: Rich Felker Cc: Arnd Bergmann Cc: Andrew Morton --- arch/sh/include/asm/fixmap.h | 8 -------- arch/sh/include/asm/kmap_types.h | 15 --------------- arch/sh/mm/init.c | 8 -------- 3 files changed, 31 deletions(-) delete mode 100644 arch/sh/include/asm/kmap_types.h diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h index f38adc189b83..b07fbc7f7bc6 100644 --- a/arch/sh/include/asm/fixmap.h +++ b/arch/sh/include/asm/fixmap.h @@ -13,9 +13,6 @@ #include #include #include -#ifdef CONFIG_HIGHMEM -#include -#endif /* * Here we define all the compile-time 'special' virtual @@ -53,11 +50,6 @@ enum fixed_addresses { FIX_CMAP_BEGIN, FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS) - 1, -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, -#endif - #ifdef CONFIG_IOREMAP_FIXED /* * FIX_IOREMAP entries are useful for mapping physical address diff --git a/arch/sh/include/asm/kmap_types.h b/arch/sh/include/asm/kmap_types.h deleted file mode 100644 index b78107f923dd..000000000000 --- a/arch/sh/include/asm/kmap_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SH_KMAP_TYPES_H -#define __SH_KMAP_TYPES_H - -/* Dummy header just to define km_type. */ - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3348e0c4d769..0db6919af8d3 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -362,9 +362,6 @@ void __init mem_init(void) mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -#ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -#endif " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" " lowmem : 0x%08lx - 0x%08lx (%4ld MB) (cached)\n" #ifdef CONFIG_UNCACHED_MAPPING @@ -376,11 +373,6 @@ void __init mem_init(void) FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, -#ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, -#endif - (unsigned long)VMALLOC_START, VMALLOC_END, (VMALLOC_END - VMALLOC_START) >> 20, From 4f8b96cd47b06f1e3ec71c1a3216113efe8dbfb5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:17 +0100 Subject: [PATCH 41/77] asm-generic: Provide kmap_size.h kmap_types.h is a misnomer because the old atomic MAP based array does not exist anymore and the whole indirection of architectures including kmap_types.h is inconinstent and does not allow to provide guard page debugging for this misfeature. Add a common header file which defines the mapping stack size for all architectures. Will be used when converting architectures over to a generic kmap_local/atomic implementation. The array size is chosen with the following constraints in mind: - The deepest nest level in one context is 3 according to code inspection. - The worst case nesting for the upcoming reemptible version would be: 2 maps in task context and a fault inside 2 maps in the fault handler 3 maps in softirq 2 maps in interrupt So a total of 16 is sufficient and probably overestimated. Signed-off-by: Thomas Gleixner Acked-by: Arnd Bergmann Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095857.078043987@linutronix.de --- include/asm-generic/Kbuild | 1 + include/asm-generic/kmap_size.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) create mode 100644 include/asm-generic/kmap_size.h diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index e78bbb9a07e9..ed62d384fb79 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -31,6 +31,7 @@ mandatory-y += irq_regs.h mandatory-y += irq_work.h mandatory-y += kdebug.h mandatory-y += kmap_types.h +mandatory-y += kmap_size.h mandatory-y += kprobes.h mandatory-y += linkage.h mandatory-y += local.h diff --git a/include/asm-generic/kmap_size.h b/include/asm-generic/kmap_size.h new file mode 100644 index 000000000000..9d6c7786a645 --- /dev/null +++ b/include/asm-generic/kmap_size.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_KMAP_SIZE_H +#define _ASM_GENERIC_KMAP_SIZE_H + +/* For debug this provides guard pages between the maps */ +#ifdef CONFIG_DEBUG_HIGHMEM +# define KM_MAX_IDX 33 +#else +# define KM_MAX_IDX 16 +#endif + +#endif From 298fa1ad5571f59cb3ca5497a9455f36867f065e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:18 +0100 Subject: [PATCH 42/77] highmem: Provide generic variant of kmap_atomic* The kmap_atomic* interfaces in all architectures are pretty much the same except for post map operations (flush) and pre- and post unmap operations. Provide a generic variant for that. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095857.175939340@linutronix.de --- include/linux/highmem.h | 82 ++++++++++++++++++----- mm/Kconfig | 3 + mm/highmem.c | 144 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 211 insertions(+), 18 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index f5c31338f0a3..f5ecee9c2576 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -31,9 +31,16 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include +/* + * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. + */ +#ifdef CONFIG_KMAP_LOCAL +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); +void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void kunmap_local_indexed(void *vaddr); +#endif + #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -extern void kunmap_atomic_high(void *kvaddr); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -81,6 +88,11 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ + +#ifndef CONFIG_KMAP_LOCAL +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +void kunmap_atomic_high(void *kvaddr); + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); @@ -89,7 +101,38 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return page_address(page); return kmap_atomic_high_prot(page, prot); } -#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) + +static inline void __kunmap_atomic(void *vaddr) +{ + kunmap_atomic_high(vaddr); +} +#else /* !CONFIG_KMAP_LOCAL */ + +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_atomic(void *addr) +{ + kunmap_local_indexed(addr); +} + +#endif /* CONFIG_KMAP_LOCAL */ + +static inline void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); @@ -147,25 +190,33 @@ static inline void *kmap_atomic(struct page *page) pagefault_disable(); return page_address(page); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void kunmap_atomic_high(void *addr) +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + return kmap_atomic(page); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + return kmap_atomic(pfn_to_page(pfn)); +} + +static inline void __kunmap_atomic(void *addr) { /* * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() - * handles re-enabling faults + preemption + * handles re-enabling faults and preemption */ #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); #endif } -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - #define kmap_flush_unused() do {} while(0) #endif /* CONFIG_HIGHMEM */ +#if !defined(CONFIG_KMAP_LOCAL) #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DECLARE_PER_CPU(int, __kmap_atomic_idx); @@ -196,22 +247,21 @@ static inline void kmap_atomic_idx_pop(void) __this_cpu_dec(__kmap_atomic_idx); #endif } - +#endif #endif /* * Prevent people trying to call kunmap_atomic() as if it were kunmap() * kunmap_atomic() should get the return value of kmap_atomic, not the page. */ -#define kunmap_atomic(addr) \ -do { \ - BUILD_BUG_ON(__same_type((addr), struct page *)); \ - kunmap_atomic_high(addr); \ - pagefault_enable(); \ - preempt_enable(); \ +#define kunmap_atomic(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_atomic(__addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) - /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..a1ccf98b7333 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -872,4 +872,7 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config KMAP_LOCAL + bool + endmenu diff --git a/mm/highmem.c b/mm/highmem.c index 6abfd762eee7..bb4ce13ee7e7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,9 +31,11 @@ #include #include +#ifndef CONFIG_KMAP_LOCAL #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif +#endif /* * Virtual_count is not a pure "count". @@ -365,9 +367,147 @@ void kunmap_high(struct page *page) if (need_wakeup) wake_up(pkmap_map_wait); } - EXPORT_SYMBOL(kunmap_high); -#endif /* CONFIG_HIGHMEM */ +#endif /* CONFIG_HIGHMEM */ + +#ifdef CONFIG_KMAP_LOCAL + +#include + +static DEFINE_PER_CPU(int, __kmap_local_idx); + +static inline int kmap_local_idx_push(void) +{ + int idx = __this_cpu_inc_return(__kmap_local_idx) - 1; + + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + BUG_ON(idx >= KM_MAX_IDX); + return idx; +} + +static inline int kmap_local_idx(void) +{ + return __this_cpu_read(__kmap_local_idx) - 1; +} + +static inline void kmap_local_idx_pop(void) +{ + int idx = __this_cpu_dec_return(__kmap_local_idx); + + BUG_ON(idx < 0); +} + +#ifndef arch_kmap_local_post_map +# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) +#endif +#ifndef arch_kmap_local_pre_unmap +# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_post_unmap +# define arch_kmap_local_post_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_unmap_idx +#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_high_get +static inline void *arch_kmap_local_high_get(struct page *page) +{ + return NULL; +} +#endif + +/* Unmap a local mapping which was obtained by kmap_high_get() */ +static inline void kmap_high_unmap_local(unsigned long vaddr) +{ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) + kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); +#endif +} + +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +static pte_t *__kmap_pte; + +static pte_t *kmap_get_pte(void) +{ + if (!__kmap_pte) + __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + return __kmap_pte; +} + +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + pte_t pteval, *kmap_pte = kmap_get_pte(); + unsigned long vaddr; + int idx; + + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte - idx))); + pteval = pfn_pte(pfn, prot); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(vaddr, pteval); + preempt_enable(); + + return (void *)vaddr; +} +EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); + +void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + void *kmap; + + if (!PageHighMem(page)) + return page_address(page); + + /* Try kmap_high_get() if architecture has it enabled */ + kmap = arch_kmap_local_high_get(page); + if (kmap) + return kmap; + + return __kmap_local_pfn_prot(page_to_pfn(page), prot); +} +EXPORT_SYMBOL(__kmap_local_page_prot); + +void kunmap_local_indexed(void *vaddr) +{ + unsigned long addr = (unsigned long) vaddr & PAGE_MASK; + pte_t *kmap_pte = kmap_get_pte(); + int idx; + + if (addr < __fix_to_virt(FIX_KMAP_END) || + addr > __fix_to_virt(FIX_KMAP_BEGIN)) { + WARN_ON_ONCE(addr < PAGE_OFFSET); + + /* Handle mappings which were obtained by kmap_high_get() */ + kmap_high_unmap_local(addr); + return; + } + + preempt_disable(); + idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); + WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + kmap_local_idx_pop(); + preempt_enable(); +} +EXPORT_SYMBOL(kunmap_local_indexed); +#endif #if defined(HASHED_PAGE_VIRTUAL) From 389755c250814185938f5b04334a4f0184c30647 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:19 +0100 Subject: [PATCH 43/77] highmem: Make DEBUG_HIGHMEM functional For some obscure reason when CONFIG_DEBUG_HIGHMEM is enabled the stack depth is increased from 20 to 41. But the only thing DEBUG_HIGHMEM does is to enable a few BUG_ON()'s in the mapping code. That's a leftover from the historical mapping code which had fixed entries for various purposes. DEBUG_HIGHMEM inserted guard mappings between the map types. But that got all ditched when kmap_atomic() switched to a stack based map management. Though the WITH_KM_FENCE magic survived without being functional. All the thing does today is to increase the stack depth. Add a working implementation to the generic kmap_local* implementation. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095857.268258322@linutronix.de --- mm/highmem.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index bb4ce13ee7e7..67d2d5983cb0 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -376,9 +376,19 @@ EXPORT_SYMBOL(kunmap_high); static DEFINE_PER_CPU(int, __kmap_local_idx); +/* + * With DEBUG_HIGHMEM the stack depth is doubled and every second + * slot is unused which acts as a guard page + */ +#ifdef CONFIG_DEBUG_HIGHMEM +# define KM_INCR 2 +#else +# define KM_INCR 1 +#endif + static inline int kmap_local_idx_push(void) { - int idx = __this_cpu_inc_return(__kmap_local_idx) - 1; + int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; WARN_ON_ONCE(in_irq() && !irqs_disabled()); BUG_ON(idx >= KM_MAX_IDX); @@ -392,7 +402,7 @@ static inline int kmap_local_idx(void) static inline void kmap_local_idx_pop(void) { - int idx = __this_cpu_dec_return(__kmap_local_idx); + int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); BUG_ON(idx < 0); } From 157e118b55113d1e6c7f8ddfcec0a1dbf3a69511 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:20 +0100 Subject: [PATCH 44/77] x86/mm/highmem: Use generic kmap atomic implementation Convert X86 to the generic kmap atomic implementation and make the iomap_atomic() naming convention consistent while at it. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201103095857.375127260@linutronix.de --- arch/x86/Kconfig | 3 +- arch/x86/include/asm/fixmap.h | 5 +-- arch/x86/include/asm/highmem.h | 13 ++++-- arch/x86/include/asm/iomap.h | 18 ++++---- arch/x86/include/asm/kmap_types.h | 13 ------ arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/mm/highmem_32.c | 59 --------------------------- arch/x86/mm/init_32.c | 15 ------- arch/x86/mm/iomap_32.c | 59 +++------------------------ include/linux/highmem.h | 2 +- include/linux/io-mapping.h | 2 +- mm/highmem.c | 2 +- 12 files changed, 31 insertions(+), 161 deletions(-) delete mode 100644 arch/x86/include/asm/kmap_types.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..33c273cb3023 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,10 +14,11 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select GENERIC_VDSO_32 select HAVE_DEBUG_STACKOVERFLOW + select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION - select GENERIC_VDSO_32 config X86_64 def_bool y diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 77217bd292bd..8eba66a33e39 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -31,7 +31,7 @@ #include #ifdef CONFIG_X86_32 #include -#include +#include #else #include #endif @@ -94,7 +94,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -151,7 +151,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 0f420b24e0fc..032e020853aa 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -23,7 +23,6 @@ #include #include -#include #include #include #include @@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_pfn(unsigned long pfn); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); - #define flush_cache_kmaps() do { } while (0) +#define arch_kmap_local_post_map(vaddr, pteval) \ + arch_flush_lazy_mmu_mode() + +#define arch_kmap_local_post_unmap(vaddr) \ + do { \ + flush_tlb_one_kernel((vaddr)); \ + arch_flush_lazy_mmu_mode(); \ + } while (0) + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bacf68c4d70e..0be7a30fd6bc 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -9,19 +9,21 @@ #include #include #include +#include #include #include -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot); -void -iounmap_atomic(void __iomem *kvaddr); +static inline void iounmap_atomic(void __iomem *vaddr) +{ + kunmap_local_indexed((void __force *)vaddr); + pagefault_enable(); + preempt_enable(); +} -int -iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); -void -iomap_free(resource_size_t base, unsigned long size); +void iomap_free(resource_size_t base, unsigned long size); #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h deleted file mode 100644 index 04ab8266e347..000000000000 --- a/arch/x86/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KMAP_TYPES_H -#define _ASM_X86_KMAP_TYPES_H - -#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0fad9f61c76a..b6b02b7c19cc 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -41,7 +41,6 @@ #ifndef __ASSEMBLY__ #include -#include #include #include diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 075fe51317b0..2c54b76d8f84 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,65 +4,6 @@ #include /* for totalram_pages */ #include -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - return kmap_atomic_prot_pfn(pfn, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - arch_flush_lazy_mmu_mode(); - } -#ifdef CONFIG_DEBUG_HIGHMEM - else { - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); - } -#endif -} -EXPORT_SYMBOL(kunmap_atomic_high); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7c055259de3a..da31c2635ee4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -394,19 +394,6 @@ repeat: return last_map_addr; } -pte_t *kmap_pte; - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* - * Cache the first kmap pte: - */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} - #ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { @@ -712,8 +699,6 @@ void __init paging_init(void) __flush_tlb_all(); - kmap_init(); - /* * NOTE: at this point the bootmem allocator is fully available. */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index f60398aeb644..e0a40d7cc66c 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} - -/* - * Map 'pfn' using protections 'prot' - */ -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -81,36 +60,8 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); + preempt_disable(); + pagefault_disable(); + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); - -void -iounmap_atomic(void __iomem *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - } - - pagefault_enable(); - preempt_enable(); -} -EXPORT_SYMBOL_GPL(iounmap_atomic); +EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index f5ecee9c2576..1222a31be842 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -217,7 +217,7 @@ static inline void __kunmap_atomic(void *addr) #endif /* CONFIG_HIGHMEM */ #if !defined(CONFIG_KMAP_LOCAL) -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#if defined(CONFIG_HIGHMEM) DECLARE_PER_CPU(int, __kmap_atomic_idx); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index c75e4d3d8833..3b0940be72e9 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -69,7 +69,7 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - return iomap_atomic_prot_pfn(PHYS_PFN(phys_addr), mapping->prot); + return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } static inline void diff --git a/mm/highmem.c b/mm/highmem.c index 67d2d5983cb0..77677c6844f7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -32,7 +32,7 @@ #include #ifndef CONFIG_KMAP_LOCAL -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +#ifdef CONFIG_HIGHMEM DEFINE_PER_CPU(int, __kmap_atomic_idx); #endif #endif From 39cac191ff37939544af80d5d2af6b870fd94c9b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:21 +0100 Subject: [PATCH 45/77] arc/mm/highmem: Use generic kmap atomic implementation Adopt the map ordering to match the other architectures and the generic code. Also make the maximum entries limited and not dependend on the number of CPUs. With the original implementation did the following calculation: nr_slots = mapsize >> PAGE_SHIFT; The results in either 512 or 1024 total slots depending on configuration. The total slots have to be divided by the number of CPUs to get the number of slots per CPU (former KM_TYPE_NR). ARC supports up to 4k CPUs, so this just falls apart in random ways depending on the number of CPUs and the actual kmap (atomic) nesting. The comment in highmem.c: * - fixmap anyhow needs a limited number of mappings. So 2M kvaddr == 256 PTE * slots across NR_CPUS would be more than sufficient (generic code defines * KM_TYPE_NR as 20). is just wrong. KM_TYPE_NR (now KM_MAX_IDX) is the number of slots per CPU because kmap_local/atomic() needs to support nested mappings (thread, softirq, interrupt). While KM_MAX_IDX might be overestimated, the above reasoning is just wrong and clearly the highmem code was never tested with any system with more than a few CPUs. Use the default number of slots and fail the build when it does not fit. Randomly failing at runtime is not a really good option. Signed-off-by: Thomas Gleixner Cc: Vineet Gupta Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095857.472289952@linutronix.de --- arch/arc/Kconfig | 1 + arch/arc/include/asm/highmem.h | 26 ++++++++++---- arch/arc/include/asm/kmap_types.h | 14 -------- arch/arc/mm/highmem.c | 56 ++++--------------------------- 4 files changed, 27 insertions(+), 70 deletions(-) delete mode 100644 arch/arc/include/asm/kmap_types.h diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 0a89cc9def65..d8804001d550 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -507,6 +507,7 @@ config LINUX_RAM_BASE config HIGHMEM bool "High Memory Support" select ARCH_DISCONTIGMEM_ENABLE + select KMAP_LOCAL help With ARC 2G:2G address split, only upper 2G is directly addressable by kernel. Enable this to potentially allow access to rest of 2G and PAE diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 6e5eafb3afdd..a6b8e2c352c4 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -9,17 +9,29 @@ #ifdef CONFIG_HIGHMEM #include -#include +#include + +#define FIXMAP_SIZE PGDIR_SIZE +#define PKMAP_SIZE PGDIR_SIZE /* start after vmalloc area */ #define FIXMAP_BASE (PAGE_OFFSET - FIXMAP_SIZE - PKMAP_SIZE) -#define FIXMAP_SIZE PGDIR_SIZE /* only 1 PGD worth */ -#define KM_TYPE_NR ((FIXMAP_SIZE >> PAGE_SHIFT)/NR_CPUS) -#define FIXMAP_ADDR(nr) (FIXMAP_BASE + ((nr) << PAGE_SHIFT)) + +#define FIX_KMAP_SLOTS (KM_MAX_IDX * NR_CPUS) +#define FIX_KMAP_BEGIN (0UL) +#define FIX_KMAP_END ((FIX_KMAP_BEGIN + FIX_KMAP_SLOTS) - 1) + +#define FIXADDR_TOP (FIXMAP_BASE + (FIX_KMAP_END << PAGE_SHIFT)) + +/* + * This should be converted to the asm-generic version, but of course this + * is needlessly different from all other architectures. Sigh - tglx + */ +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) (((FIXADDR_TOP - ((x) & PAGE_MASK))) >> PAGE_SHIFT) /* start after fixmap area */ #define PKMAP_BASE (FIXMAP_BASE + FIXMAP_SIZE) -#define PKMAP_SIZE PGDIR_SIZE #define LAST_PKMAP (PKMAP_SIZE >> PAGE_SHIFT) #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) @@ -29,11 +41,13 @@ extern void kmap_init(void); +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) + static inline void flush_cache_kmaps(void) { flush_cache_all(); } - #endif #endif diff --git a/arch/arc/include/asm/kmap_types.h b/arch/arc/include/asm/kmap_types.h deleted file mode 100644 index fecf7851ec32..000000000000 --- a/arch/arc/include/asm/kmap_types.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2015 Synopsys, Inc. (www.synopsys.com) - */ - -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* - * We primarily need to define KM_TYPE_NR here but that in turn - * is a function of PGDIR_SIZE etc. - * To avoid circular deps issue, put everything in asm/highmem.h - */ -#endif diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 1b9f473c6369..c79912a6b196 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -36,9 +36,8 @@ * This means each only has 1 PGDIR_SIZE worth of kvaddr mappings, which means * 2M of kvaddr space for typical config (8K page and 11:8:13 traversal split) * - * - fixmap anyhow needs a limited number of mappings. So 2M kvaddr == 256 PTE - * slots across NR_CPUS would be more than sufficient (generic code defines - * KM_TYPE_NR as 20). + * - The fixed KMAP slots for kmap_local/atomic() require KM_MAX_IDX slots per + * CPU. So the number of CPUs sharing a single PTE page is limited. * * - pkmap being preemptible, in theory could do with more than 256 concurrent * mappings. However, generic pkmap code: map_new_virtual(), doesn't traverse @@ -47,48 +46,6 @@ */ extern pte_t * pkmap_page_table; -static pte_t * fixmap_page_table; - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - int idx, cpu_idx; - unsigned long vaddr; - - cpu_idx = kmap_atomic_idx_push(); - idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); - vaddr = FIXMAP_ADDR(idx); - - set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, - mk_pte(page, prot)); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kv) -{ - unsigned long kvaddr = (unsigned long)kv; - - if (kvaddr >= FIXMAP_BASE && kvaddr < (FIXMAP_BASE + FIXMAP_SIZE)) { - - /* - * Because preemption is disabled, this vaddr can be associated - * with the current allocated index. - * But in case of multiple live kmap_atomic(), it still relies on - * callers to unmap in right order. - */ - int cpu_idx = kmap_atomic_idx(); - int idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); - - WARN_ON(kvaddr != FIXMAP_ADDR(idx)); - - pte_clear(&init_mm, kvaddr, fixmap_page_table + idx); - local_flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); - - kmap_atomic_idx_pop(); - } -} -EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { @@ -108,10 +65,9 @@ void __init kmap_init(void) { /* Due to recursive include hell, we can't do this in processor.h */ BUILD_BUG_ON(PAGE_OFFSET < (VMALLOC_END + FIXMAP_SIZE + PKMAP_SIZE)); - - BUILD_BUG_ON(KM_TYPE_NR > PTRS_PER_PTE); - pkmap_page_table = alloc_kmap_pgtable(PKMAP_BASE); - BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); - fixmap_page_table = alloc_kmap_pgtable(FIXMAP_BASE); + BUILD_BUG_ON(FIX_KMAP_SLOTS > PTRS_PER_PTE); + + pkmap_page_table = alloc_kmap_pgtable(PKMAP_BASE); + alloc_kmap_pgtable(FIXMAP_BASE); } From 2a15ba82fa6ca3f35502b3060f22118a938d2889 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:22 +0100 Subject: [PATCH 46/77] ARM: highmem: Switch to generic kmap atomic No reason having the same code in every architecture. Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095857.582196476@linutronix.de --- arch/arm/Kconfig | 1 + arch/arm/include/asm/fixmap.h | 4 +- arch/arm/include/asm/highmem.h | 33 +++++--- arch/arm/include/asm/kmap_types.h | 10 --- arch/arm/mm/Makefile | 1 - arch/arm/mm/highmem.c | 121 ------------------------------ 6 files changed, 26 insertions(+), 144 deletions(-) delete mode 100644 arch/arm/include/asm/kmap_types.h delete mode 100644 arch/arm/mm/highmem.c diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fe2f17eb2b50..46f890083607 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1498,6 +1498,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" depends on MMU + select KMAP_LOCAL help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h index fc56fc3e1931..c279a8a463a2 100644 --- a/arch/arm/include/asm/fixmap.h +++ b/arch/arm/include/asm/fixmap.h @@ -7,14 +7,14 @@ #define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) #include -#include +#include enum fixed_addresses { FIX_EARLYCON_MEM_BASE, __end_of_permanent_fixed_addresses, FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses, - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, /* Support writing RO kernel text via kprobes, jump labels, etc. */ FIX_TEXT_POKE0, diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 31811be38d78..a41de523d053 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -2,7 +2,7 @@ #ifndef _ASM_HIGHMEM_H #define _ASM_HIGHMEM_H -#include +#include #define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE) #define LAST_PKMAP PTRS_PER_PTE @@ -46,19 +46,32 @@ extern pte_t *pkmap_page_table; #ifdef ARCH_NEEDS_KMAP_HIGH_GET extern void *kmap_high_get(struct page *page); -#else + +static inline void *arch_kmap_local_high_get(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) + return NULL; + return kmap_high_get(page); +} +#define arch_kmap_local_high_get arch_kmap_local_high_get + +#else /* ARCH_NEEDS_KMAP_HIGH_GET */ static inline void *kmap_high_get(struct page *page) { return NULL; } -#endif +#endif /* !ARCH_NEEDS_KMAP_HIGH_GET */ -/* - * The following functions are already defined by - * when CONFIG_HIGHMEM is not set. - */ -#ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_pfn(unsigned long pfn); -#endif +#define arch_kmap_local_post_map(vaddr, pteval) \ + local_flush_tlb_kernel_page(vaddr) + +#define arch_kmap_local_pre_unmap(vaddr) \ +do { \ + if (cache_is_vivt()) \ + __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); \ +} while (0) + +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_kernel_page(vaddr) #endif diff --git a/arch/arm/include/asm/kmap_types.h b/arch/arm/include/asm/kmap_types.h deleted file mode 100644 index 5590940ee43d..000000000000 --- a/arch/arm/include/asm/kmap_types.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARM_KMAP_TYPES_H -#define __ARM_KMAP_TYPES_H - -/* - * This is the "bare minimum". AIO seems to require this. - */ -#define KM_TYPE_NR 16 - -#endif diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index 7cb1699fbfc4..c4ce477c5261 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -19,7 +19,6 @@ obj-$(CONFIG_MODULES) += proc-syms.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o -obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c deleted file mode 100644 index 187fab227b50..000000000000 --- a/arch/arm/mm/highmem.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * arch/arm/mm/highmem.c -- ARM highmem support - * - * Author: Nicolas Pitre - * Created: september 8, 2008 - * Copyright: Marvell Semiconductors Inc. - */ - -#include -#include -#include -#include -#include -#include -#include "mm.h" - -static inline void set_fixmap_pte(int idx, pte_t pte) -{ - unsigned long vaddr = __fix_to_virt(idx); - pte_t *ptep = virt_to_kpte(vaddr); - - set_pte_ext(ptep, pte, 0); - local_flush_tlb_kernel_page(vaddr); -} - -static inline pte_t get_fixmap_pte(unsigned long vaddr) -{ - pte_t *ptep = virt_to_kpte(vaddr); - - return *ptep; -} - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned int idx; - unsigned long vaddr; - void *kmap; - int type; - -#ifdef CONFIG_DEBUG_HIGHMEM - /* - * There is no cache coherency issue when non VIVT, so force the - * dedicated kmap usage for better debugging purposes in that case. - */ - if (!cache_is_vivt()) - kmap = NULL; - else -#endif - kmap = kmap_high_get(page); - if (kmap) - return kmap; - - type = kmap_atomic_idx_push(); - - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(idx); -#ifdef CONFIG_DEBUG_HIGHMEM - /* - * With debugging enabled, kunmap_atomic forces that entry to 0. - * Make sure it was indeed properly unmapped. - */ - BUG_ON(!pte_none(get_fixmap_pte(vaddr))); -#endif - /* - * When debugging is off, kunmap_atomic leaves the previous mapping - * in place, so the contained TLB flush ensures the TLB is updated - * with the new mapping. - */ - set_fixmap_pte(idx, mk_pte(page, prot)); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int idx, type; - - if (kvaddr >= (void *)FIXADDR_START) { - type = kmap_atomic_idx(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); - - if (cache_is_vivt()) - __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr != __fix_to_virt(idx)); - set_fixmap_pte(idx, __pte(0)); -#else - (void) idx; /* to kill a warning */ -#endif - kmap_atomic_idx_pop(); - } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - /* this address was obtained through kmap_high_get() */ - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); - } -} -EXPORT_SYMBOL(kunmap_atomic_high); - -void *kmap_atomic_pfn(unsigned long pfn) -{ - unsigned long vaddr; - int idx, type; - struct page *page = pfn_to_page(pfn); - - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(get_fixmap_pte(vaddr))); -#endif - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); - - return (void *)vaddr; -} From 5af627a043e39d3226eecd75753dcd2c920c16ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:23 +0100 Subject: [PATCH 47/77] csky/mm/highmem: Switch to generic kmap atomic No reason having the same code in every architecture. Signed-off-by: Thomas Gleixner Cc: Guo Ren Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095857.681196473@linutronix.de --- arch/csky/Kconfig | 1 + arch/csky/include/asm/fixmap.h | 4 +- arch/csky/include/asm/highmem.h | 6 ++- arch/csky/mm/highmem.c | 75 +-------------------------------- 4 files changed, 8 insertions(+), 78 deletions(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 268fad5f51cf..7a86481a22ff 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -286,6 +286,7 @@ config NR_CPUS config HIGHMEM bool "High Memory Support" depends on !CPU_CK610 + select KMAP_LOCAL default y config FORCE_MAX_ZONEORDER diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h index 81f9477d5330..4b589cc20900 100644 --- a/arch/csky/include/asm/fixmap.h +++ b/arch/csky/include/asm/fixmap.h @@ -8,7 +8,7 @@ #include #ifdef CONFIG_HIGHMEM #include -#include +#include #endif enum fixed_addresses { @@ -17,7 +17,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 14645e3d5cd5..1f4ed3f4c0d9 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include /* undef for production */ @@ -32,10 +32,12 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() do {} while (0) +#define arch_kmap_local_post_map(vaddr, pteval) kmap_flush_tlb(vaddr) +#define arch_kmap_local_post_unmap(vaddr) kmap_flush_tlb(vaddr) + extern void kmap_init(void); #endif /* __KERNEL__ */ diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 89c10800a002..4161df3c6c15 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -9,8 +9,6 @@ #include #include -static pte_t *kmap_pte; - unsigned long highstart_pfn, highend_pfn; void kmap_flush_tlb(unsigned long addr) @@ -19,67 +17,7 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte - idx))); -#endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); - flush_tlb_one((unsigned long)vaddr); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int idx; - - if (vaddr < FIXADDR_START) - return; - -#ifdef CONFIG_DEBUG_HIGHMEM - idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); - - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - - pte_clear(&init_mm, vaddr, kmap_pte - idx); - flush_tlb_one(vaddr); -#else - (void) idx; /* to kill a warning */ -#endif - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - unsigned long vaddr; - int idx, type; - - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); - flush_tlb_one(vaddr); - - return (void *) vaddr; -} - -static void __init kmap_pages_init(void) +void __init kmap_init(void) { unsigned long vaddr; pgd_t *pgd; @@ -96,14 +34,3 @@ static void __init kmap_pages_init(void) pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } - -void __init kmap_init(void) -{ - unsigned long vaddr; - - kmap_pages_init(); - - vaddr = __fix_to_virt(FIX_KMAP_BEGIN); - - kmap_pte = pte_offset_kernel((pmd_t *)pgd_offset_k(vaddr), vaddr); -} From 7ac1b26b0a7288fc8f87aa8978891375f23740b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:24 +0100 Subject: [PATCH 48/77] microblaze/mm/highmem: Switch to generic kmap atomic No reason having the same code in every architecture. Signed-off-by: Thomas Gleixner Cc: Michal Simek Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095857.777445435@linutronix.de --- arch/microblaze/Kconfig | 1 + arch/microblaze/include/asm/fixmap.h | 4 +- arch/microblaze/include/asm/highmem.h | 6 ++- arch/microblaze/mm/Makefile | 1 - arch/microblaze/mm/highmem.c | 78 --------------------------- arch/microblaze/mm/init.c | 6 --- 6 files changed, 8 insertions(+), 88 deletions(-) delete mode 100644 arch/microblaze/mm/highmem.c diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 33925ffed68f..7f6ca0ab4f81 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -155,6 +155,7 @@ config XILINX_UNCACHED_SHADOW config HIGHMEM bool "High memory support" depends on MMU + select KMAP_LOCAL help The address space of Microblaze processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h index 0379ce5229e3..e6e9288bff76 100644 --- a/arch/microblaze/include/asm/fixmap.h +++ b/arch/microblaze/include/asm/fixmap.h @@ -20,7 +20,7 @@ #include #ifdef CONFIG_HIGHMEM #include -#include +#include #endif #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) @@ -47,7 +47,7 @@ enum fixed_addresses { FIX_HOLE, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * num_possible_cpus()) - 1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * num_possible_cpus()) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 284ca8fb54c1..4418633fb163 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -25,7 +25,6 @@ #include #include -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; /* @@ -52,6 +51,11 @@ extern pte_t *pkmap_page_table; #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } +#define arch_kmap_local_post_map(vaddr, pteval) \ + local_flush_tlb_page(NULL, vaddr); +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_page(NULL, vaddr); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/microblaze/mm/Makefile b/arch/microblaze/mm/Makefile index 1b16875cea70..8ced71100047 100644 --- a/arch/microblaze/mm/Makefile +++ b/arch/microblaze/mm/Makefile @@ -6,4 +6,3 @@ obj-y := consistent.o init.o obj-$(CONFIG_MMU) += pgtable.o mmu_context.o fault.o -obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c deleted file mode 100644 index 92e0890416c9..000000000000 --- a/arch/microblaze/mm/highmem.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * PowerPC version, stolen from the i386 version. - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@pdb.siemens.de - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terrabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. - * - * Copyright (C) 1999 Ingo Molnar - * - * Reworked for PowerPC by various contributors. Moved from - * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. - */ - -#include -#include - -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ -#include - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif - set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); - local_flush_tlb_page(NULL, vaddr); - - return (void *) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type; - unsigned int idx; - - if (vaddr < __fix_to_virt(FIX_KMAP_END)) - return; - - type = kmap_atomic_idx(); - - idx = type + KM_TYPE_NR * smp_processor_id(); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_page(NULL, vaddr); - - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 45da639bd22c..1f4b5b34e600 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -49,17 +49,11 @@ unsigned long lowmem_size; EXPORT_SYMBOL(min_low_pfn); EXPORT_SYMBOL(max_low_pfn); -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -EXPORT_SYMBOL(kmap_pte); - static void __init highmem_init(void) { pr_debug("%x\n", (u32)PKMAP_BASE); map_page(PKMAP_BASE, 0, 0); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); - - kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); } static void highmem_setup(void) From a4c33e83bca133ff979e13c784c7605e1ac143df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:25 +0100 Subject: [PATCH 49/77] mips/mm/highmem: Switch to generic kmap atomic No reason having the same code in every architecture Signed-off-by: Thomas Gleixner Cc: Thomas Bogendoerfer Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095857.885321106@linutronix.de --- arch/mips/Kconfig | 1 + arch/mips/include/asm/fixmap.h | 4 +- arch/mips/include/asm/highmem.h | 6 +-- arch/mips/include/asm/kmap_types.h | 13 ----- arch/mips/mm/highmem.c | 77 ------------------------------ arch/mips/mm/init.c | 4 -- 6 files changed, 6 insertions(+), 99 deletions(-) delete mode 100644 arch/mips/include/asm/kmap_types.h diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2000bb2b0220..6b762bebff33 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2719,6 +2719,7 @@ config WAR_MIPS34K_MISSED_ITLB config HIGHMEM bool "High Memory Support" depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA + select KMAP_LOCAL config CPU_SUPPORTS_HIGHMEM bool diff --git a/arch/mips/include/asm/fixmap.h b/arch/mips/include/asm/fixmap.h index 743535be7528..beea14761cef 100644 --- a/arch/mips/include/asm/fixmap.h +++ b/arch/mips/include/asm/fixmap.h @@ -17,7 +17,7 @@ #include #ifdef CONFIG_HIGHMEM #include -#include +#include #endif /* @@ -52,7 +52,7 @@ enum fixed_addresses { #ifdef CONFIG_HIGHMEM /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN = FIX_CMAP_END + 1, - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index f1f788b57166..19edf8e69971 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; @@ -48,11 +48,11 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) -extern void kmap_init(void); +#define arch_kmap_local_post_map(vaddr, pteval) local_flush_tlb_one(vaddr) +#define arch_kmap_local_post_unmap(vaddr) local_flush_tlb_one(vaddr) #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/kmap_types.h b/arch/mips/include/asm/kmap_types.h deleted file mode 100644 index 16665dc2431b..000000000000 --- a/arch/mips/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 5fec7f45d79a..57e2f08f00d0 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -8,8 +8,6 @@ #include #include -static pte_t *kmap_pte; - unsigned long highstart_pfn, highend_pfn; void kmap_flush_tlb(unsigned long addr) @@ -17,78 +15,3 @@ void kmap_flush_tlb(unsigned long addr) flush_tlb_one(addr); } EXPORT_SYMBOL(kmap_flush_tlb); - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte - idx))); -#endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); - local_flush_tlb_one((unsigned long)vaddr); - - return (void*) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type __maybe_unused; - - if (vaddr < FIXADDR_START) - return; - - type = kmap_atomic_idx(); -#ifdef CONFIG_DEBUG_HIGHMEM - { - int idx = type + KM_TYPE_NR * smp_processor_id(); - - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_one(vaddr); - } -#endif - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); - flush_tlb_one(vaddr); - - return (void*) vaddr; -} - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 07e84a774938..bc80893e5c0f 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -402,9 +401,6 @@ void __init paging_init(void) pagetable_init(); -#ifdef CONFIG_HIGHMEM - kmap_init(); -#endif #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif From 5f037ea3b26767e0b1bdc522948321b282268b49 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:26 +0100 Subject: [PATCH 50/77] nds32/mm/highmem: Switch to generic kmap atomic The mapping code is odd and looks broken. See FIXME in the comment. Also fix the harmless off by one in the FIX_KMAP_END define. Signed-off-by: Thomas Gleixner Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095857.980576055@linutronix.de --- arch/nds32/Kconfig.cpu | 1 + arch/nds32/include/asm/fixmap.h | 4 +-- arch/nds32/include/asm/highmem.h | 22 +++++++++++---- arch/nds32/mm/Makefile | 1 - arch/nds32/mm/highmem.c | 48 -------------------------------- 5 files changed, 19 insertions(+), 57 deletions(-) delete mode 100644 arch/nds32/mm/highmem.c diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu index f88a12fdf0f3..c10759952485 100644 --- a/arch/nds32/Kconfig.cpu +++ b/arch/nds32/Kconfig.cpu @@ -157,6 +157,7 @@ config HW_SUPPORT_UNALIGNMENT_ACCESS config HIGHMEM bool "High Memory Support" depends on MMU && !CPU_CACHE_ALIASING + select KMAP_LOCAL help The address space of Andes processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/nds32/include/asm/fixmap.h b/arch/nds32/include/asm/fixmap.h index 5a4bf11e5800..2fa09a2de428 100644 --- a/arch/nds32/include/asm/fixmap.h +++ b/arch/nds32/include/asm/fixmap.h @@ -6,7 +6,7 @@ #ifdef CONFIG_HIGHMEM #include -#include +#include #endif enum fixed_addresses { @@ -14,7 +14,7 @@ enum fixed_addresses { FIX_KMAP_RESERVED, FIX_KMAP_BEGIN, #ifdef CONFIG_HIGHMEM - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS), + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif FIX_EARLYCON_MEM_BASE, __end_of_fixed_addresses diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index fe986d0e6e3f..16159a8716f2 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -5,7 +5,6 @@ #define _ASM_HIGHMEM_H #include -#include #include /* @@ -45,11 +44,22 @@ extern pte_t *pkmap_page_table; extern void kmap_init(void); /* - * The following functions are already defined by - * when CONFIG_HIGHMEM is not set. + * FIXME: The below looks broken vs. a kmap_atomic() in task context which + * is interupted and another kmap_atomic() happens in interrupt context. + * But what do I know about nds32. -- tglx */ -#ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_pfn(unsigned long pfn); -#endif +#define arch_kmap_local_post_map(vaddr, pteval) \ + do { \ + __nds32__tlbop_inv(vaddr); \ + __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); \ + __nds32__tlbop_rwr(pteval); \ + __nds32__isb(); \ + } while (0) + +#define arch_kmap_local_pre_unmap(vaddr) \ + do { \ + __nds32__tlbop_inv(vaddr); \ + __nds32__isb(); \ + } while (0) #endif diff --git a/arch/nds32/mm/Makefile b/arch/nds32/mm/Makefile index 897ecaf5cf54..14fb2e8eb036 100644 --- a/arch/nds32/mm/Makefile +++ b/arch/nds32/mm/Makefile @@ -3,7 +3,6 @@ obj-y := extable.o tlb.o fault.o init.o mmap.o \ mm-nds32.o cacheflush.o proc.o obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o -obj-$(CONFIG_HIGHMEM) += highmem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE) diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c deleted file mode 100644 index 4284cd59e21a..000000000000 --- a/arch/nds32/mm/highmem.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2005-2017 Andes Technology Corporation - -#include -#include -#include -#include -#include -#include -#include -#include - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned int idx; - unsigned long vaddr, pte; - int type; - pte_t *ptep; - - type = kmap_atomic_idx_push(); - - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; - ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); - set_pte(ptep, pte); - - __nds32__tlbop_inv(vaddr); - __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); - __nds32__tlbop_rwr(pte); - __nds32__isb(); - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - if (kvaddr >= (void *)FIXADDR_START) { - unsigned long vaddr = (unsigned long)kvaddr; - pte_t *ptep; - kmap_atomic_idx_pop(); - __nds32__tlbop_inv(vaddr); - __nds32__isb(); - ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); - set_pte(ptep, 0); - } -} -EXPORT_SYMBOL(kunmap_atomic_high); From 47da42b27a56f3ee5abace2858b69e277703f707 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:27 +0100 Subject: [PATCH 51/77] powerpc/mm/highmem: Switch to generic kmap atomic No reason having the same code in every architecture Signed-off-by: Thomas Gleixner Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.087635810@linutronix.de --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/fixmap.h | 4 +- arch/powerpc/include/asm/highmem.h | 7 ++- arch/powerpc/include/asm/kmap_types.h | 13 ------ arch/powerpc/mm/Makefile | 1 - arch/powerpc/mm/highmem.c | 67 --------------------------- arch/powerpc/mm/mem.c | 7 --- 7 files changed, 8 insertions(+), 92 deletions(-) delete mode 100644 arch/powerpc/include/asm/kmap_types.h delete mode 100644 arch/powerpc/mm/highmem.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe08492..d4cfddc0fb4c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -409,6 +409,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" depends on PPC32 + select KMAP_LOCAL source "kernel/Kconfig.hz" diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 6bfc87915d5d..8d03c16a3663 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -20,7 +20,7 @@ #include #ifdef CONFIG_HIGHMEM #include -#include +#include #endif #ifdef CONFIG_KASAN @@ -55,7 +55,7 @@ enum fixed_addresses { FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, PAGE_SIZE)/PAGE_SIZE)-1, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif #ifdef CONFIG_PPC_8xx /* For IMMR we need an aligned 512K area */ diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 104026f7d6bc..80a5ae771c65 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -24,12 +24,10 @@ #ifdef __KERNEL__ #include -#include #include #include #include -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; /* @@ -60,6 +58,11 @@ extern pte_t *pkmap_page_table; #define flush_cache_kmaps() flush_cache_all() +#define arch_kmap_local_post_map(vaddr, pteval) \ + local_flush_tlb_page(NULL, vaddr) +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_page(NULL, vaddr) + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/powerpc/include/asm/kmap_types.h b/arch/powerpc/include/asm/kmap_types.h deleted file mode 100644 index c8fa182d48c8..000000000000 --- a/arch/powerpc/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_KMAP_TYPES_H -#define _ASM_POWERPC_KMAP_TYPES_H - -#ifdef __KERNEL__ - -/* - */ - -#define KM_TYPE_NR 16 - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_KMAP_TYPES_H */ diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 5e147986400d..1c552b53aa63 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -16,7 +16,6 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o -obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c deleted file mode 100644 index 624b4438aff9..000000000000 --- a/arch/powerpc/mm/highmem.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * PowerPC version, stolen from the i386 version. - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@pdb.siemens.de - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terrabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. - * - * Copyright (C) 1999 Ingo Molnar - * - * Reworked for PowerPC by various contributors. Moved from - * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. - */ - -#include -#include - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); - __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); - local_flush_tlb_page(NULL, vaddr); - - return (void*) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr < __fix_to_virt(FIX_KMAP_END)) - return; - - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { - int type = kmap_atomic_idx(); - unsigned int idx; - - idx = type + KM_TYPE_NR * smp_processor_id(); - WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_page(NULL, vaddr); - } - - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 01ec2a252f09..375a9894063b 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -61,11 +61,6 @@ unsigned long long memory_limit; bool init_mem_is_free; -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -EXPORT_SYMBOL(kmap_pte); -#endif - pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -235,8 +230,6 @@ void __init paging_init(void) map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); - - kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", From 3293efa9780712ad8504689e0c296d2bd33827d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:28 +0100 Subject: [PATCH 52/77] sparc/mm/highmem: Switch to generic kmap atomic No reason having the same code in every architecture Signed-off-by: Thomas Gleixner Cc: "David S. Miller" Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.197568209@linutronix.de --- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/highmem.h | 8 +- arch/sparc/include/asm/kmap_types.h | 11 --- arch/sparc/include/asm/vaddrs.h | 4 +- arch/sparc/mm/Makefile | 3 - arch/sparc/mm/highmem.c | 115 ---------------------------- arch/sparc/mm/srmmu.c | 2 - 7 files changed, 8 insertions(+), 136 deletions(-) delete mode 100644 arch/sparc/include/asm/kmap_types.h delete mode 100644 arch/sparc/mm/highmem.c diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a6ca135442f9..e841708cb830 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -139,6 +139,7 @@ config MMU config HIGHMEM bool default y if SPARC32 + select KMAP_LOCAL config ZONE_DMA bool diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 6c35f0d27ee1..875116209ec1 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -24,7 +24,6 @@ #include #include #include -#include #include /* declarations for highmem.c */ @@ -33,8 +32,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; -void kmap_init(void) __init; - /* * Right now we initialize only a single pte table. It can be extended * easily, subsequent pte tables have to be allocated in one physical @@ -53,6 +50,11 @@ void kmap_init(void) __init; #define flush_cache_kmaps() flush_cache_all() +/* FIXME: Use __flush_tlb_one(vaddr) instead of flush_cache_all() -- Anton */ +#define arch_kmap_local_post_map(vaddr, pteval) flush_cache_all() +#define arch_kmap_local_post_unmap(vaddr) flush_cache_all() + + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/sparc/include/asm/kmap_types.h b/arch/sparc/include/asm/kmap_types.h deleted file mode 100644 index 55a99b6bd91e..000000000000 --- a/arch/sparc/include/asm/kmap_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* Dummy header just to define km_type. None of this - * is actually used on sparc. -DaveM - */ - -#include - -#endif diff --git a/arch/sparc/include/asm/vaddrs.h b/arch/sparc/include/asm/vaddrs.h index 84d054b07a6f..4fec0341e2a8 100644 --- a/arch/sparc/include/asm/vaddrs.h +++ b/arch/sparc/include/asm/vaddrs.h @@ -32,13 +32,13 @@ #define SRMMU_NOCACHE_ALCRATIO 64 /* 256 pages per 64MB of system RAM */ #ifndef __ASSEMBLY__ -#include +#include enum fixed_addresses { FIX_HOLE, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, - FIX_KMAP_END = (KM_TYPE_NR * NR_CPUS), + FIX_KMAP_END = (KM_MAX_IDX * NR_CPUS), #endif __end_of_fixed_addresses }; diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile index b078205b70e0..68db1f859b02 100644 --- a/arch/sparc/mm/Makefile +++ b/arch/sparc/mm/Makefile @@ -15,6 +15,3 @@ obj-$(CONFIG_SPARC32) += leon_mm.o # Only used by sparc64 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o - -# Only used by sparc32 -obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c deleted file mode 100644 index 8f2a2afb048a..000000000000 --- a/arch/sparc/mm/highmem.c +++ /dev/null @@ -1,115 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * Provides kernel-static versions of atomic kmap functions originally - * found as inlines in include/asm-sparc/highmem.h. These became - * needed as kmap_atomic() and kunmap_atomic() started getting - * called from within modules. - * -- Tomas Szepe , September 2002 - * - * But kmap_atomic() and kunmap_atomic() cannot be inlined in - * modules because they are loaded with btfixup-ped functions. - */ - -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need it. - * - * XXX This is an old text. Actually, it's good to use atomic kmaps, - * provided you remember that they are atomic and not try to sleep - * with a kmap taken, much like a spinlock. Non-atomic kmaps are - * shared by CPUs, and so precious, and establishing them requires IPI. - * Atomic kmaps are lightweight and we may have NCPUS more of them. - */ -#include -#include -#include - -#include -#include -#include - -static pte_t *kmap_pte; - -void __init kmap_init(void) -{ - unsigned long address = __fix_to_virt(FIX_KMAP_BEGIN); - - /* cache the first kmap pte */ - kmap_pte = virt_to_kpte(address); -} - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - long idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - -/* XXX Fix - Anton */ -#if 0 - __flush_cache_one(vaddr); -#else - flush_cache_all(); -#endif - -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); -/* XXX Fix - Anton */ -#if 0 - __flush_tlb_one(vaddr); -#else - flush_tlb_all(); -#endif - - return (void*) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type; - - if (vaddr < FIXADDR_START) - return; - - type = kmap_atomic_idx(); - -#ifdef CONFIG_DEBUG_HIGHMEM - { - unsigned long idx; - - idx = type + KM_TYPE_NR * smp_processor_id(); - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); - - /* XXX Fix - Anton */ -#if 0 - __flush_cache_one(vaddr); -#else - flush_cache_all(); -#endif - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - /* XXX Fix - Anton */ -#if 0 - __flush_tlb_one(vaddr); -#else - flush_tlb_all(); -#endif - } -#endif - - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 0070f8b9a753..a03caa5f6628 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -971,8 +971,6 @@ void __init srmmu_paging_init(void) sparc_context_init(num_contexts); - kmap_init(); - { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; From 629ed3f7dad2a914b3a89fad49b358e363e3e6d1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:29 +0100 Subject: [PATCH 53/77] xtensa/mm/highmem: Switch to generic kmap atomic No reason having the same code in every architecture Signed-off-by: Thomas Gleixner Cc: Chris Zankel Cc: Max Filippov Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.311016780@linutronix.de --- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/fixmap.h | 4 +-- arch/xtensa/include/asm/highmem.h | 12 ++++++-- arch/xtensa/mm/highmem.c | 46 ++++--------------------------- 4 files changed, 18 insertions(+), 45 deletions(-) diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index d0dfa50bd0bb..dc22ef3cf4be 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -666,6 +666,7 @@ endchoice config HIGHMEM bool "High Memory Support" depends on MMU + select KMAP_LOCAL help Linux can use the full amount of RAM in the system by default. However, the default MMUv2 setup only maps the diff --git a/arch/xtensa/include/asm/fixmap.h b/arch/xtensa/include/asm/fixmap.h index a06ffb0c61c7..92049b61c351 100644 --- a/arch/xtensa/include/asm/fixmap.h +++ b/arch/xtensa/include/asm/fixmap.h @@ -16,7 +16,7 @@ #ifdef CONFIG_HIGHMEM #include #include -#include +#include #endif /* @@ -39,7 +39,7 @@ enum fixed_addresses { /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN, FIX_KMAP_END = FIX_KMAP_BEGIN + - (KM_TYPE_NR * NR_CPUS * DCACHE_N_COLORS) - 1, + (KM_MAX_IDX * NR_CPUS * DCACHE_N_COLORS) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index eac503215f17..0fc3b1cebc56 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -16,9 +16,8 @@ #include #include #include -#include -#define PKMAP_BASE ((FIXADDR_START - \ +#define PKMAP_BASE ((FIXADDR_START - \ (LAST_PKMAP + 1) * PAGE_SIZE) & PMD_MASK) #define LAST_PKMAP (PTRS_PER_PTE * DCACHE_N_COLORS) #define LAST_PKMAP_MASK (LAST_PKMAP - 1) @@ -68,6 +67,15 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } +enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn); +#define arch_kmap_local_map_idx kmap_local_map_idx + +enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr); +#define arch_kmap_local_unmap_idx kmap_local_unmap_idx + +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) + void kmap_init(void); #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 673196fe862e..0735ca5e8f86 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -12,8 +12,6 @@ #include #include -static pte_t *kmap_pte; - #if DCACHE_WAY_SIZE > PAGE_SIZE unsigned int last_pkmap_nr_arr[DCACHE_N_COLORS]; wait_queue_head_t pkmap_map_wait_arr[DCACHE_N_COLORS]; @@ -33,59 +31,25 @@ static inline void kmap_waitqueues_init(void) static inline enum fixed_addresses kmap_idx(int type, unsigned long color) { - return (type + KM_TYPE_NR * smp_processor_id()) * DCACHE_N_COLORS + + return (type + KM_MAX_IDX * smp_processor_id()) * DCACHE_N_COLORS + color; } -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn) { - enum fixed_addresses idx; - unsigned long vaddr; - - idx = kmap_idx(kmap_atomic_idx_push(), - DCACHE_ALIAS(page_to_phys(page))); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte + idx))); -#endif - set_pte(kmap_pte + idx, mk_pte(page, prot)); - - return (void *)vaddr; + return kmap_idx(type, DCACHE_ALIAS(pfn << PAGE_SHIFT)); } -EXPORT_SYMBOL(kmap_atomic_high_prot); -void kunmap_atomic_high(void *kvaddr) +enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr) { - if (kvaddr >= (void *)FIXADDR_START && - kvaddr < (void *)FIXADDR_TOP) { - int idx = kmap_idx(kmap_atomic_idx(), - DCACHE_ALIAS((unsigned long)kvaddr)); - - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - pte_clear(&init_mm, kvaddr, kmap_pte + idx); - local_flush_tlb_kernel_range((unsigned long)kvaddr, - (unsigned long)kvaddr + PAGE_SIZE); - - kmap_atomic_idx_pop(); - } + return kmap_idx(type, DCACHE_ALIAS(addr)); } -EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { - unsigned long kmap_vstart; - /* Check if this memory layout is broken because PKMAP overlaps * page table. */ BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); kmap_waitqueues_init(); } From d7029e4549691ecaf1ead536d3322a00bda85659 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:30 +0100 Subject: [PATCH 54/77] highmem: Get rid of kmap_types.h The header is not longer used and on alpha, ia64, openrisc, parisc and um it was completely unused anyway as these architectures have no highmem support. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Andrew Morton Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.422094352@linutronix.de --- arch/alpha/include/asm/kmap_types.h | 15 --------------- arch/ia64/include/asm/kmap_types.h | 13 ------------- arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/ioremap.c | 1 - arch/parisc/include/asm/kmap_types.h | 13 ------------- arch/um/include/asm/fixmap.h | 1 - arch/um/include/asm/kmap_types.h | 13 ------------- include/asm-generic/Kbuild | 1 - include/asm-generic/kmap_types.h | 11 ----------- include/linux/highmem.h | 2 -- 10 files changed, 71 deletions(-) delete mode 100644 arch/alpha/include/asm/kmap_types.h delete mode 100644 arch/ia64/include/asm/kmap_types.h delete mode 100644 arch/parisc/include/asm/kmap_types.h delete mode 100644 arch/um/include/asm/kmap_types.h delete mode 100644 include/asm-generic/kmap_types.h diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h deleted file mode 100644 index 651714b45729..000000000000 --- a/arch/alpha/include/asm/kmap_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* Dummy header just to define km_type. */ - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h deleted file mode 100644 index 5c268cf7c2bd..000000000000 --- a/arch/ia64/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_IA64_KMAP_TYPES_H -#define _ASM_IA64_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif /* _ASM_IA64_KMAP_TYPES_H */ diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 8348feaaf46e..bf9b2310fc93 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index a978590d802d..5aed97a18bac 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h deleted file mode 100644 index 3e70b5cd1123..000000000000 --- a/arch/parisc/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h index 2c697a145ac1..2efac5827188 100644 --- a/arch/um/include/asm/fixmap.h +++ b/arch/um/include/asm/fixmap.h @@ -3,7 +3,6 @@ #define __UM_FIXMAP_H #include -#include #include #include #include diff --git a/arch/um/include/asm/kmap_types.h b/arch/um/include/asm/kmap_types.h deleted file mode 100644 index b0bd12de1d23..000000000000 --- a/arch/um/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - */ - -#ifndef __UM_KMAP_TYPES_H -#define __UM_KMAP_TYPES_H - -/* No more #include "asm/arch/kmap_types.h" ! */ - -#define KM_TYPE_NR 14 - -#endif diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index ed62d384fb79..4365b9aa3e3f 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -30,7 +30,6 @@ mandatory-y += irq.h mandatory-y += irq_regs.h mandatory-y += irq_work.h mandatory-y += kdebug.h -mandatory-y += kmap_types.h mandatory-y += kmap_size.h mandatory-y += kprobes.h mandatory-y += linkage.h diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h deleted file mode 100644 index 9f95b7b63d19..000000000000 --- a/include/asm-generic/kmap_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_KMAP_TYPES_H -#define _ASM_GENERIC_KMAP_TYPES_H - -#ifdef __WITH_KM_FENCE -# define KM_TYPE_NR 41 -#else -# define KM_TYPE_NR 20 -#endif - -#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 1222a31be842..de78869454b1 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -29,8 +29,6 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) } #endif -#include - /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ From 3c1016b53c311906878c703af1e2b29855a9a962 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:31 +0100 Subject: [PATCH 55/77] mm/highmem: Remove the old kmap_atomic cruft All users gone. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Cc: Arnd Bergmann Link: https://lore.kernel.org/r/20201103095858.516281567@linutronix.de --- include/linux/highmem.h | 67 ++++------------------------------------- mm/highmem.c | 7 +---- 2 files changed, 7 insertions(+), 67 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index de78869454b1..3180a8f7307f 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -86,26 +86,6 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ - -#ifndef CONFIG_KMAP_LOCAL -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -void kunmap_atomic_high(void *kvaddr); - -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_atomic_high_prot(page, prot); -} - -static inline void __kunmap_atomic(void *vaddr) -{ - kunmap_atomic_high(vaddr); -} -#else /* !CONFIG_KMAP_LOCAL */ - static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); @@ -113,6 +93,11 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return __kmap_local_page_prot(page, prot); } +static inline void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} + static inline void *kmap_atomic_pfn(unsigned long pfn) { preempt_disable(); @@ -125,13 +110,6 @@ static inline void __kunmap_atomic(void *addr) kunmap_local_indexed(addr); } -#endif /* CONFIG_KMAP_LOCAL */ - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -212,42 +190,9 @@ static inline void __kunmap_atomic(void *addr) #define kmap_flush_unused() do {} while(0) + #endif /* CONFIG_HIGHMEM */ -#if !defined(CONFIG_KMAP_LOCAL) -#if defined(CONFIG_HIGHMEM) - -DECLARE_PER_CPU(int, __kmap_atomic_idx); - -static inline int kmap_atomic_idx_push(void) -{ - int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_TYPE_NR); -#endif - return idx; -} - -static inline int kmap_atomic_idx(void) -{ - return __this_cpu_read(__kmap_atomic_idx) - 1; -} - -static inline void kmap_atomic_idx_pop(void) -{ -#ifdef CONFIG_DEBUG_HIGHMEM - int idx = __this_cpu_dec_return(__kmap_atomic_idx); - - BUG_ON(idx < 0); -#else - __this_cpu_dec(__kmap_atomic_idx); -#endif -} -#endif -#endif - /* * Prevent people trying to call kunmap_atomic() as if it were kunmap() * kunmap_atomic() should get the return value of kmap_atomic, not the page. diff --git a/mm/highmem.c b/mm/highmem.c index 77677c6844f7..499dfafd36b7 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,12 +31,6 @@ #include #include -#ifndef CONFIG_KMAP_LOCAL -#ifdef CONFIG_HIGHMEM -DEFINE_PER_CPU(int, __kmap_atomic_idx); -#endif -#endif - /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -410,6 +404,7 @@ static inline void kmap_local_idx_pop(void) #ifndef arch_kmap_local_post_map # define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) #endif + #ifndef arch_kmap_local_pre_unmap # define arch_kmap_local_pre_unmap(vaddr) do { } while (0) #endif From 351191ad55c8a1eccaf23e4187c62056229c0779 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:32 +0100 Subject: [PATCH 56/77] io-mapping: Cleanup atomic iomap Switch the atomic iomap implementation over to kmap_local and stick the preempt/pagefault mechanics into the generic code similar to the kmap_atomic variants. Rename the x86 map function in preparation for a non-atomic variant. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095858.625310005@linutronix.de --- arch/x86/include/asm/iomap.h | 9 +-------- arch/x86/mm/iomap_32.c | 6 ++---- include/linux/io-mapping.h | 8 ++++++-- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0be7a30fd6bc..e2de092fc38c 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -13,14 +13,7 @@ #include #include -void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot); - -static inline void iounmap_atomic(void __iomem *vaddr) -{ - kunmap_local_indexed((void __force *)vaddr); - pagefault_enable(); - preempt_enable(); -} +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index e0a40d7cc66c..9aaa756ddf21 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,7 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -60,8 +60,6 @@ void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - preempt_disable(); - pagefault_disable(); return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot); +EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 3b0940be72e9..60e7c83e4904 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -69,13 +69,17 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); + preempt_disable(); + pagefault_disable(); + return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } static inline void io_mapping_unmap_atomic(void __iomem *vaddr) { - iounmap_atomic(vaddr); + kunmap_local_indexed((void __force *)vaddr); + pagefault_enable(); + preempt_enable(); } static inline void __iomem * From 9bf6f7bab3bad4b30f108fca25aa5297dff0973f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:33 +0100 Subject: [PATCH 57/77] Documentation/io-mapping: Remove outdated blurb The implementation details in the documentation are outdated and not really helpful. Remove them. Signed-off-by: Thomas Gleixner Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095858.734064977@linutronix.de --- Documentation/driver-api/io-mapping.rst | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst index a966239f04e4..e33b88268554 100644 --- a/Documentation/driver-api/io-mapping.rst +++ b/Documentation/driver-api/io-mapping.rst @@ -73,25 +73,3 @@ for pages mapped with io_mapping_map_wc. At driver close time, the io_mapping object must be freed:: void io_mapping_free(struct io_mapping *mapping) - -Current Implementation -====================== - -The initial implementation of these functions uses existing mapping -mechanisms and so provides only an abstraction layer and no new -functionality. - -On 64-bit processors, io_mapping_create_wc calls ioremap_wc for the whole -range, creating a permanent kernel-visible mapping to the resource. The -map_atomic and map functions add the requested offset to the base of the -virtual address returned by ioremap_wc. - -On 32-bit processors with HIGHMEM defined, io_mapping_map_atomic_wc uses -kmap_atomic_pfn to map the specified page in an atomic fashion; -kmap_atomic_pfn isn't really supposed to be used with device pages, but it -provides an efficient mapping for this usage. - -On 32-bit processors without HIGHMEM defined, io_mapping_map_atomic_wc and -io_mapping_map_wc both use ioremap_wc, a terribly inefficient function which -performs an IPI to inform all processors about the new mapping. This results -in a significant performance penalty. From 13f876ba77ebd5125799bb042201f22cf73df154 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 3 Nov 2020 10:27:34 +0100 Subject: [PATCH 58/77] highmem: High implementation details and document API Move the gory details of kmap & al into a private header and only document the interfaces which are usable by drivers. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Christoph Hellwig Cc: Andrew Morton Link: https://lore.kernel.org/r/20201103095858.827582066@linutronix.de --- include/linux/highmem-internal.h | 174 +++++++++++++++++++ include/linux/highmem.h | 278 ++++++++++++------------------- mm/highmem.c | 11 +- 3 files changed, 280 insertions(+), 183 deletions(-) create mode 100644 include/linux/highmem-internal.h diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h new file mode 100644 index 000000000000..6ceed907b14e --- /dev/null +++ b/include/linux/highmem-internal.h @@ -0,0 +1,174 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HIGHMEM_INTERNAL_H +#define _LINUX_HIGHMEM_INTERNAL_H + +/* + * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. + */ +#ifdef CONFIG_KMAP_LOCAL +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); +void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void kunmap_local_indexed(void *vaddr); +#endif + +#ifdef CONFIG_HIGHMEM +#include + +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + +void *kmap_high(struct page *page); +void kunmap_high(struct page *page); +void __kmap_flush_unused(void); +struct page *__kmap_to_page(void *addr); + +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +static inline struct page *kmap_to_page(void *addr) +{ + return __kmap_to_page(addr); +} + +static inline void kmap_flush_unused(void) +{ + __kmap_flush_unused(); +} + +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + preempt_disable(); + pagefault_disable(); + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_atomic(void *addr) +{ + kunmap_local_indexed(addr); + pagefault_enable(); + preempt_enable(); +} + +unsigned int __nr_free_highpages(void); +extern atomic_long_t _totalhigh_pages; + +static inline unsigned int nr_free_highpages(void) +{ + return __nr_free_highpages(); +} + +static inline unsigned long totalhigh_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +#else /* CONFIG_HIGHMEM */ + +static inline struct page *kmap_to_page(void *addr) +{ + return virt_to_page(addr); +} + +static inline void *kmap(struct page *page) +{ + might_sleep(); + return page_address(page); +} + +static inline void kunmap_high(struct page *page) { } +static inline void kmap_flush_unused(void) { } + +static inline void kunmap(struct page *page) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif +} + +static inline void *kmap_atomic(struct page *page) +{ + preempt_disable(); + pagefault_disable(); + return page_address(page); +} + +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + return kmap_atomic(page); +} + +static inline void *kmap_atomic_pfn(unsigned long pfn) +{ + return kmap_atomic(pfn_to_page(pfn)); +} + +static inline void __kunmap_atomic(void *addr) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif + pagefault_enable(); + preempt_enable(); +} + +static inline unsigned int nr_free_highpages(void) { return 0; } +static inline unsigned long totalhigh_pages(void) { return 0UL; } + +#endif /* CONFIG_HIGHMEM */ + +/* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() + * kunmap_atomic() should get the return value of kmap_atomic, not the page. + */ +#define kunmap_atomic(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_atomic(__addr); \ +} while (0) + +#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3180a8f7307f..7d098bd621f6 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -11,6 +11,108 @@ #include +#include "highmem-internal.h" + +/** + * kmap - Map a page for long term usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Can only be invoked from preemptible task context because on 32bit + * systems with CONFIG_HIGHMEM enabled this function might sleep. + * + * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area + * this returns the virtual address of the direct kernel mapping. + * + * The returned virtual address is globally visible and valid up to the + * point where it is unmapped via kunmap(). The pointer can be handed to + * other contexts. + * + * For highmem pages on 32bit systems this can be slow as the mapping space + * is limited and protected by a global lock. In case that there is no + * mapping slot available the function blocks until a slot is released via + * kunmap(). + */ +static inline void *kmap(struct page *page); + +/** + * kunmap - Unmap the virtual address mapped by kmap() + * @addr: Virtual address to be unmapped + * + * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of + * pages in the low memory area. + */ +static inline void kunmap(struct page *page); + +/** + * kmap_to_page - Get the page for a kmap'ed address + * @addr: The address to look up + * + * Returns: The page which is mapped to @addr. + */ +static inline struct page *kmap_to_page(void *addr); + +/** + * kmap_flush_unused - Flush all unused kmap mappings in order to + * remove stray mappings + */ +static inline void kmap_flush_unused(void); + +/** + * kmap_atomic - Atomically map a page for temporary usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Side effect: On return pagefaults and preemption are disabled. + * + * Can be invoked from any context. + * + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation: + * + * addr1 = kmap_atomic(page1); + * addr2 = kmap_atomic(page2); + * ... + * kunmap_atomic(addr2); + * kunmap_atomic(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * + * Contrary to kmap() mappings the mapping is only valid in the context of + * the caller and cannot be handed to other contexts. + * + * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * + * While it is significantly faster than kmap() it comes with restrictions + * about the pointer validity and the side effects of disabling page faults + * and preemption. Use it only when absolutely necessary, e.g. from non + * preemptible contexts. + */ +static inline void *kmap_atomic(struct page *page); + +/** + * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() + * @addr: Virtual address to be unmapped + * + * Counterpart to kmap_atomic(). + * + * Undoes the side effects of kmap_atomic(), i.e. reenabling pagefaults and + * preemption. + * + * Other than that a NOOP for CONFIG_HIGHMEM=n and for mappings of pages + * in the low memory area. For real highmen pages the mapping which was + * established with kmap_atomic() is destroyed. + */ + +/* Highmem related interfaces for management code */ +static inline unsigned int nr_free_highpages(void); +static inline unsigned long totalhigh_pages(void); + #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { @@ -29,182 +131,6 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) } #endif -/* - * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. - */ -#ifdef CONFIG_KMAP_LOCAL -void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot); -void kunmap_local_indexed(void *vaddr); -#endif - -#ifdef CONFIG_HIGHMEM -#include - -#ifndef ARCH_HAS_KMAP_FLUSH_TLB -static inline void kmap_flush_tlb(unsigned long addr) { } -#endif - -#ifndef kmap_prot -#define kmap_prot PAGE_KERNEL -#endif - -void *kmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - addr = page_address(page); - else - addr = kmap_high(page); - kmap_flush_tlb((unsigned long)addr); - return addr; -} - -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - * - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - return __kmap_local_page_prot(page, prot); -} - -static inline void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} - -static inline void *kmap_atomic_pfn(unsigned long pfn) -{ - preempt_disable(); - pagefault_disable(); - return __kmap_local_pfn_prot(pfn, kmap_prot); -} - -static inline void __kunmap_atomic(void *addr) -{ - kunmap_local_indexed(addr); -} - -/* declarations for linux/mm/highmem.c */ -unsigned int nr_free_highpages(void); -extern atomic_long_t _totalhigh_pages; -static inline unsigned long totalhigh_pages(void) -{ - return (unsigned long)atomic_long_read(&_totalhigh_pages); -} - -static inline void totalhigh_pages_inc(void) -{ - atomic_long_inc(&_totalhigh_pages); -} - -static inline void totalhigh_pages_add(long count) -{ - atomic_long_add(count, &_totalhigh_pages); -} - -void kmap_flush_unused(void); - -struct page *kmap_to_page(void *addr); - -#else /* CONFIG_HIGHMEM */ - -static inline unsigned int nr_free_highpages(void) { return 0; } - -static inline struct page *kmap_to_page(void *addr) -{ - return virt_to_page(addr); -} - -static inline unsigned long totalhigh_pages(void) { return 0UL; } - -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} - -static inline void kunmap_high(struct page *page) -{ -} - -static inline void kunmap(struct page *page) -{ -#ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(page_address(page)); -#endif -} - -static inline void *kmap_atomic(struct page *page) -{ - preempt_disable(); - pagefault_disable(); - return page_address(page); -} - -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - return kmap_atomic(page); -} - -static inline void *kmap_atomic_pfn(unsigned long pfn) -{ - return kmap_atomic(pfn_to_page(pfn)); -} - -static inline void __kunmap_atomic(void *addr) -{ - /* - * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() - * handles re-enabling faults and preemption - */ -#ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); -#endif -} - -#define kmap_flush_unused() do {} while(0) - - -#endif /* CONFIG_HIGHMEM */ - -/* - * Prevent people trying to call kunmap_atomic() as if it were kunmap() - * kunmap_atomic() should get the return value of kmap_atomic, not the page. - */ -#define kunmap_atomic(__addr) \ -do { \ - BUILD_BUG_ON(__same_type((__addr), struct page *)); \ - __kunmap_atomic(__addr); \ - pagefault_enable(); \ - preempt_enable(); \ -} while (0) - /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) diff --git a/mm/highmem.c b/mm/highmem.c index 499dfafd36b7..54bd233846c9 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -104,7 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -unsigned int nr_free_highpages (void) +unsigned int __nr_free_highpages (void) { struct zone *zone; unsigned int pages = 0; @@ -141,7 +141,7 @@ pte_t * pkmap_page_table; do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif -struct page *kmap_to_page(void *vaddr) +struct page *__kmap_to_page(void *vaddr) { unsigned long addr = (unsigned long)vaddr; @@ -152,7 +152,7 @@ struct page *kmap_to_page(void *vaddr) return virt_to_page(addr); } -EXPORT_SYMBOL(kmap_to_page); +EXPORT_SYMBOL(__kmap_to_page); static void flush_all_zero_pkmaps(void) { @@ -194,10 +194,7 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/** - * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings - */ -void kmap_flush_unused(void) +void __kmap_flush_unused(void) { lock_kmap(); flush_all_zero_pkmaps(); From aec8da04e4d71afdd4ab3025ea34a6517435f363 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Nov 2020 15:34:32 +0100 Subject: [PATCH 59/77] x86/ioapic: Correct the PCI/ISA trigger type selection PCI's default trigger type is level and ISA's is edge. The recent refactoring made it the other way round, which went unnoticed as it seems only to cause havoc on some AMD systems. Make the comment and code do the right thing again. Fixes: a27dca645d2c ("x86/io_apic: Cleanup trigger/polarity helpers") Reported-by: Tom Lendacky Reported-by: Borislav Petkov Reported-by: Qian Cai Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Cc: David Woodhouse Link: https://lore.kernel.org/r/87d00lgu13.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/apic/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0602c9533d17..089e755eadf6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -809,9 +809,9 @@ static bool irq_is_level(int idx) case MP_IRQTRIG_DEFAULT: /* * Conforms to spec, ie. bus-type dependent trigger - * mode. PCI defaults to egde, ISA to level. + * mode. PCI defaults to level, ISA to edge. */ - level = test_bit(bus, mp_bus_not_pci); + level = !test_bit(bus, mp_bus_not_pci); /* Take EISA into account */ return eisa_irq_is_level(idx, bus, level); case MP_IRQTRIG_EDGE: From 5f0c71278d6848b4809f83af90f28196e1505ab1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Oct 2020 11:09:50 +0100 Subject: [PATCH 60/77] x86/fpu: Simplify fpregs_[un]lock() There is no point in disabling preemption and then disabling bottom halfs. Just disabling bottom halfs is sufficient as it implicitly disables preemption on !RT kernels. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201027101349.455380473@linutronix.de --- arch/x86/include/asm/fpu/api.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index dcd9503b1098..2c5bef78a484 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -29,17 +29,18 @@ extern void fpregs_mark_activate(void); * A context switch will (and softirq might) save CPU's FPU registers to * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. + * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. */ static inline void fpregs_lock(void) { - preempt_disable(); local_bh_disable(); } static inline void fpregs_unlock(void) { local_bh_enable(); - preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU From cba08c5dc6dc1a906a0b5ddac9a9ac6c9a64f2e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 27 Oct 2020 11:09:51 +0100 Subject: [PATCH 61/77] x86/fpu: Make kernel FPU protection RT friendly Non RT kernels need to protect FPU against preemption and bottom half processing. This is achieved by disabling bottom halfs via local_bh_disable() which implictly disables preemption. On RT kernels this protection mechanism is not sufficient because local_bh_disable() does not disable preemption. It serializes bottom half related processing via a CPU local lock. As bottom halfs are running always in thread context on RT kernels disabling preemption is the proper choice as it implicitly prevents bottom half processing. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201027101349.588965083@linutronix.de --- arch/x86/include/asm/fpu/api.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 2c5bef78a484..a5aba4ab0224 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -32,15 +32,29 @@ extern void fpregs_mark_activate(void); * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + * + * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } static inline void fpregs_unlock(void) { - local_bh_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU From 2fb6acf3edfeb904505f9ba3fd01166866062591 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Nov 2020 14:43:21 +0000 Subject: [PATCH 62/77] iommu/amd: Fix union of bitfields in intcapxt support All the bitfields in here are overlaid on top of each other since they're a union. Change the second u64 to be in a struct so it does the intended thing. Fixes: b5c3786ee370 ("iommu/amd: Use msi_msg shadow structs") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201111144322.1659970-2-dwmw2@infradead.org --- drivers/iommu/amd/init.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 263670d36fed..c2769f2b2abd 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1967,13 +1967,15 @@ static int iommu_setup_msi(struct amd_iommu *iommu) union intcapxt { u64 capxt; - u64 reserved_0 : 2, - dest_mode_logical : 1, - reserved_1 : 5, - destid_0_23 : 24, - vector : 8, - reserved_2 : 16, - destid_24_31 : 8; + struct { + u64 reserved_0 : 2, + dest_mode_logical : 1, + reserved_1 : 5, + destid_0_23 : 24, + vector : 8, + reserved_2 : 16, + destid_24_31 : 8; + }; } __attribute__ ((packed)); /* From 2df985f5e44c43f5d29d8cc3aaa8e8ac697e9de6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Nov 2020 14:43:20 +0000 Subject: [PATCH 63/77] iommu/amd: Don't register interrupt remapping irqdomain when IR is disabled Registering the remapping irq domain unconditionally is potentially allowing I/O-APIC and MSI interrupts to be parented in the IOMMU IR domain even when IR is disabled. Don't do that. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201111144322.1659970-1-dwmw2@infradead.org --- drivers/iommu/amd/init.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c2769f2b2abd..a94b96f1e13a 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1601,9 +1601,11 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (ret) return ret; - ret = amd_iommu_create_irq_domain(iommu); - if (ret) - return ret; + if (amd_iommu_irq_remap) { + ret = amd_iommu_create_irq_domain(iommu); + if (ret) + return ret; + } /* * Make sure IOMMU is not considered to translate itself. The IVRS From 2a656cad337e0e1ca582f58847d7b0c7eeba4dc8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 Nov 2020 11:59:32 +0100 Subject: [PATCH 64/77] mm/highmem: Take kmap_high_get() properly into account kunmap_local() warns when the virtual address to unmap is below PAGE_OFFSET. This is correct except for the case that the mapping was obtained via kmap_high_get() because the PKMAP addresses are right below PAGE_OFFSET. Cure it by skipping the WARN_ON() when the unmap was handled by kunmap_high(). Fixes: 298fa1ad5571 ("highmem: Provide generic variant of kmap_atomic*") Reported-by: vtolkm@googlemail.com Reported-by: Marek Szyprowski Signed-off-by: Thomas Gleixner Tested-by: Marek Szyprowski Tested-by: Sebastian Andrzej Siewior Cc: Andrew Morton Link: https://lore.kernel.org/r/87y2j6n8mj.fsf@nanos.tec.linutronix.de --- mm/highmem.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/highmem.c b/mm/highmem.c index 54bd233846c9..78c481a30c9a 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -426,12 +426,15 @@ static inline void *arch_kmap_local_high_get(struct page *page) #endif /* Unmap a local mapping which was obtained by kmap_high_get() */ -static inline void kmap_high_unmap_local(unsigned long vaddr) +static inline bool kmap_high_unmap_local(unsigned long vaddr) { #ifdef ARCH_NEEDS_KMAP_HIGH_GET - if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + return true; + } #endif + return false; } static inline int kmap_local_calc_idx(int idx) @@ -491,10 +494,14 @@ void kunmap_local_indexed(void *vaddr) if (addr < __fix_to_virt(FIX_KMAP_END) || addr > __fix_to_virt(FIX_KMAP_BEGIN)) { - WARN_ON_ONCE(addr < PAGE_OFFSET); - - /* Handle mappings which were obtained by kmap_high_get() */ - kmap_high_unmap_local(addr); + /* + * Handle mappings which were obtained by kmap_high_get() + * first as the virtual address of such mappings is below + * PAGE_OFFSET. Warn for all other addresses which are in + * the user space part of the virtual address space. + */ + if (!kmap_high_unmap_local(addr)) + WARN_ON_ONCE(addr < PAGE_OFFSET); return; } From 1eb0616c2df5b78c301eaa7bd2ee859f43915001 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2020 11:32:53 -0800 Subject: [PATCH 65/77] xtensa/mm/highmem: Make generic kmap_atomic() work correctly The conversion to the generic kmap_atomic() implementation missed the fact that xtensa's fixmap works bottom up while all other implementations work top down. There is no real reason why xtensa needs to work that way. Cure it by: - Using the generic fix_to_virt()/virt_to_fix() functions which work top down - Adjusting the mapping defines - Using the generic index calculation for the non cache aliasing case - Making the cache colour offset reverse so the effective index is correct While at it, remove the outdated and misleading comment above the fixmap enum which originates from the initial copy&pasta of this code from i386. [ Max: Fixed the off by one in the index calculation ] Fixes: 629ed3f7dad2 ("xtensa/mm/highmem: Switch to generic kmap atomic") Reported-by: Max Filippov Signed-off-by: Thomas Gleixner Signed-off-by: Max Filippov Signed-off-by: Thomas Gleixner Tested-by: Max Filippov Link: https://lore.kernel.org/r/20201116193253.23875-1-jcmvbkbc@gmail.com --- arch/xtensa/include/asm/fixmap.h | 55 ++++--------------------------- arch/xtensa/include/asm/highmem.h | 15 +++++---- arch/xtensa/mm/highmem.c | 18 ++++++---- arch/xtensa/mm/init.c | 4 +-- arch/xtensa/mm/mmu.c | 3 +- 5 files changed, 31 insertions(+), 64 deletions(-) diff --git a/arch/xtensa/include/asm/fixmap.h b/arch/xtensa/include/asm/fixmap.h index 92049b61c351..1c65dc1d3397 100644 --- a/arch/xtensa/include/asm/fixmap.h +++ b/arch/xtensa/include/asm/fixmap.h @@ -17,63 +17,22 @@ #include #include #include -#endif -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the start of the consistent memory region upwards. - * Also this lets us do fail-safe vmalloc(), we - * can guarantee that these special addresses and - * vmalloc()-ed addresses never overlap. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * higher than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - */ +/* The map slots for temporary mappings via kmap_atomic/local(). */ enum fixed_addresses { -#ifdef CONFIG_HIGHMEM - /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN, FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS * DCACHE_N_COLORS) - 1, -#endif __end_of_fixed_addresses }; -#define FIXADDR_TOP (XCHAL_KSEG_CACHED_VADDR - PAGE_SIZE) +#define FIXADDR_END (XCHAL_KSEG_CACHED_VADDR - PAGE_SIZE) #define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START ((FIXADDR_TOP - FIXADDR_SIZE) & PMD_MASK) +/* Enforce that FIXADDR_START is PMD aligned to handle cache aliasing */ +#define FIXADDR_START ((FIXADDR_END - FIXADDR_SIZE) & PMD_MASK) +#define FIXADDR_TOP (FIXADDR_START + FIXADDR_SIZE - PAGE_SIZE) -#define __fix_to_virt(x) (FIXADDR_START + ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) (((x) - FIXADDR_START) >> PAGE_SHIFT) - -#ifndef __ASSEMBLY__ -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without translation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static __always_inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* Check if this memory layout is broken because fixmap overlaps page - * table. - */ - BUILD_BUG_ON(FIXADDR_START < - TLBTEMP_BASE_1 + TLBTEMP_SIZE); - BUILD_BUG_ON(idx >= __end_of_fixed_addresses); - return __fix_to_virt(idx); -} - -static inline unsigned long virt_to_fix(const unsigned long vaddr) -{ - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); - return __virt_to_fix(vaddr); -} - -#endif +#include +#endif /* CONFIG_HIGHMEM */ #endif diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 0fc3b1cebc56..34b8b620e7f1 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -12,6 +12,7 @@ #ifndef _XTENSA_HIGHMEM_H #define _XTENSA_HIGHMEM_H +#ifdef CONFIG_HIGHMEM #include #include #include @@ -58,6 +59,13 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) { return pkmap_map_wait_arr + color; } + +enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn); +#define arch_kmap_local_map_idx kmap_local_map_idx + +enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr); +#define arch_kmap_local_unmap_idx kmap_local_unmap_idx + #endif extern pte_t *pkmap_page_table; @@ -67,15 +75,10 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn); -#define arch_kmap_local_map_idx kmap_local_map_idx - -enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr); -#define arch_kmap_local_unmap_idx kmap_local_unmap_idx - #define arch_kmap_local_post_unmap(vaddr) \ local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) void kmap_init(void); +#endif /* CONFIG_HIGHMEM */ #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 0735ca5e8f86..35c4f7d4a333 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -23,16 +23,16 @@ static void __init kmap_waitqueues_init(void) for (i = 0; i < ARRAY_SIZE(pkmap_map_wait_arr); ++i) init_waitqueue_head(pkmap_map_wait_arr + i); } -#else -static inline void kmap_waitqueues_init(void) -{ -} -#endif static inline enum fixed_addresses kmap_idx(int type, unsigned long color) { - return (type + KM_MAX_IDX * smp_processor_id()) * DCACHE_N_COLORS + - color; + int idx = (type + KM_MAX_IDX * smp_processor_id()) * DCACHE_N_COLORS; + + /* + * The fixmap operates top down, so the color offset needs to be + * reverse as well. + */ + return idx + DCACHE_N_COLORS - 1 - color; } enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn) @@ -45,6 +45,10 @@ enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr) return kmap_idx(type, DCACHE_ALIAS(addr)); } +#else +static inline void kmap_waitqueues_init(void) { } +#endif + void __init kmap_init(void) { /* Check if this memory layout is broken because PKMAP overlaps diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index c6fc83efee0c..0f85666bcbeb 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -147,8 +147,8 @@ void __init mem_init(void) #ifdef CONFIG_HIGHMEM PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE, (LAST_PKMAP*PAGE_SIZE) >> 10, - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, + FIXADDR_START, FIXADDR_END, + (FIXADDR_END - FIXADDR_START) >> 10, #endif PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn - min_low_pfn) * PAGE_SIZE, diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index fd2193df8a14..7e4d97dc8bd8 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -52,7 +52,8 @@ static void * __init init_pmd(unsigned long vaddr, unsigned long n_pages) static void __init fixedrange_init(void) { - init_pmd(__fix_to_virt(0), __end_of_fixed_addresses); + BUILD_BUG_ON(FIXADDR_START < TLBTEMP_BASE_1 + TLBTEMP_SIZE); + init_pmd(FIXADDR_START, __end_of_fixed_addresses); } #endif From d1adcfbb520c43c10fc22fcdccdd4204e014fb53 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 11 Nov 2020 12:09:01 +0000 Subject: [PATCH 66/77] iommu/amd: Fix IOMMU interrupt generation in X2APIC mode The AMD IOMMU has two modes for generating its own interrupts. The first is very much based on PCI MSI, and can be configured by Linux precisely that way. But like legacy unmapped PCI MSI it's limited to 8 bits of APIC ID. The second method does not use PCI MSI at all in hardawre, and instead configures the INTCAPXT registers in the IOMMU directly with the APIC ID and vector. In the latter case, the IOMMU driver would still use pci_enable_msi(), read back (through MMIO) the MSI message that Linux wrote to the PCI MSI table, then swizzle those bits into the appropriate register. Historically, this worked because__irq_compose_msi_msg() would silently generate an invalid MSI message with the high bits of the APIC ID in the high bits of the MSI address. That hack was intended only for the Intel IOMMU, and I recently enforced that, introducing a warning in __irq_msi_compose_msg() if it was invoked with an APIC ID above 255. Fix the AMD IOMMU not to depend on that hack any more, by having its own irqdomain and directly putting the bits from the irq_cfg into the right place in its ->activate() method. Fixes: 47bea873cf80 "x86/msi: Only use high bits of MSI address for DMAR unit") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Suravee Suthikulpanit Link: https://lore.kernel.org/r/05e3a5ba317f5ff48d2f8356f19e617f8b9d23a4.camel@infradead.org --- arch/x86/include/asm/hw_irq.h | 1 + drivers/iommu/amd/init.c | 207 +++++++++++++++++++++++----------- 2 files changed, 141 insertions(+), 67 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 458f5a676402..d465ece58151 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -39,6 +39,7 @@ enum irq_alloc_type { X86_IRQ_ALLOC_TYPE_PCI_MSI, X86_IRQ_ALLOC_TYPE_PCI_MSIX, X86_IRQ_ALLOC_TYPE_DMAR, + X86_IRQ_ALLOC_TYPE_AMDVI, X86_IRQ_ALLOC_TYPE_UV, }; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index a94b96f1e13a..07d1f9913ce4 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1557,14 +1558,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) break; } - /* - * Note: Since iommu_update_intcapxt() leverages - * the IOMMU MMIO access to MSI capability block registers - * for MSI address lo/hi/data, we need to check both - * EFR[XtSup] and EFR[MsiCapMmioSup] for x2APIC support. - */ - if ((h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) && - (h->efr_reg & BIT(IOMMU_EFR_MSICAPMMIOSUP_SHIFT))) + if (h->efr_reg & BIT(IOMMU_EFR_XTSUP_SHIFT)) amd_iommu_xt_mode = IRQ_REMAP_X2APIC_MODE; break; default: @@ -1981,27 +1975,32 @@ union intcapxt { } __attribute__ ((packed)); /* - * Setup the IntCapXT registers with interrupt routing information - * based on the PCI MSI capability block registers, accessed via - * MMIO MSI address low/hi and MSI data registers. + * There isn't really any need to mask/unmask at the irqchip level because + * the 64-bit INTCAPXT registers can be updated atomically without tearing + * when the affinity is being updated. */ -static void iommu_update_intcapxt(struct amd_iommu *iommu) +static void intcapxt_unmask_irq(struct irq_data *data) { - struct msi_msg msg; +} + +static void intcapxt_mask_irq(struct irq_data *data) +{ +} + +static struct irq_chip intcapxt_controller; + +static int intcapxt_irqdomain_activate(struct irq_domain *domain, + struct irq_data *irqd, bool reserve) +{ + struct amd_iommu *iommu = irqd->chip_data; + struct irq_cfg *cfg = irqd_cfg(irqd); union intcapxt xt; - u32 destid; - - msg.address_lo = readl(iommu->mmio_base + MMIO_MSI_ADDR_LO_OFFSET); - msg.address_hi = readl(iommu->mmio_base + MMIO_MSI_ADDR_HI_OFFSET); - msg.data = readl(iommu->mmio_base + MMIO_MSI_DATA_OFFSET); - - destid = x86_msi_msg_get_destid(&msg, x2apic_enabled()); xt.capxt = 0ULL; - xt.dest_mode_logical = msg.arch_data.dest_mode_logical; - xt.vector = msg.arch_data.vector; - xt.destid_0_23 = destid & GENMASK(23, 0); - xt.destid_24_31 = destid >> 24; + xt.dest_mode_logical = apic->dest_mode_logical; + xt.vector = cfg->vector; + xt.destid_0_23 = cfg->dest_apicid & GENMASK(23, 0); + xt.destid_24_31 = cfg->dest_apicid >> 24; /** * Current IOMMU implemtation uses the same IRQ for all @@ -2010,64 +2009,142 @@ static void iommu_update_intcapxt(struct amd_iommu *iommu) writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_EVT_OFFSET); writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_PPR_OFFSET); writeq(xt.capxt, iommu->mmio_base + MMIO_INTCAPXT_GALOG_OFFSET); + return 0; } -static void _irq_notifier_notify(struct irq_affinity_notify *notify, - const cpumask_t *mask) +static void intcapxt_irqdomain_deactivate(struct irq_domain *domain, + struct irq_data *irqd) { - struct amd_iommu *iommu; - - for_each_iommu(iommu) { - if (iommu->dev->irq == notify->irq) { - iommu_update_intcapxt(iommu); - break; - } - } + intcapxt_mask_irq(irqd); } -static void _irq_notifier_release(struct kref *ref) + +static int intcapxt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) { -} + struct irq_alloc_info *info = arg; + int i, ret; -static int iommu_init_intcapxt(struct amd_iommu *iommu) -{ - int ret; - struct irq_affinity_notify *notify = &iommu->intcapxt_notify; + if (!info || info->type != X86_IRQ_ALLOC_TYPE_AMDVI) + return -EINVAL; - /** - * IntCapXT requires XTSup=1 and MsiCapMmioSup=1, - * which can be inferred from amd_iommu_xt_mode. - */ - if (amd_iommu_xt_mode != IRQ_REMAP_X2APIC_MODE) - return 0; - - /** - * Also, we need to setup notifier to update the IntCapXT registers - * whenever the irq affinity is changed from user-space. - */ - notify->irq = iommu->dev->irq; - notify->notify = _irq_notifier_notify, - notify->release = _irq_notifier_release, - ret = irq_set_affinity_notifier(iommu->dev->irq, notify); - if (ret) { - pr_err("Failed to register irq affinity notifier (devid=%#x, irq %d)\n", - iommu->devid, iommu->dev->irq); + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) return ret; + + for (i = virq; i < virq + nr_irqs; i++) { + struct irq_data *irqd = irq_domain_get_irq_data(domain, i); + + irqd->chip = &intcapxt_controller; + irqd->chip_data = info->data; + __irq_set_handler(i, handle_edge_irq, 0, "edge"); } - iommu_update_intcapxt(iommu); - iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN); return ret; } -static int iommu_init_msi(struct amd_iommu *iommu) +static void intcapxt_irqdomain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static int intcapxt_set_affinity(struct irq_data *irqd, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irqd->parent_data; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) + return ret; + + return intcapxt_irqdomain_activate(irqd->domain, irqd, false); +} + +static struct irq_chip intcapxt_controller = { + .name = "IOMMU-MSI", + .irq_unmask = intcapxt_unmask_irq, + .irq_mask = intcapxt_mask_irq, + .irq_ack = irq_chip_ack_parent, + .irq_retrigger = irq_chip_retrigger_hierarchy, + .irq_set_affinity = intcapxt_set_affinity, + .flags = IRQCHIP_SKIP_SET_WAKE, +}; + +static const struct irq_domain_ops intcapxt_domain_ops = { + .alloc = intcapxt_irqdomain_alloc, + .free = intcapxt_irqdomain_free, + .activate = intcapxt_irqdomain_activate, + .deactivate = intcapxt_irqdomain_deactivate, +}; + + +static struct irq_domain *iommu_irqdomain; + +static struct irq_domain *iommu_get_irqdomain(void) +{ + struct fwnode_handle *fn; + + /* No need for locking here (yet) as the init is single-threaded */ + if (iommu_irqdomain) + return iommu_irqdomain; + + fn = irq_domain_alloc_named_fwnode("AMD-Vi-MSI"); + if (!fn) + return NULL; + + iommu_irqdomain = irq_domain_create_hierarchy(x86_vector_domain, 0, 0, + fn, &intcapxt_domain_ops, + NULL); + if (!iommu_irqdomain) + irq_domain_free_fwnode(fn); + + return iommu_irqdomain; +} + +static int iommu_setup_intcapxt(struct amd_iommu *iommu) +{ + struct irq_domain *domain; + struct irq_alloc_info info; + int irq, ret; + + domain = iommu_get_irqdomain(); + if (!domain) + return -ENXIO; + + init_irq_alloc_info(&info, NULL); + info.type = X86_IRQ_ALLOC_TYPE_AMDVI; + info.data = iommu; + + irq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info); + if (irq < 0) { + irq_domain_remove(domain); + return irq; + } + + ret = request_threaded_irq(irq, amd_iommu_int_handler, + amd_iommu_int_thread, 0, "AMD-Vi", iommu); + if (ret) { + irq_domain_free_irqs(irq, 1); + irq_domain_remove(domain); + return ret; + } + + iommu_feature_enable(iommu, CONTROL_INTCAPXT_EN); + return 0; +} + +static int iommu_init_irq(struct amd_iommu *iommu) { int ret; if (iommu->int_enabled) goto enable_faults; - if (iommu->dev->msi_cap) + if (amd_iommu_xt_mode == IRQ_REMAP_X2APIC_MODE) + ret = iommu_setup_intcapxt(iommu); + else if (iommu->dev->msi_cap) ret = iommu_setup_msi(iommu); else ret = -ENODEV; @@ -2076,10 +2153,6 @@ static int iommu_init_msi(struct amd_iommu *iommu) return ret; enable_faults: - ret = iommu_init_intcapxt(iommu); - if (ret) - return ret; - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); if (iommu->ppr_log != NULL) @@ -2702,7 +2775,7 @@ static int amd_iommu_enable_interrupts(void) int ret = 0; for_each_iommu(iommu) { - ret = iommu_init_msi(iommu); + ret = iommu_init_irq(iommu); if (ret) goto out; } From a0e169978303ee5873142599c8c9660b2d296243 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Nov 2020 20:45:03 +0100 Subject: [PATCH 67/77] microblaze/mm/highmem: Add dropped #ifdef back The conversion to generic kmap atomic broke microblaze by removing the build fail. Add it back. Fixes: 7ac1b26b0a72 ("microblaze/mm/highmem: Switch to generic kmap atomic") Reported-by: Randy Dunlap Signed-off-by: Thomas Gleixner Cc: Michal Simek --- arch/microblaze/mm/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 1f4b5b34e600..a444778e59de 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -49,6 +49,7 @@ unsigned long lowmem_size; EXPORT_SYMBOL(min_low_pfn); EXPORT_SYMBOL(max_low_pfn); +#ifdef CONFIG_HIGHMEM static void __init highmem_init(void) { pr_debug("%x\n", (u32)PKMAP_BASE); From 6e799cb69a70eedbb41561b750f7180c12cff280 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:39 +0100 Subject: [PATCH 68/77] mm/highmem: Provide and use CONFIG_DEBUG_KMAP_LOCAL CONFIG_KMAP_LOCAL can be enabled by x86/32bit even if CONFIG_HIGHMEM is not enabled for temporary MMIO space mappings. Provide it as a seperate config option which depends on CONFIG_KMAP_LOCAL and let CONFIG_DEBUG_HIGHMEM select it. This won't increase the debug coverage of this significantly but it paves the way to do so. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204006.869487226@linutronix.de --- include/asm-generic/kmap_size.h | 2 +- lib/Kconfig.debug | 8 ++++++++ mm/highmem.c | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/asm-generic/kmap_size.h b/include/asm-generic/kmap_size.h index 9d6c7786a645..6e36b2443ece 100644 --- a/include/asm-generic/kmap_size.h +++ b/include/asm-generic/kmap_size.h @@ -3,7 +3,7 @@ #define _ASM_GENERIC_KMAP_SIZE_H /* For debug this provides guard pages between the maps */ -#ifdef CONFIG_DEBUG_HIGHMEM +#ifdef CONFIG_DEBUG_KMAP_LOCAL # define KM_MAX_IDX 33 #else # define KM_MAX_IDX 16 diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c789b39ed527..f24fa158f9db 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -849,9 +849,17 @@ config DEBUG_PER_CPU_MAPS Say N if unsure. +config DEBUG_KMAP_LOCAL + bool "Debug kmap_local temporary mappings" + depends on DEBUG_KERNEL && KMAP_LOCAL + help + This option enables additional error checking for the kmap_local + infrastructure. Disable for production use. + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM + select DEBUG_KMAP_LOCAL help This option enables additional error checking for high memory systems. Disable for production systems. diff --git a/mm/highmem.c b/mm/highmem.c index 78c481a30c9a..fab128de97de 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -368,10 +368,10 @@ EXPORT_SYMBOL(kunmap_high); static DEFINE_PER_CPU(int, __kmap_local_idx); /* - * With DEBUG_HIGHMEM the stack depth is doubled and every second + * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second * slot is unused which acts as a guard page */ -#ifdef CONFIG_DEBUG_HIGHMEM +#ifdef CONFIG_DEBUG_KMAP_LOCAL # define KM_INCR 2 #else # define KM_INCR 1 From 0e91a0c6984c837a7c6760e3f28e8e1c532abf87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:40 +0100 Subject: [PATCH 69/77] mm/highmem: Provide CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP CONFIG_DEBUG_KMAP_LOCAL, which is selected by CONFIG_DEBUG_HIGHMEM is only providing guard pages, but does not provide a mechanism to enforce the usage of the kmap_local() infrastructure. Provide CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP which forces the temporary mapping even for lowmem pages. This needs to be a seperate config switch because this only works on architectures which do not have cache aliasing problems. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.028261233@linutronix.de --- lib/Kconfig.debug | 14 ++++++++++++++ mm/highmem.c | 12 +++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f24fa158f9db..e952f892f047 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -856,9 +856,23 @@ config DEBUG_KMAP_LOCAL This option enables additional error checking for the kmap_local infrastructure. Disable for production use. +config ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP + bool + +config DEBUG_KMAP_LOCAL_FORCE_MAP + bool "Enforce kmap_local temporary mappings" + depends on DEBUG_KERNEL && ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP + select KMAP_LOCAL + select DEBUG_KMAP_LOCAL + help + This option enforces temporary mappings through the kmap_local + mechanism for non-highmem pages and on non-highmem systems. + Disable this for production systems! + config DEBUG_HIGHMEM bool "Highmem debugging" depends on DEBUG_KERNEL && HIGHMEM + select DEBUG_KMAP_LOCAL_FORCE_MAP if ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP select DEBUG_KMAP_LOCAL help This option enables additional error checking for high memory diff --git a/mm/highmem.c b/mm/highmem.c index fab128de97de..39aaca1a1ece 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -474,7 +474,12 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot) { void *kmap; - if (!PageHighMem(page)) + /* + * To broaden the usage of the actual kmap_local() machinery always map + * pages when debugging is enabled and the architecture has no problems + * with alias mappings. + */ + if (!IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) && !PageHighMem(page)) return page_address(page); /* Try kmap_high_get() if architecture has it enabled */ @@ -494,6 +499,11 @@ void kunmap_local_indexed(void *vaddr) if (addr < __fix_to_virt(FIX_KMAP_END) || addr > __fix_to_virt(FIX_KMAP_BEGIN)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP)) { + /* This _should_ never happen! See above. */ + WARN_ON_ONCE(1); + return; + } /* * Handle mappings which were obtained by kmap_high_get() * first as the virtual address of such mappings is below From 14df32670291588036a498051a54cd8462d7f611 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:41 +0100 Subject: [PATCH 70/77] x86: Support kmap_local() forced debugging kmap_local() and related interfaces are NOOPs on 64bit and only create temporary fixmaps for highmem pages on 32bit. That means the test coverage for this code is pretty small. CONFIG_KMAP_LOCAL can be enabled independent from CONFIG_HIGHMEM, which allows to provide support for enforced kmap_local() debugging even on 64bit. For 32bit the support is unconditional, for 64bit it's only supported when CONFIG_NR_CPUS <= 4096 as supporting it for 8192 CPUs would require to set up yet another fixmap PGT. If CONFIG_KMAP_LOCAL_FORCE_DEBUG is enabled then kmap_local()/kmap_atomic() will use the temporary fixmap mapping path. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.169209557@linutronix.de --- arch/x86/Kconfig | 1 + arch/x86/include/asm/fixmap.h | 12 +++++++++--- arch/x86/include/asm/pgtable_64_types.h | 6 +++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 33c273cb3023..b5137cc5b7b4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -93,6 +93,7 @@ config X86 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8eba66a33e39..9f1a0a987e5e 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -14,13 +14,20 @@ #ifndef _ASM_X86_FIXMAP_H #define _ASM_X86_FIXMAP_H +#include + /* * Exposed to assembly code for setting up initial page tables. Cannot be * calculated in assembly code (fixmap entries are an enum), but is sanity * checked in the actual fixmap C code to make sure that the fixmap is * covered fully. */ -#define FIXMAP_PMD_NUM 2 +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +# define FIXMAP_PMD_NUM 2 +#else +# define KM_PMDS (KM_MAX_IDX * ((CONFIG_NR_CPUS + 511) / 512)) +# define FIXMAP_PMD_NUM (KM_PMDS + 2) +#endif /* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */ #define FIXMAP_PMD_TOP 507 @@ -31,7 +38,6 @@ #include #ifdef CONFIG_X86_32 #include -#include #else #include #endif @@ -92,7 +98,7 @@ enum fixed_addresses { FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif -#ifdef CONFIG_X86_32 +#ifdef CONFIG_KMAP_LOCAL FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #ifdef CONFIG_PCI_MMCONFIG diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..91ac10654570 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -143,7 +143,11 @@ extern unsigned int ptrs_per_p4d; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END _AC(0xffffffffff000000, UL) +#ifndef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP +# define MODULES_END _AC(0xffffffffff000000, UL) +#else +# define MODULES_END _AC(0xfffffffffe000000, UL) +#endif #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) From 5fbda3ecd14a5343644979c98d6eb65b7e7de9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:43 +0100 Subject: [PATCH 71/77] sched: highmem: Store local kmaps in task struct Instead of storing the map per CPU provide and use per task storage. That prepares for local kmaps which are preemptible. The context switch code is preparatory and not yet in use because kmap_atomic() runs with preemption disabled. Will be made usable in the next step. The context switch logic is safe even when an interrupt happens after clearing or before restoring the kmaps. The kmap index in task struct is not modified so any nesting kmap in an interrupt will use unused indices and on return the counter is the same as before. Also add an assert into the return to user space code. Going back to user space with an active kmap local is a nono. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.372935758@linutronix.de --- include/linux/highmem-internal.h | 10 ++++ include/linux/sched.h | 9 +++ kernel/entry/common.c | 2 + kernel/fork.c | 1 + kernel/sched/core.c | 25 ++++++++ mm/highmem.c | 99 ++++++++++++++++++++++++++++---- 6 files changed, 136 insertions(+), 10 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 6ceed907b14e..c5a22177db85 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -9,6 +9,16 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(void *vaddr); +void kmap_local_fork(struct task_struct *tsk); +void __kmap_local_sched_out(void); +void __kmap_local_sched_in(void); +static inline void kmap_assert_nomap(void) +{ + DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); +} +#else +static inline void kmap_local_fork(struct task_struct *tsk) { } +static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM diff --git a/include/linux/sched.h b/include/linux/sched.h index a33f35f68060..8946911cee9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -629,6 +630,13 @@ struct wake_q_node { struct wake_q_node *next; }; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; +#endif +}; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -1294,6 +1302,7 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif + struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..4ae1fe0898e9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Ensure that the address limit is intact and no locks are held */ addr_limit_user_check(); + kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..17dcd1817799 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -930,6 +930,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) account_kernel_stack(tsk, 1); kcov_task_init(tsk); + kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c962922784d1..953abdbe1472 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4094,6 +4094,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -4116,6 +4132,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -4182,6 +4199,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* diff --git a/mm/highmem.c b/mm/highmem.c index 39aaca1a1ece..d1ef06aa6de6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high); #include -static DEFINE_PER_CPU(int, __kmap_local_idx); - /* * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second * slot is unused which acts as a guard page @@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx); static inline int kmap_local_idx_push(void) { - int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_MAX_IDX); - return idx; + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; } static inline int kmap_local_idx(void) { - return __this_cpu_read(__kmap_local_idx) - 1; + return current->kmap_ctrl.idx - 1; } static inline void kmap_local_idx_pop(void) { - int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); - - BUG_ON(idx < 0); + current->kmap_ctrl.idx -= KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); } #ifndef arch_kmap_local_post_map @@ -464,6 +460,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) pteval = pfn_pte(pfn, prot); set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); return (void *)vaddr; @@ -522,10 +519,92 @@ void kunmap_local_indexed(void *vaddr) arch_kmap_local_pre_unmap(addr); pte_clear(&init_mm, addr, kmap_pte - idx); arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. + */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + #endif #if defined(HASHED_PAGE_VIRTUAL) From f3ba3c710ac5a30cd058615a9eb62d2ad95bb782 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:44 +0100 Subject: [PATCH 72/77] mm/highmem: Provide kmap_local* Now that the kmap atomic index is stored in task struct provide a preemptible variant. On context switch the maps of an outgoing task are removed and the map of the incoming task are restored. That's obviously slow, but highmem is slow anyway. The kmap_local.*() functions can be invoked from both preemptible and atomic context. kmap local sections disable migration to keep the resulting virtual mapping address correct, but disable neither pagefaults nor preemption. A wholesale conversion of kmap_atomic to be fully preemptible is not possible because some of the usage sites might rely on the preemption disable for serialization or on the implicit pagefault disable. Needs to be done on a case by case basis. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.468533059@linutronix.de --- include/linux/highmem-internal.h | 48 ++++++++++++++++++++++++++++++++ include/linux/highmem.h | 43 +++++++++++++++++----------- mm/highmem.c | 6 ++++ 3 files changed, 81 insertions(+), 16 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index c5a22177db85..1bbe96dc8be6 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -68,6 +68,26 @@ static inline void kmap_flush_unused(void) __kmap_flush_unused(); } +static inline void *kmap_local_page(struct page *page) +{ + return __kmap_local_page_prot(page, kmap_prot); +} + +static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_local_pfn(unsigned long pfn) +{ + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_local(void *vaddr) +{ + kunmap_local_indexed(vaddr); +} + static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); @@ -140,6 +160,28 @@ static inline void kunmap(struct page *page) #endif } +static inline void *kmap_local_page(struct page *page) +{ + return page_address(page); +} + +static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + return kmap_local_page(page); +} + +static inline void *kmap_local_pfn(unsigned long pfn) +{ + return kmap_local_page(pfn_to_page(pfn)); +} + +static inline void __kunmap_local(void *addr) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif +} + static inline void *kmap_atomic(struct page *page) { preempt_disable(); @@ -181,4 +223,10 @@ do { \ __kunmap_atomic(__addr); \ } while (0) +#define kunmap_local(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_local(__addr); \ +} while (0) + #endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7d098bd621f6..f597830f26b4 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -60,24 +60,22 @@ static inline struct page *kmap_to_page(void *addr); static inline void kmap_flush_unused(void); /** - * kmap_atomic - Atomically map a page for temporary usage + * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * - * Side effect: On return pagefaults and preemption are disabled. - * * Can be invoked from any context. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * - * addr1 = kmap_atomic(page1); - * addr2 = kmap_atomic(page2); + * addr1 = kmap_local_page(page1); + * addr2 = kmap_local_page(page2); * ... - * kunmap_atomic(addr2); - * kunmap_atomic(addr1); + * kunmap_local(addr2); + * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * @@ -88,10 +86,26 @@ static inline void kmap_flush_unused(void); * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * - * While it is significantly faster than kmap() it comes with restrictions - * about the pointer validity and the side effects of disabling page faults - * and preemption. Use it only when absolutely necessary, e.g. from non - * preemptible contexts. + * While it is significantly faster than kmap() for the higmem case it + * comes with restrictions about the pointer validity. Only use when really + * necessary. + * + * On HIGHMEM enabled systems mapping a highmem page has the side effect of + * disabling migration in order to keep the virtual address stable across + * preemption. No caller of kmap_local_page() can rely on this side effect. + */ +static inline void *kmap_local_page(struct page *page); + +/** + * kmap_atomic - Atomically map a page for temporary usage - Deprecated! + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Effectively a wrapper around kmap_local_page() which disables pagefaults + * and preemption. + * + * Do not use in new code. Use kmap_local_page() instead. */ static inline void *kmap_atomic(struct page *page); @@ -101,12 +115,9 @@ static inline void *kmap_atomic(struct page *page); * * Counterpart to kmap_atomic(). * - * Undoes the side effects of kmap_atomic(), i.e. reenabling pagefaults and + * Effectively a wrapper around kunmap_local() which additionally undoes + * the side effects of kmap_atomic(), i.e. reenabling pagefaults and * preemption. - * - * Other than that a NOOP for CONFIG_HIGHMEM=n and for mappings of pages - * in the low memory area. For real highmen pages the mapping which was - * established with kmap_atomic() is destroyed. */ /* Highmem related interfaces for management code */ diff --git a/mm/highmem.c b/mm/highmem.c index d1ef06aa6de6..83f9660f168f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -453,6 +453,11 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) unsigned long vaddr; int idx; + /* + * Disable migration so resulting virtual address is stable + * accross preemption. + */ + migrate_disable(); preempt_disable(); idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -522,6 +527,7 @@ void kunmap_local_indexed(void *vaddr) current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); + migrate_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); From e66f6e095486f0210fcf3c5eb3ecf13fa348be4c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:45 +0100 Subject: [PATCH 73/77] io-mapping: Provide iomap_local variant Similar to kmap local provide a iomap local variant which only disables migration, but neither disables pagefaults nor preemption. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.561220818@linutronix.de --- Documentation/driver-api/io-mapping.rst | 74 +++++++++++++++---------- include/linux/io-mapping.h | 30 +++++++++- 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst index e33b88268554..a0cfb15988df 100644 --- a/Documentation/driver-api/io-mapping.rst +++ b/Documentation/driver-api/io-mapping.rst @@ -20,55 +20,71 @@ A mapping object is created during driver initialization using:: mappable, while 'size' indicates how large a mapping region to enable. Both are in bytes. -This _wc variant provides a mapping which may only be used -with the io_mapping_map_atomic_wc or io_mapping_map_wc. +This _wc variant provides a mapping which may only be used with +io_mapping_map_atomic_wc(), io_mapping_map_local_wc() or +io_mapping_map_wc(). -With this mapping object, individual pages can be mapped either atomically -or not, depending on the necessary scheduling environment. Of course, atomic -maps are more efficient:: +With this mapping object, individual pages can be mapped either temporarily +or long term, depending on the requirements. Of course, temporary maps are +more efficient. They come in two flavours:: + + void *io_mapping_map_local_wc(struct io_mapping *mapping, + unsigned long offset) void *io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) -'offset' is the offset within the defined mapping region. -Accessing addresses beyond the region specified in the -creation function yields undefined results. Using an offset -which is not page aligned yields an undefined result. The -return value points to a single page in CPU address space. +'offset' is the offset within the defined mapping region. Accessing +addresses beyond the region specified in the creation function yields +undefined results. Using an offset which is not page aligned yields an +undefined result. The return value points to a single page in CPU address +space. -This _wc variant returns a write-combining map to the -page and may only be used with mappings created by -io_mapping_create_wc +This _wc variant returns a write-combining map to the page and may only be +used with mappings created by io_mapping_create_wc() -Note that the task may not sleep while holding this page -mapped. +Temporary mappings are only valid in the context of the caller. The mapping +is not guaranteed to be globaly visible. -:: +io_mapping_map_local_wc() has a side effect on X86 32bit as it disables +migration to make the mapping code work. No caller can rely on this side +effect. +io_mapping_map_atomic_wc() has the side effect of disabling preemption and +pagefaults. Don't use in new code. Use io_mapping_map_local_wc() instead. + +Nested mappings need to be undone in reverse order because the mapping +code uses a stack for keeping track of them:: + + addr1 = io_mapping_map_local_wc(map1, offset1); + addr2 = io_mapping_map_local_wc(map2, offset2); + ... + io_mapping_unmap_local(addr2); + io_mapping_unmap_local(addr1); + +The mappings are released with:: + + void io_mapping_unmap_local(void *vaddr) void io_mapping_unmap_atomic(void *vaddr) -'vaddr' must be the value returned by the last -io_mapping_map_atomic_wc call. This unmaps the specified -page and allows the task to sleep once again. +'vaddr' must be the value returned by the last io_mapping_map_local_wc() or +io_mapping_map_atomic_wc() call. This unmaps the specified mapping and +undoes the side effects of the mapping functions. -If you need to sleep while holding the lock, you can use the non-atomic -variant, although they may be significantly slower. - -:: +If you need to sleep while holding a mapping, you can use the regular +variant, although this may be significantly slower:: void *io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) -This works like io_mapping_map_atomic_wc except it allows -the task to sleep while holding the page mapped. +This works like io_mapping_map_atomic/local_wc() except it has no side +effects and the pointer is globaly visible. - -:: +The mappings are released with:: void io_mapping_unmap(void *vaddr) -This works like io_mapping_unmap_atomic, except it is used -for pages mapped with io_mapping_map_wc. +Use for pages mapped with io_mapping_map_wc(). At driver close time, the io_mapping object must be freed:: diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 60e7c83e4904..c093e81310a9 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -82,6 +82,21 @@ io_mapping_unmap_atomic(void __iomem *vaddr) preempt_enable(); } +static inline void __iomem * +io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) +{ + resource_size_t phys_addr; + + BUG_ON(offset >= mapping->size); + phys_addr = mapping->base + offset; + return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); +} + +static inline void io_mapping_unmap_local(void __iomem *vaddr) +{ + kunmap_local_indexed((void __force *)vaddr); +} + static inline void __iomem * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset, @@ -101,7 +116,7 @@ io_mapping_unmap(void __iomem *vaddr) iounmap(vaddr); } -#else +#else /* HAVE_ATOMIC_IOMAP */ #include @@ -166,7 +181,18 @@ io_mapping_unmap_atomic(void __iomem *vaddr) preempt_enable(); } -#endif /* HAVE_ATOMIC_IOMAP */ +static inline void __iomem * +io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) +{ + return io_mapping_map_wc(mapping, offset, PAGE_SIZE); +} + +static inline void io_mapping_unmap_local(void __iomem *vaddr) +{ + io_mapping_unmap(vaddr); +} + +#endif /* !HAVE_ATOMIC_IOMAP */ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, From 7e015a279853e747f5d4f957855ec5310848c501 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:46 +0100 Subject: [PATCH 74/77] x86/crashdump/32: Simplify copy_oldmem_page() Replace kmap_atomic_pfn() with kmap_local_pfn() which is preemptible and can take page faults. Remove the indirection of the dump page and the related cruft which is not longer required. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.670851839@linutronix.de --- arch/x86/kernel/crash_dump_32.c | 48 +++++++-------------------------- 1 file changed, 10 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 33ee47670b99..5fcac46aaf6b 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,8 +13,6 @@ #include -static void *kdump_buf_page; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE @@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) * @userbuf: if set, @buf is in user address space, use copy_to_user(), * otherwise @buf is in kernel address space, use memcpy(). * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. + * Copy a page from "oldmem". For this page, there might be no pte mapped + * in the current kernel. */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) { void *vaddr; @@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn); + vaddr = kmap_local_pfn(pfn); if (!userbuf) { - memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr); + memcpy(buf, vaddr + offset, csize); } else { - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Kdump buffer page not" - " allocated\n"); - kunmap_atomic(vaddr); - return -EFAULT; - } - copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr); - if (copy_to_user(buf, (kdump_buf_page + offset), csize)) - return -EFAULT; + if (copy_to_user(buf, vaddr + offset, csize)) + csize = -EFAULT; } + kunmap_local(vaddr); + return csize; } - -static int __init kdump_buf_page_init(void) -{ - int ret = 0; - - kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" - " page\n"); - ret = -ENOMEM; - } - - return ret; -} -arch_initcall(kdump_buf_page_init); From 26ab12bb9d96133b7880141d68b5e01a8783de9d Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 1 Dec 2020 16:45:10 -0800 Subject: [PATCH 75/77] iommu/hyper-v: Remove I/O-APIC ID check from hyperv_irq_remapping_select() commit a491bb19f728 ("iommu/hyper-v: Implement select() method on remapping irqdomain") restricted the irq_domain_ops::select() callback to match on I/O-APIC index 0, which was correct until the parameter was changed to carry the I/O APIC ID in commit f36a74b9345a. If the ID is not 0 then the match fails. Therefore I/O-APIC init fails to retrieve the parent irqdomain for the I/O-APIC resulting in a boot panic: kernel BUG at arch/x86/kernel/apic/io_apic.c:2408! Fix it by matching the I/O-APIC independent of the ID as there is only one I/O APIC emulated by Hyper-V. [ tglx: Amended changelog ] Fixes: f36a74b9345a ("x86/ioapic: Use I/O-APIC ID for finding irqdomain, not index") Signed-off-by: Dexuan Cui Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Link: https://lore.kernel.org/r/20201202004510.1818-1-decui@microsoft.com --- drivers/iommu/hyperv-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/hyperv-iommu.c b/drivers/iommu/hyperv-iommu.c index 9438daa24fdb..1d21a0b5f724 100644 --- a/drivers/iommu/hyperv-iommu.c +++ b/drivers/iommu/hyperv-iommu.c @@ -105,8 +105,8 @@ static int hyperv_irq_remapping_select(struct irq_domain *d, struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token) { - /* Claim only the first (and only) I/OAPIC */ - return x86_fwspec_is_ioapic(fwspec) && fwspec->param[0] == 0; + /* Claim the only I/O APIC emulated by Hyper-V */ + return x86_fwspec_is_ioapic(fwspec); } static const struct irq_domain_ops hyperv_ir_domain_ops = { From 68061c02bb295da4955f0d309b9459f0a7ba83dd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 4 Dec 2020 17:59:22 +0100 Subject: [PATCH 76/77] ARM: highmem: Fix cache_is_vivt() reference The reference to cache_is_vivt() was moved into a header file, which now causes a build failure in rare randconfig builds: arch/arm/include/asm/highmem.h:52:43: error: implicit declaration of function 'cache_is_vivt' [-Werror,-Wimplicit-function-declaration] Add an explicit include to make it build reliably. Fixes: 2a15ba82fa6c ("ARM: highmem: Switch to generic kmap atomic") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Andrew Morton Cc: Ira Weiny Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20201204165930.3877571-1-arnd@kernel.org --- arch/arm/include/asm/highmem.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index a41de523d053..b4b66220952d 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -2,6 +2,7 @@ #ifndef _ASM_HIGHMEM_H #define _ASM_HIGHMEM_H +#include #include #define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE) From 058df195c23403f91acc028e39ca2ad599d0af52 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 21:15:04 +0100 Subject: [PATCH 77/77] x86/ioapic: Cleanup the timer_works() irqflags mess Mark tripped over the creative irqflags handling in the IO-APIC timer delivery check which ends up doing: local_irq_save(flags); local_irq_enable(); local_irq_restore(flags); which triggered a new consistency check he's working on required for replacing the POPF based restore with a conditional STI. That code is a historical mess and none of this is needed. Make it straightforward use local_irq_disable()/enable() as that's all what is required. It is invoked from interrupt enabled code nowadays. Reported-by: Mark Rutland Signed-off-by: Thomas Gleixner Tested-by: Mark Rutland Link: https://lore.kernel.org/r/87k0tpju47.fsf@nanos.tec.linutronix.de --- arch/x86/kernel/apic/io_apic.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 089e755eadf6..e4ab4804b20d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1620,21 +1620,16 @@ static void __init delay_without_tsc(void) static int __init timer_irq_works(void) { unsigned long t1 = jiffies; - unsigned long flags; if (no_timer_check) return 1; - local_save_flags(flags); local_irq_enable(); - if (boot_cpu_has(X86_FEATURE_TSC)) delay_with_tsc(); else delay_without_tsc(); - local_irq_restore(flags); - /* * Expect a few ticks at least, to be sure some possible * glue logic does not lock up after one or two first @@ -1643,10 +1638,10 @@ static int __init timer_irq_works(void) * least one tick may be lost due to delays. */ - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; + local_irq_disable(); + + /* Did jiffies advance? */ + return time_after(jiffies, t1 + 4); } /* @@ -2163,13 +2158,12 @@ static inline void __init check_timer(void) struct irq_cfg *cfg = irqd_cfg(irq_data); int node = cpu_to_node(0); int apic1, pin1, apic2, pin2; - unsigned long flags; int no_pin1 = 0; if (!global_clock_event) return; - local_irq_save(flags); + local_irq_disable(); /* * get/set the timer IRQ vector: @@ -2237,7 +2231,6 @@ static inline void __init check_timer(void) goto out; } panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC"); - local_irq_disable(); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2261,7 +2254,6 @@ static inline void __init check_timer(void) /* * Cleanup, just in case ... */ - local_irq_disable(); legacy_pic->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); @@ -2278,7 +2270,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); legacy_pic->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); @@ -2297,7 +2288,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); if (apic_is_x2apic_enabled()) apic_printk(APIC_QUIET, KERN_INFO @@ -2306,7 +2296,7 @@ static inline void __init check_timer(void) panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: - local_irq_restore(flags); + local_irq_enable(); } /*