diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 92521d0eecfb..d33e2019714f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -684,6 +684,7 @@ enum zone_watermarks { /* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { + spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fa4be26077fe..f5144b489e26 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -131,6 +131,20 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = { .lock = INIT_LOCAL_LOCK(lock), }; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else + +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -3212,15 +3226,22 @@ static struct list_head *get_populated_pcp_list(struct zone *zone, */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain, batch; - local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) + if (to_drain > 0) { + unsigned long flags; + + /* + * free_pcppages_bulk expects IRQs disabled for zone->lock + * so even though pcp->lock is not intended to be IRQ-safe, + * it's needed in this context. + */ + spin_lock_irqsave(&pcp->lock, flags); free_pcppages_bulk(zone, to_drain, pcp); - local_unlock_irqrestore(&pagesets.lock, flags); + spin_unlock_irqrestore(&pcp->lock, flags); + } } #endif @@ -3233,16 +3254,17 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - unsigned long flags; struct per_cpu_pages *pcp; - local_lock_irqsave(&pagesets.lock, flags); - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp); + if (pcp->count) { + unsigned long flags; - local_unlock_irqrestore(&pagesets.lock, flags); + /* See drain_zone_pages on why this is disabling IRQs */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, pcp->count, pcp); + spin_unlock_irqrestore(&pcp->lock, flags); + } } /* @@ -3504,16 +3526,14 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone) return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, unsigned long pfn, +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, unsigned long pfn, int migratetype, unsigned int order) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; int high; int pindex; __count_vm_event(PGFREE); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3531,6 +3551,9 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn, void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; unsigned long pfn = page_to_pfn(page); int migratetype; bool pcp_skip_cma_pages = false; @@ -3560,7 +3583,16 @@ void free_unref_page(struct page *page, unsigned int order) } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, pfn, migratetype, order); + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = this_cpu_ptr(zone->per_cpu_pageset); + if (spin_trylock(&pcp->lock)) { + free_unref_page_commit(zone, pcp, page, pfn, migratetype, order); + spin_unlock(&pcp->lock); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3570,6 +3602,8 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; unsigned long flags, pfn; int batch_count = 0; int migratetype; @@ -3602,6 +3636,17 @@ void free_unref_page_list(struct list_head *list) local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { + struct zone *zone = page_zone(page); + + /* Different zone, different pcp lock. */ + if (zone != locked_zone) { + if (pcp) + spin_unlock(&pcp->lock); + locked_zone = zone; + pcp = this_cpu_ptr(zone->per_cpu_pageset); + spin_lock(&pcp->lock); + } + pfn = page_private(page); set_page_private(page, 0); @@ -3614,18 +3659,24 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn, migratetype, 0); + free_unref_page_commit(zone, pcp, page, pfn, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { + spin_unlock(&pcp->lock); local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; local_lock_irqsave(&pagesets.lock, flags); + pcp = this_cpu_ptr(locked_zone->per_cpu_pageset); + spin_lock(&pcp->lock); } } + + if (pcp) + spin_unlock(&pcp->lock); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3836,17 +3887,31 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; local_lock_irqsave(&pagesets.lock, flags); + /* + * spin_trylock may fail due to a parallel drain. In the future, the + * trylock will also protect against IRQ reentrancy. + */ + pcp = this_cpu_ptr(zone->per_cpu_pageset); + pcp_trylock_prepare(UP_flags); + if (!spin_trylock(&pcp->lock)) { + pcp_trylock_finish(UP_flags); + local_unlock_irqrestore(&pagesets.lock, flags); + return NULL; + } + /* * On allocation, reduce the number of pages that are batch freed. * See nr_pcp_free() where free_factor is increased for subsequent * frees. */ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, gfp_flags); + spin_unlock(&pcp->lock); + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3869,7 +3934,8 @@ struct page *rmqueue(struct zone *preferred_zone, if (likely(pcp_allowed_order(order))) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype, alloc_flags); - goto out; + if (likely(page)) + goto out; } /* @@ -5408,6 +5474,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, { struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5487,10 +5554,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; - /* Attempt the batch allocation */ + /* Is a parallel drain in progress? */ local_lock_irqsave(&pagesets.lock, flags); + pcp_trylock_prepare(UP_flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); + if (!spin_trylock(&pcp->lock)) + goto failed_irq; + /* Attempt the batch allocation */ while (nr_populated < nr_pages) { /* Skip existing pages */ @@ -5503,8 +5574,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp, alloc_gfp); if (unlikely(!page)) { /* Try and allocate at least one page */ - if (!nr_account) + if (!nr_account) { + spin_unlock(&pcp->lock); goto failed_irq; + } break; } nr_account++; @@ -5517,6 +5590,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + spin_unlock(&pcp->lock); + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); @@ -5526,6 +5601,7 @@ out: return nr_populated; failed_irq: + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); failed: @@ -7125,6 +7201,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); + spin_lock_init(&pcp->lock); for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) INIT_LIST_HEAD(&pcp->lists[pindex]);