From c37495d6254c237578db3121dcf79857e033f8ff Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:27 -0700 Subject: [PATCH 01/57] slab: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for regular kmalloc interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-5-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Reviewed-by: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Andy Whitcroft Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Tejun Heo Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 61 ++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index d05de03bdcdd..b5bf0537975b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -181,7 +181,7 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags); +void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __alloc_size(2); void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -425,7 +425,7 @@ static __always_inline unsigned int __kmalloc_index(size_t size, #define kmalloc_index(s) __kmalloc_index(s, true) #endif /* !CONFIG_SLOB */ -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; +void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *s, void *objp); @@ -449,11 +449,12 @@ static __always_inline void kfree_bulk(size_t size, void **p) } #ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; +void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment + __alloc_size(1); void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment __malloc; #else -static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc(size, flags); } @@ -466,23 +467,23 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_slab_alignment __malloc; + __assume_slab_alignment __alloc_size(3); #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_slab_alignment __malloc; + int node, size_t size) __assume_slab_alignment + __alloc_size(4); #else -static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, int node, - size_t size) +static __always_inline __alloc_size(4) void *kmem_cache_alloc_node_trace(struct kmem_cache *s, + gfp_t gfpflags, int node, size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ -static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, - size_t size) +static __always_inline __alloc_size(3) void *kmem_cache_alloc_trace(struct kmem_cache *s, + gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); @@ -501,19 +502,20 @@ static __always_inline void *kmem_cache_alloc_node_trace(struct kmem_cache *s, g #endif /* CONFIG_TRACING */ extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment - __malloc; + __alloc_size(1); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) - __assume_page_alignment __malloc; + __assume_page_alignment __alloc_size(1); #else -static __always_inline void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, gfp_t flags, + unsigned int order) { return kmalloc_order(size, flags, order); } #endif -static __always_inline void *kmalloc_large(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t flags) { unsigned int order = get_order(size); return kmalloc_order_trace(size, flags, order); @@ -573,7 +575,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Try really hard to succeed the allocation but fail * eventually. */ -static __always_inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { #ifndef CONFIG_SLOB @@ -595,7 +597,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && @@ -619,7 +621,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -637,8 +639,10 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t new_size, - gfp_t flags) +static inline __alloc_size(2, 3) void * __must_check krealloc_array(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; @@ -654,7 +658,7 @@ static inline void * __must_check krealloc_array(void *p, size_t new_n, size_t n * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) { return kmalloc_array(n, size, flags | __GFP_ZERO); } @@ -667,12 +671,13 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) + __alloc_size(1); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) -static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, - int node) +static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) { size_t bytes; @@ -683,7 +688,7 @@ static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, return __kmalloc_node(bytes, flags, node); } -static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) { return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } @@ -691,7 +696,7 @@ static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - unsigned long caller); + unsigned long caller) __alloc_size(1); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) @@ -716,7 +721,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ -static inline void *kzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) { return kmalloc(size, flags | __GFP_ZERO); } @@ -727,7 +732,7 @@ static inline void *kzalloc(size_t size, gfp_t flags) * @flags: the type of memory to allocate (see kmalloc). * @node: memory node from which to allocate */ -static inline void *kzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags | __GFP_ZERO, node); } From 56bcf40f91c7d7f53b77abe161a7d18ef9f981ba Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:31 -0700 Subject: [PATCH 02/57] mm/kvmalloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for regular kvmalloc interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-6-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Reviewed-by: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Andy Whitcroft Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Tejun Heo Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index b5bf0537975b..837cb16232ef 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -737,21 +737,21 @@ static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int n return kmalloc_node(size, flags | __GFP_ZERO, node); } -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -static inline void *kvmalloc(size_t size, gfp_t flags) +extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); +static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) { return kvmalloc_node(size, flags, NUMA_NO_NODE); } -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) { return kvmalloc_node(size, flags | __GFP_ZERO, node); } -static inline void *kvzalloc(size_t size, gfp_t flags) +static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) { return kvmalloc(size, flags | __GFP_ZERO); } -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -761,13 +761,13 @@ static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) return kvmalloc(bytes, flags); } -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) { return kvmalloc_array(n, size, flags | __GFP_ZERO); } -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, - gfp_t flags); +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + __alloc_size(3); extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); From 894f24bb569afd4fe4a874c636f82d47f1c9beed Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:34 -0700 Subject: [PATCH 03/57] mm/vmalloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate vmalloc allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-7-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Cc: Andy Whitcroft Cc: Christoph Lameter Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 671d402c3778..0ed56fc10c11 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -136,21 +136,21 @@ static inline void vmalloc_init(void) static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -extern void *vmalloc(unsigned long size); -extern void *vzalloc(unsigned long size); -extern void *vmalloc_user(unsigned long size); -extern void *vmalloc_node(unsigned long size, int node); -extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_32(unsigned long size); -extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); +extern void *vmalloc(unsigned long size) __alloc_size(1); +extern void *vzalloc(unsigned long size) __alloc_size(1); +extern void *vmalloc_user(unsigned long size) __alloc_size(1); +extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); +extern void *vmalloc_32(unsigned long size) __alloc_size(1); +extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, - const void *caller); + const void *caller) __alloc_size(1); void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, - int node, const void *caller); -void *vmalloc_no_huge(unsigned long size); + int node, const void *caller) __alloc_size(1); +void *vmalloc_no_huge(unsigned long size) __alloc_size(1); extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); From abd58f38dfb4771ef06e25d71a84aa0932c52f1a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:38 -0700 Subject: [PATCH 04/57] mm/page_alloc: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate page allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Link: https://lkml.kernel.org/r/20210930222704.2631604-8-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Cc: Andy Whitcroft Cc: Christoph Lameter Cc: David Rientjes Cc: Dennis Zhou Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Tejun Heo Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 55b2ec1f965a..fbd4abc33f24 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -608,9 +608,9 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); -void *alloc_pages_exact(size_t size, gfp_t gfp_mask); +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(1); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) From 17197dd460469c6fca66e274f5cd51ee43bb6ddd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 5 Nov 2021 13:36:42 -0700 Subject: [PATCH 05/57] percpu: add __alloc_size attributes for better bounds checking As already done in GrapheneOS, add the __alloc_size attribute for appropriate percpu allocator interfaces, to provide additional hinting for better bounds checking, assisting CONFIG_FORTIFY_SOURCE and other compiler optimizations. Note that due to the implementation of the percpu API, this is unlikely to ever actually provide compile-time checking beyond very simple non-SMP builds. But, since they are technically allocators, mark them as such. Link: https://lkml.kernel.org/r/20210930222704.2631604-9-keescook@chromium.org Signed-off-by: Kees Cook Co-developed-by: Daniel Micay Signed-off-by: Daniel Micay Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Andy Whitcroft Cc: David Rientjes Cc: Dwaipayan Ray Cc: Joe Perches Cc: Joonsoo Kim Cc: Lukas Bulwahn Cc: Miguel Ojeda Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Alexandre Bounine Cc: Gustavo A. R. Silva Cc: Ira Weiny Cc: Jing Xiangfeng Cc: John Hubbard Cc: kernel test robot Cc: Matt Porter Cc: Randy Dunlap Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/percpu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 5e76af742c80..98a9371133f8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); @@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); extern void __init setup_per_cpu_areas(void); #endif -extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); -extern void __percpu *__alloc_percpu(size_t size, size_t align); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); +extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); From d1fea155ee3d90fb3adcb3d6c42751860be3b158 Mon Sep 17 00:00:00 2001 From: Yinan Zhang Date: Fri, 5 Nov 2021 13:36:46 -0700 Subject: [PATCH 06/57] mm/page_ext.c: fix a comment I have noticed that the previous macro is #ifndef CONFIG_SPARSEMEM. I think the comment of #else should be CONFIG_SPARSEMEM. Link: https://lkml.kernel.org/r/20211008140312.6492-1-zhangyinan2019@email.szu.edu.cn Signed-off-by: Yinan Zhang Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_ext.c b/mm/page_ext.c index 2a52fd9ed464..6242afb24d84 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -201,7 +201,7 @@ fail: panic("Out of memory"); } -#else /* CONFIG_FLATMEM */ +#else /* CONFIG_SPARSEMEM */ struct page_ext *lookup_page_ext(const struct page *page) { From 8c8387ee3f55a38c3388882f1dd72dc526965211 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 5 Nov 2021 13:36:49 -0700 Subject: [PATCH 07/57] mm: stop filemap_read() from grabbing a superfluous page Under some circumstances, filemap_read() will allocate sufficient pages to read to the end of the file, call readahead/readpages on them and copy the data over - and then it will allocate another page at the EOF and call readpage on that and then ignore it. This is unnecessary and a waste of time and resources. filemap_read() *does* check for this, but only after it has already done the allocation and I/O. Fix this by checking before calling filemap_get_pages() also. Link: https://lkml.kernel.org/r/163472463105.3126792.7056099385135786492.stgit@warthog.procyon.org.uk Link: https://lore.kernel.org/r/160588481358.3465195.16552616179674485179.stgit@warthog.procyon.org.uk/ Link: https://lore.kernel.org/r/163456863216.2614702.6384850026368833133.stgit@warthog.procyon.org.uk/ Signed-off-by: David Howells Acked-by: Jeff Layton Reviewed-by: Matthew Wilcox (Oracle) Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index dae481293b5d..e50be519f6a4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2625,6 +2625,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; + if (unlikely(iocb->ki_pos >= i_size_read(inode))) + break; + error = filemap_get_pages(iocb, iter, &pvec); if (error < 0) break; From c6fd3ac0fc859da57404c3bad64696d48a6f425e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:52 -0700 Subject: [PATCH 08/57] mm: export bdi_unregister Patch series "simplify bdi unregistation". This series simplifies the BDI code to get rid of the magic auto-unregister feature that hid a recent block layer refcounting bug. This patch (of 5): To wind down the magic auto-unregister semantics we'll need to push this into modular code. Link: https://lkml.kernel.org/r/20211021124441.668816-1-hch@lst.de Link: https://lkml.kernel.org/r/20211021124441.668816-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 4a9d4e27d0d9..8a46a0a4b72f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -958,6 +958,7 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi->owner = NULL; } } +EXPORT_SYMBOL(bdi_unregister); static void release_bdi(struct kref *ref) { From 9718c59c0a1604350c06ffd87d598f03dcf5e9ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:55 -0700 Subject: [PATCH 09/57] mtd: call bdi_unregister explicitly Call bdi_unregister explicitly instead of relying on the automatic unregistration. Link: https://lkml.kernel.org/r/20211021124441.668816-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/mtdcore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index c8fd7f758938..c904e23c8237 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -2409,6 +2409,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); + bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); idr_destroy(&mtd_idr); } From 0b3ea0926afb8dde70cfab00316ae0a70b93a7cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:36:58 -0700 Subject: [PATCH 10/57] fs: explicitly unregister per-superblock BDIs Add a new SB_I_ flag to mark superblocks that have an ephemeral bdi associated with them, and unregister it when the superblock is shut down. Link: https://lkml.kernel.org/r/20211021124441.668816-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 3 +++ include/linux/fs.h | 1 + 2 files changed, 4 insertions(+) diff --git a/fs/super.c b/fs/super.c index bcef3a6f4c4b..3bfc0f8fbd5b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb_lock); up_write(&sb->s_umount); if (sb->s_bdi != &noop_backing_dev_info) { + if (sb->s_iflags & SB_I_PERSB_BDI) + bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } @@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; + sb->s_iflags |= SB_I_PERSB_BDI; return 0; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..226de651f52e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1443,6 +1443,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ /* Possible states of 'frozen' field */ enum { From 702f2d1e3b33617a8d9a9424f08a69b7c51642a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:37:01 -0700 Subject: [PATCH 11/57] mm: don't automatically unregister bdis All BDI users now unregister explicitly. Link: https://lkml.kernel.org/r/20211021124441.668816-5-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8a46a0a4b72f..768e9ae489f6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -965,8 +965,7 @@ static void release_bdi(struct kref *ref) struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); - if (test_bit(WB_registered, &bdi->wb.state)) - bdi_unregister(bdi); + WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state)); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); From efee17134ca464639a2f5b4d036ce40caf1b247a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Nov 2021 13:37:04 -0700 Subject: [PATCH 12/57] mm: simplify bdi refcounting Move grabbing and releasing the bdi refcount out of the common wb_init/wb_exit helpers into code that is only used for the non-default memcg driven bdi_writeback structures. [hch@lst.de: add comment] Link: https://lkml.kernel.org/r/20211027074207.GA12793@lst.de [akpm@linux-foundation.org: fix typo] Link: https://lkml.kernel.org/r/20211021124441.668816-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Miquel Raynal Cc: Richard Weinberger Cc: Vignesh Raghavendra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 3 +++ mm/backing-dev.c | 13 +++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 33207004cfde..993c5628a726 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -103,6 +103,9 @@ struct wb_completion { * change as blkcg is disabled and enabled higher up in the hierarchy, a wb * is tested for blkcg after lookup and removed from index on mismatch so * that a new wb for the combination can be created. + * + * Each bdi_writeback that is not embedded into the backing_dev_info must hold + * a reference to the parent backing_dev_info. See cgwb_create() for details. */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 768e9ae489f6..5ccb25089808 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -291,8 +291,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); - if (wb != &bdi->wb) - bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); @@ -316,7 +314,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - goto out_put_bdi; + return err; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); @@ -330,9 +328,6 @@ out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); -out_put_bdi: - if (wb != &bdi->wb) - bdi_put(bdi); return err; } @@ -373,8 +368,6 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); - if (wb != &wb->bdi->wb) - bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -397,6 +390,7 @@ static void cgwb_release_workfn(struct work_struct *work) struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); + struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); @@ -416,6 +410,7 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); + bdi_put(bdi); WARN_ON_ONCE(!list_empty(&wb->b_attached)); kfree_rcu(wb, rcu); } @@ -497,6 +492,7 @@ static int cgwb_create(struct backing_dev_info *bdi, INIT_LIST_HEAD(&wb->b_attached); INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); + bdi_get(bdi); /* * The root wb determines the registered state of the whole bdi and @@ -528,6 +524,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; err_fprop_exit: + bdi_put(bdi); fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); From 61d0017e5a32d3b13ba3c7becc83a5a9e53cdbe9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 13:37:07 -0700 Subject: [PATCH 13/57] mm: don't read i_size of inode unless we need it We always go through i_size_read(), and we rarely end up needing it. Push the read to down where we need to check it, which avoids it for most cases. It looks like we can even remove this check entirely, which might be worth pursuing. But at least this takes it out of the hot path. Link: https://lkml.kernel.org/r/6b67981f-57d4-c80e-bc07-6020aa601381@kernel.dk Signed-off-by: Jens Axboe Acked-by: Chris Mason Cc: Josef Bacik Cc: Dave Chinner Cc: Pavel Begunkov Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e50be519f6a4..ebd68890a844 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2740,9 +2740,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - loff_t size; - size = i_size_read(inode); if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_needs_writeback(mapping, iocb->ki_pos, iocb->ki_pos + count - 1)) @@ -2774,8 +2772,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ - if (retval < 0 || !count || iocb->ki_pos >= size || - IS_DAX(inode)) + if (retval < 0 || !count || IS_DAX(inode)) + return retval; + if (iocb->ki_pos >= i_size_read(inode)) return retval; } From d417b49fff3e2f21043c834841e8623a6098741d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:37:10 -0700 Subject: [PATCH 14/57] mm/filemap.c: remove bogus VM_BUG_ON It is not safe to check page->index without holding the page lock. It can be changed if the page is moved between the swap cache and the page cache for a shmem file, for example. There is a VM_BUG_ON below which checks page->index is correct after taking the page lock. Link: https://lkml.kernel.org/r/20210818144932.940640-1-willy@infradead.org Fixes: 5c211ba29deb ("mm: add and use find_lock_entries") Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index ebd68890a844..28c57c2536d0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2093,7 +2093,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, if (!xa_is_value(page)) { if (page->index < start) goto put; - VM_BUG_ON_PAGE(page->index != xas.xa_index, page); if (page->index + thp_nr_pages(page) - 1 > end) goto put; if (!trylock_page(page)) From f8ee8909ac818e555ef18b57b0ee4b9fd0478b82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 13:37:13 -0700 Subject: [PATCH 15/57] mm: move more expensive part of XA setup out of mapping check The fast path here is not needing any writeback, yet we spend time setting up the xarray lookup data upfront. Move the part that actually needs to iterate the address space mapping into a separate helper, saving ~30% of the time here. Link: https://lkml.kernel.org/r/49f67983-b802-8929-edab-d807f745c9ca@kernel.dk Signed-off-by: Jens Axboe Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 28c57c2536d0..938f52702a7a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -639,6 +639,30 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } +static bool filemap_range_has_writeback(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) +{ + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; + struct page *page; + + if (end_byte < start_byte) + return false; + + rcu_read_lock(); + xas_for_each(&xas, page, max) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + break; + } + rcu_read_unlock(); + return page != NULL; + +} + /** * filemap_range_needs_writeback - check if range potentially needs writeback * @mapping: address space within which to check @@ -656,29 +680,12 @@ static bool mapping_needs_writeback(struct address_space *mapping) bool filemap_range_needs_writeback(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); - pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; - if (!mapping_needs_writeback(mapping)) return false; if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) return false; - if (end_byte < start_byte) - return false; - - rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) - continue; - if (xa_is_value(page)) - continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) - break; - } - rcu_read_unlock(); - return page != NULL; + return filemap_range_has_writeback(mapping, start_byte, end_byte); } EXPORT_SYMBOL_GPL(filemap_range_needs_writeback); From 20b7fee738d65e50ca00e325ae27ee3efaa819f6 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Fri, 5 Nov 2021 13:37:16 -0700 Subject: [PATCH 16/57] mm/gup: further simplify __gup_device_huge() Commit 6401c4eb57f9 ("mm: gup: fix potential pgmap refcnt leak in __gup_device_huge()") simplified the return paths, but didn't go quite far enough, as discussed in [1]. Remove the "ret" variable entirely, because there is enough information already available to provide the return value. [1] https://lore.kernel.org/r/CAHk-=wgQTRX=5SkCmS+zfmpqubGHGJvXX_HgnPG8JSpHKHBMeg@mail.gmail.com Link: https://lkml.kernel.org/r/20210904004224.86391-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Suggested-by: Linus Torvalds Reviewed-by: Jan Kara Cc: Miaohe Lin Cc: Claudio Imbrenda Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 886d6148d3d0..48c7a5a1e85b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2228,7 +2228,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, { int nr_start = *nr; struct dev_pagemap *pgmap = NULL; - int ret = 1; do { struct page *page = pfn_to_page(pfn); @@ -2236,14 +2235,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, pgmap = get_dev_pagemap(pfn, pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, flags, pages); - ret = 0; break; } SetPageReferenced(page); pages[*nr] = page; if (unlikely(!try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); - ret = 0; break; } (*nr)++; @@ -2251,7 +2248,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, } while (addr += PAGE_SIZE, addr != end); put_dev_pagemap(pgmap); - return ret; + return addr == end; } static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, From 363dc512b666a235769482cc73869f899785a1f6 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Fri, 5 Nov 2021 13:37:19 -0700 Subject: [PATCH 17/57] mm/swapfile: remove needless request_queue NULL pointer check The request_queue pointer returned from bdev_get_queue() shall never be NULL, so the null check is unnecessary, just remove it. Link: https://lkml.kernel.org/r/20210917082111.33923-1-vulab@iscas.ac.cn Signed-off-by: Xu Wang Acked-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 22d10f713848..42027d213fd2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3118,7 +3118,7 @@ static bool swap_discardable(struct swap_info_struct *si) { struct request_queue *q = bdev_get_queue(si->bdev); - if (!q || !blk_queue_discard(q)) + if (!blk_queue_discard(q)) return false; return true; From 642929a2ded029aac13b8cb91bc9b7c088ddb57f Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Fri, 5 Nov 2021 13:37:22 -0700 Subject: [PATCH 18/57] mm/swapfile: fix an integer overflow in swap_show() This one is just a minor nuisance for people going through /proc/swaps if any of their swapareas is bigger than, or equal to 1073741824 pages (4TB). seq_printf() format string casts as uint the conversion from pages to KB, and that will overflow in the aforementioned case. Albeit being almost unthinkable that someone would actually set up such big of a single swaparea, there is a ticket recently filed against RHEL: https://bugzilla.redhat.com/show_bug.cgi?id=2008812 Given that all other codesites that use format strings for the same swap pages-to-KB conversion do cast it as ulong, this patch just follows suit. Link: https://lkml.kernel.org/r/20211006184011.2579054-1-aquini@redhat.com Signed-off-by: Rafael Aquini Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 42027d213fd2..353e5bd518e1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2763,7 +2763,7 @@ static int swap_show(struct seq_file *swap, void *v) struct swap_info_struct *si = v; struct file *file; int len; - unsigned int bytes, inuse; + unsigned long bytes, inuse; if (si == SEQ_START_TOKEN) { seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n"); @@ -2775,7 +2775,7 @@ static int swap_show(struct seq_file *swap, void *v) file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); - seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n", + seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t", From 988c69f1bc23fe632ea5e6eb3c26a9e103c3ed41 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 5 Nov 2021 13:37:25 -0700 Subject: [PATCH 19/57] mm: optimise put_pages_list() Instead of calling put_page() one page at a time, pop pages off the list if their refcount was too high and pass the remainder to put_unref_page_list(). This should be a speed improvement, but I have no measurements to support that. Current callers do not care about performance, but I hope to add some which do. Link: https://lkml.kernel.org/r/20211007192138.561673-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anthony Yznaga Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index af3cad4e5378..9f334d503fd2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -134,18 +134,27 @@ EXPORT_SYMBOL(__put_page); * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * - * Release a list of pages which are strung together on page.lru. Currently - * used by read_cache_pages() and related error recovery code. + * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { - while (!list_empty(pages)) { - struct page *victim; + struct page *page, *next; - victim = lru_to_page(pages); - list_del(&victim->lru); - put_page(victim); + list_for_each_entry_safe(page, next, pages, lru) { + if (!put_page_testzero(page)) { + list_del(&page->lru); + continue; + } + if (PageHead(page)) { + list_del(&page->lru); + __put_compound_page(page); + continue; + } + /* Cannot be PageLRU because it's passed to us using the lru */ + __ClearPageWaiters(page); } + + free_unref_page_list(pages); } EXPORT_SYMBOL(put_pages_list); From 48384b0b76f3662dfa6153c1072c2b936fc14627 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:37:28 -0700 Subject: [PATCH 20/57] mm/memcg: drop swp_entry_t* in mc_handle_file_pte() It is unused after the rework of commit f5df8635c5a3 ("mm: use find_get_incore_page in memcontrol"). Link: https://lkml.kernel.org/r/20210916193014.80129-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6da5020a8656..15e6fb5d56a7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5545,7 +5545,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, #endif static struct page *mc_handle_file_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, swp_entry_t *entry) + unsigned long addr, pte_t ptent) { if (!vma->vm_file) /* anonymous vma */ return NULL; @@ -5718,7 +5718,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, ptent, &ent); else if (pte_none(ptent)) - page = mc_handle_file_pte(vma, addr, ptent, &ent); + page = mc_handle_file_pte(vma, addr, ptent); if (!page && !ent.val) return ret; From 11192d9c124d58d66449b163ed0d2cdff03761a1 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 5 Nov 2021 13:37:31 -0700 Subject: [PATCH 21/57] memcg: flush stats only if updated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At the moment, the kernel flushes the memcg stats on every refault and also on every reclaim iteration. Although rstat maintains per-cpu update tree but on the flush the kernel still has to go through all the cpu rstat update tree to check if there is anything to flush. This patch adds the tracking on the stats update side to make flush side more clever by skipping the flush if there is no update. The stats update codepath is very sensitive performance wise for many workloads and benchmarks. So, we can not follow what the commit aa48e47e3906 ("memcg: infrastructure to flush memcg stats") did which was triggering async flush through queue_work() and caused a lot performance regression reports. That got reverted by the commit 1f828223b799 ("memcg: flush lruvec stats in the refault"). In this patch we kept the stats update codepath very minimal and let the stats reader side to flush the stats only when the updates are over a specific threshold. For now the threshold is (nr_cpus * CHARGE_BATCH). To evaluate the impact of this patch, an 8 GiB tmpfs file is created on a system with swap-on-zram and the file was pushed to swap through memory.force_empty interface. On reading the whole file, the memcg stat flush in the refault code path is triggered. With this patch, we observed 63% reduction in the read time of 8 GiB file. Link: https://lkml.kernel.org/r/20211001190040.48086-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Reviewed-by: "Michal Koutný" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 78 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 15e6fb5d56a7..ae12fdd4bc0d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -103,11 +103,6 @@ static bool do_memsw_account(void) return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; } -/* memcg and lruvec stats flushing */ -static void flush_memcg_stats_dwork(struct work_struct *w); -static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); -static DEFINE_SPINLOCK(stats_flush_lock); - #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 @@ -635,6 +630,56 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } +/* + * memcg and lruvec stats flushing + * + * Many codepaths leading to stats update or read are performance sensitive and + * adding stats flushing in such codepaths is not desirable. So, to optimize the + * flushing the kernel does: + * + * 1) Periodically and asynchronously flush the stats every 2 seconds to not let + * rstat update tree grow unbounded. + * + * 2) Flush the stats synchronously on reader side only when there are more than + * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization + * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but + * only for 2 seconds due to (1). + */ +static void flush_memcg_stats_dwork(struct work_struct *w); +static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork); +static DEFINE_SPINLOCK(stats_flush_lock); +static DEFINE_PER_CPU(unsigned int, stats_updates); +static atomic_t stats_flush_threshold = ATOMIC_INIT(0); + +static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +{ + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) + atomic_inc(&stats_flush_threshold); +} + +static void __mem_cgroup_flush_stats(void) +{ + if (!spin_trylock(&stats_flush_lock)) + return; + + cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); + atomic_set(&stats_flush_threshold, 0); + spin_unlock(&stats_flush_lock); +} + +void mem_cgroup_flush_stats(void) +{ + if (atomic_read(&stats_flush_threshold) > num_online_cpus()) + __mem_cgroup_flush_stats(); +} + +static void flush_memcg_stats_dwork(struct work_struct *w) +{ + mem_cgroup_flush_stats(); + queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -647,7 +692,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -675,10 +720,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* Update memcg */ - __mod_memcg_state(memcg, idx, val); + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg); } /** @@ -780,7 +827,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); + memcg_rstat_updated(memcg); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) @@ -5341,21 +5388,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -void mem_cgroup_flush_stats(void) -{ - if (!spin_trylock(&stats_flush_lock)) - return; - - cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); - spin_unlock(&stats_flush_lock); -} - -static void flush_memcg_stats_dwork(struct work_struct *w) -{ - mem_cgroup_flush_stats(); - queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); -} - static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); From fd25a9e0e23b995fd0ba5e2f00a1099452cbc3cf Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 5 Nov 2021 13:37:34 -0700 Subject: [PATCH 22/57] memcg: unify memcg stat flushing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memcg stats can be flushed in multiple context and potentially in parallel too. For example multiple parallel user space readers for memcg stats will contend on the rstat locks with each other. There is no need for that. We just need one flusher and everyone else can benefit. In addition after aa48e47e3906 ("memcg: infrastructure to flush memcg stats") the kernel periodically flush the memcg stats from the root, so, the other flushers will potentially have much less work to do. Link: https://lkml.kernel.org/r/20211001190040.48086-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Michal Hocko Cc: "Michal Koutný" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae12fdd4bc0d..fd18076171b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -660,12 +660,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg) static void __mem_cgroup_flush_stats(void) { - if (!spin_trylock(&stats_flush_lock)) + unsigned long flag; + + if (!spin_trylock_irqsave(&stats_flush_lock, flag)) return; cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup); atomic_set(&stats_flush_threshold, 0); - spin_unlock(&stats_flush_lock); + spin_unlock_irqrestore(&stats_flush_lock, flag); } void mem_cgroup_flush_stats(void) @@ -1461,7 +1463,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg) * * Current memory state: */ - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; @@ -3565,8 +3567,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) unsigned long val; if (mem_cgroup_is_root(memcg)) { - /* mem_cgroup_threshold() calls here from irqsafe context */ - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); val = memcg_page_state(memcg, NR_FILE_PAGES) + memcg_page_state(memcg, NR_ANON_MAPPED); if (swap) @@ -3947,7 +3948,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) int nid; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { seq_printf(m, "%s=%lu", stat->name, @@ -4019,7 +4020,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; @@ -4522,7 +4523,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - cgroup_rstat_flush_irqsafe(memcg->css.cgroup); + mem_cgroup_flush_stats(); *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); @@ -6405,7 +6406,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) int i; struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - cgroup_rstat_flush(memcg->css.cgroup); + mem_cgroup_flush_stats(); for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { int nid; From 38d4ef44ee4a7dbdf220daa52ad341c140f3bb51 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 5 Nov 2021 13:37:37 -0700 Subject: [PATCH 23/57] mm/memcg: remove obsolete memcg_free_kmem() Since commit d648bcc7fe65 ("mm: kmem: make memcg_kmem_enabled() irreversible"), the only thing memcg_free_kmem() does is to call memcg_offline_kmem() when the memcg is still online which can happen when online_css() fails due to -ENOMEM. However, the name memcg_free_kmem() is confusing and it is more clear and straight forward to call memcg_offline_kmem() directly from mem_cgroup_css_free(). Link: https://lkml.kernel.org/r/20211005202450.11775-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Roman Gushchin Reviewed-by: Aaron Tomlin Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd18076171b5..e092cce3c96e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3704,13 +3704,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_free_cache_id(kmemcg_id); } - -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ - /* css_alloc() failed, offlining didn't happen */ - if (unlikely(memcg->kmem_state == KMEM_ONLINE)) - memcg_offline_kmem(memcg); -} #else static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3719,9 +3712,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { } -static void memcg_free_kmem(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ static int memcg_update_kmem_max(struct mem_cgroup *memcg, @@ -5356,7 +5346,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css) cancel_work_sync(&memcg->high_work); mem_cgroup_remove_from_trees(memcg); free_shrinker_info(memcg); - memcg_free_kmem(memcg); + + /* Need to offline kmem if online_css() fails */ + memcg_offline_kmem(memcg); mem_cgroup_free(memcg); } From 16f6bf266c94017c2c4699acab64316ddc0ee77c Mon Sep 17 00:00:00 2001 From: Len Baker Date: Fri, 5 Nov 2021 13:37:40 -0700 Subject: [PATCH 24/57] mm/list_lru.c: prefer struct_size over open coded arithmetic As noted in the "Deprecated Interfaces, Language Features, Attributes, and Conventions" documentation [1], size calculations (especially multiplication) should not be performed in memory allocator (or similar) function arguments due to the risk of them overflowing. This could lead to values wrapping around and a smaller allocation being made than the caller was expecting. Using those allocations could lead to linear overflows of heap memory and other misbehaviors. So, use the struct_size() helper to do the arithmetic instead of the argument "size + count * size" in the kvmalloc() functions. Also, take the opportunity to refactor the memcpy() call to use the flex_array_size() helper. This code was detected with the help of Coccinelle and audited and fixed manually. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments Link: https://lkml.kernel.org/r/20211017105929.9284-1-len.baker@gmx.com Signed-off-by: Len Baker Cc: Kees Cook Cc: "Gustavo A. R. Silva" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index cd58790d0fb3..a6031f1c5bd7 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -354,8 +354,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + - size * sizeof(void *), GFP_KERNEL); + memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL); if (!memcg_lrus) return -ENOMEM; @@ -389,7 +388,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, old = rcu_dereference_protected(nlru->memcg_lrus, lockdep_is_held(&list_lrus_mutex)); - new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); + new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL); if (!new) return -ENOMEM; @@ -398,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); + memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size)); /* * The locking below allows readers that hold nlru->lock avoid taking From 58056f77502f3567b760c9a8fc8d2e9081515b2d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 5 Nov 2021 13:37:44 -0700 Subject: [PATCH 25/57] memcg, kmem: further deprecate kmem.limit_in_bytes The deprecation process of kmem.limit_in_bytes started with the commit 0158115f702 ("memcg, kmem: deprecate kmem.limit_in_bytes") which also explains in detail the motivation behind the deprecation. To summarize, it is the unexpected behavior on hitting the kmem limit. This patch moves the deprecation process to the next stage by disallowing to set the kmem limit. In future we might just remove the kmem.limit_in_bytes file completely. [akpm@linux-foundation.org: s/ENOTSUPP/EOPNOTSUPP/] [arnd@arndb.de: mark cancel_charge() inline] Link: https://lkml.kernel.org/r/20211022070542.679839-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211019153408.2916808-1-shakeelb@google.com Signed-off-by: Shakeel Butt Signed-off-by: Arnd Bergmann Acked-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Muchun Song Cc: Vasily Averin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/cgroup-v1/memory.rst | 11 +----- mm/memcontrol.c | 39 +++---------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 41191b5fb69d..faac50149a22 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -87,10 +87,8 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes set/show hard limit for kernel memory - This knob is deprecated and shouldn't be - used. It is planned that this be removed in - the foreseeable future. + memory.kmem.limit_in_bytes This knob is deprecated and writing to + it will return -ENOTSUPP. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits @@ -518,11 +516,6 @@ will be charged as a new owner of it. charged file caches. Some out-of-use page caches may keep charged until memory pressure happens. If you want to avoid that, force_empty will be useful. - Also, note that when memory.kmem.limit_in_bytes is set the charges due to - kernel pages will still be seen. This is not considered a failure and the - write will still return success. In this case, it is expected that - memory.kmem.usage_in_bytes == memory.usage_in_bytes. - 5.2 stat file ------------- diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e092cce3c96e..e4fcb6e55f75 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2771,8 +2771,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU) -static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2781,7 +2780,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); } -#endif static void commit_charge(struct page *page, struct mem_cgroup *memcg) { @@ -3000,7 +2998,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, unsigned int nr_pages) { - struct page_counter *counter; struct mem_cgroup *memcg; int ret; @@ -3010,21 +3007,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp, if (ret) goto out; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && - !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { - - /* - * Enforce __GFP_NOFAIL allocation because callers are not - * prepared to see failures and likely do not have any failure - * handling code. - */ - if (gfp & __GFP_NOFAIL) { - page_counter_charge(&memcg->kmem, nr_pages); - goto out; - } - cancel_charge(memcg, nr_pages); - ret = -ENOMEM; - } + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_charge(&memcg->kmem, nr_pages); out: css_put(&memcg->css); @@ -3714,17 +3698,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) } #endif /* CONFIG_MEMCG_KMEM */ -static int memcg_update_kmem_max(struct mem_cgroup *memcg, - unsigned long max) -{ - int ret; - - mutex_lock(&memcg_max_mutex); - ret = page_counter_set_max(&memcg->kmem, max); - mutex_unlock(&memcg_max_mutex); - return ret; -} - static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; @@ -3790,10 +3763,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " - "Please report your usecase to linux-mm@kvack.org if you " - "depend on this functionality.\n"); - ret = memcg_update_kmem_max(memcg, nr_pages); + /* kmem.limit_in_bytes is deprecated. */ + ret = -EOPNOTSUPP; break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); From 60ec6a48eec24986a6414740a2481d22efc1b2f9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:47 -0700 Subject: [PATCH 26/57] mm: list_lru: remove holding lru lock Since commit e5bc3af7734f ("rcu: Consolidate PREEMPT and !PREEMPT synchronize_rcu()"), the critical section of spin lock can serve as an RCU read-side critical section which already allows readers that hold nlru->lock to avoid taking rcu lock. So just remove holding lock. Link: https://lkml.kernel.org/r/20211025124534.56345-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index a6031f1c5bd7..9a1f7df1afc9 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -398,18 +398,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, } memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size)); - - /* - * The locking below allows readers that hold nlru->lock avoid taking - * rcu_read_lock (see list_lru_from_memcg_idx). - * - * Since list_lru_{add,del} may be called under an IRQ-safe lock, - * we have to use IRQ-safe primitives here to avoid deadlock. - */ - spin_lock_irq(&nlru->lock); rcu_assign_pointer(nlru->memcg_lrus, new); - spin_unlock_irq(&nlru->lock); - kvfree_rcu(old, rcu); return 0; } From 41d17431df4aa7c57761e04f81c94fb3c3beedf4 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:50 -0700 Subject: [PATCH 27/57] mm: list_lru: fix the return value of list_lru_count_one() Since commit 2788cf0c401c ("memcg: reparent list_lrus and free kmemcg_id on css offline"), ->nr_items can be negative during memory cgroup reparenting. In this case, list_lru_count_one() will return an unusual and huge value, which can surprise users. At least for now it hasn't affected any users. But it is better to let list_lru_count_ont() returns zero when ->nr_items is negative. Link: https://lkml.kernel.org/r/20211025124910.56433-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 9a1f7df1afc9..7572f8e70b86 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -176,13 +176,16 @@ unsigned long list_lru_count_one(struct list_lru *lru, { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - unsigned long count; + long count; rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); count = READ_ONCE(l->nr_items); rcu_read_unlock(); + if (unlikely(count < 0)) + count = 0; + return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); From 642688681133a501d149349ba1a824204f3540e1 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:53 -0700 Subject: [PATCH 28/57] mm: memcontrol: remove kmemcg_id reparenting Since slab objects and kmem pages are charged to object cgroup instead of memory cgroup, memcg_reparent_objcgs() will reparent this cgroup and all its descendants to its parent cgroup. This already makes further list_lru_add()'s add elements to the parent's list. So it is unnecessary to change kmemcg_id of an offline cgroup to its parent's id. It just wastes CPU cycles. Just remove the redundant code. Link: https://lkml.kernel.org/r/20211025125102.56533-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e4fcb6e55f75..1e2d3f378edd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3650,8 +3650,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static void memcg_offline_kmem(struct mem_cgroup *memcg) { - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; + struct mem_cgroup *parent; int kmemcg_id; if (memcg->kmem_state != KMEM_ONLINE) @@ -3669,21 +3668,11 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) BUG_ON(kmemcg_id < 0); /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by + * After we have finished memcg_reparent_objcgs(), all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. + * The ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). */ - rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - } - rcu_read_unlock(); - memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); From e80216d9f1f5c90b3cab834cd4fb492d731f70aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:56 -0700 Subject: [PATCH 29/57] mm: memcontrol: remove the kmem states Now the kmem states is only used to indicate whether the kmem is offline. However, we can set ->kmemcg_id to -1 to indicate whether the kmem is offline. Finally, we can remove the kmem states to simplify the code. Link: https://lkml.kernel.org/r/20211025125259.56624-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Shakeel Butt Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ------- mm/memcontrol.c | 7 ++----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3096c9a0ee01..f207f98bdb76 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,12 +180,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -enum memcg_kmem_state { - KMEM_NONE, - KMEM_ALLOCATED, - KMEM_ONLINE, -}; - #if defined(CONFIG_SMP) struct memcg_padding { char x[0]; @@ -318,7 +312,6 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; - enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; struct list_head objcg_list; /* list of inherited objcgs */ #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1e2d3f378edd..b4a17b7a10d3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3626,7 +3626,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); if (memcg_id < 0) @@ -3643,7 +3642,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); memcg->kmemcg_id = memcg_id; - memcg->kmem_state = KMEM_ONLINE; return 0; } @@ -3653,11 +3651,9 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) struct mem_cgroup *parent; int kmemcg_id; - if (memcg->kmem_state != KMEM_ONLINE) + if (memcg->kmemcg_id == -1) return; - memcg->kmem_state = KMEM_ALLOCATED; - parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; @@ -3676,6 +3672,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) memcg_drain_all_list_lrus(kmemcg_id, parent); memcg_free_cache_id(kmemcg_id); + memcg->kmemcg_id = -1; } #else static int memcg_online_kmem(struct mem_cgroup *memcg) From 3eef11279ba5936b1554a0dccb1cea8345e5e2a5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 5 Nov 2021 13:37:59 -0700 Subject: [PATCH 30/57] mm: list_lru: only add memcg-aware lrus to the global lru list The non-memcg-aware lru is always skiped when traversing the global lru list, which is not efficient. We can only add the memcg-aware lru to the global lru list instead to make traversing more efficient. Link: https://lkml.kernel.org/r/20211025124353.55781-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 7572f8e70b86..0cd5e89ca063 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -15,18 +15,29 @@ #include "slab.h" #ifdef CONFIG_MEMCG_KMEM -static LIST_HEAD(list_lrus); +static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return lru->memcg_aware; +} + static void list_lru_register(struct list_lru *lru) { + if (!list_lru_memcg_aware(lru)) + return; + mutex_lock(&list_lrus_mutex); - list_add(&lru->list, &list_lrus); + list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { + if (!list_lru_memcg_aware(lru)) + return; + mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); @@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru) return lru->shrinker_id; } -static inline bool list_lru_memcg_aware(struct list_lru *lru) -{ - return lru->memcg_aware; -} - static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { @@ -457,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return 0; - for_each_node(i) { if (memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) @@ -482,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return; - for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); @@ -497,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size) int old_size = memcg_nr_cache_ids; mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &list_lrus, list) { + list_for_each_entry(lru, &memcg_list_lrus, list) { ret = memcg_update_list_lru(lru, old_size, new_size); if (ret) goto fail; @@ -506,7 +506,7 @@ out: mutex_unlock(&list_lrus_mutex); return ret; fail: - list_for_each_entry_continue_reverse(lru, &list_lrus, list) + list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list) memcg_cancel_update_list_lru(lru, old_size, new_size); goto out; } @@ -543,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru, { int i; - if (!list_lru_memcg_aware(lru)) - return; - for_each_node(i) memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg); } @@ -555,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg) struct list_lru *lru; mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &list_lrus, list) + list_for_each_entry(lru, &memcg_list_lrus, list) memcg_drain_list_lru(lru, src_idx, dst_memcg); mutex_unlock(&list_lrus_mutex); } From 0b28179a6138a5edd9d82ad2687c05b3773c387b Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:38:02 -0700 Subject: [PATCH 31/57] mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks Patch series "memcg: prohibit unconditional exceeding the limit of dying tasks", v3. Memory cgroup charging allows killed or exiting tasks to exceed the hard limit. It can be misused and allowed to trigger global OOM from inside a memcg-limited container. On the other hand if memcg fails allocation, called from inside #PF handler it triggers global OOM from inside pagefault_out_of_memory(). To prevent these problems this patchset: (a) removes execution of out_of_memory() from pagefault_out_of_memory(), becasue nobody can explain why it is necessary. (b) allow memcg to fail allocation of dying/killed tasks. This patch (of 3): Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory which in turn executes out_out_memory() and can kill a random task. An allocation might fail when the current task is the oom victim and there are no memory reserves left. The OOM killer is already handled at the page allocator level for the global OOM and at the charging level for the memcg one. Both have much more information about the scope of allocation/charge request. This means that either the OOM killer has been invoked properly and didn't lead to the allocation success or it has been skipped because it couldn't have been invoked. In both cases triggering it from here is pointless and even harmful. It makes much more sense to let the killed task die rather than to wake up an eternally hungry oom-killer and send him to choose a fatter victim for breakfast. Link: https://lkml.kernel.org/r/0828a149-786e-7c06-b70a-52d086818ea3@virtuozzo.com Signed-off-by: Vasily Averin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tetsuo Handa Cc: Uladzislau Rezki Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 989f35a2bbb1..d89545d1a5b8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1137,6 +1137,9 @@ void pagefault_out_of_memory(void) if (mem_cgroup_oom_synchronize(true)) return; + if (fatal_signal_pending(current)) + return; + if (!mutex_trylock(&oom_lock)) return; out_of_memory(&oc); From 60e2793d440a3ec95abb5d6d4fc034a4b480472d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 5 Nov 2021 13:38:06 -0700 Subject: [PATCH 32/57] mm, oom: do not trigger out_of_memory from the #PF Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory. This can happen for 2 different reasons. a) Memcg is out of memory and we rely on mem_cgroup_oom_synchronize to perform the memcg OOM handling or b) normal allocation fails. The latter is quite problematic because allocation paths already trigger out_of_memory and the page allocator tries really hard to not fail allocations. Anyway, if the OOM killer has been already invoked there is no reason to invoke it again from the #PF path. Especially when the OOM condition might be gone by that time and we have no way to find out other than allocate. Moreover if the allocation failed and the OOM killer hasn't been invoked then we are unlikely to do the right thing from the #PF context because we have already lost the allocation context and restictions and therefore might oom kill a task from a different NUMA domain. This all suggests that there is no legitimate reason to trigger out_of_memory from pagefault_out_of_memory so drop it. Just to be sure that no #PF path returns with VM_FAULT_OOM without allocation print a warning that this is happening before we restart the #PF. [VvS: #PF allocation can hit into limit of cgroup v1 kmem controller. This is a local problem related to memcg, however, it causes unnecessary global OOM kills that are repeated over and over again and escalate into a real disaster. This has been broken since kmem accounting has been introduced for cgroup v1 (3.8). There was no kmem specific reclaim for the separate limit so the only way to handle kmem hard limit was to return with ENOMEM. In upstream the problem will be fixed by removing the outdated kmem limit, however stable and LTS kernels cannot do it and are still affected. This patch fixes the problem and should be backported into stable/LTS.] Link: https://lkml.kernel.org/r/f5fd8dd8-0ad4-c524-5f65-920b01972a42@virtuozzo.com Signed-off-by: Michal Hocko Signed-off-by: Vasily Averin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tetsuo Handa Cc: Uladzislau Rezki Cc: Vladimir Davydov Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d89545d1a5b8..bfa9e348c3a3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1120,19 +1120,15 @@ bool out_of_memory(struct oom_control *oc) } /* - * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If oom_lock is held by somebody else, a parallel oom - * killing is already in progress so do nothing. + * The pagefault handler calls here because some allocation has failed. We have + * to take care of the memcg OOM here because this is the only safe context without + * any locks held but let the oom killer triggered from the allocation context care + * about the global OOM. */ void pagefault_out_of_memory(void) { - struct oom_control oc = { - .zonelist = NULL, - .nodemask = NULL, - .memcg = NULL, - .gfp_mask = 0, - .order = 0, - }; + static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (mem_cgroup_oom_synchronize(true)) return; @@ -1140,10 +1136,8 @@ void pagefault_out_of_memory(void) if (fatal_signal_pending(current)) return; - if (!mutex_trylock(&oom_lock)) - return; - out_of_memory(&oc); - mutex_unlock(&oom_lock); + if (__ratelimit(&pfoom_rs)) + pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); } SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) From a4ebf1b6ca1e011289677239a2a361fde4a88076 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:38:09 -0700 Subject: [PATCH 33/57] memcg: prohibit unconditional exceeding the limit of dying tasks Memory cgroup charging allows killed or exiting tasks to exceed the hard limit. It is assumed that the amount of the memory charged by those tasks is bound and most of the memory will get released while the task is exiting. This is resembling a heuristic for the global OOM situation when tasks get access to memory reserves. There is no global memory shortage at the memcg level so the memcg heuristic is more relieved. The above assumption is overly optimistic though. E.g. vmalloc can scale to really large requests and the heuristic would allow that. We used to have an early break in the vmalloc allocator for killed tasks but this has been reverted by commit b8c8a338f75e ("Revert "vmalloc: back off when the current task is killed""). There are likely other similar code paths which do not check for fatal signals in an allocation&charge loop. Also there are some kernel objects charged to a memcg which are not bound to a process life time. It has been observed that it is not really hard to trigger these bypasses and cause global OOM situation. One potential way to address these runaways would be to limit the amount of excess (similar to the global OOM with limited oom reserves). This is certainly possible but it is not really clear how much of an excess is desirable and still protects from global OOMs as that would have to consider the overall memcg configuration. This patch is addressing the problem by removing the heuristic altogether. Bypass is only allowed for requests which either cannot fail or where the failure is not desirable while excess should be still limited (e.g. atomic requests). Implementation wise a killed or dying task fails to charge if it has passed the OOM killer stage. That should give all forms of reclaim chance to restore the limit before the failure (ENOMEM) and tell the caller to back off. In addition, this patch renames should_force_charge() helper to task_is_dying() because now its use is not associated witch forced charging. This patch depends on pagefault_out_of_memory() to not trigger out_of_memory(), because then a memcg failure can unwind to VM_FAULT_OOM and cause a global OOM killer. Link: https://lkml.kernel.org/r/8f5cebbb-06da-4902-91f0-6566fc4b4203@virtuozzo.com Signed-off-by: Vasily Averin Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Vlastimil Babka Cc: Shakeel Butt Cc: Mel Gorman Cc: Tetsuo Handa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b4a17b7a10d3..cf0321d7a784 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -234,7 +234,7 @@ enum res_type { iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -static inline bool should_force_charge(void) +static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || (current->flags & PF_EXITING); @@ -1624,7 +1624,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = should_force_charge() || out_of_memory(&oc); + ret = task_is_dying() || out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); @@ -2579,6 +2579,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; enum oom_status oom_status; unsigned long nr_reclaimed; + bool passed_oom = false; bool may_swap = true; bool drained = false; unsigned long pflags; @@ -2613,15 +2614,6 @@ retry: if (gfp_mask & __GFP_ATOMIC) goto force; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(should_force_charge())) - goto force; - /* * Prevent unbounded recursion when reclaim operations need to * allocate memory. This might exceed the limits temporarily, @@ -2679,8 +2671,9 @@ retry: if (gfp_mask & __GFP_RETRY_MAYFAIL) goto nomem; - if (fatal_signal_pending(current)) - goto force; + /* Avoid endless loop for tasks bypassed by the oom killer */ + if (passed_oom && task_is_dying()) + goto nomem; /* * keep retrying as long as the memcg oom killer is able to make @@ -2689,14 +2682,10 @@ retry: */ oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); - switch (oom_status) { - case OOM_SUCCESS: + if (oom_status == OOM_SUCCESS) { + passed_oom = true; nr_retries = MAX_RECLAIM_RETRIES; goto retry; - case OOM_FAILED: - goto force; - default: - goto nomem; } nomem: if (!(gfp_mask & __GFP_NOFAIL)) From 7866076b924ad4f285bd4596630a8ca7b8333319 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Fri, 5 Nov 2021 13:38:12 -0700 Subject: [PATCH 34/57] mm/mmap.c: fix a data race of mm->total_vm The variable mm->total_vm could be accessed concurrently during mmaping and system accounting as noticed by KCSAN, BUG: KCSAN: data-race in __acct_update_integrals / mmap_region read-write to 0xffffa40267bd14c8 of 8 bytes by task 15609 on cpu 3: mmap_region+0x6dc/0x1400 do_mmap+0x794/0xca0 vm_mmap_pgoff+0xdf/0x150 ksys_mmap_pgoff+0xe1/0x380 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 read to 0xffffa40267bd14c8 of 8 bytes by interrupt on cpu 2: __acct_update_integrals+0x187/0x1d0 acct_account_cputime+0x3c/0x40 update_process_times+0x5c/0x150 tick_sched_timer+0x184/0x210 __run_hrtimer+0x119/0x3b0 hrtimer_interrupt+0x350/0xaa0 __sysvec_apic_timer_interrupt+0x7b/0x220 asm_call_irq_on_stack+0x12/0x20 sysvec_apic_timer_interrupt+0x4d/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 smp_call_function_single+0x192/0x2b0 perf_install_in_context+0x29b/0x4a0 __se_sys_perf_event_open+0x1a98/0x2550 __x64_sys_perf_event_open+0x63/0x70 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported by Kernel Concurrency Sanitizer on: CPU: 2 PID: 15610 Comm: syz-executor.3 Not tainted 5.10.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 In vm_stat_account which called by mmap_region, increase total_vm, and __acct_update_integrals may read total_vm at the same time. This will cause a data race which lead to undefined behaviour. To avoid potential bad read/write, volatile property and barrier are both used to avoid undefined behaviour. Link: https://lkml.kernel.org/r/20210913105550.1569419-1-liupeng256@huawei.com Signed-off-by: Peng Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/tsacct.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 257ffb993ea2..f00de83d0246 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk, * the rest of the math is done in xacct_add_tsk. */ tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; + tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10; } /** diff --git a/mm/mmap.c b/mm/mmap.c index 88dcc5c25225..b22a07f5e761 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3332,7 +3332,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) { - mm->total_vm += npages; + WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages); if (is_exec_mapping(flags)) mm->exec_vm += npages; From f1dc0db296bd25960273649fc6ef2ecbf5aaa0e0 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 5 Nov 2021 13:38:15 -0700 Subject: [PATCH 35/57] mm: use __pfn_to_section() instead of open coding it It is defined in the same file just a few lines above. Link: https://lkml.kernel.org/r/4598487.Rc0NezkW7i@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6a1d79d84675..832aa49d0d8e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1481,7 +1481,7 @@ static inline int pfn_valid(unsigned long pfn) if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - ms = __nr_to_section(pfn_to_section_nr(pfn)); + ms = __pfn_to_section(pfn); if (!valid_section(ms)) return 0; /* @@ -1496,7 +1496,7 @@ static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; - return present_section(__nr_to_section(pfn_to_section_nr(pfn))); + return present_section(__pfn_to_section(pfn)); } static inline unsigned long next_present_section_nr(unsigned long section_nr) From b063e374e7ae75589a36f4bcbfb47956ac197c57 Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Fri, 5 Nov 2021 13:38:18 -0700 Subject: [PATCH 36/57] mm/memory.c: avoid unnecessary kernel/user pointer conversion Annotating a pointer from __user to kernel and then back again might confuse sparse. In copy_huge_page_from_user() it can be avoided by removing the intermediate variable since it is never used. Link: https://lkml.kernel.org/r/20210914150820.19326-1-amit.kachhap@arm.com Signed-off-by: Amit Daniel Kachhap Acked-by: Kirill A. Shutemov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c52be6d6b605..1b7032a30dd5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5421,7 +5421,6 @@ long copy_huge_page_from_user(struct page *dst_page, unsigned int pages_per_huge_page, bool allow_pagefault) { - void *src = (void *)usr_src; void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; @@ -5434,8 +5433,7 @@ long copy_huge_page_from_user(struct page *dst_page, else page_kaddr = kmap_atomic(subpage); rc = copy_from_user(page_kaddr, - (const void __user *)(src + i * PAGE_SIZE), - PAGE_SIZE); + usr_src + i * PAGE_SIZE, PAGE_SIZE); if (allow_pagefault) kunmap(subpage); else From 9ae0f87d009ca6c4aab2882641ddfc319727e3db Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:24 -0700 Subject: [PATCH 37/57] mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte Patch series "mm: A few cleanup patches around zap, shmem and uffd", v4. IMHO all of them are very nice cleanups to existing code already, they're all small and self-contained. They'll be needed by uffd-wp coming series. This patch (of 4): It was conditionally done previously, as there's one shmem special case that we use SetPageDirty() instead. However that's not necessary and it should be easier and cleaner to do it unconditionally in mfill_atomic_install_pte(). The most recent discussion about this is here, where Hugh explained the history of SetPageDirty() and why it's possible that it's not required at all: https://lore.kernel.org/lkml/alpine.LSU.2.11.2104121657050.1097@eggly.anvils/ Currently mfill_atomic_install_pte() has three callers: 1. shmem_mfill_atomic_pte 2. mcopy_atomic_pte 3. mcontinue_atomic_pte After the change: case (1) should have its SetPageDirty replaced by the dirty bit on pte (so we unify them together, finally), case (2) should have no functional change at all as it has page_in_cache==false, case (3) may add a dirty bit to the pte. However since case (3) is UFFDIO_CONTINUE for shmem, it's merely 100% sure the page is dirty after all because UFFDIO_CONTINUE normally requires another process to modify the page cache and kick the faulted thread, so should not make a real difference either. This should make it much easier to follow on which case will set dirty for uffd, as we'll simply set it all now for all uffd related ioctls. Meanwhile, no special handling of SetPageDirty() if there's no need. Link: https://lkml.kernel.org/r/20210915181456.10739-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20210915181456.10739-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Liam Howlett Cc: Mike Rapoport Cc: Yang Shi Cc: David Hildenbrand Cc: "Kirill A . Shutemov" Cc: Jerome Glisse Cc: Alistair Popple Cc: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 - mm/userfaultfd.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7cd976b588ff..df2f0288b9ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2423,7 +2423,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - SetPageDirty(page); unlock_page(page); return 0; out_delete_from_cache: diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7a9008415534..caf6dfff2a60 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, pgoff_t offset, max_off; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + _dst_pte = pte_mkdirty(_dst_pte); if (page_in_cache && !vm_shared) writable = false; - if (writable || !page_in_cache) - _dst_pte = pte_mkdirty(_dst_pte); if (writable) { if (wp_copy) _dst_pte = pte_mkuffd_wp(_dst_pte); From 2ca99358671ad3a2065389476701f69f39ab5e5f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:28 -0700 Subject: [PATCH 38/57] mm: clear vmf->pte after pte_unmap_same() returns pte_unmap_same() will always unmap the pte pointer. After the unmap, vmf->pte will not be valid any more, we should clear it. It was safe only because no one is accessing vmf->pte after pte_unmap_same() returns, since the only caller of pte_unmap_same() (so far) is do_swap_page(), where vmf->pte will in most cases be overwritten very soon. Directly pass in vmf into pte_unmap_same() and then we can also avoid the long parameter list too, which should be a nice cleanup. Link: https://lkml.kernel.org/r/20210915181533.11188-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Liam Howlett Acked-by: Hugh Dickins Cc: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1b7032a30dd5..cf7b059b57a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2724,19 +2724,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range); * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ -static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, - pte_t *page_table, pte_t orig_pte) +static inline int pte_unmap_same(struct vm_fault *vmf) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { - spinlock_t *ptl = pte_lockptr(mm, pmd); + spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(ptl); - same = pte_same(*page_table, orig_pte); + same = pte_same(*vmf->pte, vmf->orig_pte); spin_unlock(ptl); } #endif - pte_unmap(page_table); + pte_unmap(vmf->pte); + vmf->pte = NULL; return same; } @@ -3488,7 +3488,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vm_fault_t ret = 0; void *shadow = NULL; - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (!pte_unmap_same(vmf)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); From 232a6a1c0619d1b4d9cd8d21949b2f13821be0af Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:31 -0700 Subject: [PATCH 39/57] mm: drop first_index/last_index in zap_details The first_index/last_index parameters in zap_details are actually only used in unmap_mapping_range_tree(). At the meantime, this function is only called by unmap_mapping_pages() once. Instead of passing these two variables through the whole stack of page zapping code, remove them from zap_details and let them simply be parameters of unmap_mapping_range_tree(), which is inlined. Link: https://lkml.kernel.org/r/20210915181535.11238-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Reviewed-by: Liam Howlett Acked-by: Hugh Dickins Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- mm/memory.c | 31 ++++++++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 32b2ecc47408..c5b600d7f85b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1688,8 +1688,6 @@ extern void user_shm_unlock(size_t, struct ucounts *); */ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ struct page *single_page; /* Locked page to be unmapped */ }; diff --git a/mm/memory.c b/mm/memory.c index cf7b059b57a7..d4ed701e3e5d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3321,20 +3321,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, + pgoff_t first_index, + pgoff_t last_index, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; - vma_interval_tree_foreach(vma, root, - details->first_index, details->last_index) { - + vma_interval_tree_foreach(vma, root, first_index, last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; - zba = details->first_index; + zba = first_index; if (zba < vba) zba = vba; - zea = details->last_index; + zea = last_index; if (zea > vea) zea = vea; @@ -3360,18 +3360,22 @@ void unmap_mapping_page(struct page *page) { struct address_space *mapping = page->mapping; struct zap_details details = { }; + pgoff_t first_index; + pgoff_t last_index; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageTail(page)); + first_index = page->index; + last_index = page->index + thp_nr_pages(page) - 1; + details.check_mapping = mapping; - details.first_index = page->index; - details.last_index = page->index + thp_nr_pages(page) - 1; details.single_page = page; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); i_mmap_unlock_write(mapping); } @@ -3391,16 +3395,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; + pgoff_t first_index = start; + pgoff_t last_index = start + nr - 1; details.check_mapping = even_cows ? NULL : mapping; - details.first_index = start; - details.last_index = start + nr - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; + if (last_index < first_index) + last_index = ULONG_MAX; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); + unmap_mapping_range_tree(&mapping->i_mmap, first_index, + last_index, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL_GPL(unmap_mapping_pages); From 91b61ef333cf43f96b3522a086c9ac925763d6e5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Nov 2021 13:38:34 -0700 Subject: [PATCH 40/57] mm: add zap_skip_check_mapping() helper Use the helper for the checks. Rename "check_mapping" into "zap_mapping" because "check_mapping" looks like a bool but in fact it stores the mapping itself. When it's set, we check the mapping (it must be non-NULL). When it's cleared we skip the check, which works like the old way. Move the duplicated comments to the helper too. Link: https://lkml.kernel.org/r/20210915181538.11288-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Alistair Popple Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jerome Glisse Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Rapoport Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 +++++++++++++++- mm/memory.c | 29 ++++++----------------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c5b600d7f85b..6e29deb1e0d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1687,10 +1687,24 @@ extern void user_shm_unlock(size_t, struct ucounts *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct address_space *check_mapping; /* Check page->mapping if set */ + struct address_space *zap_mapping; /* Check page->mapping if set */ struct page *single_page; /* Locked page to be unmapped */ }; +/* + * We set details->zap_mappings when we want to unmap shared but keep private + * pages. Return true if skip zapping this page, false otherwise. + */ +static inline bool +zap_skip_check_mapping(struct zap_details *details, struct page *page) +{ + if (!details || !page) + return false; + + return details->zap_mapping && + (details->zap_mapping != page_rmapping(page)); +} + struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index d4ed701e3e5d..48b2e048d267 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1333,16 +1333,8 @@ again: struct page *page; page = vm_normal_page(vma, addr, ptent); - if (unlikely(details) && page) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping && - details->check_mapping != page_rmapping(page)) - continue; - } + if (unlikely(zap_skip_check_mapping(details, page))) + continue; ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); @@ -1375,17 +1367,8 @@ again: is_device_exclusive_entry(entry)) { struct page *page = pfn_swap_entry_to_page(entry); - if (unlikely(details && details->check_mapping)) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping != - page_rmapping(page)) - continue; - } - + if (unlikely(zap_skip_check_mapping(details, page))) + continue; pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; @@ -3369,7 +3352,7 @@ void unmap_mapping_page(struct page *page) first_index = page->index; last_index = page->index + thp_nr_pages(page) - 1; - details.check_mapping = mapping; + details.zap_mapping = mapping; details.single_page = page; i_mmap_lock_write(mapping); @@ -3398,7 +3381,7 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t first_index = start; pgoff_t last_index = start + nr - 1; - details.check_mapping = even_cows ? NULL : mapping; + details.zap_mapping = even_cows ? NULL : mapping; if (last_index < first_index) last_index = ULONG_MAX; From 03c4f20454e0231d2cdec4373841a3a25cf4efed Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 5 Nov 2021 13:38:38 -0700 Subject: [PATCH 41/57] mm: introduce pmd_install() helper Patch series "Do some code cleanups related to mm", v3. This patch (of 2): Currently we have three times the same few lines repeated in the code. Deduplicate them by newly introduced pmd_install() helper. Link: https://lkml.kernel.org/r/20210901102722.47686-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20210901102722.47686-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: Kirill A. Shutemov Cc: Thomas Gleixner Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Mika Penttila Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 ++--------- mm/internal.h | 1 + mm/memory.c | 34 ++++++++++++++++------------------ 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 938f52702a7a..d05f8013a4f0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3211,15 +3211,8 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) } } - if (pmd_none(*vmf->pmd)) { - vmf->ptl = pmd_lock(mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(mm); - pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } + if (pmd_none(*vmf->pmd)) + pmd_install(mm, vmf->pmd, &vmf->prealloc_pte); /* See comment in handle_pte_fault() */ if (pmd_devmap_trans_unstable(vmf->pmd)) { diff --git a/mm/internal.h b/mm/internal.h index cf3cb933eba3..6c3e1a9f8c5a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -38,6 +38,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); static inline bool can_madv_lru_vma(struct vm_area_struct *vma) { diff --git a/mm/memory.c b/mm/memory.c index 48b2e048d267..38d63f6d998b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -433,9 +433,20 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } +void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) +{ + spinlock_t *ptl = pmd_lock(mm, pmd); + + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + mm_inc_nr_ptes(mm); + pmd_populate(mm, pmd, *pte); + *pte = NULL; + } + spin_unlock(ptl); +} + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { - spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -455,13 +466,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - ptl = pmd_lock(mm, pmd); - if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm_inc_nr_ptes(mm); - pmd_populate(mm, pmd, new); - new = NULL; - } - spin_unlock(ptl); + pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); return 0; @@ -4024,17 +4029,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf) return ret; } - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (likely(pmd_none(*vmf->pmd))) { - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - vmf->prealloc_pte = NULL; - } - spin_unlock(vmf->ptl); - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { + if (vmf->prealloc_pte) + pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte); + else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) return VM_FAULT_OOM; - } } /* See comment in handle_pte_fault() */ From ed33b5a677da33d6e8f959879bb61e9791b80354 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 5 Nov 2021 13:38:41 -0700 Subject: [PATCH 42/57] mm: remove redundant smp_wmb() The smp_wmb() which is in the __pte_alloc() is used to ensure all ptes setup is visible before the pte is made visible to other CPUs by being put into page tables. We only need this when the pte is actually populated, so move it to pmd_install(). __pte_alloc_kernel(), __p4d_alloc(), __pud_alloc() and __pmd_alloc() are similar to this case. We can also defer smp_wmb() to the place where the pmd entry is really populated by preallocated pte. There are two kinds of user of preallocated pte, one is filemap & finish_fault(), another is THP. The former does not need another smp_wmb() because the smp_wmb() has been done by pmd_install(). Fortunately, the latter also does not need another smp_wmb() because there is already a smp_wmb() before populating the new pte when the THP uses a preallocated pte to split a huge pmd. Link: https://lkml.kernel.org/r/20210901102722.47686-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Acked-by: Kirill A. Shutemov Cc: Johannes Weiner Cc: Michal Hocko Cc: Mika Penttila Cc: Thomas Gleixner Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 52 ++++++++++++++++++++------------------------- mm/sparse-vmemmap.c | 2 +- 2 files changed, 24 insertions(+), 30 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 38d63f6d998b..5db36280950a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -439,6 +439,20 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); + /* + * Ensure all pte setup (eg. pte page lock and page clearing) are + * visible before the pte is made visible to other CPUs by being + * put into page tables. + * + * The other side of the story is the pointer chasing in the page + * table walking code (when walking the page table without locking; + * ie. most of the time). Fortunately, these data accesses consist + * of a chain of data-dependent loads, meaning most CPUs (alpha + * being the notable exception) will already guarantee loads are + * seen in-order. See the alpha page table accessors for the + * smp_rmb() barriers in page table walking code. + */ + smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ pmd_populate(mm, pmd, *pte); *pte = NULL; } @@ -451,21 +465,6 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) if (!new) return -ENOMEM; - /* - * Ensure all pte setup (eg. pte page lock and page clearing) are - * visible before the pte is made visible to other CPUs by being - * put into page tables. - * - * The other side of the story is the pointer chasing in the page - * table walking code (when walking the page table without locking; - * ie. most of the time). Fortunately, these data accesses consist - * of a chain of data-dependent loads, meaning most CPUs (alpha - * being the notable exception) will already guarantee loads are - * seen in-order. See the alpha page table accessors for the - * smp_rmb() barriers in page table walking code. - */ - smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - pmd_install(mm, pmd, &new); if (new) pte_free(mm, new); @@ -478,10 +477,9 @@ int __pte_alloc_kernel(pmd_t *pmd) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ + smp_wmb(); /* See comment in pmd_install() */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } @@ -3845,7 +3843,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } ret = vma->vm_ops->fault(vmf); @@ -3916,7 +3913,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -4141,7 +4137,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; - smp_wmb(); /* See comment in __pte_alloc() */ } return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); @@ -4815,13 +4810,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) /* Another has populated it */ + if (pgd_present(*pgd)) { /* Another has populated it */ p4d_free(mm, new); - else + } else { + smp_wmb(); /* See comment in pmd_install() */ pgd_populate(mm, pgd, new); + } spin_unlock(&mm->page_table_lock); return 0; } @@ -4838,11 +4833,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); @@ -4863,14 +4857,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!new) return -ENOMEM; - smp_wmb(); /* See comment in __pte_alloc */ - ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); - } else /* Another has populated it */ + } else { /* Another has populated it */ pmd_free(mm, new); + } spin_unlock(ptl); return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bdce883f9286..db6df27c852a 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); } - /* Make pte visible before pmd. See comment in __pte_alloc(). */ + /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); From cbbb69d3c432da9d4afe734ca451fa2c012c05e2 Mon Sep 17 00:00:00 2001 From: Tiberiu A Georgescu Date: Fri, 5 Nov 2021 13:38:44 -0700 Subject: [PATCH 43/57] Documentation: update pagemap with shmem exceptions This patch follows the discussions on previous documentation patch threads [1][2]. It presents the exception case of shared memory management from the pagemap's point of view. It briefly describes what is missing, why it is missing and alternatives to the pagemap for page info retrieval in user space. In short, the kernel does not keep track of PTEs for swapped out shared pages within the processes that references them. Thus, the proc/pid/pagemap tool cannot print the swap destination of the shared memory pages, instead setting the pagemap entry to zero for both non-allocated and swapped out pages. This can create confusion for users who need information on swapped out pages. The reasons why maintaining the PTEs of all swapped out shared pages among all processes while maintaining similar performance is not a trivial task, or a desirable change, have been discussed extensively [1][3][4][5]. There are also arguments for why this arguably missing information should eventually be exposed to the user in either a future pagemap patch, or by an alternative tool. [1]: https://marc.info/?m=162878395426774 [2]: https://lore.kernel.org/lkml/20210920164931.175411-1-tiberiu.georgescu@nutanix.com/ [3]: https://lore.kernel.org/lkml/20210730160826.63785-1-tiberiu.georgescu@nutanix.com/ [4]: https://lore.kernel.org/lkml/20210807032521.7591-1-peterx@redhat.com/ [5]: https://lore.kernel.org/lkml/20210715201651.212134-1-peterx@redhat.com/ Mention the current missing information in the pagemap and alternatives on how to retrieve it, in case someone stumbles upon unexpected behaviour. Link: https://lkml.kernel.org/r/20210923064618.157046-1-tiberiu.georgescu@nutanix.com Link: https://lkml.kernel.org/r/20210923064618.157046-2-tiberiu.georgescu@nutanix.com Signed-off-by: Tiberiu A Georgescu Reviewed-by: Ivan Teterevkov Reviewed-by: Florian Schmidt Reviewed-by: Carl Waldspurger Reviewed-by: Jonathan Davies Reviewed-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index fb578fbbb76c..4581527c07ae 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -196,6 +196,28 @@ you can go through every map in the process, find the PFNs, look those up in kpagecount, and tally up the number of pages that are only referenced once. +Exceptions for Shared Memory +============================ + +Page table entries for shared pages are cleared when the pages are zapped or +swapped out. This makes swapped out pages indistinguishable from never-allocated +ones. + +In kernel space, the swap location can still be retrieved from the page cache. +However, values stored only on the normal PTE get lost irretrievably when the +page is swapped out (i.e. SOFT_DIRTY). + +In user space, whether the page is present, swapped or none can be deduced with +the help of lseek and/or mincore system calls. + +lseek() can differentiate between accessed pages (present or swapped out) and +holes (none/non-allocated) by specifying the SEEK_DATA flag on the file where +the pages are backed. For anonymous shared pages, the file can be found in +``/proc/pid/map_files/``. + +mincore() can differentiate between pages in memory (present, including swap +cache) and out of memory (swapped out or none/non-allocated). + Other notes =========== From e26e0cc30b48cad5f2704f6a22fa5cac9fdaaece Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 5 Nov 2021 13:39:00 -0700 Subject: [PATCH 44/57] memory: remove unused CONFIG_MEM_BLOCK_SIZE Commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") defines CONFIG_MEM_BLOCK_SIZE, but this has never been utilized anywhere. It is a good practice to keep the CONFIG_* defines exclusively for the Kbuild system. So, drop this unused definition. This issue was noticed due to running ./scripts/checkkconfigsymbols.py. Link: https://lkml.kernel.org/r/20211006120354.7468-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index 182c606adb06..053a530c7bdd 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -140,7 +140,6 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); -#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< Date: Fri, 5 Nov 2021 13:39:03 -0700 Subject: [PATCH 45/57] mm/mprotect.c: avoid repeated assignment in do_mprotect_pkey() After adjustment, the repeated assignment of "prev" is avoided, and the readability of the code is improved. Link: https://lkml.kernel.org/r/20211012152444.4127-1-fishland@aliyun.com Reviewed-by: Andrew Morton Signed-off-by: Liu Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 883e2cc85cad..e552f5e0ccbd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, error = -ENOMEM; if (!vma) goto out; - prev = vma->vm_prev; + if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; @@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, goto out; } } + if (start > vma->vm_start) prev = vma; + else + prev = vma->vm_prev; for (nstart = start ; ; ) { unsigned long mask_off_old_flags; From fdbef61491359947753c13581098878e8d038286 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 5 Nov 2021 13:39:06 -0700 Subject: [PATCH 46/57] mm/mremap: don't account pages in vma_to_resize() All this vm_unacct_memory(charged) dance seems to complicate the life without a good reason. Furthermore, it seems not always done right on error-pathes in mremap_to(). And worse than that: this `charged' difference is sometimes double-accounted for growing MREMAP_DONTUNMAP mremap()s in move_vma(): if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) Let's not do this. Account memory in mremap() fast-path for growing VMAs or in move_vma() for actually moving things. The same simpler way as it's done by vm_stat_account(), but with a difference to call security_vm_enough_memory_mm() before copying/adjusting VMA. Originally noticed by Chen Wandun: https://lkml.kernel.org/r/20210717101942.120607-1-chenwandun@huawei.com Link: https://lkml.kernel.org/r/20210721131320.522061-1-dima@arista.com Fixes: e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov Acked-by: Brian Geffon Cc: Alexander Viro Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Chen Wandun Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kefeng Wang Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Wei Yongjun Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index badfe17ade1f..c0b6c41b7b78 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -565,6 +565,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, bool *locked, unsigned long flags, struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap) { + long to_account = new_len - old_len; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; unsigned long vm_flags = vma->vm_flags; @@ -583,6 +584,9 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (unlikely(flags & MREMAP_DONTUNMAP)) + to_account = new_len; + if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); @@ -604,8 +608,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { - if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + if (vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT)) return -ENOMEM; } @@ -613,8 +617,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); if (!new_vma) { - if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) - vm_unacct_memory(new_len >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT) + vm_unacct_memory(to_account >> PAGE_SHIFT); return -ENOMEM; } @@ -708,8 +712,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } static struct vm_area_struct *vma_to_resize(unsigned long addr, - unsigned long old_len, unsigned long new_len, unsigned long flags, - unsigned long *p) + unsigned long old_len, unsigned long new_len, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -768,13 +771,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); - if (vma->vm_flags & VM_ACCOUNT) { - unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return ERR_PTR(-ENOMEM); - *p = charged; - } - return vma; } @@ -787,7 +783,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; unsigned long map_flags = 0; if (offset_in_page(new_addr)) @@ -830,7 +825,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -853,7 +848,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); if (IS_ERR_VALUE(ret)) - goto out1; + goto out; /* We got a new mapping */ if (!(flags & MREMAP_FIXED)) @@ -862,12 +857,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, uf_unmap); - if (!(offset_in_page(ret))) - goto out; - -out1: - vm_unacct_memory(charged); - out: return ret; } @@ -899,7 +888,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; - unsigned long charged = 0; bool locked = false; bool downgraded = false; struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; @@ -981,7 +969,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, flags, &charged); + vma = vma_to_resize(addr, old_len, new_len, flags); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out; @@ -992,10 +980,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { - int pages = (new_len - old_len) >> PAGE_SHIFT; + long pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma->vm_flags & VM_ACCOUNT) { + if (security_vm_enough_memory_mm(mm, pages)) { + ret = -ENOMEM; + goto out; + } + } if (vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL)) { + vm_unacct_memory(pages); ret = -ENOMEM; goto out; } @@ -1034,10 +1030,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, &locked, flags, &uf, &uf_unmap); } out: - if (offset_in_page(ret)) { - vm_unacct_memory(charged); + if (offset_in_page(ret)) locked = false; - } if (downgraded) mmap_read_unlock(current->mm); else From 2e86f78b117a019881a420705a9d0ef126253083 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Fri, 5 Nov 2021 13:39:10 -0700 Subject: [PATCH 47/57] include/linux/io-mapping.h: remove fallback for writecombine The fallback was introduced in commit 80c33624e472 ("io-mapping: Fixup for different names of writecombine") to fix the build on microblaze. 5 years later, it seems all archs now provide a pgprot_writecombine(), so just remove the other possible fallbacks. For microblaze, pgprot_writecombine() is available since commit 97ccedd793ac ("microblaze: Provide pgprot_device/writecombine macros for nommu"). This is build-tested on microblaze with a hack to always build mm/io-mapping.o and without DIYing on an x86-only macro (_PAGE_CACHE_MASK) Link: https://lkml.kernel.org/r/20211020204838.1142908-1-lucas.demarchi@intel.com Signed-off-by: Lucas De Marchi Cc: Chris Wilson Cc: Daniel Vetter Cc: Joonas Lahtinen Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index e9743cfd8585..66a774d2710e 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -132,13 +132,7 @@ io_mapping_init_wc(struct io_mapping *iomap, iomap->base = base; iomap->size = size; -#if defined(pgprot_noncached_wc) /* archs can't agree on a name ... */ - iomap->prot = pgprot_noncached_wc(PAGE_KERNEL); -#elif defined(pgprot_writecombine) iomap->prot = pgprot_writecombine(PAGE_KERNEL); -#else - iomap->prot = pgprot_noncached(PAGE_KERNEL); -#endif return iomap; } From f595e3411dcb664844eab807dfef61619eb9cf39 Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 5 Nov 2021 13:39:13 -0700 Subject: [PATCH 48/57] mm: mmap_lock: remove redundant newline in TP_printk Ftrace core will add newline automatically on printing, so using it in TP_printkcreates a blank line. Link: https://lkml.kernel.org/r/20211009071105.69544-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Acked-by: Vlastimil Babka Reviewed-by: Steven Rostedt (VMware) Cc: Ingo Molnar Cc: Axel Rasmussen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmap_lock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index 0abff67b96f0..5f980c92e3e9 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -32,7 +32,7 @@ TRACE_EVENT_FN(mmap_lock_start_locking, ), TP_printk( - "mm=%p memcg_path=%s write=%s\n", + "mm=%p memcg_path=%s write=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" @@ -63,7 +63,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, ), TP_printk( - "mm=%p memcg_path=%s write=%s success=%s\n", + "mm=%p memcg_path=%s write=%s success=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false", @@ -92,7 +92,7 @@ TRACE_EVENT_FN(mmap_lock_released, ), TP_printk( - "mm=%p memcg_path=%s write=%s\n", + "mm=%p memcg_path=%s write=%s", __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" From 627ae8284f50f220e8285119ca1716069c1c6a4d Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 5 Nov 2021 13:39:16 -0700 Subject: [PATCH 49/57] mm: mmap_lock: use DECLARE_EVENT_CLASS and DEFINE_EVENT_FN By using DECLARE_EVENT_CLASS and TRACE_EVENT_FN, we can save a lot of space from duplicate code. Link: https://lkml.kernel.org/r/20211009071243.70286-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Acked-by: Vlastimil Babka Reviewed-by: Steven Rostedt (VMware) Cc: Axel Rasmussen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/mmap_lock.h | 44 +++++++++----------------------- 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index 5f980c92e3e9..14db8044c1ff 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -13,7 +13,7 @@ struct mm_struct; extern int trace_mmap_lock_reg(void); extern void trace_mmap_lock_unreg(void); -TRACE_EVENT_FN(mmap_lock_start_locking, +DECLARE_EVENT_CLASS(mmap_lock, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), @@ -36,11 +36,19 @@ TRACE_EVENT_FN(mmap_lock_start_locking, __entry->mm, __get_str(memcg_path), __entry->write ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg + ) ); +#define DEFINE_MMAP_LOCK_EVENT(name) \ + DEFINE_EVENT_FN(mmap_lock, name, \ + TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ + bool write), \ + TP_ARGS(mm, memcg_path, write), \ + trace_mmap_lock_reg, trace_mmap_lock_unreg) + +DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); +DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); + TRACE_EVENT_FN(mmap_lock_acquire_returned, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, @@ -73,34 +81,6 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, trace_mmap_lock_reg, trace_mmap_lock_unreg ); -TRACE_EVENT_FN(mmap_lock_released, - - TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), - - TP_ARGS(mm, memcg_path, write), - - TP_STRUCT__entry( - __field(struct mm_struct *, mm) - __string(memcg_path, memcg_path) - __field(bool, write) - ), - - TP_fast_assign( - __entry->mm = mm; - __assign_str(memcg_path, memcg_path); - __entry->write = write; - ), - - TP_printk( - "mm=%p memcg_path=%s write=%s", - __entry->mm, - __get_str(memcg_path), - __entry->write ? "true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg -); - #endif /* _TRACE_MMAP_LOCK_H */ /* This part must be outside protection */ From 228f778e973035185232ae745be0e3bc57dacea6 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:39:19 -0700 Subject: [PATCH 50/57] mm/vmalloc: repair warn_alloc()s in __vmalloc_area_node() Commit f255935b9767 ("mm: cleanup the gfp_mask handling in __vmalloc_area_node") added __GFP_NOWARN to gfp_mask unconditionally however it disabled all output inside warn_alloc() call. This patch saves original gfp_mask and provides it to all warn_alloc() calls. Link: https://lkml.kernel.org/r/f4f3187b-9684-e426-565d-827c2a9bbb0e@virtuozzo.com Fixes: f255935b9767 ("mm: cleanup the gfp_mask handling in __vmalloc_area_node") Signed-off-by: Vasily Averin Reviewed-by: Christoph Hellwig Reviewed-by: Muchun Song Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e8a807c78110..f43c88fa08cf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2887,6 +2887,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t orig_gfp_mask = gfp_mask; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; @@ -2907,7 +2908,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!area->pages) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to allocated page array size %lu", nr_small_pages * PAGE_SIZE, array_size); free_vm_area(area); @@ -2927,7 +2928,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via __vfree() if any. */ if (area->nr_pages != nr_small_pages) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, page order %u, failed to allocate pages", area->nr_pages * PAGE_SIZE, page_order); goto fail; @@ -2935,7 +2936,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (vmap_pages_range(addr, addr + size, prot, area->pages, page_shift) < 0) { - warn_alloc(gfp_mask, NULL, + warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); goto fail; From bd1a8fb2d43f7c293383f76691d7a55f7f89d9da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Nov 2021 13:39:22 -0700 Subject: [PATCH 51/57] mm/vmalloc: don't allow VM_NO_GUARD on vmap() The vmalloc guard pages are added on top of each allocation, thereby isolating any two allocations from one another. The top guard of the lower allocation is the bottom guard guard of the higher allocation etc. Therefore VM_NO_GUARD is dangerous; it breaks the basic premise of isolating separate allocations. There are only two in-tree users of this flag, neither of which use it through the exported interface. Ensure it stays this way. Link: https://lkml.kernel.org/r/YUMfdA36fuyZ+/xt@hirez.programming.kicks-ass.net Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Christoph Hellwig Reviewed-by: David Hildenbrand Acked-by: Will Deacon Acked-by: Kees Cook Cc: Andrey Konovalov Cc: Mel Gorman Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- mm/vmalloc.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0ed56fc10c11..6e022cc712e6 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,7 +22,7 @@ struct notifier_block; /* in notifier.h */ #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ #define VM_DMA_COHERENT 0x00000010 /* dma_alloc_coherent */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ -#define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_NO_GUARD 0x00000040 /* ***DANGEROUS*** don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_FLUSH_RESET_PERMS 0x00000100 /* reset direct map and flush TLB on unmap, can't be freed in atomic context */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f43c88fa08cf..4a11abd9e70f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2743,6 +2743,13 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + /* + * Your top guard is someone else's bottom guard. Not having a top + * guard compromises someone else's mappings too. + */ + if (WARN_ON_ONCE(flags & VM_NO_GUARD)) + flags &= ~VM_NO_GUARD; + if (count > totalram_pages()) return NULL; From 51e50b3a22937ab7b350f05af7e3b79b7ff73dd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 13:39:25 -0700 Subject: [PATCH 52/57] mm/vmalloc: make show_numa_info() aware of hugepage mappings show_numa_info() can be slightly faster, by skipping over hugepages directly. Link: https://lkml.kernel.org/r/20211001172725.105824-1-eric.dumazet@gmail.com Signed-off-by: Eric Dumazet Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4a11abd9e70f..6c1b97fd9b3d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3864,6 +3864,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) { if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; + unsigned int step = 1U << vm_area_page_order(v); if (!counters) return; @@ -3875,9 +3876,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) memset(counters, 0, nr_node_ids * sizeof(unsigned int)); - for (nr = 0; nr < v->nr_pages; nr++) - counters[page_to_nid(v->pages[nr])]++; - + for (nr = 0; nr < v->nr_pages; nr += step) + counters[page_to_nid(v->pages[nr])] += step; for_each_node_state(nr, N_HIGH_MEMORY) if (counters[nr]) seq_printf(m, " N%u=%u", nr, counters[nr]); From 7cc7913e8e61ac436497d01a64963770d1600f5d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Nov 2021 13:39:28 -0700 Subject: [PATCH 53/57] mm/vmalloc: make sure to dump unpurged areas in /proc/vmallocinfo If last va found in vmap_area_list does not have a vm pointer, vmallocinfo.s_show() returns 0, and show_purge_info() is not called as it should. Link: https://lkml.kernel.org/r/20211001170815.73321-1-eric.dumazet@gmail.com Fixes: dd3b8353bae7 ("mm/vmalloc: do not keep unpurged areas in the busy tree") Signed-off-by: Eric Dumazet Cc: Uladzislau Rezki (Sony) Cc: Pengfei Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6c1b97fd9b3d..0a740fb055ec 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3913,7 +3913,7 @@ static int s_show(struct seq_file *m, void *p) (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); - return 0; + goto final; } v = va->vm; @@ -3954,6 +3954,7 @@ static int s_show(struct seq_file *m, void *p) /* * As a final step, dump "unpurged" areas. */ +final: if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); From 9f531973dff39c671219352573ad5df6d0d9a58c Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 5 Nov 2021 13:39:31 -0700 Subject: [PATCH 54/57] mm/vmalloc: do not adjust the search size for alignment overhead We used to include an alignment overhead into a search length, in that case we guarantee that a found area will definitely fit after applying a specific alignment that user specifies. From the other hand we do not guarantee that an area has the lowest address if an alignment is >= PAGE_SIZE. It means that, when a user specifies a special alignment together with a range that corresponds to an exact requested size then an allocation will fail. This is what happens to KASAN, it wants the free block that exactly matches a specified range during onlining memory banks: [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory82/state [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory83/state [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory85/state [root@vm-0 fedora]# echo online > /sys/devices/system/memory/memory84/state vmap allocation for size 16777216 failed: use vmalloc= to increase size bash: vmalloc: allocation failure: 16777216 bytes, mode:0x6000c0(GFP_KERNEL), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 4 PID: 1644 Comm: bash Kdump: loaded Not tainted 4.18.0-339.el8.x86_64+debug #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x8e/0xd0 warn_alloc.cold.90+0x8a/0x1b2 ? zone_watermark_ok_safe+0x300/0x300 ? slab_free_freelist_hook+0x85/0x1a0 ? __get_vm_area_node+0x240/0x2c0 ? kfree+0xdd/0x570 ? kmem_cache_alloc_node_trace+0x157/0x230 ? notifier_call_chain+0x90/0x160 __vmalloc_node_range+0x465/0x840 ? mark_held_locks+0xb7/0x120 Fix it by making sure that find_vmap_lowest_match() returns lowest start address with any given alignment value, i.e. for alignments bigger then PAGE_SIZE the algorithm rolls back toward parent nodes checking right sub-trees if the most left free block did not fit due to alignment overhead. Link: https://lkml.kernel.org/r/20211004142829.22222-1-urezki@gmail.com Fixes: 68ad4a330433 ("mm/vmalloc.c: keep track of free blocks for vmap allocation") Signed-off-by: Uladzislau Rezki (Sony) Reported-by: Ping Fang Tested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Mel Gorman Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Hillf Danton Cc: Michal Hocko Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0a740fb055ec..0d835bd7fb5f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1195,18 +1195,14 @@ find_vmap_lowest_match(unsigned long size, { struct vmap_area *va; struct rb_node *node; - unsigned long length; /* Start from the root. */ node = free_vmap_area_root.rb_node; - /* Adjust the search size for alignment overhead. */ - length = size + align - 1; - while (node) { va = rb_entry(node, struct vmap_area, rb_node); - if (get_subtree_max_size(node->rb_left) >= length && + if (get_subtree_max_size(node->rb_left) >= size && vstart < va->va_start) { node = node->rb_left; } else { @@ -1216,9 +1212,9 @@ find_vmap_lowest_match(unsigned long size, /* * Does not make sense to go deeper towards the right * sub-tree if it does not have a free block that is - * equal or bigger to the requested search length. + * equal or bigger to the requested search size. */ - if (get_subtree_max_size(node->rb_right) >= length) { + if (get_subtree_max_size(node->rb_right) >= size) { node = node->rb_right; continue; } @@ -1226,15 +1222,23 @@ find_vmap_lowest_match(unsigned long size, /* * OK. We roll back and find the first right sub-tree, * that will satisfy the search criteria. It can happen - * only once due to "vstart" restriction. + * due to "vstart" restriction or an alignment overhead + * that is bigger then PAGE_SIZE. */ while ((node = rb_parent(node))) { va = rb_entry(node, struct vmap_area, rb_node); if (is_within_this_va(va, size, align, vstart)) return va; - if (get_subtree_max_size(node->rb_right) >= length && + if (get_subtree_max_size(node->rb_right) >= size && vstart <= va->va_start) { + /* + * Shift the vstart forward. Please note, we update it with + * parent's start address adding "1" because we do not want + * to enter same sub-tree after it has already been checked + * and no suitable free block found there. + */ + vstart = va->va_start + 1; node = node->rb_right; break; } From 066fed59d8a1bab4f3213e8fe413c54e4a76b77a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 5 Nov 2021 13:39:34 -0700 Subject: [PATCH 55/57] mm/vmalloc: check various alignments when debugging Before we did not guarantee a free block with lowest start address for allocations with alignment >= PAGE_SIZE. Because an alignment overhead was included into a search length like below: length = size + align - 1; doing so we make sure that a bigger block would fit after applying an alignment adjustment. Now there is no such limitation, i.e. any alignment that user wants to apply will result to a lowest address of returned free area. Link: https://lkml.kernel.org/r/20211004142829.22222-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Hillf Danton Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Cc: Ping Fang Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0d835bd7fb5f..d4770396799a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1269,7 +1269,7 @@ find_vmap_lowest_linear_match(unsigned long size, } static void -find_vmap_lowest_match_check(unsigned long size) +find_vmap_lowest_match_check(unsigned long size, unsigned long align) { struct vmap_area *va_1, *va_2; unsigned long vstart; @@ -1278,8 +1278,8 @@ find_vmap_lowest_match_check(unsigned long size) get_random_bytes(&rnd, sizeof(rnd)); vstart = VMALLOC_START + rnd; - va_1 = find_vmap_lowest_match(size, 1, vstart); - va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + va_1 = find_vmap_lowest_match(size, align, vstart); + va_2 = find_vmap_lowest_linear_match(size, align, vstart); if (va_1 != va_2) pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", @@ -1458,7 +1458,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK - find_vmap_lowest_match_check(size); + find_vmap_lowest_match_check(size, align); #endif return nva_start_addr; From dd544141b9eb8f3c58dedc9c1dcc4803de0eed45 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 5 Nov 2021 13:39:37 -0700 Subject: [PATCH 56/57] vmalloc: back off when the current task is OOM-killed Huge vmalloc allocation on heavy loaded node can lead to a global memory shortage. Task called vmalloc can have worst badness and be selected by OOM-killer, however taken fatal signal does not interrupt allocation cycle. Vmalloc repeat page allocaions again and again, exacerbating the crisis and consuming the memory freed up by another killed tasks. After a successful completion of the allocation procedure, a fatal signal will be processed and task will be destroyed finally. However it may not release the consumed memory, since the allocated object may have a lifetime unrelated to the completed task. In the worst case, this can lead to the host will panic due to "Out of memory and no killable processes..." This patch allows OOM-killer to break vmalloc cycle, makes OOM more effective and avoid host panic. It does not check oom condition directly, however, and breaks page allocation cycle when fatal signal was received. This may trigger some hidden problems, when caller does not handle vmalloc failures, or when rollaback after failed vmalloc calls own vmallocs inside. However all of these scenarios are incorrect: vmalloc does not guarantee successful allocation, it has never been called with __GFP_NOFAIL and threfore either should not be used for any rollbacks or should handle such errors correctly and not lead to critical failures. Link: https://lkml.kernel.org/r/83efc664-3a65-2adb-d7c4-2885784cf109@virtuozzo.com Signed-off-by: Vasily Averin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d4770396799a..2cd23fa00a13 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2871,6 +2871,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, /* High-order pages or fallback path if "bulk" fails. */ while (nr_allocated < nr_pages) { + if (fatal_signal_pending(current)) + break; + if (nid == NUMA_NO_NODE) page = alloc_pages(gfp, order); else From 0eb68437a7f9dfef9c218873310c66c714f2fa99 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 5 Nov 2021 13:39:41 -0700 Subject: [PATCH 57/57] vmalloc: choose a better start address in vm_area_register_early() Percpu embedded first chunk allocator is the firstly option, but it could fail on ARM64, eg, percpu: max_distance=0x5fcfdc640000 too large for vmalloc space 0x781fefff0000 percpu: max_distance=0x600000540000 too large for vmalloc space 0x7dffb7ff0000 percpu: max_distance=0x5fff9adb0000 too large for vmalloc space 0x5dffb7ff0000 then we could get to WARNING: CPU: 15 PID: 461 at vmalloc.c:3087 pcpu_get_vm_areas+0x488/0x838 and the system cannot boot successfully. Let's implement page mapping percpu first chunk allocator as a fallback to the embedding allocator to increase the robustness of the system. Also fix a crash when both NEED_PER_CPU_PAGE_FIRST_CHUNK and KASAN_VMALLOC enabled. Tested on ARM64 qemu with cmdline "percpu_alloc=page". This patch (of 3): There are some fixed locations in the vmalloc area be reserved in ARM(see iotable_init()) and ARM64(see map_kernel()), but for pcpu_page_first_chunk(), it calls vm_area_register_early() and choose VMALLOC_START as the start address of vmap area which could be conflicted with above address, then could trigger a BUG_ON in vm_area_add_early(). Let's choose a suit start address by traversing the vmlist. Link: https://lkml.kernel.org/r/20210910053354.26721-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210910053354.26721-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Catalin Marinas Cc: Will Deacon Cc: Andrey Ryabinin Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Marco Elver Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2cd23fa00a13..74da45a8beec 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2276,15 +2276,21 @@ void __init vm_area_add_early(struct vm_struct *vm) */ void __init vm_area_register_early(struct vm_struct *vm, size_t align) { - static size_t vm_init_off __initdata; - unsigned long addr; + unsigned long addr = ALIGN(VMALLOC_START, align); + struct vm_struct *cur, **p; - addr = ALIGN(VMALLOC_START + vm_init_off, align); - vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; + BUG_ON(vmap_initialized); + for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { + if ((unsigned long)cur->addr - addr >= vm->size) + break; + addr = ALIGN((unsigned long)cur->addr + cur->size, align); + } + + BUG_ON(addr > VMALLOC_END - vm->size); vm->addr = (void *)addr; - - vm_area_add_early(vm); + vm->next = *p; + *p = vm; } static void vmap_init_free_space(void)