From a5b6040d5cb800d56a2d2c5d106e1213838eed4e Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Thu, 15 Feb 2024 10:27:54 -0800 Subject: [PATCH] BACKPORT: userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx Increments and loads to mmap_changing are always in mmap_lock critical section. This ensures that if userspace requests event notification for non-cooperative operations (e.g. mremap), userfaultfd operations don't occur concurrently. This can be achieved by using a separate read-write semaphore in userfaultfd_ctx such that increments are done in write-mode and loads in read-mode, thereby eliminating the dependency on mmap_lock for this purpose. This is a preparatory step before we replace mmap_lock usage with per-vma locks in fill/move ioctls. Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Brian Geffon Cc: David Hildenbrand Cc: Jann Horn Cc: Kalesh Singh Cc: Matthew Wilcox (Oracle) Cc: Nicolas Geoffray Cc: Peter Xu Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Tim Murray Signed-off-by: Andrew Morton (cherry picked from commit 5e4c24a57b0c126686534b5b159a406c5dd02400) Conflicts: fs/userfaultfd.c include/linux/userfaultfd_k.h mm/userfaultfd.c 1. Functions passing control from fs/userfaultfd.c to mm/userfaultfd.c were renamed after 6.1. a. Replace mfill_atomic_copy() with mcopy_atomic() b. Replace mfill_atomic_zeropage() with mfill_zeropage() c. Replace mfill_atomic_continue() with mcopy_continue() d. Replace mfill_atomic() with __mcopy_atomic() e. Replace mfill_atomic_hugetlb() with __mcopy_atomic_hugetlb() 2. uffd flags were unified into a single parameter after 6.1. Replace 'flags' with 'mcopy_mode' and 'mode'. 3. Fetch dst_mm from dst_vma in __mcopy_atomic_hugetlb(). Bug: 320478828 Change-Id: I77615c36a0c891801c9eb9de3609df4e7f125c39 Signed-off-by: Lokesh Gidra --- fs/userfaultfd.c | 35 +++++++++++-------- include/linux/userfaultfd_k.h | 27 ++++++++------- mm/userfaultfd.c | 64 +++++++++++++++++++---------------- 3 files changed, 71 insertions(+), 55 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e2fc9491048d..0767c8e4a059 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -650,12 +650,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->flags = octx->flags; ctx->features = octx->features; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + down_write(&octx->map_changing_lock); atomic_inc(&octx->mmap_changing); + up_write(&octx->map_changing_lock); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -702,7 +705,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); } else { /* Drop uffd context if remap feature not enabled */ vma_start_write(vma); @@ -748,7 +753,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -790,7 +797,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start, return -ENOMEM; userfaultfd_ctx_get(ctx); + down_write(&ctx->map_changing_lock); atomic_inc(&ctx->mmap_changing); + up_write(&ctx->map_changing_lock); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1708,9 +1717,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len, &ctx->mmap_changing, - uffdio_copy.mode); + ret = mcopy_atomic(ctx, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len, uffdio_copy.mode); mmput(ctx->mm); } else { return -ESRCH; @@ -1760,9 +1768,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { - ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len, - &ctx->mmap_changing); + ret = mfill_zeropage(ctx, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); mmput(ctx->mm); } else { return -ESRCH; @@ -1817,9 +1824,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, return -EINVAL; if (mmget_not_zero(ctx->mm)) { - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + ret = mwriteprotect_range(ctx, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp); mmput(ctx->mm); } else { return -ESRCH; @@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; if (mmget_not_zero(ctx->mm)) { - ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, - uffdio_continue.range.len, - &ctx->mmap_changing); + ret = mcopy_continue(ctx, uffdio_continue.range.start, + uffdio_continue.range.len); mmput(ctx->mm); } else { return -ESRCH; @@ -1943,13 +1948,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, if (mmget_not_zero(mm)) { mmap_read_lock(mm); - /* Re-check after taking mmap_lock */ + /* Re-check after taking map_changing_lock */ + down_read(&ctx->map_changing_lock); if (likely(!atomic_read(&ctx->mmap_changing))) ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, uffdio_move.len, uffdio_move.mode); else ret = -EAGAIN; - + up_read(&ctx->map_changing_lock); mmap_read_unlock(mm); mmput(mm); } else { @@ -2146,6 +2152,7 @@ static int new_userfaultfd(int flags) ctx->flags = flags; ctx->features = 0; ctx->released = false; + init_rwsem(&ctx->map_changing_lock); atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 7bbd671e912f..0fb4e4028eba 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -69,6 +69,13 @@ struct userfaultfd_ctx { unsigned int features; /* released */ bool released; + /* + * Prevents userfaultfd operations (fill/move/wp) from happening while + * some non-cooperative event(s) is taking place. Increments are done + * in write-mode. Whereas, userfaultfd operations, which includes + * reading mmap_changing, is done under read-mode. + */ + struct rw_semaphore map_changing_lock; /* memory mappings are changing because of non-cooperative event */ atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ @@ -100,18 +107,14 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, unsigned long dst_addr, struct page *page, bool newly_allocated, bool wp_copy); -extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode); -extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, - unsigned long dst_start, - unsigned long len, - atomic_t *mmap_changing); -extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len, atomic_t *mmap_changing); -extern int mwriteprotect_range(struct mm_struct *dst_mm, - unsigned long start, unsigned long len, - bool enable_wp, atomic_t *mmap_changing); +extern ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode); +extern ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx, + unsigned long dst_start, unsigned long len); +extern ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long len); +extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp); extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, unsigned long start, unsigned long len, bool enable_wp); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index af165216fbbd..b37bc54eb29c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -322,15 +322,16 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is * called with mmap_lock held, it will release mmap_lock before returning. */ -static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, +static __always_inline ssize_t __mcopy_atomic_hugetlb( + struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mode, - bool wp_copy, - atomic_t *mmap_changing) + bool wp_copy) { + struct mm_struct *dst_mm = dst_vma->vm_mm; int vm_shared = dst_vma->vm_flags & VM_SHARED; ssize_t err; pte_t *dst_pte; @@ -349,6 +350,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, * feature is not supported. */ if (mode == MCOPY_ATOMIC_ZEROPAGE) { + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return -EINVAL; } @@ -434,6 +436,7 @@ retry: cond_resched(); if (unlikely(err == -ENOENT)) { + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); BUG_ON(!page); @@ -446,12 +449,13 @@ retry: goto out; } mmap_read_lock(dst_mm); + down_read(&ctx->map_changing_lock); /* * If memory mappings are changing because of non-cooperative * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ - if (mmap_changing && atomic_read(mmap_changing)) { + if (atomic_read(&ctx->mmap_changing)) { err = -EAGAIN; break; } @@ -474,6 +478,7 @@ retry: } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); out: if (page) @@ -485,14 +490,13 @@ out: } #else /* !CONFIG_HUGETLB_PAGE */ /* fail at build time if gcc attempts to use this */ -extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, +extern ssize_t __mcopy_atomic_hugetlb(struct userfaultfd_ctx *ctx, struct vm_area_struct *dst_vma, unsigned long dst_start, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mode, - bool wp_copy, - atomic_t *mmap_changing); + bool wp_copy); #endif /* CONFIG_HUGETLB_PAGE */ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, @@ -539,14 +543,14 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, return err; } -static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, +static __always_inline ssize_t __mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, unsigned long src_start, unsigned long len, enum mcopy_atomic_mode mcopy_mode, - atomic_t *mmap_changing, __u64 mode) { + struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; ssize_t err; pmd_t *dst_pmd; @@ -577,8 +581,9 @@ retry: * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; /* @@ -611,9 +616,9 @@ retry: * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) - return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, + return __mcopy_atomic_hugetlb(ctx, dst_vma, dst_start, src_start, len, mcopy_mode, - wp_copy, mmap_changing); + wp_copy); if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) goto out_unlock; @@ -671,6 +676,7 @@ retry: if (unlikely(err == -ENOENT)) { void *page_kaddr; + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); BUG_ON(!page); @@ -701,6 +707,7 @@ retry: } out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); out: if (page) @@ -711,26 +718,23 @@ out: return copied ? copied : err; } -ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len, - atomic_t *mmap_changing, __u64 mode) +ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start, + unsigned long src_start, unsigned long len, __u64 mode) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, - MCOPY_ATOMIC_NORMAL, mmap_changing, mode); + return __mcopy_atomic(ctx, dst_start, src_start, len, + MCOPY_ATOMIC_NORMAL, mode); } -ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len) { - return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, - mmap_changing, 0); + return __mcopy_atomic(ctx, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, 0); } -ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, atomic_t *mmap_changing) +ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len) { - return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, - mmap_changing, 0); + return __mcopy_atomic(ctx, start, 0, len, MCOPY_ATOMIC_CONTINUE, 0); } void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, @@ -750,10 +754,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, tlb_finish_mmu(&tlb); } -int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, - unsigned long len, bool enable_wp, - atomic_t *mmap_changing) +int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, + unsigned long len, bool enable_wp) { + struct mm_struct *dst_mm = ctx->mm; struct vm_area_struct *dst_vma; unsigned long page_mask; int err; @@ -774,8 +778,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, * operation (e.g. mremap) running in parallel, bail out and * request the user to retry later */ + down_read(&ctx->map_changing_lock); err = -EAGAIN; - if (mmap_changing && atomic_read(mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out_unlock; err = -ENOENT; @@ -799,6 +804,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, err = 0; out_unlock: + up_read(&ctx->map_changing_lock); mmap_read_unlock(dst_mm); return err; }