BACKPORT: userfaultfd: protect mmap_changing with rw_sem in userfaultfd_ctx

Increments and loads of mmap_changing are always done inside an mmap_lock
critical section. This ensures that if userspace requests event
notification for non-cooperative operations (e.g. mremap), userfaultfd
operations don't occur concurrently.

The same guarantee can be achieved by using a separate read-write
semaphore in userfaultfd_ctx such that increments are done in write-mode
and loads in read-mode, thereby eliminating the dependency on mmap_lock
for this purpose.

This is a preparatory step before we replace mmap_lock usage with per-vma
locks in fill/move ioctls.

Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

(cherry picked from commit 5e4c24a57b0c126686534b5b159a406c5dd02400)
Conflicts:
	fs/userfaultfd.c
	include/linux/userfaultfd_k.h
	mm/userfaultfd.c

1. Functions passing control from fs/userfaultfd.c to mm/userfaultfd.c
   were renamed after 6.1.
   a. Replace mfill_atomic_copy() with mcopy_atomic()
   b. Replace mfill_atomic_zeropage() with mfill_zeropage()
   c. Replace mfill_atomic_continue() with mcopy_continue()
   d. Replace mfill_atomic() with __mcopy_atomic()
   e. Replace mfill_atomic_hugetlb() with __mcopy_atomic_hugetlb()
2. uffd flags were unified into a single parameter after 6.1. Replace
   'flags' with the separate 'mcopy_mode' and 'mode' parameters.
3. Fetch dst_mm from dst_vma in __mcopy_atomic_hugetlb(), since the
   function now takes the userfaultfd_ctx instead of dst_mm.

Bug: 320478828
Change-Id: I77615c36a0c891801c9eb9de3609df4e7f125c39
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
This commit is contained in:
Lokesh Gidra
2024-02-15 10:27:54 -08:00
parent 6b5ee039a1
commit a5b6040d5c
3 changed files with 71 additions and 55 deletions

View File

@@ -650,12 +650,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags; ctx->flags = octx->flags;
ctx->features = octx->features; ctx->features = octx->features;
ctx->released = false; ctx->released = false;
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0); atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm; ctx->mm = vma->vm_mm;
mmgrab(ctx->mm); mmgrab(ctx->mm);
userfaultfd_ctx_get(octx); userfaultfd_ctx_get(octx);
down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing); atomic_inc(&octx->mmap_changing);
up_write(&octx->map_changing_lock);
fctx->orig = octx; fctx->orig = octx;
fctx->new = ctx; fctx->new = ctx;
list_add_tail(&fctx->list, fcs); list_add_tail(&fctx->list, fcs);
@@ -702,7 +705,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx; vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx); userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing); atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
} else { } else {
/* Drop uffd context if remap feature not enabled */ /* Drop uffd context if remap feature not enabled */
vma_start_write(vma); vma_start_write(vma);
@@ -748,7 +753,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true; return true;
userfaultfd_ctx_get(ctx); userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing); atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm); mmap_read_unlock(mm);
msg_init(&ewq.msg); msg_init(&ewq.msg);
@@ -790,7 +797,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM; return -ENOMEM;
userfaultfd_ctx_get(ctx); userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing); atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx; unmap_ctx->ctx = ctx;
unmap_ctx->start = start; unmap_ctx->start = start;
unmap_ctx->end = end; unmap_ctx->end = end;
@@ -1708,9 +1717,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out; goto out;
if (mmget_not_zero(ctx->mm)) { if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, ret = mcopy_atomic(ctx, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len, &ctx->mmap_changing, uffdio_copy.len, uffdio_copy.mode);
uffdio_copy.mode);
mmput(ctx->mm); mmput(ctx->mm);
} else { } else {
return -ESRCH; return -ESRCH;
@@ -1760,9 +1768,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out; goto out;
if (mmget_not_zero(ctx->mm)) { if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, ret = mfill_zeropage(ctx, uffdio_zeropage.range.start,
uffdio_zeropage.range.len, uffdio_zeropage.range.len);
&ctx->mmap_changing);
mmput(ctx->mm); mmput(ctx->mm);
} else { } else {
return -ESRCH; return -ESRCH;
@@ -1817,9 +1824,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL; return -EINVAL;
if (mmget_not_zero(ctx->mm)) { if (mmget_not_zero(ctx->mm)) {
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
uffdio_wp.range.len, mode_wp, uffdio_wp.range.len, mode_wp);
&ctx->mmap_changing);
mmput(ctx->mm); mmput(ctx->mm);
} else { } else {
return -ESRCH; return -ESRCH;
@@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out; goto out;
if (mmget_not_zero(ctx->mm)) { if (mmget_not_zero(ctx->mm)) {
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start, ret = mcopy_continue(ctx, uffdio_continue.range.start,
uffdio_continue.range.len, uffdio_continue.range.len);
&ctx->mmap_changing);
mmput(ctx->mm); mmput(ctx->mm);
} else { } else {
return -ESRCH; return -ESRCH;
@@ -1943,13 +1948,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(mm)) { if (mmget_not_zero(mm)) {
mmap_read_lock(mm); mmap_read_lock(mm);
/* Re-check after taking mmap_lock */ /* Re-check after taking map_changing_lock */
down_read(&ctx->map_changing_lock);
if (likely(!atomic_read(&ctx->mmap_changing))) if (likely(!atomic_read(&ctx->mmap_changing)))
ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src, ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
uffdio_move.len, uffdio_move.mode); uffdio_move.len, uffdio_move.mode);
else else
ret = -EAGAIN; ret = -EAGAIN;
up_read(&ctx->map_changing_lock);
mmap_read_unlock(mm); mmap_read_unlock(mm);
mmput(mm); mmput(mm);
} else { } else {
@@ -2146,6 +2152,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags; ctx->flags = flags;
ctx->features = 0; ctx->features = 0;
ctx->released = false; ctx->released = false;
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0); atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm; ctx->mm = current->mm;
/* prevent the mm struct to be freed */ /* prevent the mm struct to be freed */

View File

@@ -69,6 +69,13 @@ struct userfaultfd_ctx {
unsigned int features; unsigned int features;
/* released */ /* released */
bool released; bool released;
/*
* Prevents userfaultfd operations (fill/move/wp) from happening while
* some non-cooperative event(s) is taking place. Increments are done
* in write-mode. Whereas, userfaultfd operations, which includes
* reading mmap_changing, is done under read-mode.
*/
struct rw_semaphore map_changing_lock;
/* memory mappings are changing because of non-cooperative event */ /* memory mappings are changing because of non-cooperative event */
atomic_t mmap_changing; atomic_t mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */ /* mm with one ore more vmas attached to this userfaultfd_ctx */
@@ -100,18 +107,14 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page, unsigned long dst_addr, struct page *page,
bool newly_allocated, bool wp_copy); bool newly_allocated, bool wp_copy);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, extern ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, unsigned long src_start, unsigned long len, __u64 mode);
atomic_t *mmap_changing, __u64 mode); extern ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx,
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len);
unsigned long dst_start, extern ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long len, unsigned long len);
atomic_t *mmap_changing); extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, bool enable_wp);
unsigned long len, atomic_t *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp); unsigned long start, unsigned long len, bool enable_wp);

View File

@@ -322,15 +322,16 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
* __mcopy_atomic processing for HUGETLB vmas. Note that this routine is * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
* called with mmap_lock held, it will release mmap_lock before returning. * called with mmap_lock held, it will release mmap_lock before returning.
*/ */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, static __always_inline ssize_t __mcopy_atomic_hugetlb(
struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma, struct vm_area_struct *dst_vma,
unsigned long dst_start, unsigned long dst_start,
unsigned long src_start, unsigned long src_start,
unsigned long len, unsigned long len,
enum mcopy_atomic_mode mode, enum mcopy_atomic_mode mode,
bool wp_copy, bool wp_copy)
atomic_t *mmap_changing)
{ {
struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED; int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err; ssize_t err;
pte_t *dst_pte; pte_t *dst_pte;
@@ -349,6 +350,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* feature is not supported. * feature is not supported.
*/ */
if (mode == MCOPY_ATOMIC_ZEROPAGE) { if (mode == MCOPY_ATOMIC_ZEROPAGE) {
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm); mmap_read_unlock(dst_mm);
return -EINVAL; return -EINVAL;
} }
@@ -434,6 +436,7 @@ retry:
cond_resched(); cond_resched();
if (unlikely(err == -ENOENT)) { if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm); mmap_read_unlock(dst_mm);
BUG_ON(!page); BUG_ON(!page);
@@ -446,12 +449,13 @@ retry:
goto out; goto out;
} }
mmap_read_lock(dst_mm); mmap_read_lock(dst_mm);
down_read(&ctx->map_changing_lock);
/* /*
* If memory mappings are changing because of non-cooperative * If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and * operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later * request the user to retry later
*/ */
if (mmap_changing && atomic_read(mmap_changing)) { if (atomic_read(&ctx->mmap_changing)) {
err = -EAGAIN; err = -EAGAIN;
break; break;
} }
@@ -474,6 +478,7 @@ retry:
} }
out_unlock: out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm); mmap_read_unlock(dst_mm);
out: out:
if (page) if (page)
@@ -485,14 +490,13 @@ out:
} }
#else /* !CONFIG_HUGETLB_PAGE */ #else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */ /* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, extern ssize_t __mcopy_atomic_hugetlb(struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma, struct vm_area_struct *dst_vma,
unsigned long dst_start, unsigned long dst_start,
unsigned long src_start, unsigned long src_start,
unsigned long len, unsigned long len,
enum mcopy_atomic_mode mode, enum mcopy_atomic_mode mode,
bool wp_copy, bool wp_copy);
atomic_t *mmap_changing);
#endif /* CONFIG_HUGETLB_PAGE */ #endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -539,14 +543,14 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
return err; return err;
} }
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, static __always_inline ssize_t __mcopy_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start, unsigned long dst_start,
unsigned long src_start, unsigned long src_start,
unsigned long len, unsigned long len,
enum mcopy_atomic_mode mcopy_mode, enum mcopy_atomic_mode mcopy_mode,
atomic_t *mmap_changing,
__u64 mode) __u64 mode)
{ {
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma; struct vm_area_struct *dst_vma;
ssize_t err; ssize_t err;
pmd_t *dst_pmd; pmd_t *dst_pmd;
@@ -577,8 +581,9 @@ retry:
* operation (e.g. mremap) running in parallel, bail out and * operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later * request the user to retry later
*/ */
down_read(&ctx->map_changing_lock);
err = -EAGAIN; err = -EAGAIN;
if (mmap_changing && atomic_read(mmap_changing)) if (atomic_read(&ctx->mmap_changing))
goto out_unlock; goto out_unlock;
/* /*
@@ -611,9 +616,9 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine * If this is a HUGETLB vma, pass off to appropriate routine
*/ */
if (is_vm_hugetlb_page(dst_vma)) if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start, return __mcopy_atomic_hugetlb(ctx, dst_vma, dst_start,
src_start, len, mcopy_mode, src_start, len, mcopy_mode,
wp_copy, mmap_changing); wp_copy);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock; goto out_unlock;
@@ -671,6 +676,7 @@ retry:
if (unlikely(err == -ENOENT)) { if (unlikely(err == -ENOENT)) {
void *page_kaddr; void *page_kaddr;
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm); mmap_read_unlock(dst_mm);
BUG_ON(!page); BUG_ON(!page);
@@ -701,6 +707,7 @@ retry:
} }
out_unlock: out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm); mmap_read_unlock(dst_mm);
out: out:
if (page) if (page)
@@ -711,26 +718,23 @@ out:
return copied ? copied : err; return copied ? copied : err;
} }
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, unsigned long src_start, unsigned long len, __u64 mode)
atomic_t *mmap_changing, __u64 mode)
{ {
return __mcopy_atomic(dst_mm, dst_start, src_start, len, return __mcopy_atomic(ctx, dst_start, src_start, len,
MCOPY_ATOMIC_NORMAL, mmap_changing, mode); MCOPY_ATOMIC_NORMAL, mode);
} }
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, atomic_t *mmap_changing) unsigned long len)
{ {
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, return __mcopy_atomic(ctx, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, 0);
mmap_changing, 0);
} }
ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, atomic_t *mmap_changing) unsigned long len)
{ {
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE, return __mcopy_atomic(ctx, start, 0, len, MCOPY_ATOMIC_CONTINUE, 0);
mmap_changing, 0);
} }
void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
@@ -750,10 +754,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
tlb_finish_mmu(&tlb); tlb_finish_mmu(&tlb);
} }
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp, unsigned long len, bool enable_wp)
atomic_t *mmap_changing)
{ {
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma; struct vm_area_struct *dst_vma;
unsigned long page_mask; unsigned long page_mask;
int err; int err;
@@ -774,8 +778,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and * operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later * request the user to retry later
*/ */
down_read(&ctx->map_changing_lock);
err = -EAGAIN; err = -EAGAIN;
if (mmap_changing && atomic_read(mmap_changing)) if (atomic_read(&ctx->mmap_changing))
goto out_unlock; goto out_unlock;
err = -ENOENT; err = -ENOENT;
@@ -799,6 +804,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0; err = 0;
out_unlock: out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm); mmap_read_unlock(dst_mm);
return err; return err;
} }