BACKPORT: userfaultfd: protect mmap_changing with rw_sem in userfaultfd_ctx

Increments and loads of mmap_changing are always done within an mmap_lock
critical section.  This ensures that if userspace requests event
notification for non-cooperative operations (e.g. mremap), userfaultfd
operations don't occur concurrently.

This can be achieved by using a separate read-write semaphore in
userfaultfd_ctx such that increments are done in write-mode and loads in
read-mode, thereby eliminating the dependency on mmap_lock for this
purpose.

This is a preparatory step before we replace mmap_lock usage with per-vma
locks in fill/move ioctls.

Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

(cherry picked from commit 5e4c24a57b0c126686534b5b159a406c5dd02400)
Conflicts:
	fs/userfaultfd.c
	include/linux/userfaultfd_k.h
	mm/userfaultfd.c

1. Functions passing control from fs/userfaultfd.c to mm/userfaultfd.c
   were renamed after 6.1.
   a. Replace mfill_atomic_copy() with mcopy_atomic()
   b. Replace mfill_atomic_zeropage() with mfill_zeropage()
   c. Replace mfill_atomic_continue() with mcopy_continue()
   d. Replace mfill_atomic() with __mcopy_atomic()
   e. Replace mfill_atomic_hugetlb() with __mcopy_atomic_hugetlb()
2. uffd flags were unified into a single parameter after 6.1. Replace
   'flags' with 'mcopy_mode' and 'mode'.
3. Fetch dst_mm from dst_vma in __mcopy_atomic_hugetlb().

Bug: 320478828
Change-Id: I77615c36a0c891801c9eb9de3609df4e7f125c39
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
This commit is contained in:
Lokesh Gidra
2024-02-15 10:27:54 -08:00
parent 6b5ee039a1
commit a5b6040d5c
3 changed files with 71 additions and 55 deletions

View File

@@ -650,12 +650,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags;
ctx->features = octx->features;
ctx->released = false;
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing);
up_write(&octx->map_changing_lock);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -702,7 +705,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
vma_start_write(vma);
@@ -748,7 +753,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@@ -790,7 +797,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1708,9 +1717,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len, &ctx->mmap_changing,
uffdio_copy.mode);
ret = mcopy_atomic(ctx, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len, uffdio_copy.mode);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1760,9 +1768,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len,
&ctx->mmap_changing);
ret = mfill_zeropage(ctx, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1817,9 +1824,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(ctx->mm)) {
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len, mode_wp,
&ctx->mmap_changing);
ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
uffdio_wp.range.len, mode_wp);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len,
&ctx->mmap_changing);
ret = mcopy_continue(ctx, uffdio_continue.range.start,
uffdio_continue.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1943,13 +1948,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(mm)) {
mmap_read_lock(mm);
/* Re-check after taking mmap_lock */
/* Re-check after taking map_changing_lock */
down_read(&ctx->map_changing_lock);
if (likely(!atomic_read(&ctx->mmap_changing)))
ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
uffdio_move.len, uffdio_move.mode);
else
ret = -EAGAIN;
up_read(&ctx->map_changing_lock);
mmap_read_unlock(mm);
mmput(mm);
} else {
@@ -2146,6 +2152,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags;
ctx->features = 0;
ctx->released = false;
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */

View File

@@ -69,6 +69,13 @@ struct userfaultfd_ctx {
unsigned int features;
/* released */
bool released;
/*
* Prevents userfaultfd operations (fill/move/wp) from happening while
* one or more non-cooperative events are taking place. Increments are
* done in write-mode, whereas userfaultfd operations, which include
* reading mmap_changing, are done in read-mode.
*/
struct rw_semaphore map_changing_lock;
/* memory mappings are changing because of non-cooperative event */
atomic_t mmap_changing;
/* mm with one or more vmas attached to this userfaultfd_ctx */
@@ -100,18 +107,14 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, bool wp_copy);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
atomic_t *mmap_changing, __u64 mode);
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long len,
atomic_t *mmap_changing);
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, atomic_t *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
extern ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, __u64 mode);
extern ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx,
unsigned long dst_start, unsigned long len);
extern ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long len);
extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp);
extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);

View File

@@ -322,15 +322,16 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
* __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
* called with mmap_lock and ctx->map_changing_lock (read-mode) held, it
* will release both locks before returning.
*/
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
static __always_inline ssize_t __mcopy_atomic_hugetlb(
struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
enum mcopy_atomic_mode mode,
bool wp_copy,
atomic_t *mmap_changing)
bool wp_copy)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
@@ -349,6 +350,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* feature is not supported.
*/
if (mode == MCOPY_ATOMIC_ZEROPAGE) {
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -434,6 +436,7 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
BUG_ON(!page);
@@ -446,12 +449,13 @@ retry:
goto out;
}
mmap_read_lock(dst_mm);
down_read(&ctx->map_changing_lock);
/*
* If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
if (mmap_changing && atomic_read(mmap_changing)) {
if (atomic_read(&ctx->mmap_changing)) {
err = -EAGAIN;
break;
}
@@ -474,6 +478,7 @@ retry:
}
out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
out:
if (page)
@@ -485,14 +490,13 @@ out:
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
extern ssize_t __mcopy_atomic_hugetlb(struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
enum mcopy_atomic_mode mode,
bool wp_copy,
atomic_t *mmap_changing);
bool wp_copy);
#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -539,14 +543,14 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
return err;
}
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
static __always_inline ssize_t __mcopy_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
enum mcopy_atomic_mode mcopy_mode,
atomic_t *mmap_changing,
__u64 mode)
{
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
@@ -577,8 +581,9 @@ retry:
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
down_read(&ctx->map_changing_lock);
err = -EAGAIN;
if (mmap_changing && atomic_read(mmap_changing))
if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
/*
@@ -611,9 +616,9 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
return __mcopy_atomic_hugetlb(ctx, dst_vma, dst_start,
src_start, len, mcopy_mode,
wp_copy, mmap_changing);
wp_copy);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -671,6 +676,7 @@ retry:
if (unlikely(err == -ENOENT)) {
void *page_kaddr;
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
BUG_ON(!page);
@@ -701,6 +707,7 @@ retry:
}
out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
out:
if (page)
@@ -711,26 +718,23 @@ out:
return copied ? copied : err;
}
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
atomic_t *mmap_changing, __u64 mode)
ssize_t mcopy_atomic(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len, __u64 mode)
{
return __mcopy_atomic(dst_mm, dst_start, src_start, len,
MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
return __mcopy_atomic(ctx, dst_start, src_start, len,
MCOPY_ATOMIC_NORMAL, mode);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing)
ssize_t mfill_zeropage(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len)
{
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
mmap_changing, 0);
return __mcopy_atomic(ctx, start, 0, len, MCOPY_ATOMIC_ZEROPAGE, 0);
}
ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing)
ssize_t mcopy_continue(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len)
{
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
mmap_changing, 0);
return __mcopy_atomic(ctx, start, 0, len, MCOPY_ATOMIC_CONTINUE, 0);
}
void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
@@ -750,10 +754,10 @@ void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
tlb_finish_mmu(&tlb);
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool enable_wp,
atomic_t *mmap_changing)
int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp)
{
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
unsigned long page_mask;
int err;
@@ -774,8 +778,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
down_read(&ctx->map_changing_lock);
err = -EAGAIN;
if (mmap_changing && atomic_read(mmap_changing))
if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -ENOENT;
@@ -799,6 +804,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return err;
}