FROMLIST: mm: implement speculative handling in __do_fault()

In the speculative case, call the vm_ops->fault() method from within
an rcu read locked section, and verify the mmap sequence lock at the
start of the section. A match guarantees that the original vma is still
valid at that time, and that the associated vma->vm_file stays valid
while the vm_ops->fault() method is running.

Note that this implies that speculative faults can not sleep within
the vm_ops->fault method. We will only attempt to fetch existing pages
from the page cache during speculative faults; any miss (or prefetch)
will be handled by falling back to non-speculative fault handling.

The speculative handling case also does not preallocate page tables,
as it is always called with a pre-existing page table.

Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
Link: https://lore.kernel.org/all/20210407014502.24091-25-michel@lespinasse.org/
Bug: 161210518
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Change-Id: I995ba94d8e96014ef83ac93fe5a4669afcde34b9
This commit is contained in:
Michel Lespinasse
2021-04-29 10:28:25 -07:00
committed by Todd Kjos
parent 48e35d053f
commit b12e52ca98

View File

@@ -3977,29 +3977,51 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
/*
* Preallocate pte before we take page_lock because this might lead to
* deadlocks for memcg reclaim which waits for pages under writeback:
* lock_page(A)
* SetPageWriteback(A)
* unlock_page(A)
* lock_page(B)
* lock_page(B)
* pte_alloc_one
* shrink_page_list
* wait_on_page_writeback(A)
* SetPageWriteback(B)
* unlock_page(B)
* # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
rcu_read_lock();
if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq,
SPF_ABORT_FAULT)) {
ret = VM_FAULT_RETRY;
} else {
/*
* The mmap sequence count check guarantees that the
* vma we fetched at the start of the fault was still
* current at that point in time. The rcu read lock
* ensures vmf->vma->vm_file stays valid.
*/
ret = vma->vm_ops->fault(vmf);
}
rcu_read_unlock();
} else
#endif
{
/*
* Preallocate pte before we take page_lock because
* this might lead to deadlocks for memcg reclaim
* which waits for pages under writeback:
* lock_page(A)
* SetPageWriteback(A)
* unlock_page(A)
* lock_page(B)
* lock_page(B)
* pte_alloc_one
* shrink_page_list
* wait_on_page_writeback(A)
* SetPageWriteback(B)
* unlock_page(B)
* # flush A, B to clear writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
ret = vma->vm_ops->fault(vmf);
}
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
return ret;