FROMLIST: mm: implement speculative handling in __do_fault()

In the speculative case, call the vm_ops->fault() method from within an rcu read locked section, and verify the mmap sequence lock at the start of the section. A match guarantees that the original vma is still valid at that time, and that the associated vma->vm_file stays valid while the vm_ops->fault() method is running. Note that this implies that speculative faults can not sleep within the vm_ops->fault method. We will only attempt to fetch existing pages from the page cache during speculative faults; any miss (or prefetch) will be handled by falling back to non-speculative fault handling. The speculative handling case also does not preallocate page tables, as it is always called with a pre-existing page table. Signed-off-by: Michel Lespinasse <michel@lespinasse.org> Link: https://lore.kernel.org/all/20210407014502.24091-25-michel@lespinasse.org/ Bug: 161210518 Signed-off-by: Suren Baghdasaryan <surenb@google.com> Change-Id: I995ba94d8e96014ef83ac93fe5a4669afcde34b9
2026-06-08 20:07:46 +09:00 · 2021-04-29 10:28:25 -07:00
parent 48e35d053f
commit b12e52ca98
1 changed files with 43 additions and 21 deletions
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3977,29 +3977,51 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	vm_fault_t ret;

-	/*
-	 * Preallocate pte before we take page_lock because this might lead to
-	 * deadlocks for memcg reclaim which waits for pages under writeback:
-	 *				lock_page(A)
-	 *				SetPageWriteback(A)
-	 *				unlock_page(A)
-	 * lock_page(B)
-	 *				lock_page(B)
-	 * pte_alloc_one
-	 *   shrink_page_list
-	 *     wait_on_page_writeback(A)
-	 *				SetPageWriteback(B)
-	 *				unlock_page(B)
-	 *				# flush A, B to clear the writeback
-	 */
-	if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
-		vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
-		if (!vmf->prealloc_pte)
-			return VM_FAULT_OOM;
-		smp_wmb(); /* See comment in __pte_alloc() */
+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
+	if (vmf->flags & FAULT_FLAG_SPECULATIVE) {
+		rcu_read_lock();
+		if (!mmap_seq_read_check(vmf->vma->vm_mm, vmf->seq,
+					 SPF_ABORT_FAULT)) {
+			ret = VM_FAULT_RETRY;
+		} else {
+			/*
+			 * The mmap sequence count check guarantees that the
+			 * vma we fetched at the start of the fault was still
+			 * current at that point in time. The rcu read lock
+			 * ensures vmf->vma->vm_file stays valid.
+			 */
+			ret = vma->vm_ops->fault(vmf);
+		}
+		rcu_read_unlock();
+	} else
+#endif
+	{
+		/*
+		 * Preallocate pte before we take page_lock because
+		 * this might lead to deadlocks for memcg reclaim
+		 * which waits for pages under writeback:
+		 *				lock_page(A)
+		 *				SetPageWriteback(A)
+		 *				unlock_page(A)
+		 * lock_page(B)
+		 *				lock_page(B)
+		 * pte_alloc_one
+		 *   shrink_page_list
+		 *     wait_on_page_writeback(A)
+		 *				SetPageWriteback(B)
+		 *				unlock_page(B)
+		 *				# flush A, B to clear writeback
+		 */
+		if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+			vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
+			if (!vmf->prealloc_pte)
+				return VM_FAULT_OOM;
+			smp_wmb(); /* See comment in __pte_alloc() */
+		}
+
+		ret = vma->vm_ops->fault(vmf);
 	}

-	ret = vma->vm_ops->fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
 			    VM_FAULT_DONE_COW)))
 		return ret;