From 2246168a72ec76f45c4b27016be238ba988ea94a Mon Sep 17 00:00:00 2001 From: Dezhi Huang Date: Mon, 22 May 2023 21:24:45 +0800 Subject: [PATCH] ANDROID: fix a race between speculative page walk and unmap operations Speculative page fault walks the page tables under RCU protection and assumes that page tables are stable after ptl lock is taken. Current implementation has three issues: 1. While pmd can't be destroyed while in RCU read section, it can be cleared and result in an invalid ptl lock address. Fix that by rechecking pmd value after obtaining ptl lock. 2. In case of CONFIG_ALLOC_SPLIT_PTLOCKS, ptl lock is separate from the pmd and is destroyed by a synchronous call to pgtable_pmd_page_dtor, which can happen while page walker is in RCU section. Prevent this by adding a dependency for CONFIG_SPECULATIVE_PAGE_FAULT to require !CONIG_ALLOC_SPLIT_PTLOCKS. 3. Below sequence when do_mmap happens after the last mmap_seq check would result in use-after-free issue. __pte_map_lock rcu_read_lock() mmap_seq_read_check() ptl = pte_lockptr(vmf->pmd) spin_trylock(ptl) mmap_seq_read_check() mmap_write_lock() do_mmap() unmap_region() unmap_vmas() free_pgtables() ... free_pte_range pmd_clear pte_free_tlb ... call_rcu(tlb_remove_table_rcu) rcu_read_unlock() tlb_remove_table_rcu spin_unlock(ptl) <-- UAF! To prevent that free_pte_range needs to be blocked if ptl is locked and is in use. [tyler wang: This is a backport from https://android-review.googlesource.com/c/kernel/common/+/2330194. We have adapted the corresponding modifications from 5.15 to 5.10, including the changes made to the function __pte_map_lock in 5.15, which have been adapted to the functions pte_spinlock and __pte_map_lock_speculative in 5.10. Additionally, following surenb's suggestion, we have folded https://android-review.googlesource.com/c/kernel/common/+/2368961 in this patch.] Bug: 278602292 Change-Id: I7b353f0995fc59e92bb2069bcdc7d1ac29b521b9 Signed-off-by: Dezhi Huang --- mm/memory.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 834078ab6e1c..df62ea2ef8f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -246,6 +246,16 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT + /* + * Ensure page table destruction is blocked if __pte_map_lock managed + * to take this lock. Without this barrier tlb_remove_table_rcu can + * destroy ptl after __pte_map_lock locked it and during unlock would + * cause a use-after-free. + */ + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + spin_unlock(ptl); +#endif pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); @@ -2627,9 +2637,7 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); static bool pte_spinlock(struct vm_fault *vmf) { bool ret = false; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t pmdval; -#endif /* Check if vma is still valid */ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { @@ -2644,24 +2652,28 @@ static bool pte_spinlock(struct vm_fault *vmf) goto out; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * We check if the pmd value is still the same to ensure that there * is not a huge collapse operation in progress in our back. + * It also ensures that pmd was not cleared by pmd_clear in + * free_pte_range and ptl is still valid. */ pmdval = READ_ONCE(*vmf->pmd); if (!pmd_same(pmdval, vmf->orig_pmd)) { trace_spf_pmd_changed(_RET_IP_, vmf->vma, vmf->address); goto out; } -#endif - vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, &pmdval); if (unlikely(!spin_trylock(vmf->ptl))) { trace_spf_pte_lock(_RET_IP_, vmf->vma, vmf->address); goto out; } + /* + * The check below will fail if pte_spinlock passed its ptl barrier + * before we took the ptl lock. + */ if (vma_has_changed(vmf)) { spin_unlock(vmf->ptl); trace_spf_vma_changed(_RET_IP_, vmf->vma, vmf->address); @@ -2679,9 +2691,7 @@ static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr) bool ret = false; pte_t *pte; spinlock_t *ptl; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t pmdval; -#endif /* * The first vma_has_changed() guarantees the page-tables are still @@ -2696,7 +2706,6 @@ static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr) goto out; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * We check if the pmd value is still the same to ensure that there * is not a huge collapse operation in progress in our back. @@ -2706,7 +2715,6 @@ static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr) trace_spf_pmd_changed(_RET_IP_, vmf->vma, addr); goto out; } -#endif /* * Same as pte_offset_map_lock() except that we call @@ -2715,14 +2723,18 @@ static bool __pte_map_lock_speculative(struct vm_fault *vmf, unsigned long addr) * to invalidate TLB but this CPU has irq disabled. * Since we are in a speculative patch, accept it could fail */ - ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); - pte = pte_offset_map(vmf->pmd, addr); + ptl = pte_lockptr(vmf->vma->vm_mm, &pmdval); + pte = pte_offset_map(&pmdval, addr); if (unlikely(!spin_trylock(ptl))) { pte_unmap(pte); trace_spf_pte_lock(_RET_IP_, vmf->vma, addr); goto out; } + /* + * The check below will fail if __pte_map_lock_speculative passed its ptl + * barrier before we took the ptl lock. + */ if (vma_has_changed(vmf)) { pte_unmap_unlock(pte, ptl); trace_spf_vma_changed(_RET_IP_, vmf->vma, addr);