Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 379
1 file changed, 227 insertions(+), 152 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c index 21a5e6e4758b..47fe250307c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,6 +69,7 @@ #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> +#include <linux/numa.h> #include <asm/io.h> #include <asm/mmu_context.h> @@ -400,10 +401,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; - pgtable_t new = pte_alloc_one(mm, address); + pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; @@ -434,9 +435,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) return 0; } -int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) +int __pte_alloc_kernel(pmd_t *pmd) { - pte_t *new = pte_alloc_one_kernel(&init_mm, address); + pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; @@ -973,8 +974,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct mmu_notifier_range range; bool is_cow; int ret; @@ -1008,11 +1008,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(vma->vm_flags); - mmun_start = addr; - mmun_end = end; - if (is_cow) - mmu_notifier_invalidate_range_start(src_mm, mmun_start, - mmun_end); + + if (is_cow) { + mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); @@ -1029,7 +1029,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) - mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(&range); return ret; } @@ -1332,12 +1332,13 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; - mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); + mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); - mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); + mmu_notifier_invalidate_range_end(&range); } /** @@ -1351,18 +1352,18 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; - unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) - unmap_single_vma(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); + mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); + update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + for ( ; vma && vma->vm_start < 
range.end; vma = vma->vm_next) + unmap_single_vma(&tlb, vma, start, range.end, NULL); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, start, range.end); } /** @@ -1377,17 +1378,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { - struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; struct mmu_gather tlb; - unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, address, end); - update_hiwater_rss(mm); - mmu_notifier_invalidate_range_start(mm, address, end); - unmap_single_vma(&tlb, vma, address, end, details); - mmu_notifier_invalidate_range_end(mm, address, end); - tlb_finish_mmu(&tlb, address, end); + mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); + update_hiwater_rss(vma->vm_mm); + mmu_notifier_invalidate_range_start(&range); + unmap_single_vma(&tlb, vma, address, range.end, details); + mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb, address, range.end); } /** @@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; retval = -EINVAL; - if (PageAnon(page)) + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) goto out; retval = -ENOMEM; flush_dcache_page(page); @@ -1503,6 +1504,8 @@ out: * under mm->mmap_sem write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. + * + * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) @@ -1520,19 +1523,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); -static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; - int retval; pte_t *pte, entry; spinlock_t *ptl; - retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out; - retval = -EBUSY; + return VM_FAULT_OOM; if (!pte_none(*pte)) { if (mkwrite) { /* @@ -1540,10 +1540,15 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared - * mapping and we expect the PFNs to match. + * mapping and we expect the PFNs to match. If they + * don't match, we are likely racing with block + * allocation and mapping invalidation so just skip the + * update. */ - if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; + } entry = *pte; goto out_mkwrite; } else @@ -1565,56 +1570,32 @@ out_mkwrite: set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? 
*/ - retval = 0; out_unlock: pte_unmap_unlock(pte, ptl); -out: - return retval; -} - -/** - * vm_insert_pfn - insert single pfn into user vma - * @vma: user vma to map to - * @addr: target user address of this page - * @pfn: source kernel pfn - * - * Similar to vm_insert_page, this allows drivers to insert individual pages - * they've allocated into a user vma. Same comments apply. - * - * This function should only be called from a vm_ops->fault handler, and - * in that case the handler should return NULL. - * - * vma cannot be a COW mapping. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - */ -int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) -{ - return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_pfn); /** - * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * - * This is exactly like vm_insert_pfn, except that it allows drivers to + * This is exactly like vmf_insert_pfn(), except that it allows drivers to * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for - * cow mappings. In general, using multiple vmas is preferable; - * vm_insert_pfn_prot should only be used if using multiple VMAs is + * COW mappings. In general, using multiple vmas is preferable; + * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. */ -int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, +vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { - int ret; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1628,19 +1609,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); +} +EXPORT_SYMBOL(vmf_insert_pfn_prot); - return ret; +/** + * vmf_insert_pfn - insert single pfn into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * + * Similar to vm_insert_page, this allows drivers to insert individual pages + * they've allocated into a user vma. Same comments apply. + * + * This function should only be called from a vm_ops->fault handler, and + * in that case the handler should return the result of this function. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * Context: Process context. May allocate using %GFP_KERNEL. + * Return: vm_fault_t value. 
+ */ +vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn) +{ + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } -EXPORT_SYMBOL(vm_insert_pfn_prot); +EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { @@ -1656,20 +1662,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) return false; } -static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, bool mkwrite) +static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; + int err; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) - return -EFAULT; + return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) - return -EACCES; + return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() @@ -1688,36 +1695,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); - return insert_page(vma, addr, page, pgprot); + err = insert_page(vma, addr, page, pgprot); + } else { + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } - return insert_pfn(vma, addr, pfn, pgprot, mkwrite); + + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; } -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, false); - } -EXPORT_SYMBOL(vm_insert_mixed); +EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ - vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { - int err; - - err = __vm_insert_mixed(vma, addr, pfn, true); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err < 0 && err != -EBUSY) - return VM_FAULT_SIGBUS; - return VM_FAULT_NOPAGE; + return __vm_insert_mixed(vma, addr, pfn, true); } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); @@ -1827,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @size: size of map area * @prot: page protection flags for this mapping * - * Note: this is only safe if the mm semaphore is held when called. + * Note: this is only safe if the mm semaphore is held when called. + * + * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) @@ -1900,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range); * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. + * + * Return: %0 on success, negative error code otherwise. 
*/ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { @@ -2244,9 +2254,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = vmf->address & PAGE_MASK; - const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; + struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2269,7 +2278,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + (vmf->address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock @@ -2346,7 +2357,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ - mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_only_end(&range); if (old_page) { /* * Don't let another task, with possibly unlocked vma, @@ -2377,12 +2388,13 @@ oom: * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. - * It handles locking of PTE and modifying it. The function returns - * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE - * lock. + * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). + * + * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before + * we acquired PTE lock. */ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { @@ -2500,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { + if (PageAnon(vmf->page)) { int total_map_swapcount; + if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || + page_count(vmf->page) != 1)) + goto copy; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2516,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } + if (PageKsm(vmf->page)) { + bool reused = reuse_ksm_page(vmf->page, vmf->vma, + vmf->address); + unlock_page(vmf->page); + if (!reused) + goto copy; + wp_page_reuse(vmf); + return VM_FAULT_WRITE; + } if (reuse_swap_page(vmf->page, &total_map_swapcount)) { if (total_map_swapcount == 1) { /* @@ -2536,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); } - +copy: /* * Ok, we need to copy. Oh, well.. */ @@ -2892,7 +2916,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) * * Here we only have down_read(mmap_sem). 
*/ - if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ @@ -2990,6 +3014,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; + /* + * Preallocate pte before we take page_lock because this might lead to + * deadlocks for memcg reclaim which waits for pages under writeback: + * lock_page(A) + * SetPageWriteback(A) + * unlock_page(A) + * lock_page(B) + * lock_page(B) + * pte_alloc_pne + * shrink_page_list + * wait_on_page_writeback(A) + * SetPageWriteback(B) + * unlock_page(B) + * # flush A, B to clear the writeback + */ + if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); + if (!vmf->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) @@ -3039,7 +3085,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; } map_pte: @@ -3118,7 +3164,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { - vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3175,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) * * Target users are page handler itself and implementations of * vm_ops->map_pages. + * + * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) @@ -3235,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU - * addition. The function returns 0 on success, VM_FAULT_ code in case of - * error. + * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). + * + * Return: %0 on success, %VM_FAULT_ code in case of error. 
*/ vm_fault_t finish_fault(struct vm_fault *vmf) { @@ -3295,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, static int __init fault_around_debugfs(void) { - void *ret; - - ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, - &fault_around_bytes_fops); - if (!ret) - pr_warn("Failed to create fault_around_bytes in debugfs"); + debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); @@ -3356,8 +3401,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { - vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, - vmf->address); + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ @@ -3492,16 +3536,45 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; - /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ - if (!vma->vm_ops->fault) - ret = VM_FAULT_SIGBUS; - else if (!(vmf->flags & FAULT_FLAG_WRITE)) + /* + * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND + */ + if (!vma->vm_ops->fault) { + /* + * If we find a migration pmd entry or a none pmd entry, which + * should never happen, return SIGBUS + */ + if (unlikely(!pmd_present(*vmf->pmd))) + ret = VM_FAULT_SIGBUS; + else { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, + vmf->pmd, + vmf->address, + &vmf->ptl); + /* + * Make sure this is not a temporary clearing of pte + * by holding ptl and checking again. A R/M/W update + * of pte involves: take ptl, clearing the pte so that + * we don't have concurrent modification by hardware + * followed by an update. + */ + if (unlikely(pte_none(*vmf->pte))) + ret = VM_FAULT_SIGBUS; + else + ret = VM_FAULT_NOPAGE; + + pte_unmap_unlock(vmf->pte, vmf->ptl); + } + } else if (!(vmf->flags & FAULT_FLAG_WRITE)) ret = do_read_fault(vmf); else if (!(vma->vm_flags & VM_SHARED)) ret = do_cow_fault(vmf); @@ -3510,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; @@ -3535,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; - int page_nid = -1; + int page_nid = NUMA_NO_NODE; int last_cpupid; int target_nid; bool migrated = false; - pte_t pte; + pte_t pte, old_pte; bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; @@ -3559,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. 
*/ - pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); - pte = pte_modify(pte, vma->vm_page_prot); + old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); + ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); @@ -3602,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); - if (target_nid == -1) { + if (target_nid == NUMA_NO_NODE) { put_page(page); goto out; } @@ -3616,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_MIGRATE_FAIL; out: - if (page_nid != -1) + if (page_nid != NUMA_NO_NODE) task_numa_fault(last_cpupid, page_nid, 1, flags); return 0; } @@ -3801,7 +3874,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; - if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -3827,7 +3900,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4001,7 +4074,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) #endif /* __PAGETABLE_PMD_FOLDED */ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - unsigned long *start, unsigned long *end, + struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; @@ -4029,10 +4102,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, if (!pmdpp) goto out; - if (start && end) { - *start = address & PMD_MASK; - *end = *start + PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, *start, *end); + if (range) { + mmu_notifier_range_init(range, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); if (pmd_huge(*pmd)) { @@ -4040,17 +4113,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; } spin_unlock(*ptlp); - if (start && end) - mmu_notifier_invalidate_range_end(mm, *start, *end); + if (range) + mmu_notifier_invalidate_range_end(range); } if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; - if (start && end) { - *start = address & PAGE_MASK; - *end = *start + PAGE_SIZE; - mmu_notifier_invalidate_range_start(mm, *start, *end); + if (range) { + mmu_notifier_range_init(range, mm, address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); if (!pte_present(*ptep)) @@ -4059,8 +4132,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, return 0; unlock: pte_unmap_unlock(ptep, *ptlp); - if (start && end) - mmu_notifier_invalidate_range_end(mm, *start, *end); + if (range) + mmu_notifier_invalidate_range_end(range); out: return 
-EINVAL; } @@ -4072,20 +4145,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address, /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, NULL, NULL, + !(res = __follow_pte_pmd(mm, address, NULL, ptepp, NULL, ptlp))); return res; } int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - unsigned long *start, unsigned long *end, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) + struct mmu_notifier_range *range, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) { int res; /* (void) is needed to make gcc happy */ (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, start, end, + !(res = __follow_pte_pmd(mm, address, range, ptepp, pmdpp, ptlp))); return res; } @@ -4099,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd); * @pfn: location to store found PFN * * Only IO mappings and raw PFN mappings are allowed. * - * Returns zero and the pfn at @pfn on success, -ve otherwise. + * Return: zero and the pfn at @pfn on success, -ve otherwise. */ int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn) @@ -4249,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, * @gup_flags: flags modifying lookup behaviour * * The caller must hold a reference on @mm. + * + * Return: number of bytes copied from source to destination. */ int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags)
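
Several hunks above (copy_page_range(), unmap_vmas(), zap_page_range(), zap_page_range_single(), wp_page_copy()) replace the mmun_start/mmun_end pair of locals with a struct mmu_notifier_range. A minimal sketch of the resulting calling pattern, assuming the four-argument mmu_notifier_range_init() shown in this diff (later kernels added further parameters such as the event type and vma); example_invalidate_range() is a made-up name:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void example_invalidate_range(struct mm_struct *mm,
                                     unsigned long start, unsigned long end)
{
        struct mmu_notifier_range range;

        /* One init call replaces carrying mm/start/end around separately. */
        mmu_notifier_range_init(&range, mm, start, end);
        mmu_notifier_invalidate_range_start(&range);

        /* ... clear or rewrite PTEs covering [start, end) here ... */

        mmu_notifier_invalidate_range_end(&range);
}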
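
The vm_insert_pfn()/vm_insert_pfn_prot() hunks rename the helpers to vmf_insert_pfn()/vmf_insert_pfn_prot() and make them return a vm_fault_t directly. A hedged sketch of what a driver fault handler looks like after this conversion; my_dev, base_pfn and my_dev_vm_ops are invented for illustration, only the calling convention is taken from the diff:

#include <linux/mm.h>

struct my_dev {
        unsigned long base_pfn;         /* hypothetical PFN backing the mapping */
};

static vm_fault_t my_dev_fault(struct vm_fault *vmf)
{
        struct my_dev *dev = vmf->vma->vm_private_data;

        /*
         * Previously vm_insert_pfn() returned an errno and the handler had
         * to translate it into a VM_FAULT_* code by hand.  The new helper
         * already returns a vm_fault_t, so it can be returned directly, as
         * the updated kerneldoc above requires.
         */
        return vmf_insert_pfn(vmf->vma, vmf->address,
                              dev->base_pfn + vmf->pgoff);
}

static const struct vm_operations_struct my_dev_vm_ops = {
        .fault = my_dev_fault,
};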
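
The __vm_insert_mixed() hunk folds the errno-to-vm_fault_t translation that vmf_insert_mixed_mkwrite() used to do itself into the common helper, turning vmf_insert_mixed() and vmf_insert_mixed_mkwrite() into thin wrappers. The mapping, extracted as a stand-alone sketch (example_err_to_fault() is not a real kernel function):

#include <linux/mm.h>

static vm_fault_t example_err_to_fault(int err)
{
        if (err == -ENOMEM)
                return VM_FAULT_OOM;
        if (err < 0 && err != -EBUSY)
                return VM_FAULT_SIGBUS;
        /* -EBUSY: another thread raced and installed the entry; treat as success. */
        return VM_FAULT_NOPAGE;
}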
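
The __do_fault() hunk adds the memcg-writeback deadlock comment and preallocates the pte table with the new one-argument pte_alloc_one() before calling ->fault(). The same logic as a self-contained sketch; example_prealloc_pte() is a made-up wrapper around the code in that hunk:

#include <linux/mm.h>

static vm_fault_t example_prealloc_pte(struct vm_fault *vmf)
{
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                /* pte_alloc_one() no longer takes the faulting address. */
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
                smp_wmb();      /* See comment in __pte_alloc() */
        }
        return 0;
}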
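
The do_numa_page() hunk switches to the new ptep_modify_prot_start()/ptep_modify_prot_commit() calling convention: both take the vma rather than the mm, and _commit() is also handed the pte value returned by _start() so architectures can batch the transaction. The same sequence as a sketch with a made-up helper name:

#include <linux/mm.h>
#include <asm/pgtable.h>

static void example_make_pte_present(struct vm_area_struct *vma,
                                     unsigned long addr, pte_t *ptep,
                                     bool was_writable)
{
        pte_t old_pte, pte;

        old_pte = ptep_modify_prot_start(vma, addr, ptep);
        pte = pte_modify(old_pte, vma->vm_page_prot);
        pte = pte_mkyoung(pte);
        if (was_writable)
                pte = pte_mkwrite(pte);
        ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
        update_mmu_cache(vma, addr, ptep);
}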
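
follow_pte_pmd() and __follow_pte_pmd() now take a struct mmu_notifier_range * instead of separate start/end pointers, and on success leave the invalidation open for the caller to finish. A hedged caller sketch, loosely modelled on the DAX write-protect path (the only in-tree user); example_walk_one_page() and the commented steps are placeholders:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void example_walk_one_page(struct mm_struct *mm, unsigned long address)
{
        struct mmu_notifier_range range;
        spinlock_t *ptl;
        pte_t *ptep = NULL;
        pmd_t *pmdp = NULL;

        /* Fills either pmdp (huge pmd) or ptep, with the matching lock held. */
        if (follow_pte_pmd(mm, address, &range, &ptep, &pmdp, &ptl))
                return;

        if (pmdp) {
                /* ... inspect or write-protect the huge pmd under ptl ... */
                spin_unlock(ptl);
        } else {
                /* ... inspect or write-protect the pte under ptl ... */
                pte_unmap_unlock(ptep, ptl);
        }

        /* The range was started inside __follow_pte_pmd(); end it here. */
        mmu_notifier_invalidate_range_end(&range);
}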