Diffstat (limited to 'mm/memory.c')
 mm/memory.c | 379
 1 file changed, 227 insertions(+), 152 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 21a5e6e4758b..47fe250307c7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,6 +69,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -400,10 +401,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- pgtable_t new = pte_alloc_one(mm, address);
+ pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
@@ -434,9 +435,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
return 0;
}
-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd)
{
- pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
return -ENOMEM;
@@ -973,8 +974,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
bool is_cow;
int ret;
@@ -1008,11 +1008,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* is_cow_mapping() returns true.
*/
is_cow = is_cow_mapping(vma->vm_flags);
- mmun_start = addr;
- mmun_end = end;
- if (is_cow)
- mmu_notifier_invalidate_range_start(src_mm, mmun_start,
- mmun_end);
+
+ if (is_cow) {
+ mmu_notifier_range_init(&range, src_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
@@ -1029,7 +1029,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
if (is_cow)
- mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
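
This hunk sets the pattern the rest of the patch repeats: callers that used to thread an (mm, start, end) triple through mmu_notifier_invalidate_range_start()/end() now initialize a struct mmu_notifier_range once and pass it to both calls. A minimal before/after sketch of the conversion, using only the calls that appear in this diff (the surrounding variable names are illustrative):

	/* Before: mm plus two addresses, repeated at start and end. */
	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... walk and update the page tables ... */
	mmu_notifier_invalidate_range_end(mm, start, end);

	/* After: one descriptor carries mm and the [start, end) span. */
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, mm, start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... walk and update the page tables ... */
	mmu_notifier_invalidate_range_end(&range);
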
@@ -1332,12 +1332,13 @@ void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
- mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
+ mmu_notifier_invalidate_range_end(&range);
}
/**
@@ -1351,18 +1352,18 @@ void unmap_vmas(struct mmu_gather *tlb,
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
- unmap_single_vma(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, start, range.end);
}
/**
@@ -1377,17 +1378,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, address, end);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, address, end);
- unmap_single_vma(&tlb, vma, address, end, details);
- mmu_notifier_invalidate_range_end(mm, address, end);
- tlb_finish_mmu(&tlb, address, end);
+ mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ unmap_single_vma(&tlb, vma, address, range.end, details);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, address, range.end);
}
/**
@@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
retval = -EINVAL;
- if (PageAnon(page))
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
goto out;
retval = -ENOMEM;
flush_dcache_page(page);
@@ -1503,6 +1504,8 @@ out:
* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
@@ -1520,19 +1523,16 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_page);
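
The kernel-doc above (now with an explicit Return: line) spells out the vm_insert_page() contract: from an f_op->mmap() handler it runs under the mmap_sem write-lock and may adjust vma->vm_flags itself, callers elsewhere must have set VM_MIXEDMAP up front, and the result is 0 or a negative errno. A hedged sketch of the common ->mmap() case; the driver name and the page stashed in file->private_data are illustrative, not from this patch:

	/* Hypothetical ->mmap() handler mapping one driver-owned page. */
	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct page *page = file->private_data;	/* illustrative */

		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EINVAL;
		/* Under mmap_sem write-lock here, so vm_insert_page() may set
		 * VM_MIXEDMAP on the vma itself; a page-fault-time caller would
		 * have had to set it beforehand (see the comment above).
		 */
		return vm_insert_page(vma, vma->vm_start, page);
	}
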
-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
- int retval;
pte_t *pte, entry;
spinlock_t *ptl;
- retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out;
- retval = -EBUSY;
+ return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
@@ -1540,10 +1540,15 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* in may not match the PFN we have mapped if the
* mapped PFN is a writeable COW page. In the mkwrite
* case we are creating a writable PTE for a shared
- * mapping and we expect the PFNs to match.
+ * mapping and we expect the PFNs to match. If they
+ * don't match, we are likely racing with block
+ * allocation and mapping invalidation so just skip the
+ * update.
*/
- if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
goto out_unlock;
+ }
entry = *pte;
goto out_mkwrite;
} else
@@ -1565,56 +1570,32 @@ out_mkwrite:
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
- retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
-out:
- return retval;
-}
-
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_insert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- *
- * vma cannot be a COW mapping.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
-{
- return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+ return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL(vm_insert_pfn);
/**
- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers to
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
- * cow mappings. In general, using multiple vmas is preferable;
- * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * COW mappings. In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
*/
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
- int ret;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1628,19 +1609,44 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
+}
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
- return ret;
+/**
+ * vmf_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return the result of this function.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
-EXPORT_SYMBOL(vm_insert_pfn_prot);
+EXPORT_SYMBOL(vmf_insert_pfn);
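
With the vm_insert_pfn() to vmf_insert_pfn() conversion above, a vm_ops->fault handler can return the result directly instead of translating an errno into a vm_fault_t by hand. A hedged sketch of such a handler for a VM_PFNMAP mapping; the driver structure and pfn lookup are illustrative, not from this patch:

	struct mydrv_region {			/* illustrative driver state */
		unsigned long base_pfn;
	};

	static vm_fault_t mydrv_fault(struct vm_fault *vmf)
	{
		struct mydrv_region *region = vmf->vma->vm_private_data;
		unsigned long pfn = region->base_pfn + vmf->pgoff;

		/* Already a vm_fault_t: VM_FAULT_NOPAGE on success,
		 * VM_FAULT_OOM or VM_FAULT_SIGBUS on failure.
		 */
		return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
	}
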
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
@@ -1656,20 +1662,21 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
return false;
}
-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, bool mkwrite)
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
+ int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -1688,36 +1695,35 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- return insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot);
+ } else {
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
}
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
-
}
-EXPORT_SYMBOL(vm_insert_mixed);
+EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of PTE failed because someone else already added a
* different entry in the mean time, we treat that as success as we assume
* the same entry was actually inserted.
*/
-
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- int err;
-
- err = __vm_insert_mixed(vma, addr, pfn, true);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_NOPAGE;
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -1827,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
* @size: size of map area
* @prot: page protection flags for this mapping
*
- * Note: this is only safe if the mm semaphore is held when called.
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
@@ -1900,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range);
*
* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
* whatever write-combining details or similar.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
@@ -2244,9 +2254,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = vmf->address & PAGE_MASK;
- const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+ struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2269,7 +2278,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
__SetPageUptodate(new_page);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
/*
* Re-check the pte - we dropped the lock
@@ -2346,7 +2357,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
@@ -2377,12 +2388,13 @@ oom:
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
- * It handles locking of PTE and modifying it. The function returns
- * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
- * lock.
+ * It handles locking of PTE and modifying it.
*
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
+ *
+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
@@ -2500,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
+ if (PageAnon(vmf->page)) {
int total_map_swapcount;
+ if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
+ page_count(vmf->page) != 1))
+ goto copy;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2516,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
}
put_page(vmf->page);
}
+ if (PageKsm(vmf->page)) {
+ bool reused = reuse_ksm_page(vmf->page, vmf->vma,
+ vmf->address);
+ unlock_page(vmf->page);
+ if (!reused)
+ goto copy;
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+ }
if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
if (total_map_swapcount == 1) {
/*
@@ -2536,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
-
+copy:
/*
* Ok, we need to copy. Oh, well..
*/
@@ -2892,7 +2916,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
@@ -2990,6 +3014,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_pne
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
@@ -3039,7 +3085,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
spin_unlock(vmf->ptl);
vmf->prealloc_pte = NULL;
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -3118,7 +3164,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
- vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3175,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
@@ -3235,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
- * addition. The function returns 0 on success, VM_FAULT_ code in case of
- * error.
+ * addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
@@ -3295,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
static int __init fault_around_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
- &fault_around_bytes_fops);
- if (!ret)
- pr_warn("Failed to create fault_around_bytes in debugfs");
+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
return 0;
}
late_initcall(fault_around_debugfs);
@@ -3356,8 +3401,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
start_pgoff + nr_pages - 1);
if (pmd_none(*vmf->pmd)) {
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
- vmf->address);
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3492,16 +3536,45 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
- /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
- if (!vma->vm_ops->fault)
- ret = VM_FAULT_SIGBUS;
- else if (!(vmf->flags & FAULT_FLAG_WRITE))
+ /*
+ * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
+ */
+ if (!vma->vm_ops->fault) {
+ /*
+ * If we find a migration pmd entry or a none pmd entry, which
+ * should never happen, return SIGBUS
+ */
+ if (unlikely(!pmd_present(*vmf->pmd)))
+ ret = VM_FAULT_SIGBUS;
+ else {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
+ vmf->pmd,
+ vmf->address,
+ &vmf->ptl);
+ /*
+ * Make sure this is not a temporary clearing of pte
+ * by holding ptl and checking again. A R/M/W update
+ * of pte involves: take ptl, clearing the pte so that
+ * we don't have concurrent modification by hardware
+ * followed by an update.
+ */
+ if (unlikely(pte_none(*vmf->pte)))
+ ret = VM_FAULT_SIGBUS;
+ else
+ ret = VM_FAULT_NOPAGE;
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ }
+ } else if (!(vmf->flags & FAULT_FLAG_WRITE))
ret = do_read_fault(vmf);
else if (!(vma->vm_flags & VM_SHARED))
ret = do_cow_fault(vmf);
@@ -3510,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
- pte_free(vma->vm_mm, vmf->prealloc_pte);
+ pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
@@ -3535,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
- int page_nid = -1;
+ int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
- pte_t pte;
+ pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -3559,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
* Make it present again, Depending on how arch implementes non
* accessible ptes, some can allow access by kernel mode.
*/
- pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
- pte = pte_modify(pte, vma->vm_page_prot);
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
@@ -3602,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
@@ -3616,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATE_FAIL;
out:
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
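
The do_numa_page() hunk also shows the new ptep_modify_prot_start()/ptep_modify_prot_commit() calling convention: the start helper now takes the vma instead of the mm and returns the pte it cleared, and that old value is handed back to the commit together with the new one. A minimal sketch of the transaction, mirroring the calls in the hunk above (variable names are illustrative):

	pte_t old_pte, pte;

	/* Open a protection-update transaction on *ptep; the returned value
	 * is what the pte contained before it was cleared.
	 */
	old_pte = ptep_modify_prot_start(vma, addr, ptep);
	pte = pte_modify(old_pte, vma->vm_page_prot);
	pte = pte_mkyoung(pte);
	/* Commit the new value; the old one travels along so architectures
	 * that care about the previous contents can make use of it.
	 */
	ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte);
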
@@ -3801,7 +3874,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
return VM_FAULT_OOM;
- if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+ if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -3827,7 +3900,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4001,7 +4074,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
+ struct mmu_notifier_range *range,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
@@ -4029,10 +4102,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;
- if (start && end) {
- *start = address & PMD_MASK;
- *end = *start + PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
+ if (range) {
+ mmu_notifier_range_init(range, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
@@ -4040,17 +4113,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
}
spin_unlock(*ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
- if (start && end) {
- *start = address & PAGE_MASK;
- *end = *start + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
+ if (range) {
+ mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
@@ -4059,8 +4132,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
@@ -4072,20 +4145,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ !(res = __follow_pte_pmd(mm, address, NULL,
ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ struct mmu_notifier_range *range,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, start, end,
+ !(res = __follow_pte_pmd(mm, address, range,
ptepp, pmdpp, ptlp)));
return res;
}
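
follow_pte_pmd() now returns the mmu_notifier range it started through its range argument, so the caller ends the invalidation with the same struct rather than with a saved start/end pair. A hedged sketch of the calling convention this hunk implies (the surrounding logic is illustrative, not lifted from an actual caller):

	struct mmu_notifier_range range;
	spinlock_t *ptl;
	pte_t *ptep = NULL;
	pmd_t *pmdp = NULL;

	if (follow_pte_pmd(mm, address, &range, &ptep, &pmdp, &ptl))
		return;			/* no present entry; nothing was started */

	/* ... inspect or update the entry while *ptl is held ... */

	if (ptep)
		pte_unmap_unlock(ptep, ptl);
	else
		spin_unlock(ptl);	/* pmd case */
	/* End the invalidation __follow_pte_pmd() began on our behalf. */
	mmu_notifier_invalidate_range_end(&range);
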
@@ -4099,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd);
*
* Only IO mappings and raw PFN mappings are allowed.
*
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
@@ -4249,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
* @gup_flags: flags modifying lookup behaviour
*
* The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)