about | summary | refs | log | tree | commit | diff
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- mm/memory.c 99
1 files changed, 75 insertions, 24 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 2366578015ad..75c2dfd04f72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,4 +1,3 @@
-
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
@@ -44,7 +43,6 @@
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
@@ -1061,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
if (need_zero)
new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
else
- new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
- addr, false);
+ new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
if (!new_folio)
return NULL;
@@ -1085,6 +1082,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
+ pmd_t dummy_pmdval;
pte_t ptent;
spinlock_t *src_ptl, *dst_ptl;
int progress, max_nr, ret = 0;
@@ -1110,7 +1108,15 @@ again:
ret = -ENOMEM;
goto out;
}
- src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+
+ /*
+ * We already hold the exclusive mmap_lock, and copy_pte_range() and
+ * retract_page_tables() rely on vma->anon_vma to be mutually exclusive,
+ * so the PTE page is stable and there is no need to get pmdval and do
+ * a pmd_same() check.
+ */
+ src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval,
+ &src_ptl);
if (!src_pte) {
pte_unmap_unlock(dst_pte, dst_ptl);
/* ret == 0 */
@@ -1449,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details,
return !folio_test_anon(folio);
}
-static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+static inline bool zap_drop_markers(struct zap_details *details)
{
if (!details)
return false;
@@ -1470,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
if (vma_is_anonymous(vma))
return;
- if (zap_drop_file_uffd_wp(details))
+ if (zap_drop_markers(details))
return;
for (;;) {
@@ -1665,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* drop the marker if explicitly requested.
*/
if (!vma_is_anonymous(vma) &&
- !zap_drop_file_uffd_wp(details))
+ !zap_drop_markers(details))
+ continue;
+ } else if (is_guard_swp_entry(entry)) {
+ /*
+ * Ordinary zapping should not remove guard PTE
+ * markers. Only do so if we should remove PTE markers
+ * in general.
+ */
+ if (!zap_drop_markers(details))
continue;
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
@@ -3997,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
if (marker & PTE_MARKER_POISONED)
return VM_FAULT_HWPOISON;
+ /* Hitting a guard page is always a fatal condition. */
+ if (marker & PTE_MARKER_GUARD)
+ return VM_FAULT_SIGSEGV;
+
if (pte_marker_entry_uffd_wp(entry))
return pte_marker_handle_uffd_wp(vmf);
@@ -4010,8 +4028,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
struct folio *folio;
swp_entry_t entry;
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
- vmf->address, false);
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
if (!folio)
return NULL;
@@ -4167,7 +4184,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
gfp = vma_thp_gfp_mask(vma);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ folio = vma_alloc_folio(gfp, order, vma, addr);
if (folio) {
if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
gfp, entry))
@@ -4181,17 +4198,14 @@ fallback:
return __alloc_swap_folio(vmf);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
-{
- return false;
-}
-
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
return __alloc_swap_folio(vmf);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4204,6 +4218,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct folio *swapcache, *folio = NULL;
+ DECLARE_WAITQUEUE(wait, current);
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
@@ -4302,7 +4317,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* Relax a bit to prevent rapid
* repeated page faults.
*/
+ add_wait_queue(&swapcache_wq, &wait);
schedule_timeout_uninterruptible(1);
+ remove_wait_queue(&swapcache_wq, &wait);
goto out_page;
}
need_clear_cache = true;
@@ -4609,8 +4626,11 @@ unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4625,8 +4645,11 @@ out_release:
folio_unlock(swapcache);
folio_put(swapcache);
}
- if (need_clear_cache)
+ if (need_clear_cache) {
swapcache_clear(si, entry, nr_pages);
+ if (waitqueue_active(&swapcache_wq))
+ wake_up(&swapcache_wq);
+ }
if (si)
put_swap_device(si);
return ret;
@@ -4700,7 +4723,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
gfp = vma_thp_gfp_mask(vma);
while (orders) {
addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
- folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ folio = vma_alloc_folio(gfp, order, vma, addr);
if (folio) {
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
@@ -4708,7 +4731,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
goto next;
}
folio_throttle_swaprate(folio, gfp);
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation
+ * (__GFP_ZERO not used), folio_zero_user() is used
+ * to make sure that the page corresponding to the
+ * faulting address will be hot in the cache after
+ * zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, vmf->address);
return folio;
}
next:
@@ -4925,6 +4956,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
pmd_t entry;
vm_fault_t ret = VM_FAULT_FALLBACK;
+ /*
+ * It is too late to allocate a small folio, we already have a large
+ * folio in the pagecache: especially s390 KVM cannot tolerate any
+ * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
+ * PMD mappings if THPs are disabled.
+ */
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ return ret;
+
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
return ret;
@@ -5728,14 +5768,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->pte = NULL;
vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
+ pmd_t dummy_pmdval;
+
/*
* A regular pmd is established and it can't morph into a huge
* pmd by anon khugepaged, since that takes mmap_lock in write
* mode; but shmem or file collapse to THP could still morph
* it into a huge pmd: just retry later if so.
+ *
+ * Use the maywrite version to indicate that vmf->pte may be
+ * modified, but since we will use pte_same() to detect the
+ * change of the !pte_none() entry, there is no need to recheck
+ * the pmdval. Here we choose to pass a dummy variable instead
+ * of NULL, which helps new users think about why this place is
+ * special.
*/
- vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
+ vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &dummy_pmdval,
+ &vmf->ptl);
if (unlikely(!vmf->pte))
return 0;
vmf->orig_pte = ptep_get_lockless(vmf->pte);
@@ -6346,10 +6396,11 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file ? file->f_mapping : NULL;
if (mapping)
- lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
+ lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
lockdep_is_held(&vma->vm_mm->mmap_lock));
else
lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));