aboutsummaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c302
1 files changed, 198 insertions, 104 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3ca89e0279a7..ee335d96fc39 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -8,7 +8,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
@@ -84,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
static bool anon_orders_configured __initdata;
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return false;
+
+ if (!vma->vm_file)
+ return false;
+
+ inode = file_inode(vma->vm_file);
+
+ return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
+}
+
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
unsigned long tva_flags,
@@ -109,18 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- /*
- * Explicitly disabled through madvise or prctl, or some
- * architectures may disable THP for some mappings, for
- * example, s390 kvm.
- * */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return 0;
- /*
- * If the hardware/firmware marked hugepage support disabled.
- */
- if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -612,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
+DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
@@ -630,6 +635,8 @@ static struct attribute *anon_stats_attrs[] = {
&anon_fault_fallback_attr.attr,
&anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -660,6 +667,8 @@ static struct attribute_group file_stats_attr_grp = {
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
+ &zswpout_attr.attr,
+ &swpin_attr.attr,
&swpout_attr.attr,
&swpout_fallback_attr.attr,
#endif
@@ -949,26 +958,6 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline int get_order_from_str(const char *size_str)
-{
- unsigned long size;
- char *endptr;
- int order;
-
- size = memparse(size_str, &endptr);
-
- if (!is_power_of_2(size))
- goto err;
- order = get_order(size);
- if (BIT(order) & ~THP_ORDERS_ALL_ANON)
- goto err;
-
- return order;
-err:
- pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
- return -EINVAL;
-}
-
static char str_dup[PAGE_SIZE] __initdata;
static int __init setup_thp_anon(char *str)
{
@@ -980,7 +969,7 @@ static int __init setup_thp_anon(char *str)
if (!str || strlen(str) + 1 > PAGE_SIZE)
goto err;
- strcpy(str_dup, str);
+ strscpy(str_dup, str);
always = huge_anon_orders_always;
madvise = huge_anon_orders_madvise;
@@ -998,10 +987,22 @@ static int __init setup_thp_anon(char *str)
start_size = strsep(&subtoken, "-");
end_size = subtoken;
- start = get_order_from_str(start_size);
- end = get_order_from_str(end_size);
+ start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
+ end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
} else {
- start = end = get_order_from_str(subtoken);
+ start_size = end_size = subtoken;
+ start = end = get_order_from_str(subtoken,
+ THP_ORDERS_ALL_ANON);
+ }
+
+ if (start == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
+ goto err;
+ }
+
+ if (end == -EINVAL) {
+ pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
+ goto err;
}
if (start < 0 || end < 0 || start > end)
@@ -1148,47 +1149,87 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
- struct page *page, gfp_t gfp)
+static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct vm_area_struct *vma = vmf->vma;
- struct folio *folio = page_folio(page);
- pgtable_t pgtable;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- vm_fault_t ret = 0;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ const int order = HPAGE_PMD_ORDER;
+ struct folio *folio;
- VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);
+
+ if (unlikely(!folio)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ return NULL;
+ }
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
folio_put(folio);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
- return VM_FAULT_FALLBACK;
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+ count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ return NULL;
}
folio_throttle_swaprate(folio, gfp);
- pgtable = pte_alloc_one(vma->vm_mm);
- if (unlikely(!pgtable)) {
- ret = VM_FAULT_OOM;
- goto release;
- }
-
- folio_zero_user(folio, vmf->address);
+ /*
+ * When a folio is not zeroed during allocation (__GFP_ZERO not used),
+ * folio_zero_user() is used to make sure that the page corresponding
+ * to the faulting address will be hot in the cache after zeroing.
+ */
+ if (!alloc_zeroed())
+ folio_zero_user(folio, addr);
/*
* The memory barrier inside __folio_mark_uptodate makes sure that
* folio_zero_user writes become visible before the set_pmd_at()
* write.
*/
__folio_mark_uptodate(folio);
+ return folio;
+}
+
+static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+ struct vm_area_struct *vma, unsigned long haddr)
+{
+ pmd_t entry;
+
+ entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+ folio_add_lru_vma(folio, vma);
+ set_pmd_at(vma->vm_mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, haddr, pmd);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
+ pgtable_t pgtable;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable)) {
+ ret = VM_FAULT_OOM;
+ goto release;
+ }
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_none(*vmf->pmd))) {
goto unlock_release;
} else {
- pmd_t entry;
-
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock_release;
@@ -1202,21 +1243,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
return ret;
}
-
- entry = mk_huge_pmd(page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
mm_inc_nr_ptes(vma->vm_mm);
deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
- count_vm_event(THP_FAULT_ALLOC);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
- count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -1283,8 +1314,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- gfp_t gfp;
- struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret;
@@ -1335,14 +1364,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
return ret;
}
- gfp = vma_thp_gfp_mask(vma);
- folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
- if (unlikely(!folio)) {
- count_vm_event(THP_FAULT_FALLBACK);
- count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
- return VM_FAULT_FALLBACK;
- }
- return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+ return __do_huge_pmd_anonymous_page(vmf);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1586,7 +1609,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_special(pmd))) {
+ if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -1774,6 +1797,38 @@ unlock:
spin_unlock(vmf->ptl);
}
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
+{
+ unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+ struct folio *folio;
+ vm_fault_t ret = 0;
+
+ folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+ if (unlikely(!folio))
+ return VM_FAULT_FALLBACK;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+ goto release;
+ ret = check_stable_address_space(vma->vm_mm);
+ if (ret)
+ goto release;
+ (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+ map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
+ goto unlock;
+release:
+ folio_put(folio);
+unlock:
+ spin_unlock(vmf->ptl);
+ mmu_notifier_invalidate_range_end(&range);
+ return ret;
+}
+
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1786,8 +1841,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
- if (is_huge_zero_pmd(orig_pmd))
+ if (is_huge_zero_pmd(orig_pmd)) {
+ vm_fault_t ret = do_huge_zero_wp_pmd(vmf);
+
+ if (!(ret & VM_FAULT_FALLBACK))
+ return ret;
+
+ /* Fallback to splitting PMD if THP cannot be allocated */
goto fallback;
+ }
spin_lock(vmf->ptl);
@@ -3135,8 +3197,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
/* ->mapping in first and second tail page is replaced by other uses */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
- page_tail->mapping = head->mapping;
- page_tail->index = head->index + tail;
+ new_folio->mapping = folio->mapping;
+ new_folio->index = folio->index + tail;
/*
* page->private should not be set in tail pages. Fix up and warn once
@@ -3212,11 +3274,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageHasHWPoisoned(head);
for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
+ struct folio *tail;
__split_huge_page_tail(folio, i, lruvec, list, new_order);
+ tail = page_folio(head + i);
/* Some pages can be beyond EOF: drop them from page cache */
- if (head[i].index >= end) {
- struct folio *tail = page_folio(head + i);
-
+ if (tail->index >= end) {
if (shmem_mapping(folio->mapping))
nr_dropped++;
else if (folio_test_clear_dirty(tail))
@@ -3224,12 +3286,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
inode_to_wb(folio->mapping->host));
__filemap_remove_folio(tail, NULL);
folio_put(tail);
- } else if (!PageAnon(page)) {
- __xa_store(&folio->mapping->i_pages, head[i].index,
- head + i, 0);
+ } else if (!folio_test_anon(folio)) {
+ __xa_store(&folio->mapping->i_pages, tail->index,
+ tail, 0);
} else if (swap_cache) {
__xa_store(&swap_cache->i_pages, offset + i,
- head + i, 0);
+ tail, 0);
}
}
@@ -3599,10 +3661,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3614,8 +3693,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3638,14 +3720,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
@@ -3729,8 +3808,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list);
- struct folio *folio, *next;
- int split = 0;
+ struct folio *folio, *next, *prev = NULL;
+ int split = 0, removed = 0;
#ifdef CONFIG_MEMCG
if (sc->memcg)
@@ -3784,17 +3863,32 @@ next:
* in the case it was underused, then consider it used and
* don't add it back to split_queue.
*/
- if (!did_split && !folio_test_partially_mapped(folio)) {
+ if (did_split) {
+ ; /* folio already removed from list */
+ } else if (!folio_test_partially_mapped(folio)) {
list_del_init(&folio->_deferred_list);
- ds_queue->split_queue_len--;
+ removed++;
+ } else {
+ /*
+ * That unlocked list_del_init() above would be unsafe,
+ * unless its folio is separated from any earlier folios
+ * left on the list (which may be concurrently unqueued)
+ * by one safe folio with refcount still raised.
+ */
+ swap(folio, prev);
}
- folio_put(folio);
+ if (folio)
+ folio_put(folio);
}
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
list_splice_tail(&list, &ds_queue->split_queue);
+ ds_queue->split_queue_len -= removed;
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ if (prev)
+ folio_put(prev);
+
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
@@ -4075,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
tok = strsep(&buf, ",");
if (tok) {
- strcpy(file_path, tok);
+ strscpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;