diff options
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 302 |
1 files changed, 198 insertions, 104 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ca89e0279a7..ee335d96fc39 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -8,7 +8,6 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> -#include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/highmem.h> #include <linux/hugetlb.h> @@ -84,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; static bool anon_orders_configured __initdata; +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) + return false; + + if (!vma->vm_file) + return false; + + inode = file_inode(vma->vm_file); + + return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, @@ -109,18 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. - * */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return 0; - /* - * If the hardware/firmware marked hugepage support disabled. - */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -612,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); +DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -630,6 +635,8 @@ static struct attribute *anon_stats_attrs[] = { &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, #ifndef CONFIG_SHMEM + &zswpout_attr.attr, + &swpin_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -660,6 +667,8 @@ static struct attribute_group file_stats_attr_grp = { static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM + &zswpout_attr.attr, + &swpin_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -949,26 +958,6 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline int get_order_from_str(const char *size_str) -{ - unsigned long size; - char *endptr; - int order; - - size = memparse(size_str, &endptr); - - if (!is_power_of_2(size)) - goto err; - order = get_order(size); - if (BIT(order) & ~THP_ORDERS_ALL_ANON) - goto err; - - return order; -err: - pr_err("invalid size %s in thp_anon boot parameter\n", size_str); - return -EINVAL; -} - static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_anon(char *str) { @@ -980,7 +969,7 @@ static int __init setup_thp_anon(char *str) if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; - strcpy(str_dup, str); + strscpy(str_dup, str); always = huge_anon_orders_always; madvise = huge_anon_orders_madvise; @@ -998,10 +987,22 @@ static int __init setup_thp_anon(char *str) start_size = strsep(&subtoken, "-"); end_size = subtoken; - start = get_order_from_str(start_size); - end = get_order_from_str(end_size); + start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON); + end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON); } else { - start = end = get_order_from_str(subtoken); + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_ANON); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_anon boot parameter\n", start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_anon boot parameter\n", end_size); + goto err; } if (start < 0 || end < 0 || start > end) @@ -1148,47 +1149,87 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - struct page *page, gfp_t gfp) +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, + unsigned long addr) { - struct vm_area_struct *vma = vmf->vma; - struct folio *folio = page_folio(page); - pgtable_t pgtable; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret = 0; + gfp_t gfp = vma_thp_gfp_mask(vma); + const int order = HPAGE_PMD_ORDER; + struct folio *folio; - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK); + + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + return NULL; } folio_throttle_swaprate(folio, gfp); - pgtable = pte_alloc_one(vma->vm_mm); - if (unlikely(!pgtable)) { - ret = VM_FAULT_OOM; - goto release; - } - - folio_zero_user(folio, vmf->address); + /* + * When a folio is not zeroed during allocation (__GFP_ZERO not used), + * folio_zero_user() is used to make sure that the page corresponding + * to the faulting address will be hot in the cache after zeroing. + */ + if (!alloc_zeroed()) + folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write. */ __folio_mark_uptodate(folio); + return folio; +} + +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + pmd_t entry; + + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_pmd_at(vma->vm_mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, haddr, pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +} + +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + pgtable_t pgtable; + vm_fault_t ret = 0; + + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); + if (unlikely(!folio)) + return VM_FAULT_FALLBACK; + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { - pmd_t entry; - ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; @@ -1202,21 +1243,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } - - entry = mk_huge_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); deferred_split_folio(folio, false); spin_unlock(vmf->ptl); - count_vm_event(THP_FAULT_ALLOC); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); - count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -1283,8 +1314,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - gfp_t gfp; - struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; @@ -1335,14 +1364,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = vma_thp_gfp_mask(vma); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); - if (unlikely(!folio)) { - count_vm_event(THP_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + + return __do_huge_pmd_anonymous_page(vmf); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1586,7 +1609,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, int ret = -ENOMEM; pmd = pmdp_get_lockless(src_pmd); - if (unlikely(pmd_special(pmd))) { + if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -1774,6 +1797,38 @@ unlock: spin_unlock(vmf->ptl); } +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct mmu_notifier_range range; + struct folio *folio; + vm_fault_t ret = 0; + + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); + if (unlikely(!folio)) + return VM_FAULT_FALLBACK; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) + goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + goto unlock; +release: + folio_put(folio); +unlock: + spin_unlock(vmf->ptl); + mmu_notifier_invalidate_range_end(&range); + return ret; +} + vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; @@ -1786,8 +1841,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - if (is_huge_zero_pmd(orig_pmd)) + if (is_huge_zero_pmd(orig_pmd)) { + vm_fault_t ret = do_huge_zero_wp_pmd(vmf); + + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + + /* Fallback to splitting PMD if THP cannot be allocated */ goto fallback; + } spin_lock(vmf->ptl); @@ -3135,8 +3197,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail, /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); - page_tail->mapping = head->mapping; - page_tail->index = head->index + tail; + new_folio->mapping = folio->mapping; + new_folio->index = folio->index + tail; /* * page->private should not be set in tail pages. Fix up and warn once @@ -3212,11 +3274,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + struct folio *tail; __split_huge_page_tail(folio, i, lruvec, list, new_order); + tail = page_folio(head + i); /* Some pages can be beyond EOF: drop them from page cache */ - if (head[i].index >= end) { - struct folio *tail = page_folio(head + i); - + if (tail->index >= end) { if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) @@ -3224,12 +3286,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); - } else if (!PageAnon(page)) { - __xa_store(&folio->mapping->i_pages, head[i].index, - head + i, 0); + } else if (!folio_test_anon(folio)) { + __xa_store(&folio->mapping->i_pages, tail->index, + tail, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, - head + i, 0); + tail, 0); } } @@ -3599,10 +3661,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list) return split_huge_page_to_list_to_order(&folio->page, list, ret); } -void __folio_undo_large_rmappable(struct folio *folio) +/* + * __folio_unqueue_deferred_split() is not to be called directly: + * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h + * limits its calls to those folios which may have a _deferred_list for + * queueing THP splits, and that list is (racily observed to be) non-empty. + * + * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is + * zero: because even when split_queue_lock is held, a non-empty _deferred_list + * might be in use on deferred_split_scan()'s unlocked on-stack list. + * + * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is + * therefore important to unqueue deferred split before changing folio memcg. + */ +bool __folio_unqueue_deferred_split(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; + bool unqueued = false; + + WARN_ON_ONCE(folio_ref_count(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -3614,8 +3693,11 @@ void __folio_undo_large_rmappable(struct folio *folio) MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } list_del_init(&folio->_deferred_list); + unqueued = true; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + + return unqueued; /* useful for debug warnings */ } /* partially_mapped=false won't clear PG_partially_mapped folio flag */ @@ -3638,14 +3720,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) return; /* - * The try_to_unmap() in page reclaim path might reach here too, - * this may cause a race condition to corrupt deferred split queue. - * And, if page reclaim is already handling the same folio, it is - * unnecessary to handle it again in shrinker. - * - * Check the swapcache flag to determine if the folio is being - * handled by page reclaim since THP swap would add the folio into - * swap cache before calling try_to_unmap(). + * Exclude swapcache: originally to avoid a corrupt deferred split + * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); + * but if page reclaim is already handling the same folio, it is + * unnecessary to handle it again in the shrinker, so excluding + * swapcache here may still be a useful optimization. */ if (folio_test_swapcache(folio)) return; @@ -3729,8 +3808,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list); - struct folio *folio, *next; - int split = 0; + struct folio *folio, *next, *prev = NULL; + int split = 0, removed = 0; #ifdef CONFIG_MEMCG if (sc->memcg) @@ -3784,17 +3863,32 @@ next: * in the case it was underused, then consider it used and * don't add it back to split_queue. */ - if (!did_split && !folio_test_partially_mapped(folio)) { + if (did_split) { + ; /* folio already removed from list */ + } else if (!folio_test_partially_mapped(folio)) { list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; + removed++; + } else { + /* + * That unlocked list_del_init() above would be unsafe, + * unless its folio is separated from any earlier folios + * left on the list (which may be concurrently unqueued) + * by one safe folio with refcount still raised. + */ + swap(folio, prev); } - folio_put(folio); + if (folio) + folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); list_splice_tail(&list, &ds_queue->split_queue); + ds_queue->split_queue_len -= removed; spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + if (prev) + folio_put(prev); + /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. @@ -4075,7 +4169,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, tok = strsep(&buf, ","); if (tok) { - strcpy(file_path, tok); + strscpy(file_path, tok); } else { ret = -EINVAL; goto out; |