Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 1
-rw-r--r-- | mm/damon/core.c | 42
-rw-r--r-- | mm/damon/tests/sysfs-kunit.h | 1
-rw-r--r-- | mm/filemap.c | 2
-rw-r--r-- | mm/gup.c | 33
-rw-r--r-- | mm/huge_memory.c | 69
-rw-r--r-- | mm/internal.h | 55
-rw-r--r-- | mm/kasan/init.c | 8
-rw-r--r-- | mm/kasan/kasan_test_c.c | 27
-rw-r--r-- | mm/khugepaged.c | 6
-rw-r--r-- | mm/memcontrol-v1.c | 25
-rw-r--r-- | mm/memcontrol.c | 9
-rw-r--r-- | mm/memory.c | 32
-rw-r--r-- | mm/migrate.c | 9
-rw-r--r-- | mm/mlock.c | 9
-rw-r--r-- | mm/mmap.c | 229
-rw-r--r-- | mm/mprotect.c | 2
-rw-r--r-- | mm/mremap.c | 11
-rw-r--r-- | mm/nommu.c | 9
-rw-r--r-- | mm/numa_memblks.c | 2
-rw-r--r-- | mm/page_alloc.c | 26
-rw-r--r-- | mm/page_io.c | 4
-rw-r--r-- | mm/pagewalk.c | 16
-rw-r--r-- | mm/rmap.c | 9
-rw-r--r-- | mm/shmem.c | 12
-rw-r--r-- | mm/shrinker.c | 8
-rw-r--r-- | mm/slab_common.c | 33
-rw-r--r-- | mm/sparse-vmemmap.c | 5
-rw-r--r-- | mm/swap.c | 4
-rw-r--r-- | mm/swapfile.c | 58
-rw-r--r-- | mm/vma.c | 37
-rw-r--r-- | mm/vma.h | 32
-rw-r--r-- | mm/vmscan.c | 110
33 files changed, 575 insertions, 360 deletions
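The most subtle arithmetic in the diff below is the feedback-loop rework of damon_feed_loop_next_input() in mm/damon/core.c. As a reading aid, here is a minimal userspace sketch of that overflow-aware proportional update; the helper name feed_loop_next_input() and the sample values in main() are illustrative assumptions, not part of the kernel change:

/*
 * Minimal userspace sketch of the proportional feedback used by the
 * reworked damon_feed_loop_next_input() in the mm/damon/core.c hunk
 * below.  Illustrative only; not the kernel function itself.
 */
#include <limits.h>
#include <stdio.h>

static unsigned long feed_loop_next_input(unsigned long last_input,
					   unsigned long score)
{
	const unsigned long goal = 10000;
	/* Set minimum input as 10000 to avoid compensation being zero */
	const unsigned long min_input = 10000;
	unsigned long score_goal_diff, compensation;
	int over_achieving = score > goal;

	if (score == goal)
		return last_input;
	if (score >= goal * 2)
		return min_input;

	score_goal_diff = over_achieving ? score - goal : goal - score;

	/* Scale the error by last_input/goal without overflowing */
	if (last_input < ULONG_MAX / score_goal_diff)
		compensation = last_input * score_goal_diff / goal;
	else
		compensation = last_input / goal * score_goal_diff;

	if (over_achieving)
		return last_input > compensation + min_input ?
				last_input - compensation : min_input;
	if (last_input < ULONG_MAX - compensation)
		return last_input + compensation;
	return ULONG_MAX;
}

int main(void)
{
	/* Under-achieving (score 5000 of 10000): input grows by 50%. */
	printf("%lu\n", feed_loop_next_input(1000000, 5000));   /* 1500000 */
	/* Over-achieving (score 15000 of 10000): input shrinks by 50%. */
	printf("%lu\n", feed_loop_next_input(1000000, 15000));  /*  500000 */
	return 0;
}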
diff --git a/mm/Kconfig b/mm/Kconfig index 4c9f5ea13271..33fa51d608dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1085,7 +1085,6 @@ config HMM_MIRROR depends on MMU config GET_FREE_REGION - depends on SPARSEMEM bool config DEVICE_PRIVATE diff --git a/mm/damon/core.c b/mm/damon/core.c index a83f3b736d51..511c3f61ab44 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1412,7 +1412,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - if (c->passed_sample_intervals != s->next_apply_sis) + if (c->passed_sample_intervals < s->next_apply_sis) continue; if (!s->wmarks.activated) @@ -1456,17 +1456,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, unsigned long score) { const unsigned long goal = 10000; - unsigned long score_goal_diff = max(goal, score) - min(goal, score); - unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal; - unsigned long compensation = last_input * score_goal_diff_bp / 10000; /* Set minimum input as 10000 to avoid compensation be zero */ const unsigned long min_input = 10000; + unsigned long score_goal_diff, compensation; + bool over_achieving = score > goal; - if (goal > score) + if (score == goal) + return last_input; + if (score >= goal * 2) + return min_input; + + if (over_achieving) + score_goal_diff = score - goal; + else + score_goal_diff = goal - score; + + if (last_input < ULONG_MAX / score_goal_diff) + compensation = last_input * score_goal_diff / goal; + else + compensation = last_input / goal * score_goal_diff; + + if (over_achieving) + return max(last_input - compensation, min_input); + if (last_input < ULONG_MAX - compensation) return last_input + compensation; - if (last_input > compensation + min_input) - return last_input - compensation; - return min_input; + return ULONG_MAX; } #ifdef CONFIG_PSI @@ -1622,7 +1636,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals != s->next_apply_sis) + if (c->passed_sample_intervals < s->next_apply_sis) continue; if (!s->wmarks.activated) @@ -1642,9 +1656,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals != s->next_apply_sis) + if (c->passed_sample_intervals < s->next_apply_sis) continue; - s->next_apply_sis += + s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; } @@ -2000,7 +2014,7 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals == next_aggregation_sis) { + if (ctx->passed_sample_intervals >= next_aggregation_sis) { kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); @@ -2018,7 +2032,7 @@ static int kdamond_fn(void *data) sample_interval = ctx->attrs.sample_interval ? 
ctx->attrs.sample_interval : 1; - if (ctx->passed_sample_intervals == next_aggregation_sis) { + if (ctx->passed_sample_intervals >= next_aggregation_sis) { ctx->next_aggregation_sis = next_aggregation_sis + ctx->attrs.aggr_interval / sample_interval; @@ -2028,7 +2042,7 @@ static int kdamond_fn(void *data) ctx->ops.reset_aggregated(ctx); } - if (ctx->passed_sample_intervals == next_ops_update_sis) { + if (ctx->passed_sample_intervals >= next_ops_update_sis) { ctx->next_ops_update_sis = next_ops_update_sis + ctx->attrs.ops_update_interval / sample_interval; diff --git a/mm/damon/tests/sysfs-kunit.h b/mm/damon/tests/sysfs-kunit.h index 1c9b596057a7..7b5c7b307da9 100644 --- a/mm/damon/tests/sysfs-kunit.h +++ b/mm/damon/tests/sysfs-kunit.h @@ -67,6 +67,7 @@ static void damon_sysfs_test_add_targets(struct kunit *test) damon_destroy_ctx(ctx); kfree(sysfs_targets->targets_arr); kfree(sysfs_targets); + kfree(sysfs_target->regions); kfree(sysfs_target); } diff --git a/mm/filemap.c b/mm/filemap.c index 36d22968be9a..56fa431c52af 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2625,7 +2625,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(!iov_iter_count(iter))) return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { @@ -2394,20 +2394,25 @@ err: } /* - * Check whether all folios are *allowed* to be pinned indefinitely (longterm). + * Check whether all folios are *allowed* to be pinned indefinitely (long term). * Rather confusingly, all folios in the range are required to be pinned via * FOLL_PIN, before calling this routine. * - * If any folios in the range are not allowed to be pinned, then this routine - * will migrate those folios away, unpin all the folios in the range and return - * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then - * call this routine again. + * Return values: * - * If an error other than -EAGAIN occurs, this indicates a migration failure. - * The caller should give up, and propagate the error back up the call stack. - * - * If everything is OK and all folios in the range are allowed to be pinned, + * 0: if everything is OK and all folios in the range are allowed to be pinned, * then this routine leaves all folios pinned and returns zero for success. + * + * -EAGAIN: if any folios in the range are not allowed to be pinned, then this + * routine will migrate those folios away, unpin all the folios in the range. If + * migration of the entire set of folios succeeds, then -EAGAIN is returned. The + * caller should re-pin the entire range with FOLL_PIN and then call this + * routine again. + * + * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this + * indicates a migration failure. The caller should give up, and propagate the + * error back up the call stack. The caller does not need to unpin any folios in + * that case, because this routine will do the unpinning. */ static long check_and_migrate_movable_folios(unsigned long nr_folios, struct folio **folios) @@ -2425,10 +2430,8 @@ static long check_and_migrate_movable_folios(unsigned long nr_folios, } /* - * This routine just converts all the pages in the @pages array to folios and - * calls check_and_migrate_movable_folios() to do the heavy lifting. - * - * Please see the check_and_migrate_movable_folios() documentation for details. + * Return values and behavior are the same as those for + * check_and_migrate_movable_folios(). 
*/ static long check_and_migrate_movable_pages(unsigned long nr_pages, struct page **pages) @@ -2437,8 +2440,10 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, long i, ret; folios = kmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL); - if (!folios) + if (!folios) { + unpin_user_pages(pages, nr_pages); return -ENOMEM; + } for (i = 0; i < nr_pages; i++) folios[i] = page_folio(pages[i]); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87b49ecc7b1e..03fd4bc39ea1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -109,18 +109,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. - * */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return 0; - /* - * If the hardware/firmware marked hugepage support disabled. - */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ @@ -3599,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list) return split_huge_page_to_list_to_order(&folio->page, list, ret); } -void __folio_undo_large_rmappable(struct folio *folio) +/* + * __folio_unqueue_deferred_split() is not to be called directly: + * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h + * limits its calls to those folios which may have a _deferred_list for + * queueing THP splits, and that list is (racily observed to be) non-empty. + * + * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is + * zero: because even when split_queue_lock is held, a non-empty _deferred_list + * might be in use on deferred_split_scan()'s unlocked on-stack list. + * + * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is + * therefore important to unqueue deferred split before changing folio memcg. + */ +bool __folio_unqueue_deferred_split(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; + bool unqueued = false; + + WARN_ON_ONCE(folio_ref_count(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); @@ -3614,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio) MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1); } list_del_init(&folio->_deferred_list); + unqueued = true; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + + return unqueued; /* useful for debug warnings */ } /* partially_mapped=false won't clear PG_partially_mapped folio flag */ @@ -3638,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped) return; /* - * The try_to_unmap() in page reclaim path might reach here too, - * this may cause a race condition to corrupt deferred split queue. - * And, if page reclaim is already handling the same folio, it is - * unnecessary to handle it again in shrinker. - * - * Check the swapcache flag to determine if the folio is being - * handled by page reclaim since THP swap would add the folio into - * swap cache before calling try_to_unmap(). + * Exclude swapcache: originally to avoid a corrupt deferred split + * queue. 
Nowadays that is fully prevented by mem_cgroup_swapout(); + * but if page reclaim is already handling the same folio, it is + * unnecessary to handle it again in the shrinker, so excluding + * swapcache here may still be a useful optimization. */ if (folio_test_swapcache(folio)) return; @@ -3729,8 +3735,8 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, struct deferred_split *ds_queue = &pgdata->deferred_split_queue; unsigned long flags; LIST_HEAD(list); - struct folio *folio, *next; - int split = 0; + struct folio *folio, *next, *prev = NULL; + int split = 0, removed = 0; #ifdef CONFIG_MEMCG if (sc->memcg) @@ -3786,15 +3792,28 @@ next: */ if (!did_split && !folio_test_partially_mapped(folio)) { list_del_init(&folio->_deferred_list); - ds_queue->split_queue_len--; + removed++; + } else { + /* + * That unlocked list_del_init() above would be unsafe, + * unless its folio is separated from any earlier folios + * left on the list (which may be concurrently unqueued) + * by one safe folio with refcount still raised. + */ + swap(folio, prev); } - folio_put(folio); + if (folio) + folio_put(folio); } spin_lock_irqsave(&ds_queue->split_queue_lock, flags); list_splice_tail(&list, &ds_queue->split_queue); + ds_queue->split_queue_len -= removed; spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + if (prev) + folio_put(prev); + /* * Stop shrinker if we didn't split any page, but the queue is empty. * This can happen if pages were freed under us. diff --git a/mm/internal.h b/mm/internal.h index 93083bbeeefa..64c2eb0b160e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,6 +108,51 @@ static inline void *folio_raw_mapping(const struct folio *folio) return (void *)(mapping & ~PAGE_MAPPING_FLAGS); } +/* + * This is a file-backed mapping, and is about to be memory mapped - invoke its + * mmap hook and safely handle error conditions. On error, VMA hooks will be + * mutated. + * + * @file: File which backs the mapping. + * @vma: VMA which we are mapping. + * + * Returns: 0 if success, error otherwise. + */ +static inline int mmap_file(struct file *file, struct vm_area_struct *vma) +{ + int err = call_mmap(file, vma); + + if (likely(!err)) + return 0; + + /* + * OK, we tried to call the file hook for mmap(), but an error + * arose. The mapping is in an inconsistent state and we most not invoke + * any further hooks on it. + */ + vma->vm_ops = &vma_dummy_vm_ops; + + return err; +} + +/* + * If the VMA has a close hook then close it, and since closing it might leave + * it in an inconsistent state which makes the use of any hooks suspect, clear + * them down by installing dummy empty hooks. + */ +static inline void vma_close(struct vm_area_struct *vma) +{ + if (vma->vm_ops && vma->vm_ops->close) { + vma->vm_ops->close(vma); + + /* + * The mapping is in an inconsistent state, and no further hooks + * may be invoked upon it. + */ + vma->vm_ops = &vma_dummy_vm_ops; + } +} + #ifdef CONFIG_MMU /* Flags for folio_pte_batch(). 
*/ @@ -639,11 +684,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } -void __folio_undo_large_rmappable(struct folio *folio); -static inline void folio_undo_large_rmappable(struct folio *folio) +bool __folio_unqueue_deferred_split(struct folio *folio); +static inline bool folio_unqueue_deferred_split(struct folio *folio) { if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) - return; + return false; /* * At this point, there is no one trying to add the folio to @@ -651,9 +696,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio) * to check without acquiring the split_queue_lock. */ if (data_race(list_empty(&folio->_deferred_list))) - return; + return false; - __folio_undo_large_rmappable(folio); + return __folio_unqueue_deferred_split(folio); } static inline struct folio *page_rmappable_folio(struct page *page) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 89895f38f722..ac607c306292 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -106,6 +106,10 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, } } +void __weak __meminit kernel_pte_init(void *addr) +{ +} + static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { @@ -126,8 +130,10 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, if (slab_is_available()) p = pte_alloc_one_kernel(&init_mm); - else + else { p = early_alloc(PAGE_SIZE, NUMA_NO_NODE); + kernel_pte_init(p); + } if (!p) return -ENOMEM; diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index a181e4780d9d..d8fb281e439d 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1810,32 +1810,6 @@ static void vm_map_ram_tags(struct kunit *test) free_pages((unsigned long)p_ptr, 1); } -static void vmalloc_percpu(struct kunit *test) -{ - char __percpu *ptr; - int cpu; - - /* - * This test is specifically crafted for the software tag-based mode, - * the only tag-based mode that poisons percpu mappings. - */ - KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); - - ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); - - for_each_possible_cpu(cpu) { - char *c_ptr = per_cpu_ptr(ptr, cpu); - - KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); - KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); - - /* Make sure that in-bounds accesses don't crash the kernel. 
*/ - *c_ptr = 0; - } - - free_percpu(ptr); -} - /* * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based @@ -2023,7 +1997,6 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), KUNIT_CASE(vm_map_ram_tags), - KUNIT_CASE(vmalloc_percpu), KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f9c39898eaff..b538c3d48386 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2227,7 +2227,7 @@ rollback: folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); - trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result); + trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); return result; } @@ -2252,7 +2252,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; if (xa_is_value(folio)) { - ++swap; + swap += 1 << xas_get_order(&xas); if (cc->is_khugepaged && swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; @@ -2299,7 +2299,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * is just too costly... */ - present++; + present += folio_nr_pages(folio); if (need_resched()) { xas_pause(&xas); diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 81d8819f13cd..f8744f5630bb 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio, css_get(&to->css); css_put(&from->css); + /* Warning should never happen, so don't worry about refcount non-0 */ + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = (unsigned long)to; __folio_memcg_unlock(from); @@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct folio *folio; + bool tried_split_before = false; +retry_pmd: ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { @@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { folio = target.folio; + /* + * Deferred split queue locking depends on memcg, + * and unqueue is unsafe unless folio refcount is 0: + * split or skip if on the queue? first try to split. + */ + if (!list_empty(&folio->_deferred_list)) { + spin_unlock(ptl); + if (!tried_split_before) + split_folio(folio); + folio_unlock(folio); + folio_put(folio); + if (tried_split_before) + return 0; + tried_split_before = true; + goto retry_pmd; + } + /* + * So long as that pmd lock is held, the folio cannot + * be racily added to the _deferred_list, because + * __folio_remove_rmap() will find !partially_mapped. 
+ */ if (folio_isolate_lru(folio)) { if (!mem_cgroup_move_account(folio, true, mc.from, mc.to)) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7845c64a2c57..06df2af97415 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4629,10 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) struct obj_cgroup *objcg; VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - VM_BUG_ON_FOLIO(folio_order(folio) > 1 && - !folio_test_hugetlb(folio) && - !list_empty(&folio->_deferred_list) && - folio_test_partially_mapped(folio), folio); /* * Nobody should be changing or seriously looking at @@ -4679,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = 0; } @@ -4790,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) /* Transfer the charge and the css ref */ commit_charge(new, memcg); + + /* Warning should never happen, so don't worry about refcount non-0 */ + WARN_ON_ONCE(folio_unqueue_deferred_split(old)); old->memcg_data = 0; } @@ -4976,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + folio_unqueue_deferred_split(folio); folio->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) diff --git a/mm/memory.c b/mm/memory.c index 2366578015ad..bdf77a3ec47b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4181,17 +4181,14 @@ fallback: return __alloc_swap_folio(vmf); } #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages) -{ - return false; -} - static struct folio *alloc_swap_folio(struct vm_fault *vmf) { return __alloc_swap_folio(vmf); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq); + /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. @@ -4204,6 +4201,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct folio *swapcache, *folio = NULL; + DECLARE_WAITQUEUE(wait, current); struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; @@ -4302,7 +4300,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Relax a bit to prevent rapid * repeated page faults. 
*/ + add_wait_queue(&swapcache_wq, &wait); schedule_timeout_uninterruptible(1); + remove_wait_queue(&swapcache_wq, &wait); goto out_page; } need_clear_cache = true; @@ -4609,8 +4609,11 @@ unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: /* Clear the swap cache pin for direct swapin after PTL unlock */ - if (need_clear_cache) + if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); + if (waitqueue_active(&swapcache_wq)) + wake_up(&swapcache_wq); + } if (si) put_swap_device(si); return ret; @@ -4625,8 +4628,11 @@ out_release: folio_unlock(swapcache); folio_put(swapcache); } - if (need_clear_cache) + if (need_clear_cache) { swapcache_clear(si, entry, nr_pages); + if (waitqueue_active(&swapcache_wq)) + wake_up(&swapcache_wq); + } if (si) put_swap_device(si); return ret; @@ -4925,6 +4931,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) pmd_t entry; vm_fault_t ret = VM_FAULT_FALLBACK; + /* + * It is too late to allocate a small folio, we already have a large + * folio in the pagecache: especially s390 KVM cannot tolerate any + * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any + * PMD mappings if THPs are disabled. + */ + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + return ret; + if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) return ret; @@ -6346,7 +6361,8 @@ static inline void pfnmap_args_setup(struct follow_pfnmap_args *args, static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) { #ifdef CONFIG_LOCKDEP - struct address_space *mapping = vma->vm_file->f_mapping; + struct file *file = vma->vm_file; + struct address_space *mapping = file ? file->f_mapping : NULL; if (mapping) lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || diff --git a/mm/migrate.c b/mm/migrate.c index df91248755e4..dfa24e41e8f9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw, pte_t newpte; void *addr; - VM_BUG_ON_PAGE(PageCompound(page), page); + if (PageCompound(page)) + return false; VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page); @@ -489,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, folio_test_large_rmappable(folio)) { if (!folio_ref_freeze(folio, expected_count)) return -EAGAIN; - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); folio_ref_unfreeze(folio, expected_count); } @@ -514,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } /* Take off deferred split queue while frozen and memcg set */ - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); /* * Now we know that no one else is looking at the folio: @@ -1177,7 +1178,7 @@ static void migrate_folio_done(struct folio *src, * not accounted to NR_ISOLATED_*. 
They can be recognized * as __folio_test_movable */ - if (likely(!__folio_test_movable(src))) + if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + folio_is_file_lru(src), -folio_nr_pages(src)); diff --git a/mm/mlock.c b/mm/mlock.c index e3e3dc2b2956..cde076fa7d5e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -725,14 +725,17 @@ static int apply_mlockall_flags(int flags) } for_each_vma(vmi, vma) { + int error; vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; - /* Ignore errors */ - mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, - newflags); + error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); + /* Ignore errors, but prev needs fixing up. */ + if (error) + prev = vma; cond_resched(); } out: diff --git a/mm/mmap.c b/mm/mmap.c index dd4b35a25aeb..79d541f1502b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -344,7 +344,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* Obtain the address to map to. we verify (or select) it and ensure @@ -900,7 +900,8 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (get_area) { addr = get_area(file, addr, len, pgoff, flags); - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) + && IS_ALIGNED(len, PMD_SIZE)) { /* Ensures that larger anonymous mappings are THP aligned. */ addr = thp_get_unmapped_area_vmflags(file, addr, len, pgoff, flags, vm_flags); @@ -1357,21 +1358,19 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } -unsigned long mmap_region(struct file *file, unsigned long addr, +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; pgoff_t pglen = PHYS_PFN(len); - struct vm_area_struct *merge; unsigned long charged = 0; struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; unsigned long end = addr + len; - bool writable_file_mapping = false; - int error = -ENOMEM; + int error; VMA_ITERATOR(vmi, mm, addr); VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); @@ -1396,8 +1395,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Check against address space limit. 
*/ - if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) + if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) { + error = -ENOMEM; goto abort_munmap; + } /* * Private writable mapping: check memory availability @@ -1405,14 +1406,24 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (accountable_mapping(file, vm_flags)) { charged = pglen; charged -= vms.nr_accounted; - if (charged && security_vm_enough_memory_mm(mm, charged)) - goto abort_munmap; + if (charged) { + error = security_vm_enough_memory_mm(mm, charged); + if (error) + goto abort_munmap; + } vms.nr_accounted = 0; vm_flags |= VM_ACCOUNT; vmg.flags = vm_flags; } + /* + * clear PTEs while the vma is still in the tree so that rmap + * cannot race with the freeing later in the truncate scenario. + * This is also needed for mmap_file(), which is why vm_ops + * close function is called. + */ + vms_clean_up_area(&vms, &mas_detach); vma = vma_merge_new_range(&vmg); if (vma) goto expanded; @@ -1422,47 +1433,45 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(mm); - if (!vma) + if (!vma) { + error = -ENOMEM; goto unacct_error; + } vma_iter_config(&vmi, addr, end); vma_set_range(vma, addr, end, pgoff); vm_flags_init(vma, vm_flags); vma->vm_page_prot = vm_get_page_prot(vm_flags); + if (vma_iter_prealloc(&vmi, vma)) { + error = -ENOMEM; + goto free_vma; + } + if (file) { vma->vm_file = get_file(file); - /* - * call_mmap() may map PTE, so ensure there are no existing PTEs - * and call the vm_ops close function if one exists. - */ - vms_clean_up_area(&vms, &mas_detach); - error = call_mmap(file, vma); + error = mmap_file(file, vma); if (error) - goto unmap_and_free_vma; - - if (vma_is_shared_maywrite(vma)) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto close_and_free_vma; - - writable_file_mapping = true; - } + goto unmap_and_free_file_vma; + /* Drivers cannot alter the address of the VMA. */ + WARN_ON_ONCE(addr != vma->vm_start); /* - * Expansion is handled above, merging is handled below. - * Drivers should not alter the address of the VMA. + * Drivers should not permit writability when previously it was + * disallowed. */ - error = -EINVAL; - if (WARN_ON((addr != vma->vm_start))) - goto close_and_free_vma; + VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && + !(vm_flags & VM_MAYWRITE) && + (vma->vm_flags & VM_MAYWRITE)); vma_iter_config(&vmi, addr, end); /* - * If vm_flags changed after call_mmap(), we should try merge + * If vm_flags changed after mmap_file(), we should try merge * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + struct vm_area_struct *merge; + vmg.flags = vma->vm_flags; /* If this fails, state is reset ready for a reattempt. */ merge = vma_merge_new_range(&vmg); @@ -1480,7 +1489,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma = merge; /* Update vm_flags to pick up the change. 
*/ vm_flags = vma->vm_flags; - goto unmap_writable; + goto file_expanded; } vma_iter_config(&vmi, addr, end); } @@ -1489,24 +1498,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) - goto free_vma; + goto free_iter_vma; } else { vma_set_anonymous(vma); } - if (map_deny_write_exec(vma, vma->vm_flags)) { - error = -EACCES; - goto close_and_free_vma; - } - - /* Allow architectures to sanity-check the vm_flags */ - error = -EINVAL; - if (!arch_validate_flags(vma->vm_flags)) - goto close_and_free_vma; - - error = -ENOMEM; - if (vma_iter_prealloc(&vmi, vma)) - goto close_and_free_vma; +#ifdef CONFIG_SPARC64 + /* TODO: Fix SPARC ADI! */ + WARN_ON_ONCE(!arch_validate_flags(vm_flags)); +#endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); @@ -1520,10 +1520,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ khugepaged_enter_vma(vma, vma->vm_flags); - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); +file_expanded: file = vma->vm_file; ksm_add_vma(vma); expanded: @@ -1556,24 +1553,17 @@ expanded: vma_set_page_prot(vma); - validate_mm(mm); return addr; -close_and_free_vma: - if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - - if (file || vma->vm_file) { -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; +unmap_and_free_file_vma: + fput(vma->vm_file); + vma->vm_file = NULL; - vma_iter_set(&vmi, vma->vm_end); - /* Undo any partial mapping done by a device driver. */ - unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); - } - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); + vma_iter_set(&vmi, vma->vm_end); + /* Undo any partial mapping done by a device driver. */ + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); +free_iter_vma: + vma_iter_free(&vmi); free_vma: vm_area_free(vma); unacct_error: @@ -1583,10 +1573,43 @@ unacct_error: abort_munmap: vms_abort_munmap_vmas(&vms, &mas_detach); gather_failed: - validate_mm(mm); return error; } +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + /* Check to see if MDWE is applicable. */ + if (map_deny_write_exec(vm_flags, vm_flags)) + return -EACCES; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + static int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; @@ -1630,6 +1653,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long populate = 0; unsigned long ret = -EINVAL; struct file *file; + vm_flags_t vm_flags; pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. 
See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); @@ -1646,12 +1670,60 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - if (mmap_write_lock_killable(mm)) + if (mmap_read_lock_killable(mm)) + return -EINTR; + + /* + * Look up VMA under read lock first so we can perform the security + * without holding locks (which can be problematic). We reacquire a + * write lock later and check nothing changed underneath us. + */ + vma = vma_lookup(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) { + mmap_read_unlock(mm); + return -EINVAL; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) + flags |= MAP_LOCKED; + + /* Save vm_flags used to calculate prot and flags, and recheck later. */ + vm_flags = vma->vm_flags; + file = get_file(vma->vm_file); + + mmap_read_unlock(mm); + + /* Call outside mmap_lock to be consistent with other callers. */ + ret = security_mmap_file(file, prot, flags); + if (ret) { + fput(file); + return ret; + } + + ret = -EINVAL; + + /* OK security check passed, take write lock + let it rip. */ + if (mmap_write_lock_killable(mm)) { + fput(file); return -EINTR; + } vma = vma_lookup(mm, start); - if (!vma || !(vma->vm_flags & VM_SHARED)) + if (!vma) + goto out; + + /* Make sure things didn't change under us. */ + if (vma->vm_flags != vm_flags) + goto out; + if (vma->vm_file != file) goto out; if (start + size > vma->vm_end) { @@ -1679,25 +1751,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; } - prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; - prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; - prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; - - flags &= MAP_NONBLOCK; - flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; - if (vma->vm_flags & VM_LOCKED) - flags |= MAP_LOCKED; - - file = get_file(vma->vm_file); - ret = security_mmap_file(vma->vm_file, prot, flags); - if (ret) - goto out_fput; ret = do_mmap(vma->vm_file, start, size, prot, flags, 0, pgoff, &populate, NULL); -out_fput: - fput(file); out: mmap_write_unlock(mm); + fput(file); if (populate) mm_populate(ret, populate); if (!IS_ERR_VALUE(ret)) @@ -1744,7 +1802,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; - vma_iter_next_range(vmi); + /* vmi is positioned at prev, which this mode expects. 
*/ + vmg.merge_flags = VMG_FLAG_JUST_EXPAND; if (vma_merge_new_range(&vmg)) goto out; @@ -1885,7 +1944,7 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, /* unreachable = */ true, /* closed = */ false); + remove_vma(vma, /* unreachable = */ true); count++; cond_resched(); vma = vma_next(&vmi); diff --git a/mm/mprotect.c b/mm/mprotect.c index 0c5d6d06107d..6f450af3252e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, break; } - if (map_deny_write_exec(vma, newflags)) { + if (map_deny_write_exec(vma->vm_flags, newflags)) { error = -EACCES; break; } diff --git a/mm/mremap.c b/mm/mremap.c index 24712f8dbb6b..dda09e957a5d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -238,6 +238,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, { spinlock_t *old_ptl, *new_ptl; struct mm_struct *mm = vma->vm_mm; + bool res = false; pmd_t pmd; if (!arch_supports_page_table_move()) @@ -277,19 +278,25 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - /* Clear the pmd */ pmd = *old_pmd; + + /* Racing with collapse? */ + if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd))) + goto out_unlock; + /* Clear the pmd */ pmd_clear(old_pmd); + res = true; VM_BUG_ON(!pmd_none(*new_pmd)); pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); +out_unlock: if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); - return true; + return res; } #else static inline bool move_normal_pmd(struct vm_area_struct *vma, diff --git a/mm/nommu.c b/mm/nommu.c index 385b0c15add8..e9b5f527ab5b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -589,8 +589,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) */ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + vma_close(vma); if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); @@ -843,7 +842,7 @@ static unsigned long determine_vm_flags(struct file *file, { unsigned long vm_flags; - vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags); if (!file) { /* @@ -885,7 +884,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; - ret = call_mmap(vma->vm_file, vma); + ret = mmap_file(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; @@ -918,7 +917,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * happy. 
*/ if (capabilities & NOMMU_MAP_DIRECT) { - ret = call_mmap(vma->vm_file, vma); + ret = mmap_file(vma->vm_file, vma); /* shouldn't return success if we're not sharing */ if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags))) ret = -ENOSYS; diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index be52b93a9c58..a3877e9bc878 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -349,7 +349,7 @@ static void __init numa_clear_kernel_node_hotplug(void) for_each_reserved_mem_region(mb_region) { int nid = memblock_get_region_node(mb_region); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) node_set(nid, reserved_nodemask); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8afab64814dc..c6c7bb3ea71b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -635,6 +635,8 @@ compaction_capture(struct capture_control *capc, struct page *page, static inline void account_freepages(struct zone *zone, int nr_pages, int migratetype) { + lockdep_assert_held(&zone->lock); + if (is_migrate_isolate(migratetype)) return; @@ -642,6 +644,9 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); + else if (is_migrate_highatomic(migratetype)) + WRITE_ONCE(zone->nr_free_highatomic, + zone->nr_free_highatomic + nr_pages); } /* Used for pages not on another list */ @@ -961,9 +966,8 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) break; case 2: /* the second tail page: deferred_list overlaps ->mapping */ - if (unlikely(!list_empty(&folio->_deferred_list) && - folio_test_partially_mapped(folio))) { - bad_page(page, "partially mapped folio on deferred list"); + if (unlikely(!list_empty(&folio->_deferred_list))) { + bad_page(page, "on deferred list"); goto out; } break; @@ -2682,7 +2686,6 @@ void free_unref_folios(struct folio_batch *folios) unsigned long pfn = folio_pfn(folio); unsigned int order = folio_order(folio); - folio_undo_large_rmappable(folio); if (!free_pages_prepare(&folio->page, order)) continue; /* @@ -2893,12 +2896,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, page = __rmqueue(zone, order, migratetype, alloc_flags); /* - * If the allocation fails, allow OOM handling access - * to HIGHATOMIC reserves as failing now is worse than - * failing a high-order atomic allocation in the - * future. + * If the allocation fails, allow OOM handling and + * order-0 (atomic) allocs access to HIGHATOMIC + * reserves as failing now is worse than failing a + * high-order atomic allocation in the future. */ - if (!page && (alloc_flags & ALLOC_OOM)) + if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK))) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { @@ -3081,11 +3084,10 @@ static inline long __zone_watermark_unusable_free(struct zone *z, /* * If the caller does not have rights to reserves below the min - * watermark then subtract the high-atomic reserves. This will - * over-estimate the size of the atomic reserve but it avoids a search. + * watermark then subtract the free pages reserved for highatomic. 
*/ if (likely(!(alloc_flags & ALLOC_RESERVES))) - unusable_free += z->nr_reserved_highatomic; + unusable_free += READ_ONCE(z->nr_free_highatomic); #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ diff --git a/mm/page_io.c b/mm/page_io.c index 78bc88acee79..69536a2b3c13 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -570,7 +570,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); put_task_struct(current); @@ -585,7 +585,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); - count_vm_event(PSWPIN); + count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 461ea3bbd8d9..5f9f01532e67 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -744,7 +744,8 @@ struct folio *folio_walk_start(struct folio_walk *fw, pud = pudp_get(pudp); if (pud_none(pud)) goto not_found; - if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) { + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && + (!pud_present(pud) || pud_leaf(pud))) { ptl = pud_lock(vma->vm_mm, pudp); pud = pudp_get(pudp); @@ -753,6 +754,10 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; + /* + * TODO: FW_MIGRATION support for PUD migration entries + * once there are relevant users. + */ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) { spin_unlock(ptl); goto not_found; @@ -769,12 +774,13 @@ struct folio *folio_walk_start(struct folio_walk *fw, } pmd_table: - VM_WARN_ON_ONCE(pud_leaf(*pudp)); + VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud)); pmdp = pmd_offset(pudp, addr); pmd = pmdp_get_lockless(pmdp); if (pmd_none(pmd)) goto not_found; - if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) { + if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && + (!pmd_present(pmd) || pmd_leaf(pmd))) { ptl = pmd_lock(vma->vm_mm, pmdp); pmd = pmdp_get(pmdp); @@ -786,7 +792,7 @@ pmd_table: if (pmd_none(pmd)) { spin_unlock(ptl); goto not_found; - } else if (!pmd_leaf(pmd)) { + } else if (pmd_present(pmd) && !pmd_leaf(pmd)) { spin_unlock(ptl); goto pte_table; } else if (pmd_present(pmd)) { @@ -812,7 +818,7 @@ pmd_table: } pte_table: - VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp))); + VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd)); ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl); if (!ptep) goto not_found; diff --git a/mm/rmap.c b/mm/rmap.c index a8797d1b3d49..73d5998677d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -885,13 +885,10 @@ static bool folio_referenced_one(struct folio *folio, return false; } - if (pvmw.pte) { - if (lru_gen_enabled() && - pte_young(ptep_get(pvmw.pte))) { - lru_gen_look_around(&pvmw); + if (lru_gen_enabled() && pvmw.pte) { + if (lru_gen_look_around(&pvmw)) referenced++; - } - + } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; diff --git a/mm/shmem.c b/mm/shmem.c index 4f11b5506363..e87f5d6799a7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1166,7 +1166,9 @@ static int shmem_getattr(struct mnt_idmap *idmap, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); + inode_lock_shared(inode); 
generic_fillattr(idmap, request_mask, inode, stat); + inode_unlock_shared(inode); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; @@ -1664,12 +1666,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, loff_t i_size; int order; - if (vma && ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) - return 0; - - /* If the hardware/firmware marked hugepage support disabled. */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED)) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; global_huge = shmem_huge_global_enabled(inode, index, write_end, @@ -2736,9 +2733,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) if (ret) return ret; - /* arm64 - allow memory tagging on RAM-based files */ - vm_flags_set(vma, VM_MTE_ALLOWED); - file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ if (inode->i_nlink) diff --git a/mm/shrinker.c b/mm/shrinker.c index dc5d2a6fcfc4..4a93fd433689 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -76,19 +76,21 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { - struct shrinker_info *info; int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, + GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; - if (shrinker_unit_alloc(info, NULL, nid)) + if (shrinker_unit_alloc(info, NULL, nid)) { + kvfree(info); goto err; + } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); diff --git a/mm/slab_common.c b/mm/slab_common.c index 3d26c257ed8b..893d32059915 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -380,8 +380,11 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, unsigned int usersize, void (*ctor)(void *)) { + unsigned long mask = 0; + unsigned int idx; kmem_buckets *b; - int idx; + + BUILD_BUG_ON(ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]) > BITS_PER_LONG); /* * When the separate buckets API is not built in, just return @@ -403,7 +406,7 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) { char *short_size, *cache_name; unsigned int cache_useroffset, cache_usersize; - unsigned int size; + unsigned int size, aligned_idx; if (!kmalloc_caches[KMALLOC_NORMAL][idx]) continue; @@ -416,10 +419,6 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, if (WARN_ON(!short_size)) goto fail; - cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1); - if (WARN_ON(!cache_name)) - goto fail; - if (useroffset >= size) { cache_useroffset = 0; cache_usersize = 0; @@ -427,18 +426,28 @@ kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, cache_useroffset = useroffset; cache_usersize = min(size - cache_useroffset, usersize); } - (*b)[idx] = kmem_cache_create_usercopy(cache_name, size, + + aligned_idx = __kmalloc_index(size, false); + if (!(*b)[aligned_idx]) { + cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1); + if (WARN_ON(!cache_name)) + goto fail; + (*b)[aligned_idx] = kmem_cache_create_usercopy(cache_name, size, 0, flags, cache_useroffset, cache_usersize, ctor); 
- kfree(cache_name); - if (WARN_ON(!(*b)[idx])) - goto fail; + kfree(cache_name); + if (WARN_ON(!(*b)[aligned_idx])) + goto fail; + set_bit(aligned_idx, &mask); + } + if (idx != aligned_idx) + (*b)[idx] = (*b)[aligned_idx]; } return b; fail: - for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) + for_each_set_bit(idx, &mask, ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL])) kmem_cache_destroy((*b)[idx]); kmem_cache_free(kmem_buckets_cache, b); @@ -1209,7 +1218,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); - memset((void *)p + new_size, 0, ks - new_size); + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); kasan_enable_current(); } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index edcc7a6b0f6f..c0388b2e959d 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -184,6 +184,10 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) return p; } +void __weak __meminit kernel_pte_init(void *addr) +{ +} + pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); @@ -191,6 +195,7 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + kernel_pte_init(p); pmd_populate_kernel(&init_mm, pmd, p); } return pmd; diff --git a/mm/swap.c b/mm/swap.c index 835bdf324b76..b8e3259ea2c4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -121,7 +121,7 @@ void __folio_put(struct folio *folio) } page_cache_release(folio); - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, folio_order(folio)); } @@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) free_huge_folio(folio); continue; } - folio_undo_large_rmappable(folio); + folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0cded32414a1..46bd4b1a3c07 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -194,9 +194,6 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (IS_ERR(folio)) return 0; - /* offset could point to the middle of a large folio */ - entry = folio->swap; - offset = swp_offset(entry); nr_pages = folio_nr_pages(folio); ret = -nr_pages; @@ -210,6 +207,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (!folio_trylock(folio)) goto out; + /* offset could point to the middle of a large folio */ + entry = folio->swap; + offset = swp_offset(entry); + need_reclaim = ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))); @@ -730,15 +731,16 @@ done: return offset; } -static void swap_reclaim_full_clusters(struct swap_info_struct *si) +/* Return true if reclaimed a whole cluster */ +static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; unsigned char *map = si->swap_map; - int nr_reclaim, total_reclaimed = 0; + int nr_reclaim; - if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER) + if (force) to_scan = si->inuse_pages / SWAPFILE_CLUSTER; while (!list_empty(&si->full_clusters)) { @@ -748,28 +750,36 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si) end = min(si->max, offset + SWAPFILE_CLUSTER); 
to_scan--; + spin_unlock(&si->lock); while (offset < end) { if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) { - spin_unlock(&si->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - if (nr_reclaim > 0) { - offset += nr_reclaim; - total_reclaimed += nr_reclaim; - continue; - } else if (nr_reclaim < 0) { - offset += -nr_reclaim; + if (nr_reclaim) { + offset += abs(nr_reclaim); continue; } } offset++; } - if (to_scan <= 0 || total_reclaimed) + spin_lock(&si->lock); + + if (to_scan <= 0) break; } } +static void swap_reclaim_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, reclaim_work); + + spin_lock(&si->lock); + swap_reclaim_full_clusters(si, true); + spin_unlock(&si->lock); +} + /* * Try to get swap entries with specified order from current cpu's swap entry * pool (a cluster). This might involve allocating a new cluster for current CPU @@ -799,6 +809,10 @@ new_cluster: goto done; } + /* Try reclaim from full clusters if free clusters list is drained */ + if (vm_swap_full()) + swap_reclaim_full_clusters(si, false); + if (order < PMD_ORDER) { unsigned int frags = 0; @@ -880,13 +894,6 @@ new_cluster: } done: - /* Try reclaim from full clusters if device is nearfull */ - if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) { - swap_reclaim_full_clusters(si); - if (!found && !order && si->pages != si->inuse_pages) - goto new_cluster; - } - cluster->next[order] = offset; return found; } @@ -921,6 +928,9 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->lowest_bit = si->max; si->highest_bit = 0; del_from_avail_list(si); + + if (vm_swap_full()) + schedule_work(&si->reclaim_work); } } @@ -2312,7 +2322,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) mmap_read_lock(mm); for_each_vma(vmi, vma) { - if (vma->anon_vma) { + if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); if (ret) break; @@ -2815,6 +2825,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) wait_for_completion(&p->comp); flush_work(&p->discard_work); + flush_work(&p->reclaim_work); destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) @@ -3375,6 +3386,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) return PTR_ERR(si); INIT_WORK(&si->discard_work, swap_discard_work); + INIT_WORK(&si->reclaim_work, swap_reclaim_work); name = getname(specialfile); if (IS_ERR(name)) { @@ -323,11 +323,10 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, /* * Close a vm structure and free it. 
*/ -void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed) +void remove_vma(struct vm_area_struct *vma, bool unreachable) { might_sleep(); - if (!closed && vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); @@ -917,6 +916,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) pgoff_t pgoff = vmg->pgoff; pgoff_t pglen = PHYS_PFN(end - start); bool can_merge_left, can_merge_right; + bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); VM_WARN_ON(vmg->vma); @@ -930,7 +930,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) return NULL; can_merge_left = can_vma_merge_left(vmg); - can_merge_right = can_vma_merge_right(vmg, can_merge_left); + can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { @@ -953,7 +953,11 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; - vma_prev(vmg->vmi); /* Equivalent to going to the previous range */ + /* In expand-only case we are already positioned at prev. */ + if (!just_expand) { + /* Equivalent to going to the previous range. */ + vma_prev(vmg->vmi); + } } /* @@ -967,12 +971,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) } /* If expansion failed, reset state. Allows us to retry merge later. */ - vmg->vma = NULL; - vmg->start = start; - vmg->end = end; - vmg->pgoff = pgoff; - if (vmg->vma == prev) - vma_iter_set(vmg->vmi, start); + if (!just_expand) { + vmg->vma = NULL; + vmg->start = start; + vmg->end = end; + vmg->pgoff = pgoff; + if (vmg->vma == prev) + vma_iter_set(vmg->vmi, start); + } return NULL; } @@ -1108,9 +1114,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms, vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - vms->closed_vm_ops = true; + vma_close(vma); } /* @@ -1153,7 +1157,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, /* = */ false, vms->closed_vm_ops); + remove_vma(vma, /* unreachable = */ false); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); @@ -1677,8 +1681,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_vma_link: - if (new_vma->vm_ops && new_vma->vm_ops->close) - new_vma->vm_ops->close(new_vma); + vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); @@ -42,8 +42,7 @@ struct vma_munmap_struct { int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ - bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */ - /* 1 byte hole */ + /* 2 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ @@ -59,6 +58,17 @@ enum vma_merge_state { VMA_MERGE_SUCCESS, }; +enum vma_merge_flags { + VMG_FLAG_DEFAULT = 0, + /* + * If we can expand, simply do so. We know there is nothing to merge to + * the right. Does not reset state upon failure to merge. 
diff --git a/mm/vma.h b/mm/vma.h
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -42,8 +42,7 @@ struct vma_munmap_struct {
 	int vma_count;			/* Number of vmas that will be removed */
 	bool unlock;			/* Unlock after the munmap */
 	bool clear_ptes;		/* If there are outstanding PTE to be cleared */
-	bool closed_vm_ops;		/* call_mmap() was encountered, so vmas may be closed */
-	/* 1 byte hole */
+	/* 2 byte hole */
 	unsigned long nr_pages;		/* Number of pages being removed */
 	unsigned long locked_vm;	/* Number of locked pages */
 	unsigned long nr_accounted;	/* Number of VM_ACCOUNT pages */
@@ -59,6 +58,17 @@ enum vma_merge_state {
 	VMA_MERGE_SUCCESS,
 };
 
+enum vma_merge_flags {
+	VMG_FLAG_DEFAULT = 0,
+	/*
+	 * If we can expand, simply do so. We know there is nothing to merge to
+	 * the right. Does not reset state upon failure to merge. The VMA
+	 * iterator is assumed to be positioned at the previous VMA, rather than
+	 * at the gap.
+	 */
+	VMG_FLAG_JUST_EXPAND = 1 << 0,
+};
+
 /* Represents a VMA merge operation. */
 struct vma_merge_struct {
 	struct mm_struct *mm;
@@ -75,6 +85,7 @@ struct vma_merge_struct {
 	struct mempolicy *policy;
 	struct vm_userfaultfd_ctx uffd_ctx;
 	struct anon_vma_name *anon_name;
+	enum vma_merge_flags merge_flags;
 	enum vma_merge_state state;
 };
 
@@ -99,6 +110,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
 		.flags = flags_,				\
 		.pgoff = pgoff_,				\
 		.state = VMA_MERGE_START,			\
+		.merge_flags = VMG_FLAG_DEFAULT,		\
 	}
 
 #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_)	\
@@ -118,6 +130,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
 		.uffd_ctx = vma_->vm_userfaultfd_ctx,		\
 		.anon_name = anon_vma_name(vma_),		\
 		.state = VMA_MERGE_START,			\
+		.merge_flags = VMG_FLAG_DEFAULT,		\
 	}
 
 #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
@@ -184,7 +197,6 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
 	vms->unmap_start = FIRST_USER_ADDRESS;
 	vms->unmap_end = USER_PGTABLES_CEILING;
 	vms->clear_ptes = false;
-	vms->closed_vm_ops = false;
 }
 
 #endif
@@ -241,15 +253,9 @@ static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
 	 * failure method of leaving a gap where the MAP_FIXED mapping failed.
	 */
 	mas_set_range(mas, vms->start, vms->end - 1);
-	if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
-		pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
-			     current->comm, current->pid);
-		/* Leaving vmas detached and in-tree may hamper recovery */
-		reattach_vmas(mas_detach);
-	} else {
-		/* Clean up the insertion of the unfortunate gap */
-		vms_complete_munmap_vmas(vms, mas_detach);
-	}
+	mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
+	/* Clean up the insertion of the unfortunate gap */
+	vms_complete_munmap_vmas(vms, mas_detach);
 }
 
 int
@@ -261,7 +267,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 		  unsigned long start, size_t len, struct list_head *uf,
 		  bool unlock);
 
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
+void remove_vma(struct vm_area_struct *vma, bool unreachable);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
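
The new VMG_FLAG_JUST_EXPAND mode only matters to callers of vma_merge_new_range(). The snippet below is a hypothetical caller, written purely to illustrate the contract spelled out in the flag's comment (iterator already positioned at prev, no right-hand merge attempted, no state reset on failure); it is not a function from this patch set.

/* Assumes mm/vma.h is in scope, as it would be for code inside mm/. */
struct vm_area_struct *demo_expand_prev(struct vma_merge_struct *vmg)
{
	/*
	 * Ask vma_merge_new_range() to only expand the previous VMA: it will
	 * not look up the next VMA and will not reset vmg on failure.
	 */
	vmg->merge_flags = VMG_FLAG_JUST_EXPAND;

	return vma_merge_new_range(vmg);
}
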
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 749cdc110c74..28ba2b06fc7d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
 #include <linux/khugepaged.h>
 #include <linux/rculist_nulls.h>
 #include <linux/random.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1475,7 +1476,7 @@ free_it:
 		 */
 		nr_reclaimed += nr_pages;
 
-		folio_undo_large_rmappable(folio);
+		folio_unqueue_deferred_split(folio);
 		if (folio_batch_add(&free_folios, folio) == 0) {
 			mem_cgroup_uncharge_folios(&free_folios);
 			try_to_unmap_flush();
@@ -1863,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
 		if (unlikely(folio_put_testzero(folio))) {
 			__folio_clear_lru_flags(folio);
 
-			folio_undo_large_rmappable(folio);
+			folio_unqueue_deferred_split(folio);
 			if (folio_batch_add(&free_folios, folio) == 0) {
 				spin_unlock_irq(&lruvec->lru_lock);
 				mem_cgroup_uncharge_folios(&free_folios);
@@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
 	return false;
 }
 
-static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr,
+				 struct pglist_data *pgdat)
 {
 	unsigned long pfn = pte_pfn(pte);
 
@@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
 	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
 		return -1;
 
+	if (!pte_young(pte) && !mm_has_notifiers(vma->vm_mm))
+		return -1;
+
 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
 		return -1;
 
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+		return -1;
+
 	return pfn;
 }
 
-static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr,
+				 struct pglist_data *pgdat)
 {
 	unsigned long pfn = pmd_pfn(pmd);
 
@@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
 	if (WARN_ON_ONCE(pmd_devmap(pmd)))
 		return -1;
 
+	if (!pmd_young(pmd) && !mm_has_notifiers(vma->vm_mm))
+		return -1;
+
 	if (WARN_ON_ONCE(!pfn_valid(pfn)))
 		return -1;
 
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+		return -1;
+
 	return pfn;
 }
 
@@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 {
 	struct folio *folio;
 
-	/* try to avoid unnecessary memory loads */
-	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-		return NULL;
-
 	folio = pfn_folio(pfn);
 	if (folio_nid(folio) != pgdat->node_id)
 		return NULL;
@@ -3394,21 +3405,16 @@ restart:
 		total++;
 		walk->mm_stats[MM_LEAF_TOTAL]++;
 
-		pfn = get_pte_pfn(ptent, args->vma, addr);
+		pfn = get_pte_pfn(ptent, args->vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
 
-		if (!pte_young(ptent)) {
-			walk->mm_stats[MM_LEAF_OLD]++;
-			continue;
-		}
-
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
 		if (!folio)
 			continue;
 
-		if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		if (!ptep_clear_young_notify(args->vma, addr, pte + i))
+			continue;
 
 		young++;
 		walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3474,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
 		/* don't round down the first address */
 		addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
 
-		pfn = get_pmd_pfn(pmd[i], vma, addr);
-		if (pfn == -1)
+		if (!pmd_present(pmd[i]))
 			goto next;
 
 		if (!pmd_trans_huge(pmd[i])) {
-			if (!walk->force_scan && should_clear_pmd_young())
+			if (!walk->force_scan && should_clear_pmd_young() &&
+			    !mm_has_notifiers(args->mm))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
 		}
 
+		pfn = get_pmd_pfn(pmd[i], vma, addr, pgdat);
+		if (pfn == -1)
+			goto next;
+
 		folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
 		if (!folio)
 			goto next;
 
-		if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+		if (!pmdp_clear_young_notify(vma, addr, pmd + i))
 			goto next;
 
 		walk->mm_stats[MM_LEAF_YOUNG]++;
@@ -3546,27 +3556,18 @@ restart:
 		}
 
 		if (pmd_trans_huge(val)) {
-			unsigned long pfn = pmd_pfn(val);
 			struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+			unsigned long pfn = get_pmd_pfn(val, vma, addr, pgdat);
 
 			walk->mm_stats[MM_LEAF_TOTAL]++;
 
-			if (!pmd_young(val)) {
-				walk->mm_stats[MM_LEAF_OLD]++;
-				continue;
-			}
-
-			/* try to avoid unnecessary memory loads */
-			if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
-				continue;
-
-			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
+			if (pfn != -1)
+				walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
 			continue;
 		}
 
-		walk->mm_stats[MM_NONLEAF_TOTAL]++;
-
-		if (!walk->force_scan && should_clear_pmd_young()) {
+		if (!walk->force_scan && should_clear_pmd_young() &&
+		    !mm_has_notifiers(args->mm)) {
 			if (!pmd_young(val))
 				continue;
 
@@ -4040,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  * the PTE table to the Bloom filter. This forms a feedback loop between the
  * eviction and the aging.
  */
-void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 {
 	int i;
 	unsigned long start;
 	unsigned long end;
 	struct lru_gen_mm_walk *walk;
-	int young = 0;
+	int young = 1;
 	pte_t *pte = pvmw->pte;
 	unsigned long addr = pvmw->address;
 	struct vm_area_struct *vma = pvmw->vma;
@@ -4062,12 +4063,15 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	lockdep_assert_held(pvmw->ptl);
 	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 
+	if (!ptep_clear_young_notify(vma, addr, pte))
+		return false;
+
 	if (spin_is_contended(pvmw->ptl))
-		return;
+		return true;
 
 	/* exclude special VMAs containing anon pages from COW */
 	if (vma->vm_flags & VM_SPECIAL)
-		return;
+		return true;
 
 	/* avoid taking the LRU lock under the PTL when possible */
 	walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
@@ -4075,6 +4079,9 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	start = max(addr & PMD_MASK, vma->vm_start);
 	end = min(addr | ~PMD_MASK, vma->vm_end - 1) + 1;
 
+	if (end - start == PAGE_SIZE)
+		return true;
+
 	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
 		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
 			end = start + MIN_LRU_BATCH * PAGE_SIZE;
@@ -4088,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 
 	/* folio_update_gen() requires stable folio_memcg() */
 	if (!mem_cgroup_trylock_pages(memcg))
-		return;
+		return true;
 
 	arch_enter_lazy_mmu_mode();
 
@@ -4098,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 		unsigned long pfn;
 		pte_t ptent = ptep_get(pte + i);
 
-		pfn = get_pte_pfn(ptent, vma, addr);
+		pfn = get_pte_pfn(ptent, vma, addr, pgdat);
 		if (pfn == -1)
 			continue;
 
-		if (!pte_young(ptent))
-			continue;
-
 		folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
 		if (!folio)
 			continue;
 
-		if (!ptep_test_and_clear_young(vma, addr, pte + i))
-			VM_WARN_ON_ONCE(true);
+		if (!ptep_clear_young_notify(vma, addr, pte + i))
+			continue;
 
 		young++;
@@ -4140,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 	/* feedback from rmap walkers to page table walkers */
 	if (mm_state && suitable_to_scan(i, young))
 		update_bloom_filter(mm_state, max_seq, pvmw->pmd);
+
+	return true;
 }
 
 /******************************************************************************
@@ -4963,8 +4969,8 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *
 
 	blk_finish_plug(&plug);
 done:
-	/* kswapd should never fail */
-	pgdat->kswapd_failures = 0;
+	if (sc->nr_reclaimed > reclaimed)
+		pgdat->kswapd_failures = 0;
 }
 
 /******************************************************************************
@@ -5254,11 +5260,11 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
 		seq_printf(m, " %10d", tier);
 		for (type = 0; type < ANON_AND_FILE; type++) {
-			const char *s = " ";
+			const char *s = "xxx";
 			unsigned long n[3] = {};
 
 			if (seq == max_seq) {
-				s = "RT ";
+				s = "RTx";
 				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
 				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
 			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
@@ -5280,14 +5286,14 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 
 		seq_puts(m, " ");
 		for (i = 0; i < NR_MM_STATS; i++) {
-			const char *s = " ";
+			const char *s = "xxxx";
 			unsigned long n = 0;
 
 			if (seq == max_seq && NR_HIST_GENS == 1) {
-				s = "LOYNFA";
+				s = "TYFA";
 				n = READ_ONCE(mm_state->stats[hist][i]);
 			} else if (seq != max_seq && NR_HIST_GENS > 1) {
-				s = "loynfa";
+				s = "tyfa";
 				n = READ_ONCE(mm_state->stats[hist][i]);
 			}
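
The mm/vmscan.c hunks above switch the MGLRU walkers from ptep_test_and_clear_young()/pmdp_test_and_clear_young() to the *_clear_young_notify() variants, and only treat a clear accessed bit as final when mm_has_notifiers() is false. The sketch below is a rough paraphrase of what ptep_clear_young_notify() does, per <linux/mmu_notifier.h>; it is included to show why a PTE that is old in the primary MMU can still be young in a secondary MMU (for example a KVM guest) and therefore cannot be filtered out early once notifiers are registered. The demo_ name is illustrative; the real helper is a macro in that header.

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static inline int demo_clear_young_notify(struct vm_area_struct *vma,
					  unsigned long addr, pte_t *ptep)
{
	int young;

	/* Clear and report the accessed bit in the primary page table... */
	young = ptep_test_and_clear_young(vma, addr, ptep);
	/* ...and ask any secondary MMUs to do the same for this page. */
	young |= mmu_notifier_clear_young(vma->vm_mm, addr, addr + PAGE_SIZE);

	return young;
}
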